def main(max_threads=5):
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    queue = MongoQueue()
    urls = []
    temple = scrape_callback.seed_url[0:-2]
    for i in range(1, 1189):
        urls.append(temple + str(i) + '/')
    while True:
        now = datetime.now()
        if now.hour < 3 or now.hour > 12:
            queue.repairFast()
            process_crawler(urls, scrape_callback=scrape_callback, cache=cache,
                            max_threads=max_threads, timeout=30,
                            host=urlparse.urlparse(scrape_callback.seed_url).netloc,
                            user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36')
        else:
            print 'pass:' + str(now)
        time.sleep(3600)
def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None, user_agent='wswp',
                     proxies=None, num_retries=1, max_threads=10, timeout=60):
    """Crawl this website in multiple threads
    """
    # the queue of URL's that still need to be crawled
    # crawl_queue = Queue.deque([seed_url])
    # crawl_queue = [seed_url]
    crawl_queue = MongoQueue()
    crawl_queue.push(seed_url)
    # the URL's that have been seen
    seen = set([seed_url])
    D = Downloader(cache=cache, delay=delay, user_agent=user_agent, proxies=proxies,
                   num_retries=num_retries, timeout=timeout)

    def process_queue():
        while True:
            try:
                url = crawl_queue.pop()
            except IndexError:
                # crawl queue is empty
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print 'Error in callback for: {}: {}'.format(url, e)
                    else:
                        for link in links:
                            link = normalize(seed_url, link)
                            # check whether already crawled this link
                            if link not in seen:
                                seen.add(link)
                                # add this new link to queue
                                crawl_queue.push(link)
                                # crawl_queue.append(link)
                # mark this URL as done so the Mongo queue can eventually drain
                crawl_queue.complete(url)

    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:
        # the crawl is still active
        for thread in threads:
            if not thread.is_alive():
                # remove the stopped threads
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue:
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)  # set daemon so main thread can exit when receives ctrl-c
            thread.start()
            threads.append(thread)
        # all threads have been processed
        # sleep temporarily so CPU can focus execution on other threads
        time.sleep(SLEEP_TIME)
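Every variant in this collection leans on the same MongoQueue abstraction (push / pop / complete / peek / clear, plus a repair path for stalled jobs). The sketch below is a guess at a minimal implementation in the spirit of the wswp three-state queue; the collection name, field names and timeout here are assumptions, not taken from any of the snippets.

# Minimal sketch of the MongoQueue interface the crawlers above and below assume.
# The schema (OUTSTANDING/PROCESSING/COMPLETE, 'cache' database, default
# 'crawl_queue' collection) is an assumption, not taken verbatim from any snippet.
from datetime import datetime, timedelta
from pymongo import MongoClient, errors


class MongoQueue:
    OUTSTANDING, PROCESSING, COMPLETE = range(3)

    def __init__(self, collection='crawl_queue', client=None, timeout=300):
        self.client = client or MongoClient()
        self.db = self.client.cache[collection]
        self.timeout = timeout  # seconds before a PROCESSING job counts as stalled

    def __nonzero__(self):  # Python 2: `while threads or crawl_queue` relies on this
        record = self.db.find_one({'status': {'$ne': self.COMPLETE}})
        return True if record else False

    __bool__ = __nonzero__  # Python 3 name for the same hook

    def push(self, url):
        # insert only if the URL has never been seen before
        try:
            self.db.insert_one({'_id': url, 'status': self.OUTSTANDING})
        except errors.DuplicateKeyError:
            pass

    def pop(self):
        # atomically claim an outstanding URL; raise KeyError when none are left
        record = self.db.find_one_and_update(
            {'status': self.OUTSTANDING},
            {'$set': {'status': self.PROCESSING, 'timestamp': datetime.now()}})
        if record:
            return record['_id']
        self.repair()
        raise KeyError()

    def peek(self):
        record = self.db.find_one({'status': self.OUTSTANDING})
        return record['_id'] if record else None

    def complete(self, url):
        self.db.update_one({'_id': url}, {'$set': {'status': self.COMPLETE}})

    def repair(self):
        # release jobs whose worker died while they were PROCESSING
        self.db.update_many(
            {'timestamp': {'$lt': datetime.now() - timedelta(seconds=self.timeout)},
             'status': self.PROCESSING},
            {'$set': {'status': self.OUTSTANDING}})

    def clear(self):
        self.db.delete_many({})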
def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None,
                     user_agent='wswp', proxies=None, num_retries=1, max_threads=10, timeout=60):
    """Crawl using multiple threads
    """
    # the queue of URL's that still need to be crawled
    crawl_queue = MongoQueue()
    webpage_cache = MongoCache()
    # crawl_queue.clear()
    crawl_queue.push(seed_url)
    D = Downloader(delay=DEFAULT_DELAY, user_agent=DEFAULT_AGENT, proxies=DEFAULT_PROXY_LIST,
                   cookies=DEFAULT_COOKIE, num_retries=DEFAULT_RETRIES, timeout=DEFAULT_TIMEOUT,
                   opener=None, cache=MongoCache())

    def process_queue():
        while True:
            # fetch the next URL to process
            try:
                url = crawl_queue.pop()
            except KeyError:
                # currently no urls to process
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print('Error in callback for: {}: {}'.format(url, e))
                    else:
                        for link in links:
                            # add this new link to queue
                            crawl_queue.push(normalize(seed_url, link))
                # requeue the URL when the download failed (a 5xx status or the
                # -999 error code), otherwise mark it as complete
                if 500 <= webpage_cache[url]['code'] < 600 or webpage_cache[url]['code'] == -999:
                    crawl_queue.reset(url)
                else:
                    crawl_queue.complete(url)

    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)  # set daemon so main thread can exit when receives ctrl-c
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
def url_to_mongoqueue(url):
    """Push every link found on this index page into the MongoDB crawl queue."""
    crawl_queue = MongoQueue('crawl_queue')
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    all_a = soup.find('div', class_='all').find_all('a')
    for a in all_a:
        title = a.get_text()
        url = a['href']
        print(title)
        print(url)
        crawl_queue.push(url, title)
def link_crawler(seed_url, link_regex=None, proxies=None, delay=1, max_depth=-1, timeout=5,
                 max_thread=5, sleep_time=1, cache=None, scraping_callback=None, debug=False):
    crawl_queue = MongoQueue()
    crawl_queue.push(seed_url)
    d = Downloader(cache=MongoCache(), delay=delay, proxies=proxies, timeout=timeout, debug=debug)

    def thread_crawl():
        while True:
            try:
                url = crawl_queue.pop()
                html = d(url)
            except KeyError:
                break
            except Exception:
                pass
            else:
                links = scraping_callback(url, html) if scraping_callback else []
                for link in links:
                    crawl_queue.push(link)
                crawl_queue.complete(url)

    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_thread and crawl_queue:
            t = threading.Thread(target=thread_crawl)
            t.setDaemon(True)
            t.start()
            threads.append(t)
        time.sleep(sleep_time)
def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None, user_agent='wu_being',
                     proxies=None, num_retries=1, max_threads=10, timeout=60):
    """Crawl using multiple threads
    """
    # the queue of URL's that still need to be crawled
    crawl_queue = MongoQueue()  ######################
    crawl_queue.clear()  ######################
    crawl_queue.push(seed_url)  ######################
    D = Downloader(cache=cache, delay=delay, user_agent=user_agent, proxies=proxies,
                   num_retries=num_retries, timeout=timeout)

    def process_queue():
        while True:
            # fetch the next URL to process
            try:
                url = crawl_queue.pop()  ######################
            except KeyError:
                # currently no urls to process
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print 'Error in callback for: {}: {}'.format(url, e)
                    else:
                        for link in links:  #############
                            # add this new link to queue ######################
                            crawl_queue.push(normalize(seed_url, link))  ######################
                crawl_queue.complete(url)  ######################

    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:  ######################
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():  #######################
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)  # set daemon so main thread can exit when receives ctrl-c
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
def threaded_crawler(delay=5, cache=None, scrape_callback=None, user_agent='wswp', proxies=None,
                     num_retries=1, max_threads=10, timeout=60):
    """Crawl using multiple threads
    """
    # the queue of URL's that still need to be crawled
    urllist = MongoQueue()  # its peek()/truth test checks for records with status 0 and returns True or False

    def process_queue():
        while True:
            # fetch the next URL to process
            try:
                url = urllist.pop()
                print('url', url)
                D = Download()
                D.Downloader(url)
            except KeyError:
                # currently no urls to process
                break

    # wait for all download threads to finish
    threads = []
    while threads or urllist:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        print(urllist.peek() is True)
        if urllist.peek():
            while len(threads) < max_threads:
                # can start some more threads
                thread = threading.Thread(target=process_queue)
                thread.setDaemon(True)  # set daemon so main thread can exit when receives ctrl-c
                thread.start()
                threads.append(thread)
        else:
            break
        time.sleep(SLEEP_TIME)
def __init__(self, max_urls=1000):
    self.max_urls = max_urls
    # http://m.biquge.biz/top/allvisit_1/
    # http://m.benbenwx.com/top/allvisit_1/
    # http://m.moliwenxue.com/top/allvisit_1/
    # http://m.boluoxs.com/top/allvisit_1/
    self.seed_url = 'http://m.boluoxs.com/top/allvisit_1/'
    self.queue = MongoQueue()
    self.book_data = BooKListDao()
def thread_crawler(seed_url, user_agent="wswp", headers=None, proxies=None, num_retries=2, cache=None, scrape_callback=None, max_threads_num=5): """crawl webpage use multipe threads""" crawl_queue = MongoQueue() crawl_queue.push(seed_url) D = Downloader(1, user_agent, headers, proxies, num_retries, cache) def process_task(): while True: try: url = crawl_queue.pop() except KeyError: print("currentlt no urls to process") break else: print("Downloading Thread name is ", sys.thread_info.name) html = D(url) if scrape_callback: try: links = scrape_callback() or [] except Exception as e: print("Error in callback for {}: {}".format(url, e)) else: for link in links: link = normalize(seed_url, link) crawl_queue.push(link) crawl_queue.complete(url) threads = [] while threads or crawl_queue: # the crawl is still alive for thread in threads: if not thread.is_alive(): threads.remove(thread) while len(threads) < max_threads_num and crawl_queue: thread = Thread(target=process_task) thread.setDaemon(True) thread.start() threads.append(thread) time.sleep(SLEEP_TIME)
def thread_crawl(seed_url, max_threads=10, delay=5, user_agent='Aurora-Twinkle', proxies=None,
                 max_retries=1, scrape_callback=None, cache=None):
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url)
    D = Downloader(delay=delay, user_agent=user_agent, proxies=proxies,
                   max_retries=max_retries, cache=cache)
    rp = get_robots(seed_url)

    def process_queue():
        while True:
            try:
                url = crawl_queue.pop()
            except IndexError:
                break
            else:
                if rp.can_fetch(user_agent, url):
                    html = D(url)
                    if scrape_callback:
                        try:
                            links = scrape_callback(url, html) or []
                        except Exception as e:
                            print("Error in callback for :{}:{}".format(url, e))
                        else:
                            for link in links:
                                link = format_link(seed_url, link)
                                crawl_queue.push(link)
                    crawl_queue.complete(url)
                else:
                    print('user_agent: "' + user_agent + '" Blocked by robots.txt:', url)

    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
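The get_robots() helper called above is not defined anywhere in this collection; presumably it just wraps the standard robots.txt parser. A minimal sketch under that assumption:

# Sketch of the get_robots() helper used above (assumed, not shown in the snippet):
# parse the site's robots.txt with the standard library robotparser.
try:
    from urllib import robotparser        # Python 3
    from urllib.parse import urljoin
except ImportError:
    import robotparser                    # Python 2
    from urlparse import urljoin


def get_robots(seed_url):
    """Return a RobotFileParser loaded from seed_url's robots.txt."""
    rp = robotparser.RobotFileParser()
    rp.set_url(urljoin(seed_url, '/robots.txt'))
    rp.read()
    return rp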
def threaded_crawler(delay=5, cache=None, scrape_callback=None, user_agent='wswp', proxies=None,
                     num_retries=1, max_threads=10, timeout=60):
    """Crawl using multiple threads
    """
    # the queue of URL's that still need to be crawled
    urllist = MongoQueue()  # its truth test checks for records with status 0 and returns True or False
    D = Download()
    loop = asyncio.get_event_loop()
    while True:
        try:
            url = urllist.pop()
            print(url)
            # note: this schedules ten references to the same download task, not ten downloads
            tasks = [asyncio.ensure_future(D.Downloader(url))] * 10
            loop.run_until_complete(asyncio.wait(tasks))
            urllist.complete(url)
        except KeyError:
            break
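The loop above awaits ten copies of the same download task. If the intent was to download several queued URLs concurrently, a batched variant might look like the sketch below; it assumes D.Downloader(url) is a coroutine and that urllist is the same MongoQueue used in this snippet.

# Sketch of draining the Mongo queue in batches with asyncio (an assumption-based
# alternative to awaiting one URL ten times).
import asyncio


async def crawl_batches(urllist, D, batch_size=10):
    while True:
        batch = []
        try:
            # claim up to batch_size distinct URLs from the queue
            for _ in range(batch_size):
                batch.append(urllist.pop())
        except KeyError:
            pass  # the queue is (currently) empty
        if not batch:
            break
        # download the whole batch concurrently, then mark each URL complete
        await asyncio.gather(*(D.Downloader(url) for url in batch))
        for url in batch:
            urllist.complete(url)

# usage (hypothetical): asyncio.run(crawl_batches(urllist, D))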
def __init__(self, max_urls=1000):
    self.max_urls = max_urls
    # http://m.biquge.biz/top/allvisit_1/
    # http://m.benbenwx.com/top/allvisit_1/
    # http://m.moliwenxue.com/top/allvisit_1/
    # http://m.boluoxs.com/top/allvisit_1/
    # self.seed_url = 'http://m.junzige.la/top/allvisit_400/'
    self.urls = []
    # self.seed_url = 'http://www.junzige.la/'
    self.seed_url = 'http://www.boluoxs.com/'
    self.queue = MongoQueue()
    self.book_data = BooKCatlogDao()
def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None, user_agent='wswp',
                     proxies=None, num_retries=1, max_threads=10, timeout=60):
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url)
    D = Download(cache=cache, delay=delay, user_agent=user_agent, proxies=proxies,
                 num_retries=num_retries, timeout=timeout)

    def process_queue():
        while True:
            try:
                url = crawl_queue.pop()
            except IndexError:
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print(f'Error in callback for:{url}:{e}')
                    else:
                        for link in links:
                            crawl_queue.push(normalize(seed_url, link))
                crawl_queue.complete(url)

    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():
            thread = threading.Thread(target=process_queue)
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None, user_agent='wswp',
                     proxies=None, num_retries=1, max_threads=10, timeout=60):
    """Crawl using multiple threads
    """
    # the queue of URL's that still need to be crawled
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url)
    D = Downloader(cache=cache, delay=delay, user_agent=user_agent, proxies=proxies,
                   num_retries=num_retries, timeout=timeout)

    def process_queue():
        while True:
            # fetch the next URL to process
            try:
                url = crawl_queue.pop()
            except KeyError:
                # currently no urls to process
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print 'Error in callback for: {}: {}'.format(url, e)
                    else:
                        for link in links:
                            # add this new link to queue
                            crawl_queue.push(normalize(seed_url, link))
                crawl_queue.complete(url)

    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)  # set daemon so main thread can exit when receives ctrl-c
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
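normalize() is called by most of these crawlers but never defined here; in the wswp code it strips the URL fragment and resolves the link against the seed URL. A sketch under that assumption:

# Sketch of the normalize() helper the crawlers call (assumed to match the wswp
# version): drop the hash fragment and turn a relative link into an absolute URL.
try:
    from urllib.parse import urldefrag, urljoin   # Python 3
except ImportError:
    from urlparse import urldefrag, urljoin       # Python 2


def normalize(seed_url, link):
    """Remove the hash fragment and resolve the link relative to seed_url."""
    link, _ = urldefrag(link)        # 'page.html#about' -> 'page.html'
    return urljoin(seed_url, link)   # relative link -> absolute URL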
def main(max_threads=5):
    catlog_callback = AlexaCallback()
    cache = MongoCache()
    queue = MongoQueue()
    client = MongoClient('localhost', 27017, connect=False)
    # the cache database stores the cached webpages,
    # which is the equivalent of a table in a relational database
    db = client.cache
    cursor = db.books.find()
    urls = []
    while cursor.alive:
        temp = cursor.next()
        temp = temp['link']
        if urlparse.urlparse(catlog_callback.seed_url).netloc == 'www.junzige.la':
            temp = '/novel' + temp[5:-4] + '/'
            temp = normalize(catlog_callback.seed_url, temp)
        elif urlparse.urlparse(catlog_callback.seed_url).netloc == 'www.boluoxs.com':
            temp = 'http://www.boluoxs.com/biquge/0/' + temp[temp.rfind('/') + 1:temp.rfind('.')] + '/'
        print temp
        urls.append(temp)
    print urls[0]
    while True:
        now = datetime.now()
        if now.hour < 3 or now.hour > 12:
            queue.repairFast()
            process_crawler(urls, scrape_callback=catlog_callback, cache=cache,
                            max_threads=max_threads, timeout=30,
                            host=urlparse.urlparse(catlog_callback.seed_url).netloc,
                            user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36')
            # every time finished, clear the job queue
            queue.clear()
        else:
            print 'pass:' + str(now)
        time.sleep(3600)
import pandas as pd
import re
import numpy as np
from process_crawler import process_crawler
from mongo_queue import MongoQueue
from mongo_cache import MongoCache
from mongo_info import MongoInfo
from downloader import Downloader
from lxml import etree

crawl_queue = MongoQueue()
webpage_cache = MongoCache()

DEFAULT_AGENT = {}
DEFAULT_DELAY = 5
DEFAULT_RETRIES = 1
DEFAULT_TIMEOUT = 100
DEFAULT_PROXY_LIST = '/Users/apple/Desktop/connect/proxylist/proxies.csv'
DEFAULT_COOKIE = {}

D = Downloader(delay=DEFAULT_DELAY, user_agent=DEFAULT_AGENT, proxies=DEFAULT_PROXY_LIST,
               cookies=DEFAULT_COOKIE, num_retries=DEFAULT_RETRIES, timeout=DEFAULT_TIMEOUT,
               opener=None, cache=MongoCache())


def usere(regex, getcontent):
    # helper that applies a regular expression and returns all matches
    pattern = re.compile(regex)
    content = re.findall(pattern, getcontent)
    return content


# Obtain target urls
startdate = '20180414'
enddate = '20180415'
import pandas as pd
import re
from process_crawler import process_crawler
from mongo_queue import MongoQueue
from mongo_cache import MongoCache
from mongo_info import MongoInfo
from downloader import Downloader

crawl_queue = MongoQueue()
crawl_queue.turn_down()
webpage_cache = MongoCache()

DEFAULT_AGENT = {}
DEFAULT_DELAY = 5
DEFAULT_RETRIES = 1
DEFAULT_TIMEOUT = 100
DEFAULT_PROXY_LIST = '/Users/apple/Desktop/connect/proxylist/proxies.csv'
DEFAULT_COOKIE = {}

D = Downloader(delay=DEFAULT_DELAY, user_agent=DEFAULT_AGENT, proxies=DEFAULT_PROXY_LIST,
               cookies=DEFAULT_COOKIE, num_retries=DEFAULT_RETRIES, timeout=DEFAULT_TIMEOUT,
               opener=None, cache=MongoCache())


def usere(regex, getcontent):
    # helper that applies a regular expression and returns all matches
    pattern = re.compile(regex)
    content = re.findall(pattern, getcontent)
    return content


# Clear Cache
# crawl_queue.clear()
class Mzitu_crawler():
    current_dir = r'E:\sunchengquan\PycharmProjects\mzitu'  # os.path.dirname(__file__)
    crawl_queue = MongoQueue('crawl_queue')
    img_queue = MongoQueue('img_queue')
    max_threads = 16
    sleep_time = 1

    def url_open(self, url, headers={}):
        """Open the link through a proxy IP, retrying with a new proxy on failure."""
        response = ""
        while response == "":
            try:
                print("proxy IP:", self.proxy)
                response = get_page(url, proxies=self.proxy, timeout=30, options=headers)
                return response
            except:
                self.proxy = MongoClient().random()
                continue

    def pageurl_crawler(self, lock):
        while 1:
            try:
                url = self.crawl_queue.pop()
                print(url)
            except KeyError:
                print('the queue has no data')
                break
            else:
                img_urls = {}
                title = self.crawl_queue.pop_title(url)
                title = re.sub('[?,。;:、,.;:?!!·]', '', title)  # strip punctuation so the title can be used as a folder name
                self.mkdir(title)
                response = requests.get(url)
                web_title = BeautifulSoup(response.text, 'lxml').find('title').get_text()
                if '妹子图' in web_title:
                    max_span = BeautifulSoup(response.text, 'lxml').find(
                        'div', class_='pagenavi').find_all('span')[-2].get_text()
                    lock.acquire()
                    path = self.current_dir + '\\' + title
                    for page in range(1, int(max_span) + 1):
                        page_url = url + '/' + str(page)
                        img_url = BeautifulSoup(
                            requests.get(page_url).text, 'lxml').find(
                                'div', class_='main-image').find('img')['src']
                        img_urls[img_url] = page_url
                        self.save(img_url, page_url, path)
                    self.crawl_queue.complete(url)
                    self.img_queue.push_imgurl(title, img_urls)
                    lock.release()

    def mkdir(self, path):
        path = path.strip()
        isExists = os.path.exists(os.path.join(self.current_dir, path))
        if not isExists:
            print('created a folder named', path)
            os.makedirs(os.path.join(self.current_dir, path))
            return True
        else:
            print('a folder named', path, 'already exists')
            return False

    def save(self, img_url, page_url, path):
        name = img_url[-9:-4]
        print('saving:', img_url)
        header = {'Referer': page_url}
        img = self.url_open(img_url, headers=header)
        content = img.content
        time.sleep(0.5)
        f = open(path + '\\' + name + '.jpg', 'wb')
        f.write(content)
        f.close()

    def thread_crawler(self):
        threads = []
        while threads or self.crawl_queue:
            """
            crawl_queue is used here: this is where our __bool__ method matters.
            While it is truthy, the MongoDB queue still holds data; as long as either
            threads or crawl_queue is truthy the download is not finished, so keep running.
            """
            for thread in threads:
                if not thread.is_alive():  # is_alive() checks whether the thread is still running; finished threads are removed
                    threads.remove(thread)
            while len(threads) < self.max_threads:  # the pool has fewer threads than max_threads
                lock = threading.Lock()
                thread = threading.Thread(target=self.pageurl_crawler, args=(lock,))  # create a thread
                thread.setDaemon(True)  # set it as a daemon thread
                thread.start()  # start the thread
                threads.append(thread)  # add it to the thread pool
            time.sleep(self.sleep_time)
# XPaths for other query fields (kept commented out for reference)
# dateclick = '//*[@id="TAB_QueryConditionItem291"]'
# dyrclick = '//*[@id="TAB_queryTextItem_82"]'
# dyr = '//*[@id="TAB_queryTextItem_82"]'
# tdytclick = '//*[@id="TAB_QueryConditionItem282"]'
# tdyt = '//*[@id="TAB_queryTblEnumItem_282"]'
# searchkey = '//*[@id="TAB_QueryButtonControl"]'

# XPaths for the transfer (转让) query
startkey = '//*[@id="TAB_queryDateItem_277_1"]'
endkey = '//*[@id="TAB_queryDateItem_277_2"]'
dateclick = '//*[@id="TAB_QueryConditionItem277"]'
userkey = '//*[@id="TAB_queryTextItem_275"]'
userclick = '//*[@id="TAB_QueryConditionItem275"]'
searchclick = '//*[@id="TAB_QueryButtonControl"]'

crawl_queue = MongoQueue()
browser = webdriver.Chrome()
browser.get(url)
browser.find_element_by_xpath(dateclick).click()
browser.find_element_by_xpath(userclick).click()

# build month-by-month date ranges from 2009 through 2016
datelist = []
for year in range(2009, 2017):
    for month in range(1, 13):
        startday, endday = calendar.monthrange(year, month)
        datelist.append([str(year) + '-' + str(month) + '-' + str(1),
                         str(year) + '-' + str(month) + '-' + str(endday)])

for date in datelist:
    returnlist = list(pd.read_csv(CONNECT_PATH + 'waiting.csv')['url'])
    browser.find_element_by_xpath(startkey).clear()
    browser.find_element_by_xpath(endkey).clear()
def threaded_crawler(seed_url, link_regex=None, delay=1, cache=None, scrape_callback=None,
                     user_agent='Safari', proxies=None, num_retries=1, max_threads=10, timeout=60):
    """
    Multithreaded crawler: several threads share one queue, with MongoDB used as the queue.
    """
    # crawl_queue = [seed_url]
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url)
    # seen = set([seed_url])
    # blacklisted sites
    block_filename = os.path.join(BASEDIR, 'blocked_urls.txt')
    blocked_urls = [i.strip() for i in open(block_filename) if i.strip()] \
        if os.path.isfile(block_filename) else []
    # save_cache=False is only for testing
    D = Downloader(delay=delay, user_agent=user_agent, proxies=proxies, num_retries=num_retries,
                   timeout=timeout, cache=cache, save_cache=False, blocked_urls=blocked_urls)

    def process_queue():
        while 1:
            try:
                url = crawl_queue.pop()
            except (IndexError, KeyError):
                # stop when the queue is empty
                break
            else:
                html = D(url) if url else None
                if html and scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                        if link_regex:
                            links.extend(link for link in get_links(html)
                                         if re.match(link_regex, link))
                    except Exception as e:
                        print 'Error in callback for: {}: {}'.format(url, e)
                    else:
                        for link in links:
                            link = normalize(seed_url, link)
                            crawl_queue.push(link)  # enqueue
                            # if link not in seen:
                            #     seen.add(link)
                # print html
                # if html:
                #     # mark as complete
                #     crawl_queue.complete(url)
                crawl_queue.complete(url)

    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue:
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
"Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2", "Mozilla/5.0 (Linux; U; Android 0.5; en-us) AppleWebKit/522 (KHTML, like Gecko) Safari/419.3", "Mozilla/5.0 (Linux; U; Android 1.1; en-gb; dream) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2", "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Droid Build/ESD20) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17", "Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17", "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Sprint APA9292KT Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", "Mozilla/5.0 (Linux; U; Android 2.2; en-us; ADR6300 Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", "Mozilla/5.0 (Linux; U; Android 2.2; en-ca; GT-P1000M Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", "Mozilla/5.0 (Linux; U; Android 3.0.1; fr-fr; A500 Build/HRI66) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13", "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2", "Mozilla/5.0 (Linux; U; Android 1.6; es-es; SonyEricssonX10i Build/R1FA016) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1", "Mozilla/5.0 (Linux; U; Android 1.6; en-us; SonyEricssonX10i Build/R1AA056) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1", ] headers2 = headers = {'User-Agent': 'User-Agent:Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'} headers = [headers1, headers2] baike_queue = MongoQueue("scrawler", "baidubaike") start_url = ["https://baike.baidu.com/item/%E5%88%98%E5%BE%B7%E5%8D%8E/114923", "https://baike.baidu.com/item/%E4%B8%AD%E5%9B%BD/1122445", "https://baike.baidu.com/item/%E9%98%BF%E5%B0%94%E4%BC%AF%E7%89%B9%C2%B7%E7%88%B1%E5%9B%A0%E6%96%AF%E5%9D%A6/127535?fromtitle=%E7%88%B1%E5%9B%A0%E6%96%AF%E5%9D%A6&fromid=122624&fr=aladdin", "https://baike.baidu.com/item/%E6%95%B0%E5%AD%A6/107037?fr=aladdin", "https://baike.baidu.com/item/%E4%BD%93%E8%82%B2", "https://baike.baidu.com/item/%E7%BE%8E%E9%A3%9F", "https://baike.baidu.com/item/%E5%8C%BB%E5%AD%A6", "https://baike.baidu.com/item/%E7%A7%91%E5%AD%A6%E6%8A%80%E6%9C%AF?fromtitle=%E7%A7%91%E6%8A%80&fromid=662906", "https://baike.baidu.com/item/%E8%8B%B9%E6%9E%9C/5670" ] adapter = requests.adapters.HTTPAdapter(max_retries=20) def start(): if baike_queue.db.count() == 0: for url in start_url: