# Module-level imports required by crawl()
import urllib.request

from bs4 import BeautifulSoup

from JLModel import News


def crawl(self):
    """Take one URL from the shared queue, fetch it, store it if it is a
    news page, and enqueue any further crawlable links found on the page."""
    url = self.que.get()
    # Mark the URL as visited under the lock; skip it if another thread
    # has already processed it.
    with self.rlock:
        if url in self.visited:
            return
        self.visited.add(url)
    try:
        req = urllib.request.Request(url, headers={
            'Connection': 'Keep-Alive',
            'Accept': 'text/html, application/xhtml+xml, */*',
            'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'
        })
        # news.163.com pages are served as GBK, so decode accordingly.
        html = urllib.request.urlopen(req, timeout=30).read().decode('gbk')
        if html:
            soup = BeautifulSoup(html, 'lxml')
            if self.is_news_url(url):
                print(url)
                if url not in self.news:
                    news = News(soup, url)
                    if len(news.content) > 0:
                        news.add()  # persist the article
                        with self.rlock:
                            self.news.add(url)
                else:
                    print(url + ' already exists')
            # Enqueue every crawlable link on the page. is_news_url() returns
            # a normalized article URL (or a falsy value), so prefer that
            # form when it is available.
            for link in soup.find_all('a'):
                link_url = link.get('href')
                if self.is_crawl_url(link_url):
                    news_url = self.is_news_url(link_url)
                    self.que.put(news_url if news_url else link_url)
    except Exception as e:
        if self.is_news_url(url):
            print('err:' + url)
        print('exception:' + str(e))
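# crawl() relies on two URL filters, is_news_url() and is_crawl_url(), that are
# not shown in this section. The sketch below is only an illustration of the
# behaviour crawl() assumes: is_news_url() returns a canonical article URL (or
# a falsy value) and is_crawl_url() restricts the crawl to the news.163.com
# site. The regular expressions are assumptions about the URL layout, not the
# project's actual rules, and in the crawler these would be methods rather
# than module-level functions.

import re

_NEWS_RE = re.compile(r'^https?://news\.163\.com/\d{2}/\d{4}/\d{2}/\w+\.html')
_CRAWL_RE = re.compile(r'^https?://news\.163\.com/')


def is_news_url(url):
    """Return the canonical article URL if `url` looks like a news page,
    otherwise None (falsy), matching how crawl() uses the return value."""
    if not url:
        return None
    m = _NEWS_RE.match(url)
    return m.group(0) if m else None


def is_crawl_url(url):
    """Only follow links that stay on the news.163.com site."""
    return bool(url) and _CRAWL_RE.match(url) is not None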
# coding:utf-8
from JLQueue import JLQueue
from JLThread import ThreadPool
from JLModel import News
from JLModel import app

# Already-stored news URLs, shared crawl queue, visited set and thread list.
g_news = News.get_all_news()
g_queue = JLQueue()
g_visited = set()
g_threads = []

# Seed the queue with the start page.
initial_url = 'http://news.163.com/'
g_queue.put(initial_url)

# Start 11 crawler threads that share the queue, visited set and news set.
g_threadpool = ThreadPool(11, g_threads, g_queue, g_visited, g_news)
g_threadpool.start()

# Run the web server. app.run() blocks until the server stops, after which
# the main thread waits for the crawler threads to finish.
app.run(host="127.0.0.1", port=25000)

g_threadpool.wait_for_complete()
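# The JLThread.ThreadPool class used above is not shown in this section. The
# sketch below only illustrates the interface the main script assumes (the
# constructor arguments, start() and wait_for_complete()); the worker body is
# a placeholder for the real crawl() loop, and the actual implementation may
# differ.

import threading


class ThreadPool:
    """Minimal sketch of the interface the main script expects."""

    def __init__(self, thread_num, threads, que, visited, news):
        self.thread_num = thread_num
        self.threads = threads          # list supplied by the caller
        self.que = que                  # shared URL queue
        self.visited = visited          # URLs already crawled
        self.news = news                # URLs already stored as news
        self.rlock = threading.RLock()  # guards the shared sets

    def start(self):
        # One worker per slot; each repeatedly takes a URL off the queue.
        for _ in range(self.thread_num):
            t = threading.Thread(target=self._worker, daemon=True)
            self.threads.append(t)
            t.start()

    def _worker(self):
        while not self.que.empty():
            url = self.que.get()
            print('crawling', url)      # placeholder for the real crawl()
            with self.rlock:
                self.visited.add(url)

    def wait_for_complete(self):
        for t in self.threads:
            t.join()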