コード例 #1
0
    def crawl(self):
        """Take one URL from the queue, fetch it, and enqueue the links found.

        Steps, in order:

        1. Pop a URL from ``self.que`` and claim it in ``self.visited``;
           return immediately if it was already claimed.
        2. Download the page (30 s timeout) and decode it as GBK.
        3. If ``self.is_news_url(url)`` is truthy and the URL is not yet in
           ``self.news``, build a ``News`` object and, when its content is
           non-empty, store it via ``news.add()`` and record the URL.
        4. Put every ``<a href>`` accepted by ``self.is_crawl_url`` back on
           the queue.

        Returns:
            None. Any exception raised while fetching or parsing is caught;
            it is printed only when the failing URL is a news URL, otherwise
            it is dropped silently (best-effort crawling).
        """
        url = self.que.get()

        # Test-and-add must happen under the same lock acquisition; checking
        # outside the lock lets two workers claim the same URL.
        with self.rlock:
            if url in self.visited:
                return
            self.visited.add(url)

        try:
            req = urllib.request.Request(url, headers={
                'Connection': 'Keep-Alive',
                'Accept': 'text/html, application/xhtml+xml, */*',
                'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'
            })
            # The context manager closes the HTTP response (and its socket)
            # even when read() or decode() raises.
            with urllib.request.urlopen(req, timeout=30) as response:
                html = response.read().decode('gbk')
            if not html:
                return

            soup = BeautifulSoup(html, 'lxml')
            if self.is_news_url(url):
                print(url)
                if url not in self.news:
                    news = News(soup, url)
                    if len(news.content) > 0:
                        news.add()
                        # Record the URL only after add() succeeded.
                        with self.rlock:
                            self.news.add(url)
                else:
                    print(url + ' is exist')

            for link in soup.find_all('a'):
                link_url = link.get('href')
                if not link_url:
                    # Anchors without an href yield None; skip them instead
                    # of handing None to the URL predicates.
                    continue
                if self.is_crawl_url(link_url):
                    # Queue the truthy result of is_news_url() when there is
                    # one, otherwise the raw link.
                    # NOTE(review): presumably is_news_url() returns a
                    # normalized URL on success - confirm against its
                    # definition.
                    news_target = self.is_news_url(link_url)
                    self.que.put(news_target if news_target else link_url)
        except Exception as e:
            # Best-effort: only failures on news pages are reported.
            if self.is_news_url(url):
                print('err:' + url)
                print('exception:' + str(e))
コード例 #2
0
ファイル: run.py プロジェクト: ldp940622/netease-crawler
# coding:utf-8
from JLQueue import JLQueue
from JLThread import ThreadPool
from JLModel import News
from JLModel import app

# Shared crawler state, created once at module level and handed to the pool.
# NOTE(review): g_news is presumably the collection of already-stored news
# URLs - confirm against News.get_all_news().
g_news = News.get_all_news()
g_queue = JLQueue()
g_visited = set()
# g_news = set()
g_threads = []
inital_url = 'http://news.163.com/'

# Seed the queue with the start URL before the pool is started.
g_queue.put(inital_url)
# NOTE(review): the first argument (11) is presumably the number of worker
# threads - confirm against JLThread.ThreadPool.
g_threadpool = ThreadPool(11, g_threads, g_queue, g_visited, g_news)
g_threadpool.start()
# run server
# JLServer.run()
# NOTE(review): if app.run() blocks until the server stops, the
# wait_for_complete() call below is reached only after shutdown - confirm
# this ordering is intended.
app.run(host="127.0.0.1", port=25000)
g_threadpool.wait_for_complete()
# print(g_visited)