Example #1
    def __init__(self):
        self.linker = Linker()
        self.err_handler = hanerr.ErrorHandler()
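
This constructor relies on two project helpers, Linker and hanerr.ErrorHandler, whose implementations are not part of the example. The sketch below is only a guess at the minimal interface the rest of the spider calls (Linker.direct(url, change=...) returning a requests-style response, and an ErrorHandler exposing a class-level err_q plus handle_linkerror(news_id, err)); the real modules may differ.

import requests
from Queue import Queue


class Linker(object):
    # Hypothetical stand-in: a thin wrapper around requests whose direct()
    # call is the only method BBCSpider uses.
    def direct(self, url, change=1):
        # 'change' presumably toggles header/proxy rotation; ignored here.
        return requests.get(url, timeout=10)


class ErrorHandler(object):
    # Hypothetical error sink: failed downloads are queued as (news_id, error).
    err_q = Queue()

    def handle_linkerror(self, news_id, err):
        ErrorHandler.err_q.put((news_id, err))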
Example #2
# Standard-library and third-party imports used by this example; Linker,
# hanerr, match_message and the insert_*/update_* helpers come from the
# surrounding crawler project and are not shown here.
import threading
from datetime import datetime
from Queue import Queue

import mysql.connector
from lxml import etree


class BBCSpider:
    q = Queue()
    cnx = mysql.connector.connect(user='******', password='******', database='crawl')
    cursor = cnx.cursor()

    def __init__(self):
        self.linker = Linker()
        self.err_handler = hanerr.ErrorHandler()

    def catch_news_urls(self):
        # Write the link and title of every news item in the BBC search results to the database
        i = 0
        flag = True
        while flag:
            # Keep paging automatically until the page content is empty (whitespace only)
            i += 1
            page_url = 'http://www.bbc.co.uk/search/more?page={}&q=china%20vietnam'.format(i)
            print 'page:' + str(i)
            for try_times in range(5):
                try:
                    response = self.linker.direct(page_url, change=0)
                    response.raise_for_status()
                except Exception:
                    # Retry on connection or HTTP errors
                    continue
                if response.text.strip():
                    selector = etree.HTML(response.text)
                    urls = selector.xpath('//ol[@class="search-results results"]//a[@class="rs_touch"]/@href')
                    titles = selector.xpath('//ol[@class="search-results results"]/li//div/h1//text()')
                    BBCSpider.q.put((urls, titles))
                    break
                else:
                    print '[END]'
                    flag = False
                    break

    def catch_html(self, news_id, html, url):
        # Fetch and queue the page source for url
        if html:
            print '[skipped: html already stored]'
            return
        for i in range(5):
            try:
                r = self.linker.direct(url)
                html = r.text
                BBCSpider.q.put((news_id, html))
                break
            except BaseException as err:
                self.err_handler.handle_linkerror(news_id, err)

    @staticmethod
    def catch_news_message(news_list):
        # Extract the news content (publish time, summary, body) from the stored html,
        # skipping records that have already been matched.

        # each[0] is the id, each[1] the html, each[2] the match_time
        for each in news_list:
            if each[2]:
                print '[skipped: already matched]'
                continue
            try:
                # Run MatchManager to extract the content
                body, public_time = match_message.MatchManager(str(each[1])).main_match()
                match_time = datetime.now().date()
                BBCSpider.q.put((each[0], public_time, body, match_time))  # (id, publish time, body, match time)
            except BaseException as err:
                hanerr.ErrorHandler.err_q.put((each[0], err))  # (id, error)

    @staticmethod
    def threading_caturl():
        threads = [threading.Thread(target=BBCSpider().catch_news_urls), threading.Thread(target=insert_bbc_url)]
        for each in threads:
            each.start()
        for each in threads:
            each.join()

    @staticmethod
    def threading_download():
        query_sql = 'SELECT ID, html, url FROM bbc_test'
        BBCSpider.cursor.execute(query_sql)
        rows = BBCSpider.cursor.fetchall()
        threads = []
        for each in rows:
            t = threading.Thread(target=BBCSpider().catch_html, args=(each[0], each[1], each[2]))
            t.start()
            threads.append(t)
        threading.Thread(target=update_bbc_html).start()
        threading.Thread(target=hanerr.basic_handle).start()
        for t in threads:
            t.join()

    @staticmethod
    def threading_match():
        query_sql = 'SELECT ID, html, match_time FROM bbc_test'
        BBCSpider.cursor.execute(query_sql)
        news_list = BBCSpider.cursor.fetchall()
        threads = [threading.Thread(target=BBCSpider.catch_news_message, args=(news_list,)),
                   threading.Thread(target=update_bbc_message),
                   threading.Thread(target=hanerr.basic_handle)]
        for each in threads:
            each.start()
        for each in threads:
            each.join()

    @staticmethod
    def has_url(url):
        # Parameterised query avoids breaking on quotes in the URL
        query = 'SELECT 1 FROM bbc_test WHERE url = %s LIMIT 1'
        BBCSpider.cursor.execute(query, (url,))
        return BBCSpider.cursor.fetchone() is not None
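
The queue consumers referenced above (insert_bbc_url, update_bbc_html, update_bbc_message) are defined elsewhere in the project. As a rough sketch of the expected shape, and assuming bbc_test has url and title columns, insert_bbc_url could drain BBCSpider.q and insert each new link, stopping once the queue stays empty:

from Queue import Empty


def insert_bbc_url():
    # Hypothetical consumer for the (urls, titles) batches produced by
    # catch_news_urls(); the real termination logic is project-specific,
    # here it simply stops after the queue has been empty for 60 seconds.
    insert_sql = 'INSERT INTO bbc_test (url, title) VALUES (%s, %s)'
    while True:
        try:
            urls, titles = BBCSpider.q.get(timeout=60)
        except Empty:
            break
        for url, title in zip(urls, titles):
            if not BBCSpider.has_url(url):
                BBCSpider.cursor.execute(insert_sql, (url, title))
        BBCSpider.cnx.commit()

With those pieces in place, the usual call order is BBCSpider.threading_caturl() to collect links, BBCSpider.threading_download() to fetch the html for each row, and finally BBCSpider.threading_match() to extract the publish time and body.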