import threading
from Queue import Queue          # Python 2 standard-library queue
from datetime import datetime

import mysql.connector
from lxml import etree

import hanerr
import match_message
# Linker, insert_bbc_url, update_bbc_html and update_bbc_message are
# project-local names; the modules that define them are not shown here.


class BBCSpider:
    # Shared work queue: crawler threads produce items, DB-writer threads consume them.
    q = Queue()
    # Shared MySQL connection and cursor (credentials masked in the original).
    cnx = mysql.connector.connect(user='******', password='******', database='crawl')
    cursor = cnx.cursor()

    def __init__(self):
        self.linker = Linker()
        self.err_handler = hanerr.ErrorHandler()

    def catch_news_urls(self):
        # Write the link and title of every news item in the BBC search
        # results into the database (via the shared queue).
        i = 0
        flag = True
        while flag:  # page forward automatically until a page comes back empty (whitespace only)
            i += 1
            page_url = 'http://www.bbc.co.uk/search/more?page={}&q=china%20vietnam'.format(i)
            print 'page:' + str(i)
            for try_times in range(5):
                try:
                    response = self.linker.direct(page_url, change=0)
                    response.raise_for_status()
                except Exception:
                    continue
                if response.text.strip():
                    selector = etree.HTML(response.text)
                    urls = selector.xpath('//ol[@class="search-results results"]//a[@class="rs_touch"]/@href')
                    titles = selector.xpath('//ol[@class="search-results results"]/li//div/h1//text()')
                    BBCSpider.q.put((urls, titles))
                    break
                else:
                    print '[END]'
                    flag = False
                    break

    def catch_html(self, news_id, html, url):
        # Fetch the page source for url and queue it; skip records that
        # already have html stored.
        if html:
            print '[auto-skip]'
            return
        for i in range(5):
            try:
                r = self.linker.direct(url)
                html = r.text
                BBCSpider.q.put((news_id, html))
                break
            except BaseException as err:
                self.err_handler.handle_linkerror(news_id, err)

    @staticmethod
    def catch_news_message(news_list):
        # Extract the news content (publication time, summary, body) from the
        # stored html; records that already have a match_time were parsed
        # earlier and are skipped.
        # each[0] is the id, each[1] the html, each[2] the match_time.
        for each in news_list:
            if each[2]:
                print '[auto-skip]'
                continue
            try:
                # Delegate the actual extraction to MatchManager.
                body, public_time = match_message.MatchManager(str(each[1])).main_match()
                match_time = datetime.now().date()
                BBCSpider.q.put((each[0], public_time, body, match_time))  # (id, publication time, body, match time)
            except BaseException as err:
                hanerr.ErrorHandler.err_q.put((each[0], err))  # (id, error)

    @staticmethod
    def threading_caturl():
        threads = [threading.Thread(target=BBCSpider().catch_news_urls),
                   threading.Thread(target=insert_bbc_url)]
        for each in threads:
            each.start()
        for each in threads:
            each.join()

    @staticmethod
    def threading_download():
        query_sql = 'SELECT ID, html, url FROM bbc_test'
        BBCSpider.cursor.execute(query_sql)
        temp = BBCSpider.cursor.fetchall()
        for each in temp:
            threading.Thread(target=BBCSpider().catch_html, args=(each[0], each[1], each[2])).start()
        threading.Thread(target=update_bbc_html).start()
        threading.Thread(target=hanerr.basic_handle).start()

    @staticmethod
    def threading_match():
        query_sql = 'SELECT ID, html, match_time FROM bbc_test'
        BBCSpider.cursor.execute(query_sql)
        news_list = BBCSpider.cursor.fetchall()
        threads = [threading.Thread(target=BBCSpider.catch_news_message, args=(news_list,)),
                   threading.Thread(target=update_bbc_message),
                   threading.Thread(target=hanerr.basic_handle)]
        for each in threads:
            each.start()
        for each in threads:
            each.join()

    @staticmethod
    def has_url(url):
        # Parameterised query instead of string interpolation, so quoting
        # characters in the url cannot break (or inject into) the SQL.
        query = 'SELECT 1 FROM bbc_test WHERE url=%s LIMIT 1'
        BBCSpider.cursor.execute(query, (url,))
        return BBCSpider.cursor.fetchone() is not None
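
# The queue-consumer targets (insert_bbc_url, update_bbc_html,
# update_bbc_message) are project-local and their code is not shown above.
# Purely as illustration, here is a minimal sketch of what an
# insert_bbc_url-style consumer could look like, assuming it drains
# BBCSpider.q and that bbc_test has url/title columns -- both assumptions,
# not confirmed by the source.
def insert_bbc_url_sketch():
    # Hypothetical helper, not the project's real insert_bbc_url.
    from Queue import Empty
    while True:
        try:
            urls, titles = BBCSpider.q.get(timeout=30)  # give up once producers go quiet
        except Empty:
            break
        for url, title in zip(urls, titles):
            if not BBCSpider.has_url(url):  # skip links that are already stored
                BBCSpider.cursor.execute('INSERT INTO bbc_test (url, title) VALUES (%s, %s)',
                                         (url, title))
        BBCSpider.cnx.commit()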
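
# A minimal driver sketch, not part of the original file: it assumes the three
# threading_* stages run in sequence -- collect search-result URLs, download
# each article's html, then parse the stored html. The ordering is inferred
# from the method names, not stated in the source; note threading_download
# starts its worker threads without joining them, so stages 2 and 3 should not
# be assumed to be strictly separated.
if __name__ == '__main__':
    BBCSpider.threading_caturl()    # stage 1: enqueue (urls, titles) and insert them
    BBCSpider.threading_download()  # stage 2: fetch html for rows that still lack it
    BBCSpider.threading_match()     # stage 3: extract body and publication time from html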