import hashlib
import importlib
import json
import re
import traceback

from lxml import etree
from scrapy import Request
from scrapy.linkextractors import LinkExtractor

# `r` (a Redis client) and `article_util` (page-setting helpers) are
# project-local objects assumed to be provided by the surrounding module.


def parse_roll(response):
    page_url = response.url
    data = article_util.get_page_setting(page_url)
    try:
        link = LinkExtractor(
            restrict_xpaths='//div[@class="paneT"]//ul[@class="iconBoxT14"]//li/a')
        links = link.extract_links(response)
        for link in links:
            url = link.url
            if url:
                # Skip index pages. str.find() returns -1 when the substring is
                # absent, so the original bare truthiness test skipped almost
                # every URL instead of just the index ones.
                if url.find('index.html') > -1:
                    continue
                data['link_text'] = link.text
                data['link_url'] = link.url
                data['page_function'] = 'parse_article'
                request = Request(url, dont_filter=True, priority=10, meta=data)
                yield request
    except Exception:
        r.sadd('article:crawl:news:error_page', page_url)
        article_util.remove_page_setting(page_url)
def parse_roll(response):
    page_url = response.url
    data = article_util.get_page_setting(page_url)
    # Parse the response body as JSON
    text1 = response.text
    try:
        json1 = json.loads(text1)
    except Exception:
        traceback.print_exc()
        r.sadd('article:crawl:news:error_page', response.url)
        return
    # Process the payload; a status code of 0 signals success
    if 'result' in json1 and 'status' in json1['result'] and 'code' in json1['result']['status'] \
            and json1['result']['status']['code'] == 0:
        if 'data' in json1['result']:
            # News list
            items = json1['result']['data']
            # Queue a detail request for each article
            for item in items:
                url = item['url']
                data['link_text'] = item['title']
                data['link_url'] = item['url']
                data['page_function'] = 'parse_article'
                request = Request(url, dont_filter=True, priority=10, meta=data)
                yield request
    else:
        r.sadd('article:crawl:news:error_page', response.url)
        article_util.remove_page_setting(page_url)
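
# Shape of the JSON payload parse_roll() above expects, reconstructed from its
# key accesses alone; the concrete values below are illustrative assumptions.
SAMPLE_ROLL_PAYLOAD = {
    'result': {
        'status': {'code': 0},  # 0 signals success
        'data': [
            {'url': 'https://example.com/article/1.html', 'title': 'Sample headline'},
        ],
    },
}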
def parse_roll(response):
    page_url = response.url
    data = article_util.get_page_setting(page_url)
    # Parse the response body as JSON
    text1 = response.text
    try:
        json1 = json.loads(text1)
        datas = json1['data']
    except Exception:
        traceback.print_exc()
        r.sadd('article:crawl:news:error_page', page_url)
        return
    # Process the payload. The original tested `data` (the page settings),
    # but the item list `datas` is what decides success here.
    if datas:
        # Queue a detail request for each article
        for item in datas:
            date = item['focus_date']
            url = item['url']
            data['link_text'] = item['title']
            data['link_url'] = item['url']
            data['page_function'] = 'parse_article'
            data['page_name'] = parse_page_name(page_url)
            data['title'] = item['title']
            data['original_source'] = item['source']
            data['publish_time_str'] = date
            data['publish_time'] = parse_time(date)
            request = Request(url, dont_filter=True, priority=10, meta=data)
            yield request
    else:
        r.sadd('article:crawl:news:error_page', response.url)
        article_util.remove_page_setting(page_url)
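
# parse_page_name() and parse_time(), used above, are not shown in this
# module. A minimal sketch of what they might look like, assuming focus_date
# strings such as '2019-08-01 12:30:00' and page URLs whose last path segment
# names the page; both are assumptions, not the project's actual implementation.
import time
from urllib.parse import urlparse


def parse_page_name(page_url):
    # Take the final non-empty path segment of the URL as the page name.
    path = urlparse(page_url).path
    return path.rstrip('/').rsplit('/', 1)[-1]


def parse_time(date_str):
    # Convert 'YYYY-MM-DD HH:MM:SS' into a Unix timestamp in seconds.
    return int(time.mktime(time.strptime(date_str, '%Y-%m-%d %H:%M:%S')))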
def parse_mil_ent(response):
    page_url = response.url
    data = article_util.get_page_setting(page_url)
    # Strip all parentheses to unwrap the JSONP-style padding around the JSON
    # body (note this would also remove parentheses inside string values)
    text1 = re.sub(u"\\(|\\)", '', response.text)
    try:
        json1 = json.loads(text1)
    except Exception:
        traceback.print_exc()
        r.sadd('article:crawl:news:error_page', response.url)
        return
    # Process the payload; a status of 0 signals success
    if 'data' in json1 and 'status' in json1 and json1['status'] == 0:
        # News list
        items = json1['data']['list']
        # Queue a detail request for each article
        for item in items:
            url = item['LinkUrl']
            data['link_text'] = item['Title']
            data['link_url'] = item['LinkUrl']
            data['page_function'] = 'parse_article'
            request = Request(url, dont_filter=True, priority=10, meta=data)
            yield request
    else:
        r.sadd('article:crawl:news:error_page', response.url)
        article_util.remove_page_setting(page_url)
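
# Shape of the payload parse_mil_ent() above expects once the parentheses are
# stripped, reconstructed from its key accesses; the values are illustrative
# assumptions. The raw body presumably arrives as '({"status": 0, ...})'.
SAMPLE_MIL_ENT_PAYLOAD = {
    'status': 0,  # 0 signals success
    'data': {
        'list': [
            {'LinkUrl': 'https://example.com/mil/1.html', 'Title': 'Sample headline'},
        ],
    },
}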
def parse_china(response):
    link = LinkExtractor(restrict_xpaths=['//li'])
    links = link.extract_links(response)
    page_url = response.url
    data = article_util.get_page_setting(page_url)
    # Queue a detail request for every link under the mainland-news section
    for link in links:
        url = link.url
        if url.startswith('https://news.sina.com.cn/c'):
            data['link_text'] = link.text
            data['link_url'] = link.url
            data['page_function'] = 'parse_china_article'
            request = Request(url, dont_filter=True, priority=10, meta=data)
            yield request
def parse_roll(response):
    page_url = response.url
    data = article_util.get_page_setting(page_url)
    tree_node = etree.HTML(response.text,
                           parser=etree.HTMLParser(encoding='utf-8'))
    c_node = tree_node.xpath(
        '//ul[contains(@class,"fin_newsList") and contains(@class,"cfix")]')
    # Use a relative XPath (.//) so the query stays scoped to the matched <ul>;
    # the original leading // searched the whole document again.
    cc_node = c_node[0].xpath('.//li[@class="cfix"]/h2') if c_node else []
    if cc_node:
        for ccc_node in cc_node:
            cc_url = ccc_node.xpath('a/@href')[0]
            data['link_text'] = ccc_node.xpath('a/text()')[0]
            data['link_url'] = cc_url
            data['page_function'] = 'parse_article'
            yield Request(cc_url, dont_filter=True, priority=10, meta=data)
    else:
        r.sadd('article:crawl:news:error_page', response.url)
        article_util.remove_page_setting(page_url)
def parse_roll_keji(response):
    page_url = response.url
    data = article_util.get_page_setting(page_url)
    try:
        link = LinkExtractor(
            restrict_xpaths='//div[@class="right_content"]/div[2]/ul/li')
        links = link.extract_links(response)
        for link in links:
            url = link.url
            if url:
                data['link_text'] = link.text
                data['link_url'] = link.url
                data['page_function'] = 'parse_article'
                request = Request(url, dont_filter=True, priority=10, meta=data)
                yield request
    except Exception:
        r.sadd('article:crawl:news:error_page', page_url)
        article_util.remove_page_setting(page_url)
def parse_roll(response):
    page_url = response.url
    data = article_util.get_page_setting(page_url)
    try:
        link = LinkExtractor(restrict_xpaths='//dd[@class="dd6401"]/a')
        links = link.extract_links(response)
        for link in links:
            url = link.url
            if url:
                # Skip WeChat (mp.weixin.qq.com) links
                if url.find('https://mp.weixin.qq.com') > -1:
                    continue
                data['link_text'] = link.text
                data['link_url'] = link.url
                data['page_function'] = 'parse_article'
                request = Request(url, dont_filter=True, priority=10, meta=data)
                yield request
    except Exception:
        r.sadd('article:crawl:news:error_page', page_url)
        article_util.remove_page_setting(page_url)
def parse(self, response):
    page_url = response.url
    data = article_util.get_page_setting(page_url)
    # Bail out when no page settings exist for this URL
    if not data:
        return None
    print(data['page_module'], data['page_package'], data['page_function'])
    # Dynamically import the parser module configured for this page
    lib = importlib.import_module('.' + data['page_module'], data['page_package'])
    # Resolve and call the configured parse function. getattr() replaces the
    # original eval('lib.%s(response)' % ...), which executed an arbitrary
    # string taken from the page settings.
    parse_func = getattr(lib, data['page_function'])
    for item in parse_func(response):
        # Forward any Request the page parser yielded
        if isinstance(item, Request):
            request = item
            data = request.meta
            url = data['link_url']
            main_url = data['page_url']
            # Deduplicate on the MD5 of "page_url:link_url"
            dupe_key = main_url + ':' + url
            dupe_key = hashlib.md5(dupe_key.encode('utf-8')).hexdigest()
            # Skip URLs that were already crawled or previously failed
            if self.r.sismember('article:crawl:news:urls', dupe_key) or self.r.sismember(
                    'article:crawl:news:error_page', url):
                print('already crawled --> ' + url)
                continue
            # Mark as crawled
            request.meta['dupe_key'] = dupe_key
            self.r.sadd('article:crawl:news:urls', dupe_key)
            # Finalize the request
            request.callback = self.parse_article
            request.errback = self.errback
            yield request
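
# For illustration only: a page-setting record of the shape parse() above
# expects. The concrete values come from article_util.get_page_setting(),
# whose storage format is not shown here, so everything below is an assumed
# example, not the project's actual configuration.
EXAMPLE_PAGE_SETTING = {
    'page_url': 'https://news.sina.com.cn/china/',  # hypothetical list page
    'page_package': 'crawler.parsers',              # assumed package name
    'page_module': 'sina',                          # resolves crawler.parsers.sina
    'page_function': 'parse_china',                 # function invoked with the response
}
# With this record, parse() imports crawler.parsers.sina, calls
# sina.parse_china(response), and deduplicates each yielded Request by the
# MD5 of 'page_url:link_url' before scheduling it.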