def parse(text, db, sub_url, main_text_pattern, date_pattern, source_pattern,
          title_pattern, id_, title, date_):
    title_in_page, new_date = title, date_
    request_text = remove_js_css(text)
    selector = Selector(text=request_text)
    if date_pattern or source_pattern or title_pattern:
        # Configured XPath patterns exist: extract the structured fields and
        # write any improvement back to the database.
        new_date, source, title_in_page = parse_struct_info(
            selector, date_pattern, source_pattern, title_pattern)
        if new_date:
            update_db(db, id_, 'pub_date', new_date)
        else:
            new_date = date_
        if source:
            update_db(db, id_, 'source', source)
        # Accept the in-page title only if it carries more Chinese characters
        # than the list-page title and is not truncated with '...'.
        if (title_in_page
                and len(utils.get_chinese(title_in_page)) > len(utils.get_chinese(title))
                and title_in_page[-3:] != '...'):
            update_db(db, id_, 'title', title_in_page)
        else:
            title_in_page = utils.fulfill_title(title, selector)
            if title_in_page[-3:] != '...':
                update_db(db, id_, 'title', title_in_page)
    else:
        if title[-3:] == '...':
            # No patterns configured, but the stored title looks truncated:
            # try to recover the full title from the page itself.
            title_in_page = utils.fulfill_title(title, selector)
            if title_in_page[-3:] != '...':
                update_db(db, id_, 'title', title_in_page)
    if main_text_pattern:
        main_text, attachment, img = parse_main_text(
            sub_url, selector, main_text_pattern)
    else:
        # No pattern configured: fall back to automatic main-text extraction.
        try:
            task = parse_context.MAIN_TEXT(sub_url, request_text)
            result_dict = task.main()
            main_text = result_dict['content']
            img = ','.join(result_dict['img'])
            attachment = ','.join(result_dict['attachment'])
        except Exception:
            main_text, img, attachment = '', '', ''
    return main_text, attachment, img, title_in_page, new_date
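# A minimal standalone sketch of the title-completeness heuristic used in
# parse() above. NOTE: get_chinese is re-implemented here with a bare CJK
# regex as an assumption; the repo's utils.get_chinese may differ.
import re


def get_chinese(text):
    # Keep only CJK Unified Ideographs.
    return ''.join(re.findall(r'[\u4e00-\u9fa5]', text))


def better_title(title_in_page, title):
    # The in-page title wins only if it carries more Chinese characters than
    # the stored one and is not visibly truncated with a trailing '...'.
    return (bool(title_in_page)
            and len(get_chinese(title_in_page)) > len(get_chinese(title))
            and title_in_page[-3:] != '...')


assert better_title('关于开展专项检查的通知', '关于开展专项检...')
assert not better_title('关于开展...', '关于开展专项检查的通知')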
def solve_one_line(line, url, date_in_detail):
    """
    Extract the link text, URL, and any embedded date from a single listing row.

    :param line: the listing row (a selector node)
    :param url: URL of the current page, used to resolve relative links
    :param date_in_detail: config flag meaning the date lives on the detail page
    :return: (None, None, None) if nothing is found, else (title, sub_link, date)
    """
    tmp_dict = {}
    date = None
    a_list = line.xpath('.//descendant-or-self::a')
    # Look for a date hidden in the row's text nodes.
    text_list = line.xpath('.//text()').extract()
    clean_list = [item.strip() for item in text_list if item.strip()]
    for content in clean_list:
        date = utils.search_date_time(content)
        if date:
            break
    if (not date) and (not date_in_detail):
        # No date in the row and none expected on the detail page: reject it.
        return None, None, None
    # Collect candidate links; the link whose text contains the most Chinese
    # characters wins.
    for a in a_list:
        text_in_title = a.xpath('./@title').extract()
        text_li = [item.strip() for item in a.xpath('.//text()').extract()]
        if text_li:
            # Use the longest text fragment inside the <a> tag.
            text = text_li[0]
            for tmp in text_li:
                if len(tmp) > len(text):
                    text = tmp
        else:
            text = ''
        href = a.xpath('./@href').extract_first()
        sub_link = urljoin(url, href) if href else ''
        # Prefer the @title attribute unless the inline text carries more Chinese.
        if not text_in_title:
            title = text.strip()
        elif len(utils.get_chinese(text_in_title[0])) < len(utils.get_chinese(text)):
            title = text.strip()
        else:
            title = text_in_title[0].strip()
        tmp_dict[title] = sub_link
    final_title_list = list(tmp_dict.keys())
    if not final_title_list:
        # Nothing was extracted from this row.
        return None, None, None
    final_title = final_title_list[0]
    for title_text in final_title_list:
        if len(utils.get_chinese(title_text)) > len(utils.get_chinese(final_title)):
            final_title = title_text
    return final_title, tmp_dict[final_title], date
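# A self-contained run of the row-parsing rules from solve_one_line against a
# tiny HTML fragment. It assumes scrapy is installed; the date search is
# stubbed with a bare YYYY-MM-DD regex (assumption), not the repo's
# utils.search_date_time.
import re
from scrapy.selector import Selector

ROW = """
<li>
  <a href="/art/2024/1.html" title="关于规范政务公开工作的通知">关于规范政务公开...</a>
  <span>2024-03-01</span>
</li>
"""

row = Selector(text=ROW).xpath('//li')[0]
texts = [t.strip() for t in row.xpath('.//text()').extract() if t.strip()]
date = next((t for t in texts if re.match(r'\d{4}-\d{2}-\d{2}$', t)), None)
a = row.xpath('.//descendant-or-self::a')[0]
# @title usually holds the untruncated headline, so it wins over the inline
# text here, exactly as in solve_one_line.
print(date, a.xpath('./@title').extract_first(), a.xpath('./@href').extract_first())
# -> 2024-03-01 关于规范政务公开工作的通知 /art/2024/1.html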
def ichinese():
    # Count how often each Chinese character appears across the test pages,
    # keep characters seen at least 40 times, and save them sorted by
    # descending frequency as a '|'-joined string.
    urls = get_test_urls()
    res = defaultdict(int)
    for url in urls:
        html = get_or_cache(url)
        chs = get_chinese(html)
        for ch in chs:
            res[ch] += 1
    res = '|'.join([
        a for a, b in sorted(
            filter(lambda x: x[1] >= 40, res.items()), key=lambda x: -x[1])
    ])
    save_json('chineses.json', res)
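# A dependency-free miniature of ichinese() over in-memory strings, showing
# the count -> filter -> sort -> join pipeline. get_chinese is again stubbed
# with a CJK regex (assumption), and the threshold is lowered from 40 to 1 so
# the toy input produces output.
import re
from collections import defaultdict

pages = ['通知公告通知', '公告信息']
counts = defaultdict(int)
for page in pages:
    for ch in re.findall(r'[\u4e00-\u9fa5]', page):
        counts[ch] += 1

joined = '|'.join([
    ch for ch, n in sorted(
        filter(lambda x: x[1] >= 1, counts.items()), key=lambda x: -x[1])
])
print(joined)  # 通|知|公|告|信|息 (ties keep first-seen order)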
def find_item_by_config(self, url, date_in_detail):
    # Find sub-links using the item XPath pattern from the config file.
    ret_dict = dict()
    line_list = self.selector.xpath(self.item_pattern)
    for line in line_list:
        title, sub_link, date = self.solve_one_line(line, url, date_in_detail)
        if not title:
            continue  # nothing found in this row
        if len(utils.get_chinese(title)) > 3:
            # In incremental-update mode this should also be checked
            # against the database.
            ret_dict[title] = (sub_link, date)
    return ret_dict
def find_item_auto(self, url, date_in_detail):
    # Auto-discovery mode: first locate the node that roots the link list.
    ret_dict = {}
    if date_in_detail:
        # The rows carry no date constraint, so scanning from the global
        # root is unsafe; narrow the search to the detected list container.
        root_node = self.find_root_node()
    else:
        root_node = self.selector
    for line in root_node.xpath('.//*'):
        title, sub_link, date = self.solve_one_line(line, url, date_in_detail)
        if not title:
            continue  # nothing found in this node
        if len(utils.get_chinese(title)) > 3:
            ret_dict[title] = (sub_link, date)
    return ret_dict
def parse_struct_info(self, selector0):
    # Field labels to strip from the extracted source text:
    # 稿源/来源 (source), 发布机构 (issuing agency), 发布日期 (publication
    # date), 发文机关 (issuing authority).
    remove_list = ['稿源', '来源', '发布机构', '发布日期', '发文机关']
    if self.date_pattern:
        try:
            date_raw = selector0.xpath(self.date_pattern).extract_first()
        except Exception:
            date = ''
        else:
            if isinstance(date_raw, str):
                date = utils.search_date_time(date_raw)
            else:
                date = ''
    else:
        date = ''
    if self.source_pattern and self.source_pattern[0] == '/':
        # Browsers inject <tbody> when rendering, but the raw HTML usually
        # lacks it, so strip it from XPaths copied out of dev tools.
        self.source_pattern = self.source_pattern.replace('tbody', '')
        if 'text()' not in self.source_pattern:
            self.source_pattern = self.source_pattern + '//text()'
        try:
            source = utils.get_chinese(''.join(
                selector0.xpath(self.source_pattern).extract()))
        except Exception:
            source = ''
        else:
            for item in remove_list:
                source = source.replace(item, '')
    else:
        # A non-XPath config value is treated as the literal source.
        source = self.source_pattern
    if self.title_pattern:
        self.title_pattern = self.title_pattern.replace('tbody', '')
        if 'text()' not in self.title_pattern:
            self.title_pattern = self.title_pattern + '//text()'
        try:
            title = ''.join(
                selector0.xpath(self.title_pattern).extract()).strip()
        except Exception:
            title = ''
        else:
            # Strip a leading "名称:"/"标题:" ("name:"/"title:") label.
            if title[:2] == "名称" or title[:2] == "标题":
                title = title[3:]
    else:
        title = ''
    return date, source, title
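# Why the tbody replacement in parse_struct_info matters: XPaths copied from
# browser dev tools include the <tbody> nodes the browser injected while
# rendering, but the raw HTML the crawler downloads usually lacks them. A
# quick demonstration with scrapy, applying the same normalization:
from scrapy.selector import Selector

html = '<table><tr><td>来源:市政府办公室</td></tr></table>'
copied = '//table/tbody/tr/td'            # as copied from dev tools
normalized = copied.replace('tbody', '')  # -> '//table//tr/td'
if 'text()' not in normalized:
    normalized += '//text()'

sel = Selector(text=html)
print(sel.xpath(copied).extract())      # [] - no tbody in the raw HTML
print(sel.xpath(normalized).extract())  # ['来源:市政府办公室']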
async def parse_detail(self, sub_links, frame_list, index):
    for title in list(sub_links.keys()):
        # Record the link no matter what happens below, so the crawler can
        # never get stuck retrying the same entry forever.
        self.sub_url_already_crawl[title] = sub_links[title]
        sub_url = sub_links[title][0]
        # endswith() instead of a fixed-width slice: sub_url[-4:] could never
        # match five-character suffixes like '.docx' or '.xlsx'.
        if sub_url.lower().endswith(
                ('.pdf', '.doc', '.docx', '.xls', '.xlsx', '.txt', '.csv')):
            # The link points straight at a file: store it as an attachment.
            self.save_to_data({
                'title': title,
                'sub_url': sub_url,
                'date': sub_links[title][1],
                'main_text': '',
                'source': '',
                'attachment': sub_url,
                'img': ''
            })
            continue
        xpath = "//a[contains(., '{}') or @title='{}']".format(title, title)
        try:
            button_selector = frame_list[index].selector.xpath(xpath)
            button = await frame_list[index].raw_frame.xpath(xpath)
        except Exception:
            button = None
        if not button:
            # The link cannot be located or clicked: save a stub record.
            self.save_to_data({
                'title': title,
                'sub_url': sub_url,
                'date': sub_links[title][1],
                'main_text': '',
                'source': '',
                'attachment': '',
                'img': ''
            })
            continue
        await button[0].click()
        if button_selector.xpath('./@target').extract_first() == '_blank':
            # target="_blank": the detail page opens in a new tab. Wait for
            # it, scrape it, then close it.
            await asyncio.sleep(1)
            while True:
                pages = await self.browser.pages()
                if len(pages) == 2:
                    break
                await asyncio.sleep(1)
            content = await utils.get_content(pages[-1])
            sub_url = pages[-1].url
            await pages[-1].close()
        else:
            await asyncio.sleep(6)
            pages = await self.browser.pages()
            if len(pages) == 2:
                # A new tab opened even without target="_blank".
                content = await utils.get_content(pages[-1])
                sub_url = pages[-1].url
                await pages[-1].close()
            else:
                # In-place navigation: scrape the page, then go back to the
                # list and wait for it to render again.
                content = await utils.get_content(self.page)
                sub_url = self.page.url
                await asyncio.wait([
                    self.page.goBack(),
                    self.page.waitForXPath(
                        xpath=xpath,
                        timeout=CRAWL_SPEED['CLICK_SUB_URL_MAX_DELAY'])
                ])
        request_text = utils.remove_js_css(content)
        selector = Selector(text=request_text)
        frame_list = await self.get_frame()
        date, source, title_in_page = self.parse_struct_info(selector)
        if not date:
            # Safety net: fall back to the date found on the list page.
            date = sub_links[title][1]
        if title_in_page:
            # Safety net for the title: distrust truncated or shorter titles.
            save_title = title_in_page
            if (save_title[-3:] == '...'
                    or len(utils.get_chinese(save_title)) < len(utils.get_chinese(title))):
                save_title = utils.fulfill_title(title, selector)
        else:
            if title[-3:] == '...':
                save_title = utils.fulfill_title(title, selector)
            else:
                save_title = title
        if self.main_text_pattern:
            main_text, attachment, img = self.parse_main_text(sub_url, request_text)
        else:
            # No pattern configured: fall back to automatic extraction.
            try:
                task = parse_context.MAIN_TEXT(sub_url, request_text)
                result_dict = task.main()
                main_text = result_dict['content']
                img = ','.join(result_dict['img'])
                attachment = ','.join(result_dict['attachment'])
            except Exception:
                main_text, img, attachment = '', '', ''
        self.save_to_data({
            'title': save_title,
            'sub_url': sub_url,
            'date': date,
            'main_text': main_text,
            'source': source,
            'attachment': attachment,
            'img': img
        })
    return frame_list
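# A hedged sketch factoring the "wait for the new tab" loop in parse_detail
# into a bounded helper. Only pyppeteer's browser.pages() is assumed;
# max_wait is a hypothetical parameter added here so the loop cannot spin
# forever the way the original while True loop can when a click never
# actually opens a second tab.
import asyncio


async def wait_for_second_page(browser, max_wait=15):
    # Poll once a second until the click has spawned a second tab, or give up.
    for _ in range(max_wait):
        pages = await browser.pages()
        if len(pages) == 2:
            return pages[-1]
        await asyncio.sleep(1)
    return None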