Example #1
def parse(text, db, sub_url, main_text_pattern, date_pattern, source_pattern, title_pattern, id_, title, date_):
    # Parse a detail page: refresh the stored date/source/title and extract the main text.
    title_in_page, new_date = title, date_
    request_text = remove_js_css(text)
    selector = Selector(text=request_text)
    if date_pattern or source_pattern or title_pattern:
        new_date, source, title_in_page = parse_struct_info(selector, date_pattern, source_pattern, title_pattern)
        if new_date:
            update_db(db, id_, 'pub_date', new_date)
        else:
            new_date = date_
        if source:
            update_db(db, id_, 'source', source)
        if title_in_page and len(utils.get_chinese(title_in_page)) > len(utils.get_chinese(title)) and title_in_page[-3:] != '...':
            update_db(db, id_, 'title', title_in_page)
        else:
            title_in_page = utils.fulfill_title(title, selector)
            if title_in_page[-3:] != '...':
                update_db(db, id_, 'title', title_in_page)
    else:
        if title[-3:] == '...':
            title_in_page = utils.fulfill_title(title, selector)
            if title_in_page[-3:] != '...':
                update_db(db, id_, 'title', title_in_page)
    if main_text_pattern:
        main_text, attachment, img = parse_main_text(sub_url, selector, main_text_pattern)
    else:
        try:
            task = parse_context.MAIN_TEXT(sub_url, request_text)
            result_dict = task.main()
            main_text = result_dict['content']
            img = ','.join(result_dict['img'])
            attachment = ','.join(result_dict['attachment'])
        except Exception:
            main_text, img, attachment = '', '', ''
    return main_text, attachment, img, title_in_page, new_date
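The remove_js_css helper called at the top of parse is not shown on this page. A minimal sketch of what such a helper might look like, assuming it only strips <script>/<style> blocks and HTML comments before the text reaches the Selector:

import re

def remove_js_css(text):
    # Hypothetical implementation: drop script/style blocks and HTML
    # comments so the Selector only sees visible page content.
    text = re.sub(r'(?is)<script.*?</script>', '', text)
    text = re.sub(r'(?is)<style.*?</style>', '', text)
    text = re.sub(r'(?s)<!--.*?-->', '', text)
    return text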
Example #2
 def solve_one_line(line, url, date_in_detail):
     """
     从单个信息行中寻找链接文本,地址和包含的时间
     :param date_in_detail: 传入的date位置的配置文件
     :return:
     :param line: 信息行
     :param url: 当前url
     :return: None if find nothing,tuple(title,sub_link,date) if find something
     """
     tmp_dict = {}
     date = None
     a_list = line.xpath('.//descendant-or-self::a')
     # Look for a date hidden in the row's text
     text_list = line.xpath('.//text()').extract()
     clean_list = [item.strip() for item in text_list if item.strip()]
     for content in clean_list:
         date = utils.search_date_time(content)
         if date:
             break
     if (not date) and (not date_in_detail):  # no date here and none expected on the detail page
         return None, None, None
     # Extract candidate sub-links; keep the link whose text has the most Chinese characters
     for a in a_list:
         text_in_title = a.xpath('./@title').extract()
         text_li = [item.strip() for item in a.xpath('.//text()').extract()]
         # Use the longest text fragment inside the <a> tag
         text = max(text_li, key=len) if text_li else ''
         href = a.xpath('./@href').extract_first()
         if href:
             sub_link = urljoin(url, href)
         else:
             sub_link = ''
         # Prefer the @title attribute unless the anchor text has more Chinese characters
         if not text_in_title:
             title = text.strip()
         elif len(utils.get_chinese(text_in_title[0])) < len(
                 utils.get_chinese(text)):
             title = text.strip()
         else:
             title = text_in_title[0].strip()
         tmp_dict[title] = sub_link
     final_title_list = list(tmp_dict.keys())
     if not final_title_list:  # nothing was extracted
         return None, None, None
     else:
         # Keep the candidate title containing the most Chinese characters
         final_title = max(final_title_list,
                           key=lambda t: len(utils.get_chinese(t)))
         return final_title, tmp_dict[final_title], date  # title, sub_link, date
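A quick way to exercise solve_one_line is to feed it one listing row built with parsel, whose Selector API matches the .xpath()/.extract() calls above. The HTML, the example.com URL, and the behaviour of utils.search_date_time/utils.get_chinese are assumptions for illustration:

from parsel import Selector

html = ('<li><a href="/news/1.html" title="关于开展年度检查的通知">'
        '关于开展...</a> 2021-05-01</li>')
row = Selector(text=html).xpath('//li')[0]
# The @title attribute should win (more Chinese characters than the
# truncated anchor text) and the trailing date should be picked up.
title, sub_link, date = solve_one_line(row, 'http://example.com/list/', False)
print(title, sub_link, date)
# -> 关于开展年度检查的通知 http://example.com/news/1.html 2021-05-01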
Example #3
def ichinese():
    urls = get_test_urls()
    res = defaultdict(int)
    for url in urls:
        html = get_or_cache(url)
        chs = get_chinese(html)
        for ch in chs:
            res[ch] += 1

    # Keep characters seen at least 40 times, most frequent first
    res = '|'.join([
        a for a, b in sorted(filter(lambda x: x[1] >= 40, res.items()),
                             key=lambda x: -x[1])
    ])
    save_json('chineses.json', res)
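utils.get_chinese appears in nearly every example on this page but is defined elsewhere in the project. A plausible sketch, assuming it simply keeps the CJK characters of a string:

import re

def get_chinese(text):
    # Hypothetical implementation: keep only characters from the basic
    # CJK Unified Ideographs block.
    return ''.join(re.findall(r'[\u4e00-\u9fff]', text))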
Example #4
 def find_item_by_config(self, url, date_in_detail):
     # Find sub-links using the item pattern from the config file
     ret_dict = dict()
     line_list = self.selector.xpath(self.item_pattern)
     for line in line_list:
         title, sub_link, date = self.solve_one_line(
             line, url, date_in_detail)
         if not title:
             continue  # find nothing
         else:
             if len(utils.get_chinese(title)) > 3:
                 # In incremental-update mode this would also be checked against the database
                 ret_dict[title] = (sub_link, date)
     return ret_dict
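The date detection that find_item_by_config inherits from solve_one_line lives in utils.search_date_time, which is also external. A minimal sketch, assuming it normalises the first date-like token to YYYY-MM-DD and returns None otherwise:

import re

def search_date_time(text):
    # Hypothetical implementation: match forms such as 2021-05-01,
    # 2021/5/1 or 2021年5月1日.
    m = re.search(r'(\d{4})[-/年.](\d{1,2})[-/月.](\d{1,2})', text)
    if not m:
        return None
    year, month, day = m.groups()
    return '{}-{:0>2}-{:0>2}'.format(year, month, day)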
Example #5
 def find_item_auto(self, url, date_in_detail):
     # First, find the two longest links
     ret_dict = {}
     if date_in_detail:  # with no date constraint on the rows, the global root cannot be used
         root_node = self.find_root_node()
     else:
         root_node = self.selector
     for line in root_node.xpath('.//*'):
         title, sub_link, date = self.solve_one_line(
             line, url, date_in_detail)
         if not title:
             continue  # find nothing
         else:
             if len(utils.get_chinese(title)) > 3:
                 ret_dict[title] = (sub_link, date)
     return ret_dict
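find_root_node is not shown in these snippets. One possible heuristic (an assumption, not the project's actual implementation) is to narrow the search to the deepest element that still holds most of the page's links, so navigation bars and footers fall outside the scope:

def find_root_node(self):
    # Hypothetical heuristic: //* yields nodes in document order, so the
    # last element that still contains >= 80% of all links is the deepest
    # such container.
    links_total = len(self.selector.xpath('//a'))
    best = self.selector
    for node in self.selector.xpath('//*'):
        if len(node.xpath('.//a')) >= 0.8 * links_total:
            best = node
    return best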
Example #6
 def parse_struct_info(self, selector0):
     remove_list = ['稿源', '来源', '发布机构', '发布日期', '发文机关']  # label words ("source", "issuing body", "publication date", ...) to strip
     if self.date_pattern:
         try:
             date_raw = selector0.xpath(self.date_pattern).extract_first()
         except Exception:
             date = ''
         else:
             if isinstance(date_raw, str):
                 date = utils.search_date_time(date_raw)
             else:
                 date = ''
     else:
         date = ''
     # A pattern starting with '/' is treated as an XPath; anything else is a literal source string
     if self.source_pattern and self.source_pattern[0] == '/':
         # Browsers inject <tbody> into copied XPaths, but it is absent from the raw HTML
         self.source_pattern = self.source_pattern.replace('tbody', '')
         if 'text()' not in self.source_pattern:
             self.source_pattern = self.source_pattern + '//text()'
         try:
             source = utils.get_chinese(''.join(
                 selector0.xpath(self.source_pattern).extract()))
         except Exception:
             source = ''
         else:
             for item in remove_list:
                 source = source.replace(item, '')
     else:
         source = self.source_pattern
     if self.title_pattern:
         self.title_pattern = self.title_pattern.replace('tbody', '')
         if 'text()' not in self.title_pattern:
             self.title_pattern = self.title_pattern + '//text()'
         try:
             title = ''.join(selector0.xpath(
                 self.title_pattern).extract()).strip()
         except Exception:
             title = ''
         else:
             if title[:2] == "名称" or title[:2] == "标题":
                 # Strip the leading "名称"/"标题" ("name"/"title") label and its separator
                 title = title[3:]
     else:
         title = ''
     return date, source, title
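The repeated .replace('tbody', '') calls handle a classic XPath pitfall: browsers insert <tbody> into rendered tables, so paths copied from devtools include it even though the raw HTML (and lxml's parse of it) usually does not. A small demonstration with parsel:

from parsel import Selector

html = '<table><tr><td class="title">测试标题</td></tr></table>'
sel = Selector(text=html)
print(sel.xpath('//table/tbody/tr/td//text()').get())  # None: no tbody in source
print(sel.xpath('//table/tr/td//text()').get())        # 测试标题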
Example #7
 async def parse_detail(self, sub_links, frame_list, index):
     for title in list(sub_links.keys()):
         self.sub_url_already_crawl[title] = sub_links[
             title]  # record it whatever happens, so the crawler never gets stuck on this link
         sub_url = sub_links[title][0]
         # Match document extensions, including the 5-character ones (.docx/.xlsx)
         if sub_url.lower().endswith(
                 ('.pdf', '.doc', '.docx', '.xls', '.xlsx', '.txt', '.csv')):
             self.save_to_data({
                 'title': title,
                 'sub_url': sub_url,
                 'date': sub_links[title][1],
                 'main_text': '',
                 'source': '',
                 'attachment': sub_url,
                 'img': ''
             })
             continue
         else:
             xpath = "//a[contains(., '{}') or @title='{}']".format(
                 title, title)
             try:
                 button_selector = frame_list[index].selector.xpath(xpath)
                 button = await frame_list[index].raw_frame.xpath(xpath)
             except Exception:
                 self.save_to_data({
                     'title': title,
                     'sub_url': sub_url,
                     'date': sub_links[title][1],
                     'main_text': '',
                     'source': '',
                     'attachment': '',
                     'img': ''
                 })
                 continue
             else:
                 if not button:
                     self.save_to_data({
                         'title': title,
                         'sub_url': sub_url,
                         'date': sub_links[title][1],
                         'main_text': '',
                         'source': '',
                         'attachment': '',
                         'img': ''
                     })
                     continue
             await button[0].click()
             if button_selector.xpath(
                     './@target').extract_first() == '_blank':
                 await asyncio.sleep(1)
                 while True:
                     pages = await self.browser.pages()
                     if len(pages) == 2:
                         break
                     else:
                         await asyncio.sleep(1)
                 content = await utils.get_content(pages[-1])
                 sub_url = pages[-1].url
                 await pages[-1].close()
             else:
                 await asyncio.sleep(6)
                 pages = await self.browser.pages()
                 if len(pages) == 2:
                     content = await utils.get_content(pages[-1])
                     sub_url = pages[-1].url
                     await pages[-1].close()
                 else:
                     content = await utils.get_content(self.page)
                     sub_url = self.page.url
                 # asyncio.wait needs tasks, not bare coroutines, on newer Python
                 await asyncio.wait([
                     asyncio.ensure_future(self.page.goBack()),
                     asyncio.ensure_future(self.page.waitForXPath(
                         xpath=xpath,
                         timeout=CRAWL_SPEED['CLICK_SUB_URL_MAX_DELAY']))
                 ])
             request_text = utils.remove_js_css(content)
             selector = Selector(text=request_text)
             frame_list = await self.get_frame()
         date, source, title_in_page = self.parse_struct_info(selector)
         if not date:  # double safeguard: fall back to the listing-page date
             date = sub_links[title][1]
         if title_in_page:  # double safeguard for the title as well
             save_title = title_in_page
             if (not save_title) or save_title[-3:] == '...' or (len(
                     utils.get_chinese(save_title)) < len(
                         utils.get_chinese(title))):
                 # re-check that the title is complete
                 save_title = utils.fulfill_title(title, selector)
         else:
             if title[-3:] == '...':
                 save_title = utils.fulfill_title(title, selector)
             else:
                 save_title = title
         if self.main_text_pattern:
             main_text, attachment, img = self.parse_main_text(
                 sub_url, request_text)
             self.save_to_data({
                 'title': save_title,
                 'sub_url': sub_url,
                 'date': date,
                 'main_text': main_text,
                 'source': source,
                 'attachment': attachment,
                 'img': img
             })
         else:
             try:
                 task = parse_context.MAIN_TEXT(sub_url, request_text)
                 result_dict = task.main()
                 main_text = result_dict['content']
                 img = ','.join(result_dict['img'])
                 attachment = ','.join(result_dict['attachment'])
             except Exception:
                 main_text = ''
                 img = ''
                 attachment = ''
             self.save_to_data({
                 'title': save_title,
                 'sub_url': sub_url,
                 'date': date,
                 'main_text': main_text,
                 'source': source,
                 'attachment': attachment,
                 'img': img
             })
     return frame_list
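utils.fulfill_title, used here and in Example #1 to repair titles truncated with '...', is defined outside these snippets. A plausible sketch, assuming it scans the detail page for a longer text node that shares the truncated prefix:

def fulfill_title(title, selector):
    # Hypothetical implementation: when the listing title was cut off,
    # look for a page text node that starts with the surviving prefix.
    if not title.endswith('...'):
        return title
    prefix = title[:-3]
    for text in selector.xpath('//text()').extract():
        text = text.strip()
        if text.startswith(prefix) and len(text) > len(prefix):
            return text
    return title  # nothing better found; keep the truncated title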