def parseDocument(self, doc):
    """
    Parse an article page and fill ``self.data`` from the article rule.

    Every entry of ``self.articleRule.extrarules`` is a
    ``(key, rule, fetch_all)`` triple. A ``Field`` is built for each one and
    stored in ``self.data`` under the field's name. Fields flagged as article
    content or gallery content are treated as the page "content"; every
    other field simply receives the extracted value.

    When the rule defines ``pageparent``, the follow-up page URLs returned by
    ``self.parsePage`` are fetched and their content is merged into the
    first page's content.

    @param doc raw page markup (anything ``pq`` accepts)
    """
    doc = pq(doc)

    wrapparent = self.articleRule.wrapparent
    pageparent = self.articleRule.pageparent
    # Rule of the content field; stays empty when no content field is configured.
    content_re = ""
    # URLs of the follow-up pages
    urls = []
    # Text content (a list for galleries)
    content = ""

    article = doc.find(wrapparent)
    # pages
    if pageparent:
        urls = self.parsePage(article, pageparent)

    # need title, tags
    extrarules = self.articleRule.extrarules

    # Only articles have a content field
    for key, rule, fetch_all in extrarules:
        field = Field(field_id=key, rule=rule)
        value = getElementData(doc, rule, self.data["images"], fetch_all)

        self.data[field.get('name')] = field

        if field.is_article_content():
            content_re = field.get("rule")
            content = value
        elif field.is_gallery_content():
            content_re = field.get("rule")
            content = []
            if isinstance(value, list):
                content += value
        else:
            field.value = value

    # Collect the content of the follow-up pages
    if urls and content_re:
        for next_url in urls:
            next_page = Fetch(next_url,
                              charset=self.seed["charset"],
                              timeout=self.seed["timeout"]).read()
            if next_page is None:
                continue

            next_page = self._getContent(next_page, wrapparent, content_re)
            if not next_page:
                continue

            if isinstance(content, list):
                content.append(next_page)
            elif content:
                content += next_page
            else:
                # The first page may have produced no content at all
                # (getElementData returns None in that case); start from this
                # page instead of failing on ``None += ...``.
                content = next_page

    if content and content_re:
        # NOTE(review): this assumes the content field is stored under the
        # name 'content' in self.data - confirm against Field.
        if isinstance(content, list):
            self.data['content'].value = content
            self.data['images'] += content
        else:
            readable = Readability(content, self.url, self.articleRule.filters)
            images = readable.getImages()

            self.data['content'].value = readable.getContent()
            self.data['images'] += images
def parseListPage(self, site, doc, listurl):
    """
    Analyze a fetched list page and register one item per list entry.

    Each entry becomes an ``Item`` stored in ``self.items`` under the GUID
    returned by ``self.getItemGUID``.

    @param site Fetch instance (not used in this method)
    @param doc page markup as a string
    @param listurl URL of the list page; relative entry links are joined to it
    """
    page = pq(doc)
    container = page.find(self.listRule.getListParent())
    field_rules = self.listRule.extrarules

    # Nothing to do when the list container was not found.
    if not container:
        return

    def build_item(index, node):
        # link
        link_rule = self.listRule.getContentUrl()
        if node.tag == "a":
            link = node.get("href")
        else:
            link = getElementData(node, link_rule)

        has_link = link is not None
        if has_link:
            link = urlparse.urljoin(listurl, link)

        record = Item({
            "type": self.seed_type,
            "images": [],
        })

        for field_id, field_rule, _fetch_all in field_rules:
            field = Field(field_id=field_id, rule=field_rule)
            # TODO:
            # filter HOOK
            field.value = getElementData(node, field_rule, record["images"])
            record[field["name"]] = field

        if has_link:
            record['url'] = link

        # get item guid
        if self.guid_rule:
            guid = self.getItemGUID(record)
        else:
            # No rule configured: install a temporary one for this item only
            # and clear it again once the GUID has been computed.
            if self.seed_type in self.dont_craw_content:
                self.guid_rule = []
                self.guid_rule.extend(record[name]["id"] for name in record.fields)
            else:
                self.guid_rule = "url"
            guid = self.getItemGUID(record)
            self.guid_rule = None

        self.items[guid] = record

    # Without an entry selector every direct child of the container is an entry.
    if len(self.listRule.getEntryItem()) == 0:
        entries = container.children()
    else:
        entries = container.find(self.listRule.getEntryItem())
    entries.map(build_item)
def _getContent(self, html, wrapparent, content_re):
    """
    Extract the content of one page.

    Looks up ``wrapparent`` inside ``html`` and resolves ``content_re``
    against that element with ``getElementData``.

    @param html page markup; a falsy value yields None
    @param wrapparent selector of the wrapping element
    @param content_re rule of the content field
    @return the extracted content, or None when it is empty
    """
    if html:
        wrapper = pq(html).find(wrapparent)
        extracted = getElementData(wrapper, content_re)
        if extracted:
            return extracted
    return None
def getHtml(self):
    """
    Parse ``self.content`` with ``pq`` and store the result in ``self.html``.

    Content without an ``<html>`` tag is wrapped in
    ``<html><body>...</body></html>`` first; content that already has one is
    parsed unchanged.
    """
    content = self.content
    # Automatically add the html wrapper when it is missing
    if content.find("<html>") == -1:
        content = "<html><body>" + content + "</body></html>"

    self.html = pq(content)
def getElementData(obj, rule, images=None, fetch_all=0):
    """
    Resolve ``rule`` against ``obj`` and return the extracted value.

    ``obj`` may be an existing pq object or an HTML page; anything that is
    not a pq instance is wrapped with ``pq(obj)`` first.

    ``images``, when given, is a list to which image links found while
    resolving the rule are appended.

    A rule has one of two forms:

    1. DOM selector
        1.1 The selector is jQuery-like, e.g. the url of a link
            >> a.attr("href")
        1.2 The text content of a tag
            >> div[id="content"].text()
        1.3 The content of one child element
            >> li.eq(1).text()  # text of the 2nd li element
    2. Regex mode
        Mark the wanted part with the [arg] tag; the rest may be padded
        with (*).

    With ``fetch_all == 1`` an ``attr`` action returns the list of non-empty
    attribute values of all selected elements instead of a single value.

    Returns None when the rule yields nothing.
    """
    if not isinstance(obj, pq):
        obj = pq(obj)

    actions = rule.split(".")

    # Any rule carrying [arg] is a regex rule, even when it contains dots
    # (e.g. a url) or starts with [arg]. Single-segment rules also go through
    # the regex path, as they always did.
    if "[arg]" in rule or len(actions) == 1:
        return _searchRegexRule(obj, rule)

    # The first segment is always the DOM selector; parentheses are removed.
    selector = actions.pop(0).replace("(", "").replace(")", "")
    selected = obj.find(selector)

    for action_expr in actions:
        matched = attrParrent.match(action_expr)
        if not matched:
            continue

        action, arg = matched.groups()
        if arg:
            arg = arg.encode("utf-8")
            # Strip the quotes
            arg = arg.strip("\'").strip('\"')

        if action == "attr" and hasattr(selected, "attr") and arg:
            if fetch_all == 1:
                values = []
                for i in range(len(selected)):
                    found = selected.eq(i).attr(arg)
                    if found:
                        values.append(found)
                        # images is optional; only collect when a list was given
                        if images is not None and is_image(found):
                            images.append(found)
                return values

            value = selected.attr(arg)
            if selected and selected[0].tag == "img" and arg == "src" and images is not None:
                images.append(value)
            return value

        if action == "eq" and hasattr(selected, "eq"):
            if len(actions) <= 1:
                # eq is the only action: the element itself is the result
                return selected.eq(int(arg))

            selected = selected.eq(int(arg))
            # eq may be followed by a descendant selector, e.g. "eq(1) a"
            descendants = action_expr.split(" ")[1:]
            if descendants:
                selected = selected.find(" ".join(descendants))
        elif action == "text" and hasattr(selected, "text"):
            return safeunicode(selected.text()).strip()
        elif action == "html" and hasattr(selected, "html"):
            return safeunicode(selected.html()).strip()

    return None


def _searchRegexRule(obj, rule):
    """Resolve a regex-mode rule against the html, then the text, of ``obj``."""
    content = obj.html()
    content_text = obj.text()

    pattern = rule.replace('[arg]', '(.+)?').replace('(*)', '.+?')
    # Keep the pattern in the same string type as the searched html
    if isinstance(content, unicode):
        pattern = safeunicode(pattern)
    else:
        pattern = safestr(pattern)

    compiled = re.compile(pattern, re.MULTILINE | re.UNICODE)
    try:
        for haystack in (content, content_text):
            found = compiled.search(haystack)
            if found is not None:
                return safeunicode(found.group(1)).strip()
    except Exception:
        # Best effort: a pattern without a capture group or unsearchable
        # content simply yields no value.
        return None

    return None