Esempio n. 1
0
    def parseDocument(self, doc):
        """
        Extract the configured article fields from a fetched article page.

        Every entry of ``self.articleRule.extrarules`` is resolved against
        the page with getElementData() and stored in ``self.data`` under the
        field's name.  The field flagged as article content (text) or gallery
        content (list) is additionally completed with the matching content of
        the follow-up pages returned by parsePage(), and finally written to
        ``self.data['content'].value``; images are added to
        ``self.data['images']``.

        @param doc page content, wrapped with pq()
        """
        page = pq(doc)

        wrapper_selector = self.articleRule.wrapparent
        pager_selector = self.articleRule.pageparent

        # rule of the field that carries the main content ("" = none found)
        body_rule = ""
        # main content: text for an article, list for a gallery
        body = ""

        container = page.find(wrapper_selector)

        # URLs of the follow-up pages (only looked up when a pager rule exists)
        page_urls = []
        if pageparent_defined(pager_selector) if False else pager_selector:
            page_urls = self.parsePage(container, pager_selector)

        # title, tags, ... and the content field itself
        field_rules = self.articleRule.extrarules

        if len(field_rules) > 0:
            for field_id, field_rule, fetch_all in field_rules:
                field = Field(field_id=field_id, rule=field_rule)
                value = getElementData(page, field_rule, self.data["images"], fetch_all)

                self.data[field.get('name')] = field

                if field.is_article_content():
                    # value is assigned at the end, once all pages are merged
                    body_rule, body = field.get("rule"), value
                elif field.is_gallery_content():
                    body_rule = field.get("rule")
                    # a gallery is always a list; non-list results are dropped
                    body = list(value) if isinstance(value, list) else []
                else:
                    field.value = value

        # collect the content of the follow-up pages
        if len(page_urls) > 0 and body_rule:
            for next_url in page_urls:
                fetched = Fetch(
                    next_url,
                    charset=self.seed["charset"],
                    timeout=self.seed["timeout"]
                ).read()
                if fetched is None:
                    continue

                extra = self._getContent(fetched, wrapper_selector, body_rule)
                if not extra:
                    continue

                if isinstance(body, list):
                    body.append(extra)
                else:
                    body += extra

        if not (body and body_rule):
            return

        if isinstance(body, list):
            # gallery: the entries themselves are the images
            self.data['content'].value = body
            self.data['images'] += body
            return

        # article: hand the merged markup to Readability
        reader = Readability(body, self.url, self.articleRule.filters)
        found_images = reader.getImages()

        self.data['content'].value = reader.getContent()
        self.data['images'] += found_images
Esempio n. 2
0
    def parseListPage(self, site, doc, listurl):
        '''
        Parse a fetched list page and store every entry in ``self.items``.

        For each entry element an Item is built from the list rule's
        extrarules, its link is resolved against ``listurl`` and the item is
        stored under the guid returned by getItemGUID().

        @param site Fetch instance (not referenced by this method)
        @param doc page content as a string, wrapped with pq()
        @param listurl URL of the list page; entry links are joined onto it
        '''
        doc = pq(doc)
        entries = doc.find(self.listRule.getListParent())
        extrarules = self.listRule.extrarules

        if not entries:
            return

        def entry(index, element):
            # link
            urlParent = self.listRule.getContentUrl()

            if element.tag == "a":
                link = element.get("href")
            else:
                link = getElementData(element, urlParent)

            if link is not None:
                link = urlparse.urljoin(listurl, link)

            _item = Item({
                "type": self.seed_type,
                "images": []
            })

            # NOTE(review): fetch_all is unpacked but not forwarded to
            # getElementData() here, unlike in parseDocument() - confirm
            # whether list entries are meant to ignore it.
            for field_id, _rule, fetch_all in extrarules:
                field = Field(field_id=field_id, rule=_rule)
                # TODO:
                # filter HOOK
                field.value = getElementData(element, _rule, _item["images"])
                _item[field["name"]] = field

            if link is not None:
                _item['url'] = link

            # get item guid
            if self.guid_rule:
                guid = self.getItemGUID(_item)
            else:
                # No guid rule configured: use a temporary one for this item.
                if self.seed_type in self.dont_craw_content:
                    temporary_rule = [_item[f]["id"] for f in _item.fields]
                else:
                    temporary_rule = "url"

                self.guid_rule = temporary_rule
                try:
                    guid = self.getItemGUID(_item)
                finally:
                    # always drop the temporary rule, even if getItemGUID
                    # raises, so it cannot leak into the next entry
                    self.guid_rule = None

            self.items[guid] = _item

        entry_selector = self.listRule.getEntryItem()
        if not entry_selector:
            # no entry rule: every direct child of the list parent is an entry
            entries.children().map(entry)
        else:
            entries.find(entry_selector).map(entry)
Esempio n. 3
0
    def _getContent(self, html, wrapparent, content_re):
	if not html:
	    return

	html = pq(html).find(wrapparent)
	_content = getElementData(html, content_re);
	if _content:
	    return _content
Esempio n. 4
0
    def getHtml(self):
        """
        Parse ``self.content`` with pq() and store the result in ``self.html``.

        Content without an "<html>" tag is wrapped in
        "<html><body>...</body></html>" first.  Content that already
        contains the tag is parsed as is; previously ``self.html`` was not
        assigned at all in that case.
        """
        content = self.content
        # add the html markup automatically for fragments
        if content.find("<html>") == -1:
            content = "<html><body>" + content + "</body></html>"
        self.html = pq(content)
Esempio n. 5
0
def getElementData(obj, rule, images=None, fetch_all=0):
    """
    Extract data from ``obj`` according to ``rule``.

    ``obj`` may already be a pq object or anything pq() accepts (for
    example an HTML page); it is wrapped with pq() when needed.

    ``images``, when given, is a list that receives the image links found
    while resolving an ``attr(...)`` rule.

    ``fetch_all`` set to 1 makes an ``attr(...)`` rule return a list with
    the attribute value of every selected element instead of one value.

    A rule works in one of two modes:
    1. DOM selector
        1.1 jquery-like selector, e.g. the url of an "a" element
            >> a.attr("href")
        1.2 the text content inside a tag
            >> div[id="content"].text()
        1.3 the content of one element out of a group
            >> li.eq(1).text()    # text of the 2nd li element
    2. Pattern (regular expression)
        Mark the wanted part with [arg]; (*) can be used as filler for
        everything else.  A rule containing [arg] is always handled as a
        pattern, even when it contains dots (e.g. URLs).  A rule without
        any dot is handled as a pattern as well and only yields a value
        when it defines a capture group.

    Returns the extracted value, or None when the rule is empty or nothing
    could be extracted.  An invalid pattern raises re.error.
    """
    if not rule:
        return None

    if not isinstance(obj, pq):
        obj = pq(obj)

    # [arg] marks a pattern rule; checking it first keeps dots inside the
    # pattern (url links) from being mistaken for selector steps
    if "[arg]" in rule or "." not in rule:
        return _extractByPattern(obj, rule)

    return _extractBySelector(obj, rule, images, fetch_all)


def _extractBySelector(obj, rule, images, fetch_all):
    """Resolve a "selector.action(...)" rule against the pq object ``obj``."""
    actions = rule.split(".")

    # the first part is always the DOM selector; remove ( ) from it
    selector = actions.pop(0).replace("(", "").replace(")", "")
    selected = obj.find(selector)

    for step in actions:
        m = attrParrent.match(step)
        if not m:
            continue

        action, arg = m.groups()
        if arg:
            arg = arg.encode("utf-8")
            # strip the quotes
            arg = arg.strip("\'").strip('\"')

        if action == "attr" and hasattr(selected, "attr") and arg:
            if fetch_all == 1:
                values = []
                for i in range(len(selected)):
                    found = selected.eq(i).attr(arg)
                    if found:
                        values.append(found)
                        # images defaults to None: only collect when the
                        # caller supplied a list
                        if images is not None and is_image(found):
                            images.append(found)
                return values

            value = selected.attr(arg)
            # only a non-empty src of an img element counts as an image
            if (value and images is not None and selected
                    and selected[0].tag == "img" and arg == "src"):
                images.append(value)
            return value
        elif action == "eq" and hasattr(selected, "eq"):
            if len(actions) > 1:
                # more steps follow: narrow the selection and go on
                selected = selected.eq(int(arg))
                pieces = step.split(" ")
                if len(pieces) > 1:
                    # eq may be followed by a child selector:
                    # eq(1) a
                    selected = selected.find(" ".join(pieces[1:]))
            else:
                return selected.eq(int(arg))
        elif action == "text" and hasattr(selected, "text"):
            return safeunicode(selected.text()).strip()
        elif action == "html" and hasattr(selected, "html"):
            return safeunicode(selected.html()).strip()

    return None


def _extractByPattern(obj, rule):
    """Search the html, then the text, of ``obj`` for a pattern rule and return group 1."""
    content = obj.html()
    content_text = obj.text()

    pattern = rule.replace('[arg]', '(.+)?').replace('(*)', '.+?')

    # pattern and searched content must be of the same string type
    if isinstance(content, unicode):
        pattern = safeunicode(pattern)
    else:
        pattern = safestr(pattern)

    compiled = re.compile(pattern, re.MULTILINE | re.UNICODE)
    try:
        result = compiled.search(content)
        if result is None:
            result = compiled.search(content_text)
        if result is not None:
            return safeunicode(result.group(1)).strip()
    except Exception:
        # best effort: missing content (None), a pattern without a capture
        # group, or a conversion failure all mean "nothing extracted"
        return None

    return None