Example #1
0
    def parseJsonPage(self, site, doc, listurl):
        """Parse a JSON list page and register one Item per entry in self.items.

        The decoded document is used directly, or, when the key returned by
        ``self.listRule.getEntryItem()`` is present in it, the value stored
        under that key is used instead.  Nothing is registered unless that
        data is a list and a content-URL key
        (``self.listRule.getContentUrl()``) is configured.  Entries that do
        not contain the content-URL key are skipped.

        Args:
            site: object whose ``getCharset()`` gives the encoding of ``doc``.
            doc: raw JSON text of the list page.
            listurl: URL of the list page; entry links are joined onto it.

        Raises:
            ValueError: if decoding or processing the page fails.
        """
        try:
            doc = json.loads(doc, encoding=site.getCharset())
            item = self.listRule.getEntryItem()
            if item and item in doc:
                data = doc[item]
            else:
                data = doc

            urlParent = self.listRule.getContentUrl()
            extrarules = self.listRule.extrarules

            if not (isinstance(data, list) and urlParent):
                return

            for _data in data:
                if urlParent not in _data:
                    continue

                link = urlparse.urljoin(listurl, _data[urlParent])

                _item = Item({
                    "type": self.seed_type,
                    "images": []
                })

                # Pull out the configured keys.  Every key found in the entry
                # is stored on the item; image values are additionally
                # collected in _item["images"].
                for field_id, _rule, fetch_all in extrarules:
                    field = Field(field_id=field_id, rule=_rule)
                    if _rule not in _data:
                        continue
                    value = _data[_rule]
                    if is_image(value):
                        _item["images"].append(value)
                    field.value = value
                    _item[field["name"]] = field

                # Assigned after the fields, so it wins over a field that
                # happens to be named "url".
                _item['url'] = link

                # get item guid
                if self.guid_rule:
                    guid = self.getItemGUID(_item)
                else:
                    # No guid rule is set: install a temporary one for this
                    # item only (the ids of all item fields for seed types
                    # listed in dont_craw_content, otherwise "url").
                    if self.seed_type in self.dont_craw_content:
                        self.guid_rule = [_item[f]["id"] for f in _item.fields]
                    else:
                        self.guid_rule = "url"
                    try:
                        guid = self.getItemGUID(_item)
                    finally:
                        # Always reset, even on failure, so the temporary
                        # rule never leaks into later items.
                        self.guid_rule = None

                self.items[guid] = _item
        except Exception:
            # String exceptions are not raisable; use a real exception class.
            raise ValueError("Cant parse json file")
Example #2
0
    def parseDocument(self, doc):
        """Extract the configured fields and the main content of an article page.

        Each rule in ``self.articleRule.extrarules`` produces a Field that is
        stored in ``self.data`` under the field's name.  A field flagged as
        article or gallery content supplies the page content and the rule used
        to extract it.  When paging URLs are found, each further page is
        fetched, reduced with ``self._getContent`` and accumulated.  List
        content is stored as-is and added to ``self.data['images']``; other
        content is run through Readability, whose content and images are
        stored.

        Args:
            doc: page source handed to ``pq``.
        """
        doc = pq(doc)

        wrapparent = self.articleRule.wrapparent
        pageparent = self.articleRule.pageparent

        # Rule that produced the main content; stays empty when no content
        # field is configured.
        content_re = ""
        # URLs of the follow-up pages.
        urls = []
        # Accumulated content: text for articles, a list for galleries.
        content = ""

        article = doc.find(wrapparent)
        # pages
        if pageparent:
            urls = self.parsePage(article, pageparent)

        # need title, tags
        extrarules = self.articleRule.extrarules

        # Only the article/gallery field carries the content.
        if len(extrarules):
            for field_id, selector, fetch_all in extrarules:
                field = Field(field_id=field_id, rule=selector)
                extracted = getElementData(
                    doc, selector, self.data["images"], fetch_all)

                self.data[field.get('name')] = field

                if field.is_article_content():
                    content_re = field.get("rule")
                    content = extracted
                elif field.is_gallery_content():
                    content_re = field.get("rule")
                    if isinstance(extracted, list):
                        content = list(extracted)
                    else:
                        content = []
                else:
                    field.value = extracted

        # Collect the content of the follow-up pages.
        if len(urls) > 0 and content_re:
            for next_url in urls:
                fetched = Fetch(
                    next_url,
                    charset=self.seed["charset"],
                    timeout=self.seed["timeout"]).read()
                if fetched is None:
                    continue
                extra = self._getContent(fetched, wrapparent, content_re)
                if not extra:
                    continue
                if isinstance(content, list):
                    content.append(extra)
                else:
                    content += extra

        if not (content and content_re):
            return

        if isinstance(content, list):
            self.data['content'].value = content
            self.data['images'] += content
            return

        readable = Readability(content, self.url, self.articleRule.filters)
        found_images = readable.getImages()

        self.data['content'].value = readable.getContent()
        self.data['images'] += found_images
Example #3
0
            def entry(i, e):
                """Build an Item from one list element and store it in self.items.

                Args:
                    i: unused here (presumably the element's index -- confirm
                        against the caller).
                    e: element of one list entry; its ``href`` is the link
                        when it is an ``a`` tag, otherwise the link is
                        extracted with the content-URL rule.
                """
                # link
                urlParent = self.listRule.getContentUrl()

                if e.tag == "a":
                    link = e.get("href")
                else:
                    link = getElementData(e, urlParent)

                if link is not None:
                    link = urlparse.urljoin(listurl, link)

                _item = Item({
                    "type": self.seed_type,
                    "images": []
                })

                # NOTE(review): fetch_all is unpacked but not forwarded to
                # getElementData here -- confirm that is intended.
                for field_id, _rule, fetch_all in extrarules:
                    field = Field(field_id=field_id, rule=_rule)
                    value = getElementData(e, _rule, _item["images"])
                    # TODO:
                    # filter HOOK
                    field.value = value
                    _item[field["name"]] = field

                # Assigned after the fields, so it wins over a field that
                # happens to be named "url".
                if link is not None:
                    _item['url'] = link

                # get item guid
                if self.guid_rule:
                    guid = self.getItemGUID(_item)
                else:
                    # No guid rule is set: install a temporary one for this
                    # item only (the ids of all item fields for seed types
                    # listed in dont_craw_content, otherwise "url").
                    if self.seed_type in self.dont_craw_content:
                        self.guid_rule = [_item[f]["id"] for f in _item.fields]
                    else:
                        self.guid_rule = "url"
                    try:
                        guid = self.getItemGUID(_item)
                    finally:
                        # Always reset, even on failure, so the temporary
                        # rule never leaks into later entries.
                        self.guid_rule = None

                self.items[guid] = _item
Example #4
0
 def _testField(self):
     """Construct a Field from keyword arguments; the instance is discarded."""
     attrs = {
         'name': 'Title',
         'id': "title",
         'rule': 'h1.text()',
         'type': 'list',
         'other': '222',
     }
     field = Field(**attrs)
Example #5
0
 def _testItem(self):
     """Exercise Item construction, membership tests and item assignment.

     Nothing is asserted; the results of the membership tests are discarded.
     """
     sample = Item(test="a", ddd="ff")
     # Membership lookups: one key passed to the constructor, one that was not.
     "ddd" in sample
     "adads" in sample
     # Item assignment with a plain string and with a Field value.
     sample["aaa"] = "c"
     sample['f1'] = Field(name='title', rule='b')