Example #1
0
	def __init__(self):
		"""
		Create the DAO objects, the logger, the link templates, the compiled
		regex patterns and the display-name maps stored on this instance.
		"""
		# One DAO object per content source
		self.crossRefLinkDao=CrossRefLinkDAO.CrossRefLinkDAO()
		self.lawDao=LawDAO.LawDAO()
		self.articleDao=ArticleDAO.ArticleDAO()
		self.caseDao=CaseDAO.CaseDAO()
		self.keywordDao=KeywordDAO.KeywordDAO()
		self.queueDao=HyperlinkQueueDAO.HyperlinkQueueDAO()
		self.newsletterDao=ProfNewsletterDAO.ProfNewsletterDAO()
		self.lncQADao=LncQADAO.LncQADAO()
		self.moduleQADao=ModuleQADAO.ModuleQADAO()
		self.exNewsSummaryDao=ExNewsSummaryDAO.ExNewsSummaryDAO()
		self.exNewsDao=ExNewsDAO()
		self.log=getLog()

		# URL template; placeholders are content_type, origin_id, provider_id, isEnglish
		self.linkUrlFormat='/law/content.php?content_type=%s&origin_id=%s&provider_id=%s&isEnglish=%s'
		# Opening anchor tag template (placeholder: href value)
		self.startLinkTagFormat='<a href="%s" class="link_2" re="T" cate="en_href" target="_blank" >'
		# Matches an opening anchor tag whose href ends with '#i<number>';
		# captures the part before '#' as 'hreflink' and the number as 'proNum'.
		# NOTE(review): inside [^"^#] the second '^' is a literal caret, so hrefs
		# containing '^' are excluded too - confirm whether [^"#] was intended.
		self.startLinkTagPattern=re.compile(r'<a href="(?P<hreflink>[^"^#]*?)#i(?P<proNum>[\d\.]*)"\s+class="link_2"\s+re="T"\s+cate="en_href"\s+target="_blank"\s*>',re.I)
		# Full anchor template: opening tag + link text + closing tag
		self.linkTagFormat=self.startLinkTagFormat+'%s</a>'

		# Matches a complete anchor element marked with cate="en_href" (the
		# hyperlink mark); group 1 is the link text
		self.linkTagPattern=re.compile(r'<a\s+?[^>]*?cate=["\']en_href["\'][^>]*?>([^<]*?)</a>',re.I)

		# Matches text that ends with '(any text)'
		self.multiVerPat=re.compile(r'\([^)]*\)\s*$')

		# Matches text that ends with "of the People's Republic of China"
		# (case-insensitive, trailing whitespace allowed). A law title with this
		# suffix stripped is regarded as an abbreviation of the title.
		self.abbrPat=re.compile(r'of the People\'s Republic of China\s*$',re.I)

		# Matches a paragraph that begins with 'Article <number>' and ends at two
		# consecutive <br> tags; such a paragraph is regarded as a provision
		self.provisionStartPattern=re.compile(r'(Article ([\d\.]+).?(.\n?)+?.?)(<br\s*/*>[\r\s]*<br\s*/*>)',re.I)

		# Match the hidden provision position tags (both start and end tag),
		# in the current and in the old attribute order
		self.provisionPosTagPattern=re.compile(r'<a name="(end_)?i[\d\.]+" re="T"\s*></a>')
		self.oldProvisionPosTagPattern=re.compile(r'<a re="T" name="(end_)?i[\d\.]+"\s*></a>')

		# Match an opening anchor tag with class="link_2_manual"; the first form
		# expects href before class, the 'Ex' form expects class before href
		self.originManualLinkPattern=re.compile(r'(<a\s+)[^>]*?(href="[^"]*")[^>]*?class="link_2_manual"[^>]*(>)',re.I)
		self.originManualLinkPatternEx=re.compile(r'(<a\s+)[^>]*?class="link_2_manual"[^>]*?(href="[^"]*")[^>]*(>)',re.I)

		# Matches a complete anchor element with class="link_2_del"; group 1 is its text
		self.delManulLinkPattern=re.compile(r'<a[^>]*?class="link_2_del"[^>]*?>([^<]*?)</a\s*>',re.I)

		# <span cate="link_2_del">...</span>: whole element, an opening tag followed
		# only by text up to the end of the string, and text followed by the closing tag
		self.delTagPattern=re.compile(r'<span\s+cate=["\']link_2_del["\']\s*>[^<]*?</span>',re.I)
		self.delTagPatternStart=re.compile(r'<span\s+cate=["\']link_2_del["\']\s*>[^<]*$',re.I)
		self.delTagPatternEnd=re.compile(r'[^<]*?</span>',re.I)

		# Content type name displayed in the user agent.
		# (This assignment used to be indented with spaces followed by a tab,
		# which Python 3 and "python -tt" reject; it now uses tabs like the rest.)
		# NOTE(review): 'Goverment form' looks like a misspelling of 'Government';
		# left as is because the text is runtime data - confirm before changing.
		self.contentTypeNameMap={
			'T':'Legislation',
			'C':'Cases',
			'LM':'Legal news',
			'FL':'Foreign law',
			'PNL':'Newsletters',
			'HN':'Articles',
			'PC':'Practical materials',
			'LB':'Q & A',
			'LOTP':'Tax overview',
			'LOFDI':'Investment overview',
			'LOEP':'Employment overview',
			'LOEE':'Energy overview',
			'LOCP':'Corporate overview',
			'LOCS':'Financing overview',
			'LOIP':'IP overview',
			'LOMA':'MA overview',
			'LOCR':'CDR overview',
			'LOPGEP':'PG Employment overview',
			'LOPGCP':'PG Corporate overview',
			'LOPGLS':'PG Legalstudio overview',
			'CKL':'Checklists',
			'GOVF':'Goverment form',
			'SC':'Smart chart',
			'TEMPLATE':'Precedents',
			'EL':'Elearning',
			'SUMMARY':'Overview summary',
			'PEA':'Q & A'}

		# Action type name displayed to the user
		self.actionTypeNameMap={'N':'New','D':'Delete','U':'Update'}
Example #2
0
class HyperlinkProcess(object):
	"""
	Helper for hyperlink processing of article HTML content.

	It loads and updates articles through one DAO per content type, removes
	or marks link tags in ``article.content`` and offers position checks
	(inside an anchor, inside a "deleted link" span, adjacent to a letter)
	for a keyword found in the content.
	"""

	def __init__(self):
		"""
		Create the DAO objects, the logger, the link templates, the compiled
		regex patterns and the display-name maps stored on this instance.
		"""
		# One DAO object per content source
		self.crossRefLinkDao=CrossRefLinkDAO.CrossRefLinkDAO()
		self.lawDao=LawDAO.LawDAO()
		self.articleDao=ArticleDAO.ArticleDAO()
		self.caseDao=CaseDAO.CaseDAO()
		self.keywordDao=KeywordDAO.KeywordDAO()
		self.queueDao=HyperlinkQueueDAO.HyperlinkQueueDAO()
		self.newsletterDao=ProfNewsletterDAO.ProfNewsletterDAO()
		self.lncQADao=LncQADAO.LncQADAO()
		self.moduleQADao=ModuleQADAO.ModuleQADAO()
		self.exNewsSummaryDao=ExNewsSummaryDAO.ExNewsSummaryDAO()
		self.exNewsDao=ExNewsDAO()
		self.log=getLog()

		# URL template; placeholders are content_type, origin_id, provider_id, isEnglish
		self.linkUrlFormat='/law/content.php?content_type=%s&origin_id=%s&provider_id=%s&isEnglish=%s'
		# Opening anchor tag template (placeholder: href value)
		self.startLinkTagFormat='<a href="%s" class="link_2" re="T" cate="en_href" target="_blank" >'
		# Full anchor template: opening tag + link text + closing tag
		self.linkTagFormat=self.startLinkTagFormat+'%s</a>'

		self._compilePatterns()

		# Content type name displayed in the user agent.
		# (This assignment used to be indented with spaces followed by a tab,
		# which Python 3 and "python -tt" reject; it now uses tabs like the rest.)
		# NOTE(review): 'Goverment form' looks like a misspelling of 'Government';
		# left as is because the text is runtime data - confirm before changing.
		self.contentTypeNameMap={
			'T':'Legislation',
			'C':'Cases',
			'LM':'Legal news',
			'FL':'Foreign law',
			'PNL':'Newsletters',
			'HN':'Articles',
			'PC':'Practical materials',
			'LB':'Q & A',
			'LOTP':'Tax overview',
			'LOFDI':'Investment overview',
			'LOEP':'Employment overview',
			'LOEE':'Energy overview',
			'LOCP':'Corporate overview',
			'LOCS':'Financing overview',
			'LOIP':'IP overview',
			'LOMA':'MA overview',
			'LOCR':'CDR overview',
			'LOPGEP':'PG Employment overview',
			'LOPGCP':'PG Corporate overview',
			'LOPGLS':'PG Legalstudio overview',
			'CKL':'Checklists',
			'GOVF':'Goverment form',
			'SC':'Smart chart',
			'TEMPLATE':'Precedents',
			'EL':'Elearning',
			'SUMMARY':'Overview summary',
			'PEA':'Q & A'}

		# Action type name displayed to the user
		self.actionTypeNameMap={'N':'New','D':'Delete','U':'Update'}

	def _compilePatterns(self):
		"""
		Compile every regex used by this class and store it on the instance.
		Touches no DAO, so it can run on an instance that has no DAOs yet.
		"""
		# Matches an opening anchor tag whose href ends with '#i<number>';
		# captures the part before '#' as 'hreflink' and the number as 'proNum'.
		# NOTE(review): inside [^"^#] the second '^' is a literal caret, so hrefs
		# containing '^' are excluded too - confirm whether [^"#] was intended.
		self.startLinkTagPattern=re.compile(r'<a href="(?P<hreflink>[^"^#]*?)#i(?P<proNum>[\d\.]*)"\s+class="link_2"\s+re="T"\s+cate="en_href"\s+target="_blank"\s*>',re.I)

		# Matches a complete anchor element marked with cate="en_href" (the
		# hyperlink mark); group 1 is the link text
		self.linkTagPattern=re.compile(r'<a\s+?[^>]*?cate=["\']en_href["\'][^>]*?>([^<]*?)</a>',re.I)

		# Matches text that ends with '(any text)'
		self.multiVerPat=re.compile(r'\([^)]*\)\s*$')

		# Matches text that ends with "of the People's Republic of China"
		# (case-insensitive, trailing whitespace allowed). A law title with this
		# suffix stripped is regarded as an abbreviation of the title.
		self.abbrPat=re.compile(r'of the People\'s Republic of China\s*$',re.I)

		# Matches a paragraph that begins with 'Article <number>' and ends at two
		# consecutive <br> tags; such a paragraph is regarded as a provision
		self.provisionStartPattern=re.compile(r'(Article ([\d\.]+).?(.\n?)+?.?)(<br\s*/*>[\r\s]*<br\s*/*>)',re.I)

		# Match the hidden provision position tags (both start and end tag),
		# in the current and in the old attribute order
		self.provisionPosTagPattern=re.compile(r'<a name="(end_)?i[\d\.]+" re="T"\s*></a>')
		self.oldProvisionPosTagPattern=re.compile(r'<a re="T" name="(end_)?i[\d\.]+"\s*></a>')

		# Match an opening anchor tag with class="link_2_manual"; the first form
		# expects href before class, the 'Ex' form expects class before href
		self.originManualLinkPattern=re.compile(r'(<a\s+)[^>]*?(href="[^"]*")[^>]*?class="link_2_manual"[^>]*(>)',re.I)
		self.originManualLinkPatternEx=re.compile(r'(<a\s+)[^>]*?class="link_2_manual"[^>]*?(href="[^"]*")[^>]*(>)',re.I)

		# Matches a complete anchor element with class="link_2_del"; group 1 is its text
		self.delManulLinkPattern=re.compile(r'<a[^>]*?class="link_2_del"[^>]*?>([^<]*?)</a\s*>',re.I)

		# <span cate="link_2_del">...</span>: whole element, an opening tag followed
		# only by text up to the end of the string, and text followed by the closing tag
		self.delTagPattern=re.compile(r'<span\s+cate=["\']link_2_del["\']\s*>[^<]*?</span>',re.I)
		self.delTagPatternStart=re.compile(r'<span\s+cate=["\']link_2_del["\']\s*>[^<]*$',re.I)
		self.delTagPatternEnd=re.compile(r'[^<]*?</span>',re.I)

	def _daoFor(self,contentType):
		"""
		Return the DAO that handles contentType.
		The pairs are tested in order with '=='; exNewsDao is the fallback
		for every content type not listed.
		"""
		dispatch=(
			(Article.CONTENT_TYPE_LAW,self.lawDao),
			(Article.CONTENT_TYPE_CASE,self.caseDao),
			(Article.CONTENT_TYPE_NEWSLETTER,self.newsletterDao),
			(Article.CONTENT_TYPE_LNCQA,self.lncQADao),
			(Article.CONTENT_TYPE_MODULEQA,self.moduleQADao),
			(Article.CONTENT_TYPE_OVERVIEW_SUMMARY,self.exNewsSummaryDao),
		)
		for typeConst,dao in dispatch:
			if contentType == typeConst:
				return dao
		return self.exNewsDao

	@staticmethod
	def _normalizeQuotes(content):
		"""
		Return content with curly single/double quotes replaced by the plain
		ASCII quote characters.
		"""
		for curly,plain in (('’','\''),('‘','\''),('”','"'),('“','"')):
			content=content.replace(curly,plain)
		return content

	def eraseHyperlink(self,article):
		"""
		Remove the generated hyperlinks from article.content (modified in place).
		Generated hyperlink sample:
		<a href="" class="link_2" re="T" cate="en_href" target="_blank">Criminal Law</a>

		Steps, applied in this order:
		1. <a ... class="link_2_del" ...>text</a> becomes <span cate="link_2_del">text</span>
		2. opening tags with class="link_2_manual" are rewritten to carry only the
		   href plus class="link_2" re="T" cate="manual_en_href" target="_blank"
		3. anchors marked cate="en_href" are replaced by their link text

		@param article object with a 'content' attribute; nothing is done when
		       the article or its content is empty
		return None
		"""
		if not (article and article.content):
			return
		# NOTE(review): the template emits no whitespace between the href value
		# (group 2) and 'class='; kept as is - confirm whether a space is wanted.
		manualTag=r'\1\2class="link_2" re="T" cate="manual_en_href" target="_blank"\3'
		steps=(
			(self.delManulLinkPattern,r'<span cate="link_2_del">\1</span>'),
			(self.originManualLinkPattern,manualTag),
			(self.originManualLinkPatternEx,manualTag),
			(self.linkTagPattern,r'\1'),
		)
		for pattern,replacement in steps:
			article.content=pattern.sub(replacement,article.content)

	def addProvisionPosTag(self,article):
		"""
		Mark provision position with following html tag(will not be displayed):
		Mark start position with:<a name="i2" re="T"></a>
		Mark end position with:<a name="end_i1" re="T"></a>
		The end tag is inserted in front of the two <br> tags closing the provision.
		@param article object with a 'content' attribute, modified in place
		return None
		"""
		if article and article.content:
			article.content=self.provisionStartPattern.sub(r'<a name="i\2" re="T"></a>\1<a name="end_i\2" re="T"></a>\4',article.content)

	def removeProvisionPosTag(self,article):
		"""
		Remove provision position marks (current and old attribute order) from
		article.content.
		@param article object with a 'content' attribute, modified in place
		return None
		"""
		if article and article.content:
			for pattern in (self.provisionPosTagPattern,self.oldProvisionPosTagPattern):
				article.content=pattern.sub('',article.content)

	def getArticle(self,queueItem):
		"""
		Load the article referenced by an element of the hyperlink queue.
		@param queueItem element of the hyperlink queue; its contentType,
		       targetId, actionType and status attributes are read
		return the article with actionType, status and contentType copied from
		       queueItem and curly quotes in its content normalised, or None
		       when the DAO returns nothing
		"""
		article=self._daoFor(queueItem.contentType).getById(queueItem.targetId)
		if not article:
			return None
		article.actionType=queueItem.actionType
		article.status=queueItem.status
		article.contentType=queueItem.contentType
		if article.content:
			article.content=self._normalizeQuotes(article.content)
		return article

	def getArticleByOrigin(self,originId,providerId,isEnglish='Y',contentType='T'):
		"""
		Get article by attribute origin_id ,provider_id and isEnglish
		@param contentType selects the DAO that is queried
		return the article with curly quotes in its content normalised, or None
		       when the DAO returns nothing
		"""
		article=self._daoFor(contentType).getByOrigin(originId,providerId,isEnglish)
		if not article:
			return None
		# NOTE(review): contentType is overwritten only when the article already
		# has a truthy one, whereas getArticle always overwrites it - confirm
		# which behaviour is intended.
		if article.contentType:
			article.contentType=contentType
		if article.content:
			article.content=self._normalizeQuotes(article.content)
		return article

	def updateArticle(self,article):
		"""
		Pass the article to update() of the DAO matching article.contentType.
		(Original note: updates the related article's time once hyperlinking is done.)
		@param article
		return None
		"""
		self._daoFor(article.contentType).update(article)

	def checkHyperlinkedKeyword(self,content,startPos,endPos):
		"""
		Tell whether a keyword already sits inside a hyperlink.
		@param content
		@param startPos position where the keyword starts in content
		@param endPos position where the keyword ends
		return True when an opening <a ...> tag followed only by text precedes
		       startPos and a closing </a> tag is found after endPos, else False
		"""
		if not content:
			return False
		# Look for an opening anchor tag in front of the keyword
		if not re.search(r'<a[^>]+?>[^<]*$',content[:startPos]):
			return False
		# Look for a closing anchor tag behind the keyword
		return re.search(r'[^<]*</a\s*>',content[endPos:]) is not None

	def checkWrappedWithDelTag(self,content,startPos,endPos):
		"""
		Tell whether the keyword between startPos and endPos is a manually
		deleted link, i.e. wrapped as <span cate="link_2_del">***</span>
		return True when the opening span precedes startPos with only text in
		       between and a closing </span> is found after endPos, else False
		"""
		if not content:
			return False
		if not self.delTagPatternStart.search(content[:startPos]):
			return False
		return self.delTagPatternEnd.search(content[endPos:]) is not None

	def checkBeginAndEndIsLetter(self,content,startPos,endPos):
		"""
		Check whether the character right before startPos or the character right
		after the keyword (at endPos) is an English letter, '-' or '_'
		return True when either neighbour is such a character
		otherwise return False (also when the check itself raises)
		Sample:'Anti-trust Law' and  'Trust Law'
		"""
		wordChars='abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-_'
		try:
			# Slices instead of indexing: a missing neighbour (startPos 0, or
			# endPos at the end of content) yields '' rather than an IndexError
			before=content[startPos-1:startPos]
			after=content[endPos:endPos+1]
			return bool((before and before in wordChars) or (after and after in wordChars))
		except Exception:
			# Deliberately best-effort: unusable input counts as "not a letter"
			return False