def __init__(self):
    # Data-access objects, one per backing store used by the hyperlink job.
    self.crossRefLinkDao = CrossRefLinkDAO.CrossRefLinkDAO()
    self.lawDao = LawDAO.LawDAO()
    self.articleDao = ArticleDAO.ArticleDAO()
    self.caseDao = CaseDAO.CaseDAO()
    self.keywordDao = KeywordDAO.KeywordDAO()
    self.queueDao = HyperlinkQueueDAO.HyperlinkQueueDAO()
    self.newsletterDao = ProfNewsletterDAO.ProfNewsletterDAO()
    self.lncQADao = LncQADAO.LncQADAO()
    self.moduleQADao = ModuleQADAO.ModuleQADAO()
    self.exNewsSummaryDao = ExNewsSummaryDAO.ExNewsSummaryDAO()
    self.exNewsDao = ExNewsDAO()
    self.log = getLog()

    # Templates for the generated link URL and the anchor wrapped around
    # a keyword.
    self.linkUrlFormat = '/law/content.php?content_type=%s&origin_id=%s&provider_id=%s&isEnglish=%s'
    self.startLinkTagFormat = '<a href="%s" class="link_2" re="T" cate="en_href" target="_blank" >'
    # Opening tag of a generated link whose href carries a "#i<number>"
    # provision fragment.
    self.startLinkTagPattern = re.compile(
        r'<a href="(?P<hreflink>[^"^#]*?)#i(?P<proNum>[\d\.]*)"\s+class="link_2"\s+re="T"\s+cate="en_href"\s+target="_blank"\s*>',
        re.I)
    self.linkTagFormat = self.startLinkTagFormat + '%s</a>'

    # A complete English hyperlink; cate="en_href" is the marker that
    # identifies it, and group 1 is the link text.
    self.linkTagPattern = re.compile(
        r'<a\s+?[^>]*?cate=["\']en_href["\'][^>]*?>([^<]*?)</a>', re.I)

    # Text that ends with a parenthesised suffix: "(...)".
    self.multiVerPat = re.compile(r'\([^)]*\)\s*$')

    # Text that ends with "of the People's Republic of China" (case
    # insensitive, trailing whitespace allowed).  A law title with this
    # suffix stripped is regarded as an abbreviation of the title.
    self.abbrPat = re.compile(r'of the People\'s Republic of China\s*$', re.I)

    # A paragraph that starts with "Article <number>" and is terminated by
    # two consecutive <br> tags; such a paragraph is treated as a provision.
    self.provisionStartPattern = re.compile(
        r'(Article ([\d\.]+).?(.\n?)+?.?)(<br\s*/*>[\r\s]*<br\s*/*>)', re.I)

    # Hidden provision position markers (start and end), in the current and
    # in the older attribute order.
    self.provisionPosTagPattern = re.compile(r'<a name="(end_)?i[\d\.]+" re="T"\s*></a>')
    self.oldProvisionPosTagPattern = re.compile(r'<a re="T" name="(end_)?i[\d\.]+"\s*></a>')

    # Manually added links (class="link_2_manual"), with href before the
    # class attribute and, for the "Ex" variant, after it.
    self.originManualLinkPattern = re.compile(
        r'(<a\s+)[^>]*?(href="[^"]*")[^>]*?class="link_2_manual"[^>]*(>)', re.I)
    self.originManualLinkPatternEx = re.compile(
        r'(<a\s+)[^>]*?class="link_2_manual"[^>]*?(href="[^"]*")[^>]*(>)', re.I)

    # Links marked as deleted (class="link_2_del") and the <span> that
    # records such a deletion, as a whole and as start / end fragments.
    self.delManulLinkPattern = re.compile(
        r'<a[^>]*?class="link_2_del"[^>]*?>([^<]*?)</a\s*>', re.I)
    self.delTagPattern = re.compile(
        r'<span\s+cate=["\']link_2_del["\']\s*>[^<]*?</span>', re.I)
    self.delTagPatternStart = re.compile(
        r'<span\s+cate=["\']link_2_del["\']\s*>[^<]*$', re.I)
    self.delTagPatternEnd = re.compile(r'[^<]*?</span>', re.I)

    # Content type code -> name displayed in the user agent.
    self.contentTypeNameMap = {
        'T': 'Legislation',
        'C': 'Cases',
        'LM': 'Legal news',
        'FL': 'Foreign law',
        'PNL': 'Newsletters',
        'HN': 'Articles',
        'PC': 'Practical materials',
        'LB': 'Q & A',
        'LOTP': 'Tax overview',
        'LOFDI': 'Investment overview',
        'LOEP': 'Employment overview',
        'LOEE': 'Energy overview',
        'LOCP': 'Corporate overview',
        'LOCS': 'Financing overview',
        'LOIP': 'IP overview',
        'LOMA': 'MA overview',
        'LOCR': 'CDR overview',
        'LOPGEP': 'PG Employment overview',
        'LOPGCP': 'PG Corporate overview',
        'LOPGLS': 'PG Legalstudio overview',
        'CKL': 'Checklists',
        'GOVF': 'Goverment form',
        'SC': 'Smart chart',
        'TEMPLATE': 'Precedents',
        'EL': 'Elearning',
        'SUMMARY': 'Overview summary',
        'PEA': 'Q & A',
    }

    # Action type code -> name displayed to the user.
    self.actionTypeNameMap = {'N': 'New', 'D': 'Delete', 'U': 'Update'}
class HyperlinkProcess(object):
    """Add, remove and inspect generated hyperlinks in article HTML content.

    Holds one DAO per content type plus the compiled regular expressions
    that recognise generated links, manual links, deleted links and hidden
    provision position markers.
    """

    # Tail of the text before a keyword: an opening <a ...> tag followed only
    # by non-tag text, i.e. the keyword sits inside an anchor that is open.
    _ANCHOR_OPEN_BEFORE_PATTERN = re.compile(r'<a[^>]+?>[^<]*$')
    # A closing </a> reachable in the text after a keyword without crossing
    # another tag start.
    _ANCHOR_CLOSE_AFTER_PATTERN = re.compile(r'[^<]*</a\s*>')
    # Characters that make an adjacent keyword part of a longer word.
    _WORD_CHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-_'

    def __init__(self):
        # Data-access objects, one per backing store.
        self.crossRefLinkDao = CrossRefLinkDAO.CrossRefLinkDAO()
        self.lawDao = LawDAO.LawDAO()
        self.articleDao = ArticleDAO.ArticleDAO()
        self.caseDao = CaseDAO.CaseDAO()
        self.keywordDao = KeywordDAO.KeywordDAO()
        self.queueDao = HyperlinkQueueDAO.HyperlinkQueueDAO()
        self.newsletterDao = ProfNewsletterDAO.ProfNewsletterDAO()
        self.lncQADao = LncQADAO.LncQADAO()
        self.moduleQADao = ModuleQADAO.ModuleQADAO()
        self.exNewsSummaryDao = ExNewsSummaryDAO.ExNewsSummaryDAO()
        self.exNewsDao = ExNewsDAO()
        self.log = getLog()

        # Templates for the link URL and the anchor wrapped around a keyword.
        self.linkUrlFormat = '/law/content.php?content_type=%s&origin_id=%s&provider_id=%s&isEnglish=%s'
        self.startLinkTagFormat = '<a href="%s" class="link_2" re="T" cate="en_href" target="_blank" >'
        # Opening tag of a generated link whose href ends in a "#i<number>"
        # provision fragment.
        self.startLinkTagPattern = re.compile(
            r'<a href="(?P<hreflink>[^"^#]*?)#i(?P<proNum>[\d\.]*)"\s+class="link_2"\s+re="T"\s+cate="en_href"\s+target="_blank"\s*>',
            re.I)
        self.linkTagFormat = self.startLinkTagFormat + '%s</a>'

        # A complete English hyperlink; cate="en_href" is the marker that
        # identifies it and group 1 captures the link text.
        self.linkTagPattern = re.compile(
            r'<a\s+?[^>]*?cate=["\']en_href["\'][^>]*?>([^<]*?)</a>', re.I)

        # Text ending with a parenthesised suffix: "(...)".
        self.multiVerPat = re.compile(r'\([^)]*\)\s*$')

        # Text ending with "of the People's Republic of China" (case
        # insensitive, trailing whitespace allowed).  A law title with that
        # suffix stripped is regarded as an abbreviation of the title.
        self.abbrPat = re.compile(r'of the People\'s Republic of China\s*$', re.I)

        # A paragraph that starts with "Article <number>" and is terminated
        # by two consecutive <br> tags; it is treated as a provision.
        self.provisionStartPattern = re.compile(
            r'(Article ([\d\.]+).?(.\n?)+?.?)(<br\s*/*>[\r\s]*<br\s*/*>)', re.I)

        # Hidden provision position markers (start and end tags), in the
        # current and the older attribute order.
        self.provisionPosTagPattern = re.compile(r'<a name="(end_)?i[\d\.]+" re="T"\s*></a>')
        self.oldProvisionPosTagPattern = re.compile(r'<a re="T" name="(end_)?i[\d\.]+"\s*></a>')

        # Manually added links (class="link_2_manual"): href before the class
        # attribute, and (Ex) href after it.
        self.originManualLinkPattern = re.compile(
            r'(<a\s+)[^>]*?(href="[^"]*")[^>]*?class="link_2_manual"[^>]*(>)', re.I)
        self.originManualLinkPatternEx = re.compile(
            r'(<a\s+)[^>]*?class="link_2_manual"[^>]*?(href="[^"]*")[^>]*(>)', re.I)

        # Links marked as deleted (class="link_2_del") and the <span> that
        # records a deletion, as a whole and as start / end fragments.
        self.delManulLinkPattern = re.compile(
            r'<a[^>]*?class="link_2_del"[^>]*?>([^<]*?)</a\s*>', re.I)
        self.delTagPattern = re.compile(
            r'<span\s+cate=["\']link_2_del["\']\s*>[^<]*?</span>', re.I)
        self.delTagPatternStart = re.compile(
            r'<span\s+cate=["\']link_2_del["\']\s*>[^<]*$', re.I)
        self.delTagPatternEnd = re.compile(r'[^<]*?</span>', re.I)

        # Content type code -> name displayed in the user agent.
        # NOTE(review): 'Goverment form' looks like a typo for 'Government
        # form'; left unchanged because the text is emitted at runtime.
        self.contentTypeNameMap = {
            'T': 'Legislation',
            'C': 'Cases',
            'LM': 'Legal news',
            'FL': 'Foreign law',
            'PNL': 'Newsletters',
            'HN': 'Articles',
            'PC': 'Practical materials',
            'LB': 'Q & A',
            'LOTP': 'Tax overview',
            'LOFDI': 'Investment overview',
            'LOEP': 'Employment overview',
            'LOEE': 'Energy overview',
            'LOCP': 'Corporate overview',
            'LOCS': 'Financing overview',
            'LOIP': 'IP overview',
            'LOMA': 'MA overview',
            'LOCR': 'CDR overview',
            'LOPGEP': 'PG Employment overview',
            'LOPGCP': 'PG Corporate overview',
            'LOPGLS': 'PG Legalstudio overview',
            'CKL': 'Checklists',
            'GOVF': 'Goverment form',
            'SC': 'Smart chart',
            'TEMPLATE': 'Precedents',
            'EL': 'Elearning',
            'SUMMARY': 'Overview summary',
            'PEA': 'Q & A',
        }

        # Action type code -> name displayed to the user.
        self.actionTypeNameMap = {'N': 'New', 'D': 'Delete', 'U': 'Update'}

    def _daoForContentType(self, contentType):
        """Return the DAO that stores articles of the given content type.

        Content types are compared in a fixed order; any type that matches
        none of the known constants falls back to ``self.exNewsDao``.
        """
        if contentType == Article.CONTENT_TYPE_LAW:
            return self.lawDao
        if contentType == Article.CONTENT_TYPE_CASE:
            return self.caseDao
        if contentType == Article.CONTENT_TYPE_NEWSLETTER:
            return self.newsletterDao
        if contentType == Article.CONTENT_TYPE_LNCQA:
            return self.lncQADao
        if contentType == Article.CONTENT_TYPE_MODULEQA:
            return self.moduleQADao
        if contentType == Article.CONTENT_TYPE_OVERVIEW_SUMMARY:
            return self.exNewsSummaryDao
        return self.exNewsDao

    @staticmethod
    def _normalizeQuotes(content):
        """Return content with curly single/double quotes made straight."""
        for curly, straight in (('’', '\''), ('‘', '\''), ('”', '"'), ('“', '"')):
            content = content.replace(curly, straight)
        return content

    def eraseHyperlink(self, article):
        """Strip the hyperlinks previously generated in article.content.

        Generated link sample:
        <a href="" class="link_2" re="T" cate="en_href" target="_blank">Criminal Law</a>

        The content is rewritten in place, in this order:
        1. links with class="link_2_del" become
           <span cate="link_2_del">text</span>;
        2. links with class="link_2_manual" are rewritten to carry
           cate="manual_en_href";
        3. links with cate="en_href" are replaced by their plain text.

        @param article object with a ``content`` attribute; nothing happens
                       when the article or its content is empty
        return None (the article is modified in place)
        """
        if not (article and article.content):
            return
        content = self.delManulLinkPattern.sub(r'<span cate="link_2_del">\1</span>', article.content)
        # NOTE(review): the replacement emits href="..." immediately followed
        # by class= with no separating space; kept as-is because other code
        # may match on the stored markup.
        manualLinkReplacement = r'\1\2class="link_2" re="T" cate="manual_en_href" target="_blank"\3'
        content = self.originManualLinkPattern.sub(manualLinkReplacement, content)
        content = self.originManualLinkPatternEx.sub(manualLinkReplacement, content)
        article.content = self.linkTagPattern.sub(r'\1', content)

    def addProvisionPosTag(self, article):
        """Mark each provision in article.content with hidden anchor tags.

        Start of a provision is marked with: <a name="i2" re="T"></a>
        End of a provision is marked with:   <a name="end_i1" re="T"></a>
        The number in the name is the article number captured by
        ``provisionStartPattern``.

        @param article object with a ``content`` attribute; modified in place
        """
        if article and article.content:
            article.content = self.provisionStartPattern.sub(
                r'<a name="i\2" re="T"></a>\1<a name="end_i\2" re="T"></a>\4',
                article.content)

    def removeProvisionPosTag(self, article):
        """Remove provision position markers (current and old form) in place.

        @param article object with a ``content`` attribute
        """
        if article and article.content:
            stripped = self.provisionPosTagPattern.sub('', article.content)
            article.content = self.oldProvisionPosTagPattern.sub('', stripped)

    def getArticle(self, queueItem):
        """Load the article referenced by a hyperlink queue item.

        The DAO is chosen from queueItem.contentType and queried with
        queueItem.targetId.  When an article is found, its actionType, status
        and contentType are copied from the queue item and curly quotes in
        its content are replaced by straight ones.

        @param queueItem one element of the hyperlink queue
        return the article, or whatever falsy value the DAO returned
        """
        article = self._daoForContentType(queueItem.contentType).getById(queueItem.targetId)
        if article:
            article.actionType = queueItem.actionType
            article.status = queueItem.status
            article.contentType = queueItem.contentType
            if article.content:
                article.content = self._normalizeQuotes(article.content)
        return article

    def getArticleByOrigin(self, originId, providerId, isEnglish='Y', contentType='T'):
        """Load an article by origin_id, provider_id and isEnglish.

        The DAO is chosen from contentType.  Curly quotes in the content of
        the article found are replaced by straight ones.

        return the article, or whatever falsy value the DAO returned
        """
        dao = self._daoForContentType(contentType)
        article = dao.getByOrigin(originId, providerId, isEnglish)
        if article:
            # NOTE(review): contentType is only overwritten when the article
            # already has a truthy one, whereas getArticle overwrites it
            # unconditionally -- confirm which is intended.
            if article.contentType:
                article.contentType = contentType
            if article.content:
                article.content = self._normalizeQuotes(article.content)
        return article

    def updateArticle(self, article):
        """Persist the article through the DAO matching article.contentType.

        Called after hyperlink processing to update the related article.
        """
        self._daoForContentType(article.contentType).update(article)

    def checkHyperlinkedKeyword(self, content, startPos, endPos):
        """Tell whether the keyword at content[startPos:endPos] is inside a link.

        True when an opening <a ...> tag precedes the keyword with no other
        tag start in between, and a closing </a> can be found after it.

        @param content  text to inspect
        @param startPos index where the keyword starts
        @param endPos   index just past the end of the keyword
        return True when the keyword is hyperlinked, False otherwise
        """
        if content:
            openedBefore = self._ANCHOR_OPEN_BEFORE_PATTERN.search(content[:startPos])
            closedAfter = self._ANCHOR_CLOSE_AFTER_PATTERN.search(content[endPos:])
            if openedBefore and closedAfter:
                return True
        return False

    def checkWrappedWithDelTag(self, content, startPos, endPos):
        """Tell whether the keyword at content[startPos:endPos] was manually unlinked.

        A manually deleted link is expected to be wrapped as
        <span cate="link_2_del">***</span>.

        return True when both the opening span before the keyword and a
               closing </span> after it are found, False otherwise
        """
        if content:
            openedBefore = self.delTagPatternStart.search(content[:startPos])
            closedAfter = self.delTagPatternEnd.search(content[endPos:])
            if openedBefore and closedAfter:
                return True
        return False

    def checkBeginAndEndIsLetter(self, content, startPos, endPos):
        """Tell whether the keyword at content[startPos:endPos] touches a letter.

        Looks at the single character before startPos and the single
        character at endPos.  Returns True when either one is an English
        letter, '-' or '_', which means the keyword is part of a longer word
        (sample: 'Trust Law' inside 'Anti-trust Law').  Returns False
        otherwise, and also when the lookup raises (e.g. content is None).
        """
        try:
            before = content[startPos - 1:startPos]  # '' when startPos is 0
            after = content[endPos:endPos + 1]       # '' at end of content
            return bool((before and before in self._WORD_CHARS) or
                        (after and after in self._WORD_CHARS))
        except Exception:
            # Deliberate best effort: any failure counts as "no letter".
            return False