def _replace_terminals(rules, new_r):
    """Substitute a terminal by its dedicated variable in multi-symbol rules.

    ``new_r`` is the rule whose single right-hand-side symbol is the terminal
    to hide and whose left-hand side is the variable standing in for it.
    Every rule in ``rules`` whose right-hand side has more than one symbol and
    contains that terminal is rebuilt with each occurrence replaced; all other
    rules are copied unchanged.  ``new_r`` itself is appended last.

    Returns a new list; ``rules`` is not modified.
    """
    rewritten = []
    for rule in rules:
        terminal = new_r.RHS[0]
        symbols = rule.RHS
        if terminal in symbols and len(symbols) > 1:
            symbols = [new_r.LHS if sym == terminal else sym for sym in symbols]
        rewritten.append(Rule(rule.LHS, symbols))
    rewritten.append(new_r)
    return rewritten
def convert_to_cnf(rules, start_symbol):
    """Run the normal-form conversion pipeline over a grammar.

    Builds a ``Grammar`` from ``rules`` and ``start_symbol`` and then, in
    order: introduces a fresh start symbol if the old one occurs on a
    right-hand side, eliminates epsilon productions, eliminates unit
    productions, replaces terminals inside longer right-hand sides, and
    finally breaks up long productions.

    Returns the transformed ``Grammar`` object.
    """
    grammar = Grammar(start_symbol, rules)
    variables = grammar.variables

    # Pool of fresh variable names.  "<letter>1" names come first (skipping
    # the one that may become the new start symbol), unused plain letters
    # last -- pop() takes from the end, so plain letters are handed out first.
    primed_names = [letter + '1' for letter in ascii_uppercase if letter != start_symbol]
    unused_letters = [letter for letter in ascii_uppercase if letter not in variables]
    available_vars = primed_names + unused_letters
    terminals = grammar.terminals

    # Step 1: the start symbol must not appear on any right-hand side.
    if _check_start_symbol_rhs(grammar.rules, start_symbol):
        fresh_start = start_symbol + '1'
        grammar.rules = [Rule(fresh_start, [start_symbol])] + grammar.rules
        grammar.start_symbol = fresh_start

    # Step 2: remove epsilon productions, one nullable variable at a time.
    while True:
        nullable = _find_nullable_variable(grammar.rules)
        if not nullable:
            break
        grammar.rules = _eliminate_epsilon(grammar.rules, nullable)

    # Step 3: remove unit productions, clearing recursive units after each pass.
    grammar.rules = _eliminate_recursive_units(grammar.rules)
    while True:
        unit = _find_unit_production(grammar.rules, variables)
        if not unit:
            break
        grammar.rules = _eliminate_unit_productions(grammar.rules, unit)
        grammar.rules = _eliminate_recursive_units(grammar.rules)

    # Step 4: give terminals that need it their own variable.
    for terminal in terminals:
        if not _check_if_terminal_needs_to_be_replaced(grammar.rules, terminal):
            continue
        terminal_rule = Rule(available_vars.pop(), [terminal])
        grammar.rules = _replace_terminals(grammar.rules, terminal_rule)

    # Step 5: fold the first two symbols of a long production into a new
    # variable until no long production is left.
    while True:
        long_rule = _find_long_production(grammar.rules)
        if not long_rule:
            break
        fresh_name = available_vars.pop()
        leading_pair = [long_rule.RHS[0], long_rule.RHS[1]]
        grammar.rules = _replace_long_productions(grammar.rules, Rule(fresh_name, leading_pair))

    return grammar
def run(self):
    """Refresh the stored messages of every rule.

    Loads all rules through ``self.dbo.getAllRules()``.  For each rule the
    page is downloaded, parsed into messages, the messages are attached to
    the rule and the rule is persisted with ``self.dbo.saveMessagesFromRule``.
    A rule whose download returns ``-1`` is skipped.
    """
    rules = self.dbo.getAllRules()
    logger.info("get " + str(len(rules)) + " rules to update message")
    # No placeholder Rule() is created here: the loop variable is bound by
    # the iteration itself, so a throwaway instance served no purpose.
    for rule in rules:
        html = self.htmlDownload.download(rule.webUrl, rule.webModel)
        if html == -1:
            # presumably the downloader's failure marker -- nothing to parse
            continue
        messages = self.htmlParser.parse(html, rule)
        for msg in messages:
            rule.addMessage(msg)
        result = self.dbo.saveMessagesFromRule(rule)
        logger.info(rule.webUrl + " update " + str(result) + " messages")
class TestWechatPush(unittest.TestCase):
    """Checks for ``WechatPush`` run against a class-level user/rule fixture."""

    # Fixture objects are created once, when the class body executes.
    user = User(
        userName='******',
        password='******',
        wechatId='MAIZHILING',
        wechatName='dalaomai',
    )
    rule = Rule(
        id=1,
        webName="佛山市人民政府",
        webUrl="http://www.foshan.gov.cn/zwgk/zwdt/jryw/",
        ruleModel="regular",
        rulePattern=r'<li [\s\S]*?([0-9]{4}-[0-9]{2}-[0-9]{2})[\s\S]*?href="([\s\S]*?)"[\s\S]*?title="([\s\S]*?)" >',
        titlePosition="2",
        hrefPosition="1",
        timePosition="0",
        isEffect=1,
    )
    msg = Message("test", "http://www.foshan.gov.cn/zwgk/zwdt/jryw/", "2019-1-8")

    # The same message and the same rule are each registered twice.
    # NOTE(review): presumably deliberate, to exercise duplicate handling -- confirm.
    rule.addMessage(msg)
    rule.addMessage(msg)
    user.addRule(rule)
    user.addRule(rule)

    def testBaseMethod(self):
        """Token retrieval, message sending and push must each return 0."""
        pusher = WechatPush()
        self.assertEqual(pusher.getAccessToken(), 0)
        body = pusher.structureMessageTextByMessage(self.msg)
        self.assertEqual(pusher.sendMessages(self.user, body), 0)
        self.assertEqual(pusher.push(), 0)
def preprocess_rules(rules):
    """Turn textual grammar lines into ``Rule`` objects.

    Each non-blank entry of ``rules`` is expected to look like
    ``LHS -> sym sym | sym ...``.  The text is split at ``->``; the
    right-hand side is split at ``|`` into alternatives, and every
    alternative becomes one ``Rule(LHS, [symbols...])`` with the symbols
    obtained by whitespace splitting.

    Blank and whitespace-only entries (for example a bare newline read from
    a file) are skipped.

    Raises:
        IndexError: if a non-blank entry contains no ``->``.
    """
    result = []
    for line in rules:
        if not line.strip():
            # Only '' used to be skipped; '\n' or '   ' crashed on parts[1].
            continue
        parts = [part.strip() for part in line.split('->')]
        lhs, alternatives = parts[0], parts[1]
        # split('|') returns a single item when no '|' is present, so one loop
        # covers both the single- and the multi-production case.
        for alternative in alternatives.split('|'):
            result.append(Rule(lhs, alternative.split()))
    return result
def strip_quotation_marks(rules):
    """Remove single quotes around the first right-hand-side symbol.

    A rule whose first RHS symbol starts with a single-quoted text is
    replaced by a new ``Rule`` with the same LHS and the unquoted text as its
    only RHS symbol.  Any other rule is passed through as the same object.

    Returns a new list in the original order.
    """
    quoted = re.compile(r"\'(.+)\'")

    def unquote(rule):
        found = quoted.match(rule.RHS[0])
        return Rule(rule.LHS, [found.group(1)]) if found else rule

    return [unquote(rule) for rule in rules]
def _replace_long_productions(rules, new_rule):
    """Fold a symbol pair into its new variable inside long productions.

    ``new_rule`` has a two-symbol right-hand side and a fresh variable as
    left-hand side.  In every rule of ``rules`` whose right-hand side has more
    than two symbols, each non-overlapping occurrence of that pair (scanning
    left to right) is replaced by the variable.  Shorter rules are copied
    unchanged.  ``new_rule`` itself is appended last.

    Returns a new list of ``Rule`` objects; ``rules`` is not modified.
    """
    updated_rules = []
    for r in rules:
        symbols = r.RHS[:]
        if len(symbols) > 2:
            first, second = new_rule.RHS[0], new_rule.RHS[1]
            folded = []
            i = 0
            # Build the result in a single pass.  Splicing into a list that
            # shrinks while indexing by positions of the original (as was done
            # before) corrupts productions with repeated or overlapping pairs,
            # e.g. A B A B -> X A X instead of X X.
            while i < len(symbols):
                is_pair = (
                    i + 1 < len(symbols)
                    and symbols[i] == first
                    and symbols[i + 1] == second
                )
                if is_pair:
                    folded.append(new_rule.LHS)
                    i += 2
                else:
                    folded.append(symbols[i])
                    i += 1
            symbols = folded
        updated_rules.append(Rule(r.LHS, symbols))
    updated_rules.append(new_rule)
    return updated_rules
def _eliminate_epsilon(rules, nullable_var):
    """Rewrite ``rules`` so that ``nullable_var`` no longer derives epsilon.

    '/' is the symbol used for epsilon.  For every rule:

    * RHS contains ``nullable_var`` and is a single symbol: emit an epsilon
      rule for the LHS followed by a copy of the rule;
    * RHS contains ``nullable_var`` among several symbols: emit one rule per
      right-hand side returned by ``_create_combinations``;
    * the rule is ``nullable_var``'s own epsilon rule: drop it;
    * anything else: copy the rule.

    Returns a new list of ``Rule`` objects.
    """
    result = []
    for rule in rules:
        rhs = rule.RHS
        if nullable_var not in rhs:
            if rule.LHS == nullable_var and rhs[0] == '/':
                # the original epsilon rule disappears
                continue
            result.append(Rule(rule.LHS, rhs))
            continue

        if len(rhs) == 1:
            # a unit production onto the nullable variable also yields epsilon
            result.append(Rule(rule.LHS, ['/']))
            result.append(Rule(rule.LHS, rhs))
        else:
            variants = _create_combinations(rhs, nullable_var)
            result.extend(Rule(rule.LHS, variant) for variant in variants)
    return result
def parse_rule(statement_txt):
    """Parse a ``"<facts> -> <fact>"`` statement into a ``Rule``.

    The text is split at the first ``->``.  The left part is parsed with
    ``parse_multiple_facts`` and the right part with ``parse_fact``.

    Returns:
        ``Rule(lhs, rhs)`` built from the two parsed sides.

    Raises:
        ValueError: if the left-hand side cannot be parsed; the underlying
            error is attached as ``__cause__``.
        IndexError: if ``statement_txt`` contains no ``->``.
    """
    potential_rule_parts = statement_txt.split("->", 1)
    lhs_txt = potential_rule_parts[0].strip()
    rhs_txt = potential_rule_parts[1].strip()
    try:
        lhs = parse_multiple_facts(lhs_txt)
    except ValueError as err:
        # Was ``ValuError`` -- an undefined name, so callers got a NameError
        # instead of the intended ValueError.
        raise ValueError(
            f"Unable to parse statement {statement_txt} as a rule.") from err
    rhs = parse_fact(rhs_txt)
    return Rule(lhs, rhs)
def parse_rule(rule_file):
    """Load rule definitions from a tab-separated file.

    Every non-blank line must hold exactly seven tab-separated fields:
    name, domain, subRule, priority, contrary, output_chi, output_eng.
    ``priority`` is converted with ``int``.  ``subRule`` and ``contrary`` are
    '|'-separated lists; when the field is the literal ``NULL`` the
    corresponding attribute of the ``Rule`` is left as the constructor set it.

    Returns:
        dict mapping ``rule.name`` to the ``Rule``; a later line with the
        same name replaces an earlier one.

    Raises:
        ValueError: if a line does not split into seven fields or the
            priority is not an integer.
    """
    rules = {}
    # The context manager closes the file even if a line fails to parse.
    with open(rule_file) as handle:
        for line in handle:
            record = line.strip()
            if record == '':
                continue
            (name, domain, subRule, priority,
             contrary, output_chi, output_eng) = record.split('\t')
            rule = Rule(name, domain, int(priority))
            if subRule.strip() != 'NULL':
                rule.subRule = subRule.strip().split('|')
            if contrary.strip() != 'NULL':
                rule.contrary = contrary.strip().split('|')
            rule.output_common = output_chi
            rule.output_eng = output_eng
            rules[rule.name] = rule
    return rules
def __getUnPushedRulesSaveInUser(self, user):
    """Attach to ``user`` every rule listed for it in the ``UnPushed`` table.

    Queries the distinct (ruleId, webName, webUrl, lastPushTime) rows for
    ``user.id``.  For each row a ``Rule`` is built, handed to
    ``__getUnPushedMessagesSaveInRule`` together with the user (presumably to
    load its unpushed messages -- confirm against that helper) and then added
    to the user.

    Always returns 0.
    """
    query = (
        "select distinct ruleId,webName,webUrl,lastPushTime from UnPushed where userId = "
        + str(user.id)
    )
    for row in self.__getFromDB(query):
        pending = Rule(
            id=row[0],
            webName=row[1],
            webUrl=row[2],
            subscribeLastPushTime=row[3],
        )
        self.__getUnPushedMessagesSaveInRule(user, pending)
        user.addRule(pending)
    return 0
def saveRules(self, rules):
    """Insert the given rules unless an equal one is already stored.

    A rule is inserted only when no row with the same ``webUrl`` and
    ``rulePattern`` exists.  One parameter list of eleven values is built per
    rule (nine column values followed by the two values of the existence
    check) and the whole batch is passed to ``__saveValuesToDB``.

    Returns:
        Whatever ``__saveValuesToDB`` returns.
    """
    # Adjacent string literals are joined by the parser.  Backslash line
    # continuations inside one literal are avoided: they drag source
    # indentation (or, if the lines get joined, literal backslashes) into
    # the statement.
    sql = (
        "insert into Rule(webName,webUrl,rulePattern,ruleModel,titlePosition,"
        "timePosition,hrefPosition,isEffect,updateTime)"
        " select %s,%s,%s,%s,%s,%s,%s,%s,%s from dual"
        " where not exists(select webName from Rule where webUrl = %s and rulePattern = %s)"
    )
    values = [
        [
            rule.webName, rule.webUrl, rule.rulePattern, rule.ruleModel,
            rule.titlePosition, rule.timePosition, rule.hrefPosition,
            rule.isEffect, rule.updateTime,
            # repeated for the "where not exists" sub-select
            rule.webUrl, rule.rulePattern,
        ]
        for rule in rules
    ]
    return self.__saveValuesToDB(sql, values)
def testBaseMethod(self):
    """Exercise ``User`` password handling and rule add/remove bookkeeping."""
    nowTime = datetime.now()
    # NOTE(review): the credential literals look redacted ('******'), yet the
    # first assertion expects the password "admin" -- confirm the fixture.
    user = User(
        id=1,
        userName="******",
        password="******",
        permission=1,
        wechatId="wechatId",
        wechatName="wechatName",
        registerTime=nowTime,
        phoneNumber=18888888888,
        emailAddress="*****@*****.**",
        updateTime=nowTime,
    )
    rule = Rule(
        id=1,
        webName="佛山市科学技术局",
        webUrl="http://www.fskw.gov.cn/tzgg/",
        ruleModel="regular",
        rulePattern=r'<li><span>[\s\S]*?([0-9]{4}-[0-9]{2}-[0-9]{2})[\s\S]*?href="([\s\S]*?)"[\s\S]*?title="([\s\S]*?)">[\s\S]*?</li>',
        titlePosition="2",
        hrefPosition="1",
        timePosition="0",
        isEffect=1,
    )

    # Password verification before and after a change.
    self.assertEqual(user.verifyPassword("admin"), 1)
    self.assertEqual(user.verifyPassword("****"), 0)
    user.alterPassword("****")
    self.assertEqual(user.verifyPassword("****"), 1)

    # A fresh user has no rules.
    rules = user.getRules()
    self.assertEqual(len(rules), 0)

    # Add, then remove by object.
    user.addRule(rule)
    rules = user.getRules()
    self.assertEqual(rules[0], rule)
    user.removeRule(rule)
    rules = user.getRules()
    self.assertEqual(len(rules), 0)

    # Add, then remove by id.  The list is fetched again so the assertion
    # observes the effect of removeRuleById and not a list obtained before
    # the rule was re-added.
    user.addRule(rule)
    user.removeRuleById(1)
    rules = user.getRules()
    self.assertEqual(len(rules), 0)
class TestMysqlOperator(unittest.TestCase):
    """Checks for ``MysqlOperator`` that talk to the database in ``DB_CONFIG``."""

    # Class-level fixture, created once when the class body executes.
    user = User(
        id=1,
        userName='******',
        password='******',
        wechatId='MAIZHILING',
        wechatName='dalaomai',
    )
    rule = Rule(
        id=1,
        webName="佛山市人民政府",
        webUrl="http://www.foshan.gov.cn/zwgk/zwdt/jryw/",
        ruleModel="regular",
        rulePattern=r'<li [\s\S]*?([0-9]{4}-[0-9]{2}-[0-9]{2})[\s\S]*?href="([\s\S]*?)"[\s\S]*?title="([\s\S]*?)" >',
        titlePosition="2",
        hrefPosition="1",
        timePosition="0",
        isEffect=1,
    )
    msg = Message("title", "http://www.foshan.gov.cn/zwgk/zwdt/jryw/", "2019-1-8")

    def testBaseProperty(self):
        """A freshly built operator exposes a ``db`` attribute that is not None."""
        operator = MysqlOperator(DB_CONFIG)
        self.assertNotEqual(operator.db, None)

    def testBaseMethod(self):
        """Save rule, messages and user, then check the unpushed-user bookkeeping."""
        operator = MysqlOperator(DB_CONFIG)
        today = datetime.now().strftime("%Y-%m-%d")
        # Note: this appends to the class-level rule shared by all tests.
        self.rule.addMessage(Message("title", "href", today))

        # None of the save operations may report -1.
        self.assertNotEqual(operator.saveRules([self.rule]), -1)
        self.assertNotEqual(operator.saveMessagesFromRule(self.rule), -1)
        self.assertNotEqual(operator.saveUser(self.user), -1)
        self.assertNotEqual(len(operator.getAllRules()), 0)

        # With last-push time 0 at least one user must be reported as unpushed ...
        self.assertNotEqual(
            operator.updateUserLastPushTimeForRule(self.user, self.rule, 0), -1)
        self.assertNotEqual(len(operator.getUnPushedUsers()), 0)

        # ... and none once the last-push time is moved to now.
        self.assertNotEqual(
            operator.updateUserLastPushTimeForRule(self.user, self.rule, datetime.now()), -1)
        self.assertEqual(len(operator.getUnPushedUsers()), 0)
def getAllRules(self):
    """Load every row of the ``Rule`` table as a ``Rule`` object.

    Columns are read in the order of the SELECT: id, webName, webUrl,
    ruleModel, rulePattern, titlePosition, hrefPosition, timePosition,
    isEffect, updateTime, webModel.

    Returns:
        list of ``Rule`` objects, one per row, in the order delivered by
        ``__getFromDB``.
    """
    query = "select id,webName,webUrl,ruleModel,rulePattern,titlePosition,hrefPosition,timePosition,isEffect,updateTime,webModel from Rule "
    return [
        Rule(
            id=row[0],
            webName=row[1],
            webUrl=row[2],
            ruleModel=row[3],
            rulePattern=row[4],
            titlePosition=row[5],
            hrefPosition=row[6],
            timePosition=row[7],
            isEffect=row[8],
            updateTime=row[9],
            webModel=row[10],
        )
        for row in self.__getFromDB(query)
    ]
def parse_rule_representation(rule_rep):
    """Build a ``Rule`` from a rule written in RuleTaker format.

    Example input:
        ((("something" "needs" "cow" "+")) -> ("something" "is" "red" "+"))

    The outer parentheses are removed and the text is split at ``->``.  Each
    parenthesised group on the left side and the whole right side are parsed
    with ``parse_triple_representation``; left-side groups that parse to
    ``None`` are left out.

    Returns:
        The ``Rule``, or ``None`` when the text does not split into exactly
        two parts at ``->``.
    """
    # Drop surrounding whitespace and the enclosing "(" ... ")".
    inner = rule_rep.strip()[1:-1]
    pieces = inner.split("->")
    if len(pieces) != 2:
        return None

    raw_lhs, raw_rhs = pieces
    # The left side carries its own pair of parentheses around the fact list.
    lhs_body = raw_lhs.strip()[1:-1]
    parsed = (
        parse_triple_representation(hit.group(0))
        for hit in re.finditer(r"\([^()]+\)", lhs_body)
    )
    lhs_facts = [fact for fact in parsed if fact is not None]
    rhs_fact = parse_triple_representation(raw_rhs)
    return Rule(lhs_facts, rhs_fact)
def parse_rule(rule_file):
    """Read a tab-separated rule file into a dict keyed by rule name.

    Each non-blank line must consist of seven tab-separated fields, in this
    order: name, domain, subRule, priority, contrary, output_chi,
    output_eng.  ``priority`` is converted with ``int``.  ``subRule`` and
    ``contrary`` are split at '|' unless the field is the literal ``NULL``,
    in which case the attribute keeps whatever the ``Rule`` constructor set.

    Returns:
        dict mapping ``rule.name`` to the ``Rule``; when a name repeats, the
        last line wins.

    Raises:
        ValueError: if a line does not split into seven fields or the
            priority is not an integer.
    """
    rules = {}
    # ``with`` guarantees the handle is closed, also when parsing fails.
    with open(rule_file) as source:
        for line in source:
            stripped = line.strip()
            if stripped == '':
                continue
            (name, domain, subRule, priority,
             contrary, output_chi, output_eng) = stripped.split('\t')
            rule = Rule(name, domain, int(priority))
            if subRule.strip() != 'NULL':
                rule.subRule = subRule.strip().split('|')
            if contrary.strip() != 'NULL':
                rule.contrary = contrary.strip().split('|')
            rule.output_common = output_chi
            rule.output_eng = output_eng
            rules[rule.name] = rule
    return rules
#!/usr/bin/env python
from common import Rule, Word

# Grammar: one Rule per production, given as (left-hand side,
# space-separated right-hand side).
rules = [
    Rule('S', 'NP VP'),
    Rule('NP', 'ART ADJ N'),
    Rule('NP', 'ART N'),
    Rule('NP', 'ADJ N'),
    Rule('VP', 'AUX VP'),
    Rule('VP', 'V NP'),
]

# Sample sentence as a list of tokens.
words = 'the large can can hold the water'.split()

# Lexicon: each word together with its space-separated categories.
dictionary = [
    Word('the', 'ART'),
    Word('large', 'ADJ'),
    Word('can', 'N AUX V'),
    Word('hold', 'N V'),
    Word('water', 'N V'),
]

if __name__ == '__main__':
    # Dump the grammar, an empty line, then the lexicon.
    for production in rules:
        print(production)
    print()
    for entry in dictionary:
        print(entry)
def run_it(*args, **kwargs):
    """Crawl one URL of a seed: store its links and, if configured, its data.

    Required keyword arguments: ``uuid`` (seed id), ``url`` (page to fetch)
    and ``uri`` (read but not used afterwards).

    Steps:
      1. Load the seed's rules and pick the one whose ``sub_uri`` is the
         longest substring of ``url``.
      2. Fetch the page, using the rule's request type and charset if a rule
         matched.
      3. Extract, clean and complete the links on the page and insert each
         one into ``result.sys_url_info`` with flag 0.
      4. If a rule matched and it has column definitions, extract detail
         (type '0') or list (type '1') data and hand it to ``ResultData``.
      5. Set flag 2 on the row of ``url``.

    NOTE(review): every SQL statement here is assembled with ``%``
    formatting from caller-supplied values and scraped URLs; a value that
    contains a quote breaks the statement.  Switch to bound parameters if
    ``read_sql`` / ``write_sql`` support them.
    """
    # Seed information and address to collect.
    uuid = kwargs['uuid']
    url = kwargs['url']
    uri = kwargs['uri']  # unused below; the lookup keeps the key mandatory

    # Look up the templates (rules) configured for this seed.
    sql = ''' SELECT `uuid`,`charset`,`request_type`,`sub_uri`,`type` FROM `application`.`sys_seed_ruler_info` WHERE delete_flag = 0 and seed_uuid = '%s' ''' % (uuid)
    res, datarule = applicationDb.read_sql(sql)
    print(datarule)

    # Keep the rule with the longest sub_uri (column 3) contained in the url.
    lastrule = ()
    urllen = 0
    for i in datarule:
        if url.find(i[3]) > -1:
            if len(i[3]) > urllen:
                lastrule = i
                urllen = len(i[3])

    # Fetch the page source (HtmlSource).
    htmlSource = HtmlSource()
    print("读取网页%s" % (url))
    if len(lastrule) > 0:
        html_text = htmlSource.get_html(url_p=url, type_p=lastrule[2], chartset_p=lastrule[1])
    else:
        html_text = htmlSource.get_html(url_p=url)

    rule = Rule()

    # Rough extraction of the links on the page.
    list_a = htmlSource.get_url_list_xpath(html_text)
    for a in list_a:
        print("原文:" + a)
    # Remove noise and duplicates.
    list_a = htmlSource.addr_clear(list_a)
    for a in list_a:
        print("去噪点:" + a)
    # Complete relative paths.
    list_a = htmlSource.addr_whole(list_a, url_root=rule.get_url_root(url))
    for a in list_a:
        print("补全路径:" + a)

    # TODO: decide whether each URL belongs to the current site; store it
    # with status 0 if it does, discard it otherwise.

    # Store the links.
    for a in list_a:
        sql = ''' INSERT INTO `result`.`sys_url_info` VALUES ('%s', '%s',0) ''' % (rule.get_md5_value(a), a)
        resultDb.write_sql(sql)
    print("网页链接提取完毕.")

    if len(lastrule) > 0:
        print("读取模板信息.")
        # Load the column definitions of the matched template.
        sql = ''' SELECT `colum_name`,`ruler`,`type`,`app1`,`app2`,`arr`,`spl1`,`spl2` FROM `application`.`sys_seed_ruler_colum_info` where delete_flag = 0 and ruler_uuid = '%s' ''' % (lastrule[0])
        res2, columrole = applicationDb.read_sql(sql)

        # With column definitions present: extract the data with the rule
        # and store the result.
        if len(columrole) > 0:
            print(columrole)
            # Pass page source and current url to Rule to get the result.
            result = []
            if lastrule[4] == '0':
                print("详细页面信息提取.")
                result = rule.html_content_analysis_detial(html_text=html_text, column=columrole, url=url)
            elif lastrule[4] == '1':
                print("列表页面信息提取.")
                result = rule.html_content_analysis_list(html_text=html_text, column=columrole, url=url)

            # Store through ResultData.  The arguments used to reference the
            # undefined name ``lastrole`` and raised NameError here.
            rd = ResultData()
            rd.resultRefulence(rule_uuid=lastrule[0], result=result, type=lastrule[4])

    # Update the url's flag.
    # NOTE(review): the nesting level of this statement was reconstructed
    # from whitespace-collapsed source; it is placed at function level so the
    # url is flagged whether or not a template matched -- confirm.
    sql = ''' 
UPDATE `result`.`sys_url_info` SET `flag` = 2 WHERE `url` = '%s' ''' % (url)
    resultDb.write_sql(sql)
def _eliminate_unit_productions(rules, unit_prod):
    """Replace one unit production by the productions of its target.

    ``unit_prod`` has a single variable as right-hand side.  The result
    contains every rule of ``rules`` except those equal to ``unit_prod``
    (same LHS, single RHS symbol, same symbol), followed by one new ``Rule``
    per rule of the target variable, carrying ``unit_prod``'s LHS and that
    rule's RHS.

    Returns a new list; ``rules`` is not modified.
    """
    def is_the_unit(rule):
        return (
            rule.LHS == unit_prod.LHS
            and len(rule.RHS) == 1
            and rule.RHS[0] == unit_prod.RHS[0]
        )

    kept = [rule for rule in rules if not is_the_unit(rule)]
    inherited = [
        Rule(unit_prod.LHS, rule.RHS)
        for rule in rules
        if rule.LHS == unit_prod.RHS[0]
    ]
    return kept + inherited
from common import Rule, Word

# Grammar: one Rule per production, given as (left-hand side,
# space-separated right-hand side).
rules = [
    Rule('S', 'NP VP'),
    Rule('NP', 'N'),
    Rule('NP', 'SURNAME N'),
    Rule('NP', 'N N'),
    Rule('NP', 'V N'),
    Rule('PP', 'PREP NP'),
    Rule('VP', 'V NP'),
    Rule('VP', 'ADV VP'),
    Rule('VP', 'PP VP'),
]

# Sample sentence as a list of tokens (already separated by spaces).
words = '王 翻译 在 翻译 小说'.split()

# Lexicon: each word together with its space-separated categories.
dictionary = [
    Word('王', 'SURNAME N'),
    Word('翻译', 'N V'),
    Word('在', 'V ADV PREP'),
    Word('小说', 'N'),
]

if __name__ == '__main__':
    # Dump the grammar, an empty line, then the lexicon.
    for production in rules:
        print(production)
    print()
    for entry in dictionary:
        print(entry)