Esempio n. 1
0
def _replace_terminals(rules, new_r):
    """Swap a terminal for the variable that derives it.

    ``new_r`` supplies the replacement: its ``LHS`` is the variable and the
    first symbol of its ``RHS`` is the terminal being replaced.

    Every rule whose right-hand side contains that terminal *and* holds more
    than one symbol is rebuilt with each occurrence of the terminal replaced
    by the variable. All other rules are copied as they are. ``new_r`` is
    appended as the last element of the returned list.
    """
    result = []
    for rule in rules:
        symbols = rule.RHS
        needs_swap = new_r.RHS[0] in symbols and len(symbols) > 1
        if not needs_swap:
            # Single-symbol rules (and rules without the terminal) stay as is.
            result.append(Rule(rule.LHS, symbols))
            continue
        swapped = []
        for symbol in symbols:
            swapped.append(new_r.LHS if symbol == new_r.RHS[0] else symbol)
        result.append(Rule(rule.LHS, swapped))
    result.append(new_r)
    return result
Esempio n. 2
0
def convert_to_cnf(rules, start_symbol):
    """Rewrite a grammar step by step towards Chomsky normal form.

    A ``Grammar`` is built from ``rules`` and ``start_symbol`` and its rule
    list is then transformed in this order:

    1. introduce a fresh start symbol (``start_symbol + '1'``) when the
       original one is reported to occur on a right-hand side,
    2. eliminate epsilon productions,
    3. eliminate unit productions,
    4. replace terminals by freshly named variables where required,
    5. shorten long productions by folding their first two symbols into a
       freshly named variable.

    Returns the ``Grammar`` object carrying the rewritten rules.
    """
    grammar = Grammar(start_symbol, rules)
    variables = grammar.variables

    # Pool of fresh variable names. ``pop()`` takes from the end of the list,
    # so plain letters not yet used as variables are handed out first and the
    # "<letter>1" names only afterwards. ``start_symbol + '1'`` is left out
    # because step 1 may claim it.
    primed_names = [letter + '1' for letter in ascii_uppercase if letter != start_symbol]
    unused_letters = [letter for letter in ascii_uppercase if letter not in variables]
    available_vars = primed_names + unused_letters

    terminals = grammar.terminals

    # Step 1: start symbol must not appear on a right-hand side.
    if _check_start_symbol_rhs(grammar.rules, start_symbol):
        fresh_start = start_symbol + '1'
        grammar.rules = [Rule(fresh_start, [start_symbol])] + grammar.rules
        grammar.start_symbol = fresh_start

    # Step 2: remove epsilon productions, one nullable variable at a time.
    nullable_variable = _find_nullable_variable(grammar.rules)
    while nullable_variable:
        grammar.rules = _eliminate_epsilon(grammar.rules, nullable_variable)
        nullable_variable = _find_nullable_variable(grammar.rules)

    # Step 3: remove unit productions, cleaning up recursive units before the
    # search and after every elimination.
    grammar.rules = _eliminate_recursive_units(grammar.rules)
    unit_production = _find_unit_production(grammar.rules, variables)
    while unit_production:
        grammar.rules = _eliminate_unit_productions(grammar.rules, unit_production)
        grammar.rules = _eliminate_recursive_units(grammar.rules)
        unit_production = _find_unit_production(grammar.rules, variables)

    # Step 4: give terminals their own variable where needed.
    for terminal in terminals:
        if not _check_if_terminal_needs_to_be_replaced(grammar.rules, terminal):
            continue
        terminal_rule = Rule(available_vars.pop(), [terminal])
        grammar.rules = _replace_terminals(grammar.rules, terminal_rule)

    # Step 5: fold the leading pair of every long production into a new variable.
    long_production = _find_long_production(grammar.rules)
    while long_production:
        fresh_name = available_vars.pop()
        leading_pair = [long_production.RHS[0], long_production.RHS[1]]
        grammar.rules = _replace_long_productions(grammar.rules, Rule(fresh_name, leading_pair))
        long_production = _find_long_production(grammar.rules)

    return grammar
Esempio n. 3
0
 def run(self):
     """Refresh the stored messages of every rule.

     For each rule returned by ``self.dbo.getAllRules()`` the rule's page is
     downloaded, parsed into messages, the messages are attached to the rule
     and the rule is handed to ``self.dbo.saveMessagesFromRule``. A rule whose
     download yields ``-1`` is skipped.
     """
     rules = self.dbo.getAllRules()
     logger.info("get " + str(len(rules)) + " rules to update message")
     # No placeholder ``Rule()`` is created up front: the loop variable is
     # the only ``rule`` this method ever uses.
     for rule in rules:
         html = self.htmlDownload.download(rule.webUrl, rule.webModel)
         if html == -1:
             # The downloader signals failure with -1; move on to the next rule.
             continue
         messages = self.htmlParser.parse(html, rule)
         for msg in messages:
             rule.addMessage(msg)
         result = self.dbo.saveMessagesFromRule(rule)
         logger.info(rule.webUrl + " update " + str(result) + " messages")
0
class TestWechatPush(unittest.TestCase):
    """Checks of WechatPush using fixtures built once at class-definition time."""

    # Shared fixture: the user that messages are sent to.
    user = User(userName='******',
                password='******',
                wechatId='MAIZHILING',
                wechatName='dalaomai')
    # Shared fixture: a "regular" rule whose pattern has three capture groups
    # (a date, an href and a title).
    # NOTE(review): the *Position values presumably index those groups - confirm.
    rule = Rule(
        id=1,
        webName="佛山市人民政府",
        webUrl="http://www.foshan.gov.cn/zwgk/zwdt/jryw/",
        ruleModel="regular",
        rulePattern=
        r'<li [\s\S]*?([0-9]{4}-[0-9]{2}-[0-9]{2})[\s\S]*?href="([\s\S]*?)"[\s\S]*?title="([\s\S]*?)" >',
        titlePosition="2",
        hrefPosition="1",
        timePosition="0",
        isEffect=1)
    msg = Message("test", "http://www.foshan.gov.cn/zwgk/zwdt/jryw/",
                  "2019-1-8")
    # The same message is added to the rule twice and the same rule to the
    # user twice.
    # NOTE(review): presumably deliberate, to exercise duplicate handling - confirm.
    rule.addMessage(msg)
    rule.addMessage(msg)
    user.addRule(rule)
    user.addRule(rule)

    def testBaseMethod(self):
        # Each WechatPush call below is expected to return 0.
        # NOTE(review): these look like calls against the live service - confirm.
        wechatPush = WechatPush()
        self.assertEqual(wechatPush.getAccessToken(), 0)
        text = wechatPush.structureMessageTextByMessage(self.msg)
        self.assertEqual(wechatPush.sendMessages(self.user, text), 0)

        self.assertEqual(wechatPush.push(), 0)
        return
Esempio n. 5
0
def preprocess_rules(rules):
    """Convert textual grammar rules into ``Rule`` objects.

    Each entry of ``rules`` is expected to have the form
    ``"LHS -> sym sym | sym ..."``. The text before ``->`` becomes the
    left-hand side; the text after it is split on ``|`` into alternatives and
    every alternative is split on whitespace into its symbols. One ``Rule`` is
    produced per alternative, in input order.

    Blank entries are skipped. This includes whitespace-only entries such as
    a lone newline left over from reading a file, which previously crashed
    with an ``IndexError``.

    Raises:
        IndexError: if a non-blank entry contains no ``->``.
    """
    result = []
    for line in rules:
        if not line.strip():
            continue
        # split left and right hand sides, strip unnecessary spaces
        parts = [part.strip() for part in line.split('->')]
        lhs, rhs = parts[0], parts[1]
        # ``split('|')`` yields a single item when there is no '|', so rules
        # with one production need no special case.
        for production in rhs.split('|'):
            result.append(Rule(lhs, production.split()))
    return result
Esempio n. 6
0
def strip_quotation_marks(rules):
    """Unquote rules whose first right-hand symbol is wrapped in single quotes.

    When the first symbol of a rule's ``RHS`` matches ``'...'`` from its
    start, the rule is replaced by a new ``Rule`` with the same ``LHS`` and a
    right-hand side consisting of just the unquoted text. Rules that do not
    match are passed through as the very same objects.
    """
    quoted = re.compile(r"\'(.+)\'")

    def unquote(rule):
        found = quoted.match(rule.RHS[0])
        if found is None:
            return rule
        return Rule(rule.LHS, [found.group(1)])

    return [unquote(rule) for rule in rules]
Esempio n. 7
0
def _replace_long_productions(rules, new_rule):
    """Fold a pair of adjacent symbols into the variable that derives it.

    ``new_rule`` has the form ``X -> A B``. In every rule whose right-hand
    side holds more than two symbols, each non-overlapping occurrence of the
    adjacent pair ``A B`` (scanning left to right) is replaced by ``X``.
    Shorter rules are copied unchanged. ``new_rule`` is appended as the last
    element of the returned list.

    The scan builds the new right-hand side directly, so repeated or
    overlapping occurrences are handled correctly: ``A B A B`` becomes
    ``X X`` and ``A A A`` (with ``X -> A A``) becomes ``X A``. Replacing
    in place while indexing by positions of the original list would corrupt
    such productions.
    """
    updated_rules = []
    for r in rules:
        symbols = r.RHS
        if len(symbols) <= 2:
            # Already short enough; keep a copy of the right-hand side.
            updated_rules.append(Rule(r.LHS, symbols[:]))
            continue

        new_prod = []
        i = 0
        while i < len(symbols):
            is_pair = (i + 1 < len(symbols)
                       and symbols[i] == new_rule.RHS[0]
                       and symbols[i + 1] == new_rule.RHS[1])
            if is_pair:
                new_prod.append(new_rule.LHS)
                i += 2  # consume both symbols so matches never overlap
            else:
                new_prod.append(symbols[i])
                i += 1
        updated_rules.append(Rule(r.LHS, new_prod))
    updated_rules.append(new_rule)
    return updated_rules
Esempio n. 8
0
def _eliminate_epsilon(rules, nullable_var):
    """Remove the epsilon production of ``nullable_var`` from ``rules``.

    ``'/'`` is the epsilon marker. The returned list is built as follows:

    * a rule whose right-hand side contains ``nullable_var`` and has exactly
      one symbol yields an epsilon rule for its left-hand side followed by a
      copy of the rule itself;
    * a longer rule containing ``nullable_var`` yields one rule per
      right-hand side returned by ``_create_combinations``;
    * the epsilon rule of ``nullable_var`` itself is dropped;
    * every other rule is copied unchanged.
    """
    result = []
    for rule in rules:
        lhs, rhs = rule.LHS, rule.RHS

        if nullable_var not in rhs:
            if lhs == nullable_var and rhs[0] == '/':
                # This is the epsilon rule being eliminated.
                continue
            result.append(Rule(lhs, rhs))
            continue

        if len(rhs) == 1:
            # Unit production on the nullable variable: the left-hand side
            # becomes nullable too, and the unit rule is kept.
            result.append(Rule(lhs, ['/']))
            result.append(Rule(lhs, rhs))
        else:
            result.extend(
                Rule(lhs, combination)
                for combination in _create_combinations(rhs, nullable_var)
            )
    return result
Esempio n. 9
0
def parse_rule(statement_txt):
    """Parse a statement of the form ``"<facts> -> <fact>"`` into a ``Rule``.

    The text is split at the first ``->``. The stripped left part is parsed
    with ``parse_multiple_facts`` and the stripped right part with
    ``parse_fact``.

    Raises:
        ValueError: if the statement contains no ``->`` separator, or if
            ``parse_multiple_facts`` rejects the left-hand side. In the
            latter case the original error is chained as the cause.
    """
    lhs_txt, arrow, rhs_txt = statement_txt.partition("->")
    if not arrow:
        # Without a separator there is no right-hand side to parse.
        raise ValueError(
            f"Unable to parse statement {statement_txt} as a rule.")
    try:
        lhs = parse_multiple_facts(lhs_txt.strip())
    except ValueError as err:
        raise ValueError(
            f"Unable to parse statement {statement_txt} as a rule.") from err
    rhs = parse_fact(rhs_txt.strip())
    return Rule(lhs, rhs)
Esempio n. 10
0
def parse_rule(rule_file):
    """Load rule definitions from a tab-separated text file.

    Every non-blank line must hold exactly seven tab-separated fields:
    name, domain, subRule, priority, contrary, output_chi, output_eng.
    ``priority`` is converted to ``int``. ``subRule`` and ``contrary`` are
    ``|``-separated lists and are only assigned when the field is not the
    literal ``NULL``. ``output_chi`` is stored as ``rule.output_common``.

    Returns:
        dict mapping each rule's ``name`` attribute to its ``Rule``. A later
        line with the same name replaces an earlier one.

    Raises:
        ValueError: if a line does not have seven fields or ``priority`` is
            not an integer.
    """
    rules = {}

    # The context manager closes the file even when a malformed line raises.
    with open(rule_file) as rule_lines:
        for line in rule_lines:
            record = line.strip()
            if record == '':
                continue
            name, domain, subRule, priority, contrary, output_chi, output_eng = record.split('\t')

            rule = Rule(name, domain, int(priority))

            if subRule.strip() != 'NULL':
                rule.subRule = subRule.strip().split('|')

            if contrary.strip() != 'NULL':
                rule.contrary = contrary.strip().split('|')

            rule.output_common = output_chi
            rule.output_eng = output_eng

            rules[rule.name] = rule

    return rules
Esempio n. 11
0
 def __getUnPushedRulesSaveInUser(self, user):
     '''
     Attach to ``user`` every rule listed for them in the UnPushed table.

     One Rule is built per distinct (ruleId, webName, webUrl, lastPushTime)
     row for ``user.id``. Each rule is first passed, together with the user,
     to ``__getUnPushedMessagesSaveInRule`` and then added to the user.
     Always returns 0.
     '''
     query_prefix = "select distinct ruleId,webName,webUrl,lastPushTime from UnPushed where userId = "
     query = query_prefix + str(user.id)
     for row in self.__getFromDB(query):
         pending_rule = Rule(
             id=row[0],
             webName=row[1],
             webUrl=row[2],
             subscribeLastPushTime=row[3],
         )
         self.__getUnPushedMessagesSaveInRule(user, pending_rule)
         user.addRule(pending_rule)
     return 0
Esempio n. 12
0
    def saveRules(self, rules):
        '''
        Insert the given rules into the Rule table.

        The statement only inserts a rule when no row with the same webUrl
        and rulePattern exists yet; that is why both values appear a second
        time at the end of each parameter list.

        Returns whatever ``__saveValuesToDB`` returns for the batch.
        '''
        sql = "insert into Rule(webName,webUrl,rulePattern,ruleModel,titlePosition,timePosition,hrefPosition,isEffect,updateTime)\
       select %s,%s,%s,%s,%s,%s,%s,%s,%s from dual\
      where not exists(select webName from Rule where webUrl = %s and rulePattern = %s)"

        # One parameter list per rule, in the order of the placeholders above.
        values = [
            [
                rule.webName, rule.webUrl, rule.rulePattern, rule.ruleModel,
                rule.titlePosition, rule.timePosition, rule.hrefPosition,
                rule.isEffect, rule.updateTime, rule.webUrl, rule.rulePattern
            ]
            for rule in rules
        ]
        return self.__saveValuesToDB(sql, values)
Esempio n. 13
0
    def testBaseMethod(self):
        # Exercises password handling and rule bookkeeping on a single User.
        # The steps mutate ``user`` and depend on running in this order.
        nowTime = datetime.now()
        user = User(id=1,
                    userName="******",
                    password="******",
                    permission=1,
                    wechatId="wechatId",
                    wechatName="wechatName",
                    registerTime=nowTime,
                    phoneNumber=18888888888,
                    emailAddress="*****@*****.**",
                    updateTime=nowTime)
        rule = Rule(
            id=1,
            webName="佛山市科学技术局",
            webUrl="http://www.fskw.gov.cn/tzgg/",
            ruleModel="regular",
            rulePattern=
            r'<li><span>[\s\S]*?([0-9]{4}-[0-9]{2}-[0-9]{2})[\s\S]*?href="([\s\S]*?)"[\s\S]*?title="([\s\S]*?)">[\s\S]*?</li>',
            titlePosition="2",
            hrefPosition="1",
            timePosition="0",
            isEffect=1)
        # verifyPassword is expected to return 1 on a match and 0 otherwise.
        # NOTE(review): the credentials above look redacted in this copy, so
        # whether "admin" matches the constructed password cannot be checked
        # here - confirm against the original source.
        self.assertEqual(user.verifyPassword("admin"), 1)
        self.assertEqual(user.verifyPassword("****"), 0)
        user.alterPassword("****")
        self.assertEqual(user.verifyPassword("****"), 1)

        # A freshly built user has no rules.
        rules = user.getRules()
        self.assertEqual(len(rules), 0)

        # Adding a rule makes it the first entry of getRules().
        user.addRule(rule)
        rules = user.getRules()
        self.assertEqual(rules[0], rule)

        # Removing by object empties the list again.
        user.removeRule(rule)
        rules = user.getRules()
        self.assertEqual(len(rules), 0)

        # Removing by id has the same effect.
        user.addRule(rule)
        user.removeRuleById(1)  # 1 is the id the rule above was built with
        rules = user.getRules()
        self.assertEqual(len(rules), 0)
        pass
Esempio n. 14
0
class TestMysqlOperator(unittest.TestCase):
    """Checks of MysqlOperator; every test opens a connection from DB_CONFIG."""

    # Class-level fixtures, built once when the class body is executed.
    user = User(id =1,
                userName='******',
                password='******',
                wechatId='MAIZHILING',
                wechatName='dalaomai')
    rule = Rule(    id=1,
                    webName="佛山市人民政府",
                    webUrl = "http://www.foshan.gov.cn/zwgk/zwdt/jryw/",
                    ruleModel = "regular",
                    rulePattern=r'<li [\s\S]*?([0-9]{4}-[0-9]{2}-[0-9]{2})[\s\S]*?href="([\s\S]*?)"[\s\S]*?title="([\s\S]*?)" >',
                    titlePosition="2",
                    hrefPosition = "1",
                    timePosition="0",
                    isEffect = 1
                    )
    # Not referenced by the tests visible in this class; testBaseMethod builds
    # its own local message.
    msg = Message("title","http://www.foshan.gov.cn/zwgk/zwdt/jryw/","2019-1-8")

    def testBaseProperty(self):
        # A constructed operator must expose a non-None ``db`` attribute.
        mysqlOperator = MysqlOperator(DB_CONFIG)
        self.assertNotEqual(mysqlOperator.db,None)

    def testBaseMethod(self):
        # The save/update calls are only checked for not returning -1.
        # NOTE(review): -1 is presumably the operator's failure code - confirm.
        mysqlOperator = MysqlOperator(DB_CONFIG)
        nowTime = datetime.now()


        # Attach a message dated today to the shared rule, then persist the
        # rule, its messages and the user.
        msg = Message("title","href",nowTime.strftime("%Y-%m-%d"))
        self.rule.addMessage(msg)
        self.assertNotEqual(mysqlOperator.saveRules([self.rule]),-1)
        self.assertNotEqual(mysqlOperator.saveMessagesFromRule(self.rule),-1)
        self.assertNotEqual(mysqlOperator.saveUser(self.user),-1)
        self.assertNotEqual(len(mysqlOperator.getAllRules()),0)

        # With the last push time set to 0 at least one un-pushed user is
        # expected; after setting it to the current time, none.
        self.assertNotEqual(mysqlOperator.updateUserLastPushTimeForRule(self.user,self.rule,0),-1)
        self.assertNotEqual(len(mysqlOperator.getUnPushedUsers()),0)
        self.assertNotEqual(mysqlOperator.updateUserLastPushTimeForRule(self.user,self.rule,datetime.now()),-1)
        self.assertEqual(len(mysqlOperator.getUnPushedUsers()),0)

        return
Esempio n. 15
0
    def getAllRules(self):
        '''
        Load every row of the Rule table as a Rule object.

        Selected columns, in order: id, webName, webUrl, ruleModel,
        rulePattern, titlePosition, hrefPosition, timePosition, isEffect,
        updateTime, webModel. Each column is passed to the Rule constructor
        under the keyword of the same name.

        Returns a list of Rule objects, one per row.
        '''
        query = "select id,webName,webUrl,ruleModel,rulePattern,titlePosition,hrefPosition,timePosition,isEffect,updateTime,webModel from Rule "
        return [
            Rule(
                id=row[0],
                webName=row[1],
                webUrl=row[2],
                ruleModel=row[3],
                rulePattern=row[4],
                titlePosition=row[5],
                hrefPosition=row[6],
                timePosition=row[7],
                isEffect=row[8],
                updateTime=row[9],
                webModel=row[10],
            )
            for row in self.__getFromDB(query)
        ]
Esempio n. 16
0
def parse_rule_representation(rule_rep):
    """Build a Rule from a string in RuleTaker format.

    Example input:
        ((("something" "needs" "cow" "+")) -> ("something" "is" "red" "+"))

    The outermost parentheses are removed and the remainder is split on
    "->". Unless that yields exactly two parts, None is returned. Otherwise
    each parenthesised group on the left is parsed with
    parse_triple_representation (results that are None are dropped), the
    right part is parsed the same way, and a Rule is built from both.
    """
    # Drop surrounding whitespace, then the enclosing "(" and ")".
    inner = rule_rep.strip()[1:-1]
    sides = inner.split("->")
    if len(sides) != 2:
        return None

    # The left-hand side carries its own enclosing parentheses.
    lhs_txt = sides[0].strip()[1:-1]
    rhs_txt = sides[1]

    parsed = (
        parse_triple_representation(group.group(0))
        for group in re.finditer(r"\([^()]+\)", lhs_txt)
    )
    lhs_facts = [fact for fact in parsed if fact is not None]
    rhs_fact = parse_triple_representation(rhs_txt)
    return Rule(lhs_facts, rhs_fact)
Esempio n. 17
0
def parse_rule(rule_file):
    """Load rule definitions from a tab-separated text file.

    Every non-blank line must hold exactly seven tab-separated fields:
    name, domain, subRule, priority, contrary, output_chi, output_eng.
    ``priority`` is converted to ``int``. ``subRule`` and ``contrary`` are
    ``|``-separated lists and are only assigned when the field is not the
    literal ``NULL``. ``output_chi`` is stored as ``rule.output_common``.

    Returns:
        dict mapping each rule's ``name`` attribute to its ``Rule``. A later
        line with the same name replaces an earlier one.

    Raises:
        ValueError: if a line does not have seven fields or ``priority`` is
            not an integer.
    """
    rules = {}

    # The context manager closes the file even when a malformed line raises.
    with open(rule_file) as rule_lines:
        for line in rule_lines:
            record = line.strip()
            if record == '':
                continue
            (name, domain, subRule, priority, contrary, output_chi,
             output_eng) = record.split('\t')

            rule = Rule(name, domain, int(priority))

            if subRule.strip() != 'NULL':
                rule.subRule = subRule.strip().split('|')

            if contrary.strip() != 'NULL':
                rule.contrary = contrary.strip().split('|')

            rule.output_common = output_chi
            rule.output_eng = output_eng

            rules[rule.name] = rule

    return rules
Esempio n. 18
0
#!/usr/bin/env python
Esempio n. 19
0
from common import Rule, Word

# Sample grammar: each Rule is built from a left-hand symbol and a string of
# space-separated right-hand symbols.
rules = [
    Rule('S', 'NP VP'),
    Rule('NP', 'ART ADJ N'),
    Rule('NP', 'ART N'),
    Rule('NP', 'ADJ N'),
    Rule('VP', 'AUX VP'),
    Rule('VP', 'V NP'),
]

# Sample sentence, already tokenised into a list of words.
words = 'the large can can hold the water'.split()

# Lexicon: each Word is built from a surface form and a string of
# space-separated categories.
dictionary = [
    Word('the', 'ART'),
    Word('large', 'ADJ'),
    Word('can', 'N AUX V'),
    Word('hold', 'N V'),
    Word('water', 'N V'),
]

if __name__ == '__main__':
    # Dump the grammar, a blank line, then the lexicon.
    for i in rules:
        print(i)

    print()

    for i in dictionary:
        print(i)
Esempio n. 20
0
def run_it(*args, **kwargs):
    """Crawl one page: store its links and, if a template matches, its data.

    Keyword Args:
        uuid: seed identifier used to look up the configured rule templates.
        url: address of the page to fetch.
        uri: must be supplied, but is not used by this function.

    Steps:
        1. Load the seed's rule templates and pick the one whose ``sub_uri``
           is the longest substring of ``url`` (if any).
        2. Download the page, with the template's request type and charset
           when a template matched.
        3. Extract, clean and complete the links on the page and insert them
           into ``sys_url_info`` with status 0.
        4. When a template matched and has column definitions, extract the
           page content (detail or list mode, per the template's ``type``)
           and hand it to ``ResultData``.
        5. Mark ``url`` itself with flag 2.

    Raises:
        KeyError: if ``uuid``, ``url`` or ``uri`` is missing.

    NOTE(review): every SQL statement below is built by string interpolation.
    If uuid/url/extracted links can contain untrusted text this is open to
    SQL injection. Parameterising needs support from read_sql/write_sql,
    whose signatures are not visible here.
    """
    # Receive the seed information and the address to crawl
    uuid = kwargs['uuid']
    url = kwargs['url']
    uri = kwargs['uri']  # not used below; lookup kept so a missing key still raises
    # Check whether template configuration exists for this seed
    sql = '''
        SELECT  `uuid`,`charset`,`request_type`,`sub_uri`,`type`
        FROM `application`.`sys_seed_ruler_info`
        WHERE delete_flag = 0
        and seed_uuid = '%s'
    ''' % (uuid)
    res, datarule = applicationDb.read_sql(sql)

    print(datarule)
    # Keep the template whose sub_uri (column 3) is the longest match in url
    lastrule = ()
    urllen = 0
    for i in datarule:
        if url.find(i[3]) > -1:
            if len(i[3]) > urllen:
                lastrule = i
                urllen = len(i[3])
    # Fetch the page source (HtmlSource)
    htmlSource = HtmlSource()

    print("读取网页%s" %(url))
    if len(lastrule) > 0:
        html_text = htmlSource.get_html(url_p=url, type_p=lastrule[2], chartset_p=lastrule[1])
    else:
        html_text = htmlSource.get_html(url_p=url)
    rule = Rule()
    # Coarse extraction of the URLs on the page
    list_a = htmlSource.get_url_list_xpath(html_text)
    for a in list_a:
        print("原文:"+a)

    list_a = htmlSource.addr_clear(list_a)  # remove noise and duplicates
    for a in list_a:
        print("去噪点:"+a)
    list_a = htmlSource.addr_whole(list_a, url_root=rule.get_url_root(url))  # complete the paths
    for a in list_a:
        print("补全路径:" + a)

    # TODO: check whether each URL belongs to the current site;
    # if it does, store it with status 0,
    # otherwise discard it.

    # Store the extracted links
    for a in list_a:
        sql ='''
            INSERT INTO `result`.`sys_url_info`
            VALUES ('%s', '%s',0)
        '''%(rule.get_md5_value(a),a)
        resultDb.write_sql(sql)
    print("网页链接提取完毕.")
    if len(lastrule) > 0:
        print("读取模板信息.")
        # Fetch the column definitions of the matched template
        sql ='''
            SELECT `colum_name`,`ruler`,`type`,`app1`,`app2`,`arr`,`spl1`,`spl2`
            FROM `application`.`sys_seed_ruler_colum_info`
            where delete_flag = 0
            and ruler_uuid = '%s'
        ''' %(lastrule[0])
        res2, columrole = applicationDb.read_sql(sql)

        # If columns are configured: run the extraction rules on the page and
        # store the result, which completes the crawl task
        if len(columrole) > 0:
            print(columrole)
            # Pass the page source and the current url to Rule to get the result
            result = []
            if lastrule[4] == '0':
                print("详细页面信息提取.")
                result = rule.html_content_analysis_detial(html_text=html_text, column=columrole, url=url)

            elif lastrule[4] == '1':
                print("列表页面信息提取.")
                result = rule.html_content_analysis_list(html_text=html_text, column=columrole, url=url)

            # Store the result through ResultData. The template row is
            # ``lastrule``; the previous spelling ``lastrole`` was an
            # undefined name and raised NameError here.
            rd = ResultData()
            rd.resultRefulence(rule_uuid=lastrule[0], result=result, type=lastrule[4])


    # Mark the crawled url
    sql ='''
        UPDATE `result`.`sys_url_info`
        SET `flag` = 2
        WHERE `url` = '%s'
    ''' %(url)
    resultDb.write_sql(sql)
Esempio n. 21
0
def _eliminate_unit_productions(rules, unit_prod):
    """Replace the unit production ``A -> B`` by ``A``-rules copied from ``B``.

    The result holds every rule of ``rules`` except those identical to
    ``unit_prod`` (same ``LHS`` and a one-symbol ``RHS`` equal to the unit's
    target), followed by one new rule ``A -> rhs`` for each rule ``B -> rhs``
    found in ``rules``.
    """
    def is_the_unit(rule):
        return (rule.LHS == unit_prod.LHS
                and len(rule.RHS) == 1
                and rule.RHS[0] == unit_prod.RHS[0])

    kept = [rule for rule in rules if not is_the_unit(rule)]

    inherited = []
    for rule in rules:
        if rule.LHS == unit_prod.RHS[0]:
            inherited.append(Rule(unit_prod.LHS, rule.RHS))

    return kept + inherited
Esempio n. 22
0
from common import Rule, Word

# Sample grammar: each Rule is built from a left-hand symbol and a string of
# space-separated right-hand symbols.
rules = [
    Rule('S', 'NP VP'),
    Rule('NP', 'N'),
    Rule('NP', 'SURNAME N'),
    Rule('NP', 'N N'),
    Rule('NP', 'V N'),
    Rule('PP', 'PREP NP'),
    Rule('VP', 'V NP'),
    Rule('VP', 'ADV VP'),
    Rule('VP', 'PP VP'),
]

# Sample sentence, already tokenised into a list of words.
words = '王 翻译 在 翻译 小说'.split()

# Lexicon: each Word is built from a surface form and a string of
# space-separated categories.
dictionary = [
    Word('王', 'SURNAME N'),
    Word('翻译', 'N V'),
    Word('在', 'V ADV PREP'),
    Word('小说', 'N'),
]

if __name__ == '__main__':
    # Dump the grammar, a blank line, then the lexicon.
    for i in rules:
        print(i)

    print()

    for i in dictionary:
        print(i)