def _checkSymbolFormRuleKey(self, symStr):
    """Validate the key (left-hand side) of a production rule.

    Every character of the key must come from the symbol characters
    (``SYM``), the step separator (``STEP``) or the expression characters
    (``EXPRESS``). In addition the key must contain at least one symbol
    character or the match-all expression (``EXPRESSALL``). The empty
    string is accepted without any further checks.

    Returns None when the key is acceptable.

    Raises error.TransitionSyntaxError when an illegal character is found
    or when the key names no source symbol.

    >>> g = Grammar()
    >>> g._checkSymbolFormRuleKey('wer')
    """
    if symStr == '':
        # the empty key is explicitly permitted
        return

    allowed = self.SYM + self.STEP + ''.join(self.EXPRESS)
    # report the first offending character only
    illegal = next((c for c in symStr if c not in allowed), None)
    if illegal is not None:
        raise error.TransitionSyntaxError(
            "rule definition uses illegal characters (%s)" % illegal)

    # a key made only of separators/operators names nothing: require at
    # least one symbol character or the match-all expression
    if not any(c in symStr for c in self.SYM + self.EXPRESSALL):
        raise error.TransitionSyntaxError(
            "rule definition does not define source symbol")
def _parseWeightValue(self, pairWeight):
    """Build ``self._weightSrc`` from raw weight definitions.

    ``pairWeight`` maps a raw transition key to a raw value string. Each
    key is converted with ``_parseWeightKey``; each value is split on
    ``ASSIGNDELIMIT`` into ``symbol<ASSIGN>weight`` entries. The result,
    stored in ``self._weightSrc``, maps every parsed key to a list of
    ``(symbol, weight)`` pairs. Entries that contain no assignment
    operator are skipped, so not every weight has to be defined.

    Raises error.TransitionSyntaxError when an entry has more than one
    assignment operator, or when its weight converts to None or to zero.
    """
    self._weightSrc = {}
    for rawKey, rawValue in pairWeight.items():
        parsedKey = self._parseWeightKey(rawKey)
        pairs = []
        for assign in rawValue.split(self.ASSIGNDELIMIT):
            fields = assign.split(self.ASSIGN)
            if len(fields) == 1:
                # no assignment operator: nothing to record for this entry
                continue
            if len(fields) > 2:
                # more than one assignment operator in a single entry
                raise error.TransitionSyntaxError("incorrect weight specification: %s" % assign)
            symbol, rawWeight = fields
            # NOTE(review): per the original comments strToNum converts to
            # int or float, rejects negatives and returns None on error
            number = drawer.strToNum(rawWeight, 'num', 0, None)
            # zero weights are rejected: the original author notes that
            # accepting them causes side effects elsewhere
            if number in (None, 0):
                raise error.TransitionSyntaxError("bad weight value given: %s" % assign)
            pairs.append((symbol, number))
        self._weightSrc[parsedKey] = pairs
def _checkRuleReference(self):
    """Verify that every rule refers only to defined symbols.

    Rule inputs (keys of ``self._rules``) may use defined symbols or the
    match-all expression ``EXPRESSALL``; rule outputs may use defined
    symbols only. An empty input key and an empty output option are both
    accepted. As a side effect ``self._maxRuleOutputSize`` is set to the
    length of the longest output option inspected.

    This method is called in _parse().

    Raises error.TransitionSyntaxError when an input or an output
    references an undefined symbol, or when a rule has no output option.
    """
    self._maxRuleOutputSize = 0
    outputSyms = list(self._symbols.keys())
    inputSyms = outputSyms + [self.EXPRESSALL]

    for inRule, outRule in self._rules.items():
        environment.printDebug(
            ['in rule', repr(inRule), 'out rule', outRule])

        # the empty string is permitted as input (not yet fully implemented)
        if inRule != '' and not all(part in inputSyms for part in inRule):
            raise error.TransitionSyntaxError(
                "source rule component (%s) references an undefined symbol" % inRule)

        # each rule has one or more (option, weight) pairs
        # NOTE: this assumes no delimiters are used inside an option
        valid = False
        for option, weight in outRule:
            self._maxRuleOutputSize = max(self._maxRuleOutputSize,
                                          len(option))
            if option == '':
                # an empty output option is acceptable
                valid = True
            for char in option:
                valid = char in outputSyms
                if not valid:
                    break
            if not valid:
                break
        if not valid:
            raise error.TransitionSyntaxError(
                "destination rule component (%s) references an undefined symbol" % outRule)
def _parse(self, usrStr):
    """Parse a transition string into symbol and weight definitions.

    The notation is a sequence of ``<key>{<value>}`` pairs of two kinds:

        symbol definitions:  name{symbol}
            keys are alphabetic, values can be anything (including lists)
        weight definitions:  transition{name=weight|name=weight}
            keys are source transition statements using the step
            indicator (:)

    Weight keys support a limited set of expressions:

        t:*:t     match any one symbol in the second place (not the same
                  as zero or more)
        t:-t:t    match anything that is not t
        t:w|q:t   match either (alternation)

    A pair is treated as a weight definition when its key contains the
    step indicator or its value contains the assignment operator;
    otherwise it is a symbol definition.

    On success ``self._symbols`` holds the symbol definitions, the weight
    definitions have been passed to ``_parseWeightValue`` and
    ``_checkSymbolUsage`` has been run.

    Raises error.TransitionSyntaxError on badly placed delimiters, when no
    symbols or no weights are defined, or when any of the called checks
    rejects the input.
    """
    self._parseValidate(usrStr)
    # NOTE(review): per the original comments, cleaning removes all spaces
    # from keys and values -- confirm against _parseClean
    usrStr = self._parseClean(usrStr)

    pairSymbol = {}
    pairWeight = {}
    for double in usrStr.split(self.CLOSE):
        if self.OPEN not in double:
            continue
        try:
            key, value = double.split(self.OPEN)
        except ValueError:
            # more than one opening delimiter in a group; only the unpack
            # failure is translated, so unrelated exceptions (for example
            # KeyboardInterrupt) are no longer swallowed
            raise error.TransitionSyntaxError("badly placed delimiters")

        # keys are lower-cased and stripped of spaces
        key = drawer.strScrub(key, 'lower', [' '])

        if self.STEP in key or self.ASSIGN in value:
            # weight definition: values are stored in lower case
            self._checkSymbolFormWeightKey(key)
            pairWeight[key] = drawer.strScrub(value, 'lower', [' '])
        else:
            # symbol definition: raises on a bad key, value keeps its case
            self._checkSymbolFormDef(key)
            pairSymbol[key] = drawer.strScrub(value, None, [' '])

    # this initializes the symbol table
    if not pairSymbol:
        raise error.TransitionSyntaxError("no symbols defined")
    self._symbols = pairSymbol

    # pass the pair dictionary to the weight parser
    if not pairWeight:
        raise error.TransitionSyntaxError("no weights defined")
    self._parseWeightValue(pairWeight)

    # check symbol usage and determine orders
    self._checkSymbolUsage()
def _checkSymbolFormWeightKey(self, symStr):
    """Validate the characters of a weight-definition key.

    A weight key may contain symbol characters (``SYM``), the step
    separator (``STEP``) and any of the expression characters
    (``EXPRESS``). Returns None when every character is acceptable.

    Raises error.TransitionSyntaxError naming the first illegal character.
    """
    allowed = self.SYM + self.STEP + ''.join(self.EXPRESS)
    illegal = next((c for c in symStr if c not in allowed), None)
    if illegal is not None:
        raise error.TransitionSyntaxError("symbol definition uses illegal characters (%s)" % illegal)
def _parseValidate(self, usrStr):
    """Check that opening and closing braces occur equally often.

    Only the number of ``OPEN`` and ``CLOSE`` delimiters is compared;
    their order and nesting are not inspected here.

    Raises error.TransitionSyntaxError when the counts differ.

    >>> g = Grammar()
    >>> g._parseValidate('sdf{3}')
    >>> g._parseValidate('sdf{3}}')
    Traceback (most recent call last):
    TransitionSyntaxError: all braces not paired
    """
    opened = usrStr.count(self.OPEN)
    closed = usrStr.count(self.CLOSE)
    if opened == closed:
        return
    raise error.TransitionSyntaxError("all braces not paired")
def _parseWeightKey(self, key):
    """Convert a raw weight key into a tuple of key segments.

    The key is split on the step delimiter and empty segments are
    dropped. A segment without an expression operator is kept as a plain
    string. A segment with exactly one kind of operator becomes a
    sub-tuple led by that operator; tuples are used throughout because
    the result serves as a dictionary key. A key that is only the step
    delimiter yields the empty tuple.

    Raises error.TransitionSyntaxError when a segment uses more than one
    kind of operator.

    >>> a = Transition()
    >>> a._parseWeightKey('a:b:c')
    ('a', 'b', 'c')
    >>> a._parseWeightKey('a:b:c|d')
    ('a', 'b', ('|', 'c', 'd'))
    >>> a._parseWeightKey('a:b:c|d|e')
    ('a', 'b', ('|', 'c', 'd', 'e'))
    >>> a._parseWeightKey('a:*:c')
    ('a', ('*',), 'c')
    >>> a._parseWeightKey('a:*:-c')
    ('a', ('*',), ('-', 'c'))
    """
    if key == self.STEP:
        # a bare step delimiter stands for the zero-order (empty) key
        return ()

    # always split by the step delimiter, discarding empty pieces
    segments = [part.strip() for part in key.split(self.STEP) if part != '']

    parsed = []
    for segment in segments:
        operators = [op for op in self.EXPRESS if op in segment]
        if not operators:
            # no expression used: a normal weight key segment
            parsed.append(segment)
            continue
        if len(operators) > 1:
            msg = "only one operator may be used per weight key segment"
            raise error.TransitionSyntaxError(msg)

        if self.EXPRESSOR in segment:
            # alternation: the operands are the pieces between operators
            operands = segment.split(self.EXPRESSOR)
            parsed.append(tuple([self.EXPRESSOR] + operands))
        else:
            # operator leads, followed by each non-operator character
            rest = [char for char in segment if char not in self.EXPRESS]
            parsed.append(tuple(operators + rest))
    return tuple(parsed)
def _parseAxiom(self, axiomSrc=None):
    """Set the axiom and reset the current state to it.

    Call this after all symbols have been found. When ``axiomSrc`` is
    given it is stripped of surrounding whitespace and every character
    must be a defined symbol; when it is omitted a random defined symbol
    is chosen. In both cases ``self._state`` is set to the new axiom.

    Raises error.TransitionSyntaxError naming the first character of the
    axiom that is not a defined symbol.
    """
    knownSym = list(self._symbols.keys())
    if axiomSrc is None:
        # no axiom supplied: get a random start
        self._axiom = random.choice(knownSym)
    else:
        # NOTE: assumes no delimiters between symbols
        axiom = axiomSrc.strip()
        unknown = next((c for c in axiom if c not in knownSym), None)
        if unknown is not None:
            raise error.TransitionSyntaxError(
                "bad axiom value given: %s" % unknown)
        self._axiom = axiom
    # always update state to axiom
    self._state = self._axiom
def _checkSymbolUsage(self):
    """Check weighted symbols against the symbol table and collect orders.

    For every key in ``self._weightSrc`` the key length is recorded as an
    order in ``self._ordersSrc`` (once per distinct value), and each
    weighted symbol must be a key of ``self._symbols``. The order list is
    sorted in place at the end. ``self._ordersSrc`` must already exist as
    a list; it is extended here, not created.

    Raises error.TransitionSyntaxError when a weight is given for an
    undefined symbol.
    """
    defined = list(self._symbols.keys())
    for key, weights in self._weightSrc.items():
        # the length of the weight label is its order; the zero-order key
        # is an empty tuple
        order = len(key)
        if order not in self._ordersSrc:
            self._ordersSrc.append(order)
        # weights are always (symbol, number) pairs
        for pair in weights:
            symbol = pair[0]
            if symbol not in defined:
                raise error.TransitionSyntaxError("weight specified for undefined symbol: %s" % symbol)
    self._ordersSrc.sort()
def _checkSymbolFormDef(self, symStr):
    """Validate the name used in a symbol definition.

    Every character of the name must be one of the symbol characters in
    ``SYM``; step separators and expression characters are not allowed
    here. Returns None when the name is acceptable.

    Raises error.TransitionSyntaxError naming the first illegal character.

    >>> g = Grammar()
    >>> g._checkSymbolFormDef('wer')
    >>> g._checkSymbolFormDef('w:er')
    Traceback (most recent call last):
    TransitionSyntaxError: symbol definition uses illegal characters (:)
    >>> g._checkSymbolFormDef('wer@#$')
    Traceback (most recent call last):
    TransitionSyntaxError: symbol definition uses illegal characters (@)
    """
    illegal = next((c for c in symStr if c not in self.SYM), None)
    if illegal is not None:
        raise error.TransitionSyntaxError(
            "symbol definition uses illegal characters (%s)" % illegal)
def _parse(self, usrStr):
    '''Parse a grammar string into symbols, rules and an axiom.

    The string has two or three parts separated by the split delimiter:
    symbol definitions, rule definitions and an optional axiom. Symbol
    and rule parts are sequences of ``<key>{<value>}`` pairs, and symbol
    definitions must come before the rules.

    On success ``self._symbols`` holds the symbol definitions, the rules
    have been passed to ``_parseRuleValue`` (which assigns
    ``self._rules``), ``_checkRuleReference`` has been run and the axiom
    has been set through ``_parseAxiom``.

    Raises error.TransitionSyntaxError when the number of split
    delimiters is not one or two, on badly placed delimiters, when no
    symbols or no rules are defined, or when any of the called checks
    rejects the input.

    >>> g = Grammar()
    >>> g._parse('a{3}b{4} @ a{b}b{a|b}')
    >>> g._parse('a{3}b{4} @ a{b}b{a|b|c}')
    Traceback (most recent call last):
    TransitionSyntaxError: destination rule component ([('a', 1), ('b', 1), ('c', 1)]) references an undefined symbol

    >>> g._parse('a{3}b{4}c{3} @ a{b}b{a|b|c}')
    >>> g._parse('a{3}b{4}c{3} @ a{b}d{a|b|c}')
    Traceback (most recent call last):
    TransitionSyntaxError: source rule component (d) references an undefined symbol

    >>> g._symbols
    {'a': '3', 'c': '3', 'b': '4'}

    >>> g._parse('a{3}b{4}c{3} @ a{bb}b{aa|b|c}')
    >>> g._symbols
    {'a': '3', 'c': '3', 'b': '4'}
    >>> g._rules
    {'a': [('bb', 1)], 'b': [('aa', 1), ('b', 1), ('c', 1)]}

    >>> g._parse('a{3}b{4} @ a{b}b{a|b} @ b')
    >>> g._axiom
    'b'
    >>> g._parse('a{3}b{4} @ a{b}b{a|b} @ baab')
    >>> g._axiom
    'baab'

    >>> g._parse('a{3}b{4} @ a{b}ac{a|b} @ baab')
    Traceback (most recent call last):
    TransitionSyntaxError: source rule component (ac) references an undefined symbol

    >>> g._parse('a{3}b{4} @ *{b} @ baab') # * can be source, not dst
    >>> g._parse('a{3}b{4} @ a{*} @ baab') # * cannot be destination
    Traceback (most recent call last):
    TransitionSyntaxError: destination rule component ([('*', 1)]) references an undefined symbol
    '''
    self._parseValidate(usrStr)
    usrStr = self._parseClean(usrStr)

    # one delimiter gives (symbols, rules); two give (symbols, rules,
    # axiom); any other count is rejected
    parts = usrStr.split(self.SPLIT)
    if len(parts) not in (2, 3):
        raise error.TransitionSyntaxError(
            "must include exactly one split delimiter (%s)" % self.SPLIT)
    partSymbol, partRule = parts[0], parts[1]
    partAxiom = parts[2] if len(parts) == 3 else None

    pairSymbol = {}
    pairRule = {}
    for subStr, dst in [(partSymbol, 'symbol'), (partRule, 'rule')]:
        for double in subStr.split(self.CLOSE):
            if self.OPEN not in double:
                continue
            try:
                key, value = double.split(self.OPEN)
            except ValueError:
                # more than one opening delimiter in a group; only the
                # unpack failure is translated, so unrelated exceptions
                # (for example KeyboardInterrupt) are no longer swallowed
                raise error.TransitionSyntaxError(
                    "badly placed delimiters")

            # keys are lower-cased and stripped of spaces
            key = drawer.strScrub(key, 'lower', [' '])

            if dst == 'symbol':
                # raises on a bad key; the value keeps its case
                self._checkSymbolFormDef(key)
                pairSymbol[key] = drawer.strScrub(value, None, [' '])
            elif dst == 'rule':
                self._checkSymbolFormRuleKey(key)
                pairRule[key] = drawer.strScrub(value, 'lower', [' '])

    # this initializes the symbol table
    if not pairSymbol:
        raise error.TransitionSyntaxError("no symbols defined")
    self._symbols = pairSymbol

    # pass the pair dictionary to the rule parser
    if not pairRule:
        raise error.TransitionSyntaxError("no rules defined")
    self._parseRuleValue(pairRule)  # assigns to self._rules

    # check that rules refer only to defined symbols
    self._checkRuleReference()
    # assigns axiom value
    self._parseAxiom(partAxiom)
def _parseRuleValue(self, pairRule):
    """Split raw rule values into lists of weighted output options.

    ``pairRule`` maps a rule key to a raw value string whose options are
    separated by ``ASSIGNDELIMIT`` and may carry a weight written as
    ``option<ASSIGN>weight``. The result replaces ``self._rules`` and
    maps every key to a list of ``(option, weight)`` pairs:

    * a rule with a single option always gets weight 1; an explicit
      weight on it is dropped because it is not significant
    * options without a weight get weight 1
    * explicit weights are converted with ``drawer.strToNum``

    Raises error.TransitionSyntaxError when an option carries more than
    one assignment operator, or when a weight converts to None or zero.

    >>> g = Grammar()
    >>> g._parseRuleValue({'a': 'ab'})
    >>> g._rules
    {'a': [('ab', 1)]}
    >>> g._parseRuleValue({'a': 'ab|ac'})
    >>> g._rules
    {'a': [('ab', 1), ('ac', 1)]}
    >>> g._parseRuleValue({'a': 'ab|ac|aa=3'})
    >>> g._rules
    {'a': [('ab', 1), ('ac', 1), ('aa', 3)]}
    >>> g._parseRuleValue({'a': 'ab=3|c=12|aa=3'})
    >>> g._rules
    {'a': [('ab', 3), ('c', 12), ('aa', 3)]}
    >>> g._parseRuleValue({'a': 'ab=3|c=12|aa=3', 'c': 'b=3|c=5|a'})
    >>> g._rules
    {'a': [('ab', 3), ('c', 12), ('aa', 3)], 'c': [('b', 3), ('c', 5), ('a', 1)]}
    >>> g._parseRuleValue({'a': 'ab=3|c=12|aa=0'})
    Traceback (most recent call last):
    TransitionSyntaxError: bad weight value given: aa=0

    >>> g._parseRuleValue({'a': ''})
    >>> g._rules
    {'a': [('', 1)]}
    >>> g._parseRuleValue({'a': 'ab=3|c=12|'})
    >>> g._rules
    {'a': [('ab', 3), ('c', 12), ('', 1)]}
    >>> g._parseRuleValue({'*a': 'ab=3|c=12|a=3'})
    >>> g._rules
    {'*a': [('ab', 3), ('c', 12), ('a', 3)]}
    """
    self._rules = {}  # this always clears the last rules
    for key, value in pairRule.items():
        options = value.split(self.ASSIGNDELIMIT)
        if len(options) == 1:
            # a lone option has nothing to be weighed against: keep the
            # text before any assignment operator and use weight 1
            ruleList = [(options[0].split(self.ASSIGN)[0], 1)]
        elif self.ASSIGN not in value:
            # several options, none of them weighted
            ruleList = [(symbol, 1) for symbol in options]
        else:
            ruleList = []
            for assign in options:
                if self.ASSIGN not in assign:
                    # unweighted option among weighted ones: assume 1
                    ruleList.append((assign, 1))
                    continue
                if assign.count(self.ASSIGN) > 1:
                    # report malformed options as a syntax error rather
                    # than letting the unpacking below raise ValueError
                    raise error.TransitionSyntaxError(
                        "incorrect weight specification: %s" % assign)
                symbol, w = assign.split(self.ASSIGN)
                # NOTE(review): per the original comments strToNum
                # converts to int or float, rejects negatives and returns
                # None on error
                w = drawer.strToNum(w, 'num', 0, None)
                if w in (None, 0):  # no zero weights, or other errors
                    raise error.TransitionSyntaxError(
                        "bad weight value given: %s" % assign)
                ruleList.append((symbol, w))
        self._rules[key] = ruleList
def _checkSymbolFormDef(self, symStr):
    """Validate the name used in a symbol definition.

    Every character of the name must be one of the symbol characters in
    ``SYM``. Returns None when the name is acceptable.

    Raises error.TransitionSyntaxError naming the first illegal character.
    """
    illegal = next((c for c in symStr if c not in self.SYM), None)
    if illegal is not None:
        raise error.TransitionSyntaxError("symbol definition uses illegal characters (%s)" % illegal)
def _parseValidate(self, usrStr):
    """Check that opening and closing braces occur equally often.

    Only the number of ``OPEN`` and ``CLOSE`` delimiters is compared;
    their order and nesting are not inspected here.

    Raises error.TransitionSyntaxError when the counts differ.
    """
    opened = usrStr.count(self.OPEN)
    closed = usrStr.count(self.CLOSE)
    if opened == closed:
        return
    raise error.TransitionSyntaxError("all braces not paired")