def import_from_fsm(self, FileName="automaton.fsm", SymbolFileName="automaton.sym"):
    """
    Load automaton from file in FSM format.

    Based on FSM man page:
    http://www2.research.att.com/~fsmtools/fsm/man4.html .
    This method must be updated if new symbol is added to Netbench.

    The attributes states, alphabet, start, transitions, final and Flags
    are reset before loading. Blank lines in both files are skipped. If the
    fsm file contains no data line at all, start stays -1 and no state is
    created. The first data line of the fsm file names the start state.

    :param FileName: File name from which the fsm part will be imported.
    :type FileName: string
    :param SymbolFileName: File name from which the sym part will be
                           imported.
    :type SymbolFileName: string
    :raises: nfa_data_import_exception if unknown symbol string type is
             found and corresponding class can not be determined.
    """
    # initialization
    self.states = dict()        # Finite set of states
    self.alphabet = dict()      # Symbols of alphabet
    self.start = -1             # ID of Start state
    self.transitions = set()    # Transitions
    self.final = set()          # Final states
    self.Flags = dict()         # Flags for specified properties

    # Load symbols from symbol file; maps symbol string to its id.
    symbol_mapper = dict()
    with open(SymbolFileName, 'r') as fs:
        for raw_line in fs:
            fields = raw_line.split()
            if not fields:
                continue
            symbol_text = fields[0]
            # Subtract 1: FSM Library use 0 for epsilon symbol, Netbench
            # use -1 for epsilon symbol.
            symbol_id = int(fields[1]) - 1
            symbol_mapper[symbol_text] = symbol_id

            # First character of the symbol string selects the class.
            type_char = symbol_text[0]
            try:
                cls = b_symbol.io_reverse_mapper[type_char]
            except KeyError:
                raise nfa_data_import_exception(type_char)

            # Create an empty object of the selected class; its content is
            # filled in by import_symbol() below.
            if cls == "b_Sym_char":
                symbol = sym_char.b_Sym_char("", "", 0)
            elif cls == "b_Sym_char_class":
                symbol = sym_char_class.b_Sym_char_class("", set(), 0)
            elif cls == "b_Sym_string":
                symbol = sym_string.b_Sym_string("", "", 0)
            elif cls == "b_Sym_kchar":
                symbol = sym_kchar.b_Sym_kchar("", ("", ""), 0)
            elif cls == "DEF_SYMBOLS":
                symbol = b_symbol.DEF_SYMBOLS("", 0)
            elif cls == "b_Sym_cnt_constr":
                symbol = sym_cnt_constr.b_Sym_cnt_constr("", "", 0, 0, 0)
            else:
                raise nfa_data_import_exception(type_char)

            symbol.import_symbol(symbol_text, symbol_id)
            self.alphabet[symbol_id] = symbol

    # Load states and transitions.
    start_seen = False
    with open(FileName, 'r') as fr:
        for raw_line in fr:
            fields = raw_line.split()
            if not fields:
                continue
            src = int(fields[0])
            if not start_seen:
                # first data line indicates the start state
                self.start = src
                start_seen = True
            if src not in self.states:
                self.states[src] = b_State(mid=src)
            if len(fields) > 1:
                # line is transition: source, destination, symbol string
                des = int(fields[1])
                if des not in self.states:
                    self.states[des] = b_State(mid=des)
                self.transitions.add((src, symbol_mapper[fields[2]], des))
            else:
                # line is final state
                self.final.add(src)
                self.states[src]._rnum = src

    self.Flags["ImportFromFsm"] = True
def get_nfa(self):
    """
    Parse a current line and returns parsed nfa.

    The line self._text[self._position] (without its trailing newline) is
    stored in self.last and handed to the external pcre_parser/parser
    command through aux_func.getstatusoutput(). The msfm text the command
    returns is then converted into an nfa_data object.

    Failures are reported on stderr: a non-zero parser status, or msfm
    output that can not be decoded (ValueError, IndexError or KeyError
    while reading it).

    :returns: Created automaton in nfa_data format. Returns None if failure
              happens or if no regular expression is selected.
    :rtype: nfa_data or None
    """
    def decode_chars(encoded):
        # Convert "hex|hex|..." into a set of single characters.
        return set(chr(int(code, 16) & 255) for code in set(encoded.split("|")))

    def add_plain_symbol(nfa, known_syms, chars, sym_id):
        # Register sym_id under its character set and store a b_Sym_char
        # (one character) or b_Sym_char_class (otherwise) in the alphabet.
        known_syms.setdefault(frozenset(chars), set()).add(sym_id)
        if len(chars) == 1:
            char = chars.pop()
            plain = sym_char.b_Sym_char(char, char, sym_id)
        else:
            plain = sym_char_class.b_Sym_char_class(
                "[" + "".join(chars) + "]", chars, sym_id)
        nfa.alphabet[plain.get_id()] = plain

    # Check if some reg. exp. are set.
    if self._position < 0:
        return None

    # Get line and remove trailing \n (an empty line is left untouched).
    line = self._text[self._position]
    if line.endswith('\n'):
        line = line[:-1]
    self.last = line

    # find where we are
    msfm_path = aux_func.getPatternMatchDir()

    # invoke C regexp parser
    # Create cnt_constr symbols if requested
    if self.create_cnt_constr == False:
        cmd = msfm_path + "/pcre_parser/parser -o STDOUT -s"
    else:
        cmd = msfm_path + "/pcre_parser/parser -o STDOUT -s -c"
    # Do not create eof symbols if requested
    if self.create_eof_symbols == False:
        cmd += " -E"

    # res is used as (status, msfm text, stderr text).
    res = aux_func.getstatusoutput(cmd, line)

    # Print stderr if there is some content
    if len(res[2]) != 0:
        sys.stderr.write(res[2] + "\n")

    # If error, stop.
    if res[0] != 0:
        sys.stderr.write("PARSER ERROR:\n")
        sys.stderr.write("CMD: " + cmd + "\n")
        sys.stderr.write("PCRE: " + line + "\n")
        sys.stderr.write("MSFM:\n")
        sys.stderr.write(res[1] + "\n")
        return None

    try:
        # Create empty object
        nfa = nfa_data.nfa_data()
        # Preprocess automaton file
        FSMfile = res[1].split("\n")
        # Get start state of NFA
        nfa.start = int(FSMfile[2])
        del FSMfile[2]

        # FORMAT of Automata file (after the start state line is removed)
        # - Number of the States in the automaton
        # - Number of the transition in the automaton
        # - Each transition is represented by one line in the file. Line
        #   is in format Source_State|Symbol|Target_State|Epsilon
        # - End of the transition table is represented by line of #
        # - Number of the end states
        # - Line with identifier of the endState. Every endstate is
        #   followed by , (comma)
        # - End of endState section is represented by line of #
        # - Number of the symbols in symbol table
        # - Every symbol is stored on its own line and it is represented
        #   as Symbol_Number:Character1|Character2|
        # - End of the file
        transition_count = int(FSMfile[1])
        # 2 is the index of the first transition.
        TransitionTable = [
            x.split("|") for x in FSMfile[2:transition_count + 2]
        ]
        # End states follow the transitions and the separator/count lines.
        Endstates = FSMfile[transition_count + 4].split(",")
        # Alphabet symbols start after the end state section.
        Symbols = FSMfile[transition_count + 7:]

        # Creates end states objects.
        for state in Endstates:
            if state != "":
                Tmp = b_State(int(state), set([self._position]))
                nfa.states[Tmp.get_id()] = Tmp
                nfa.final.add(Tmp.get_id())

        # Maps symbol content to the set of msfm ids carrying that content.
        all_msfm_syms = dict()
        for ActSym in Symbols:
            if not ActSym:
                # e.g. empty string left by a trailing newline
                continue
            # Separate symbol number and symbol data (done by first :)
            StartSym = ActSym.find(":")
            if ActSym[StartSym + 1] == '#':
                # Counting constraint, encoded as #m#n#chars|
                sym_id = int(ActSym[:StartSym], 16)
                sharp_split = ActSym[StartSym + 1:-1].split("#")
                m = int(sharp_split[1])
                # Empty n means infinite number of symbols can occur.
                if sharp_split[2] == '':
                    n = float("inf")
                else:
                    n = int(sharp_split[2])
                chars = decode_chars(ActSym[ActSym.rfind("#") + 1:-1])
                if not (m == 0 and n == 0):
                    if len(chars) == 1:
                        # Single char is stored as the char itself.
                        symbol = chars.pop()
                        text_info = symbol
                    else:
                        symbol = chars
                        text_info = "[" + "".join(chars) + "]"
                    text_info += "{" + str(m) + "," + str(n) + "}"
                    Tmp = sym_cnt_constr.b_Sym_cnt_constr(
                        text_info, symbol, m, n, sym_id)
                    nfa.alphabet[Tmp.get_id()] = Tmp
                    all_msfm_syms.setdefault(
                        (m, n, frozenset(symbol)), set()).add(sym_id)
                else:
                    # BUG: Workaround for bug in parser, when cnt constr
                    # symbols are generated even constructions such as
                    # s+, d*, .+, ... are converted. They are stored as
                    # plain char / char class symbols here.
                    add_plain_symbol(nfa, all_msfm_syms, chars, sym_id)
            elif ActSym[StartSym + 1:] == "EOF|":
                # Add EOF symbol into alphabet
                sym_id = int(ActSym[:StartSym], 16)
                Symbol = sym_eof.b_Sym_EOF("EOF", sym_id)
                nfa.alphabet[Symbol.get_id()] = Symbol
                all_msfm_syms.setdefault("EOF", set()).add(sym_id)
            else:
                chars = decode_chars(ActSym[StartSym + 1:-1])
                add_plain_symbol(
                    nfa, all_msfm_syms, chars, int(ActSym[:StartSym], 16))

        # TODO: use special class for Epsilon?
        # Epsilon is represented now as sym_char object with char "" and
        # index -1
        Tmp = sym_char.b_Sym_char("Epsilon", "", -1)
        nfa.alphabet[Tmp.get_id()] = Tmp

        # Symbols seen only on epsilon transitions can be removed.
        removeable_symbols = set()
        nonremoveable_symbols = set()
        # Add non final states and transitions to automaton.
        for transition in TransitionTable:
            src = int(transition[0])
            dst = int(transition[2])
            if src not in nfa.states:
                nfa.states[src] = b_State(src, set())
            if dst not in nfa.states:
                nfa.states[dst] = b_State(dst, set())
            # Handle epsilon transitions.
            if transition[3] == '1':
                alphaNum = -1
                removeable_symbols.add(int(transition[1], 16))
            else:
                alphaNum = int(transition[1], 16)
                nonremoveable_symbols.add(alphaNum)
            nfa.transitions.add((src, alphaNum, dst))

        # Correct the removeable symbols
        removeable_symbols -= nonremoveable_symbols
        # Remove unused symbols
        for rsymbol in removeable_symbols:
            del nfa.alphabet[rsymbol]

        # Remove duplicate symbols: map every id to one representative id
        # with the same content, preferring an id that was not removed.
        sym_mapping = dict()
        for key in all_msfm_syms:
            sym = all_msfm_syms[key].pop()
            if sym not in removeable_symbols:
                all_msfm_syms[key].add(sym)
            else:
                popped = set([sym])
                while all_msfm_syms[key]:
                    sym = all_msfm_syms[key].pop()
                    popped.add(sym)
                    if sym not in removeable_symbols:
                        break
                all_msfm_syms[key] |= popped
            for sid in all_msfm_syms[key]:
                sym_mapping[sid] = sym
        sym_mapping[-1] = -1

        nfa.transitions = set(
            (transition[0], sym_mapping[transition[1]], transition[2])
            for transition in nfa.transitions)

        # Drop the duplicates that are still present in the alphabet.
        for sid in sym_mapping:
            if sid != sym_mapping[sid] and sid not in removeable_symbols:
                del nfa.alphabet[sid]

        # Something is wrong with the msfm file, try autodetect the start
        # state
        if nfa.start < 0:
            # Mapping between states and their previous states.
            StateInSymbols = dict((state, set()) for state in nfa.states)
            for transition in nfa.transitions:
                StateInSymbols.setdefault(
                    transition[2], set()).add(transition[0])
            # Autodetection - start state can have only 0 or 1 in
            # transition originating from itself - problem /^(abc)+..../
            for state in StateInSymbols:
                sources = StateInSymbols[state]
                if len(sources) == 0 or sources == set([state]):
                    nfa.start = state
        return nfa
    except (ValueError, IndexError, KeyError):
        sys.stderr.write("ERROR while parsing msfm output of parser:\n")
        sys.stderr.write("CMD: " + cmd + "\n")
        sys.stderr.write("PCRE: " + line + "\n")
        sys.stderr.write("MSFM:\n")
        sys.stderr.write(res[1] + "\n")
        return None
def determinise(self, create_table=False, states_limit=0):
    """
    Determinisation of automaton.

    Epsilon transitions are removed first. The automaton is then rebuilt
    by subset construction; colliding symbols leaving one state set are
    split through the symbols' collision() / resolve_collision() methods.

    :param create_table: If create_table = false than state representation
                         table is not created and less memory is consumed.
    :type create_table: boolean
    :param states_limit: If num of states exceeds this limit, during
                         determinization, then flag "Deterministic" is set
                         to False and determinize stops; if nfa exceeds
                         limit and is already deterministic, then nothing
                         happens (this is because speed, not because
                         logic); safe use is only if you want to stop
                         algorithm if it exceeds limit; zero means no
                         limit.
    :type states_limit: int
    :flags: Set Deterministic, Epsilon Free and Alphabet collision free.

    This method sets _compute to False, and get_compute() will return False
    until compute() is called.
    """
    # Automaton doesn't have any state = automaton is empty
    if self._automaton.is_empty() or self._automaton.start < 0:
        return

    self.remove_epsilons()

    counter = 0
    stack = list()
    newStates = dict()       # new state id -> set of old state ids
    newStatesRev = dict()    # frozenset of old state ids -> new state id
    start_set = set()
    start_set.add(self._automaton.start)
    newStates[counter] = start_set
    newStatesRev[frozenset(start_set)] = counter
    stack.append(counter)
    counter += 1

    final = set()
    transitions = set()
    alphCounter = 0
    alphabet = dict()        # new symbol id -> symbol
    alphabetRev = dict()     # symbol -> new symbol id
    states = dict()
    states[0] = b_State(
        0, self._automaton.states[self._automaton.start].get_regexp_number())

    # transitions from each state: state -> set of (symbol id, target)
    stateTrans = dict()
    for transition in self._automaton.transitions:
        stateTrans.setdefault(transition[0], set()).add(
            (transition[1], transition[2]))

    # copy alphabet, ID's 0,1,...
    # NOTE(review): set_id() is called on the automaton's own symbol
    # objects here, so an early return caused by states_limit presumably
    # leaves them renumbered while self._automaton.alphabet keeps its old
    # keys - confirm callers tolerate this.
    mapId = dict()           # maps old id -> new id
    for old_id, sym in self._automaton.alphabet.items():
        sym.set_id(alphCounter)
        alphabet[alphCounter] = sym
        alphabetRev[sym] = alphCounter
        mapId[old_id] = alphCounter
        alphCounter += 1

    while stack:
        actState = stack.pop()
        if newStates[actState].intersection(self._automaton.final):
            final.add(actState)

        # transitions from actual state for each symbol:
        # symbol id -> set of old target state ids
        outSymbols = dict()
        for state in newStates[actState]:
            for t in stateTrans.get(state, ()):
                outSymbols.setdefault(mapId[t[0]], set()).add(t[1])

        # resolve symbol collisions; repeat until no new symbol appears
        symbolAdded = True
        while symbolAdded:
            symbolAdded = False
            for sym1 in list(outSymbols.keys()):
                toCompare = list(outSymbols.keys())
                toCompare.remove(sym1)
                for sym2 in toCompare:
                    if not (outSymbols[sym1] and outSymbols[sym2]):
                        continue    # no next state for one of the symbols
                    if not alphabet[sym1].collision([alphabet[sym2]]):
                        continue
                    # Targets for the three parts returned by
                    # resolve_collision(): index 0 gets sym1's targets,
                    # index 1 the union, index 2 sym2's targets.
                    left = outSymbols[sym1]
                    right = outSymbols[sym2]
                    symStates = (left, left | right, right)
                    outSymbols[sym1] = set()
                    outSymbols[sym2] = set()
                    ret = alphabet[sym1].resolve_collision(alphabet[sym2])
                    for i in range(3):
                        if not ret[i]:      # no symbol returned
                            continue
                        for new in ret[i]:
                            symbolAdded = True
                            if new not in alphabetRev:
                                # add new symbol
                                new.set_id(alphCounter)
                                alphabet[alphCounter] = new
                                alphabetRev[new] = alphCounter
                                new_id = alphCounter
                                alphCounter += 1
                            else:
                                new_id = alphabetRev[new]
                            # update next states for symbol (a new set is
                            # built so the collected parts stay untouched)
                            outSymbols[new_id] = (
                                outSymbols.get(new_id, set()) | symStates[i])

        # create new transitions
        for symbol, nextState in outSymbols.items():
            if not nextState:
                continue    # no next states -> ignore symbol
            nextKey = frozenset(nextState)
            if nextKey not in newStatesRev:
                # create a new state
                newStatesRev[nextKey] = counter
                newStates[counter] = nextState
                stack.append(counter)
                endVal = set()  # set of regular expression numbers
                for state in nextState:
                    if self._automaton.states[state].is_final() == True:
                        endVal |= self._automaton.states[
                            state].get_regexp_number()
                states[counter] = b_State(counter, endVal)
                if states_limit != 0 and counter > states_limit:
                    self.set_flag("Deterministic", False)
                    return
                counter = counter + 1
            transitions.add((actState, symbol, newStatesRev[nextKey]))

    # remove unused symbols
    usedSymbols = set(trans[1] for trans in transitions)
    toRemove = [sym_id for sym_id in alphabet if sym_id not in usedSymbols]
    self._automaton.alphabet = alphabet
    self._automaton.remove_symbols(toRemove)

    # set new symbol ID's
    mapId = dict()  # maps old id -> new id
    alphCounter = 0
    alphabet = dict()
    for old_id, sym in self._automaton.alphabet.items():
        sym.set_id(alphCounter)
        alphabet[alphCounter] = sym
        mapId[old_id] = alphCounter
        alphCounter += 1

    # correct symbol ID's in transitions
    newTrans = set(
        (trans[0], mapId[trans[1]], trans[2]) for trans in transitions)

    # update automaton
    self._automaton.start = 0
    self._automaton.alphabet = alphabet
    self._automaton.states = states
    self._automaton.transitions = newTrans
    self._automaton.final = final

    self.set_flag("Deterministic", True)
    self.set_flag("Epsilon Free", True)
    if len(self._automaton.alphabet) > 0:
        self.set_flag("Alphabet collision free", True)
    self._compute = False

    if create_table == True:
        for i in range(0, counter):
            self._state_representation.append(newStates[i])
def minimise(self):
    """
    Minimalization of DFA automaton.

    Unreachable states are removed, then the states are split into classes
    of indistinguishable states by iterative refinement. Each resulting
    class becomes one state of the reduced automaton; states, start state,
    transitions and final states are rewritten to class ids. The alphabet
    is left unchanged.

    :raises: ALPHABET_COLLISION_FREE_ERROR if alphabet is not collision
             free.
    :raises: DETERMINISTIC_ERROR if automaton is not deterministic.
    :flags: Sets Minimal flag to true.

    This method sets _compute to False, and get_compute() will return False
    until compute() is called.
    """
    if not self.has_flag("Alphabet collision free") \
            or self.get_flag("Alphabet collision free") == False:
        raise ALPHABET_COLLISION_FREE_ERROR
    if not self.has_flag("Deterministic") \
            or self.get_flag("Deterministic") != True:
        raise DETERMINISTIC_ERROR

    a = self._automaton     # shortcut

    # 1) *** Eliminate not available states. ***
    self.remove_unreachable()

    # 2) *** Compute indistinguishable states. ***
    # Transition table: state -> {symbol: target state}
    defaultTable = dict((state, dict()) for state in a.states)
    for t in a.transitions:
        defaultTable[t[0]][t[1]] = t[2]

    # zero iteration:
    # class 0 holds all non final states
    newClasses = dict()
    newClasses[0] = [state for state in a.states if state not in a.final]
    if self.get_multilanguage() is True:
        # each final state is put into the class that belongs to its
        # get_regexp_number() value
        newClassIdOffset = len(newClasses)
        newClassIdMapper = dict()
        for finalStateKey in a.final:
            frozen_regexp = frozenset(
                a.states[finalStateKey].get_regexp_number())
            if frozen_regexp not in newClassIdMapper:
                newClassIdMapper[frozen_regexp] = newClassIdOffset
                newClasses[newClassIdOffset] = list()
                newClassIdOffset += 1
            newClasses[newClassIdMapper[frozen_regexp]].append(
                finalStateKey)
    else:
        # all final states are in one class
        newClasses[1] = list(a.final)

    # indistinguishable iterations; stop when the classes do not change
    actualClasses = dict()
    while newClasses != actualClasses:
        actualClasses = newClasses

        # state -> id of the class containing it (lowest id wins)
        classOf = dict()
        for ClassID in range(0, len(actualClasses), 1):
            for state in actualClasses[ClassID]:
                classOf.setdefault(state, ClassID)

        # Table for this iteration: targets replaced by their class id.
        # A target found in no class keeps its state id.
        table = dict()
        for state, row in defaultTable.items():
            table[state] = dict(
                (symbol, classOf.get(target, target))
                for symbol, target in row.items())

        # Split every class into groups of states with equal table rows;
        # groups are numbered in order of their first member.
        newClasses = dict()
        for ClassID in sorted(actualClasses.keys()):
            groups = list()
            groupIndex = dict()
            for state in actualClasses[ClassID]:
                signature = frozenset(table[state].items())
                if signature not in groupIndex:
                    groupIndex[signature] = len(groups)
                    groups.append(list())
                groups[groupIndex[signature]].append(state)
            for group in groups:
                newClasses[len(newClasses)] = group

    # *** Change to Reduced DFA. ***
    classOf = dict()
    for ClassID in range(0, len(actualClasses), 1):
        for state in actualClasses[ClassID]:
            classOf.setdefault(state, ClassID)

    # change STATES
    back = a.states
    a.states = dict()
    for ClassID in range(0, len(actualClasses), 1):
        finalIndication = set()     # indication of final states
        for state in actualClasses[ClassID]:
            if state in a.final:
                finalIndication |= back[state].get_regexp_number()
        a.states[ClassID] = b_State(mid=ClassID, rnum=finalIndication)

    # change ALPHABET - nothing to change

    # change START STATE (kept as is when it belongs to no class)
    a.start = classOf.get(a.start, a.start)

    # change TRANSITIONS (-1 marks an endpoint that belongs to no class)
    a.transitions = set(
        (classOf.get(t[0], -1), t[1], classOf.get(t[2], -1))
        for t in a.transitions)

    # change FINAL STATES
    a.final = set(
        classOf[finalState] for finalState in a.final
        if finalState in classOf)

    # 3) *** Removal of surplus state that do not affect the adoption
    #        of a string. ***

    self.set_flag("Minimal", True)
    self._compute = False
def _determinise(self, create_table=False, states_limit=0):
    """
    Determinisation of automaton.

    Works only with b_Sym_char and b_Sym_char_class symbols: the character
    sets of all symbols leaving a state set are split by
    __allIntersections() and a fresh alphabet is built from the resulting
    character sets.

    :param create_table: If create_table = false than state representation
                         table is not created and less memory is consumed.
    :type create_table: boolean
    :param states_limit: If num of states exceeds this limit, during
                         determinization, then flag "Deterministic" is set
                         to False and determinize stops; if nfa exceeds
                         limit and is already deterministic, then nothing
                         happens (this is because speed, not because
                         logic); safe use is only if you want to stop
                         algorithm if it exceeds limit; zero means no
                         limit.
    :type states_limit: int
    :raises: ALPHABET_COLLISION_FREE_ERROR if alphabet is not collision
             free.
    :raises: Exception if a symbol of another type than b_Sym_char or
             b_Sym_char_class is met.
    :flags: Set Deterministic and Epsilon Free.

    This method sets _compute to False, and get_compute() will return False
    until compute() is called.
    """
    if self.has_flag("Deterministic") and self.get_flag(
            "Deterministic") == True:
        return

    if self.has_flag("Epsilon Free") == False or self.get_flag(
            "Epsilon Free") == False:
        self.remove_epsilons()

    if not self.has_flag("Alphabet collision free") \
            or self.get_flag("Alphabet collision free") == False:
        raise ALPHABET_COLLISION_FREE_ERROR

    # Automaton doesn't have any state = automaton is empty
    if self._automaton.is_empty() or self._automaton.start < 0:
        return

    Stack = list()
    Citac = 0                   # counter of the new states
    newStates = dict()          # new state id -> set of old state ids
    newStatesBack = dict()      # frozenset of old state ids -> new state id
    start_set = set()
    start_set.add(self._automaton.start)
    newStates[Citac] = start_set
    newStatesBack[frozenset(start_set)] = Citac
    Citac = Citac + 1
    Stack.append(0)

    EndStates = set()
    Transitions = set()
    alphabetCounter = 0
    alphabet = dict()           # new symbol id -> symbol
    alphabetBack = dict()       # frozenset of chars -> new symbol id
    states = dict()
    states[0] = b_State(
        0, self._automaton.states[self._automaton.start].get_regexp_number())

    # state -> set of (symbol id, target state) leaving it
    StateOutSymbols = dict()
    for transition in self._automaton.transitions:
        StateOutSymbols.setdefault(transition[0], set()).add(
            (transition[1], transition[2]))

    char_type = b_symbol.io_mapper["b_Sym_char"]
    char_class_type = b_symbol.io_mapper["b_Sym_char_class"]

    while len(Stack) != 0:
        ActState = Stack.pop()
        if len(newStates[ActState].intersection(
                self._automaton.final)) != 0:
            EndStates.add(ActState)

        # Parallel lists: character set of every outgoing transition and
        # the target state of that transition.
        SymbolSetList = list()
        translationTable = list()
        for old_state in newStates[ActState]:
            for sym in StateOutSymbols.get(old_state, ()):
                old_symbol = self._automaton.alphabet[sym[0]]
                symbol_type = old_symbol.get_type()
                if symbol_type == char_type:
                    SymbolSetList.append(set(old_symbol.char))
                elif symbol_type == char_class_type:
                    SymbolSetList.append(old_symbol.charClass)
                else:
                    raise Exception(
                        "_determinise supports only b_Sym_char and "
                        "b_Sym_char_class symbols")
                translationTable.append(sym[1])

        # res[0][i] is used as the indexes into SymbolSetList that share
        # the character set res[1][i].
        res = self.__allIntersections(SymbolSetList)

        # Translate the indexes to sets of target states.
        translatedUsedList = list()
        for used in res[0]:
            translatedUsedList.append(
                set(translationTable[target] for target in used))

        for i, targets in enumerate(translatedUsedList):
            targets_key = frozenset(targets)
            if targets_key not in newStatesBack:
                # create a new state
                newStatesBack[targets_key] = Citac
                newStates[Citac] = targets
                Stack.append(Citac)
                endVal = set()
                for state in targets:
                    if self._automaton.states[state].is_final() == True:
                        endVal |= self._automaton.states[
                            state].get_regexp_number()
                states[Citac] = b_State(Citac, endVal)
                if states_limit != 0 and Citac > states_limit:
                    self.set_flag("Deterministic", False)
                    return
                Citac = Citac + 1

            chars = res[1][i]
            chars_key = frozenset(chars)
            if chars_key not in alphabetBack:
                # create a new symbol for this character set
                alphabetBack[chars_key] = alphabetCounter
                if len(chars) > 1:
                    Tmp = sym_char_class.b_Sym_char_class(
                        "[" + "".join(chars) + "]", chars, alphabetCounter)
                    alphabet[Tmp.get_id()] = Tmp
                else:
                    # the loop only extracts the single member of the set
                    for sym in chars:
                        char = sym
                    Symbol = sym_char.b_Sym_char(char, char, alphabetCounter)
                    alphabet[Symbol.get_id()] = Symbol
                alphabetCounter += 1

            Transitions.add(
                (ActState, alphabetBack[chars_key],
                 newStatesBack[targets_key]))

    self._automaton.start = 0
    self._automaton.alphabet = alphabet
    self._automaton.states = states
    self._automaton.transitions = Transitions
    self._automaton.final = EndStates

    if create_table == True:
        for i in range(0, Citac):
            self._state_representation.append(newStates[i])

    self.set_flag("Deterministic", True)
    self.set_flag("Epsilon Free", True)
    self._compute = False
def import_from_fsm(self, FileName="automaton.fsm", SymbolFileName = "automaton.sym"):
    """
    Load automaton from file in FSM format.

    Based on FSM man page:
    http://www2.research.att.com/~fsmtools/fsm/man4.html .
    This method must be updated if new symbol is added to Netbench.

    The attributes states, alphabet, start, transitions, final and Flags
    are reset before loading. Blank lines in both files are skipped. If the
    fsm file contains no data line at all, start stays -1 and no state is
    created. The first data line of the fsm file names the start state.

    :param FileName: File name from which the fsm part will be imported.
    :type FileName: string
    :param SymbolFileName: File name from which the sym part will be
                           imported.
    :type SymbolFileName: string
    :raises: nfa_data_import_exception if unknown symbol string type is
             found and corresponding class can not be determined.
    """
    # initialization
    self.states = dict()        # Finite set of states
    self.alphabet = dict()      # Symbols of alphabet
    self.start = -1             # ID of Start state
    self.transitions = set()    # Transitions
    self.final = set()          # Final states
    self.Flags = dict()         # Flags for specified properties

    # Load symbols from symbol file; maps symbol string to its id.
    symbol_mapper = dict()
    with open(SymbolFileName, 'r') as symbol_file:
        for text in symbol_file:
            parts = text.split()
            if not parts:
                continue
            name = parts[0]
            # Subtract 1: FSM Library use 0 for epsilon symbol, Netbench
            # use -1 for epsilon symbol.
            symbol_id = int(parts[1]) - 1
            symbol_mapper[name] = symbol_id

            # First character of the symbol string selects the class.
            try:
                cls = b_symbol.io_reverse_mapper[name[0]]
            except KeyError:
                raise nfa_data_import_exception(name[0])

            # Create an empty object of the selected class; its content is
            # filled in by import_symbol() below.
            if cls == "b_Sym_char":
                symbol = sym_char.b_Sym_char("", "", 0)
            elif cls == "b_Sym_char_class":
                symbol = sym_char_class.b_Sym_char_class("", set(), 0)
            elif cls == "b_Sym_string":
                symbol = sym_string.b_Sym_string("", "", 0)
            elif cls == "b_Sym_kchar":
                symbol = sym_kchar.b_Sym_kchar("", ("", ""), 0)
            elif cls == "DEF_SYMBOLS":
                symbol = b_symbol.DEF_SYMBOLS("", 0)
            elif cls == "b_Sym_cnt_constr":
                symbol = sym_cnt_constr.b_Sym_cnt_constr("", "", 0, 0, 0)
            else:
                raise nfa_data_import_exception(name[0])

            symbol.import_symbol(name, symbol_id)
            self.alphabet[symbol_id] = symbol

    # Load states and transitions.
    start_known = False
    with open(FileName, 'r') as fsm_file:
        for text in fsm_file:
            parts = text.split()
            if not parts:
                continue
            src = int(parts[0])
            if not start_known:
                # first data line indicates the start state
                self.start = src
                start_known = True
            if src not in self.states:
                self.states[src] = b_State(mid = src)
            if len(parts) > 1:
                # line is transition: source, destination, symbol string
                des = int(parts[1])
                if des not in self.states:
                    self.states[des] = b_State(mid = des)
                self.transitions.add((src, symbol_mapper[parts[2]], des))
            else:
                # line is final state
                self.final.add(src)
                self.states[src]._rnum = src

    self.Flags["ImportFromFsm"] = True
def get_nfa(self):
    """
        Parse the current line and return the parsed NFA.

        The regular expression stored at self._text[self._position] is sent
        to the external pcre_parser binary and the automaton printed by the
        parser (msfm format) is converted into an nfa_data object. Symbols
        which are used only by epsilon transitions are removed from the
        alphabet and symbols with the same content are merged into one.
        The parsed line (without trailing newline) is stored in self.last.

        Diagnostics are written to stderr if the parser fails or if its
        output can not be decoded.

        :returns: Created automaton in nfa_data format. Returns None if
                  failure happens.
        :rtype: nfa_data or None
    """
    # Check if some reg. exp. are set.
    if self._position < 0:
        return None

    # Get line and remove trailing \n (endswith is safe for an empty line).
    line = self._text[self._position]
    if line.endswith('\n'):
        line = line[:-1]
    self.last = line

    # Build command line of the C regexp parser.
    # NOTE: the options are compared with == False on purpose, any value
    # which is not equal to False keeps the option enabled.
    cmd = aux_func.getPatternMatchDir() + "/pcre_parser/parser -o STDOUT -s"
    # Create cnt_constr symbols if requested
    if not (self.create_cnt_constr == False):
        cmd += " -c"
    # Do not create eof symbols if requested
    if self.create_eof_symbols == False:
        cmd += " -E"

    # res is used as (exit status, stdout, stderr)
    res = aux_func.getstatusoutput(cmd, line)

    def report_failure(header):
        # Dump everything needed to reproduce the failure.
        sys.stderr.write(header)
        sys.stderr.write("CMD: " + cmd + "\n")
        sys.stderr.write("PCRE: " + line + "\n")
        sys.stderr.write("MSFM:\n")
        sys.stderr.write(res[1] + "\n")

    # Print stderr if there is some content
    if len(res[2]) != 0:
        sys.stderr.write(res[2] + "\n")

    # If error, stop.
    if res[0] != 0:
        report_failure("PARSER ERROR:\n")
        return None

    try:
        # Create empty object
        nfa = nfa_data.nfa_data()

        # Preprocess automaton file
        FSMfile = res[1].split("\n")
        # Get start state of NFA and drop its line, so the rest of the
        # output matches the format described below.
        nfa.start = int(FSMfile[2])
        del FSMfile[2]

        # FORMAT of Automata file
        #  - Number of the states in the automaton
        #  - Number of the transitions in the automaton
        #  - Each transition is represented by one line in the file. Line
        #    is in format Source_State|Symbol|Target_State|Epsilon
        #  - End of the transition table is represented by line of #
        #  - Number of the end states
        #  - Line with identifiers of the end states. Every end state is
        #    followed by , (comma)
        #  - End of end state section is represented by line of #
        #  - Number of the symbols in symbol table
        #  - Every symbol is stored on its own line and it is represented
        #    as Symbol_Number:Character1|Character2|
        #  - End of the file
        transition_count = int(FSMfile[1])

        # Transition table is a list of lists, 2 is the index of the first
        # transition.
        TransitionTable = [
            x.split("|") for x in FSMfile[2:transition_count + 2]
        ]
        # List of the end states is stored after all transitions and after
        # 4 other lines (number of states, number of transitions, line of
        # # and number of end states).
        Endstates = FSMfile[transition_count + 4].split(",")
        # Alphabet symbols start after all transitions + 7 lines (4 as
        # before + line of end states, line of # and number of symbols).
        Symbols = FSMfile[transition_count + 7:]

        # Create end state objects.
        for state in Endstates:
            if state != "":
                Tmp = b_State(int(state), set([self._position]))
                nfa.states[Tmp.get_id()] = Tmp
                nfa.final.add(Tmp.get_id())

        # Maps content of a symbol to the set of ids of all symbols with
        # this content. Used for removing of duplicate symbols.
        all_msfm_syms = dict()

        def decode_chars(encoded):
            # Convert "|" separated hex codes to a set of chars. Only the
            # lowest byte of every code is used.
            chars = set()
            for code in set(encoded.split("|")):
                chars.add(chr(int(code, 16) & 255))
            return chars

        def add_plain_symbol(chars, sym_id):
            # Register sym_id as a char (one char) or char class symbol.
            all_msfm_syms.setdefault(frozenset(chars), set()).add(sym_id)
            if len(chars) == 1:
                # Create char if number of symbols is 1.
                char = chars.pop()
                new_symbol = sym_char.b_Sym_char(char, char, sym_id)
            else:
                # Create char class otherwise.
                label = "[" + "".join(chars) + "]"
                new_symbol = sym_char_class.b_Sym_char_class(
                    label, chars, sym_id)
            nfa.alphabet[new_symbol.get_id()] = new_symbol

        # For every symbol in alphabet
        for ActSym in Symbols:
            if not ActSym:
                # Blank line (e.g. trailing newline of the parser output)
                continue
            # Separate symbol number and symbol data (done by first :)
            StartSym = ActSym.find(":")
            sym_id = int(ActSym[:StartSym], 16)
            payload = ActSym[StartSym + 1:]

            if payload.startswith("#"):
                # Counting constraint encoded as #m#n#chars|
                sharp_split = payload[:-1].split("#")
                m = int(sharp_split[1])
                # Empty n means that infinite number of symbols can occur
                if sharp_split[2] == '':
                    n = float("inf")
                else:
                    n = int(sharp_split[2])
                # Get symbol part of encoded cnt constr
                chars = decode_chars(
                    ActSym[ActSym.rfind("#") + 1:len(ActSym) - 1])

                if not (m == 0 and n == 0):
                    limits = "{" + str(m) + "," + str(n) + "}"
                    if len(chars) == 1:
                        # Create char if number of symbols is 1.
                        symbol = chars.pop()
                        text_info = symbol + limits
                    else:
                        # Create char class otherwise.
                        symbol = chars
                        text_info = "[" + "".join(chars) + "]" + limits
                    # Create sym_cnt_constr object
                    Tmp = sym_cnt_constr.b_Sym_cnt_constr(
                        text_info, symbol, m, n, sym_id)
                    nfa.alphabet[Tmp.get_id()] = Tmp
                    # Create mapping from symbol content to its ids
                    all_msfm_syms.setdefault(
                        (m, n, frozenset(symbol)), set()).add(sym_id)
                else:
                    # BUG: Workaround for bug in parser. When cnt constr
                    # symbols are generated, even constructions such as
                    # s+, d*, .+, ... are converted. Such symbols are
                    # handled as ordinary chars or char classes here.
                    add_plain_symbol(chars, sym_id)
            elif payload == "EOF|":
                # Add EOF symbol into alphabet
                Symbol = sym_eof.b_Sym_EOF("EOF", sym_id)
                nfa.alphabet[Symbol.get_id()] = Symbol
                all_msfm_syms.setdefault("EOF", set()).add(sym_id)
            else:
                add_plain_symbol(decode_chars(payload[:-1]), sym_id)

        # TODO: use special class for Epsilon?
        # Epsilon is represented now as sym_char object with char "" and
        # index -1
        Tmp = sym_char.b_Sym_char("Epsilon", "", -1)
        nfa.alphabet[Tmp.get_id()] = Tmp

        # Symbols used by epsilon / non epsilon transitions
        removeable_symbols = set()
        nonremoveable_symbols = set()

        # Add non final states and transitions to automaton.
        for transition in TransitionTable:
            src = int(transition[0])
            # if not in states, add start state of transition.
            if src not in nfa.states:
                nfa.states[src] = b_State(src, set())
            des = int(transition[2])
            # if not in states, add end state of transition.
            if des not in nfa.states:
                nfa.states[des] = b_State(des, set())

            # Handle epsilon transitions.
            if transition[3] == '1':
                alphaNum = -1
                removeable_symbols.add(int(transition[1], 16))
            else:
                alphaNum = int(transition[1], 16)
                nonremoveable_symbols.add(alphaNum)
            # Add transition to automaton.
            nfa.transitions.add((src, alphaNum, des))

        # Correct the removeable symbols - symbol used by at least one non
        # epsilon transition must stay in the alphabet.
        removeable_symbols -= nonremoveable_symbols
        # Remove unused symbols
        for rsymbol in removeable_symbols:
            del nfa.alphabet[rsymbol]

        # Remove duplicate symbols. Create mapping between current ids and
        # the ids which will be used. A non removed id is preferred as the
        # representative of every group of equal symbols.
        sym_mapping = dict()
        for key in all_msfm_syms:
            group = all_msfm_syms[key]
            sym = group.pop()
            if sym not in removeable_symbols:
                group.add(sym)
            else:
                # Pop ids until a non removed one is found or the group is
                # exhausted, then return all popped ids back to the group.
                popped = set([sym])
                while group:
                    sym = group.pop()
                    popped.add(sym)
                    if sym not in removeable_symbols:
                        break
                group |= popped
            for sid in group:
                sym_mapping[sid] = sym
        sym_mapping[-1] = -1

        # Rewrite transitions to use the representative ids only
        nfa.transitions = set(
            (transition[0], sym_mapping[transition[1]], transition[2])
            for transition in nfa.transitions)

        # Remove symbols replaced by their representative (removeable
        # symbols were already deleted above).
        for sid in sym_mapping:
            if sid != sym_mapping[sid] and sid not in removeable_symbols:
                del nfa.alphabet[sid]

        # Something is wrong with the msfm file, try to autodetect the
        # start state.
        if nfa.start < 0:
            # Mapping between states and their previous states.
            StateInSymbols = dict()
            for state in nfa.states.keys():
                StateInSymbols[state] = set()
            for transition in nfa.transitions:
                StateInSymbols.setdefault(
                    transition[2], set()).add(transition[0])
            # Autodetection - start state can have only 0 or 1 in
            # transition originating from itself - problem /^(abc)+..../
            for state in StateInSymbols.keys():
                sources = StateInSymbols[state]
                if len(sources) == 0:
                    nfa.start = state
                elif len(sources) == 1 and list(sources)[0] == state:
                    nfa.start = state

        return nfa
    except (IndexError, KeyError, ValueError):
        # Output of the parser does not match the expected msfm format.
        report_failure("ERROR while parsing msfm output of parser:\n")
        return None