def trie(words, nocase=True):
    """
    Build a trie automaton recognizing every string in `words`.

    Each word is walked letter by letter from the start state; an already
    existing successor state is reused, otherwise a fresh state and a
    transition to it are created. Transitions are case-insensitive when
    `nocase` is true.

    Returns a tuple of:
    * the automaton,
    * a dict mapping each word to the state reached after its last letter,
    * a list of states (the start state first, then every state created
      here) that might want to have a transition to an "unknown" state
      added by the caller.
    """
    # TODO: Optimize by paths!
    auto = Automaton()
    root = auto.start()
    label_type = LabelType.CharNoCase if nocase else LabelType.Char
    add_unknowns = [root]
    terminals = {}
    for word in words:
        node = root
        for letter in word:
            child = node.find_next_state(letter)
            if not child:
                # No edge for this letter yet, so grow the trie by one state.
                child = auto.add_state()
                node.add_transition(letter, label_type, child)
                add_unknowns.append(child)
            node = child
        # Whatever state we ended in is where this word terminates.
        terminals[word] = node
    return auto, terminals, add_unknowns
def read_header_value(event_name):
    """
    Build an automaton reading a (possibly multi-line) header value.

    The start state is named `event_name` and marked as an enter state.
    The value is not "demangled" in any way; only the newlines are handled
    by an embedded newline automaton.

    A newline followed by horizontal whitespace is treated as a
    continuation: the whitespace is skipped and reading resumes in the
    start state.

    Returns the automaton, the state reached after the newline, and False.
    """
    machine = Automaton()
    value = machine.start()
    value.set_name(event_name)
    value.mark_enter()
    # Hook the newline recognizer onto the value-reading state.
    eol, eol_done = newline()
    machine.join(value, eol)
    value.loop_fallback()
    # Header continuation: swallow leading horizontal whitespace of the next
    # line, then go back to reading the value.
    folded = machine.add_state()
    folded.loop("HorizWhitespace", LabelType.Special)
    folded.add_fallback(value)
    eol_done.add_transition("HorizWhitespace", LabelType.Special, folded)
    return machine, eol_done, False
def read_until(term_label, term_label_type, inner_name=None):
    """
    Build an automaton that reads input until a terminator is seen.

    Example: read until a space.

    `term_label` and `term_label_type` describe the terminating transition.
    If `inner_name` is set, the reading state gets that name and is marked
    as an enter state, so an event is emitted for each read
    (non-terminator) letter.

    Returns the automaton and the final state (entered on the terminator).
    """
    machine = Automaton()
    reading = machine.start()
    if inner_name:
        reading.set_name(inner_name)
        reading.mark_enter()
    done = machine.add_state()
    # The terminator leaves the reading state; anything else stays in it.
    reading.add_transition(term_label, term_label_type, done)
    reading.loop_fallback()
    return machine, done
def constant(s, nocase=False):
    """
    Automaton accepting/consuming a string constant.

    `s` is the constant to accept; it must be at least two characters long
    (shorter constants are not implemented yet). When `nocase` is true the
    constant is lower-cased and matched case-insensitively.

    Returns the automaton and its final state.

    Raises NotImplementedError if `s` is shorter than two characters.
    """
    # An explicit check instead of `assert`: asserts are stripped under
    # `python -O`, which would let a too-short constant through.
    if len(s) < 2:
        raise NotImplementedError(
            "constant() needs at least 2 characters, got %r" % (s,))
    auto = Automaton()
    start = auto.start()
    # We go through a single "real" transition, otherwise the
    # automaton acts a bit unpredictable when joining together with
    # others.
    mid = auto.add_state()
    # Created right after `mid`; presumably the path set on `mid` below
    # leads to this next-created state (confirm against Automaton.set_path).
    final = auto.add_state()
    if nocase:
        s = s.lower()
        lt = LabelType.CharNoCase
    else:
        lt = LabelType.Char
    mid.set_path(s[1:], nocase)
    start.add_transition(s[0], lt, mid)
    return auto, final
def newline():
    """
    Build an automaton accepting an arbitrary newline.

    Most network protocols are meant to use CRLF, but some misguided
    implementations send another form of newline, and accepting "any"
    newline is easier for us anyway. Accepted forms: a lone LF, CR
    followed by LF, and a lone CR followed by something else.

    Returns the automaton and its final state.
    """
    machine = Automaton()
    begin = machine.start()
    done = machine.add_state()
    # A bare LF is a complete newline on its own.
    begin.add_transition('\\n', LabelType.Char, done)
    # A CR moves to an intermediate state...
    after_cr = machine.add_state()
    begin.add_transition('\\r', LabelType.Char, after_cr)
    # ...from which an LF completes the CRLF pair.
    after_cr.add_transition('\\n', LabelType.Char, done)
    # CR followed by anything else: accept without consuming that character.
    after_cr.add_fallback(done, fallthrough=True)
    return machine, done
def connection_header():
    """
    Parse a connection header.

    We want to recognize the Connection: close and Connection: keep-alive.
    In theory, the header may contain other things, but in practice it
    mostly doesn't happen. Doing it properly would be complicated (until we
    improve our automata-handling utilities) with very little gain, so we
    cheat a little bit.

    Named states "ConnectionClose" and "ConnectionKeepAlive" are entered
    once the respective token has been read. Everything else is handed over
    to an event-less read_header_value() automaton.

    Returns the automaton, the end state of the embedded header-value
    automaton, and True.
    """
    auto = Automaton()
    start = auto.start()
    # Skip horizontal whitespace in front of the value.
    start.loop('HorizWhitespace', LabelType.Special)
    # Detect the two nice tokens. We cheat by the fact they differ by their
    # first letter.
    close_start = auto.add_state()
    # NOTE(review): close_start is marked as an enter state while keep_start
    # below is not - confirm whether this asymmetry is intended.
    close_start.mark_enter()
    start.add_transition('c', LabelType.CharNoCase, close_start)
    close_start.set_path('lose', nocase=True)  # Will lead to the next state
    close_end = auto.add_state("ConnectionClose")
    close_end.mark_enter()
    keep_start = auto.add_state()
    start.add_transition('k', LabelType.CharNoCase, keep_start)
    # As above, the path is expected to end in the state added right after.
    keep_start.set_path('eep-alive', nocase=True)
    keep_end = auto.add_state("ConnectionKeepAlive")
    keep_end.mark_enter()
    # Now handle all the rest by a header-parsing automaton that doesn't emit
    # any events.
    other, other_end, _ = read_header_value(None)
    fallback = other.start()
    auto.join_transition(start, other, fallthrough=True)
    # Whenever leaving any of our states, just move to the dummy header
    # collector that handles all the header continuations, header ends, etc.
    for state in [close_start, close_end, keep_start, keep_end]:
        state.add_fallback(fallback, fallthrough=True)
    return auto, other_end, True
def read_boundary():
    """
    Read a boundary=value from a header.

    This is meant for content type header (we ignore the actual content
    type). The state named "Boundary" is marked as an enter state and
    accumulates the characters of the boundary value.

    Returns the automaton, the end state of the newline automaton joined to
    the initial state, and False.
    """
    # States:
    # * We linger in read_until_colon first, then transition to waiting_word.
    #   (Despite the name, the separator it waits for is ';'.)
    # * The waiting_word is responsible to find the 'boundary=' word
    # * Then in_boundary accumulates the actual boundary
    # If we leave it, it means it is somehow unknown and we loop back to
    # waiting for another colon.
    #
    # And then to complicate things, there might be a newline that either means
    # end of the header or it may be a header continuation. In the latter case,
    # we need to distinguish the state of somewhere before or after colon.
    auto = Automaton()
    line, end = newline()
    read_until_colon = auto.start()
    waiting_word = auto.add_state()
    # Target of waiting_word's path
    equals = auto.add_state()
    in_boundary = auto.add_state("Boundary")
    equals.add_fallback(in_boundary)
    in_boundary.mark_enter()
    # Whitespace or ';' ends the boundary value; both go back to
    # read_until_colon with fallthrough.
    # Including newlines, yes - they'll be handled later.
    in_boundary.add_transition("Whitespace", LabelType.Special, read_until_colon, fallthrough=True)
    in_boundary.add_transition(';', LabelType.Char, read_until_colon, fallthrough=True)
    in_boundary.loop_fallback()
    # Case-insensitive 'boundary=' keyword, with leading horizontal
    # whitespace skipped.
    waiting_word.set_path("boundary=", True)
    waiting_word.loop("HorizWhitespace", LabelType.Special)
    # Newline seen while waiting for the keyword (i.e. after a ';'): either
    # a continuation (back to waiting_word) or the end of the header.
    waiting_line, waiting_end = newline()
    auto.join(waiting_word, waiting_line)
    waiting_continuation = auto.add_state()
    waiting_continuation.loop("HorizWhitespace", LabelType.Special)
    waiting_continuation.add_fallback(waiting_word, fallthrough=True)
    # Not the keyword we wanted - go back to looking for the next ';'.
    waiting_word.add_fallback(read_until_colon, fallthrough=True)
    waiting_end.add_transition("HorizWhitespace", LabelType.Special, waiting_continuation)
    waiting_end.add_fallback(end, fallthrough=True)
    read_until_colon.add_transition(';', LabelType.Char, waiting_word)
    # Newline seen before any ';': a continuation returns to
    # read_until_colon, otherwise `end` is where the header finishes.
    auto.join(read_until_colon, line)
    continuation = auto.add_state()
    continuation.loop("HorizWhitespace", LabelType.Special)
    continuation.add_fallback(read_until_colon, fallthrough=True)
    end.add_transition("HorizWhitespace", LabelType.Special, continuation)
    read_until_colon.loop_fallback()
    return auto, end, False
expected. They are generated and used during the unit tests. """ from common import Automaton, LabelType from http import connection_header, methods, read_boundary, read_header_value, req_line, request def output(name, automaton): compiled = automaton.compile("automata::test::" + name) with open(name + ".h", "w") as header: header.write(compiled.cpp_header()) with open(name + ".cpp", "w") as file: file.write(compiled.cpp_file()) # Automaton no. 1: accept everything up to a comma, eg `.*,`. until_comma = Automaton() start = until_comma.start() comma = until_comma.add_state("Comma") comma.mark_enter() start.add_transition(',', LabelType.Char, comma) start.add_transition("All", LabelType.Special, start) output("until_comma", until_comma) # Automaton no. 2: http method terminated by a space method, final = methods() output("http_method", method) # Automaton no. 3: HTTP request line http_req, final = req_line() output("http_req", http_req)