예제 #1
0
def trie(words, nocase=True):
    """
    Build a trie-shaped automaton recognizing a bunch of "words".

    Each word is walked letter by letter from the start state. A next
    state reported by find_next_state() is reused; otherwise a new state
    and a transition to it are added. Transitions are labelled
    LabelType.CharNoCase when `nocase` is true, LabelType.Char otherwise.

    Returns a 3-tuple of:
    * the automaton,
    * a dict mapping every word to the state reached after its last letter,
    * a list holding the start state followed by every state created here,
      i.e. all states that might want to have a transition to an "unknown"
      state added.
    """
    # TODO: Optimize by paths!
    automaton = Automaton()
    root = automaton.start()
    unknown_candidates = [root]
    kind = LabelType.CharNoCase if nocase else LabelType.Char
    word_ends = {}
    for word in words:
        current = root
        for char in word:
            following = current.find_next_state(char)
            if not following:
                # Nothing to follow for this letter yet: grow by one state.
                following = automaton.add_state()
                current.add_transition(char, kind, following)
                unknown_candidates.append(following)
            current = following
        word_ends[word] = current
    return automaton, word_ends, unknown_candidates
예제 #2
0
def read_header_value(event_name):
    """
    Build an automaton reading the characters of a header value.

    The value may span several lines. It is not "demangled" in any way;
    only the newlines are removed.

    The start state gets `event_name` as its name and is marked for enter
    events.

    Returns a 3-tuple: the automaton, the `end` state handed back by
    newline(), and the constant False (the flag's meaning is up to the
    caller -- TODO confirm).
    """
    automaton = Automaton()
    value = automaton.start()
    value.set_name(event_name)
    value.mark_enter()
    eol, eol_done = newline()
    automaton.join(value, eol)
    value.loop_fallback()
    # A newline followed by horizontal whitespace goes to a separate state
    # that loops on that whitespace and then falls back to the value state.
    folded = automaton.add_state()
    horiz_ws = ("HorizWhitespace", LabelType.Special)
    folded.loop(*horiz_ws)
    folded.add_fallback(value)
    eol_done.add_transition(*horiz_ws, folded)
    return automaton, eol_done, False
예제 #3
0
def read_until(term_label, term_label_type, inner_name=None):
    """
    Build an automaton reading a string up to a terminator (eg. a space).

    The start state has one transition, labelled `term_label` of type
    `term_label_type`, into the final state, and a looping fallback for
    everything else.

    When `inner_name` is truthy, the start state is given that name and
    marked for enter events, so each read (non-terminator) letter emits an
    event with that name.

    Returns the automaton and the final state.
    """
    automaton = Automaton()
    reading = automaton.start()
    if inner_name:
        reading.set_name(inner_name)
        reading.mark_enter()

    terminated = automaton.add_state()
    reading.add_transition(term_label, term_label_type, terminated)
    reading.loop_fallback()
    return automaton, terminated
예제 #4
0
def constant(s, nocase=False):
    """
    Automaton accepting/consuming a string constant.

    `s` is the constant to accept; it must have at least two characters.
    When `nocase` is true, `s` is lowercased and matched with
    LabelType.CharNoCase, otherwise with LabelType.Char.

    Returns the automaton and its final state.

    Raises NotImplementedError when `s` is shorter than two characters,
    as that case is not implemented yet.
    """
    if len(s) < 2:
        # An explicit raise (unlike an assert) is still enforced when
        # Python runs with -O.
        raise NotImplementedError(
            "constant() needs at least 2 characters, got %d" % len(s))
    auto = Automaton()
    start = auto.start()
    # We go through a single "real" transition, otherwise the
    # automaton acts a bit unpredictable when joining together with
    # others.
    mid = auto.add_state()
    # NOTE(review): `final` is never wired up explicitly here; presumably
    # set_path() on `mid` leads into the state added right after it, so
    # the creation order of `mid` and `final` matters -- confirm.
    final = auto.add_state()
    if nocase:
        s = s.lower()
        lt = LabelType.CharNoCase
    else:
        lt = LabelType.Char
    mid.set_path(s[1:], nocase)
    start.add_transition(s[0], lt, mid)
    return auto, final
예제 #5
0
def newline():
    """
    Build an automaton accepting an arbitrary newline.

    Most network protocols want CRLF, but there are sometimes misguided
    implementations that send some other form of newline, and accepting
    "any" newline is easier for us anyway. Accepted forms: a lone LF, CR
    followed by LF, and a CR followed by anything else.

    Returns the automaton and its final state.
    """
    # NOTE(review): the labels are the two-character sequences backslash+n
    # and backslash+r, not the control characters themselves -- presumably
    # the form the Automaton API expects; confirm.
    lf_label = '\\n'
    cr_label = '\\r'
    automaton = Automaton()
    entry = automaton.start()
    accepted = automaton.add_state()
    # LF on its own.
    entry.add_transition(lf_label, LabelType.Char, accepted)
    # A CR leads to an intermediate state first.
    after_cr = automaton.add_state()
    entry.add_transition(cr_label, LabelType.Char, after_cr)
    # CR followed by LF.
    after_cr.add_transition(lf_label, LabelType.Char, accepted)
    # CR followed by something else: accept without consuming that
    # something else.
    after_cr.add_fallback(accepted, fallthrough=True)
    return automaton, accepted
예제 #6
0
def connection_header():
    """
    Build an automaton parsing a Connection header value.

    Only `Connection: close` and `Connection: keep-alive` are recognized.
    In theory the header may contain other things, but in practice that
    mostly doesn't happen. Handling it properly would be complicated (until
    the automata-handling utilities improve) for very little gain, so we
    cheat a little bit.

    Returns a 3-tuple: the automaton, the end state of the generic
    header-value automaton that everything falls back to, and the constant
    True (the flag's meaning is up to the caller -- TODO confirm).
    """
    automaton = Automaton()
    entry = automaton.start()
    entry.loop('HorizWhitespace', LabelType.Special)

    # The two tokens differ in their first letter, which lets us cheat and
    # branch on it directly. Each set_path() is followed immediately by the
    # creation of the state it leads to, so the order below matters.
    # NOTE(review): the unnamed first state of the "close" branch is marked
    # for enter events while the one of the "keep-alive" branch is not --
    # confirm the asymmetry is intended.
    close_head = automaton.add_state()
    close_head.mark_enter()
    entry.add_transition('c', LabelType.CharNoCase, close_head)
    close_head.set_path('lose', nocase=True)  # Continues into close_tail
    close_tail = automaton.add_state("ConnectionClose")
    close_tail.mark_enter()

    keep_head = automaton.add_state()
    entry.add_transition('k', LabelType.CharNoCase, keep_head)
    keep_head.set_path('eep-alive', nocase=True)
    keep_tail = automaton.add_state("ConnectionKeepAlive")
    keep_tail.mark_enter()

    # Everything else is handled by a header-value automaton that emits no
    # events.
    generic, generic_end, _ = read_header_value(None)
    generic_start = generic.start()
    automaton.join_transition(entry, generic, fallthrough=True)
    # Leaving any of our own states moves to the dummy header collector,
    # which deals with header continuations, header ends, etc.
    for own_state in (close_head, close_tail, keep_head, keep_tail):
        own_state.add_fallback(generic_start, fallthrough=True)

    return automaton, generic_end, True
예제 #7
0
def read_boundary():
    """
    Read a boundary=value from a header.

    This is meant for the content type header (we ignore the actual content
    type). The characters of the value are read in a state named "Boundary"
    that is marked for enter events.

    Returns a 3-tuple: the automaton, the `end` state obtained from the
    first newline() call, and the constant False (the flag's meaning is up
    to the caller -- TODO confirm).
    """

    # States:
    # * We linger in read_until_colon first, then transition to waiting_word.
    #   (Despite the name, the character matched in the code is ';'.)
    # * The waiting_word is responsible for finding the 'boundary=' word.
    # * Then in_boundary accumulates the actual boundary.
    # If we leave it, it means it is somehow unknown and we loop back to
    # waiting for another ';'.
    #
    # And then to complicate things, there might be a newline that either means
    # end of the header or it may be a header continuation. In the latter case,
    # we need to distinguish the state of somewhere before or after the ';'.
    auto = Automaton()
    # Newline handling for the read_until_colon state (joined further below).
    line, end = newline()

    read_until_colon = auto.start()

    waiting_word = auto.add_state()
    # Target of waiting_word's path
    equals = auto.add_state()

    in_boundary = auto.add_state("Boundary")
    equals.add_fallback(in_boundary)
    in_boundary.mark_enter()
    # Whitespace or ';' leaves the boundary value and goes back to
    # read_until_colon.
    # Including newlines, yes - they'll be handled later.
    in_boundary.add_transition("Whitespace",
                               LabelType.Special,
                               read_until_colon,
                               fallthrough=True)
    in_boundary.add_transition(';',
                               LabelType.Char,
                               read_until_colon,
                               fallthrough=True)
    in_boundary.loop_fallback()
    # Case-insensitive path for the "boundary=" word (second argument True).
    waiting_word.set_path("boundary=", True)
    waiting_word.loop("HorizWhitespace", LabelType.Special)
    # A second, separate newline automaton for the waiting_word state, so a
    # continuation line seen there returns to waiting_word rather than to
    # read_until_colon.
    waiting_line, waiting_end = newline()
    auto.join(waiting_word, waiting_line)
    waiting_continuation = auto.add_state()
    waiting_continuation.loop("HorizWhitespace", LabelType.Special)
    waiting_continuation.add_fallback(waiting_word, fallthrough=True)
    # Anything else seen in waiting_word goes back to read_until_colon.
    waiting_word.add_fallback(read_until_colon, fallthrough=True)
    # After the newline: horizontal whitespace goes to the continuation
    # state, anything else goes to `end`.
    waiting_end.add_transition("HorizWhitespace", LabelType.Special,
                               waiting_continuation)
    waiting_end.add_fallback(end, fallthrough=True)

    # The ';' separator starts the search for the "boundary=" word.
    read_until_colon.add_transition(';', LabelType.Char, waiting_word)

    # Newline and continuation handling for the read_until_colon state.
    auto.join(read_until_colon, line)
    continuation = auto.add_state()
    continuation.loop("HorizWhitespace", LabelType.Special)
    continuation.add_fallback(read_until_colon, fallthrough=True)
    end.add_transition("HorizWhitespace", LabelType.Special, continuation)

    # Everything else is handled by a looping fallback on read_until_colon.
    read_until_colon.loop_fallback()

    return auto, end, False
예제 #8
0
expected. They are generated and used during the unit tests.
"""
from common import Automaton, LabelType
from http import connection_header, methods, read_boundary, read_header_value, req_line, request


def output(name, automaton):
    """
    Compile `automaton` and write the generated C++ sources to disk.

    The automaton is compiled with "automata::test::" + name. The result of
    cpp_header() goes to `<name>.h` and the result of cpp_file() goes to
    `<name>.cpp`, both relative to the current working directory.
    """
    generated = automaton.compile("automata::test::" + name)
    # Header first, then the implementation file.
    for suffix, renderer in ((".h", "cpp_header"), (".cpp", "cpp_file")):
        with open(name + suffix, "w") as sink:
            sink.write(getattr(generated, renderer)())


# Automaton no. 1: accept everything up to a comma, eg `.*,`.
# Built by hand: the start state moves to the "Comma" state (marked for
# enter events) on ',' and has an "All" transition back to itself.
until_comma = Automaton()
start = until_comma.start()
comma = until_comma.add_state("Comma")
comma.mark_enter()
start.add_transition(',', LabelType.Char, comma)
start.add_transition("All", LabelType.Special, start)

# Writes until_comma.h and until_comma.cpp.
output("until_comma", until_comma)

# Automaton no. 2: http method terminated by a space
# Only the automaton is used here; the returned final state is not needed
# in the statements below.
method, final = methods()
output("http_method", method)

# Automaton no. 3: HTTP request line
# `final` is rebound here and again not used in the statements below.
http_req, final = req_line()
output("http_req", http_req)