Example #1
0
def snap_expression(stream, PatternDict):
    """Snap an 'expression' from the stream.

    Grammar:
        expression:  term
                     term | expression

    Arguments:
        stream      -- seekable input stream containing the regular expression.
        PatternDict -- passed on unchanged to snap_term() and to the
                       recursive call.

    Returns (through __debug_exit):
        None                  -- if no term could be snapped.
        the snapped term      -- if no '|' alternative follows.
        beautified alternation of both sides (parallelize.do) otherwise.
    """
    __debug_entry("expression", stream)

    # -- term
    result = snap_term(stream, PatternDict)
    if result is None:
        return __debug_exit(None, stream)

    # -- optional '|'
    if not check(stream, '|'):
        return __debug_exit(result, stream)

    position_1 = stream.tell()
    __debug_print("'|' (in expression)")

    # -- expression
    result_2 = snap_expression(stream, PatternDict)
    __debug_print("expression(in expression):", result_2)
    if result_2 is None:
        # Undo whatever the failed right-hand side consumed.
        # NOTE(review): position_1 is taken after check() succeeded, so the
        # '|' itself presumably stays consumed -- confirm this is intended.
        stream.seek(position_1)
        return __debug_exit(result, stream)

    result = parallelize.do([result, result_2])
    return __debug_exit(construct.beautify(result), stream)
Example #2
0
def snap_term(stream, PatternDict):
    """Snap a 'term' (a sequence of primaries) from the stream.

    Grammar:
        term:  primary
               primary term

    Arguments:
        stream      -- seekable input stream containing the regular expression.
        PatternDict -- passed on unchanged to snap_primary() and to the
                       recursive call.

    Returns (through __debug_exit):
        None                 -- if no primary could be snapped.
        the snapped primary  -- if no further term follows.
        beautified concatenation (sequentialize.do) of the primary and the
        following term otherwise.
    """
    __debug_entry("term", stream)

    # -- primary
    result = snap_primary(stream, PatternDict)
    __debug_print("##primary(in term):", result)
    if result is None:
        return __debug_exit(None, stream)
    position_1 = stream.tell()

    # -- optional 'term'
    result_2 = snap_term(stream, PatternDict)
    __debug_print("##term(in term):", result_2)
    if result_2 is None:
        # Nothing follows: undo anything the failed attempt consumed.
        stream.seek(position_1)
        return __debug_exit(result, stream)

    result = sequentialize.do([result, result_2],
                              MountToFirstStateMachineF=True,
                              CloneRemainingStateMachinesF=False)

    return __debug_exit(construct.beautify(result), stream)
Example #3
0
def snap_expression(stream, PatternDict):
    """Snap an 'expression' from the stream.

    Grammar:
        expression:  term
                     term | expression

    Arguments:
        stream      -- seekable input stream containing the regular expression.
        PatternDict -- passed on unchanged to snap_term() and to the
                       recursive call.

    Returns (through __debug_exit):
        None                  -- if no term could be snapped.
        the snapped term      -- if no '|' alternative follows.
        beautified alternation of both sides (parallelize.do) otherwise.
    """
    __debug_entry("expression", stream)

    # -- term
    result = snap_term(stream, PatternDict)
    if result is None:
        return __debug_exit(None, stream)

    # -- optional '|'
    #    Restore the saved position instead of 'seek(-1, 1)': at the end of
    #    the stream read(1) consumes nothing, and stepping back one character
    #    would un-read a character that already belongs to the term.
    position_0 = stream.tell()
    if stream.read(1) != '|':
        stream.seek(position_0)
        return __debug_exit(result, stream)

    position_1 = stream.tell()
    __debug_print("'|' (in expression)")

    # -- expression
    result_2 = snap_expression(stream, PatternDict)
    __debug_print("expression(in expression):", result_2)
    if result_2 is None:
        # NOTE(review): position_1 lies behind the '|', so a dangling '|'
        # stays consumed -- confirm this is intended.
        stream.seek(position_1)
        return __debug_exit(result, stream)

    result = parallelize.do([result, result_2])
    return __debug_exit(__beautify(result), stream)
Example #4
0
def snap_set_term(stream, PatternDict):
    """Snap a 'set term' from the stream.

    A set term is one of:
      -- a set operation 'union', 'intersection', 'difference' or 'inverse'
         followed by a list of sets (parsed by snap_set_list()),
      -- a key of special_character_set_db(), or
      -- if no identifier is found, anything snap_set_expression() accepts.

    Arguments:
        stream      -- seekable input stream.
        PatternDict -- passed on to snap_set_list() / snap_set_expression().

    Returns (through __debug_exit) the resulting character set, or whatever
    snap_set_expression() returned when no identifier was present.

    Raises RegularExpressionException if a binary set operation receives
    fewer than two sets, or if the identifier is an unknown keyword and
    verify_word_in_list() returns instead of aborting.

    NOTE(review): the operations are applied in place on the first set of
    the list; if that object is shared (e.g. comes straight out of
    special_character_set_db()) it is modified too -- verify.
    """
    __debug_entry("set_term", stream)

    operation_list = ["union", "intersection", "difference", "inverse"]
    # list(): keeps the '+' concatenation below valid when keys() is a view.
    character_set_list = list(special_character_set_db().keys())

    skip_whitespace(stream)
    position = stream.tell()

    # if there is no following '(', then enter the 'snap_expression' block below
    word = read_identifier(stream)

    if word in operation_list:
        set_list = snap_set_list(stream, word, PatternDict)
        # if an error occurs during set_list parsing, an exception is thrown about syntax error

        result = set_list[0]
        remaining_sets = set_list[1:]

        if word == "inverse":
            # The inverse of multiple sets, is to be the inverse of the union of these sets.
            for character_set in remaining_sets:
                result.unite_with(character_set)
            result = result.inverse()
            if Setup.get_character_value_limit() != -1:
                result.intersect_with(
                    Interval(0, Setup.get_character_value_limit()))
            return __debug_exit(result, stream)

        if len(set_list) < 2:
            raise RegularExpressionException("Regular Expression: A %s operation needs at least\n" % word + \
                                             "two sets to operate on them.")

        if word == "union":
            for character_set in remaining_sets:
                result.unite_with(character_set)
        elif word == "intersection":
            for character_set in remaining_sets:
                result.intersect_with(character_set)
        elif word == "difference":
            for character_set in remaining_sets:
                result.subtract(character_set)

    elif word in character_set_list:
        result = special_character_set_db()[word]

    elif word != "":
        verify_word_in_list(word, character_set_list + operation_list,
                            "Unknown keyword '%s'." % word, stream)
        # 'word' is in neither list, so the verification is expected to
        # abort. If it ever returns, fail explicitly rather than running
        # into an unbound 'result' below.
        raise RegularExpressionException("Unknown keyword '%s'." % word)
    else:
        stream.seek(position)
        result = snap_set_expression(stream, PatternDict)

    return __debug_exit(result, stream)
def snap_set_term(stream, PatternDict):
    """Snap a 'set term' from the stream.

    Accepted forms:
      -- 'union', 'intersection', 'difference' or 'inverse' followed by a
         list of sets (see snap_set_list()),
      -- the name of an entry in special_character_set_db(),
      -- without a leading identifier: whatever snap_set_expression() snaps.

    Arguments:
        stream      -- seekable input stream.
        PatternDict -- passed on to snap_set_list() / snap_set_expression().

    Returns (through __debug_exit) the resulting character set, or the
    result of snap_set_expression() when no identifier was present.

    Raises RegularExpressionException when 'union', 'intersection' or
    'difference' gets fewer than two sets, or when an unknown keyword
    survives verify_word_in_list().

    NOTE(review): set_list[0] is modified in place by the operations; if it
    is a shared object this has side effects outside this function -- verify.
    """
    __debug_entry("set_term", stream)

    operation_list     = ["union", "intersection", "difference", "inverse"]
    # list(): the concatenation with operation_list below needs a real list.
    character_set_list = list(special_character_set_db().keys())

    skip_whitespace(stream)
    position = stream.tell()

    # if there is no following '(', then enter the 'snap_expression' block below
    word = read_identifier(stream)

    if word in operation_list:
        set_list = snap_set_list(stream, word, PatternDict)
        # if an error occurs during set_list parsing, an exception is thrown about syntax error

        result     = set_list[0]
        other_sets = set_list[1:]

        if word == "inverse":
            # The inverse of multiple sets, is to be the inverse of the union of these sets.
            for character_set in other_sets:
                result.unite_with(character_set)
            result = result.inverse()
            if Setup.get_character_value_limit() != -1:
                result.intersect_with(Interval(0, Setup.get_character_value_limit()))
            return __debug_exit(result, stream)

        if len(set_list) < 2:
            raise RegularExpressionException("Regular Expression: A %s operation needs at least\n" % word + \
                                             "two sets to operate on them.")

        if   word == "union":
            for character_set in other_sets:
                result.unite_with(character_set)
        elif word == "intersection":
            for character_set in other_sets:
                result.intersect_with(character_set)
        elif word == "difference":
            for character_set in other_sets:
                result.subtract(character_set)

    elif word in character_set_list:
        result = special_character_set_db()[word]

    elif word != "":
        verify_word_in_list(word, character_set_list + operation_list,
                            "Unknown keyword '%s'." % word, stream)
        # The word matched neither list; should the verification return
        # nevertheless, 'result' would be unbound at the final return.
        raise RegularExpressionException("Unknown keyword '%s'." % word)
    else:
        stream.seek(position)
        result = snap_set_expression(stream, PatternDict)

    return __debug_exit(result, stream)
def snap_set_term(stream):
    """Snap a 'set term' from the stream.

    Accepted forms:
      -- 'union', 'intersection', 'difference' or 'inverse' followed by a
         list of sets (see snap_set_list()),
      -- a key of special_character_set_db,
      -- anything else is handed to snap_set_expression() after the stream
         has been reset to where the term started.

    Arguments:
        stream -- seekable input stream.

    Returns (through __debug_exit) the resulting character set, or the
    result of snap_set_expression().

    Raises RegularExpressionException when 'union', 'intersection' or
    'difference' gets fewer than two sets.

    NOTE(review): set_list[0] is modified in place by the operations; if it
    is a shared object this has side effects outside this function -- verify.
    """
    __debug_entry("set_term", stream)

    skip_whitespace(stream)
    position = stream.tell()

    # if there is no following '(', then enter the 'snap_expression' block below
    try:
        word = read_until_non_letter(stream)
        stream.seek(-1, 1)  # putback the non-letter
    except Exception:
        # Best effort: any failure to read a word means "no keyword here".
        # 'Exception' (not a bare except) lets KeyboardInterrupt/SystemExit pass.
        word = "not a valid word"

    word = word.strip()

    if word in ["union", "intersection", "difference", "inverse"]:
        set_list = snap_set_list(stream, word)
        # if an error occurs during set_list parsing, an exception is thrown about syntax error

        result     = set_list[0]
        other_sets = set_list[1:]

        if word == "inverse":
            # The inverse of multiple sets, is to be the inverse of the union of these sets.
            for character_set in other_sets:
                result.unite_with(character_set)
            result = result.inverse()
            return __debug_exit(result, stream)

        if len(set_list) < 2:
            raise RegularExpressionException("Regular Expression: A %s operation needs at least\n" % word + \
                                             "two sets to operate on them.")

        if   word == "union":
            for character_set in other_sets:
                result.unite_with(character_set)
        elif word == "intersection":
            for character_set in other_sets:
                result.intersect_with(character_set)
        elif word == "difference":
            for character_set in other_sets:
                result.subtract(character_set)

    elif word in special_character_set_db.keys():
        result = special_character_set_db[word]

    else:
        # try to snap an expression out of it
        stream.seek(position)
        result = snap_set_expression(stream)

    return __debug_exit(result, stream)
Example #7
0
def snap_set_term(stream):
    """Snap a 'set term' from the stream.

    A set term is either a set operation ('union', 'intersection',
    'difference', 'inverse') applied to a list of sets, a key of
    special_character_set_db, or -- for anything else -- whatever
    snap_set_expression() snaps from the position where the term started.

    Arguments:
        stream -- seekable input stream.

    Returns (through __debug_exit) the resulting character set, or the
    result of snap_set_expression().

    Raises RegularExpressionException when 'union', 'intersection' or
    'difference' gets fewer than two sets.

    NOTE(review): the first set of the list is modified in place; if that
    object is shared with other code it changes there too -- verify.
    """
    __debug_entry("set_term", stream)

    skip_whitespace(stream)
    position = stream.tell()

    # if there is no following '(', then enter the 'snap_expression' block below
    try:
        word = read_until_non_letter(stream)
        stream.seek(-1, 1)  # putback the non-letter
    except Exception:
        # Deliberate best effort: treat any read failure as "no keyword".
        # Narrowed from a bare except so that KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        word = "not a valid word"

    word = word.strip()

    if word in ["union", "intersection", "difference", "inverse"]:
        set_list = snap_set_list(stream, word)
        # if an error occurs during set_list parsing, an exception is thrown about syntax error

        result = set_list[0]
        remaining_sets = set_list[1:]

        if word == "inverse":
            # The inverse of multiple sets, is to be the inverse of the union of these sets.
            for character_set in remaining_sets:
                result.unite_with(character_set)
            result = result.inverse()
            return __debug_exit(result, stream)

        if len(set_list) < 2:
            raise RegularExpressionException("Regular Expression: A %s operation needs at least\n" % word + \
                                             "two sets to operate on them.")

        if word == "union":
            for character_set in remaining_sets:
                result.unite_with(character_set)
        elif word == "intersection":
            for character_set in remaining_sets:
                result.intersect_with(character_set)
        elif word == "difference":
            for character_set in remaining_sets:
                result.subtract(character_set)

    elif word in special_character_set_db.keys():
        result = special_character_set_db[word]

    else:
        # try to snap an expression out of it
        stream.seek(position)
        result = snap_set_expression(stream)

    return __debug_exit(result, stream)
def snap_set_expression(stream, PatternDict):
    """Snap a character set expression from the stream.

    Tried in this order:
      -- a property set (snap_property_set()),
      -- a case fold expression introduced by a backslash and 'C',
      -- '[:' set_term ':]',
      -- '[' ...  handled by traditional_character_set.do(),
      -- '{' ...  handled by snap_replacement(..., StateMachineF=False).

    Arguments:
        stream      -- stream whose class is named 'StringIO' or 'file'.
        PatternDict -- passed on to the sub-parsers that take it.

    Returns (through __debug_exit) the snapped set, or None if the stream
    does not start with any of the forms above. In the None case the two
    characters read for the look-ahead are not put back.

    Raises RegularExpressionException if a '[:' expression is not closed
    by ':]'.
    """
    assert     stream.__class__.__name__ == "StringIO" \
            or stream.__class__.__name__ == "file"

    __debug_entry("set_expression", stream)

    result = snap_property_set(stream)
    # Leave through __debug_exit like every other path, so that the
    # __debug_entry above is always paired with an exit.
    if result is not None: return __debug_exit(result, stream)

    x = stream.read(2)
    if   x == "\\C":
        result = case_fold_expression.do(stream, PatternDict, snap_set_expression=snap_set_expression)
        return __debug_exit(result, stream)

    elif x == "[:":
        result = snap_set_term(stream, PatternDict)
        skip_whitespace(stream)
        x = stream.read(2)
        if x != ":]":
            raise RegularExpressionException("Missing closing ':]' for character set expression.\n" + \
                                             "found: '%s'" % x)
    elif x[:1] == "[":
        # x[:1] instead of x[0]: an exhausted stream yields "" and must end
        # up in the 'else' branch rather than raise an IndexError.
        # Put back the second look-ahead character -- if one was read at all.
        if len(x) == 2: stream.seek(-1, 1)
        result = traditional_character_set.do(stream)

    elif x[:1] == "{":
        if len(x) == 2: stream.seek(-1, 1)
        result = snap_replacement(stream, PatternDict, StateMachineF=False)

    else:
        result = None

    return __debug_exit(result, stream)
def snap_set_expression(stream):
    """Snap a character set expression from the stream.

    Tried in this order:
      -- a property set (snap_property_set()),
      -- '[:' set_term ':]',
      -- '[' ...  handled by traditional_character_set.do(),
      -- backslash followed by 'P', 'N' or 'G', handled by property.do()
         resp. property.do_shortcut().

    Arguments:
        stream -- stream whose class is named 'StringIO' or 'file'.

    Returns (through __debug_exit) the snapped set, or None if the stream
    does not start with any of the forms above. In the None case the two
    characters read for the look-ahead are not put back.

    Raises RegularExpressionException if a '[:' expression is not closed
    by ':]'.
    """
    assert     stream.__class__.__name__ == "StringIO" \
            or stream.__class__.__name__ == "file"

    __debug_entry("set_expression", stream)

    result = snap_property_set(stream)
    # Leave through __debug_exit like every other path, so that the
    # __debug_entry above is always paired with an exit.
    if result is not None: return __debug_exit(result, stream)

    x = stream.read(2)
    if   x == "[:":
        result = snap_set_term(stream)
        skip_whitespace(stream)
        x = stream.read(2)
        if x != ":]":
            raise RegularExpressionException("Missing closing ':]' for character set expression.\n" + \
                                             "found: '%s'" % x)
    elif x[:1] == "[":
        # x[:1] instead of x[0]: an exhausted stream yields "" and must end
        # up in the 'else' branch rather than raise an IndexError.
        # Put back the second look-ahead character -- if one was read at all.
        if len(x) == 2: stream.seek(-1, 1)
        result = traditional_character_set.do(stream)
    elif x == "\\P":
        stream.seek(-2, 1)
        result = property.do(stream)
    elif x == "\\N":
        stream.seek(-2, 1)
        result = property.do_shortcut(stream, "N", "na") # UCS Property: Name
    elif x == "\\G":
        stream.seek(-2, 1)
        result = property.do_shortcut(stream, "G", "gc") # UCS Property: General_Category
    else:
        result = None

    return __debug_exit(result, stream)
Example #10
0
def snap_set_list(stream, set_operation_name):
    """Snap a bracketed, comma separated list of set terms: '(' term {',' term} ')'.

    Arguments:
        stream             -- seekable input stream, positioned before the
                              (optionally whitespace-preceded) '('.
        set_operation_name -- name of the operation the list belongs to;
                              only used in error messages.

    Returns (through __debug_exit) the list of results of snap_set_term();
    it contains at least one element.

    Raises RegularExpressionException if the opening bracket, a set term,
    or the closing bracket is missing.
    """
    __debug_entry("set_list", stream)

    skip_whitespace(stream)
    if stream.read(1) != "(":
        raise RegularExpressionException(
            "Missing opening bracket '%s' operation." % set_operation_name)

    set_list = []
    while True:
        skip_whitespace(stream)
        result = snap_set_term(stream)
        if result is None:
            raise RegularExpressionException(
                "Missing set expression list after '%s' operation." %
                set_operation_name)
        set_list.append(result)

        skip_whitespace(stream)
        tmp = stream.read(1)
        if tmp == ",":
            continue
        if tmp != ")":
            # Put the offending character back -- but only if one was read;
            # at the end of the stream there is nothing to put back.
            if tmp != "":
                stream.seek(-1, 1)
            raise RegularExpressionException(
                "Missing closing ')' after after '%s' operation." %
                set_operation_name)
        return __debug_exit(set_list, stream)
Example #11
0
def snap_set_expression(stream, PatternDict):
    """Snap a character set expression from the stream.

    The following forms are tried in order: a property set
    (snap_property_set()), a case fold expression (backslash + 'C'),
    '[:' set_term ':]', a traditional '[...]' set, and a '{...}'
    replacement (snap_replacement() with StateMachineF=False).

    Arguments:
        stream      -- stream whose class is named 'StringIO' or 'file'.
        PatternDict -- passed on to the sub-parsers that take it.

    Returns (through __debug_exit) the snapped set, or None if none of the
    forms matches. In the None case the two look-ahead characters remain
    consumed.

    Raises RegularExpressionException if a '[:' expression is not closed
    by ':]'.
    """
    assert     stream.__class__.__name__ == "StringIO" \
            or stream.__class__.__name__ == "file"

    __debug_entry("set_expression", stream)

    result = snap_property_set(stream)
    if result is not None:
        # Go through __debug_exit so the entry above is balanced on this
        # path as on all others.
        return __debug_exit(result, stream)

    x = stream.read(2)
    if x == "\\C":
        result = case_fold_expression.do(stream,
                                         PatternDict,
                                         snap_set_expression=snap_set_expression)
        return __debug_exit(result, stream)

    elif x == "[:":
        result = snap_set_term(stream, PatternDict)
        skip_whitespace(stream)
        x = stream.read(2)
        if x != ":]":
            raise RegularExpressionException("Missing closing ':]' for character set expression.\n" + \
                                             "found: '%s'" % x)
    elif x[:1] == "[":
        # Slicing (not x[0]) keeps an empty read from raising IndexError.
        # Only step back if a second character was actually read.
        if len(x) == 2:
            stream.seek(-1, 1)
        result = traditional_character_set.do(stream)

    elif x[:1] == "{":
        if len(x) == 2:
            stream.seek(-1, 1)
        result = snap_replacement(stream, PatternDict, StateMachineF=False)

    else:
        result = None

    return __debug_exit(result, stream)
Example #12
0
def snap_set_expression(stream):
    """Snap a character set expression from the stream.

    The following forms are tried in order: a property set
    (snap_property_set()), '[:' set_term ':]', a traditional '[...]' set,
    and the backslash sequences with 'P', 'N' and 'G' which are passed to
    property.do() resp. property.do_shortcut().

    Arguments:
        stream -- stream whose class is named 'StringIO' or 'file'.

    Returns (through __debug_exit) the snapped set, or None if none of the
    forms matches. In the None case the two look-ahead characters remain
    consumed.

    Raises RegularExpressionException if a '[:' expression is not closed
    by ':]'.
    """
    assert     stream.__class__.__name__ == "StringIO" \
            or stream.__class__.__name__ == "file"

    __debug_entry("set_expression", stream)

    result = snap_property_set(stream)
    if result is not None:
        # Go through __debug_exit so the entry above is balanced on this
        # path as on all others.
        return __debug_exit(result, stream)

    x = stream.read(2)
    if x == "[:":
        result = snap_set_term(stream)
        skip_whitespace(stream)
        x = stream.read(2)
        if x != ":]":
            raise RegularExpressionException("Missing closing ':]' for character set expression.\n" + \
                                             "found: '%s'" % x)
    elif x[:1] == "[":
        # Slicing (not x[0]) keeps an empty read from raising IndexError.
        # Only step back if a second character was actually read.
        if len(x) == 2:
            stream.seek(-1, 1)
        result = traditional_character_set.do(stream)
    elif x == "\\P":
        stream.seek(-2, 1)
        result = property.do(stream)
    elif x == "\\N":
        stream.seek(-2, 1)
        result = property.do_shortcut(stream, "N", "na")  # UCS Property: Name
    elif x == "\\G":
        stream.seek(-2, 1)
        result = property.do_shortcut(stream, "G",
                                      "gc")  # UCS Property: General_Category
    else:
        result = None

    return __debug_exit(result, stream)
Example #13
0
def snap_non_control_character(stream, PatternDict):
    """Snap a single UTF8 encoded character and wrap it into a state machine.

    Arguments:
        stream      -- input stream to read the character from.
        PatternDict -- not used by this function.

    Returns (through __debug_exit) a StateMachine with one transition from
    its initial state on the character code read, created with
    AcceptanceF=True. error_msg() is called when the code read is 0xFF.
    """
    __debug_entry("non-control characters", stream)

    # (*) read first character
    code = utf8.__read_one_utf8_code_from_stream(stream)
    if code == 0xFF:
        error_msg("Character could not be interpreted as UTF8 code or End of File reached prematurely.",
                  stream)

    sm = StateMachine()
    start_index = sm.init_state_index
    sm.add_transition(start_index, code, AcceptanceF=True)

    return __debug_exit(sm, stream)
Example #14
0
def snap_term(stream, PatternDict):
    """Snap a 'term' (a sequence of primaries) from the stream.

    Grammar:
        term:  primary
               primary term

    Arguments:
        stream      -- seekable input stream containing the regular expression.
        PatternDict -- passed on unchanged to snap_primary() and to the
                       recursive call.

    Returns (through __debug_exit):
        None                 -- if no primary could be snapped.
        the snapped primary  -- if no further term follows.
        beautified concatenation (sequentialize.do) of the primary and the
        following term otherwise.
    """
    __debug_entry("term", stream)

    # -- primary
    result = snap_primary(stream, PatternDict)
    __debug_print("##primary(in term):", result)
    if result is None:
        return __debug_exit(None, stream)
    position_1 = stream.tell()

    # -- optional 'term'
    result_2 = snap_term(stream, PatternDict)
    __debug_print("##term(in term):", result_2)
    if result_2 is None:
        # Nothing follows: undo anything the failed attempt consumed.
        stream.seek(position_1)
        return __debug_exit(result, stream)

    result = sequentialize.do([result, result_2],
                              MountToFirstStateMachineF=True,
                              CloneRemainingStateMachinesF=False)

    return __debug_exit(__beautify(result), stream)
def do(stream, PatternDict):
    """Snap a character set expression and turn it into a state machine.

    Arguments:
        stream      -- input stream, handed to snap_set_expression().
        PatternDict -- handed to snap_set_expression().

    Returns (through __debug_exit) a StateMachine with a single transition
    from its initial state on the snapped set, created with AcceptanceF=True.

    Raises RegularExpressionException if no set expression could be snapped
    or if the snapped set is empty.
    """
    trigger_set = snap_set_expression(stream, PatternDict)

    if trigger_set is None:
        raise RegularExpressionException("Regular Expression: character_set_expression called for something\n" + \
                                         "that does not start with '[:', '[' or '\\P'")
    if trigger_set.is_empty():
        raise RegularExpressionException("Regular Expression: Character set expression results in empty set.")

    # Create state machine that triggers with the trigger set to SUCCESS
    # NOTE: The default for the ELSE transition is FAIL.
    sm = StateMachine()
    sm.add_transition(sm.init_state_index, trigger_set, AcceptanceF=True)

    return __debug_exit(sm, stream)
Example #16
0
def do(stream):
    """Snap a character set expression and turn it into a state machine.

    Arguments:
        stream -- input stream, handed to snap_set_expression().

    Returns (through __debug_exit) a StateMachine with a single transition
    from its initial state on the snapped set, created with AcceptanceF=True.

    Raises RegularExpressionException if no set expression could be snapped
    or if the snapped set is empty.
    """
    trigger_set = snap_set_expression(stream)

    if trigger_set is None:
        raise RegularExpressionException("Regular Expression: character_set_expression called for something\n" + \
                                         "that does not start with '[:', '[' or '\\P'")
    if trigger_set.is_empty():
        raise RegularExpressionException(
            "Regular Expression: Character set expression results in empty set."
        )

    # Create state machine that triggers with the trigger set to SUCCESS
    # NOTE: The default for the ELSE transition is FAIL.
    sm = StateMachine()
    sm.add_transition(sm.init_state_index, trigger_set, AcceptanceF=True)

    return __debug_exit(sm, stream)
Example #17
0
def snap_conditional_expression(stream, PatternDict):
    """Snap a conditional expression, i.e. an expression with optional
    pre- and/or post-condition.

    conditional expression: expression
                            expression / expression                 = post conditioned expression
                            expression / expression /               = pre conditioned expression
                            expression / expression / expression    = pre and post conditioned expression
       TODO: <- ($8592) for pre-conditions
             -> ($8594) for post-conditions

    Arguments:
        stream      -- seekable input stream containing the regular expression.
        PatternDict -- passed on unchanged to snap_expression().

    Returns (through __debug_exit) None if no expression could be snapped,
    otherwise the result of __construct() for the detected case.
    """
    __debug_entry("conditional expression", stream)

    # -- expression
    pattern_0 = snap_expression(stream, PatternDict)
    if pattern_0 is None: return __debug_exit(None, stream)

    # -- '/'
    #    The saved position is restored instead of 'seek(-1, 1)': at the end
    #    of the stream read(1) consumes nothing, so stepping back would
    #    un-read a character that belongs to pattern_0.
    position = stream.tell()
    if stream.read(1) != '/':
        # (1) expression without pre and post condition
        stream.seek(position)
        # pattern_0 is already beautified by 'snap_expression()'
        result = __construct(pattern_0)
        return __debug_exit(result, stream)

    # -- expression
    pattern_1 = snap_expression(stream, PatternDict)
    # NOTE(review): unlike case (1) this hands back pattern_0 itself rather
    # than __construct(pattern_0), and the '/' stays consumed -- confirm.
    if pattern_1 is None: return __debug_exit(pattern_0, stream)

    # -- '/'
    position = stream.tell()
    if stream.read(1) != '/':
        # (2) expression with only a post condition
        stream.seek(position)
        #     NOTE: setup_post_context() marks state origins!
        result = __construct(pattern_0, post_context=pattern_1)
        return __debug_exit(result, stream)

    # -- expression
    pattern_2 = snap_expression(stream, PatternDict)
    if pattern_2 is None:
        # (3) expression with only a pre condition
        #     NOTE: setup_pre_context() marks the state origins!
        result = __construct(pattern_1, pre_context=pattern_0)
        return __debug_exit(result, stream)

    # (4) expression with post and pre-condition
    result = __construct(pattern_1,
                         pre_context=pattern_0,
                         post_context=pattern_2)
    return __debug_exit(result, stream)
def snap_set_list(stream, set_operation_name, PatternDict):
    """Snap a bracketed, comma separated list of set terms: '(' term {',' term} ')'.

    Arguments:
        stream             -- seekable input stream, positioned before the
                              (optionally whitespace-preceded) '('.
        set_operation_name -- name of the operation the list belongs to;
                              only used in error messages.
        PatternDict        -- passed on unchanged to snap_set_term().

    Returns (through __debug_exit) the list of results of snap_set_term();
    it contains at least one element.

    Raises RegularExpressionException if the opening bracket, a set term,
    or the closing bracket is missing.
    """
    __debug_entry("set_list", stream)

    skip_whitespace(stream)
    if stream.read(1) != "(":
        raise RegularExpressionException("Missing opening bracket '%s' operation." % set_operation_name)

    set_list = []
    while True:
        skip_whitespace(stream)
        result = snap_set_term(stream, PatternDict)
        if result is None:
            raise RegularExpressionException("Missing set expression list after '%s' operation." % set_operation_name)
        set_list.append(result)

        skip_whitespace(stream)
        tmp = stream.read(1)
        if tmp == ",":
            continue
        if tmp != ")":
            # Put the offending character back -- but only if one was read;
            # at the end of the stream there is nothing to put back.
            if tmp != "":
                stream.seek(-1, 1)
            raise RegularExpressionException("Missing closing ')' after after '%s' operation." % set_operation_name)
        return __debug_exit(set_list, stream)
Example #19
0
def snap_conditional_expression(stream, PatternDict):
    """Snap a conditional expression, i.e. an expression with optional
    pre- and/or post-condition.

    conditional expression: expression
                            expression / expression                 = post conditioned expression
                            expression / expression /               = pre conditioned expression
                            expression / expression / expression    = pre and post conditioned expression
       TODO: <- ($8592) for pre-conditions
             -> ($8594) for post-conditions

    Arguments:
        stream      -- seekable input stream containing the regular expression.
        PatternDict -- passed on unchanged to snap_expression().

    Returns (through __debug_exit) None if no expression could be snapped,
    otherwise the result of __construct() for the detected case.
    """
    __debug_entry("conditional expression", stream)

    # -- expression
    pattern_0 = snap_expression(stream, PatternDict)
    if pattern_0 is None: return __debug_exit(None, stream)

    # -- '/'
    #    Remember where the look-ahead starts: restoring that position is
    #    also correct at the end of the stream, where read(1) returns ""
    #    and a relative 'seek(-1, 1)' would step back into pattern_0.
    lookahead_position = stream.tell()
    if stream.read(1) != '/':
        # (1) expression without pre and post condition
        stream.seek(lookahead_position)
        # pattern_0 is already beautified by 'snap_expression()'
        result = __construct(pattern_0)
        return __debug_exit(result, stream)

    # -- expression
    pattern_1 = snap_expression(stream, PatternDict)
    # NOTE(review): this path returns pattern_0 as is, while case (1) returns
    # __construct(pattern_0); the '/' also remains consumed -- confirm.
    if pattern_1 is None: return __debug_exit(pattern_0, stream)

    # -- '/'
    lookahead_position = stream.tell()
    if stream.read(1) != '/':
        # (2) expression with only a post condition
        stream.seek(lookahead_position)
        #     NOTE: setup_post_context() marks state origins!
        result = __construct(pattern_0, post_context=pattern_1)
        return __debug_exit(result, stream)

    # -- expression
    pattern_2 = snap_expression(stream, PatternDict)
    if pattern_2 is None:
        # (3) expression with only a pre condition
        #     NOTE: setup_pre_context() marks the state origins!
        result = __construct(pattern_1, pre_context=pattern_0)
        return __debug_exit(result, stream)

    # (4) expression with post and pre-condition
    result = __construct(pattern_1, pre_context=pattern_0, post_context=pattern_2)
    return __debug_exit(result, stream)
Example #20
0
def snap_non_control_characters(stream):
    """Snaps any 'non_control_character' using UTF8 encoding from the given string. Note, that
       in UTF8 a character may consist of more than one byte. Creates a state machine
       that contains solely one trigger for each character to an acceptance state.

       This function **concatenates** incoming characters, but **repetition** has precedence
       over concatenation, so it checks after each character whether it is followed by
       a repetition ('*', '+', '?', '{..}'). In such a case, the repetition of the character
       is appended and the function returns; remaining characters are left in
       the stream for the next call.

       Reading stops at a character code of 0xFF, at a control character
       (member of CONTROL_CHARACTERS) or at whitespace; the latter two are
       put back into the stream.

       Raises RegularExpressionException if a backslash is followed by
       something snap_backslashed_character.do() does not recognize.
    """
    __debug_entry("non-control characters", stream)

    result = StateMachine()
    state_index = result.init_state_index
    # (*) read first character
    #     Invariant: 'position' is the stream position where 'char_code' starts.
    position = stream.tell()
    char_code = utf8.__read_one_utf8_code_from_stream(stream)
    while char_code != 0xFF:
        # (1) check against occurence of control characters
        #     this needs to come **before** the backslashed character interpretation.
        #     NOTE: A backslashed character can be a whitespace (for example '\n').
        #     (check against 0xFF to avoid overflow in function 'chr()')
        if char_code < 0xFF \
           and (chr(char_code) in CONTROL_CHARACTERS or chr(char_code).isspace()):
            # Put the character back as a whole. Seeking to its recorded
            # start (rather than one byte back) also covers characters that
            # occupy more than one byte in the stream.
            stream.seek(position)
            break

        # (2) treat backslashed characters
        if char_code == ord('\\'):
            stream.seek(position)
            trigger_set = character_set_expression.snap_property_set(stream)
            if trigger_set is None:
                stream.seek(
                    1, 1)  # snap_property_set() leaves stream right before '\\'
                char_code = snap_backslashed_character.do(stream)
                if char_code is None:
                    raise RegularExpressionException(
                        "Backslash followed by unrecognized character code.")
                trigger_set = char_code
        else:
            trigger_set = char_code

        # (3) read next character
        position = stream.tell()
        next_char_code = utf8.__read_one_utf8_code_from_stream(stream)
        #    -- check for repetition (repetition has precedence over concatenation)
        if next_char_code in [ord("+"), ord("*"), ord("?"), ord("{")]:
            # (*) create state machine that consist of a single transition
            tmp = StateMachine()
            tmp.add_transition(tmp.init_state_index,
                               trigger_set,
                               AcceptanceF=True)
            # -- repeat the single character state machine
            stream.seek(position)
            tmp_repeated = __snap_repetition_range(tmp, stream)
            # -- append it to the result (last state must be set to acceptance for concatenation)
            result.states[state_index].set_acceptance()
            result = sequentialize.do([result, tmp_repeated],
                                      MountToFirstStateMachineF=True)
            # as soon as there is repetition there might be more than one acceptance
            # state and thus simple concatenation via 'add_transition' fails.
            # let us return and treat the remaining chars
            # at the next call to this function.
            return __debug_exit(result, stream)

        else:
            # (*) add new transition from current state to a new state triggering
            #     on the given character.
            state_index = result.add_transition(state_index, trigger_set)

        char_code = next_char_code

    # last character in the chain triggers an 'acceptance state'
    result.states[state_index].set_acceptance()

    return __debug_exit(result, stream)
Example #21
0
def snap_primary(stream, PatternDict):
    """Snap a 'primary' from the stream.

    primary:  " non_double_quote *  "              = character string
              [ non_rect_bracket_close ]           = set of characters
              { identifier }                       = pattern replacement
              ( expression )
              non_control_character+               = lonely characters
              primary repetition_cmd

    Arguments:
        stream      -- seekable input stream containing the regular expression.
        PatternDict -- passed on to snap_replacement() and snap_expression().

    Returns (through __debug_exit) None if the stream is exhausted or does
    not start with a primary (whitespace or an unhandled control character,
    which is put back); otherwise the beautified primary, with a directly
    following repetition applied if __snap_repetition_range() finds one.

    Raises RegularExpressionException on a missing ')', on an invalid
    bracketed expression, and on a lonely '*', '+' or '?'.
    """
    __debug_entry("primary", stream)
    x = stream.read(1)
    if x == "": return __debug_exit(None, stream)

    # -- 'primary' primary
    if   x == "\"": result = snap_character_string.do(stream)
    elif x == "[":
        stream.seek(-1, 1)
        result = character_set_expression.do(stream)
    elif x == "{":  result = snap_replacement(stream, PatternDict)
    elif x == ".":  result = create_ALL_BUT_NEWLINE_state_machine()
    elif x == "(":
        start_position = stream.tell()
        result = snap_expression(stream, PatternDict)
        closing = stream.read(1)
        if closing != ")":
            # Put the unexpected character back so that it shows up in the
            # message; at the end of the stream nothing was read, so there
            # is nothing to put back.
            if closing != "": stream.seek(-1, 1)
            raise RegularExpressionException("missing closing ')' after expression. found '%s'" % stream.read())

        if result is None:
            expression_length = stream.tell() - start_position
            stream.seek(start_position)
            raise RegularExpressionException("expression in brackets has invalid syntax '%s'" % \
                                             stream.read(expression_length))

    elif x.isspace():
        # a lonestanding space ends the regular expression
        stream.seek(-1, 1)
        return __debug_exit(None, stream)

    elif x in ["*", "+", "?"]:
        raise RegularExpressionException("lonely operator '%s' without expression proceeding." % x)

    elif x not in CONTROL_CHARACTERS:
        # NOTE: The '\' is not inside the control characters---for a reason.
        #       It is used to define for example character codes using '\x' etc.
        stream.seek(-1, 1)
        result = snap_non_control_characters(stream)

    else:
        # NOTE: This includes the '$' sign which means 'end of line'
        #       because the '$' sign is in CONTROL_CHARACTERS, but is not checked
        #       against. Thus, it is good to leave here on '$' because the
        #       '$' sign is handled on the very top level.
        # this is not a valid primary
        stream.seek(-1, 1)
        return __debug_exit(None, stream)

    # -- optional repetition command?
    result_repeated = __snap_repetition_range(result, stream)
    if result_repeated is not None: result = result_repeated
    return __debug_exit(__beautify(result), stream)
Example #22
0
def snap_primary(stream, PatternDict):
    """Snap a 'primary' from the stream.

    primary:  " non_double_quote *  "              = character string
              [ non_rect_bracket_close ]           = set of characters
              { identifier }                       = pattern replacement
              ( expression )
              non_control_character+               = lonely characters
              primary repetition_cmd

    Arguments:
        stream      -- seekable input stream containing the regular expression.
        PatternDict -- passed on to the sub-parsers that take it.

    Returns (through __debug_exit) None if the stream is exhausted or does
    not start with a primary (whitespace or an unhandled control character,
    which is put back); otherwise the beautified primary, with a directly
    following repetition applied if __snap_repetition_range() finds one.

    Raises RegularExpressionException on a lonely '*', '+' or '?', and on a
    backslash followed by an unrecognized character code.
    """
    __debug_entry("primary", stream)
    x = stream.read(1)
    # One character of look-ahead; it is put back only if it was really read.
    lookahead = stream.read(1)
    if x != "" and lookahead != "": stream.seek(-1, 1)
    if x == "": return __debug_exit(None, stream)

    # -- 'primary' primary
    if x == "\"": result = snap_character_string.do(stream)
    elif x == "[":
        stream.seek(-1, 1)
        result = character_set_expression.do(stream, PatternDict)
    elif x == "{":
        result = snap_replacement(stream, PatternDict)
    elif x == ".":
        result = create_ALL_BUT_NEWLINE_state_machine()
    elif x == "(":
        result = snap_bracketed_expression(stream, PatternDict)

    elif x.isspace():
        # a lonestanding space ends the regular expression
        stream.seek(-1, 1)
        return __debug_exit(None, stream)

    elif x in ["*", "+", "?"]:
        raise RegularExpressionException(
            "lonely operator '%s' without expression proceeding." % x)

    elif x == "\\":
        if lookahead == "C":
            stream.read(1)  # consume the 'C' that was only looked at so far
            result = snap_case_folded_pattern(stream, PatternDict)
        else:
            stream.seek(-1, 1)
            trigger_set = character_set_expression.snap_property_set(stream)
            if trigger_set is None:
                stream.seek(
                    1, 1)  # snap_property_set() leaves stream right before '\\'
                char_code = snap_backslashed_character.do(stream)
                if char_code is None:
                    raise RegularExpressionException(
                        "Backslash followed by unrecognized character code.")
                trigger_set = char_code
            result = StateMachine()
            result.add_transition(result.init_state_index,
                                  trigger_set,
                                  AcceptanceF=True)

    elif x not in CONTROL_CHARACTERS:
        # NOTE: The '\' is not inside the control characters---for a reason.
        #       It is used to define for example character codes using '\x' etc.
        stream.seek(-1, 1)
        result = snap_non_control_character(stream, PatternDict)

    else:
        # NOTE: This includes the '$' sign which means 'end of line'
        #       because the '$' sign is in CONTROL_CHARACTERS, but is not checked
        #       against. Thus, it is good to leave here on '$' because the
        #       '$' sign is handled on the very top level.
        # this is not a valid primary
        stream.seek(-1, 1)
        return __debug_exit(None, stream)

    # -- optional repetition command?
    result_repeated = __snap_repetition_range(result, stream)
    if result_repeated is not None: result = result_repeated
    return __debug_exit(construct.beautify(result), stream)
Example #23
0
def snap_non_control_characters(stream):
    """Snaps a run of 'non_control_character's using UTF8 encoding from the
       given stream. Note, that in UTF8 a character may consist of more than
       one byte. Creates a state machine that contains solely one trigger for
       each character; the state reached by the last character is an
       acceptance state.

       This function **concatenates** incoming characters, but **repetition**
       has precedence over concatenation, so it checks after each character
       whether it is followed by a repetition ('*', '+', '?', '{..}'). In such
       a case, the repetition of the character is appended and the function
       returns immediately; remaining characters are left in the stream.

       stream -- seekable stream positioned at the first character to snap.

       RETURNS: The state machine built from the snapped characters (as
                handed through '__debug_exit'). If a control character or a
                whitespace ends the run, the stream is rewound to the position
                right before that character.

       RAISES:  RegularExpressionException if a backslash is followed neither
                by a property set nor by a recognized backslashed character.
    """
    __debug_entry("non-control characters", stream)

    result      = StateMachine()
    state_index = result.init_state_index
    repetition_codes = (ord("+"), ord("*"), ord("?"), ord("{"))

    # (*) read first character
    #     INVARIANT: at the top of the loop 'position' is the stream position
    #     right before the character stored in 'char_code'.
    position  = stream.tell()
    char_code = utf8.__read_one_utf8_code_from_stream(stream)
    # NOTE(review): 0xFF is treated as 'no further character' -- presumably the
    #               end-of-stream marker of the utf8 reader; confirm there.
    while char_code != 0xFF:
        # (1) check against occurrence of control characters
        #     this needs to come **before** the backslashed character interpretation.
        #     NOTE: A backslashed character can be a whitespace (for example '\n').
        #     (check against 0xFF to avoid overflow in function 'chr()')
        if char_code < 0xFF:
            character = chr(char_code)
            if character in CONTROL_CHARACTERS or character.isspace():
                # Rewind to the recorded start of the character instead of
                # stepping back one byte: code points 0x80..0xFE occupy two
                # bytes in UTF8, so 'seek(-1, 1)' could end up in the middle
                # of a byte sequence.
                stream.seek(position)
                break

        # (2) treat backslashed characters
        if char_code == ord('\\'):
            stream.seek(-1, 1)
            trigger_set = character_set_expression.snap_property_set(stream)
            if trigger_set is None:
                stream.seek(1, 1)  # snap_property_set() leaves stream right before '\\'
                char_code = snap_backslashed_character.do(stream)
                if char_code is None:
                    raise RegularExpressionException("Backslash followed by unrecognized character code.")
                trigger_set = char_code
        else:
            trigger_set = char_code

        # (3) read next character
        position       = stream.tell()
        next_char_code = utf8.__read_one_utf8_code_from_stream(stream)
        #    -- check for repetition (repetition has precedence over concatenation)
        if next_char_code in repetition_codes:
            # (*) create state machine that consists of a single transition
            tmp = StateMachine()
            tmp.add_transition(tmp.init_state_index, trigger_set, AcceptanceF=True)
            # -- repeat the single character state machine
            #    (step back so that the repetition operator itself is parsed)
            stream.seek(position)
            tmp_repeated = __snap_repetition_range(tmp, stream)
            # -- append it to the result (last state must be set to acceptance for concatenation)
            result.states[state_index].set_acceptance()
            result = sequentialize.do([result, tmp_repeated], MountToFirstStateMachineF=True)
            # as soon as there is repetition there might be more than one acceptance
            # state and thus simple concatenation via 'add_transition' fails.
            # let us return and treat the remaining chars
            # at the next call to this function.
            return __debug_exit(result, stream)

        # (*) add new transition from current state to a new state triggering
        #     on the given character.
        state_index = result.add_transition(state_index, trigger_set)

        char_code = next_char_code

    # last character in the chain triggers an 'acceptance state'
    result.states[state_index].set_acceptance()

    return __debug_exit(result, stream)
Example #24
0
def snap_primary(stream, PatternDict):
    """primary:  " non_double_quote *  "              = character string
                 [ non_rect_bracket_close ]           = set of characters
                 { identifier }                       = pattern replacement
                 ( expression )
                 non_control_character+               = lonely characters
                 primary repetition_cmd

       stream      -- seekable stream positioned at the start of the primary.
       PatternDict -- handed on to the pattern replacement and to nested
                      expressions.

       RETURNS (via '__debug_exit'):
                The beautified state machine of the primary, including an
                optional trailing repetition. None, if the stream is
                exhausted or if the next character cannot start a primary
                (whitespace, or a control character not handled here such as
                '$'); in the latter case the character is put back.

       RAISES:  RegularExpressionException on a missing ')', on an invalid
                expression inside brackets, and on a repetition operator
                that has no expression in front of it.
    """
    __debug_entry("primary", stream)
    x = stream.read(1)
    if x == "": return __debug_exit(None, stream)

    # -- 'primary' primary
    if x == "\"":
        result = snap_character_string.do(stream)
    elif x == "[":
        # put the '[' back before delegating to the character set parser
        stream.seek(-1, 1)
        result = character_set_expression.do(stream)
    elif x == "{":
        result = snap_replacement(stream, PatternDict)
    elif x == ".":
        result = create_ALL_BUT_NEWLINE_state_machine()
    elif x == "(":
        start_position = stream.tell()
        result = snap_expression(stream, PatternDict)
        closing = stream.read(1)
        if closing != ")":
            # Step back only if a character has actually been consumed. At the
            # end of the stream 'read(1)' delivers "" and stepping back would
            # report the last character of the expression as the culprit.
            if closing != "":
                stream.seek(-1, 1)
            raise RegularExpressionException(
                "missing closing ')' after expression. found '%s'" %
                stream.read())

        if result is None:
            # quote the bracket's content (up to and including the ')')
            expression_length = stream.tell() - start_position
            stream.seek(start_position)
            raise RegularExpressionException("expression in brackets has invalid syntax '%s'" % \
                                             stream.read(expression_length))

    elif x.isspace():
        # a lonestanding space ends the regular expression
        stream.seek(-1, 1)
        return __debug_exit(None, stream)

    elif x in ["*", "+", "?"]:
        raise RegularExpressionException(
            "lonely operator '%s' without expression proceeding." % x)

    elif x not in CONTROL_CHARACTERS:
        # NOTE: The '\' is not inside the control characters---for a reason.
        #       It is used to define for example character codes using '\x' etc.
        stream.seek(-1, 1)
        result = snap_non_control_characters(stream)

    else:
        # NOTE: This includes the '$' sign which means 'end of line'
        #       because the '$' sign is in CONTROL_CHARACTERS, but is not checked
        #       against. Thus, it is good to leave here on '$' because the
        #       '$' sign is handled on the very top level.
        # this is not a valid primary
        stream.seek(-1, 1)
        return __debug_exit(None, stream)

    # -- optional repetition command?
    result_repeated = __snap_repetition_range(result, stream)
    ## print "##imr:", result.get_string(NormalizeF=False)
    if result_repeated is not None: result = result_repeated
    return __debug_exit(__beautify(result), stream)