Beispiel #1
0
def split_interval_according_to_utf8_byte_sequence_length(X):
    """Split Unicode interval into intervals where all values
       have the same utf8-byte sequence length.

       RETURNS: dictionary mapping 'byte sequence length' --> Interval,
                or None if the (clamped) interval is empty.

       NOTE: X is clamped in place to [0, UTF8_MAX + 1).
    """
    # Clamp the interval to the representable range.
    if X.begin < 0: X.begin = 0
    if X.end > UTF8_MAX: X.end = UTF8_MAX + 1

    if X.size() == 0: return None

    db = {}
    current_begin = X.begin
    # Length of the utf8 sequence corresponding to the last value
    # inside the interval.
    last_L = len(unicode_to_utf8(X.end - 1))
    while True:
        # Length of the first not-yet-covered unicode value in utf8.
        L = len(unicode_to_utf8(current_begin))
        # All code points in [current_begin, UTF8_BORDERS[L-1]) have
        # a utf8 sequence of length L; store with length as key.
        current_end = UTF8_BORDERS[L - 1]
        if L == last_L:
            db[L] = Interval(current_begin, X.end)
            break
        db[L] = Interval(current_begin, current_end)
        current_begin = current_end

    return db
Beispiel #2
0
def split_contigous_intervals_for_surrogates(Begin, End):
    """Splits the interval [Begin, End) into sub intervals so that no interval
       runs over a 'surrogate' border of the last word. For that, it is simply
       checked if the End falls into the same 'surrogate' domain of 'front'
       (start value of front = Begin). If it does not, an interval
       [front, end_of_domain) is split up and front is set to end of domain.
       This procedure repeats until front and End lie in the same domain.

       RETURNS: list of Interval objects covering [Begin, End).
    """
    assert Begin >= 0x10000
    assert End   <= 0x110000
    assert End   >  Begin     # Empty intervals are nonsensical

    front_seq = unicode_to_utf16(Begin)
    back_seq  = unicode_to_utf16(End - 1)

    # Same first surrogate => whole interval is one contiguous run.
    if front_seq[0] == back_seq[0]:
        return [Interval(Begin, End)]

    # Separate into (up to) three domains:
    #
    # (1) interval from Begin until second surrogate hits border 0xE000
    # (2) interval where the first surrogate increases while second
    #     surrogate iterates over [0xDC00, 0xDFFF]
    # (3) interval from begin of last surrogate border to End
    result = []
    end    = utf16_to_unicode([front_seq[0], 0xDFFF]) + 1
    # The following **must** hold according to entry condition about front
    # and back sequence.
    assert End > end
    result.append(Interval(Begin, end))

    if front_seq[0] + 1 != back_seq[0]:
        # (2) Second surrogate iterates over [0xDC00, 0xDFFF].
        mid_end = utf16_to_unicode([back_seq[0] - 1, 0xDFFF]) + 1
        # Must hold by the same entry condition.
        assert mid_end > end
        result.append(Interval(end, mid_end))
        end = mid_end

    # (3) Last surrogate border to End; guard against an empty interval.
    if End > end:
        result.append(Interval(end, End))

    return result
Beispiel #3
0
    def add_transition(self, Trigger, TargetStateIdx):
        """Adds a transition according to trigger and target index.
           RETURNS: The target state index (may be created newly).
        """
        assert type(TargetStateIdx) == long or TargetStateIdx is None
        assert Trigger.__class__ in [int, long, list, Interval, NumberSet] \
               or Trigger is None

        # Normalize 'Trigger' into either an Interval or a NumberSet.
        if Trigger is None:
            # Shorthand: trigger via the remaining (not yet used) triggers.
            Trigger = self.get_trigger_set_union().inverse()
        elif type(Trigger) == long:
            Trigger = Interval(int(Trigger), int(Trigger + 1))
        elif type(Trigger) == int:
            Trigger = Interval(Trigger, Trigger + 1)
        elif type(Trigger) == list:
            Trigger = NumberSet(Trigger, ArgumentIsYoursF=True)

        entry_exists_f = TargetStateIdx in self.__db
        if Trigger.__class__ == Interval:
            if entry_exists_f:
                self.__db[TargetStateIdx].add_interval(Trigger)
            else:
                self.__db[TargetStateIdx] = NumberSet(Trigger,
                                                      ArgumentIsYoursF=True)
        else:
            if entry_exists_f:
                self.__db[TargetStateIdx].unite_with(Trigger)
            else:
                self.__db[TargetStateIdx] = Trigger

        return TargetStateIdx
Beispiel #4
0
def split_contigous_intervals_for_surrogates(Begin, End):
    """Split [Begin, End) into sub intervals such that no sub interval
       crosses a 'surrogate' border of the last utf16 word. It is checked
       whether 'End - 1' lies in the same surrogate domain as 'Begin'; as
       long as it does not, an interval [front, end_of_domain) is split
       off and 'front' advances to the end of the domain. This repeats
       until front and End lie in the same domain.
    """
    global ForbiddenRange
    assert Begin >= 0x10000
    assert End   <= 0x110000
    assert End   > Begin

    first_seq = unicode_to_utf16(Begin)
    last_seq  = unicode_to_utf16(End - 1)

    # (*) Same first word:
    #     Either a single one-word character, or a range of two word
    #     characters extending as one contiguous run in the second
    #     surrogate. Either way, the interval is contiguous.
    if first_seq[0] == last_seq[0]:
        return [Interval(Begin, End)]

    # (*) First word differs => separate into (up to) three domains:
    #
    #     (1) From Begin until second surrogate hits border 0xE000.
    #     (2) First surrogate increases while second surrogate iterates
    #         over the full range [0xDC00, 0xDFFF].
    #     (3) From begin of last surrogate border to End.
    result = []

    # (1) 'Begin' until second surrogate hits border 0xE000.
    border = utf16_to_unicode([first_seq[0], ForbiddenRange.end - 1]) + 1
    # Guaranteed by the entry condition about first and last sequence.
    assert End > border
    result.append(Interval(Begin, border))

    if first_seq[0] + 1 != last_seq[0]:
        # (2) Second surrogate iterates over [0xDC00, 0xDFFF].
        mid_border = utf16_to_unicode([last_seq[0] - 1, ForbiddenRange.end - 1]) + 1
        # Guaranteed by the entry condition about first and last sequence.
        assert mid_border > border
        result.append(Interval(border, mid_border))
        border = mid_border

    # (3) Last surrogate border to End.
    if End > border:
        result.append(Interval(border, End))

    return result
Beispiel #5
0
def __separate_buffer_limit_code_transition(TransitionMap, EngineType):
    """This function ensures, that the buffer limit code is separated 
       into a single value interval. Thus the transition map can 
       transit immediate to the reload procedure.

       Modifies 'TransitionMap' in place: the DROP_OUT interval containing
       the buffer limit code is replaced by (up to) three intervals, where
       the buffer limit code alone maps to the reload procedure.
    """
    BLC = Setup.buffer_limit_code
    for i, entry in enumerate(TransitionMap):
        interval, target_index = entry

        if target_index == E_StateIndices.RELOAD_PROCEDURE:
            assert interval.contains_only(Setup.buffer_limit_code)
            assert EngineType != E_EngineTypes.BACKWARD_INPUT_POSITION
            # Transition 'buffer limit code --> E_StateIndices.RELOAD_PROCEDURE'
            # has been setup already.
            return

        elif target_index is not E_StateIndices.DROP_OUT:
            continue

        elif not interval.contains(Setup.buffer_limit_code):
            continue

        # Found the interval that contains the buffer limit code.
        # If the interval's size is alread 1, then there is nothing to be done
        if interval.size() == 1: return

        before_begin = interval.begin
        before_end = BLC
        after_begin = BLC + 1
        after_end = interval.end

        # Replace Entry with (max.) three intervals: before, buffer limit code, after
        del TransitionMap[i]

        # Inserting repeatedly at index 'i' keeps the order: every earlier
        # insert is pushed back by the later ones.
        if after_end > after_begin:
            TransitionMap.insert(
                i, (Interval(after_begin, after_end), E_StateIndices.DROP_OUT))

        # "Target == E_StateIndices.RELOAD_PROCEDURE" => Buffer Limit Code
        TransitionMap.insert(
            i, (Interval(BLC, BLC + 1), E_StateIndices.RELOAD_PROCEDURE))

        if before_end > before_begin and before_end > 0:
            TransitionMap.insert(
                i,
                (Interval(before_begin, before_end), E_StateIndices.DROP_OUT))
        return

    # Any transition map, except for backward input position detection,
    # must have a trigger on reload.
    assert EngineType in [E_EngineTypes.BACKWARD_INPUT_POSITION, E_EngineTypes.INDENTATION_COUNTER], \
           "Engine types other than 'backward input position detection' or 'indentation counter' must contain BLC.\n" \
           "Found: %s" % repr(EngineType)
    return
Beispiel #6
0
def get_trigger_sequence_for_interval(X):
    """Derive the utf16 word ranges that trigger on interval X."""
    # The interval lies either entirely >= 0x10000 or entirely < 0x10000.
    assert X.begin >= 0x10000 or X.end < 0x10000

    if X.end < 0x10000:
        # Below 0x10000 each character is a single word: keep as is.
        return [ X ]

    # At or above 0x10000 every value encodes as two words; split the
    # interval into a first-word range and a second-word range.
    begin_seq = unicode_to_utf16(X.begin)
    end_seq   = unicode_to_utf16(X.end - 1)

    return [ Interval(begin_seq[0], end_seq[0] + 1),
             Interval(begin_seq[1], end_seq[1] + 1) ]
Beispiel #7
0
def create_ALL_BUT_NEWLINE_state_machine():
    """Build a state machine that accepts any single character but newline."""
    global Setup
    sm = StateMachine()
    # NOTE: Buffer control characters are supposed to be filtered out by the
    #       code generator.
    all_but_newline = NumberSet(Interval(ord("\n")).inverse())

    if Setup.get_character_value_limit() != sys.maxint:
        # Restrict to the configured character value range.
        all_but_newline.intersect_with(
            Interval(0, Setup.get_character_value_limit()))

    sm.add_transition(sm.init_state_index, all_but_newline, AcceptanceF=True)
    return sm
Beispiel #8
0
def __create_database_file(TargetEncoding, TargetEncodingName):
    """Writes a database file for a given TargetEncodingName. The 
       TargetEncodingName is required to name the file where the 
       data is to be stored.
    """
    encoder     = codecs.getencoder(TargetEncoding)
    prev_output = -1
    db          = []
    bytes_per_char = -1
    for input in range(0x110000):
        output, n = __get_transformation(encoder, input)

        if bytes_per_char == -1: 
            bytes_per_char = n
        elif n != -1 and bytes_per_char != n:
            print "# not a constant size byte format."
            return False

        # Detect discontinuity in the mapping
        if   prev_output == -1:
            if output != -1:
                input_interval        = Interval(input)
                target_interval_begin = output

        elif output != prev_output + 1:
            # If interval was valid, append it to the database
            input_interval.end    = input
            db.append((input_interval, target_interval_begin))
            # If interval ahead is valid, prepare an object for it
            if output != -1:
                input_interval        = Interval(input)
                target_interval_begin = output

        prev_output = output

    if prev_output != -1:
        input_interval.end = input
        db.append((input_interval, target_interval_begin))

    fh = open_file_or_die(QUEX_CODEC_DB_PATH + "/%s.dat" % TargetEncoding, "wb")
    fh.write("// Describes mapping from Unicode Code pointer to Character code in %s (%s)\n" \
             % (TargetEncoding, TargetEncodingName))
    fh.write("// [SourceInterval.begin] [SourceInterval.Size]  [TargetInterval.begin] (all in hexidecimal)\n")
    for i, t in db:
        fh.write("0x%X %i 0x%X\n" % (i.begin, i.end - i.begin, t))
    fh.close()

    return True
    def consider_interval(self, Begin, End):
        """Add the interval [Begin, End] to the match set; reject reversed ranges."""
        # A range 'a-b' only makes sense if 'a' has the lower code point.
        if Begin > End:
            message = "Character range: '-' requires character with 'lower code' to preceed\n" + \
                      "found range '%s-%s' which corresponds to %i-%i as unicode code points." % \
                      (utf8.map_unicode_to_utf8(Begin), utf8.map_unicode_to_utf8(End), Begin, End)
            raise RegularExpressionException(message)

        self.match_set.add_interval(Interval(Begin, End))
Beispiel #10
0
def get_contigous_intervals(X):
    """Split the unicode interval X according to utf16 sequence length.
       Unlike utf8 there are only two lengths: one word (2 byte) and
       two words (2 x 2 byte).

       RETURNS: [X0, List1]

                X0    = sub-interval where all values are one word,
                        or None if there is no such interval.
                List1 = list of contiguous sub-intervals coded as two
                        words, or None if there are none.
    """
    global ForbiddenRange
    # Interpret 'open' borders as the full unicode range.
    if X.begin == -sys.maxint: X.begin = 0
    if X.end   == sys.maxint:  X.end   = 0x110000
    assert X.end != X.begin     # Empty intervals are nonsensical
    assert X.end <= 0x110000    # Interval must lie in unicode range
    assert not X.check_overlap(ForbiddenRange)  # Must not cover the 'forbidden range'

    if X.end <= 0x10000:
        # Entirely one-word characters.
        return [X, None]
    if X.begin >= 0x10000:
        # Entirely two-word characters.
        return [None, split_contigous_intervals_for_surrogates(X.begin, X.end)]
    # Mixed: split at the 0x10000 border.
    return [Interval(X.begin, 0x10000),
            split_contigous_intervals_for_surrogates(0x10000, X.end)]
Beispiel #11
0
def get_trigger_sequence_for_contigous_byte_range_interval(X, L):
    """Build, byte position by byte position, the trigger intervals for
       the contiguous interval X whose utf8 sequences have length L."""
    begin_bytes = unicode_to_utf8(X.begin)
    end_bytes   = unicode_to_utf8(X.end - 1)
    # A contiguous interval produces utf8 sequences of equal length.
    result = []
    for i in range(L):
        result.append(Interval(begin_bytes[i], end_bytes[i] + 1))
    return result
Beispiel #12
0
def get_supported_unicode_character_set(CodecAlias=None, FileName=None, FH=-1, LineN=None):
    """Collect the unicode code points covered by the given codec into a NumberSet."""
    assert CodecAlias is not None or FileName is not None

    result = NumberSet()
    for begin, end, dummy_target_begin in \
            get_codec_transformation_info(CodecAlias, FileName, FH, LineN):
        result.add_interval(Interval(begin, end))
    return result
Beispiel #13
0
def get_trigger_sequence_for_contigous_byte_range_interval(X, L):
    """Per byte position, the trigger interval for the contiguous interval X
       whose utf8 sequences have length L."""
    front = unicode_to_utf8(X.begin)
    back  = unicode_to_utf8(X.end - 1)
    # Contiguity implies equal utf8 sequence lengths at both ends.
    return [Interval(f, b + 1) for f, b in zip(front[:L], back[:L])]
Beispiel #14
0
def do(section_list, fh):
    """Parses a codec information file. The described codec can only be
    a 'static character length' encoding. That is every character in the
    code occupies the same number of bytes.

    Each record consists of three integers: source interval begin,
    source interval size, target interval begin. Every parsed record is
    appended to 'section_list' as [source_begin, source_end, target_begin].

    RETURNS: [0] Set of characters in unicode which are covered by the
                 described codec.
             [1] Range of values in the codec elements.
             [2] Error string, or None if parsing succeeded.
    """
    source_set = NumberSet()
    drain_set = NumberSet()

    error_str = None

    try:
        while error_str is None:
            skip_whitespace(fh)
            source_begin = read_integer(fh)
            if source_begin is None:
                error_str = "Missing integer (source interval begin) in codec file."
                continue

            skip_whitespace(fh)
            source_size = read_integer(fh)
            if source_size is None:
                error_str = "Missing integer (source interval size) in codec file."
                continue

            skip_whitespace(fh)
            target_begin = read_integer(fh)
            if target_begin is None:
                error_str = "Missing integer (target interval begin) in codec file."
                continue

            source_end = source_begin + source_size
            # Call 'append' on the instance; 'list.append(section_list, ...)'
            # needlessly required a real list.
            section_list.append([source_begin, source_end, target_begin])

            source_set.add_interval(Interval(source_begin, source_end))
            drain_set.add_interval(
                Interval(target_begin, target_begin + source_size))

    except EndOfStreamException:
        # Running out of stream terminates the parse loop regularly.
        pass

    return source_set, drain_set, error_str
Beispiel #15
0
 def __add_case_fold(sm, Flags, trigger_set, start_state_idx, target_state_idx):
     """Extend 'trigger_set' with the case folds of every character that it
        contains; multi-character folds become intermediate states in 'sm'.

        NOTE(review): characters are added to 'trigger_set' while its
        intervals are being iterated; 'PromiseToTreatWellF' presumably
        sanctions this -- confirm that mutation during iteration is safe.
     """
     for interval in trigger_set.get_intervals(PromiseToTreatWellF=True):
         for i in range(interval.begin, interval.end):
             fold = ucs_case_fold.get_fold_set(i, Flags)
             for x in fold:
                 if type(x) == list:
                     # Multi-character fold: needs a character sequence path.
                     __add_intermediate_states(sm, x, start_state_idx, target_state_idx)
                 else:
                     # Single-character fold: triggers on the same transition.
                     trigger_set.add_interval(Interval(x, x+1))
Beispiel #16
0
    def delete_transitions_on_character_list(self, CharacterCodeList):
        """Remove every character in CharacterCodeList from all trigger sets,
           then drop transitions whose trigger set became empty."""
        for char_code in CharacterCodeList:
            for trigger_set in self.__db.values():
                if trigger_set.contains(char_code):
                    trigger_set.cut_interval(Interval(char_code, char_code + 1))

        self.delete_transitions_on_empty_trigger_sets()
Beispiel #17
0
    def set_target(self, Character, NewTarget):
        """Set the target in the transition map for a given 'Character'.

           Splits the interval that contains 'Character', if necessary, so
           that afterwards the map contains 'Character' --> 'NewTarget';
           then adjacent intervals with equal targets are re-combined.
        """
        # Find the index of the interval which contains 'Character'
        i = TransitionMap.bisect(self, Character)
        if i is None:
            # No interval contains 'Character': add a single-character entry
            # and restore sort order.
            self.insert(0, (Interval(Character), NewTarget))
            self.sort()
            return

        # Split the found interval, if necessary, so that the map
        # contains 'Character' --> 'NewTarget'.
        interval, target = self[i]
        assert interval.size() > 0

        new_i = None

        if target == NewTarget: 
            return # Nothing to be done

        elif interval.size() == 1:
            # Single-character interval: simply re-target it.
            self[i] = (interval, NewTarget)
            new_i   = i

        elif Character == interval.end - 1:
            # 'Character' is the last element: chop it off at the end.
            self.insert(i+1, (Interval(Character), NewTarget))
            interval.end -= 1
            new_i         = i + 1

        elif Character == interval.begin:
            # 'Character' is the first element: chop it off at the front.
            self.insert(i, (Interval(Character), NewTarget))
            interval.begin += 1
            new_i           = i

        else:
            # 'Character' lies strictly inside: split into three pieces.
            self.insert(i+1, (Interval(Character), NewTarget))
            self.insert(i+2, (Interval(Character+1, interval.end), target))
            interval.end = Character 
            new_i        = i + 1

        # Combine adjacent intervals which trigger to the same target.
        self.combine_adjacents(new_i)
        self.assert_continuity()
        return
Beispiel #18
0
def parse_table(Filename,
                IntervalColumnList=(),
                NumberColumnList=(),
                NumberListColumnList=(),
                CommentF=False):
    """Parse a semicolon-separated data base file into a list of records.

       Columns in IntervalColumnList   --> converted to Interval() objects
                  NumberColumnList     --> converted to integers (hex numbers)
                  NumberListColumnList --> converted to integer list (hex numbers)

       If CommentF is set, the '#'-comment of each line is appended as
       the last cell of the record.

       NOTE: The defaults are immutable tuples (mutable '[]' defaults are
       shared across calls); any iterable of column indices is accepted.

       RETURNS: list of records; each record is the list of (converted) cells.
    """
    fh = open_data_base_file(Filename)

    record_set = []
    for line in fh.readlines():
        # Split off the trailing '#'-comment, if any.
        comment_idx = line.find("#")
        comment = None
        if comment_idx != -1:
            comment = line[comment_idx + 1:]
            line = line[:comment_idx]

        if line == "" or line.isspace():
            continue

        # append content to record set
        cells = [cell.strip() for cell in line.split(";")]

        for i in IntervalColumnList:
            fields = cells[i].split("..")  # range: value0..value1
            assert len(fields) in [1, 2]

            begin = int("0x" + fields[0], 16)
            if len(fields) == 2:
                end = int("0x" + fields[1], 16) + 1
            else:
                # A single value stands for the interval [value, value+1).
                end = begin + 1
            cells[i] = Interval(begin, end)

        for i in NumberColumnList:
            cells[i] = int("0x" + cells[i], 16)

        for i in NumberListColumnList:
            cells[i] = [int("0x" + n, 16) for n in cells[i].split()]

        # Sometimes, the comment is useful
        if CommentF:
            cells.append(comment)

        record_set.append(cells)

    # There is no need to decouple here, since the record_set is created
    # each time that the function is called.
    return record_set
Beispiel #19
0
    def load_Composition_Exclusion(self):
        """Load 'CompositionExclusions.txt' into db["CE"]; column 0 holds
           the excluded code points."""
        table = parse_table("CompositionExclusions.txt", NumberColumnList=[0])

        result = NumberSet()
        for row in table:
            code_point = row[0]
            result.quick_append_interval(Interval(code_point, code_point + 1))
        # Appended without sorting; simplify the set now.
        result.clean()

        self.db["CE"].code_point_db = result
Beispiel #20
0
def parse_character_set(Txt_or_File, PatternStringF=False):
    """Parse a regular character set expression from a string or stream
       and strip the buffer/path limit codes from it.

       NOTE(review): the parsed 'character_set' is never returned, so the
       function yields None -- unless callers rely purely on side effects,
       a 'return character_set' appears to be missing; confirm against the
       call sites.
    """
    sh, sh_ref, start_position = __prepare_text_or_file_stream(Txt_or_File)

    try:
        # -- parse regular expression, build state machine
        character_set = charset_expression.snap_set_expression(
            sh, blackboard.shorthand_db)

        if character_set is None:
            error_msg("No valid regular character set expression detected.",
                      sh_ref)

        # -- character set is not supposed to contain buffer limit code
        if character_set.contains(Setup.buffer_limit_code):
            character_set.cut_interval(Interval(Setup.buffer_limit_code))
        if character_set.contains(Setup.path_limit_code):
            character_set.cut_interval(Interval(Setup.path_limit_code))

    except RegularExpressionException, x:
        error_msg("Regular expression parsing:\n" + x.message, sh_ref)
Beispiel #21
0
def get_all():
    """RETURNS:

       A state machine that 'eats' absolutely everything, i.e. 


                              .--- \Any ---.
                              |            |
           (0)--- \Any --->(( 0 ))<--------'
    """
    result = StateMachine()

    # Acceptance state that loops on any character.
    acceptance_idx   = index.get()
    acceptance_state = State(AcceptanceF=True)
    acceptance_state.add_transition(
        NumberSet(Interval(-sys.maxint, sys.maxint)), acceptance_idx)
    result.states[acceptance_idx] = acceptance_state

    # Init state transits on any character into the acceptance state.
    result.get_init_state().add_transition(
        NumberSet(Interval(-sys.maxint, sys.maxint)), acceptance_idx)

    return result
Beispiel #22
0
def create_ALL_BUT_NEWLINE_state_machine(stream):
    """Build a state machine accepting any admissible character but newline."""
    global Setup
    sm = StateMachine()
    # NOTE: Buffer control characters are supposed to be filtered out by the
    #       code generator.
    admissible = NumberSet(Interval(ord("\n"))).get_complement(Setup.buffer_codec.source_set)
    if admissible.is_empty():
        error_msg("The set of admissible characters contains only newline.\n"
                  "The '.' for 'all but newline' is an empty set.",
                  SourceRef.from_FileHandle(stream))

    sm.add_transition(sm.init_state_index, admissible, AcceptanceF=True)
    return sm
Beispiel #23
0
def get_any():
    """RETURNS:

       A state machine that 'eats' any character, but only one. 

           (0)--- \Any --->(( 0 ))
    """
    sm = StateMachine()
    anything = NumberSet(Interval(-sys.maxint, sys.maxint))
    sm.add_transition(sm.init_state_index, anything, AcceptanceF=True)
    return sm
Beispiel #24
0
def __display_set(CharSet, cl):
    if Setup.query_numeric_f: display = "hex"
    else: display = "utf8"

    CharSet.intersect_with(NumberSet(Interval(0, 0x110000)))

    print "Characters:\n"
    if Setup.query_interval_f:
        __print_set_in_intervals(CharSet, display, 80)
    elif Setup.query_unicode_names_f:
        __print_set_character_names(CharSet, display, 80)
    else:
        __print_set_single_characters(CharSet, display, 80)

    print
Beispiel #25
0
def __display_set(CharSet, cl):
    if cl.search("--numeric"): display = "hex"
    else: display = "utf8"

    CharSet.intersect_with(NumberSet(Interval(0, 0x110000)))

    print "Characters:\n",
    if cl.search("--intervals"):
        __print_set_in_intervals(CharSet, display, 80)
    elif cl.search("--names"):
        __print_set_character_names(CharSet, display, 80)
    else:
        __print_set_single_characters(CharSet, display, 80)

    print
Beispiel #26
0
def arrange_trigger_map(trigger_map):
     """Arrange the trigger map: Sort, and insert 'drop-out-regions'

        Modifies 'trigger_map' in place so that afterwards the intervals
        are sorted and cover (-sys.maxint, sys.maxint) without gaps; every
        inserted filler interval maps to DROP_OUT.
     """
     #  -- sort by interval
     trigger_map.sort(lambda x, y: cmp(x[0].begin, y[0].begin))
     
     #  -- insert lower and upper 'drop-out-transitions'
     if trigger_map[0][0].begin != -sys.maxint: 
         trigger_map.insert(0, [Interval(-sys.maxint, trigger_map[0][0].begin), E_StateIndices.DROP_OUT])
     if trigger_map[-1][0].end != sys.maxint: 
         trigger_map.append([Interval(trigger_map[-1][0].end, sys.maxint), E_StateIndices.DROP_OUT])

     #  -- fill gaps
     #     (inserting while iterating: 'i' and 'size' are advanced past
     #      each freshly inserted filler so it is not revisited)
     previous_end = -sys.maxint
     i    = 0
     size = len(trigger_map)
     while i < size:
         interval = trigger_map[i][0]
         if interval.begin != previous_end: 
             trigger_map.insert(i, [Interval(previous_end, interval.begin), E_StateIndices.DROP_OUT])
             i    += 1
             size += 1
         i += 1
         previous_end = interval.end
Beispiel #27
0
def split_interval_according_to_utf8_byte_sequence_length(X):
    """Partition the unicode interval X into sub-intervals in which every
       code point has the same utf8 byte sequence length.

       RETURNS: dictionary: byte sequence length --> Interval
    """
    global utf8_border
    # Interpret 'open' borders as the full unicode range.
    if X.begin == -sys.maxint: X.begin = 0
    if X.end   == sys.maxint:  X.end   = 0x110000
    assert X.end <= 0x110000  # Interval must lie in unicode range

    result      = {}
    begin       = X.begin
    # utf8 sequence length of the last code point inside the interval.
    last_length = len(unicode_to_utf8(X.end - 1))
    while True:
        # utf8 sequence length of the first code point not yet covered.
        length = len(unicode_to_utf8(begin))
        border = utf8_border[length - 1]
        if length == last_length:
            result[length] = Interval(begin, X.end)
            break
        result[length] = Interval(begin, border)
        begin = border

    return result
Beispiel #28
0
def __create_database_file(TargetEncoding, TargetEncodingName):
    """Writes a database file for a given TargetEncodingName. The 
       TargetEncodingName is required to name the file where the 
       data is to be stored.
    """
    encoder = codecs.getencoder(TargetEncoding)
    prev_output = -1
    db = []
    bytes_per_char = -1
    for input in range(0x110000):
        output, n = __get_transformation(encoder, input)

        if bytes_per_char == -1:
            bytes_per_char = n
        elif n != -1 and bytes_per_char != n:
            print "# not a constant size byte format."
            return False

        # Detect discontinuity in the mapping
        if prev_output == -1:
            if output != -1:
                input_interval = Interval(input)
                target_interval_begin = output

        elif output != prev_output + 1:
            # If interval was valid, append it to the database
            input_interval.end = input
            db.append((input_interval, target_interval_begin))
            # If interval ahead is valid, prepare an object for it
            if output != -1:
                input_interval = Interval(input)
                target_interval_begin = output

        prev_output = output

    if prev_output != -1:
        input_interval.end = input
        db.append((input_interval, target_interval_begin))

    fh = open_file_or_die(QUEX_CODEC_DB_PATH + "/%s.dat" % TargetEncoding,
                          "wb")
    fh.write("// Describes mapping from Unicode Code pointer to Character code in %s (%s)\n" \
             % (TargetEncoding, TargetEncodingName))
    fh.write(
        "// [SourceInterval.begin] [SourceInterval.Size]  [TargetInterval.begin] (all in hexidecimal)\n"
    )
    for i, t in db:
        fh.write("0x%X %i 0x%X\n" % (i.begin, i.end - i.begin, t))
    fh.close()

    return True
Beispiel #29
0
def __delete_forbidden_character(sm, BLC):
    """The buffer limit code is something that **needs** to cause a drop out.
       In the drop out handling, the buffer is reloaded.

       Critical character is allowed at end of post context.

       NOTE: This operation might result in orphaned states that have to 
             be deleted.
    """
    for state in sm.states.values():
        transition_map = state.transitions().get_map()
        for target_index, trigger_set in transition_map.items():

            # Cut the buffer limit code out of the trigger set.
            if trigger_set.contains(BLC):
                trigger_set.cut_interval(Interval(BLC, BLC + 1))

            # The cut may have emptied the path to the target; delete it then.
            if trigger_set.is_empty():
                state.transitions().delete_transitions_to_target(target_index)
Beispiel #30
0
def convert_table_to_associative_map(table, ValueColumnIdx, ValueType,
                                     KeyColumnIdx):
    """Produces a dictionary that maps from 'keys' to NumberSets. The 
       number sets represent the code points for which the key (property)
       is valid.

       table:          list of records (lists of cells).

       ValueColumnIdx: Column that contains the character code interval or
                       string to which one wishes to map.

       KeyColumnIdx:   Column that contains the 'key' to be used for the map.

       ValueType:      "NumberSet" --> values collected into NumberSets;
                       "number"/"string" --> last value per key wins.

       RAISES:  ValueError if ValueType is unknown (previously the overly
                broad BaseException; ValueError is narrower and still caught
                by any existing handler).

       RETURNS: the associative map.
    """
    db = {}
    if ValueType == "NumberSet":
        for record in table:
            key = record[KeyColumnIdx].strip().replace(" ", "_")
            value = record[ValueColumnIdx]

            # A plain code point stands for a single-value interval.
            if isinstance(value, int): value = Interval(value)

            db.setdefault(key, NumberSet()).quick_append_interval(value,
                                                                  SortF=False)

        # Intervals were appended unsorted; simplify each number set.
        for number_set in db.values():
            number_set.clean()

    elif ValueType == "number" or ValueType == "string":
        for record in table:
            key = record[KeyColumnIdx].strip().replace(" ", "_")
            db[key] = record[ValueColumnIdx]

    else:
        raise ValueError("ValueType = '%s' unknown.\n" % ValueType)

    return db
Beispiel #31
0
def prepare_transition_map(TheState, TheAnalyzer, StateKeyStr):
    """Convert the state's transition map into pairs of

                          (Interval, TextTransitionCode)

       which the code generation can handle.

       NOTE: A word about the reload procedure.

       Reload ends either with success (new data has been loaded) or with
       failure (no more data available). On success **only** the transition
       step has to be repeated; stored positions are adapted automatically.
       By convention the transition map is redone on reload success and the
       state's drop-out is entered on failure. Template states behave no
       differently here.
    """
    if TheState.transition_map_empty_f:
        # An empty transition map (e.g. only keywords, no 'overlaying'
        # identifier pattern) must still catch the 'buffer limit code'.
        # Define an 'all drop out' trigger map here; later the buffer limit
        # is isolated into a single interval.
        TheState.transition_map = [ (Interval(-sys.maxint, sys.maxint), MegaState_Target_DROP_OUT) ]

    # Replace each target, in place, by its prepared counterpart.
    for i, (interval, target) in enumerate(TheState.transition_map):
        TheState.transition_map[i] = \
            (interval, prepare_target(target, TheState, TheAnalyzer.state_db, StateKeyStr))

    return