def snap_set_expression(stream, PatternDict):
    """Parse a character set expression at the current stream position.

    The following forms are tried in this order:

      -- a property set recognized by 'snap_property_set()',
      -- '\\C...'    case fold, delegated to 'case_fold_expression.do()',
      -- '[: ... :]' set term, parsed by 'snap_set_term()',
      -- '[ ... ]'   traditional character set,
      -- '{ ... }'   replacement, parsed by 'snap_replacement()'.

    RETURNS: Whatever the responsible parser returned; None if none of the
             forms starts at the current position.

    RAISES:  RegularExpressionException, if a '[:' term is not closed by ':]'.

    NOTE: If None results from the two character lookahead, the stream
          position is not restored by this function.
    """
    assert stream.__class__.__name__ in ("StringIO", "file")

    __debug_entry("set_expression", stream)

    result = snap_property_set(stream)
    if result is not None:
        return result

    x = stream.read(2)
    # At the end of the stream 'x' may be shorter than two characters or even
    # empty. Slicing never raises, whereas 'x[0]' fails on an empty string.
    first = x[:1]

    if x == "\\C":
        return case_fold_expression.do(stream, PatternDict,
                                       snap_set_expression=snap_set_expression)

    elif x == "[:":
        result = snap_set_term(stream, PatternDict)
        skip_whitespace(stream)
        x = stream.read(2)
        if x != ":]":
            raise RegularExpressionException("Missing closing ':]' for character set expression.\n" + \
                                             "found: '%s'" % x)

    elif first == "[":
        # step one character back: only the '[' is meant to remain consumed
        stream.seek(-1, 1)
        result = traditional_character_set.do(stream)

    elif first == "{":
        # step one character back: only the '{' is meant to remain consumed
        stream.seek(-1, 1)
        result = snap_replacement(stream, PatternDict, StateMachineF=False)

    else:
        result = None

    return __debug_exit(result, stream)
def __parse_property_expression(stream, PropertyLetter, EqualConditionPossibleF=True):
    r"""Parses an expression of the form '\? { X [ = Y] }' where ? = PropertyLetter.

    stream                  -- input stream, positioned right before the '\?'.
    PropertyLetter          -- the single letter expected after the backslash.
    EqualConditionPossibleF -- False, if an '=' inside the braces is an error.

    RETURNS: A list of the whitespace-stripped fields: one element [X] if no
             '=' is present, two elements [X, Y] (left hand side, right hand
             side) otherwise.

    RAISES:  RegularExpressionException, if
             -- the stream does not start with '\' + PropertyLetter,
             -- no '{' follows,
             -- the content between the braces is empty,
             -- more than one '=' is present, or
             -- an '=' is present although EqualConditionPossibleF is False.
    """
    assert isinstance(PropertyLetter, str)
    assert len(PropertyLetter) == 1
    assert isinstance(EqualConditionPossibleF, bool)

    # verify '\?'
    x = stream.read(2)
    if x != "\\" + PropertyLetter:
        # Two placeholders require a tuple; with a single argument the format
        # operation itself would fail with a TypeError.
        raise RegularExpressionException("Unicode property letter '\\%s' expected, received '%s'."
                                         % (PropertyLetter, x))

    skip_whitespace(stream)

    x = stream.read(1)
    if x != "{":
        raise RegularExpressionException("Unicode property '\\%s' not followed by '{'." % PropertyLetter)

    content = __snap_until(stream, "}")

    # 'split()' always delivers at least one element, so emptiness has to be
    # detected on the content itself.
    if content.strip() == "":
        raise RegularExpressionException("Unicode property expression '\\%s{}' cannot have no content."
                                         % PropertyLetter)

    fields = content.split("=")

    if len(fields) > 2:
        raise RegularExpressionException("Unicode property expression '\\%s' can have at maximum one '='."
                                         % PropertyLetter)

    if not EqualConditionPossibleF and len(fields) == 2:
        raise RegularExpressionException("Unicode property expression '\\%s' does not allow '=' conditions"
                                         % PropertyLetter)

    # A real list, independent of whether 'map()' is lazy or not.
    return [field.strip() for field in fields]
def snap_set_expression(stream):
    """Parse a character set expression at the current stream position.

    The following forms are tried in this order:

      -- a property set recognized by 'snap_property_set()',
      -- '[: ... :]' set term, parsed by 'snap_set_term()',
      -- '[ ... ]'   traditional character set,
      -- '\\P', '\\N', '\\G' property expressions, handled by 'property'.

    RETURNS: Whatever the responsible parser returned; None if none of the
             forms starts at the current position.

    RAISES:  RegularExpressionException, if a '[:' term is not closed by ':]'.

    NOTE: If None results from the two character lookahead, the stream
          position is not restored by this function.
    """
    assert stream.__class__.__name__ in ("StringIO", "file")

    __debug_entry("set_expression", stream)

    result = snap_property_set(stream)
    if result is not None:
        return result

    x = stream.read(2)

    if x == "[:":
        result = snap_set_term(stream)
        skip_whitespace(stream)
        x = stream.read(2)
        if x != ":]":
            raise RegularExpressionException("Missing closing ':]' for character set expression.\n" + \
                                             "found: '%s'" % x)

    # At the end of the stream 'x' may be empty. Slicing never raises,
    # whereas 'x[0]' fails on an empty string.
    elif x[:1] == "[":
        # step one character back: only the '[' is meant to remain consumed
        stream.seek(-1, 1)
        result = traditional_character_set.do(stream)

    elif x == "\\P":
        stream.seek(-2, 1)
        result = property.do(stream)

    elif x == "\\N":
        stream.seek(-2, 1)
        result = property.do_shortcut(stream, "N", "na")  # UCS Property: Name

    elif x == "\\G":
        stream.seek(-2, 1)
        result = property.do_shortcut(stream, "G", "gc")  # UCS Property: General_Category

    else:
        result = None

    return __debug_exit(result, stream)
def snap_set_term(stream, PatternDict):
    """Parse one set term at the current stream position.

    A set term is one of

      -- a set operation 'union(...)', 'intersection(...)', 'difference(...)'
         or 'inverse(...)' whose operands are parsed by 'snap_set_list()',
      -- the name of a set in 'special_character_set_db()', or
      -- anything 'snap_set_expression()' is able to parse.

    The first operand of an operation is modified in place and becomes the
    result ('inverse' delivers what '.inverse()' returns on it).

    RETURNS: '__debug_exit(result, stream)' for the resulting set.

    RAISES:  RegularExpressionException, if 'union', 'intersection' or
             'difference' is given less than two operands.
    """
    __debug_entry("set_term", stream)

    operation_list = ["union", "intersection", "difference", "inverse"]
    character_set_list = special_character_set_db().keys()

    skip_whitespace(stream)
    # Remembered, so that something which is not a keyword can be parsed
    # again as a plain set expression.
    position = stream.tell()

    word = read_identifier(stream)

    if word in operation_list:
        # Syntax errors inside the operand list surface as exceptions.
        operands = snap_set_list(stream, word, PatternDict)
        result = operands[0]
        others = operands[1:]

        if word == "inverse":
            # The inverse of several sets is the inverse of their union.
            for operand in others:
                result.unite_with(operand)
            result = result.inverse()
            # NOTE(review): -1 looks like 'no limit' -- confirm against Setup.
            if Setup.get_character_value_limit() != -1:
                result.intersect_with(Interval(0, Setup.get_character_value_limit()))
            return __debug_exit(result, stream)

        if len(operands) < 2:
            raise RegularExpressionException("Regular Expression: A %s operation needs at least\n" % word + \
                                             "two sets to operate on them.")

        if word == "union":
            for operand in others:
                result.unite_with(operand)
        elif word == "intersection":
            for operand in others:
                result.intersect_with(operand)
        elif word == "difference":
            for operand in others:
                result.subtract(operand)

    elif word in character_set_list:
        result = special_character_set_db()[word]

    elif word != "":
        verify_word_in_list(word, character_set_list + operation_list,
                            "Unknown keyword '%s'." % word, stream)

    else:
        stream.seek(position)
        result = snap_set_expression(stream, PatternDict)

    return __debug_exit(result, stream)
def snap_set_term(stream, PatternDict):
    """Parse one set term at the current stream position.

    A set term is one of

      -- a set operation 'union(...)', 'intersection(...)', 'difference(...)'
         or 'inverse(...)' whose operands are parsed by 'snap_set_list()',
      -- the name of a set in 'special_character_set_db()', or
      -- anything 'snap_set_expression()' is able to parse.

    The first operand of an operation is modified in place and becomes the
    result ('inverse' delivers what '.inverse()' returns on it).

    RETURNS: '__debug_exit(result, stream)' for the resulting set.

    RAISES:  RegularExpressionException, if 'union', 'intersection' or
             'difference' is given less than two operands.
    """
    __debug_entry("set_term", stream)

    operation_list = ["union", "intersection", "difference", "inverse"]
    character_set_list = special_character_set_db().keys()

    skip_whitespace(stream)
    # Remembered, so that something which is not a keyword can be parsed
    # again as a plain set expression.
    position = stream.tell()

    word = read_identifier(stream)

    if word in operation_list:
        # Syntax errors inside the operand list surface as exceptions.
        operands = snap_set_list(stream, word, PatternDict)
        result = operands[0]
        others = operands[1:]

        if word == "inverse":
            # The inverse of several sets is the inverse of their union.
            for operand in others:
                result.unite_with(operand)
            result = result.inverse()
            # NOTE(review): -1 looks like 'no limit' -- confirm against Setup.
            if Setup.get_character_value_limit() != -1:
                result.intersect_with(Interval(0, Setup.get_character_value_limit()))
            return __debug_exit(result, stream)

        if len(operands) < 2:
            raise RegularExpressionException("Regular Expression: A %s operation needs at least\n" % word + \
                                             "two sets to operate on them.")

        if word == "union":
            for operand in others:
                result.unite_with(operand)
        elif word == "intersection":
            for operand in others:
                result.intersect_with(operand)
        elif word == "difference":
            for operand in others:
                result.subtract(operand)

    elif word in character_set_list:
        result = special_character_set_db()[word]

    elif word != "":
        verify_word_in_list(word, character_set_list + operation_list,
                            "Unknown keyword '%s'." % word, stream)

    else:
        stream.seek(position)
        result = snap_set_expression(stream, PatternDict)

    return __debug_exit(result, stream)
def snap_set_term(stream):
    """Parse one set term at the current stream position.

    A set term is one of

      -- a set operation 'union(...)', 'intersection(...)', 'difference(...)'
         or 'inverse(...)' whose operands are parsed by 'snap_set_list()',
      -- the name of a set in 'special_character_set_db', or
      -- anything 'snap_set_expression()' is able to parse.

    The first operand of an operation is modified in place and becomes the
    result ('inverse' delivers what '.inverse()' returns on it).

    RETURNS: '__debug_exit(result, stream)' for the resulting set.

    RAISES:  RegularExpressionException, if 'union', 'intersection' or
             'difference' is given less than two operands.
    """
    __debug_entry("set_term", stream)

    skip_whitespace(stream)
    # Remembered, so that something which is not a keyword can be parsed
    # again as a plain set expression.
    position = stream.tell()

    try:
        word = read_until_non_letter(stream)
        stream.seek(-1, 1)  # put back the non-letter
    except:
        # NOTE(review): deliberately catches everything; which exception is
        #               expected here cannot be told from this function.
        word = "not a valid word"
    word = word.strip()

    if word not in ("union", "intersection", "difference", "inverse"):
        if word in special_character_set_db.keys():
            result = special_character_set_db[word]
        else:
            # not a keyword: try to snap an expression out of it
            stream.seek(position)
            result = snap_set_expression(stream)
        return __debug_exit(result, stream)

    # Syntax errors inside the operand list surface as exceptions.
    operands = snap_set_list(stream, word)
    result = operands[0]
    others = operands[1:]

    if word == "inverse":
        # The inverse of several sets is the inverse of their union.
        for operand in others:
            result.unite_with(operand)
        return __debug_exit(result.inverse(), stream)

    if len(operands) < 2:
        raise RegularExpressionException("Regular Expression: A %s operation needs at least\n" % word + \
                                         "two sets to operate on them.")

    if word == "union":
        for operand in others:
            result.unite_with(operand)
    elif word == "intersection":
        for operand in others:
            result.intersect_with(operand)
    elif word == "difference":
        for operand in others:
            result.subtract(operand)

    return __debug_exit(result, stream)
def snap_set_term(stream):
    """Parse one set term at the current stream position.

    A set term is one of

      -- a set operation 'union(...)', 'intersection(...)', 'difference(...)'
         or 'inverse(...)' whose operands are parsed by 'snap_set_list()',
      -- the name of a set in 'special_character_set_db', or
      -- anything 'snap_set_expression()' is able to parse.

    The first operand of an operation is modified in place and becomes the
    result ('inverse' delivers what '.inverse()' returns on it).

    RETURNS: '__debug_exit(result, stream)' for the resulting set.

    RAISES:  RegularExpressionException, if 'union', 'intersection' or
             'difference' is given less than two operands.
    """
    __debug_entry("set_term", stream)

    skip_whitespace(stream)
    # Remembered, so that something which is not a keyword can be parsed
    # again as a plain set expression.
    position = stream.tell()

    try:
        word = read_until_non_letter(stream)
        stream.seek(-1, 1)  # put back the non-letter
    except:
        # NOTE(review): deliberately catches everything; which exception is
        #               expected here cannot be told from this function.
        word = "not a valid word"
    word = word.strip()

    if word not in ("union", "intersection", "difference", "inverse"):
        if word in special_character_set_db.keys():
            result = special_character_set_db[word]
        else:
            # not a keyword: try to snap an expression out of it
            stream.seek(position)
            result = snap_set_expression(stream)
        return __debug_exit(result, stream)

    # Syntax errors inside the operand list surface as exceptions.
    operands = snap_set_list(stream, word)
    result = operands[0]
    others = operands[1:]

    if word == "inverse":
        # The inverse of several sets is the inverse of their union.
        for operand in others:
            result.unite_with(operand)
        return __debug_exit(result.inverse(), stream)

    if len(operands) < 2:
        raise RegularExpressionException("Regular Expression: A %s operation needs at least\n" % word + \
                                         "two sets to operate on them.")

    if word == "union":
        for operand in others:
            result.unite_with(operand)
    elif word == "intersection":
        for operand in others:
            result.intersect_with(operand)
    elif word == "difference":
        for operand in others:
            result.subtract(operand)

    return __debug_exit(result, stream)
def snap_property_set(stream):
    """Try to parse a property based character set at the current position.

    Recognized two character prefixes:

      '\\P'  -- delegated to 'property.do()',
      '\\N'  -- shortcut for property 'na' (UCS Property: Name),
      '\\G'  -- shortcut for property 'gc' (UCS Property: General_Category),
      '\\E{ name }' -- the character set which
                      'codec_db.get_supported_unicode_character_set()'
                      reports for the encoding 'name'.

    For '\\P', '\\N' and '\\G' the stream is set back in front of the prefix
    before the delegate is called.

    RETURNS: The result of the delegate; None if no prefix matches, in which
             case the stream position is restored.
    """
    position = stream.tell()
    x = stream.read(2)

    if x == "\\E":
        skip_whitespace(stream)
        if not check(stream, "{"):
            error_msg("Missing '{' after '\\E'.", stream)
        encoding_name = __snap_until(stream, "}").strip()
        return codec_db.get_supported_unicode_character_set(encoding_name, stream)

    # All remaining cases continue from the position in front of the prefix.
    stream.seek(position)

    if x == "\\P":
        return property.do(stream)
    elif x == "\\N":
        return property.do_shortcut(stream, "N", "na")  # UCS Property: Name
    elif x == "\\G":
        return property.do_shortcut(stream, "G", "gc")  # UCS Property: General_Category

    return None
def snap_set_list(stream, set_operation_name):
    """Parse the operand list '( term, term, ... )' of a set operation.

    stream             -- input stream, positioned in front of the '('.
    set_operation_name -- name of the operation; only used in error messages.

    RETURNS: '__debug_exit(set_list, stream)' where 'set_list' holds the
             results of 'snap_set_term()' in the order of appearance.

    RAISES:  RegularExpressionException, if the opening bracket is missing,
             a term is missing, or a term is followed by neither ',' nor ')'.
    """
    __debug_entry("set_list", stream)

    skip_whitespace(stream)
    if stream.read(1) != "(":
        raise RegularExpressionException("Missing opening bracket '%s' operation." % set_operation_name)

    set_list = []
    while True:
        skip_whitespace(stream)
        result = snap_set_term(stream)
        # Identity check: an '==' comparison would be routed through the
        # comparison operators of the set object.
        if result is None:
            raise RegularExpressionException("Missing set expression list after '%s' operation." % set_operation_name)
        set_list.append(result)

        skip_whitespace(stream)
        tmp = stream.read(1)
        if tmp == ",":
            continue

        if tmp != ")":
            # Put back the unexpected character. At the end of the stream
            # nothing has been read, so there is nothing to put back.
            if tmp != "":
                stream.seek(-1, 1)
            raise RegularExpressionException("Missing closing ')' after after '%s' operation." % set_operation_name)

        return __debug_exit(set_list, stream)
def snap_property_set(stream):
    """Try to parse a property based character set at the current position.

    Recognized two character prefixes:

      '\\P'  -- delegated to 'property.do()',
      '\\N'  -- shortcut for property 'na' (UCS Property: Name),
      '\\G'  -- shortcut for property 'gc' (UCS Property: General_Category),
      '\\E{ name }' -- the character set which
                      'codec_db.get_supported_unicode_character_set()'
                      reports for the encoding 'name'.

    For '\\P', '\\N' and '\\G' the stream is set back in front of the prefix
    before the delegate is called.

    RETURNS: The result of the delegate; None if no prefix matches, in which
             case the stream position is restored.
    """
    position = stream.tell()
    x = stream.read(2)

    if x == "\\E":
        skip_whitespace(stream)
        if not check(stream, "{"):
            error_msg("Missing '{' after '\\E'.", stream)
        encoding_name = __snap_until(stream, "}").strip()
        return codec_db.get_supported_unicode_character_set(encoding_name, stream)

    # All remaining cases continue from the position in front of the prefix.
    stream.seek(position)

    if x == "\\P":
        return property.do(stream)
    elif x == "\\N":
        return property.do_shortcut(stream, "N", "na")  # UCS Property: Name
    elif x == "\\G":
        return property.do_shortcut(stream, "G", "gc")  # UCS Property: General_Category

    return None
def snap_set_list(stream, set_operation_name, PatternDict):
    """Parse the operand list '( term, term, ... )' of a set operation.

    stream             -- input stream, positioned in front of the '('.
    set_operation_name -- name of the operation; only used in error messages.
    PatternDict        -- passed on unchanged to 'snap_set_term()'.

    RETURNS: '__debug_exit(set_list, stream)' where 'set_list' holds the
             results of 'snap_set_term()' in the order of appearance.

    RAISES:  RegularExpressionException, if the opening bracket is missing,
             a term is missing, or a term is followed by neither ',' nor ')'.
    """
    __debug_entry("set_list", stream)

    skip_whitespace(stream)
    if stream.read(1) != "(":
        raise RegularExpressionException("Missing opening bracket '%s' operation." % set_operation_name)

    set_list = []
    while True:
        skip_whitespace(stream)
        result = snap_set_term(stream, PatternDict)
        # Identity check: an '==' comparison would be routed through the
        # comparison operators of the set object.
        if result is None:
            raise RegularExpressionException("Missing set expression list after '%s' operation." % set_operation_name)
        set_list.append(result)

        skip_whitespace(stream)
        tmp = stream.read(1)
        if tmp == ",":
            continue

        if tmp != ")":
            # Put back the unexpected character. At the end of the stream
            # nothing has been read, so there is nothing to put back.
            if tmp != "":
                stream.seek(-1, 1)
            raise RegularExpressionException("Missing closing ')' after after '%s' operation." % set_operation_name)

        return __debug_exit(set_list, stream)
def do(sh, PatternDict, snap_expression=None, snap_set_expression=None):
    r"""Parse a case fold expression of the form \C(..){ R } or \C{ R }.
    Assume that '\C' has been snapped already from the stream.

    See function ucs_case_fold_parser.get_fold_set() for details about case
    folding.

    snap_expression     != None, then snap_expression is the function to
                                 parse a RE and the caller expects a state
                                 machine.

    snap_set_expression != None, then snap_set_expression is the function to
                                 parse a character set and the caller expects
                                 a NumberSet object.

    The optional flags in '(..)' consist of the letters 's', 'm' and 't'.
    Without flags, 's' is used for set expressions and 'sm' otherwise.

    RETURNS: The case folded character set (set expression mode), or the case
             folded state machine.

    Syntax errors are reported through 'error_msg()'.
    """
    set_expression_f = snap_set_expression is not None

    pos = sh.tell()
    skip_whitespace(sh)

    # -- parse the optional options in '(' ')' brackets
    if not check(sh, "("):
        # By default 'single' and 'multi' character case folds are active
        if set_expression_f:
            flag_txt = "s"
        else:
            flag_txt = "sm"
    else:
        flag_txt = read_until_character(sh, ")")

        if flag_txt == "":
            sh.seek(pos)
            error_msg("Missing closing ')' in case fold expression.", sh)

        flag_txt = flag_txt.replace(" ", "").replace("\t", "").replace("\n", "")

        for letter in flag_txt:
            if letter not in "smt":
                sh.seek(pos)
                error_msg("Letter '%s' not permitted as case fold option.\n" % letter + \
                          "Options are: 's' for simple case fold.\n" + \
                          " 'm' for multi character sequence case fold.\n" + \
                          " 't' for special turkish case fold rules.", sh)

            if set_expression_f and letter == "m":
                sh.seek(pos)
                error_msg("Option 'm' not permitted as case fold option in set expression.\n" + \
                          "Set expressions cannot absorb multi character sequences.", sh)

    skip_whitespace(sh)

    # -- parse the expression in '{' '}' which is subject to case folding
    if not check(sh, "{"):
        sh.seek(pos)
        error_msg("Missing '{' for case fold expression.", sh)

    skip_whitespace(sh)
    if set_expression_f:
        trigger_set = snap_set_expression(sh, PatternDict)
        if trigger_set is None:
            error_msg("Missing character set for case fold in set expression.\n" +
                      "The content in '\\C{content}' should start with '[' or '[:'.",
                      sh)

        # -- perform the case fold for Sets!
        #    The folds are collected first and added afterwards, so that the
        #    set is never modified while its intervals are iterated.
        #    NOTE(review): 'PromiseToTreatWellF=True' presumably hands out the
        #                  set's own interval list -- confirm in NumberSet.
        fold_list = []
        for interval in trigger_set.get_intervals(PromiseToTreatWellF=True):
            for i in range(interval.begin, interval.end):
                for x in ucs_case_fold.get_fold_set(i, flag_txt):
                    assert not isinstance(x, list)
                    fold_list.append(x)

        for x in fold_list:
            trigger_set.add_interval(Interval(x, x + 1))

        result = trigger_set

    else:
        sm = snap_expression(sh, PatternDict)
        if sm is None:
            error_msg("Missing expression for case fold '\\C'.\n" +
                      "The content in '\\C{content}' should start with '[' or '[:'.",
                      sh)

        # -- perform the case fold for State Machines!
        #    Iterate over snapshots: adding case folds may modify the maps.
        for state_idx, state in list(sm.states.items()):
            transitions = state.transitions()
            for target_state_idx, trigger_set in list(transitions.get_map().items()):
                __add_case_fold(sm, flag_txt, trigger_set, state_idx, target_state_idx)

        result = sm

    if not check(sh, "}"):
        sh.seek(pos)
        error_msg("Missing '}' for case fold expression.", sh)

    return result
def do(sh, PatternDict, snap_expression=None, snap_set_expression=None):
    r"""Parse a case fold expression of the form \C(..){ R } or \C{ R }.
    Assume that '\C' has been snapped already from the stream.

    See function ucs_case_fold_parser.get_fold_set() for details about case
    folding.

    snap_expression     != None, then snap_expression is the function to
                                 parse a RE and the caller expects a state
                                 machine.

    snap_set_expression != None, then snap_set_expression is the function to
                                 parse a character set and the caller expects
                                 a NumberSet object.

    The optional flags in '(..)' consist of the letters 's', 'm' and 't'.
    Without flags, 's' is used for set expressions and 'sm' otherwise.

    RETURNS: The case folded character set (set expression mode), or the case
             folded state machine.

    Syntax errors are reported through 'error_msg()'.
    """
    set_expression_f = snap_set_expression is not None

    pos = sh.tell()
    skip_whitespace(sh)

    # -- parse the optional options in '(' ')' brackets
    if not check(sh, "("):
        # By default 'single' and 'multi' character case folds are active
        if set_expression_f:
            flag_txt = "s"
        else:
            flag_txt = "sm"
    else:
        flag_txt = read_until_character(sh, ")")

        if flag_txt == "":
            sh.seek(pos)
            error_msg("Missing closing ')' in case fold expression.", sh)

        flag_txt = flag_txt.replace(" ", "").replace("\t", "").replace("\n", "")

        for letter in flag_txt:
            if letter not in "smt":
                sh.seek(pos)
                error_msg("Letter '%s' not permitted as case fold option.\n" % letter + \
                          "Options are: 's' for simple case fold.\n" + \
                          " 'm' for multi character sequence case fold.\n" + \
                          " 't' for special turkish case fold rules.", sh)

            if set_expression_f and letter == "m":
                sh.seek(pos)
                error_msg("Option 'm' not permitted as case fold option in set expression.\n" + \
                          "Set expressions cannot absorb multi character sequences.", sh)

    skip_whitespace(sh)

    # -- parse the expression in '{' '}' which is subject to case folding
    if not check(sh, "{"):
        sh.seek(pos)
        error_msg("Missing '{' for case fold expression.", sh)

    skip_whitespace(sh)
    if set_expression_f:
        trigger_set = snap_set_expression(sh, PatternDict)
        if trigger_set is None:
            error_msg("Missing character set for case fold in set expression.\n" +
                      "The content in '\\C{content}' should start with '[' or '[:'.",
                      sh)

        # -- perform the case fold for Sets!
        #    The folds are collected first and added afterwards, so that the
        #    set is never modified while its intervals are iterated.
        #    NOTE(review): 'PromiseToTreatWellF=True' presumably hands out the
        #                  set's own interval list -- confirm in NumberSet.
        fold_list = []
        for interval in trigger_set.get_intervals(PromiseToTreatWellF=True):
            for i in range(interval.begin, interval.end):
                for x in ucs_case_fold.get_fold_set(i, flag_txt):
                    assert not isinstance(x, list)
                    fold_list.append(x)

        for x in fold_list:
            trigger_set.add_interval(Interval(x, x + 1))

        result = trigger_set

    else:
        sm = snap_expression(sh, PatternDict)
        if sm is None:
            error_msg("Missing expression for case fold '\\C'.\n" +
                      "The content in '\\C{content}' should start with '[' or '[:'.",
                      sh)

        # -- perform the case fold for State Machines!
        #    Iterate over snapshots: adding case folds may modify the maps.
        for state_idx, state in list(sm.states.items()):
            transitions = state.transitions()
            for target_state_idx, trigger_set in list(transitions.get_map().items()):
                __add_case_fold(sm, flag_txt, trigger_set, state_idx, target_state_idx)

        result = sm

    if not check(sh, "}"):
        sh.seek(pos)
        error_msg("Missing '}' for case fold expression.", sh)

    return result
def do(fh):
    r"""Parses pattern definitions of the form:

          [ \t]                                       => grid 4;
          [:intersection([:alpha:], [\X064-\X066]):]  => space 1;

    The left hand side is parsed as a regular expression. For the specifiers
    'space', 'grid' and 'bad' it must be matchable by a single character,
    i.e. it must boil down to a character set. 'newline' and 'suppressor'
    receive the whole state machine.

    Parsing ends at a '>' in place of a definition.

    RETURNS: The sealed and consistency-checked IndentationSetup object.

    Syntax errors are reported through 'error_msg()'.
    """
    indentation_setup = IndentationSetup(fh)

    # NOTE: Catching of EOF happens in caller: parse_section(...)
    while True:
        skip_whitespace(fh)

        if check(fh, ">"):
            indentation_setup.seal()
            indentation_setup.consistency_check(fh)
            return indentation_setup

        # A regular expression state machine
        pattern_str, state_machine = regular_expression.parse(fh)

        skip_whitespace(fh)
        if not check(fh, "=>"):
            error_msg("Missing '=>' after character set definition.", fh)

        skip_whitespace(fh)
        identifier = read_identifier(fh)
        if identifier == "":
            error_msg("Missing identifier for indentation element definition.", fh)

        verify_word_in_list(identifier,
                            ["space", "grid", "bad", "newline", "suppressor"],
                            "Unrecognized indentation specifier '%s'." % identifier, fh)

        trigger_set = None
        if identifier in ["space", "bad", "grid"]:
            if len(state_machine.states) != 2:
                error_msg("For indentation '%s' only patterns are addmissible which\n" % identifier + \
                          "can be matched by a single character, e.g. \" \" or [a-z].", fh)
            transition_map = state_machine.get_init_state().transitions().get_map()
            assert len(transition_map) == 1
            # 'list(...)' keeps the indexing valid where 'values()' is a view.
            trigger_set = list(transition_map.values())[0]

        skip_whitespace(fh)
        if identifier == "space":
            value = read_integer(fh)
            if value is not None:
                indentation_setup.specify_space(pattern_str, trigger_set, value, fh)
            else:
                # not a number received, is it an identifier?
                variable = read_identifier(fh)
                if variable != "":
                    indentation_setup.specify_space(pattern_str, trigger_set, variable, fh)
                else:
                    # neither number nor variable: a space counts as 1
                    indentation_setup.specify_space(pattern_str, trigger_set, 1, fh)

        elif identifier == "grid":
            value = read_integer(fh)
            if value is not None:
                indentation_setup.specify_grid(pattern_str, trigger_set, value, fh)
            else:
                # not a number received, is it an identifier?
                skip_whitespace(fh)
                variable = read_identifier(fh)
                if variable != "":
                    indentation_setup.specify_grid(pattern_str, trigger_set, variable, fh)
                else:
                    error_msg("Missing integer or variable name after keyword 'grid'.", fh)

        elif identifier == "bad":
            indentation_setup.specify_bad(pattern_str, trigger_set, fh)

        elif identifier == "newline":
            indentation_setup.specify_newline(pattern_str, state_machine, fh)

        elif identifier == "suppressor":
            indentation_setup.specify_suppressor(pattern_str, state_machine, fh)

        else:
            assert False, "Unreachable code reached."

        if not check(fh, ";"):
            error_msg("Missing ';' after indentation '%s' specification." % identifier, fh)
def do(fh):
    r"""Parses pattern definitions of the form:

          [ \t]                                       => grid 4;
          [:intersection([:alpha:], [\X064-\X066]):]  => space 1;

    The left hand side is parsed as a regular expression. For the specifiers
    'space', 'grid' and 'bad' it must be matchable by a single character,
    i.e. it must boil down to a character set. 'newline' and 'suppressor'
    receive the whole state machine.

    Parsing ends at a '>' in place of a definition.

    RETURNS: The sealed and consistency-checked IndentationSetup object.

    Syntax errors are reported through 'error_msg()'.
    """
    indentation_setup = IndentationSetup(fh)

    # NOTE: Catching of EOF happens in caller: parse_section(...)
    while True:
        skip_whitespace(fh)

        if check(fh, ">"):
            indentation_setup.seal()
            indentation_setup.consistency_check(fh)
            return indentation_setup

        # A regular expression state machine
        pattern_str, state_machine = regular_expression.parse(fh)

        skip_whitespace(fh)
        if not check(fh, "=>"):
            error_msg("Missing '=>' after character set definition.", fh)

        skip_whitespace(fh)
        identifier = read_identifier(fh)
        if identifier == "":
            error_msg("Missing identifier for indentation element definition.", fh)

        verify_word_in_list(
            identifier,
            ["space", "grid", "bad", "newline", "suppressor"],
            "Unrecognized indentation specifier '%s'." % identifier,
            fh,
        )

        trigger_set = None
        if identifier in ["space", "bad", "grid"]:
            if len(state_machine.states) != 2:
                error_msg(
                    "For indentation '%s' only patterns are addmissible which\n" % identifier
                    + 'can be matched by a single character, e.g. " " or [a-z].',
                    fh,
                )
            transition_map = state_machine.get_init_state().transitions().get_map()
            assert len(transition_map) == 1
            # 'list(...)' keeps the indexing valid where 'values()' is a view.
            trigger_set = list(transition_map.values())[0]

        skip_whitespace(fh)
        if identifier == "space":
            value = read_integer(fh)
            if value is not None:
                indentation_setup.specify_space(pattern_str, trigger_set, value, fh)
            else:
                # not a number received, is it an identifier?
                variable = read_identifier(fh)
                if variable != "":
                    indentation_setup.specify_space(pattern_str, trigger_set, variable, fh)
                else:
                    # neither number nor variable: a space counts as 1
                    indentation_setup.specify_space(pattern_str, trigger_set, 1, fh)

        elif identifier == "grid":
            value = read_integer(fh)
            if value is not None:
                indentation_setup.specify_grid(pattern_str, trigger_set, value, fh)
            else:
                # not a number received, is it an identifier?
                skip_whitespace(fh)
                variable = read_identifier(fh)
                if variable != "":
                    indentation_setup.specify_grid(pattern_str, trigger_set, variable, fh)
                else:
                    error_msg("Missing integer or variable name after keyword 'grid'.", fh)

        elif identifier == "bad":
            indentation_setup.specify_bad(pattern_str, trigger_set, fh)

        elif identifier == "newline":
            indentation_setup.specify_newline(pattern_str, state_machine, fh)

        elif identifier == "suppressor":
            indentation_setup.specify_suppressor(pattern_str, state_machine, fh)

        else:
            assert False, "Unreachable code reached."

        if not check(fh, ";"):
            error_msg("Missing ';' after indentation '%s' specification." % identifier, fh)