def find_indic_matras_and_viramas(self):
    """Split off trailers that must not stay attached to their cluster.

    Indic matras and viramas are always separate marks.  For every
    cluster currently recorded in ``self.clusters``, each trailer is
    inspected in order; a trailer that ``self.ucd.is_never_combine``
    accepts, or that has the Default_Ignorable_Code_Point property, is
    handed to ``self.split_exemplar`` together with its index and the
    count of the cluster.
    """
    # Walk a snapshot of the keys -- presumably split_exemplar changes
    # self.clusters while we iterate (verify against its definition).
    for cluster in list(self.clusters.keys()):
        # Looked up live (not snapshotted) so any change made by an
        # earlier split_exemplar call is reflected here.
        occurrences = self.clusters[cluster]
        for position, mark in enumerate(cluster.trailers):
            stays_attached = not (
                self.ucd.is_never_combine(mark) or
                Char.hasBinaryProperty(mark, UProperty.DEFAULT_IGNORABLE_CODE_POINT)
            )
            if stays_attached:
                continue
            self.split_exemplar(cluster, position, occurrences)
def parcel_ignorable(self):
    """Move Default_Ignorable_Code_Point characters to auxiliary.

    Every cluster in ``self.clusters`` whose base has the
    Default_Ignorable_Code_Point property has that base added to
    ``self._auxiliary`` and is then removed from ``self.clusters``.
    Clusters with an empty base are skipped.
    """
    # Iterate over a snapshot of the keys because entries are deleted
    # from self.clusters inside the loop.
    for exemplar in list(self.clusters.keys()):
        base = exemplar.base
        if base == '':
            # Nothing to test for this cluster.  Skip only this one:
            # returning here would silently leave every remaining
            # cluster unprocessed, depending on iteration order.
            continue
        if Char.hasBinaryProperty(base, UProperty.DEFAULT_IGNORABLE_CODE_POINT):
            # The base is a Default_Ignorable_Code_Point
            # which needs to go in the auxiliary list.
            self._auxiliary.add(base)
            del self.clusters[exemplar]
def need_hex_escape(self, char, is_isolated):
    """Determine if a character needs to be escaped with hex digits.

    Returns True when any of the following holds (checked in this
    order, stopping at the first match), otherwise False:

    * ``char`` is a mark (per ``self.ismark``) and ``is_isolated`` is true
    * ``char`` has the Default_Ignorable_Code_Point property
    * ``char`` is a format character (per ``self.isformat``)
    * ``char`` is a space separator (per ``self.is_space_separator``)
    """
    return bool(
        (self.ismark(char) and is_isolated) or
        Char.hasBinaryProperty(char, UProperty.DEFAULT_IGNORABLE_CODE_POINT) or
        self.isformat(char) or
        self.is_space_separator(char)
    )
def ignore_findit(self):
    """Print code points that are format characters but not default ignorable.

    Each code point in the scanned range is printed as at least four
    upper-case hex digits when ``self.ucd.isformat`` accepts it and it
    lacks the Default_Ignorable_Code_Point property.  The method then
    fails unconditionally via ``self.assertTrue(False)``.
    """
    from icu import Char, UProperty

    # 0x10ffff would cover all of Unicode; the scan is limited to 0xffff.
    # range() excludes its end point, so U+FFFF itself is not examined.
    limit = 0xffff
    for codepoint in range(limit):
        candidate = chr(codepoint)
        # Alternative filter, kept for reference:
        # if ((not self.ucd.is_specific_script(candidate)) and
        #         (not self.ucd.is_exemplar_wordbreak(candidate)) and
        #         (not Char.isUAlphabetic(candidate))):
        if not self.ucd.isformat(candidate):
            continue
        if Char.hasBinaryProperty(candidate, UProperty.DEFAULT_IGNORABLE_CODE_POINT):
            continue
        print('%04X' % codepoint)
    # NOTE(review): placed after the loop so that every match is printed
    # before the forced failure -- confirm this matches the intended nesting.
    self.assertTrue(False)
def need_hex_escape(self, char, is_isolated):
    """Determine if a character needs to be escaped with hex digits.

    Returns True when any of the following holds (checked in this
    order, stopping at the first match), otherwise False:

    * ``char`` is a mark (per ``self.ismark``) and ``is_isolated`` is true
    * ``char`` has the Default_Ignorable_Code_Point property
    * ``char`` is a format character (per ``self.isformat``)
    * ``char`` is a space separator (per ``self.is_space_separator``)
    * ``char`` is a private-use character (per ``self.is_pua``)
    """
    return bool(
        (self.ismark(char) and is_isolated) or
        Char.hasBinaryProperty(char, UProperty.DEFAULT_IGNORABLE_CODE_POINT) or
        self.isformat(char) or
        self.is_space_separator(char) or
        self.is_pua(char)
    )
def ignore_findit(self):
    """Print code points that are format characters but not default ignorable.

    Python 2 variant (uses ``xrange`` and ``unichr``).  Each code point
    in the scanned range is printed as at least four upper-case hex
    digits when ``self.ucd.isformat`` accepts it and it lacks the
    Default_Ignorable_Code_Point property.  The method then fails
    unconditionally via ``self.assertTrue(False)``.
    """
    from icu import Char, UProperty

    # 0x10ffff would cover all of Unicode; the scan is limited to 0xffff.
    # xrange() excludes its end point, so U+FFFF itself is not examined.
    limit = 0xffff
    for codepoint in xrange(limit):
        candidate = unichr(codepoint)
        # Alternative filter, kept for reference:
        # if ((not self.ucd.is_specific_script(candidate)) and
        #         (not self.ucd.is_exemplar_wordbreak(candidate)) and
        #         (not Char.isUAlphabetic(candidate))):
        if not self.ucd.isformat(candidate):
            continue
        if Char.hasBinaryProperty(candidate, UProperty.DEFAULT_IGNORABLE_CODE_POINT):
            continue
        # A parenthesised single expression prints identically with the
        # Python 2 print statement.
        print('%04X' % codepoint)
    # NOTE(review): placed after the loop so that every match is printed
    # before the forced failure -- confirm this matches the intended nesting.
    self.assertTrue(False)
def process(self, text):
    """Analyze a string, recording its scripts and its clusters.

    The text is first normalized with ``self.ucd.normalize('NFD', ...)``.
    Two passes follow:

    1. For every character, the ICU script is looked up; the count for
       its script code is incremented in ``self.scripts`` and the script
       object is stored in ``self.codes_for_scripts``.
    2. The text is scanned left to right and each multigraph,
       punctuation character or base-plus-trailers cluster found is
       counted in ``self.clusters``, keyed by an ``Exemplar``.

    Args:
        text: The string to analyze.

    Returns:
        None.  Results are accumulated on ``self``.
    """
    i = 0
    text = self.ucd.normalize('NFD', text)

    # Record script of each character.
    # NOTE(review): ``+= 1`` on a possibly unseen key presumably relies on
    # self.scripts (and self.clusters below) being Counter-like -- confirm.
    for char in text:
        script = Script.getScript(char)
        script_code = Script.getScriptCode(script)
        self.scripts[script_code] += 1
        self.codes_for_scripts[script_code] = script

    # Record clusters
    while i < len(text):

        # Look for multigraphs (from a length of max_multigraph_length
        # characters down to 1) that are already specified in an LDML file.
        # Longest possible matches are looked at first.
        for multigraph_length in range(self.max_multigraph_length, 0, -1):
            multigraph = text[i:i + multigraph_length]

            if (multigraph in self._main or
                    multigraph in self._auxiliary or
                    multigraph in self._index or
                    multigraph in self._punctuation):
                exemplar = Exemplar(multigraph)
                self.clusters[exemplar] += 1
                i += multigraph_length
                break

        # No multigraphs were found at this position,
        # so continue processing a single character
        # if we have not gone beyond the end of the text.
        # NOTE(review): this code is also reached after a multigraph WAS
        # matched (the break above only leaves the for loop), in which case
        # the character following the multigraph is handled below without
        # being tested as the start of another multigraph -- confirm that
        # this is intended.
        if not i < len(text):
            break

        char = text[i]

        # Test for punctuation.
        if self.ucd.ispunct(char):
            exemplar = Exemplar(char)
            self.clusters[exemplar] += 1
            i += 1
            continue

        # Find grapheme clusters.

        # Ensure exemplar base has needed properties.
        # Characters that are not allowable are skipped without being counted.
        if not self.allowable(char):
            i += 1
            continue

        # The current character is a base character.
        base = char

        # Then find the end of the cluster
        # (which may consist of only base characters).
        # ``length`` counts every character in the cluster;
        # ``base_length`` counts only those that belong to the base.
        length = base_length = 1
        while i + length < len(text):
            trailer = text[i + length]
            if Char.hasBinaryProperty(trailer, UProperty.DEFAULT_IGNORABLE_CODE_POINT):
                # A Default_Ignorable_Code_Point was found, so the cluster continues.
                length += 1
                continue
            if self.ucd.ismark(trailer):
                # A Mark was found, so the cluster continues.
                length += 1

                # Marks such as nuktas are considered part of the base.
                if self.ucd.is_always_combine(trailer):
                    # A Mark such as a nukta was found, so the base continues,
                    # as well as the cluster.
                    # The base is re-sliced from the start of the cluster, so
                    # it covers the first base_length characters.
                    base_length += 1
                    base = text[i:i + base_length]
                continue
            else:
                # No more marks, so the end of the cluster has been reached.
                break

        # Extract cluster

        # If no nuktas have been found,
        # then the base will be the single character already called base (or char).
        # If no non-nukta marks have been found,
        # then the trailers variable will be an empty string.
        trailers = text[i + base_length:i + length]
        exemplar = Exemplar(base, trailers)
        self.clusters[exemplar] += 1
        i += length