Esempio n. 1
0
 def find_indic_matras_and_viramas(self):
     """Split off trailers that always stand as separate marks.

     Indic matras and viramas are treated as separate marks: every trailer
     of every recorded cluster is inspected, and a trailer that
     ``self.ucd.is_never_combine`` accepts, or that carries the
     Default_Ignorable_Code_Point property, is passed to
     ``self.split_exemplar`` along with the cluster and its count.
     """
     # Walk a snapshot of the keys; presumably split_exemplar edits
     # self.clusters while we iterate -- TODO confirm.
     for cluster in list(self.clusters):
         occurrences = self.clusters[cluster]
         for position in range(len(cluster.trailers)):
             mark = cluster.trailers[position]
             stands_alone = self.ucd.is_never_combine(mark)
             if not stands_alone:
                 stands_alone = Char.hasBinaryProperty(
                     mark, UProperty.DEFAULT_IGNORABLE_CODE_POINT)
             if stands_alone:
                 self.split_exemplar(cluster, position, occurrences)
Esempio n. 2
0
 def find_indic_matras_and_viramas(self):
     """Split off trailers that always stand as separate marks.

     Indic matras and viramas are treated as separate marks: every trailer
     of every recorded cluster is inspected, and a trailer that
     ``self.ucd.is_never_combine`` accepts, or that carries the
     Default_Ignorable_Code_Point property, is passed to
     ``self.split_exemplar`` along with the cluster and its count.
     """
     # Walk a snapshot of the keys; presumably split_exemplar edits
     # self.clusters while we iterate -- TODO confirm.
     for cluster in list(self.clusters):
         occurrences = self.clusters[cluster]
         for position in range(len(cluster.trailers)):
             mark = cluster.trailers[position]
             stands_alone = self.ucd.is_never_combine(mark)
             if not stands_alone:
                 stands_alone = Char.hasBinaryProperty(
                     mark, UProperty.DEFAULT_IGNORABLE_CODE_POINT)
             if stands_alone:
                 self.split_exemplar(cluster, position, occurrences)
Esempio n. 3
0
 def parcel_ignorable(self):
     """Move Default_Ignorable_Code_Point characters to auxiliary.

     Every cluster whose base has the Default_Ignorable_Code_Point
     property is deleted from ``self.clusters`` and its base is added to
     ``self._auxiliary``. Clusters with an empty base are skipped.
     """
     # Iterate over a snapshot because entries are deleted during the scan.
     for exemplar in list(self.clusters):
         base = exemplar.base
         if base == '':
             # There is no base character to test. Skip only this cluster:
             # returning here would abandon the rest of the scan and make
             # the result depend on the iteration order of the clusters.
             continue
         if Char.hasBinaryProperty(base, UProperty.DEFAULT_IGNORABLE_CODE_POINT):
             # The base is a Default_Ignorable_Code_Point
             # which needs to go in the auxiliary list.
             self._auxiliary.add(base)
             del self.clusters[exemplar]
Esempio n. 4
0
 def need_hex_escape(self, char, is_isolated):
     """Determine if a character needs to be escaped with hex digits.

     The answer is True when any of these holds, checked in this order:
     ``self.ismark(char)`` together with ``is_isolated``, the character
     has the Default_Ignorable_Code_Point property,
     ``self.isformat(char)``, or ``self.is_space_separator(char)``.
     Otherwise the answer is False.
     """
     return bool(
         (self.ismark(char) and is_isolated)
         or Char.hasBinaryProperty(char, UProperty.DEFAULT_IGNORABLE_CODE_POINT)
         or self.isformat(char)
         or self.is_space_separator(char)
     )
Esempio n. 5
0
 def parcel_ignorable(self):
     """Move Default_Ignorable_Code_Point characters to auxiliary.

     Every cluster whose base has the Default_Ignorable_Code_Point
     property is deleted from ``self.clusters`` and its base is added to
     ``self._auxiliary``. Clusters with an empty base are skipped.
     """
     # Iterate over a snapshot because entries are deleted during the scan.
     for exemplar in list(self.clusters):
         base = exemplar.base
         if base == '':
             # There is no base character to test. Skip only this cluster:
             # returning here would abandon the rest of the scan and make
             # the result depend on the iteration order of the clusters.
             continue
         if Char.hasBinaryProperty(base,
                                   UProperty.DEFAULT_IGNORABLE_CODE_POINT):
             # The base is a Default_Ignorable_Code_Point
             # which needs to go in the auxiliary list.
             self._auxiliary.add(base)
             del self.clusters[exemplar]
    def ignore_findit(self):
        """List format characters lacking Default_Ignorable_Code_Point.

        Scans the code points U+0000 through U+FFFE and prints, as at least
        four uppercase hex digits, each one that ``self.ucd.isformat``
        accepts but that does not have the ICU
        Default_Ignorable_Code_Point property. The method finishes with
        ``self.assertTrue(False)``.
        """
        from icu import Char, UProperty

        # The full Unicode range would end at 0x10ffff; the scan is
        # restricted to the code points below 0xffff.
        scan_limit = 0xffff
        for code_point in range(scan_limit):
            candidate = chr(code_point)
            # An earlier filter combined is_specific_script,
            # is_exemplar_wordbreak and Char.isUAlphabetic; the current one
            # looks only at format characters.
            if not self.ucd.isformat(candidate):
                continue
            if Char.hasBinaryProperty(candidate, UProperty.DEFAULT_IGNORABLE_CODE_POINT):
                continue
            print('%04X' % code_point)

        # NOTE(review): unconditional failure, presumably so that a test
        # runner displays the printed list -- confirm.
        self.assertTrue(False)
Esempio n. 7
0
 def need_hex_escape(self, char, is_isolated):
     """Determine if a character needs to be escaped with hex digits.

     The answer is True when any of these holds, checked in this order:
     ``self.ismark(char)`` together with ``is_isolated``, the character
     has the Default_Ignorable_Code_Point property,
     ``self.isformat(char)``, ``self.is_space_separator(char)``, or
     ``self.is_pua(char)``. Otherwise the answer is False.
     """
     return bool(
         (self.ismark(char) and is_isolated)
         or Char.hasBinaryProperty(char, UProperty.DEFAULT_IGNORABLE_CODE_POINT)
         or self.isformat(char)
         or self.is_space_separator(char)
         or self.is_pua(char)
     )
Esempio n. 8
0
    def ignore_findit(self):
        """List format characters lacking Default_Ignorable_Code_Point.

        Scans the code points U+0000 through U+FFFE and prints, as at least
        four uppercase hex digits, each one that ``self.ucd.isformat``
        accepts but that does not have the ICU
        Default_Ignorable_Code_Point property. The method finishes with
        ``self.assertTrue(False)``.
        """
        from icu import Char, UProperty

        # The full Unicode range would end at 0x10ffff; the scan is
        # restricted to the code points below 0xffff.
        scan_limit = 0xffff
        for code_point in xrange(scan_limit):
            candidate = unichr(code_point)
            # An earlier filter combined is_specific_script,
            # is_exemplar_wordbreak and Char.isUAlphabetic; the current one
            # looks only at format characters.
            if not self.ucd.isformat(candidate):
                continue
            if Char.hasBinaryProperty(candidate, UProperty.DEFAULT_IGNORABLE_CODE_POINT):
                continue
            # Parenthesised single argument: prints the same text whether
            # print is a statement or a function.
            print('%04X' % code_point)

        # NOTE(review): unconditional failure, presumably so that a test
        # runner displays the printed list -- confirm.
        self.assertTrue(False)
Esempio n. 9
0
    def process(self, text):
        """Analyze a string.

        The text is normalized to NFD and the script of every character is
        tallied in ``self.scripts`` / ``self.codes_for_scripts``. The text
        is then walked from left to right and one count is added to
        ``self.clusters`` for each unit found:

        * a multigraph already present in ``self._main``,
          ``self._auxiliary``, ``self._index`` or ``self._punctuation``
          (longest match first),
        * a single punctuation character, or
        * a base character (extended by any marks that
          ``self.ucd.is_always_combine`` accepts) together with its
          trailing marks and Default_Ignorable_Code_Point characters.

        Characters that ``self.allowable`` rejects are skipped.
        """
        text = self.ucd.normalize('NFD', text)

        # Record script of each character.
        for char in text:
            script = Script.getScript(char)
            script_code = Script.getScriptCode(script)
            self.scripts[script_code] += 1
            self.codes_for_scripts[script_code] = script

        # Record clusters
        i = 0
        while i < len(text):

            # Look for multigraphs (from length of max_multigraph_length down to 1) character(s)
            # of multigraphs already specified in a LDML file.
            # Longest possible matches are looked at first.
            found_multigraph = False
            for multigraph_length in range(self.max_multigraph_length, 0, -1):
                multigraph = text[i:i + multigraph_length]

                if (multigraph in self._main or
                        multigraph in self._auxiliary or
                        multigraph in self._index or
                        multigraph in self._punctuation):
                    exemplar = Exemplar(multigraph)
                    self.clusters[exemplar] += 1
                    i += multigraph_length
                    found_multigraph = True
                    break

            if found_multigraph:
                # Restart at the new position so that the text following a
                # match is itself checked against the known multigraphs.
                # Falling through here would treat the next character as a
                # single-character cluster and miss adjacent multigraphs.
                continue

            # No multigraphs were found at this position,
            # so continue processing a single character.
            char = text[i]

            # Test for punctuation.
            if self.ucd.ispunct(char):
                exemplar = Exemplar(char)
                self.clusters[exemplar] += 1
                i += 1
                continue

            # Find grapheme clusters.

            # Ensure exemplar base has needed properties.
            if not self.allowable(char):
                i += 1
                continue

            # The current character is a base character.
            base = char

            # Then find the end of the cluster
            # (which may consist of only base characters).
            length = base_length = 1
            while i + length < len(text):
                trailer = text[i + length]
                if Char.hasBinaryProperty(trailer, UProperty.DEFAULT_IGNORABLE_CODE_POINT):
                    # A Default_Ignorable_Code_Point was found, so the cluster continues.
                    length += 1
                    continue
                if self.ucd.ismark(trailer):
                    # A Mark was found, so the cluster continues.
                    length += 1

                    # Marks such as nuktas are considered part of the base.
                    if self.ucd.is_always_combine(trailer):
                        # A Mark such as a nukta was found, so the base continues,
                        # as well as the cluster.
                        base_length += 1
                        base = text[i:i + base_length]
                    continue
                else:
                    # No more marks, so the end of the cluster has been reached.
                    break

            # Extract cluster

            # If no nuktas have been found,
            # then the base will be the single character already called base (or char).
            # If no non-nukta marks have been found,
            # then the trailers variable will be an empty string.
            trailers = text[i + base_length:i + length]
            exemplar = Exemplar(base, trailers)

            self.clusters[exemplar] += 1
            i += length