Esempio n. 1
0
    def find_second_marks(self):
        """Split off marks that stack on a preceding mark of the same class.

        Each exemplar currently in ``self.clusters`` is examined trailer by
        trailer.  A trailer already present in ``self.always_separate_marks``
        triggers a split at its position.  Any trailer after the first whose
        canonical combining class (ccc) equals that of the trailer just before
        it is added to ``self.always_separate_marks`` and split off as well.
        """
        # Iterate over a snapshot of the keys; the count is looked up at the
        # moment each exemplar is visited.
        for exemplar in list(self.clusters):
            count = self.clusters[exemplar]
            previous_trailer = None
            for trailer_index in range(len(exemplar.trailers)):
                trailer = exemplar.trailers[trailer_index]

                # Marks already known to be always separate are split off.
                if trailer in self.always_separate_marks:
                    self.split_exemplar(exemplar, trailer_index, count)

                # The first mark has no predecessor it could be stacking on.
                if trailer_index == 0:
                    previous_trailer = trailer
                    continue

                # Equal ccc values mean this is a second (or later) stacking
                # diacritic, which is treated as a separate mark.
                stacks_on_previous = (
                    Char.getCombiningClass(trailer) ==
                    Char.getCombiningClass(previous_trailer))
                if stacks_on_previous:
                    self.always_separate_marks.add(trailer)
                    self.split_exemplar(exemplar, trailer_index, count)

                previous_trailer = trailer
Esempio n. 2
0
    def find_second_marks(self):
        """Split off marks that stack on a preceding mark of the same class.

        Each exemplar currently in ``self.clusters`` is examined trailer by
        trailer.  A trailer already present in ``self.always_separate_marks``
        triggers a split at its position.  Any trailer after the first whose
        canonical combining class (ccc) equals that of the trailer just before
        it is added to ``self.always_separate_marks`` and split off as well.
        """
        # Iterate over a snapshot of the keys; the count is looked up at the
        # moment each exemplar is visited.
        for exemplar in list(self.clusters):
            count = self.clusters[exemplar]
            previous_trailer = None
            for trailer_index in range(len(exemplar.trailers)):
                trailer = exemplar.trailers[trailer_index]

                # Marks already known to be always separate are split off.
                if trailer in self.always_separate_marks:
                    self.split_exemplar(exemplar, trailer_index, count)

                # The first mark has no predecessor it could be stacking on.
                if trailer_index == 0:
                    previous_trailer = trailer
                    continue

                # Equal ccc values mean this is a second (or later) stacking
                # diacritic, which is treated as a separate mark.
                stacks_on_previous = (
                    Char.getCombiningClass(trailer) ==
                    Char.getCombiningClass(previous_trailer))
                if stacks_on_previous:
                    self.always_separate_marks.add(trailer)
                    self.split_exemplar(exemplar, trailer_index, count)

                previous_trailer = trailer
Esempio n. 3
0
 def __init__(self, uids, basename, logger):
     """Record the code points, glyph base name and logger for this character.

     When the first entry of ``uids`` is defined in ICU, its general
     category and combining class are stored in ``self.general`` and
     ``self.cc``; otherwise a warning is logged and those two attributes
     are not set.
     """
     self.logger, self.uids, self.basename = logger, uids, basename
     first_uid = uids[0]
     if not Char.isdefined(first_uid):
         self.logger.log(
             'USV %04X not in ICU; no properties known' % first_uid, 'W')
     else:
         self.general = Char.charType(first_uid)
         self.cc = Char.getCombiningClass(first_uid)
     # feat tags that affect this char
     self.feats = set()
     # lang tags that affect this char
     self.langs = set()
Esempio n. 4
0
 def isnumber(char):
     """Report whether ``char`` is a number (general category Nd or No)."""
     number_categories = (UCharCategory.DECIMAL_DIGIT_NUMBER,
                          UCharCategory.OTHER_NUMBER)
     return Char.charType(char) in number_categories
Esempio n. 5
0
 def isnumber(char):
     """Report whether ``char`` is a number (general category Nd or No)."""
     number_categories = (UCharCategory.DECIMAL_DIGIT_NUMBER,
                          UCharCategory.OTHER_NUMBER)
     return Char.charType(char) in number_categories
Esempio n. 6
0
 def __init__(self, uid, basename, logger):
     """Record the code point, glyph base name and logger for this character.

     When ``uid`` is defined in ICU, its general category, combining class
     and joining type are cached on the instance; otherwise a warning is
     logged and those attributes are not set.
     """
     self.logger = logger
     self.uid = uid
     self.basename = basename
     if not Char.isdefined(uid):
         self.logger.log('USV %04X not in ICU; no properties known' % uid,
                         'W')
     else:
         general_category = Char.charType(uid)
         self.general = general_category
         self.cc = Char.getCombiningClass(uid)
         # Same general category value, kept under the ICU-specific name too.
         self.icuGC = general_category
         self.icuJT = Char.getIntPropertyValue(uid, UProperty.JOINING_TYPE)
     # feat tags that affect this char
     self.feats = set()
     # lang tags that affect this char
     self.langs = set()
     # Additional info from UFO; all flags start out False.
     self.takesMarks = False
     self.isMark = False
     self.isBase = False
Esempio n. 7
0
    def ispunct(self, char):
        """Decide whether ``char`` counts as punctuation when finding exemplars.

        Characters for which ``self.is_exemplar_wordbreak`` is true are never
        punctuation exemplars, whatever ICU says; for everything else the
        answer is ICU's ``Char.ispunct``.
        """
        return not self.is_exemplar_wordbreak(char) and Char.ispunct(char)
Esempio n. 8
0
    def ismark(char):
        """Report whether ``char`` is a mark (general category M)."""
        mark_categories = (UCharCategory.NON_SPACING_MARK,
                           UCharCategory.COMBINING_SPACING_MARK,
                           UCharCategory.ENCLOSING_MARK)
        return Char.charType(char) in mark_categories
Esempio n. 9
0
 def find_indic_matras_and_viramas(self):
     """Split off trailers that are always separate marks.

     A trailer is split from its exemplar when ``self.ucd.is_never_combine``
     accepts it or when it has the Default_Ignorable_Code_Point property.
     """
     for exemplar in list(self.clusters):
         count = self.clusters[exemplar]
         for position in range(len(exemplar.trailers)):
             mark = exemplar.trailers[position]
             must_split = self.ucd.is_never_combine(mark)
             if not must_split:
                 must_split = Char.hasBinaryProperty(
                     mark, UProperty.DEFAULT_IGNORABLE_CODE_POINT)
             if must_split:
                 self.split_exemplar(exemplar, position, count)
Esempio n. 10
0
    def ispunct(self, char):
        """Decide whether ``char`` counts as punctuation when finding exemplars.

        Characters for which ``self.is_exemplar_wordbreak`` is true are never
        punctuation exemplars, whatever ICU says; for everything else the
        answer is ICU's ``Char.ispunct``.
        """
        return not self.is_exemplar_wordbreak(char) and Char.ispunct(char)
Esempio n. 11
0
    def ismark(char):
        """Report whether ``char`` is a mark (general category M)."""
        mark_categories = (UCharCategory.NON_SPACING_MARK,
                           UCharCategory.COMBINING_SPACING_MARK,
                           UCharCategory.ENCLOSING_MARK)
        return Char.charType(char) in mark_categories
Esempio n. 12
0
 def find_indic_matras_and_viramas(self):
     """Split off trailers that are always separate marks.

     A trailer is split from its exemplar when ``self.ucd.is_never_combine``
     accepts it or when it has the Default_Ignorable_Code_Point property.
     """
     for exemplar in list(self.clusters):
         count = self.clusters[exemplar]
         for position in range(len(exemplar.trailers)):
             mark = exemplar.trailers[position]
             must_split = self.ucd.is_never_combine(mark)
             if not must_split:
                 must_split = Char.hasBinaryProperty(
                     mark, UProperty.DEFAULT_IGNORABLE_CODE_POINT)
             if must_split:
                 self.split_exemplar(exemplar, position, count)
Esempio n. 13
0
 def parcel_ignorable(self):
     """Move Default_Ignorable_Code_Point characters to auxiliary.

     Every exemplar whose base has the Default_Ignorable_Code_Point
     property is removed from ``self.clusters`` and its base is added to
     ``self._auxiliary``.  Exemplars with an empty base are skipped.
     """
     # Iterate over a snapshot because entries are deleted along the way.
     for exemplar in list(self.clusters.keys()):
         base = exemplar.base
         if base == '':
             # Nothing to test for this exemplar; keep scanning the rest
             # (returning here would abandon all remaining clusters).
             continue
         if Char.hasBinaryProperty(base, UProperty.DEFAULT_IGNORABLE_CODE_POINT):
             # The base is a Default_Ignorable_Code_Point
             # which needs to go in the auxiliary list.
             self._auxiliary.add(base)
             del self.clusters[exemplar]
Esempio n. 14
0
 def parcel_ignorable(self):
     """Move Default_Ignorable_Code_Point characters to auxiliary.

     Every exemplar whose base has the Default_Ignorable_Code_Point
     property is removed from ``self.clusters`` and its base is added to
     ``self._auxiliary``.  Exemplars with an empty base are skipped.
     """
     # Iterate over a snapshot because entries are deleted along the way.
     for exemplar in list(self.clusters.keys()):
         base = exemplar.base
         if base == '':
             # Nothing to test for this exemplar; keep scanning the rest
             # (returning here would abandon all remaining clusters).
             continue
         if Char.hasBinaryProperty(base,
                                   UProperty.DEFAULT_IGNORABLE_CODE_POINT):
             # The base is a Default_Ignorable_Code_Point
             # which needs to go in the auxiliary list.
             self._auxiliary.add(base)
             del self.clusters[exemplar]
Esempio n. 15
0
 def need_hex_escape(self, char, is_isolated):
     """Decide whether ``char`` has to be written as a hex escape.

     True for an isolated mark, a Default_Ignorable_Code_Point, a format
     character, or a space separator; False otherwise.
     """
     return bool(
         (self.ismark(char) and is_isolated)
         or Char.hasBinaryProperty(char, UProperty.DEFAULT_IGNORABLE_CODE_POINT)
         or self.isformat(char)
         or self.is_space_separator(char)
     )
Esempio n. 16
0
    def ignore_findit(self):
        """Print format characters that are not default-ignorable, then fail.

        Exploration helper: walks the code points below 0xFFFF, prints (as
        four-digit hex) each one that ``self.ucd.isformat`` accepts but that
        lacks the Default_Ignorable_Code_Point property, and finally fails
        unconditionally.
        """
        from icu import Char, UProperty
        # Scan limit (exclusive); the full range would be 0x10ffff.
        limit = 0xffff
        for usv in range(limit):
            char = chr(usv)
            if not self.ucd.isformat(char):
                continue
            if Char.hasBinaryProperty(char, UProperty.DEFAULT_IGNORABLE_CODE_POINT):
                continue
            print('%04X' % usv)

        self.assertTrue(False)
Esempio n. 17
0
 def need_hex_escape(self, char, is_isolated):
     """Decide whether ``char`` has to be written as a hex escape.

     True for an isolated mark, a Default_Ignorable_Code_Point, a format
     character, a space separator, or a character ``self.is_pua`` accepts;
     False otherwise.
     """
     return bool(
         (self.ismark(char) and is_isolated)
         or Char.hasBinaryProperty(char, UProperty.DEFAULT_IGNORABLE_CODE_POINT)
         or self.isformat(char)
         or self.is_space_separator(char)
         or self.is_pua(char)
     )
Esempio n. 18
0
    def ignore_findit(self):
        """Print format characters that are not default-ignorable, then fail.

        Exploration helper: walks the code points below 0xFFFF, prints (as
        four-digit hex) each one that ``self.ucd.isformat`` accepts but that
        lacks the Default_Ignorable_Code_Point property, and finally fails
        unconditionally.

        Uses Python 3 spellings (``range``, ``chr``, ``print()``); the earlier
        ``xrange``/``unichr``/print-statement form does not run on Python 3.
        """
        from icu import Char, UProperty
        # Scan limit (exclusive); the full range would be 0x10ffff.
        maxchar = 0xffff
        for usv in range(maxchar):
            char = chr(usv)
            # if ((not self.ucd.is_specific_script(char)) and
            #    (not self.ucd.is_exemplar_wordbreak(char)) and
            #    (not Char.isUAlphabetic(char))):
            if self.ucd.isformat(char) and not Char.hasBinaryProperty(char, UProperty.DEFAULT_IGNORABLE_CODE_POINT):
                print('%04X' % usv)

        self.assertTrue(False)
Esempio n. 19
0
def normalize_string(in_str, allowed_scripts):
    """
    Return in_str with letters, digits and currency symbols from disallowed
    scripts replaced by representative characters.

    Characters whose script short name is in allowed_scripts are kept as-is.
    For any other character: a letter becomes the sample string of its
    script, a decimal digit becomes the zero digit of its numbering system,
    a currency symbol becomes "$", and everything else is kept as-is.

    Args:
        in_str: String to process
        allowed_scripts: List of script short names (like "Mymr") to preserve
    """
    # TODO: Consider checking ScriptExtensions here as well
    pieces = []
    for ch in in_str:
        script = Script.getScript(ch)
        category = Char.charType(ch)
        bucket = CHAR_TYPE_TO_BUCKET[category]
        digit_value = Char.digit(ch)

        if script.getShortName() in allowed_scripts:
            # Allowed script: keep the character.
            replacement = ch
        elif bucket == 1:
            # Letter in a disallowed script: use that script's sample char.
            replacement = Script.getSampleString(script)
        elif bucket == 3 and digit_value != -1:
            # Decimal digit in a disallowed script: map to its zero digit.
            replacement = chr(ord(ch) - digit_value)
        elif category == UCharCategory.CURRENCY_SYMBOL:
            # Currency symbol in a disallowed script: normalize to $.
            replacement = "$"
        else:
            # Anything else passes through unchanged.
            replacement = ch
        pieces.append(replacement)
    return "".join(pieces)
Esempio n. 20
0
    def allowable(self, char):
        """Check that ``char`` has the properties required of an exemplar.

        Numbers are always accepted.  Uppercase characters are rejected.
        Otherwise the character is accepted when it belongs to a specific
        script, is treated as a letter by ``is_exemplar_wordbreak``, or is
        Alphabetic.
        """
        # Numbers (with or without diacritics) are accepted outright.
        if self.ucd.isnumber(char):
            return True

        # Exemplars must be lowercase.
        if Char.isUUppercase(char):
            return False

        return bool(
            self.ucd.is_specific_script(char)
            or self.ucd.is_exemplar_wordbreak(char)
            or Char.isUAlphabetic(char)
        )
Esempio n. 21
0
    def is_exemplar_wordbreak(char):
        """True if the character has the Word_Break property Katakana or ALetter.

        MidLetter (numeric value 4) is not checked here.
        """
        # Numeric Word_Break values are hard-coded because PyICU does not
        # seem to expose them; only the ones needed here are listed.
        wb_aletter = 1
        wb_katakana = 3

        wordbreak_value = Char.getIntPropertyValue(char, UProperty.WORD_BREAK)
        return wordbreak_value in (wb_katakana, wb_aletter)
Esempio n. 22
0
    def allowable(self, char):
        """Check that ``char`` has the properties required of an exemplar.

        Numbers are always accepted.  Uppercase characters are rejected.
        Otherwise the character is accepted when it belongs to a specific
        script, is treated as a letter by ``is_exemplar_wordbreak``, or is
        Alphabetic.
        """
        # Numbers (with or without diacritics) are accepted outright.
        if self.ucd.isnumber(char):
            return True

        # Exemplars must be lowercase.
        if Char.isUUppercase(char):
            return False

        return bool(
            self.ucd.is_specific_script(char)
            or self.ucd.is_exemplar_wordbreak(char)
            or Char.isUAlphabetic(char)
        )
Esempio n. 23
0
    def is_exemplar_wordbreak(char):
        """True if the character has the Word_Break property Katakana, ALetter, or MidLetter."""
        # Numeric Word_Break values are hard-coded because PyICU does not
        # seem to expose them; only the ones needed here are listed.
        wb_aletter = 1
        wb_katakana = 3
        wb_midletter = 4

        wordbreak_value = Char.getIntPropertyValue(char, UProperty.WORD_BREAK)
        return wordbreak_value in (wb_katakana, wb_aletter, wb_midletter)
Esempio n. 24
0
 def render(self, uids, ftml, keyUID=0, addBreaks=True, rtl=None):
     """General purpose (but not required) helper that emits ftml for a character sequence.

     Args:
         uids: sequence of code points (integers); nothing is emitted when
             it is empty, and the caller's sequence is never modified.
         ftml: object receiving the output through ``addToTest`` and
             ``closeTest``.
         keyUID: key passed to ``ftml.addToTest``; 0 means "use the first
             code point".
         addBreaks: when true, ``ftml.closeTest()`` is called before and
             after the multi-form outputs (joining and mirrored cases).
         rtl: forwarded unchanged to ``ftml.addToTest``.
     """
     if len(uids) == 0:
         return

     def joining_type(cp):
         # ICU Joining_Type of one code point.
         return Char.getIntPropertyValue(cp, UProperty.JOINING_TYPE)

     def joins_on_side(cp, type_if_ltr, type_if_rtl):
         # True when cp joins on the side in question: dual-joining always
         # does; otherwise it depends on the character's direction.
         jt = joining_type(cp)
         if jt == DUAL_JOINING:
             return True
         if Char.charDirection(cp) == UCharDirection.LEFT_TO_RIGHT:
             return jt == type_if_ltr
         return jt == type_if_rtl

     # Work on a private copy; remember the first code point and the
     # original length before anything gets inserted.
     sequence = list(uids)
     startUID = sequence[0]
     uidLen = len(sequence)
     if keyUID == 0:
         keyUID = startUID

     # Label from the code points, comment from the glyph names.
     label = '\n'.join('U+{0:04X}'.format(u) for u in sequence)
     comment = ' '.join(self._charFromUID[u].basename for u in sequence)

     hasMirrored = bool([u for u in sequence if Char.isMirrored(u)])

     # Look at the first and last characters that are not TRANSPARENT to
     # decide whether contextual (zwj) examples are needed.
     joiners = [u for u in sequence if joining_type(u) != TRANSPARENT]
     zwjBefore = zwjAfter = False
     if joiners:
         zwjBefore = joins_on_side(joiners[0], LEFT_JOINING, RIGHT_JOINING)
         zwjAfter = joins_on_side(joiners[-1], RIGHT_JOINING, LEFT_JOINING)

     if Char.charType(startUID) == UCharCategory.NON_SPACING_MARK:
         # Leading non-spacing mark: give it a base to sit on, after which
         # a zwj in front is no longer needed.
         sequence.insert(0, self.diacBase)
         zwjBefore = False
     elif Char.isUWhiteSpace(startUID):
         # Leading whitespace: prefix with a baseline bracket.
         sequence.insert(0, 0xF130)

     nonMarks = [u for u in sequence
                 if Char.charType(u) != UCharCategory.NON_SPACING_MARK]
     if Char.isUWhiteSpace(nonMarks[-1]):
         # Last non-mark is whitespace: append a baseline bracket.
         sequence.append(0xF131)

     s = ''.join(chr(u) for u in sequence)

     if zwjBefore or zwjAfter:
         # Show contextual forms.
         forms = [u'{0} '.format(s)]
         if zwjAfter:
             forms.append(u'{0}\u200D '.format(s))
         if zwjAfter and zwjBefore:
             forms.append(u'\u200D{0}\u200D '.format(s))
         if zwjBefore:
             forms.append(u'\u200D{0} '.format(s))
             if zwjAfter:
                 forms.append(u'{0}{0}{0}'.format(s))
         t = ''.join(forms)
         if addBreaks:
             ftml.closeTest()
         ftml.addToTest(keyUID, t, label=label, comment=comment, rtl=rtl)
         if addBreaks:
             ftml.closeTest()
     elif hasMirrored and self.rtlEnable:
         # Contains a mirrored character and rtl is enabled: show the
         # plain, LTR-embedded and RTL-embedded renderings.
         if addBreaks:
             ftml.closeTest()
         mirrored_text = u'{0} LTR: \u202A{0}\u202C RTL: \u202B{0}\u202C'.format(s)
         ftml.addToTest(keyUID, mirrored_text,
                        label=label, comment=comment, rtl=rtl)
         if addBreaks:
             ftml.closeTest()
     # elif is LRE, RLE, PDF
     # elif is LRI, RLI, FSI, PDI
     elif uidLen > 1:
         ftml.addToTest(keyUID, s, label=label, comment=comment, rtl=rtl)
     else:
         # Single code point: no explicit label is passed.
         ftml.addToTest(keyUID, s, comment=comment, rtl=rtl)
0
 def isformat(char):
     """Report whether ``char`` is a format character (general category Cf)."""
     return Char.charType(char) == UCharCategory.FORMAT_CHAR
Esempio n. 26
0
    def __init__(self, input_name, input_n, input_t, input_clusters_num,
                 input_embedding_dim, input_hunits, input_dropout_rate,
                 input_output_dim, input_epochs, input_training_data,
                 input_evaluation_data, input_language, input_embedding_type):
        """Store the hyper-parameters and build the embedding lookup tables.

        Every ``input_*`` argument is stored on the instance under the same
        name without the ``input_`` prefix.  In addition:

        * ``self.batch_size`` is ``t // n`` (a warning is printed when ``t``
          is not a multiple of ``n``).
        * ``self.graph_clust_dic`` maps the first ``clusters_num - 1`` keys of
          the language's grapheme-cluster ratio table to consecutive indices.
        * ``self.codepoint_dic`` / ``self.codepoints_num`` hold the language's
          code point dictionary and its size plus one.
        * ``self.letters_dic`` maps the characters that get their own slot in
          the generalized-vectors embedding to consecutive indices; any
          ``generalized_vectors_*`` embedding type is then collapsed to
          ``"generalized_vectors"``.

        A language without a ratio table or code point dictionary only
        triggers a printed warning; the corresponding tables are left empty
        instead of raising ``AttributeError`` further down.
        """
        self.name = input_name
        self.n = input_n
        self.t = input_t
        if self.t % self.n != 0:
            print("Warning: t is not divided by n")
        self.clusters_num = input_clusters_num
        # batch_size is the number of batches used in each iteration of back propagation to update model weights
        # The default value is self.t/self.n, but it can be set to other values as well. The only constraint is that
        # self.t should always be greater than self.batch_size * self.n
        self.batch_size = self.t // self.n
        self.embedding_dim = input_embedding_dim
        self.hunits = input_hunits
        self.dropout_rate = input_dropout_rate
        self.output_dim = input_output_dim
        self.epochs = input_epochs
        self.training_data = input_training_data
        self.evaluation_data = input_evaluation_data
        self.language = input_language
        self.embedding_type = input_embedding_type
        self.model = None

        # Constructing the grapheme cluster dictionary -- this will be used if self.embedding_type is Grapheme Clusters
        ratios = None
        if self.language == "Thai":
            if "exclusive" in self.training_data:
                ratios = constants.THAI_EXCLUSIVE_GRAPH_CLUST_RATIO
            else:
                ratios = constants.THAI_GRAPH_CLUST_RATIO
        elif self.language == "Burmese":
            if "exclusive" in self.training_data:
                ratios = constants.BURMESE_EXCLUSIVE_GRAPH_CLUST_RATIO
            else:
                ratios = constants.BURMESE_GRAPH_CLUST_RATIO
        elif self.language == "Thai_Burmese":
            ratios = constants.THAI_BURMESE_GRAPH_CLUST_RATIO
        else:
            print("Warning: the input language is not supported")

        # Only the first clusters_num - 1 clusters receive an index.  With no
        # ratio table (unsupported language) the dictionary stays empty.
        self.graph_clust_dic = dict()
        if ratios is not None:
            limit = self.clusters_num - 1
            for cnt, key in enumerate(ratios):
                if cnt >= limit:
                    break
                self.graph_clust_dic[key] = cnt

        # Loading the code points dictionary -- this will be used if self.embedding_type is Code Points
        # If you want to group some of the code points into buckets, that code should go here to change
        # self.codepoint_dic appropriately
        # Default to an empty dictionary so languages without one (for
        # example "Thai_Burmese") do not fail on the len() call below.
        self.codepoint_dic = dict()
        if self.language == "Thai":
            self.codepoint_dic = constants.THAI_CODE_POINT_DICTIONARY
        elif self.language == "Burmese":
            self.codepoint_dic = constants.BURMESE_CODE_POINT_DICTIONARY
        self.codepoints_num = len(self.codepoint_dic) + 1

        # Constructing the letters dictionary -- this will be used if self.embedding_type is Generalized Vectors
        self.letters_dic = dict()
        if self.language in ["Thai", "Burmese"]:
            # Defining the Unicode box for model's language
            if self.language == "Thai":
                smallest_unicode_dec, largest_unicode_dec = 0x0E01, 0x0E5B
            else:
                smallest_unicode_dec, largest_unicode_dec = 0x1000, 0x109F

            # Defining the code point buckets that will get their own individual embedding vector
            # 1: Letters, 2: Marks, 3: Digits, 4: Separators, 5: Punctuations, 6: Symbols, 7: Others
            buckets_by_embedding = {
                "generalized_vectors_123": [1, 2, 3],
                "generalized_vectors_12": [1, 2],
                "generalized_vectors_12d0": [1, 2],
                "generalized_vectors_125": [1, 2, 5],
                "generalized_vectors_1235": [1, 2, 3, 5],
            }
            separate_slot_buckets = buckets_by_embedding.get(
                self.embedding_type, [])

            # The "12d0" variant additionally gives specific code points
            # their own slot.
            separate_codepoints = []
            if self.embedding_type == "generalized_vectors_12d0":
                if self.language == "Burmese":
                    separate_codepoints = [4160, 4240]
                else:
                    separate_codepoints = [3664]

            # Constructing letters dictionary
            cnt = 0
            for i in range(smallest_unicode_dec, largest_unicode_dec + 1):
                ch = chr(i)
                if constants.CHAR_TYPE_TO_BUCKET[Char.charType(
                        ch)] in separate_slot_buckets:
                    self.letters_dic[ch] = cnt
                    cnt += 1
            for unicode_dec in separate_codepoints:
                ch = chr(unicode_dec)
                self.letters_dic[ch] = cnt
                cnt += 1

            # After making the letters dictionary, we can call different versions of the generalized vectors same thing
            if "generalized_vectors" in self.embedding_type:
                self.embedding_type = "generalized_vectors"

        else:
            print(
                "Warning: the generalized_vectros embedding type is not supported for this language"
            )
Esempio n. 27
0
 def isnukta(char):
     """Report whether ``char`` is a nukta (canonical combining class 7)."""
     return Char.getCombiningClass(char) == 7
Esempio n. 28
0
 def is_space_separator(char):
     """Report whether ``char`` is a space separator (general category Zs)."""
     return Char.charType(char) == UCharCategory.SPACE_SEPARATOR
Esempio n. 29
0
    def process(self, text):
        """Analyze a string, tallying its scripts and exemplar clusters.

        The text is first normalized with ``self.ucd.normalize('NFD', text)``.
        The script of every character is counted in ``self.scripts`` and
        remembered in ``self.codes_for_scripts``.  The text is then scanned
        left to right, and each multigraph (a substring already present in
        ``self._main``, ``self._auxiliary``, ``self._index`` or
        ``self._punctuation``), punctuation character, or base-plus-marks
        cluster that is found is counted in ``self.clusters``.  Characters
        rejected by ``self.allowable`` are skipped.
        """
        i = 0
        text = self.ucd.normalize('NFD', text)

        # Record script of each character.
        for char in text:
            script = Script.getScript(char)
            script_code = Script.getScriptCode(script)
            self.scripts[script_code] += 1
            self.codes_for_scripts[script_code] = script

        # Record clusters
        while i < len(text):

            # Look for multigraphs (from length of max_multigraph_length down to 1) character(s)
            # of multigraphs already specified in a LDML file.
            # Longest possible matches are looked at first.
            for multigraph_length in range(self.max_multigraph_length, 0, -1):
                multigraph = text[i:i + multigraph_length]

                if (multigraph in self._main or
                   multigraph in self._auxiliary or
                   multigraph in self._index or
                   multigraph in self._punctuation):
                    exemplar = Exemplar(multigraph)
                    self.clusters[exemplar] += 1
                    i += multigraph_length
                    break

            # No multigraphs were found at this position,
            # so continue processing a single character
            # if we have not gone beyond the end of the text.
            # NOTE(review): when a multigraph WAS matched above, control also
            # falls through to here with i already advanced, so the character
            # at the new position is handled as a single character without the
            # multigraph lookup being retried there -- confirm this is intended.
            if not i < len(text):
                break

            char = text[i]

            # Test for punctuation.
            if self.ucd.ispunct(char):
                exemplar = Exemplar(char)
                self.clusters[exemplar] += 1
                i += 1
                continue

            # Find grapheme clusters.

            # Ensure exemplar base has needed properties.
            if not self.allowable(char):
                i += 1
                continue

            # The current character is a base character.
            base = char

            # Then find the end of the cluster
            # (which may consist of only base characters).
            # length counts every character in the cluster; base_length counts
            # only the leading characters that make up the base.
            length = base_length = 1
            while i + length < len(text):
                trailer = text[i + length]
                if Char.hasBinaryProperty(trailer, UProperty.DEFAULT_IGNORABLE_CODE_POINT):
                    # A Default_Ignorable_Code_Point was found, so the cluster continues.
                    length += 1
                    continue
                if self.ucd.ismark(trailer):
                    # A Mark was found, so the cluster continues.
                    length += 1

                    # Marks such as nuktas are considered part of the base.
                    if self.ucd.is_always_combine(trailer):
                        # A Mark such as a nukta was found, so the base continues,
                        # as well as the cluster.
                        base_length += 1
                        base = text[i:i + base_length]
                    continue
                else:
                    # No more marks, so the end of the cluster has been reached.
                    break

            # Extract cluster

            # If no nuktas have been found,
            # then the base will be the single character already called base (or char).
            # If no non-nukta marks have been found,
            # then the trailers variable will be an empty string.
            trailers = text[i + base_length:i + length]
            exemplar = Exemplar(base, trailers)

            self.clusters[exemplar] += 1
            i += length
Esempio n. 30
0
 def isnukta(char):
     """Report whether ``char`` is a nukta (canonical combining class 7)."""
     return Char.getCombiningClass(char) == 7
Esempio n. 31
0
 def is_space_separator(char):
     """Report whether ``char`` is a space separator (general category Zs)."""
     return Char.charType(char) == UCharCategory.SPACE_SEPARATOR
Esempio n. 32
0
 def isformat(char):
     """Report whether ``char`` is a format character (general category Cf)."""
     return Char.charType(char) == UCharCategory.FORMAT_CHAR