Esempio n. 1
0
 def isnumber(char):
     """Tell whether ``char`` is a number (general category Nd or No).

     The category reported by ``Char.charType`` is checked against
     ``UCharCategory.DECIMAL_DIGIT_NUMBER`` and ``UCharCategory.OTHER_NUMBER``.
     """
     return Char.charType(char) in (UCharCategory.DECIMAL_DIGIT_NUMBER,
                                    UCharCategory.OTHER_NUMBER)
Esempio n. 2
0
 def isnumber(char):
     """Report whether the character is a number (general category Nd or No)."""
     # Only decimal digits (Nd) and "other" numbers (No) are accepted here.
     number_categories = (
         UCharCategory.DECIMAL_DIGIT_NUMBER,
         UCharCategory.OTHER_NUMBER,
     )
     return Char.charType(char) in number_categories
Esempio n. 3
0
 def __init__(self, uid, basename, logger):
     """Set up the record for a single character.

     Stores ``uid``, ``basename`` and ``logger``, caches the ICU properties
     of ``uid`` when ICU defines it, and starts with empty feature/language
     tag sets and all UFO-derived flags cleared.
     """
     self.logger, self.uid, self.basename = logger, uid, basename
     if not Char.isdefined(uid):
         # NOTE(review): on this path general, cc, icuGC and icuJT are never
         # assigned, so reading them later would raise AttributeError.
         self.logger.log('USV %04X not in ICU; no properties known' % uid,
                         'W')
     else:
         category = Char.charType(uid)
         # general and icuGC both hold the same general category value.
         self.general = category
         self.cc = Char.getCombiningClass(uid)
         self.icuGC = category
         self.icuJT = Char.getIntPropertyValue(uid, UProperty.JOINING_TYPE)
     # Feature tags and language tags that affect this character.
     self.feats = set()
     self.langs = set()
     # Extra information filled in from the UFO; all off to begin with.
     self.takesMarks = False
     self.isMark = False
     self.isBase = False
Esempio n. 4
0
    def ismark(char):
        """Tell whether ``char`` is a mark (general category M).

        Covers the three mark categories: non-spacing, combining spacing and
        enclosing.
        """
        return Char.charType(char) in (UCharCategory.NON_SPACING_MARK,
                                       UCharCategory.COMBINING_SPACING_MARK,
                                       UCharCategory.ENCLOSING_MARK)
Esempio n. 5
0
    def ismark(char):
        """Report whether the character is a mark (general category M)."""
        # Non-spacing, combining spacing and enclosing marks all count.
        mark_categories = (
            UCharCategory.NON_SPACING_MARK,
            UCharCategory.COMBINING_SPACING_MARK,
            UCharCategory.ENCLOSING_MARK,
        )
        return Char.charType(char) in mark_categories
Esempio n. 6
0
 def __init__(self, uids, basename, logger):
     """Set up the record for a sequence of code points.

     Stores ``uids``, ``basename`` and ``logger``. ICU properties are taken
     from the first code point of ``uids`` only, and only when ICU defines
     it. Feature and language tag sets start out empty.
     """
     self.logger, self.uids, self.basename = logger, uids, basename
     first = uids[0]
     if not Char.isdefined(first):
         # NOTE(review): general and cc are never assigned on this path, so
         # reading them later would raise AttributeError.
         self.logger.log(
             'USV %04X not in ICU; no properties known' % first, 'W')
     else:
         self.general = Char.charType(first)
         self.cc = Char.getCombiningClass(first)
     # Feature tags and language tags that affect this sequence.
     self.feats = set()
     self.langs = set()
Esempio n. 7
0
def normalize_string(in_str, allowed_scripts):
    """
    Replace letters, digits and currency symbols of non-allowed scripts in
    in_str with exemplar values; everything else is kept as is.

    Args:
        in_str: String to process
        allowed_scripts: List of script short names (like "Mymr") to preserve

    Returns:
        The normalized string.
    """
    # TODO: Consider checking ScriptExtensions here as well
    pieces = []
    for ch in in_str:
        script = Script.getScript(ch)
        category = Char.charType(ch)
        bucket = CHAR_TYPE_TO_BUCKET[category]
        digit_value = Char.digit(ch)
        if script.getShortName() in allowed_scripts:
            # Allowed script: keep the character untouched.
            replacement = ch
        elif bucket == 1:
            # Letter in a disallowed script: use that script's sample char.
            replacement = Script.getSampleString(script)
        elif bucket == 3 and digit_value != -1:
            # Decimal digit in a disallowed script: step back by its value
            # to land on the zero digit of the same numbering system.
            replacement = chr(ord(ch) - digit_value)
        elif category == UCharCategory.CURRENCY_SYMBOL:
            # Currency symbol in a disallowed script: collapse to "$".
            replacement = "$"
        else:
            # Anything else is copied through unchanged.
            replacement = ch
        pieces.append(replacement)
    return "".join(pieces)
Esempio n. 8
0
 def render(self, uids, ftml, keyUID=0, addBreaks=True, rtl=None):
     """General purpose (but not required) helper that emits ftml for a character sequence.

     uids      -- sequence of code points to render; nothing happens if empty
     ftml      -- builder receiving the closeTest()/addToTest() calls
     keyUID    -- key passed to addToTest; 0 means "use the first uid"
     addBreaks -- when true, contextual-form and mirrored tests are wrapped
                  in closeTest() calls
     rtl       -- forwarded unchanged to addToTest
     """
     if not len(uids):
         return
     # Work on a private copy so the caller's sequence is left alone.
     seq = list(uids)
     # The first uid and the original length are needed after seq is modified.
     firstUID = seq[0]
     origLen = len(seq)
     if keyUID == 0:
         keyUID = firstUID
     # Label comes from the code points, comment from the glyph names.
     label = '\n'.join('U+{0:04X}'.format(u) for u in seq)
     comment = ' '.join(self._charFromUID[u].basename for u in seq)
     # Does the sequence contain any mirrored character?
     hasMirrored = any(Char.isMirrored(u) for u in seq)

     def joiningType(u):
         return Char.getIntPropertyValue(u, UProperty.JOINING_TYPE)

     # Only the first and last non-TRANSPARENT characters decide whether
     # examples with zwj are needed.
     joiners = [u for u in seq if joiningType(u) != TRANSPARENT]
     zwjBefore = zwjAfter = False
     if joiners:
         head = joiners[0]
         headJT = joiningType(head)
         headLTR = Char.charDirection(head) == UCharDirection.LEFT_TO_RIGHT
         zwjBefore = headJT == DUAL_JOINING or headJT == (
             LEFT_JOINING if headLTR else RIGHT_JOINING)
         tail = joiners[-1]
         tailJT = joiningType(tail)
         tailLTR = Char.charDirection(tail) == UCharDirection.LEFT_TO_RIGHT
         zwjAfter = tailJT == DUAL_JOINING or tailJT == (
             RIGHT_JOINING if tailLTR else LEFT_JOINING)

     if Char.charType(firstUID) == UCharCategory.NON_SPACING_MARK:
         # Leading non-spacing mark: give it a base to sit on, which also
         # removes any need for a zwj in front.
         seq.insert(0, self.diacBase)
         zwjBefore = False
     elif Char.isUWhiteSpace(firstUID):
         # Leading whitespace: prefix with baseline brackets.
         seq.insert(0, 0xF130)
     nonMarks = [
         u for u in seq
         if Char.charType(u) != UCharCategory.NON_SPACING_MARK
     ]
     if Char.isUWhiteSpace(nonMarks[-1]):
         # Last non-mark is whitespace: append baseline brackets.
         seq.append(0xF131)
     s = ''.join(map(chr, seq))

     if zwjBefore or zwjAfter:
         # Show contextual forms (U+200D is the zero width joiner).
         pieces = [u'{0} '.format(s)]
         if zwjAfter:
             pieces.append(u'{0}\u200D '.format(s))
             if zwjBefore:
                 pieces.append(u'\u200D{0}\u200D '.format(s))
         if zwjBefore:
             pieces.append(u'\u200D{0} '.format(s))
         if zwjBefore and zwjAfter:
             pieces.append(u'{0}{0}{0}'.format(s))
         if addBreaks:
             ftml.closeTest()
         ftml.addToTest(keyUID, u''.join(pieces), label=label,
                        comment=comment, rtl=rtl)
         if addBreaks:
             ftml.closeTest()
     elif hasMirrored and self.rtlEnable:
         # Mirrored content with rtl enabled: show plain, LTR-embedded and
         # RTL-embedded renderings side by side.
         if addBreaks:
             ftml.closeTest()
         ftml.addToTest(
             keyUID,
             u'{0} LTR: \u202A{0}\u202C RTL: \u202B{0}\u202C'.format(s),
             label=label,
             comment=comment,
             rtl=rtl)
         if addBreaks:
             ftml.closeTest()
     # elif is LRE, RLE, PDF
     # elif is LRI, RLI, FSI, PDI
     elif origLen > 1:
         # Multi-character input gets the explicit label.
         ftml.addToTest(keyUID, s, label=label, comment=comment, rtl=rtl)
     else:
         # Single character: no label is passed.
         ftml.addToTest(keyUID, s, comment=comment, rtl=rtl)
Esempio n. 9
0
    def __init__(self, input_name, input_n, input_t, input_clusters_num,
                 input_embedding_dim, input_hunits, input_dropout_rate,
                 input_output_dim, input_epochs, input_training_data,
                 input_evaluation_data, input_language, input_embedding_type):
        """Store the hyper-parameters and build the embedding lookup tables.

        Args:
            input_name: stored as ``self.name``.
            input_n: stored as ``self.n``.
            input_t: stored as ``self.t``; a warning is printed when it is not
                a multiple of ``input_n``. ``self.batch_size`` is ``t // n``.
            input_clusters_num: at most ``input_clusters_num - 1`` grapheme
                clusters get an index in ``self.graph_clust_dic``.
            input_embedding_dim, input_hunits, input_dropout_rate,
            input_output_dim, input_epochs: stored unchanged on the instance.
            input_training_data: stored; if it contains "exclusive" the
                exclusive grapheme-cluster ratios are used for Thai/Burmese.
            input_evaluation_data: stored unchanged.
            input_language: "Thai", "Burmese" or "Thai_Burmese"; any other
                value only triggers warnings and yields empty dictionaries.
            input_embedding_type: stored; any value containing
                "generalized_vectors" is collapsed to "generalized_vectors"
                once the letters dictionary is built (Thai/Burmese only).
        """
        self.name = input_name
        self.n = input_n
        self.t = input_t
        if self.t % self.n != 0:
            print("Warning: t is not divided by n")
        self.clusters_num = input_clusters_num
        # batch_size is the number of batches used in each iteration of back propagation to update model weights
        # The default value is self.t/self.n, but it can be set to other values as well. The only constraint is that
        # self.t should always be greater than self.batch_size * self.n
        self.batch_size = self.t // self.n
        self.embedding_dim = input_embedding_dim
        self.hunits = input_hunits
        self.dropout_rate = input_dropout_rate
        self.output_dim = input_output_dim
        self.epochs = input_epochs
        self.training_data = input_training_data
        self.evaluation_data = input_evaluation_data
        self.language = input_language
        self.embedding_type = input_embedding_type
        self.model = None

        # Constructing the grapheme cluster dictionary -- this will be used if self.embedding_type is Grapheme Clusters
        ratios = None
        if self.language == "Thai":
            if "exclusive" in self.training_data:
                ratios = constants.THAI_EXCLUSIVE_GRAPH_CLUST_RATIO
            else:
                ratios = constants.THAI_GRAPH_CLUST_RATIO
        elif self.language == "Burmese":
            if "exclusive" in self.training_data:
                ratios = constants.BURMESE_EXCLUSIVE_GRAPH_CLUST_RATIO
            else:
                ratios = constants.BURMESE_GRAPH_CLUST_RATIO
        elif self.language == "Thai_Burmese":
            ratios = constants.THAI_BURMESE_GRAPH_CLUST_RATIO
        else:
            print("Warning: the input language is not supported")
        if ratios is None:
            # Unsupported language: the warning above is all the caller gets;
            # continue with no clusters instead of failing on None.keys().
            ratios = dict()
        # Keys are indexed in iteration order; only the first
        # clusters_num - 1 of them receive an index.
        max_clusters = self.clusters_num - 1
        self.graph_clust_dic = dict()
        for cnt, key in enumerate(ratios.keys()):
            if cnt >= max_clusters:
                break
            self.graph_clust_dic[key] = cnt

        # Loading the code points dictionary -- this will be used if self.embedding_type is Code Points
        # If you want to group some of the code points into buckets, that code should go here to change
        # self.codepoint_dic appropriately
        if self.language == "Thai":
            self.codepoint_dic = constants.THAI_CODE_POINT_DICTIONARY
        elif self.language == "Burmese":
            self.codepoint_dic = constants.BURMESE_CODE_POINT_DICTIONARY
        elif not hasattr(self, "codepoint_dic"):
            # No code point dictionary is selected here for other languages
            # (including "Thai_Burmese"); use an empty one so the length
            # computation below does not fail on a missing attribute.
            self.codepoint_dic = dict()
        self.codepoints_num = len(self.codepoint_dic) + 1

        # Constructing the letters dictionary -- this will be used if self.embedding_type is Generalized Vectors
        self.letters_dic = dict()
        if self.language in ["Thai", "Burmese"]:
            # Defining the Unicode box for model's language
            if self.language == "Thai":
                smallest_unicode_dec = 0x0E01
                largest_unicode_dec = 0x0E5B
            else:  # Burmese
                smallest_unicode_dec = 0x1000
                largest_unicode_dec = 0x109F

            # Defining the code point buckets that will get their own individual embedding vector
            # 1: Letters, 2: Marks, 3: Digits, 4: Separators, 5: Punctuations, 6: Symbols, 7: Others
            separate_slot_buckets = []
            separate_codepoints = []
            if self.embedding_type == "generalized_vectors_123":
                separate_slot_buckets = [1, 2, 3]
            elif self.embedding_type == "generalized_vectors_12":
                separate_slot_buckets = [1, 2]
            elif self.embedding_type == "generalized_vectors_12d0":
                separate_slot_buckets = [1, 2]
                # Individually listed code points also get their own slot.
                if self.language == "Burmese":
                    separate_codepoints = [4160, 4240]
                if self.language == "Thai":
                    separate_codepoints = [3664]
            elif self.embedding_type == "generalized_vectors_125":
                separate_slot_buckets = [1, 2, 5]
            elif self.embedding_type == "generalized_vectors_1235":
                separate_slot_buckets = [1, 2, 3, 5]

            # Constructing letters dictionary: bucketed code points of the
            # Unicode box first, then the individually listed code points.
            cnt = 0
            for i in range(smallest_unicode_dec, largest_unicode_dec + 1):
                ch = chr(i)
                if constants.CHAR_TYPE_TO_BUCKET[Char.charType(
                        ch)] in separate_slot_buckets:
                    self.letters_dic[ch] = cnt
                    cnt += 1
            for unicode_dec in separate_codepoints:
                ch = chr(unicode_dec)
                self.letters_dic[ch] = cnt
                cnt += 1

            # After making the letters dictionary, we can call different versions of the generalized vectors same thing
            if "generalized_vectors" in self.embedding_type:
                self.embedding_type = "generalized_vectors"

        else:
            print(
                "Warning: the generalized_vectros embedding type is not supported for this language"
            )
Esempio n. 10
0
 def is_space_separator(char):
     """Tell whether ``char`` is a space separator (general category Zs)."""
     return Char.charType(char) == UCharCategory.SPACE_SEPARATOR
Esempio n. 11
0
 def isformat(char):
     """Tell whether ``char`` is a format character (general category Cf)."""
     return Char.charType(char) == UCharCategory.FORMAT_CHAR
Esempio n. 12
0
 def is_space_separator(char):
     """Report whether the character is a space separator (category Zs)."""
     category = Char.charType(char)
     return category == UCharCategory.SPACE_SEPARATOR
Esempio n. 13
0
 def isformat(char):
     """Report whether the character is a format character (category Cf)."""
     category = Char.charType(char)
     return category == UCharCategory.FORMAT_CHAR