def isnumber(char):
    """Report whether ``char`` is a number of general category Nd or No.

    Only the decimal-digit and other-number categories are tested; any other
    category yields False.
    """
    return Char.charType(char) in (
        UCharCategory.DECIMAL_DIGIT_NUMBER,
        UCharCategory.OTHER_NUMBER,
    )
def __init__(self, uid, basename, logger):
    """Store the identity of one encoded character and look up its ICU data.

    Args:
        uid: code point of the character; queried against ICU.
        basename: stored unchanged as ``self.basename``.
        logger: object whose ``log(message, severity)`` method receives a
            'W' message when ICU does not define ``uid``.

    Note:
        ``general``, ``cc``, ``icuGC`` and ``icuJT`` are only assigned when
        ICU defines ``uid``; otherwise those attributes do not exist on the
        instance.
    """
    self.logger = logger
    self.uid = uid
    self.basename = basename

    if not Char.isdefined(uid):
        # Nothing is known about this code point: warn and leave the ICU
        # property attributes unset.
        self.logger.log('USV %04X not in ICU; no properties known' % uid, 'W')
    else:
        self.general = Char.charType(uid)
        self.cc = Char.getCombiningClass(uid)
        self.icuGC = Char.charType(uid)
        self.icuJT = Char.getIntPropertyValue(uid, UProperty.JOINING_TYPE)

    # Feature tags and language tags that affect this character.
    self.feats = set()
    self.langs = set()

    # Additional info from UFO; all flags start out cleared.
    self.takesMarks = False
    self.isMark = False
    self.isBase = False
def ismark(char):
    """Report whether ``char`` is a mark (general category M).

    All three mark categories are accepted: non-spacing, combining spacing
    and enclosing.
    """
    mark_categories = (
        UCharCategory.NON_SPACING_MARK,
        UCharCategory.COMBINING_SPACING_MARK,
        UCharCategory.ENCLOSING_MARK,
    )
    return Char.charType(char) in mark_categories
def __init__(self, uids, basename, logger):
    """Store a code point sequence and the ICU data of its first member.

    Args:
        uids: indexable sequence of code points; only ``uids[0]`` is
            queried against ICU.
        basename: stored unchanged as ``self.basename``.
        logger: object whose ``log(message, severity)`` method receives a
            'W' message when ICU does not define ``uids[0]``.

    Note:
        ``general`` and ``cc`` are only assigned when ICU defines
        ``uids[0]``; otherwise those attributes do not exist on the instance.
    """
    self.logger = logger
    self.uids = uids
    self.basename = basename

    lead = uids[0]
    if not Char.isdefined(lead):
        # Unknown to ICU: warn and leave the property attributes unset.
        self.logger.log(
            'USV %04X not in ICU; no properties known' % lead, 'W')
    else:
        self.general = Char.charType(lead)
        self.cc = Char.getCombiningClass(lead)

    # Feature tags and language tags that affect this character.
    self.feats = set()
    self.langs = set()
def normalize_string(in_str, allowed_scripts):
    """Normalize characters outside the allowed scripts to exemplar values.

    A character whose script short name is in ``allowed_scripts`` is copied
    unchanged. Every other character is handled as follows:

    * a letter (bucket 1) is replaced by the sample string of its script;
    * a decimal digit (bucket 3 with a digit value) is replaced by the zero
      digit of its numbering system;
    * a currency symbol is replaced by ``"$"``;
    * anything else is copied unchanged.

    Args:
        in_str: String to process.
        allowed_scripts: Script short names (like "Mymr") to preserve.

    Returns:
        The normalized string.
    """
    # TODO: Consider checking ScriptExtensions here as well
    # Pieces are collected and joined once at the end, which keeps the cost
    # linear in the input length instead of relying on repeated ``str +=``.
    pieces = []
    for ch in in_str:
        ch_script = Script.getScript(ch)
        ch_type = Char.charType(ch)
        ch_bucket = CHAR_TYPE_TO_BUCKET[ch_type]
        ch_digit = Char.digit(ch)
        if ch_script.getShortName() in allowed_scripts:
            # ch is in an allowed script: keep it as is.
            pieces.append(ch)
        elif ch_bucket == 1:
            # Letter in a disallowed script: use that script's sample char.
            pieces.append(Script.getSampleString(ch_script))
        elif ch_bucket == 3 and ch_digit != -1:
            # Decimal digit in a disallowed script: subtracting the digit
            # value from the code point lands on that system's zero digit.
            pieces.append(chr(ord(ch) - ch_digit))
        elif ch_type == UCharCategory.CURRENCY_SYMBOL:
            # Currency symbol in a disallowed script: normalize to $.
            pieces.append("$")
        else:
            # Everything else passes through untouched.
            pieces.append(ch)
    return "".join(pieces)
def render(self, uids, ftml, keyUID=0, addBreaks=True, rtl=None):
    """Generate ftml for a character sequence (general purpose, optional).

    Depending on the sequence, one of four kinds of test is added to
    ``ftml``:

    * contextual forms, when the first or last non-transparent character
      joins: the plain string plus variants with U+200D before and/or after;
    * a mirrored sample, when any character is mirrored and
      ``self.rtlEnable`` is true: the plain string plus copies wrapped in
      U+202A..U+202C and U+202B..U+202C;
    * the plain string with a label, when more than one uid was supplied;
    * the plain string without a label otherwise.

    Args:
        uids: sequence of code points. Nothing happens when it is empty.
            The caller's sequence is not modified.
        ftml: receiver of the output through ``closeTest()`` and
            ``addToTest()``.
        keyUID: first argument given to ``ftml.addToTest``; 0 means "use the
            first uid".
        addBreaks: when true, ``ftml.closeTest()`` is called before and after
            the contextual-form and mirrored tests.
        rtl: passed through to ``ftml.addToTest``.
    """
    if len(uids) == 0:
        return

    def joining_type(u):
        return Char.getIntPropertyValue(u, UProperty.JOINING_TYPE)

    def is_ltr(u):
        return Char.charDirection(u) == UCharDirection.LEFT_TO_RIGHT

    def is_nsm(u):
        return Char.charType(u) == UCharCategory.NON_SPACING_MARK

    # Private copy, so insertions below never reach the caller's sequence.
    seq = list(uids)

    # The first uid and the original length are needed after seq is edited.
    startUID = seq[0]
    uidLen = len(seq)

    # Fall back to the first uid when no key was supplied.
    if keyUID == 0:
        keyUID = startUID

    # Label comes from the code points, comment from the glyph names.
    label = '\n'.join('U+{0:04X}'.format(u) for u in seq)
    comment = ' '.join(self._charFromUID[u].basename for u in seq)

    # Does the sequence contain a mirrored character?
    hasMirrored = any(Char.isMirrored(u) for u in seq)

    # Look at the first and last characters that are not TRANSPARENT: if
    # either one joins on its outer side, zwj examples are needed.
    joiningChars = [u for u in seq if joining_type(u) != TRANSPARENT]
    if joiningChars:
        first = joiningChars[0]
        jt = joining_type(first)
        zwjBefore = jt == DUAL_JOINING or jt == (
            LEFT_JOINING if is_ltr(first) else RIGHT_JOINING)

        last = joiningChars[-1]
        jt = joining_type(last)
        zwjAfter = jt == DUAL_JOINING or jt == (
            RIGHT_JOINING if is_ltr(last) else LEFT_JOINING)
    else:
        zwjBefore = zwjAfter = False

    if is_nsm(startUID):
        # First char is a NSM... prefix a suitable base
        seq.insert(0, self.diacBase)
        zwjBefore = False  # No longer any need to put zwj before
    elif Char.isUWhiteSpace(startUID):
        # First char is whitespace -- prefix with baseline brackets:
        seq.insert(0, 0xF130)

    lastNonMark = [u for u in seq if not is_nsm(u)][-1]
    if Char.isUWhiteSpace(lastNonMark):
        # Last non-mark is whitespace -- append baseline brackets:
        seq.append(0xF131)

    s = ''.join(chr(u) for u in seq)

    if zwjBefore or zwjAfter:
        # Show contextual forms:
        forms = [u'{0} '.format(s)]
        if zwjAfter:
            forms.append(u'{0}\u200D '.format(s))
            if zwjBefore:
                forms.append(u'\u200D{0}\u200D '.format(s))
        if zwjBefore:
            forms.append(u'\u200D{0} '.format(s))
        if zwjBefore and zwjAfter:
            forms.append(u'{0}{0}{0}'.format(s))
        t = u''.join(forms)
        if addBreaks:
            ftml.closeTest()
        ftml.addToTest(keyUID, t, label=label, comment=comment, rtl=rtl)
        if addBreaks:
            ftml.closeTest()
    elif hasMirrored and self.rtlEnable:
        # Contains mirrored and rtl enabled:
        if addBreaks:
            ftml.closeTest()
        ftml.addToTest(
            keyUID,
            u'{0} LTR: \u202A{0}\u202C RTL: \u202B{0}\u202C'.format(s),
            label=label, comment=comment, rtl=rtl)
        if addBreaks:
            ftml.closeTest()
    # elif is LRE, RLE, PDF
    # elif is LRI, RLI, FSI, PDI
    elif uidLen > 1:
        ftml.addToTest(keyUID, s, label=label, comment=comment, rtl=rtl)
    else:
        ftml.addToTest(keyUID, s, comment=comment, rtl=rtl)
def __init__(self, input_name, input_n, input_t, input_clusters_num, input_embedding_dim, input_hunits,
             input_dropout_rate, input_output_dim, input_epochs, input_training_data, input_evaluation_data,
             input_language, input_embedding_type):
    """Store the model settings and build the embedding lookup dictionaries.

    Every ``input_*`` argument is stored on the instance under the same name
    without the ``input_`` prefix. Three dictionaries are then prepared, one
    per embedding family: ``graph_clust_dic`` (grapheme clusters),
    ``codepoint_dic`` (code points) and ``letters_dic`` (generalized vectors).

    Args:
        input_name: stored as ``self.name``.
        input_n: stored as ``self.n``; divisor used for ``self.batch_size``.
        input_t: stored as ``self.t``; a warning is printed when it is not a
            multiple of ``input_n``.
        input_clusters_num: at most ``input_clusters_num - 1`` grapheme
            clusters receive an index in ``graph_clust_dic``.
        input_embedding_dim: stored as ``self.embedding_dim``.
        input_hunits: stored as ``self.hunits``.
        input_dropout_rate: stored as ``self.dropout_rate``.
        input_output_dim: stored as ``self.output_dim``.
        input_epochs: stored as ``self.epochs``.
        input_training_data: stored as ``self.training_data``; when it
            contains "exclusive" the exclusive ratio tables are used for
            Thai and Burmese.
        input_evaluation_data: stored as ``self.evaluation_data``.
        input_language: "Thai", "Burmese" or "Thai_Burmese"; anything else
            only triggers warnings and leaves the dictionaries empty.
        input_embedding_type: any value containing "generalized_vectors" is
            collapsed to plain "generalized_vectors" for Thai and Burmese
            once ``letters_dic`` has been built.
    """
    self.name = input_name
    self.n = input_n
    self.t = input_t
    if self.t % self.n != 0:
        print("Warning: t is not divided by n")
    self.clusters_num = input_clusters_num
    # batch_size is the number of batches used in each iteration of back propagation to update model weights
    # The default value is self.t/self.n, but it can be set to other values as well. The only constraint is that
    # self.t should always be greater than self.batch_size * self.n
    self.batch_size = self.t // self.n
    self.embedding_dim = input_embedding_dim
    self.hunits = input_hunits
    self.dropout_rate = input_dropout_rate
    self.output_dim = input_output_dim
    self.epochs = input_epochs
    self.training_data = input_training_data
    self.evaluation_data = input_evaluation_data
    self.language = input_language
    self.embedding_type = input_embedding_type
    self.model = None

    # Constructing the grapheme cluster dictionary -- this will be used if self.embedding_type is Grapheme Clusters
    ratios = None
    if self.language == "Thai":
        if "exclusive" in self.training_data:
            ratios = constants.THAI_EXCLUSIVE_GRAPH_CLUST_RATIO
        else:
            ratios = constants.THAI_GRAPH_CLUST_RATIO
    elif self.language == "Burmese":
        if "exclusive" in self.training_data:
            ratios = constants.BURMESE_EXCLUSIVE_GRAPH_CLUST_RATIO
        else:
            ratios = constants.BURMESE_GRAPH_CLUST_RATIO
    elif self.language == "Thai_Burmese":
        ratios = constants.THAI_BURMESE_GRAPH_CLUST_RATIO
    else:
        print("Warning: the input language is not supported")

    # The first clusters_num - 1 keys, in the table's own order, get the
    # indices 0, 1, 2, ... An unsupported language has no table; it was
    # already warned about above, so it simply gets an empty dictionary
    # rather than failing on ``None.keys()``.
    self.graph_clust_dic = dict()
    cluster_keys = ratios.keys() if ratios is not None else ()
    for cnt, key in enumerate(cluster_keys):
        if cnt >= self.clusters_num - 1:
            break
        self.graph_clust_dic[key] = cnt

    # Loading the code points dictionary -- this will be used if self.embedding_type is Code Points
    # If you want to group some of the code points into buckets, that code should go here to change
    # self.codepoint_dic appropriately
    if self.language == "Thai":
        self.codepoint_dic = constants.THAI_CODE_POINT_DICTIONARY
    elif self.language == "Burmese":
        self.codepoint_dic = constants.BURMESE_CODE_POINT_DICTIONARY
    else:
        # No code point table is selected for any other language (including
        # "Thai_Burmese"); use an empty one so the attribute always exists
        # and the length computation below cannot raise AttributeError.
        self.codepoint_dic = dict()
    self.codepoints_num = len(self.codepoint_dic) + 1

    # Constructing the letters dictionary -- this will be used if self.embedding_type is Generalized Vectors
    self.letters_dic = dict()
    if self.language in ["Thai", "Burmese"]:
        smallest_unicode_dec = None
        largest_unicode_dec = None

        # Defining the Unicode box for model's language
        if self.language == "Thai":
            smallest_unicode_dec = int("0E01", 16)
            largest_unicode_dec = int("0E5B", 16)
        elif self.language == "Burmese":
            smallest_unicode_dec = int("1000", 16)
            largest_unicode_dec = int("109F", 16)

        # Defining the code point buckets that will get their own individual embedding vector
        # 1: Letters, 2: Marks, 3: Digits, 4: Separators, 5: Punctuations, 6: Symbols, 7: Others
        separate_slot_buckets = []
        separate_codepoints = []
        if self.embedding_type == "generalized_vectors_123":
            separate_slot_buckets = [1, 2, 3]
        elif self.embedding_type == "generalized_vectors_12":
            separate_slot_buckets = [1, 2]
        elif self.embedding_type == "generalized_vectors_12d0":
            separate_slot_buckets = [1, 2]
            # Extra individually-embedded code points:
            # 4160 = U+1040 and 4240 = U+1090 for Burmese, 3664 = U+0E50 for Thai.
            if self.language == "Burmese":
                separate_codepoints = [4160, 4240]
            if self.language == "Thai":
                separate_codepoints = [3664]
        elif self.embedding_type == "generalized_vectors_125":
            separate_slot_buckets = [1, 2, 5]
        elif self.embedding_type == "generalized_vectors_1235":
            separate_slot_buckets = [1, 2, 3, 5]

        # Constructing letters dictionary: every code point of the box whose
        # bucket is selected gets the next free index, followed by the
        # explicitly listed code points.
        cnt = 0
        for i in range(smallest_unicode_dec, largest_unicode_dec + 1):
            ch = chr(i)
            if constants.CHAR_TYPE_TO_BUCKET[Char.charType(ch)] in separate_slot_buckets:
                self.letters_dic[ch] = cnt
                cnt += 1
        for unicode_dec in separate_codepoints:
            ch = chr(unicode_dec)
            self.letters_dic[ch] = cnt
            cnt += 1

        # After making the letters dictionary, we can call different versions of the generalized vectors same thing
        if "generalized_vectors" in self.embedding_type:
            self.embedding_type = "generalized_vectors"
    else:
        print("Warning: the generalized_vectros embedding type is not supported for this language")
def is_space_separator(char):
    """Report whether ``char`` is a space separator (general category Zs)."""
    return Char.charType(char) == UCharCategory.SPACE_SEPARATOR
def isformat(char):
    """Report whether ``char`` is a format character (general category Cf)."""
    return Char.charType(char) == UCharCategory.FORMAT_CHAR