def find_second_marks(self):
    """Split clusters if a mark is a second or later stacking diacritic.

    Walks every recorded cluster's trailing marks; whenever a mark is (or
    becomes) known to be "always separate", the cluster is split at that
    mark via self.split_exemplar().
    """
    # Iterate over a snapshot of the keys because split_exemplar() may
    # modify self.clusters while we scan it.
    for exemplar in list(self.clusters.keys()):
        count = self.clusters[exemplar]
        for trailer_index in range(len(exemplar.trailers)):
            trailer = exemplar.trailers[trailer_index]
            # If the mark has already been found to be a always separate mark,
            # split the exemplar.
            if trailer in self.always_separate_marks:
                self.split_exemplar(exemplar, trailer_index, count)
            # Only graphemes with more than one mark need to be looked at
            # for finding stacking diacritics that are separate.
            # (previous_trailer is only read here, when trailer_index > 0,
            # so it was always assigned on the previous iteration.)
            if trailer_index > 0:
                current_mark_ccc = Char.getCombiningClass(trailer)
                previous_mark_ccc = Char.getCombiningClass(previous_trailer)
                # If a mark has the same combining class (ccc) as the previous mark,
                # then the mark is a second or later stacking diacritic and is a separate mark.
                # Also, if the mark has already been found to be a always separate mark,
                # split the exemplar.
                if current_mark_ccc == previous_mark_ccc:
                    self.always_separate_marks.add(trailer)
                    self.split_exemplar(exemplar, trailer_index, count)
            previous_trailer = trailer
def __init__(self, uids, basename, logger):
    """Capture ICU properties for the first code point of a uid sequence."""
    self.logger = logger
    self.uids = uids
    self.basename = basename
    first_usv = uids[0]
    if not Char.isdefined(first_usv):
        # Unknown to ICU: warn and leave the property attributes unset.
        self.logger.log(
            'USV %04X not in ICU; no properties known' % first_usv, 'W')
    else:
        self.general = Char.charType(first_usv)
        self.cc = Char.getCombiningClass(first_usv)
    self.feats = set()  # feat tags that affect this char
    self.langs = set()  # lang tags that affect this char
def isnumber(char):
    """True if the character is a number (general category Nd or No)."""
    number_categories = (
        UCharCategory.DECIMAL_DIGIT_NUMBER,
        UCharCategory.OTHER_NUMBER,
    )
    return Char.charType(char) in number_categories
def __init__(self, uid, basename, logger):
    """Capture ICU properties for a single code point plus UFO-derived flags."""
    self.logger = logger
    self.uid = uid
    self.basename = basename
    if not Char.isdefined(uid):
        # Unknown to ICU: warn and leave the ICU property attributes unset.
        self.logger.log('USV %04X not in ICU; no properties known' % uid, 'W')
    else:
        self.general = Char.charType(uid)
        self.cc = Char.getCombiningClass(uid)
        self.icuGC = Char.charType(uid)
        self.icuJT = Char.getIntPropertyValue(uid, UProperty.JOINING_TYPE)
    self.feats = set()  # feat tags that affect this char
    self.langs = set()  # lang tags that affect this char
    # Additional info from UFO:
    self.takesMarks = self.isMark = self.isBase = False
def ispunct(self, char):
    """True if the character is punctuation for purposes of finding exemplars."""
    # Characters with an exemplar word-break property are treated as
    # letters, never as punctuation exemplars.
    return not self.is_exemplar_wordbreak(char) and Char.ispunct(char)
def ismark(char):
    """True if the character is a mark (general category M)."""
    mark_categories = (
        UCharCategory.NON_SPACING_MARK,
        UCharCategory.COMBINING_SPACING_MARK,
        UCharCategory.ENCLOSING_MARK,
    )
    return Char.charType(char) in mark_categories
def find_indic_matras_and_viramas(self):
    """Indic matras and viramas are always separate marks.

    Splits any cluster at a trailing mark that either never combines or
    is a Default_Ignorable_Code_Point.
    """
    # Snapshot the keys: split_exemplar() may mutate self.clusters.
    for exemplar in list(self.clusters.keys()):
        count = self.clusters[exemplar]
        for index, trailer in enumerate(exemplar.trailers):
            if self.ucd.is_never_combine(trailer) or Char.hasBinaryProperty(
                    trailer, UProperty.DEFAULT_IGNORABLE_CODE_POINT):
                self.split_exemplar(exemplar, index, count)
def parcel_ignorable(self):
    """Move Default_Ignorable_Code_Point characters to auxiliary.

    Any cluster whose base is a Default_Ignorable_Code_Point is removed
    from the cluster list and its base added to the auxiliary set.
    """
    # Snapshot the keys because entries are deleted during iteration.
    for exemplar in list(self.clusters.keys()):
        if exemplar.base == '':
            # Nothing to classify for a baseless cluster; skip it.
            # (Previously this was `return`, which aborted the whole scan
            # and left any later ignorable bases unprocessed.)
            continue
        if Char.hasBinaryProperty(exemplar.base, UProperty.DEFAULT_IGNORABLE_CODE_POINT):
            # The base is a Default_Ignorable_Code_Point
            # which needs to go in the auxiliary list.
            self._auxiliary.add(exemplar.base)
            del self.clusters[exemplar]
def need_hex_escape(self, char, is_isolated):
    """Determine if a characters needs to be escaped with hex digits."""
    # Isolated marks must be escaped; otherwise escape anything that is
    # invisible or whitespace-like (ignorable, format, space separator).
    if is_isolated and self.ismark(char):
        return True
    return (Char.hasBinaryProperty(char, UProperty.DEFAULT_IGNORABLE_CODE_POINT)
            or self.isformat(char)
            or self.is_space_separator(char))
def ignore_findit(self):
    """Scan code points for format characters that are not Default_Ignorable.

    Diagnostic helper: prints each offending code point and then fails.
    The "ignore_" prefix presumably keeps the test runner from collecting
    it automatically -- TODO confirm.
    """
    from icu import Char, UProperty
    maxchar = 0x10ffff
    # NOTE(review): the next line deliberately overrides the full Unicode
    # range and restricts the scan to the BMP -- presumably for speed.
    maxchar = 0xffff
    for usv in range(maxchar):
        char = chr(usv)
        # if ((not self.ucd.is_specific_script(char)) and
        #     (not self.ucd.is_exemplar_wordbreak(char)) and
        #     (not Char.isUAlphabetic(char))):
        if self.ucd.isformat(char) and not Char.hasBinaryProperty(char, UProperty.DEFAULT_IGNORABLE_CODE_POINT):
            print('%04X' % usv)
            self.assertTrue(False)
def need_hex_escape(self, char, is_isolated):
    """Determine if a characters needs to be escaped with hex digits."""
    # Isolated marks must be escaped; otherwise escape anything invisible
    # or whitespace-like (ignorable, format, space separator, PUA).
    if is_isolated and self.ismark(char):
        return True
    return (Char.hasBinaryProperty(char, UProperty.DEFAULT_IGNORABLE_CODE_POINT)
            or self.isformat(char)
            or self.is_space_separator(char)
            or self.is_pua(char))
def ignore_findit(self):
    """Scan code points for format characters that are not Default_Ignorable.

    Diagnostic helper: prints each offending code point and then fails.
    Ported from Python 2 (xrange/unichr/print statement) to Python 3,
    matching the rest of the code base.
    """
    from icu import Char, UProperty
    maxchar = 0x10ffff
    # NOTE(review): deliberately narrowed to the BMP -- presumably for speed.
    maxchar = 0xffff
    for usv in range(maxchar):  # was xrange (Python 2 only)
        char = chr(usv)  # was unichr (Python 2 only)
        # if ((not self.ucd.is_specific_script(char)) and
        #     (not self.ucd.is_exemplar_wordbreak(char)) and
        #     (not Char.isUAlphabetic(char))):
        if self.ucd.isformat(char) and not Char.hasBinaryProperty(char, UProperty.DEFAULT_IGNORABLE_CODE_POINT):
            print('%04X' % usv)  # was a Python 2 print statement
            self.assertTrue(False)
def normalize_string(in_str, allowed_scripts):
    """
    Normalizes in_str by replacing letters and digits in other scripts with
    exemplar values.

    Args:
        in_str: String to process
        allowed_scripts: List of script short names (like "Mymr") to preserve

    Returns:
        The normalized string.
    """
    # TODO: Consider checking ScriptExtensions here as well
    # Collect fragments and join once at the end: repeated `str +=` in a
    # loop is quadratic in the worst case.
    output = []
    for ch in in_str:
        ch_script = Script.getScript(ch)
        ch_type = Char.charType(ch)
        ch_bucket = CHAR_TYPE_TO_BUCKET[ch_type]
        ch_digit = Char.digit(ch)
        if ch_script.getShortName() in allowed_scripts:
            # ch is in an allowed script:
            # copy directly to the output
            output.append(ch)
        elif ch_bucket == 1:
            # ch is a letter in a disallowed script:
            # normalize to the sample char for that script
            output.append(Script.getSampleString(ch_script))
        elif ch_bucket == 3 and ch_digit != -1:
            # ch is a decimal digit in a disallowed script:
            # normalize to the zero digit in that numbering system
            output.append(chr(ord(ch) - ch_digit))
        elif ch_type == UCharCategory.CURRENCY_SYMBOL:
            # ch is a currency symbol in a disallowed script:
            # normalize to $
            output.append("$")
        else:
            # all other characters:
            # copy directly to the output
            output.append(ch)
    return "".join(output)
def allowable(self, char):
    """Make sure exemplars have the needed properties."""
    # Numbers with or without diacritics need to be allowed.
    if self.ucd.isnumber(char):
        return True
    # Exemplars must be lowercase.
    if Char.isUUppercase(char):
        return False
    # Allowed if the character has a specific script, is one of the
    # punctuation/symbols handled as letters, or is Alphabetic.
    return (self.ucd.is_specific_script(char)
            or self.ucd.is_exemplar_wordbreak(char)
            or Char.isUAlphabetic(char))
def is_exemplar_wordbreak(char): """True if the character has the Word_Break properties Katakana, ALetter, or MidLetter.""" # The following should be exposed by PyICU, but does not seem to be implemented. # There are other values, but these are the ones need for this function. WB_ALETTER = 1 WB_KATAKANA = 3 # WB_MIDLETTER = 4 numeric_wordbreak_type = Char.getIntPropertyValue(char, UProperty.WORD_BREAK) if (numeric_wordbreak_type == WB_KATAKANA or # numeric_wordbreak_type == WB_MIDLETTER or numeric_wordbreak_type == WB_ALETTER): return True return False
def is_exemplar_wordbreak(char):
    """True if the character has the Word_Break properties Katakana, ALetter, or MidLetter."""
    # PyICU does not seem to expose the Word_Break value constants, so the
    # needed numeric values are defined locally.
    WB_ALETTER = 1
    WB_KATAKANA = 3
    WB_MIDLETTER = 4
    wordbreak_value = Char.getIntPropertyValue(char, UProperty.WORD_BREAK)
    return wordbreak_value in (WB_KATAKANA, WB_ALETTER, WB_MIDLETTER)
def render(self, uids, ftml, keyUID=0, addBreaks=True, rtl=None):
    """General purpose (but not required) function to generate ftml for a character sequence.

    Args:
        uids: sequence of Unicode code points (ints) to render.
        ftml: FTML builder object receiving the generated tests.
        keyUID: uid used to key/sort the test; defaults to the first uid.
        addBreaks: when True, close the current test around special cases.
        rtl: passed through to ftml.addToTest.
    """
    if len(uids) == 0:
        return
    # Make a copy so we don't affect caller
    uids = list(uids)
    # Remember first uid and original length for later
    startUID = uids[0]
    uidLen = len(uids)
    # if keyUID wasn't supplied, use startUID
    if keyUID == 0:
        keyUID = startUID
    # Construct label from uids:
    label = '\n'.join(['U+{0:04X}'.format(u) for u in uids])
    # Construct comment from glyph names:
    comment = ' '.join([self._charFromUID[u].basename for u in uids])
    # see if uid list includes a mirrored char
    hasMirrored = bool(len([x for x in uids if Char.isMirrored(x)]))
    # Analyze first and last joining char
    joiningChars = [x for x in uids if Char.getIntPropertyValue(x, UProperty.JOINING_TYPE) != TRANSPARENT]
    if len(joiningChars):
        # If first or last non-TRANSPARENT char is a joining char, then we need to emit examples with zwj
        # zwjBefore: first joining char joins leftward in visual order
        # (dual-joining, or left-joining in LTR, or right-joining otherwise).
        uid = joiningChars[0]
        zwjBefore = Char.getIntPropertyValue(uid, UProperty.JOINING_TYPE) == DUAL_JOINING or (Char.charDirection(uid) == UCharDirection.LEFT_TO_RIGHT and Char.getIntPropertyValue(uid, UProperty.JOINING_TYPE) == LEFT_JOINING) or (Char.charDirection(uid) != UCharDirection.LEFT_TO_RIGHT and Char.getIntPropertyValue(uid, UProperty.JOINING_TYPE) == RIGHT_JOINING)
        # zwjAfter: mirror-image test on the last joining char.
        uid = joiningChars[-1]
        zwjAfter = Char.getIntPropertyValue(uid, UProperty.JOINING_TYPE) == DUAL_JOINING or (Char.charDirection(uid) == UCharDirection.LEFT_TO_RIGHT and Char.getIntPropertyValue(uid, UProperty.JOINING_TYPE) == RIGHT_JOINING) or (Char.charDirection(uid) != UCharDirection.LEFT_TO_RIGHT and Char.getIntPropertyValue(uid, UProperty.JOINING_TYPE) == LEFT_JOINING)
    else:
        zwjBefore = zwjAfter = False
    if Char.charType(startUID) == UCharCategory.NON_SPACING_MARK:
        # First char is a NSM... prefix a suitable base
        uids.insert(0, self.diacBase)
        zwjBefore = False  # No longer any need to put zwj before
    elif Char.isUWhiteSpace(startUID):
        # First char is whitespace -- prefix with baseline brackets:
        # (0xF130 is presumably a PUA baseline-bracket glyph -- TODO confirm)
        uids.insert(0, 0xF130)
    lastNonMark = [x for x in uids if Char.charType(x) != UCharCategory.NON_SPACING_MARK][-1]
    if Char.isUWhiteSpace(lastNonMark):
        # Last non-mark is whitespace -- append baseline brackets:
        uids.append(0xF131)
    s = ''.join([chr(uid) for uid in uids])
    if zwjBefore or zwjAfter:
        # Show contextual forms:
        # isolate, then each combination of U+200D ZWJ before/after.
        t = u'{0} '.format(s)
        if zwjAfter:
            t += u'{0}\u200D '.format(s)
            if zwjBefore:
                t += u'\u200D{0}\u200D '.format(s)
        if zwjBefore:
            t += u'\u200D{0} '.format(s)
        if zwjBefore and zwjAfter:
            t += u'{0}{0}{0}'.format(s)
        if addBreaks:
            ftml.closeTest()
        ftml.addToTest(keyUID, t, label=label, comment=comment, rtl=rtl)
        if addBreaks:
            ftml.closeTest()
    elif hasMirrored and self.rtlEnable:
        # Contains mirrored and rtl enabled:
        # show LTR and RTL renderings using LRE/RLE...PDF embedding controls.
        if addBreaks:
            ftml.closeTest()
        ftml.addToTest(keyUID, u'{0} LTR: \u202A{0}\u202C RTL: \u202B{0}\u202C'.format(s), label=label, comment=comment, rtl=rtl)
        if addBreaks:
            ftml.closeTest()
    # elif is LRE, RLE, PDF
    # elif is LRI, RLI, FSI, PDI
    elif uidLen > 1:
        ftml.addToTest(keyUID, s, label=label, comment=comment, rtl=rtl)
    else:
        ftml.addToTest(keyUID, s, comment=comment, rtl=rtl)
def isformat(char):
    """True if the character is a format character (general category Cf)."""
    return Char.charType(char) == UCharCategory.FORMAT_CHAR
def __init__(self, input_name, input_n, input_t, input_clusters_num, input_embedding_dim, input_hunits,
             input_dropout_rate, input_output_dim, input_epochs, input_training_data, input_evaluation_data,
             input_language, input_embedding_type):
    """Configure an LSTM segmentation model and build its embedding dictionaries.

    Args:
        input_name: model name.
        input_n: length (code units) of each training example.
        input_t: total length of the training text; should be divisible by n.
        input_clusters_num: number of grapheme-cluster embedding slots.
        input_embedding_dim, input_hunits, input_dropout_rate,
        input_output_dim, input_epochs: network hyper-parameters.
        input_training_data, input_evaluation_data: dataset identifiers.
        input_language: "Thai", "Burmese", or "Thai_Burmese".
        input_embedding_type: which embedding scheme to use (grapheme
            clusters, code points, or one of the generalized-vectors variants).
    """
    self.name = input_name
    self.n = input_n
    self.t = input_t
    if self.t % self.n != 0:
        print("Warning: t is not divided by n")
    self.clusters_num = input_clusters_num
    # batch_size is the number of batches used in each iteration of back propagation to update model weights
    # The default value is self.t/self.n, but it can be set to other values as well. The only constraint is that
    # self.t should always be greater than self.batch_size * self.n
    self.batch_size = self.t // self.n
    self.embedding_dim = input_embedding_dim
    self.hunits = input_hunits
    self.dropout_rate = input_dropout_rate
    self.output_dim = input_output_dim
    self.epochs = input_epochs
    self.training_data = input_training_data
    self.evaluation_data = input_evaluation_data
    self.language = input_language
    self.embedding_type = input_embedding_type
    self.model = None  # set later when the Keras model is built

    # Constructing the grapheme cluster dictionary -- this will be used if self.embedding_type is Grapheme Clusters
    ratios = None
    if self.language == "Thai":
        if "exclusive" in self.training_data:
            ratios = constants.THAI_EXCLUSIVE_GRAPH_CLUST_RATIO
        else:
            ratios = constants.THAI_GRAPH_CLUST_RATIO
    elif self.language == "Burmese":
        if "exclusive" in self.training_data:
            ratios = constants.BURMESE_EXCLUSIVE_GRAPH_CLUST_RATIO
        else:
            ratios = constants.BURMESE_GRAPH_CLUST_RATIO
    elif self.language == "Thai_Burmese":
        ratios = constants.THAI_BURMESE_GRAPH_CLUST_RATIO
    else:
        print("Warning: the input language is not supported")
    # NOTE(review): for an unsupported language `ratios` is still None here,
    # so the loop below raises AttributeError -- confirm this is intended.
    # Keep the clusters_num - 1 most frequent clusters; presumably the
    # remaining slot is used for all other clusters -- TODO confirm.
    cnt = 0
    self.graph_clust_dic = dict()
    for key in ratios.keys():
        if cnt < self.clusters_num - 1:
            self.graph_clust_dic[key] = cnt
        if cnt == self.clusters_num - 1:
            break
        cnt += 1

    # Loading the code points dictionary -- this will be used if self.embedding_type is Code Points
    # If you want to group some of the code points into buckets, that code should go here to change
    # self.codepoint_dic appropriately
    if self.language == "Thai":
        self.codepoint_dic = constants.THAI_CODE_POINT_DICTIONARY
    if self.language == "Burmese":
        self.codepoint_dic = constants.BURMESE_CODE_POINT_DICTIONARY
    # NOTE(review): for languages other than Thai/Burmese, codepoint_dic is
    # never assigned, so the next line raises AttributeError -- verify.
    self.codepoints_num = len(self.codepoint_dic) + 1

    # Constructing the letters dictionary -- this will be used if self.embedding_type is Generalized Vectors
    self.letters_dic = dict()
    if self.language in ["Thai", "Burmese"]:
        smallest_unicode_dec = None
        largest_unicode_dec = None
        # Defining the Unicode box for model's language
        if self.language == "Thai":
            smallest_unicode_dec = int("0E01", 16)
            largest_unicode_dec = int("0E5B", 16)
        elif self.language == "Burmese":
            smallest_unicode_dec = int("1000", 16)
            largest_unicode_dec = int("109F", 16)
        # Defining the code point buckets that will get their own individual embedding vector
        # 1: Letters, 2: Marks, 3: Digits, 4: Separators, 5: Punctuations, 6: Symbols, 7: Others
        separate_slot_buckets = []
        separate_codepoints = []
        if self.embedding_type == "generalized_vectors_123":
            separate_slot_buckets = [1, 2, 3]
        elif self.embedding_type == "generalized_vectors_12":
            separate_slot_buckets = [1, 2]
        elif self.embedding_type == "generalized_vectors_12d0":
            separate_slot_buckets = [1, 2]
            # "d0": additionally give the language's zero digit(s) their own slot
            if self.language == "Burmese":
                separate_codepoints = [4160, 4240]  # U+1040, U+1090
            if self.language == "Thai":
                separate_codepoints = [3664]  # U+0E50
        elif self.embedding_type == "generalized_vectors_125":
            separate_slot_buckets = [1, 2, 5]
        elif self.embedding_type == "generalized_vectors_1235":
            separate_slot_buckets = [1, 2, 3, 5]
        # Constructing letters dictionary
        cnt = 0
        for i in range(smallest_unicode_dec, largest_unicode_dec + 1):
            ch = chr(i)
            if constants.CHAR_TYPE_TO_BUCKET[Char.charType(ch)] in separate_slot_buckets:
                self.letters_dic[ch] = cnt
                cnt += 1
        for unicode_dec in separate_codepoints:
            ch = chr(unicode_dec)
            self.letters_dic[ch] = cnt
            cnt += 1
        # After making the letters dictionary, we can call different versions of the generalized vectors same thing
        if "generalized_vectors" in self.embedding_type:
            self.embedding_type = "generalized_vectors"
    else:
        # NOTE: "generalized_vectros" typo is in the original runtime string.
        print(
            "Warning: the generalized_vectros embedding type is not supported for this language"
        )
def isnukta(char):
    """True if the character is a nukta."""
    # Nuktas carry canonical combining class 7 in Unicode.
    NUKTA_COMBINING_CLASS = 7
    return Char.getCombiningClass(char) == NUKTA_COMBINING_CLASS
def is_space_separator(char):
    """True if the character is space separator (general category Zs)."""
    return Char.charType(char) == UCharCategory.SPACE_SEPARATOR
def process(self, text):
    """Analyze a string.

    Records the script of every character, then scans the NFD-normalized
    text recording multigraphs, punctuation, and grapheme clusters
    (base + trailing marks) into self.clusters.
    """
    i = 0
    text = self.ucd.normalize('NFD', text)

    # Record script of each character.
    for char in text:
        script = Script.getScript(char)
        script_code = Script.getScriptCode(script)
        self.scripts[script_code] += 1
        self.codes_for_scripts[script_code] = script

    # Record clusters
    while i < len(text):
        # Look for multigraphs (from length of max_multigraph_length down to 1) character(s)
        # of multigraphs already specified in a LDML file.
        # Longest possible matches are looked at first.
        for multigraph_length in range(self.max_multigraph_length, 0, -1):
            multigraph = text[i:i + multigraph_length]
            if (multigraph in self._main or
                    multigraph in self._auxiliary or
                    multigraph in self._index or
                    multigraph in self._punctuation):
                exemplar = Exemplar(multigraph)
                self.clusters[exemplar] += 1
                i += multigraph_length
                break
        # No multigraphs were found at this position,
        # so continue processing a single character
        # if we have not gone beyond the end of the text.
        # NOTE(review): when a multigraph DID match above, control still
        # falls through here and processes the character at the advanced
        # position -- confirm this is the intended behavior.
        if not i < len(text):
            break
        char = text[i]

        # Test for punctuation.
        if self.ucd.ispunct(char):
            exemplar = Exemplar(char)
            self.clusters[exemplar] += 1
            i += 1
            continue

        # Find grapheme clusters.
        # Ensure exemplar base has needed properties.
        if not self.allowable(char):
            i += 1
            continue

        # The current character is a base character.
        base = char
        # Then find the end of the cluster
        # (which may consist of only base characters).
        length = base_length = 1
        while i + length < len(text):
            trailer = text[i + length]
            if Char.hasBinaryProperty(trailer, UProperty.DEFAULT_IGNORABLE_CODE_POINT):
                # A Default_Ignorable_Code_Point was found, so the cluster continues.
                length += 1
                continue
            if self.ucd.ismark(trailer):
                # A Mark was found, so the cluster continues.
                length += 1
                # Marks such as nuktas are considered part of the base.
                if self.ucd.is_always_combine(trailer):
                    # A Mark such as a nukta was found, so the base continues,
                    # as well as the cluster.
                    base_length += 1
                    base = text[i:i + base_length]
                continue
            else:
                # No more marks, so the end of the cluster has been reached.
                break

        # Extract cluster
        # If no nuktas have been found,
        # then the base will be the single character already called base (or char).
        # If no non-nukta marks have been found,
        # then the trailers variable will be an empty string.
        trailers = text[i + base_length:i + length]
        exemplar = Exemplar(base, trailers)
        self.clusters[exemplar] += 1
        i += length