def run(self):
    """
    Reads the decomposition entries, filters and merges them and prints one
    line per remaining decomposition.

    Processing steps:
        1. Entries are obtained from C{self.read()}.
        2. Unless C{self.includePseudoCharacters} is set, pseudo characters
           are substituted by their own decompositions.
        3. Unless C{self.includeMinimal} is set, decompositions consisting
           of a single item are dropped together with their flags.
        4. Similar decompositions are merged.
        5. Every entry is printed as
           C{"char","decomposition",glyph,index,flags}, encoded with
           C{default_encoding}. Pseudo characters (integer keys) are
           written as C{#<number>}.
    """
    decompositionEntries, flagEntries = self.read()

    # Remove pseudo characters by merging entries
    if not self.includePseudoCharacters:
        decompositionEntries, flagEntries = self._removePseudoCharacters(
            decompositionEntries, flagEntries)

    # Remove minimal component entries
    if not self.includeMinimal:
        for char in decompositionEntries:
            for glyph in decompositionEntries[char]:
                decompositions = decompositionEntries[char][glyph]
                # Iterate over a copy, the set is modified in the loop
                for decomposition in decompositions.copy():
                    if len(decomposition) == 1:
                        decompositions.remove(decomposition)
                        del flagEntries[char][glyph][decomposition]

    # Merge similar decompositions, removing inferior ones
    self._mergeSimilarDecompositions(decompositionEntries, flagEntries)

    # Write entries
    for char in sorted(decompositionEntries):
        # Keep the dictionary key (char) apart from its printed form. The
        # key is still needed to look up the flags; rebinding it to the
        # '#<number>' string made the flag lookup fail for pseudo
        # characters.
        if isinstance(char, int):
            # pseudo character
            charName = '#%d' % char
        else:
            charName = char

        for glyph in decompositionEntries[char]:
            glyphFlags = flagEntries[char][glyph]
            for idx, decomposition in enumerate(
                    sorted(decompositionEntries[char][glyph])):
                decompStr = CharacterLookup.decompositionToString(
                    decomposition)
                flagStr = ''.join(sorted(glyphFlags[decomposition]))

                line = (
                    '"%(char)s","%(decomp)s",%(glyph)d,%(index)d,%(flags)s'
                    % {
                        'char': charName,
                        'decomp': decompStr,
                        'glyph': glyph,
                        'index': idx,
                        'flags': flagStr,
                    })
                # Encode first, then print the already encoded line
                print(line.encode(default_encoding))
def run(self):
    """
    Reads the decomposition entries, filters and merges them and prints one
    line per remaining decomposition.

    Processing steps:
        1. Entries are obtained from C{self.read()}.
        2. Unless C{self.includePseudoCharacters} is set, pseudo characters
           are substituted by their own decompositions.
        3. Unless C{self.includeMinimal} is set, decompositions consisting
           of a single item are dropped together with their flags.
        4. Similar decompositions are merged.
        5. Every entry is printed as
           C{"char","decomposition",glyph,index,flags}, encoded with
           C{default_encoding}. Pseudo characters (integer keys) are
           written as C{#<number>}.
    """
    decompositionEntries, flagEntries = self.read()

    # Remove pseudo characters by merging entries
    if not self.includePseudoCharacters:
        decompositionEntries, flagEntries = self._removePseudoCharacters(
            decompositionEntries, flagEntries)

    # Remove minimal component entries
    if not self.includeMinimal:
        for char in decompositionEntries:
            for glyph in decompositionEntries[char]:
                decompositions = decompositionEntries[char][glyph]
                # Iterate over a copy, the set is modified in the loop
                for decomposition in decompositions.copy():
                    if len(decomposition) == 1:
                        decompositions.remove(decomposition)
                        del flagEntries[char][glyph][decomposition]

    # Merge similar decompositions, removing inferior ones
    self._mergeSimilarDecompositions(decompositionEntries, flagEntries)

    # Write entries
    for char in sorted(decompositionEntries):
        # Keep the dictionary key (char) apart from its printed form. The
        # key is still needed to look up the flags; rebinding it to the
        # '#<number>' string made the flag lookup fail for pseudo
        # characters.
        if isinstance(char, int):
            # pseudo character
            charName = '#%d' % char
        else:
            charName = char

        for glyph in decompositionEntries[char]:
            glyphFlags = flagEntries[char][glyph]
            for idx, decomposition in enumerate(
                    sorted(decompositionEntries[char][glyph])):
                decompStr = CharacterLookup.decompositionToString(
                    decomposition)
                flagStr = ''.join(sorted(glyphFlags[decomposition]))

                line = (
                    '"%(char)s","%(decomp)s",%(glyph)d,%(index)d,%(flags)s'
                    % {
                        'char': charName,
                        'decomp': decompStr,
                        'glyph': glyph,
                        'index': idx,
                        'flags': flagStr,
                    })
                # Encode first, then print the already encoded line
                print(line.encode(default_encoding))
def _removePseudoCharacters(self, decompositionEntries, flagEntries): """ Removes all pseudo character entries and subsitutes their occurence by their own entries. """ def substitutePseudoCharacters(decomposition): newDecomposition = [] for c in decomposition: if type(c) != type(()): # IDS newDecomposition.append([[c]]) else: char, _ = c if type(char) == type(0): if c in pseudoCharacterMap: # get all decompositions of this pseudo character newPseudoDecomp = [] for decomp in pseudoCharacterMap[c]: newDecomps = substitutePseudoCharacters(decomp) if newDecomps: newPseudoDecomp.extend(newDecomps) newDecomposition.append(newPseudoDecomp) else: return else: # normal char newDecomposition.append([[c]]) # all combinations of sub-decompositions flatDecomp = set() for newDecomp in cross(*newDecomposition): flatEntry = [] for entry in newDecomp: flatEntry.extend(entry) flatDecomp.add(tuple(flatEntry)) return flatDecomp # find pseude characters first pseudoCharacterMap = {} for char in decompositionEntries: if type(char) == type(0): for glyph in decompositionEntries[char]: pseudoCharacterMap[(char, glyph)] = decompositionEntries[char][glyph] # now apply newDecompositionsEntries = {} newFlagEntries = {} for char in decompositionEntries: if type(char) == type(0): continue newDecompositionsEntries[char] = {} newFlagEntries[char] = {} for glyph in decompositionEntries[char]: newDecompositionsEntries[char][glyph] = set() newFlagEntries[char][glyph] = {} for decomposition in decompositionEntries[char][glyph]: newDecompositions = substitutePseudoCharacters(decomposition) if newDecompositions: newDecompositionsEntries[char][glyph].update(newDecompositions) # transfer flags for newDecomposition in newDecompositions: newFlagEntries[char][glyph][newDecomposition] = flagEntries[char][glyph][decomposition] elif not self.quiet: print >>sys.stderr, ( "Unable to resolve decomposition" + " with pseudo character for '%s': " % char + CharacterLookup.decompositionToString(decomposition) 
).encode(default_encoding) return newDecompositionsEntries, newFlagEntries
def _mergeSimilarDecompositions(self, decompositionEntries, flagEntries): """ Merges two decompositions, if they are the same, except: - one has an unknown component while the other doesn't, - one has a subtree that is the decomposition of the corresponding component of the other decomposition. """ def consumeComponent(decomposition): """ Consumes a component on the top level, e.g. for 㐯, C{⿱⿱亠吕香} consumes C{⿱亠吕} when given the partial decomposition C{⿱亠吕香}. """ if type(decomposition[0]) == type(()): # consume one component return decomposition[1:] if CharacterLookup.isBinaryIDSOperator(decomposition[0]): decomposition = consumeComponent(decomposition[1:]) return consumeComponent(decomposition) elif CharacterLookup.isTrinaryIDSOperator(decomposition[0]): decomposition = consumeComponent(decomposition[1:]) decomposition = consumeComponent(decomposition) return consumeComponent(decomposition) def compareTrees(decompositionA, decompositionB): """ Checks for similar decomposition trees, taking care of unknown components. Returns C{None} if the trees are not equal, a integer if the trees are similar. If the left tree (decompositionA) should be preferred a negative number is returned, or a positive number for the right tree (decompositionB). If C{0} is returned, both trees are equally good to choose from. 
""" if not decompositionA and not decompositionB: # equal return 0 elif not decompositionA or not decompositionB: # if all preceding components are the same that shouldn't happen raise ValueError() elif decompositionA[0] == decompositionB[0]: return compareTrees(decompositionA[1:], decompositionB[1:]) elif type(decompositionA[0]) == type(()) and decompositionA[0][0] == u"?": decompositionB = consumeComponent(decompositionB) result = compareTrees(decompositionA[1:], decompositionB) if result is None or result < 0: # unequal or the left side is preferred later on return None else: return +1 elif type(decompositionB[0]) == type(()) and decompositionB[0][0] == u"?": decompositionA = consumeComponent(decompositionA) result = compareTrees(decompositionA, decompositionB[1:]) if result is None or result > 0: # unequal or the right side is preferred later on return None else: return -1 elif CharacterLookup.isIDSOperator(decompositionA[0]) and CharacterLookup.isIDSOperator(decompositionB[0]): # No way these decompositions can be equal # (simplified subseq. checking) return None elif CharacterLookup.isIDSOperator(decompositionA[0]): # expand tree B char, glyph = decompositionB[0] if char in decompositionEntries and glyph in decompositionEntries[char]: for decomposition in decompositionEntries[char][glyph]: result = compareTrees(decompositionA, decomposition + decompositionB[1:]) if result is not None and result >= 0: # right side preferred and so do we... # A shorted description is better return 1 return None elif CharacterLookup.isIDSOperator(decompositionB[0]): # expand tree A char, glyph = decompositionA[0] if char in decompositionEntries and glyph in decompositionEntries[char]: for decomposition in decompositionEntries[char][glyph]: result = compareTrees(decomposition + decompositionA[1:], decompositionB) if result is not None and result <= 0: # left side preferred and so do we... 
# A shorted description is better return -1 return None else: return None for char in decompositionEntries: for glyph in decompositionEntries[char]: idxA = 0 decompositions = list(decompositionEntries[char][glyph]) flagsDict = flagEntries[char][glyph] # Check every decomposition with all others to the right while idxA < len(decompositions): idxB = idxA + 1 while idxB < len(decompositions): try: result = compareTrees(decompositions[idxA], decompositions[idxB]) if result is not None and result == 0: # Entries are equal, we can transfer flags flagsDict[decompositions[idxA]].update(flagsDict[decompositions[idxB]]) del flagsDict[decompositions[idxB]] del decompositions[idxB] elif result is not None and result < 0: del flagsDict[decompositions[idxB]] del decompositions[idxB] elif result is not None and result > 0: del flagsDict[decompositions[idxA]] del decompositions[idxA] # No need for further testing for this decomp break else: # Only increase if the list didn't shift to the # left idxB += 1 except ValueError: print >>sys.stderr, ( "Error comparing decompositions %s and %s" % ( CharacterLookup.decompositionToString(decompositions[idxA]), CharacterLookup.decompositionToString(decompositions[idxB]), ) ).encode(default_encoding) idxB += 1 else: idxA += 1 decompositionEntries[char][glyph] = set(decompositions)
def _mergeSimilarDecompositions(self, decompositionEntries, flagEntries): """ Merges two decompositions, if they are the same, except: - one has an unknown component while the other doesn't, - one has a subtree that is the decomposition of the corresponding component of the other decomposition. """ def consumeComponent(decomposition): """ Consumes a component on the top level, e.g. for 㐯, C{⿱⿱亠吕香} consumes C{⿱亠吕} when given the partial decomposition C{⿱亠吕香}. """ if type(decomposition[0]) == type(()): # consume one component return decomposition[1:] if CharacterLookup.isBinaryIDSOperator(decomposition[0]): decomposition = consumeComponent(decomposition[1:]) return consumeComponent(decomposition) elif CharacterLookup.isTrinaryIDSOperator(decomposition[0]): decomposition = consumeComponent(decomposition[1:]) decomposition = consumeComponent(decomposition) return consumeComponent(decomposition) def compareTrees(decompositionA, decompositionB): """ Checks for similar decomposition trees, taking care of unknown components. Returns C{None} if the trees are not equal, a integer if the trees are similar. If the left tree (decompositionA) should be preferred a negative number is returned, or a positive number for the right tree (decompositionB). If C{0} is returned, both trees are equally good to choose from. 
""" if not decompositionA and not decompositionB: # equal return 0 elif not decompositionA or not decompositionB: # if all preceding components are the same that shouldn't happen raise ValueError() elif decompositionA[0] == decompositionB[0]: return compareTrees(decompositionA[1:], decompositionB[1:]) elif (type(decompositionA[0]) == type(()) and decompositionA[0][0] == u'?'): decompositionB = consumeComponent(decompositionB) result = compareTrees(decompositionA[1:], decompositionB) if result is None or result < 0: # unequal or the left side is preferred later on return None else: return +1 elif (type(decompositionB[0]) == type(()) and decompositionB[0][0] == u'?'): decompositionA = consumeComponent(decompositionA) result = compareTrees(decompositionA, decompositionB[1:]) if result is None or result > 0: # unequal or the right side is preferred later on return None else: return -1 elif (CharacterLookup.isIDSOperator(decompositionA[0]) and CharacterLookup.isIDSOperator(decompositionB[0])): # No way these decompositions can be equal # (simplified subseq. checking) return None elif CharacterLookup.isIDSOperator(decompositionA[0]): # expand tree B char, glyph = decompositionB[0] if (char in decompositionEntries and glyph in decompositionEntries[char]): for decomposition in decompositionEntries[char][glyph]: result = compareTrees( decompositionA, decomposition + decompositionB[1:]) if result is not None and result >= 0: # right side preferred and so do we... # A shorted description is better return 1 return None elif CharacterLookup.isIDSOperator(decompositionB[0]): # expand tree A char, glyph = decompositionA[0] if (char in decompositionEntries and glyph in decompositionEntries[char]): for decomposition in decompositionEntries[char][glyph]: result = compareTrees( decomposition + decompositionA[1:], decompositionB) if result is not None and result <= 0: # left side preferred and so do we... 
# A shorted description is better return -1 return None else: return None for char in decompositionEntries: for glyph in decompositionEntries[char]: idxA = 0 decompositions = list(decompositionEntries[char][glyph]) flagsDict = flagEntries[char][glyph] # Check every decomposition with all others to the right while idxA < len(decompositions): idxB = idxA + 1 while idxB < len(decompositions): try: result = compareTrees(decompositions[idxA], decompositions[idxB]) if result is not None and result == 0: # Entries are equal, we can transfer flags flagsDict[decompositions[idxA]].update( flagsDict[decompositions[idxB]]) del flagsDict[decompositions[idxB]] del decompositions[idxB] elif result is not None and result < 0: del flagsDict[decompositions[idxB]] del decompositions[idxB] elif result is not None and result > 0: del flagsDict[decompositions[idxA]] del decompositions[idxA] # No need for further testing for this decomp break else: # Only increase if the list didn't shift to the # left idxB += 1 except ValueError: print >> sys.stderr, ( "Error comparing decompositions %s and %s" % (CharacterLookup.decompositionToString( decompositions[idxA]), CharacterLookup.decompositionToString( decompositions[idxB])))\ .encode(default_encoding) idxB += 1 else: idxA += 1 decompositionEntries[char][glyph] = set(decompositions)