Esempio n. 1
0
    def run(self):
        """
        Reads all decomposition entries, filters and merges them and prints
        the remaining entries as CSV lines.

        Pseudo character entries are substituted away unless
        C{self.includePseudoCharacters} is set, and decompositions that consist
        of a single component are dropped unless C{self.includeMinimal} is set.
        One line is printed per decomposition with the fields: character,
        decomposition string, glyph, index of the decomposition within the
        sorted decompositions of that glyph, and the sorted flags.
        """
        decompositionEntries, flagEntries = self.read()

        # Remove pseudo characters by merging entries
        if not self.includePseudoCharacters:
            decompositionEntries, flagEntries = self._removePseudoCharacters(decompositionEntries, flagEntries)

        # Remove minimal component entries
        if not self.includeMinimal:
            for char in sorted(decompositionEntries):
                for glyph in decompositionEntries[char]:
                    # Iterate over a copy, the set itself is modified below
                    for decomposition in decompositionEntries[char][glyph].copy():
                        if len(decomposition) == 1:
                            decompositionEntries[char][glyph].remove(decomposition)
                            del flagEntries[char][glyph][decomposition]

        # Merge similar decompositions, removing inferior ones
        self._mergeSimilarDecompositions(decompositionEntries, flagEntries)

        # Write entries
        for char in sorted(decompositionEntries):
            # Pseudo characters have integer keys and are written as "#<n>".
            # The output form gets its own name: "char" has to stay the
            # original key, as it is still used to index decompositionEntries
            # and flagEntries for the remaining glyphs and decompositions.
            if isinstance(char, int):
                charStr = "#%d" % char
            else:
                charStr = char

            for glyph in decompositionEntries[char]:
                for idx, decomposition in enumerate(sorted(decompositionEntries[char][glyph])):
                    decompStr = CharacterLookup.decompositionToString(decomposition)
                    flagStr = "".join(sorted(flagEntries[char][glyph][decomposition]))
                    line = '"%(char)s","%(decomp)s",%(glyph)d,%(index)d,%(flags)s' % {
                        "char": charStr,
                        "decomp": decompStr,
                        "glyph": glyph,
                        "index": idx,
                        "flags": flagStr,
                    }
                    # Encode the line itself; with the print statement this
                    # emits the same bytes as before.
                    print(line.encode(default_encoding))
    def run(self):
        """
        Reads all decomposition entries, filters and merges them and prints
        the remaining entries as CSV lines.

        Pseudo character entries are substituted away unless
        C{self.includePseudoCharacters} is set, and decompositions that consist
        of a single component are dropped unless C{self.includeMinimal} is set.
        One line is printed per decomposition with the fields: character,
        decomposition string, glyph, index of the decomposition within the
        sorted decompositions of that glyph, and the sorted flags.
        """
        decompositionEntries, flagEntries = self.read()

        # Remove pseudo characters by merging entries
        if not self.includePseudoCharacters:
            decompositionEntries, flagEntries = self._removePseudoCharacters(
                decompositionEntries, flagEntries)

        # Remove minimal component entries
        if not self.includeMinimal:
            for char in sorted(decompositionEntries):
                for glyph in decompositionEntries[char]:
                    # Iterate over a copy, the set itself is modified below
                    for decomposition \
                        in decompositionEntries[char][glyph].copy():

                        if len(decomposition) == 1:
                            decompositionEntries[char][glyph].remove(
                                decomposition)
                            del flagEntries[char][glyph][decomposition]

        # Merge similar decompositions, removing inferior ones
        self._mergeSimilarDecompositions(decompositionEntries, flagEntries)

        # Write entries
        for char in sorted(decompositionEntries):
            # Pseudo characters have integer keys and are written as '#<n>'.
            # The output form gets its own name: 'char' has to stay the
            # original key, as it is still used to index decompositionEntries
            # and flagEntries for the remaining glyphs and decompositions.
            if isinstance(char, int):
                charStr = '#%d' % char
            else:
                charStr = char

            for glyph in decompositionEntries[char]:
                for idx, decomposition in enumerate(
                        sorted(decompositionEntries[char][glyph])):
                    decompStr = CharacterLookup.decompositionToString(
                        decomposition)
                    flagStr = ''.join(
                        sorted(flagEntries[char][glyph][decomposition]))
                    line = (
                        '"%(char)s","%(decomp)s",%(glyph)d,%(index)d,%(flags)s'
                        % {
                            'char': charStr,
                            'decomp': decompStr,
                            'glyph': glyph,
                            'index': idx,
                            'flags': flagStr
                        })
                    # Encode the line itself; with the print statement this
                    # emits the same bytes as before.
                    print(line.encode(default_encoding))
Esempio n. 3
0
    def _removePseudoCharacters(self, decompositionEntries, flagEntries):
        """
        Removes all pseudo character entries and substitutes their occurrences
        in other decompositions by the pseudo characters' own decompositions.

        Pseudo characters are the entries whose character key is an integer.
        A decomposition that references a pseudo character glyph without an
        entry of its own cannot be resolved; it is dropped and, unless
        C{self.quiet} is set, reported on stderr.

        @param decompositionEntries: mapping of character to glyph to the set
            of decompositions
        @param flagEntries: mapping of character to glyph to decomposition to
            the flags of that decomposition
        @return: tuple of the new decomposition entries and new flag entries,
            both without pseudo character keys. The mappings passed in are
            not modified.
        """

        def substitutePseudoCharacters(decomposition):
            """
            Returns the set of flat decompositions obtained by replacing every
            pseudo character component with each of its own decompositions,
            or C{None} if a pseudo character component has no entry.
            """
            # One list of alternatives (each a sequence of components) per
            # position of the original decomposition
            alternatives = []
            for c in decomposition:
                if not isinstance(c, tuple):
                    # IDS
                    alternatives.append([[c]])
                    continue

                char, _ = c
                if not isinstance(char, int):
                    # normal char
                    alternatives.append([[c]])
                    continue

                if c not in pseudoCharacterMap:
                    return None

                # get all decompositions of this pseudo character
                newPseudoDecomp = []
                for decomp in pseudoCharacterMap[c]:
                    newDecomps = substitutePseudoCharacters(decomp)
                    if newDecomps:
                        newPseudoDecomp.extend(newDecomps)
                alternatives.append(newPseudoDecomp)

            # all combinations of sub-decompositions
            flatDecomp = set()
            for newDecomp in cross(*alternatives):
                flatEntry = []
                for entry in newDecomp:
                    flatEntry.extend(entry)
                flatDecomp.add(tuple(flatEntry))
            return flatDecomp

        # find pseudo characters first
        pseudoCharacterMap = {}
        for char in decompositionEntries:
            if isinstance(char, int):
                for glyph in decompositionEntries[char]:
                    pseudoCharacterMap[(char, glyph)] = decompositionEntries[char][glyph]

        # now apply
        newDecompositionsEntries = {}
        newFlagEntries = {}
        for char in decompositionEntries:
            if isinstance(char, int):
                continue
            newDecompositionsEntries[char] = {}
            newFlagEntries[char] = {}
            for glyph in decompositionEntries[char]:
                newDecompositionsEntries[char][glyph] = set()
                newFlagEntries[char][glyph] = {}
                for decomposition in decompositionEntries[char][glyph]:
                    newDecompositions = substitutePseudoCharacters(decomposition)
                    if newDecompositions:
                        newDecompositionsEntries[char][glyph].update(newDecompositions)
                        # transfer flags: every resulting decomposition gets
                        # its own set, so a later in-place update of one entry
                        # cannot leak into its siblings or into flagEntries.
                        # If several source decompositions resolve to the same
                        # result their flags are united instead of one
                        # overwriting the other in set iteration order.
                        flags = flagEntries[char][glyph][decomposition]
                        for newDecomposition in newDecompositions:
                            newFlagEntries[char][glyph].setdefault(newDecomposition, set()).update(flags)
                    elif not self.quiet:
                        print >>sys.stderr, (
                            "Unable to resolve decomposition"
                            + " with pseudo character for '%s': " % char
                            + CharacterLookup.decompositionToString(decomposition)
                        ).encode(default_encoding)

        return newDecompositionsEntries, newFlagEntries
Esempio n. 4
0
    def _mergeSimilarDecompositions(self, decompositionEntries, flagEntries):
        """
        Merges two decompositions, if they are the same, except:
            - one has an unknown component while the other doesn't,
            - one has a subtree that is the decomposition of the corresponding
              component of the other decomposition.

        Both mappings are modified in place: for every character and glyph the
        inferior decompositions are removed from the decomposition set and
        from the flag mapping. The flags of two equally good decompositions
        are united on the one that is kept. Pairs that cannot be compared are
        reported on stderr and both left in place.

        @param decompositionEntries: mapping of character to glyph to the set
            of decompositions
        @param flagEntries: mapping of character to glyph to decomposition to
            the flags of that decomposition
        """

        def consumeComponent(decomposition):
            """
            Consumes a component on the top level, e.g. for 㐯, C{⿱⿱亠吕香}
            consumes C{⿱亠吕} when given the partial decomposition C{⿱亠吕香}.

            Returns the remaining part of the decomposition. Raises
            C{ValueError} if the decomposition ends prematurely or starts with
            something that is neither a component nor an IDS operator, so that
            malformed entries are reported like any other comparison error.
            """
            if not decomposition:
                raise ValueError("decomposition ends prematurely")

            if isinstance(decomposition[0], tuple):
                # consume one component
                return decomposition[1:]

            if CharacterLookup.isBinaryIDSOperator(decomposition[0]):
                operandCount = 2
            elif CharacterLookup.isTrinaryIDSOperator(decomposition[0]):
                operandCount = 3
            else:
                raise ValueError("neither a component nor an IDS operator")

            # skip the operator, then consume each of its operands in turn
            decomposition = decomposition[1:]
            for _ in range(operandCount):
                decomposition = consumeComponent(decomposition)
            return decomposition

        def compareTrees(decompositionA, decompositionB):
            """
            Checks for similar decomposition trees, taking care of unknown
            components.

            Returns C{None} if the trees are not equal, an integer if the trees
            are similar. If the left tree (decompositionA) should be preferred a
            negative number is returned, or a positive number for the right tree
            (decompositionB). If C{0} is returned, both trees are equally good
            to choose from.

            Raises C{ValueError} if one tree ends before the other or is
            malformed.
            """
            if not decompositionA and not decompositionB:
                # equal
                return 0
            elif not decompositionA or not decompositionB:
                # if all preceding components are the same that shouldn't happen
                raise ValueError()
            elif decompositionA[0] == decompositionB[0]:
                return compareTrees(decompositionA[1:], decompositionB[1:])

            elif isinstance(decompositionA[0], tuple) and decompositionA[0][0] == u"?":
                # unknown component on the left, skip the corresponding
                # subtree on the right
                decompositionB = consumeComponent(decompositionB)
                result = compareTrees(decompositionA[1:], decompositionB)
                if result is None or result < 0:
                    # unequal or the left side is preferred later on
                    return None
                else:
                    return +1

            elif isinstance(decompositionB[0], tuple) and decompositionB[0][0] == u"?":
                # unknown component on the right, skip the corresponding
                # subtree on the left
                decompositionA = consumeComponent(decompositionA)
                result = compareTrees(decompositionA, decompositionB[1:])
                if result is None or result > 0:
                    # unequal or the right side is preferred later on
                    return None
                else:
                    return -1

            elif CharacterLookup.isIDSOperator(decompositionA[0]) and CharacterLookup.isIDSOperator(decompositionB[0]):
                # No way these decompositions can be equal
                #   (simplified subseq. checking)
                return None

            elif CharacterLookup.isIDSOperator(decompositionA[0]):
                # expand tree B
                char, glyph = decompositionB[0]
                if char in decompositionEntries and glyph in decompositionEntries[char]:

                    for decomposition in decompositionEntries[char][glyph]:
                        result = compareTrees(decompositionA, decomposition + decompositionB[1:])
                        if result is not None and result >= 0:
                            # right side preferred and so do we...
                            #   A shorter description is better
                            return 1

                return None

            elif CharacterLookup.isIDSOperator(decompositionB[0]):
                # expand tree A
                char, glyph = decompositionA[0]
                if char in decompositionEntries and glyph in decompositionEntries[char]:

                    for decomposition in decompositionEntries[char][glyph]:
                        result = compareTrees(decomposition + decompositionA[1:], decompositionB)
                        if result is not None and result <= 0:
                            # left side preferred and so do we...
                            #   A shorter description is better
                            return -1
                return None
            else:
                return None

        for char in decompositionEntries:
            for glyph in decompositionEntries[char]:
                idxA = 0
                decompositions = list(decompositionEntries[char][glyph])
                flagsDict = flagEntries[char][glyph]
                # Check every decomposition with all others to the right
                while idxA < len(decompositions):
                    idxB = idxA + 1
                    while idxB < len(decompositions):
                        try:
                            result = compareTrees(decompositions[idxA], decompositions[idxB])
                            if result is not None and result == 0:
                                # Entries are equal, we can transfer flags
                                flagsDict[decompositions[idxA]].update(flagsDict[decompositions[idxB]])
                                del flagsDict[decompositions[idxB]]
                                del decompositions[idxB]
                            elif result is not None and result < 0:
                                # Left entry is preferred, drop the right one
                                del flagsDict[decompositions[idxB]]
                                del decompositions[idxB]
                            elif result is not None and result > 0:
                                # Right entry is preferred, drop the left one
                                del flagsDict[decompositions[idxA]]
                                del decompositions[idxA]
                                # No need for further testing for this decomp
                                break
                            else:
                                # Only increase if the list didn't shift to the
                                #   left
                                idxB += 1
                        except ValueError:
                            print >>sys.stderr, (
                                "Error comparing decompositions %s and %s"
                                % (
                                    CharacterLookup.decompositionToString(decompositions[idxA]),
                                    CharacterLookup.decompositionToString(decompositions[idxB]),
                                )
                            ).encode(default_encoding)
                            idxB += 1
                    else:
                        # Inner loop ended without break, i.e. the entry at
                        # idxA was kept: move on. After a break the list has
                        # shifted left and idxA already points to the next one.
                        idxA += 1
                decompositionEntries[char][glyph] = set(decompositions)
    def _mergeSimilarDecompositions(self, decompositionEntries, flagEntries):
        """
        Merges two decompositions, if they are the same, except:
            - one has an unknown component while the other doesn't,
            - one has a subtree that is the decomposition of the corresponding
              component of the other decomposition.

        Both mappings are modified in place: for every character and glyph the
        inferior decompositions are removed from the decomposition set and
        from the flag mapping. The flags of two equally good decompositions
        are united on the one that is kept. Pairs that cannot be compared are
        reported on stderr and both left in place.

        @param decompositionEntries: mapping of character to glyph to the set
            of decompositions
        @param flagEntries: mapping of character to glyph to decomposition to
            the flags of that decomposition
        """
        def consumeComponent(decomposition):
            """
            Consumes a component on the top level, e.g. for 㐯, C{⿱⿱亠吕香}
            consumes C{⿱亠吕} when given the partial decomposition C{⿱亠吕香}.

            Returns the remaining part of the decomposition. Raises
            C{ValueError} if the decomposition ends prematurely or starts with
            something that is neither a component nor an IDS operator, so that
            malformed entries are reported like any other comparison error.
            """
            if not decomposition:
                raise ValueError('decomposition ends prematurely')

            if isinstance(decomposition[0], tuple):
                # consume one component
                return decomposition[1:]

            if CharacterLookup.isBinaryIDSOperator(decomposition[0]):
                operandCount = 2
            elif CharacterLookup.isTrinaryIDSOperator(decomposition[0]):
                operandCount = 3
            else:
                raise ValueError('neither a component nor an IDS operator')

            # skip the operator, then consume each of its operands in turn
            decomposition = decomposition[1:]
            for _ in range(operandCount):
                decomposition = consumeComponent(decomposition)
            return decomposition

        def compareTrees(decompositionA, decompositionB):
            """
            Checks for similar decomposition trees, taking care of unknown
            components.

            Returns C{None} if the trees are not equal, an integer if the trees
            are similar. If the left tree (decompositionA) should be preferred a
            negative number is returned, or a positive number for the right tree
            (decompositionB). If C{0} is returned, both trees are equally good
            to choose from.

            Raises C{ValueError} if one tree ends before the other or is
            malformed.
            """
            if not decompositionA and not decompositionB:
                # equal
                return 0
            elif not decompositionA or not decompositionB:
                # if all preceding components are the same that shouldn't happen
                raise ValueError()
            elif decompositionA[0] == decompositionB[0]:
                return compareTrees(decompositionA[1:], decompositionB[1:])

            elif (isinstance(decompositionA[0], tuple)
                  and decompositionA[0][0] == u'?'):
                # unknown component on the left, skip the corresponding
                # subtree on the right
                decompositionB = consumeComponent(decompositionB)
                result = compareTrees(decompositionA[1:], decompositionB)
                if result is None or result < 0:
                    # unequal or the left side is preferred later on
                    return None
                else:
                    return +1

            elif (isinstance(decompositionB[0], tuple)
                  and decompositionB[0][0] == u'?'):
                # unknown component on the right, skip the corresponding
                # subtree on the left
                decompositionA = consumeComponent(decompositionA)
                result = compareTrees(decompositionA, decompositionB[1:])
                if result is None or result > 0:
                    # unequal or the right side is preferred later on
                    return None
                else:
                    return -1

            elif (CharacterLookup.isIDSOperator(decompositionA[0])
                  and CharacterLookup.isIDSOperator(decompositionB[0])):
                # No way these decompositions can be equal
                #   (simplified subseq. checking)
                return None

            elif CharacterLookup.isIDSOperator(decompositionA[0]):
                # expand tree B
                char, glyph = decompositionB[0]
                if (char in decompositionEntries
                        and glyph in decompositionEntries[char]):

                    for decomposition in decompositionEntries[char][glyph]:
                        result = compareTrees(
                            decompositionA, decomposition + decompositionB[1:])
                        if result is not None and result >= 0:
                            # right side preferred and so do we...
                            #   A shorter description is better
                            return 1

                return None

            elif CharacterLookup.isIDSOperator(decompositionB[0]):
                # expand tree A
                char, glyph = decompositionA[0]
                if (char in decompositionEntries
                        and glyph in decompositionEntries[char]):

                    for decomposition in decompositionEntries[char][glyph]:
                        result = compareTrees(
                            decomposition + decompositionA[1:], decompositionB)
                        if result is not None and result <= 0:
                            # left side preferred and so do we...
                            #   A shorter description is better
                            return -1
                return None
            else:
                return None

        for char in decompositionEntries:
            for glyph in decompositionEntries[char]:
                idxA = 0
                decompositions = list(decompositionEntries[char][glyph])
                flagsDict = flagEntries[char][glyph]
                # Check every decomposition with all others to the right
                while idxA < len(decompositions):
                    idxB = idxA + 1
                    while idxB < len(decompositions):
                        try:
                            result = compareTrees(decompositions[idxA],
                                                  decompositions[idxB])
                            if result is not None and result == 0:
                                # Entries are equal, we can transfer flags
                                flagsDict[decompositions[idxA]].update(
                                    flagsDict[decompositions[idxB]])
                                del flagsDict[decompositions[idxB]]
                                del decompositions[idxB]
                            elif result is not None and result < 0:
                                # Left entry is preferred, drop the right one
                                del flagsDict[decompositions[idxB]]
                                del decompositions[idxB]
                            elif result is not None and result > 0:
                                # Right entry is preferred, drop the left one
                                del flagsDict[decompositions[idxA]]
                                del decompositions[idxA]
                                # No need for further testing for this decomp
                                break
                            else:
                                # Only increase if the list didn't shift to the
                                #   left
                                idxB += 1
                        except ValueError:
                            print >> sys.stderr, (
                                "Error comparing decompositions %s and %s"
                                % (CharacterLookup.decompositionToString(
                                    decompositions[idxA]),
                                    CharacterLookup.decompositionToString(
                                        decompositions[idxB])))\
                                    .encode(default_encoding)
                            idxB += 1
                    else:
                        # Inner loop ended without break, i.e. the entry at
                        # idxA was kept: move on. After a break the list has
                        # shifted left and idxA already points to the next one.
                        idxA += 1
                decompositionEntries[char][glyph] = set(decompositions)