コード例 #1
0
ファイル: checkcjkradicals.py プロジェクト: ninchanese/cjklib
def _simplifiedOnlyForms(cjk, cjkSimplified, radicalIdx):
    """
    Collect the radical forms for an index that only the second lookup knows.

    The result holds the Kangxi radical form reported by ``cjkSimplified``
    (when it differs from the one reported by ``cjk``) plus every variant
    form reported by ``cjkSimplified`` that ``cjk`` does not report.

    :param cjk: lookup object the forms are compared against
    :param cjkSimplified: lookup object whose extra forms are collected
    :param radicalIdx: radical index to query
    :return: set of forms, possibly empty
    """
    forms = set()
    simplifiedForm = cjkSimplified.getKangxiRadicalForm(radicalIdx)
    if simplifiedForm != cjk.getKangxiRadicalForm(radicalIdx):
        forms.add(simplifiedForm)
    forms.update(
        set(cjkSimplified.getKangxiRadicalVariantForms(radicalIdx))
        - set(cjk.getKangxiRadicalVariantForms(radicalIdx)))
    return forms


def main():
    """
    Compare a radical table read from standard input with the library data.

    Each non-comment, non-blank input line must look like
    ``<index>['];  <hex code point>;  <hex code point>``: a radical index of
    one to three digits, an optional apostrophe marking a variant entry, the
    code point of the radical form and the code point of its equivalent
    character. Lines that do not match are reported and otherwise ignored
    (they still count towards the total number of entries).

    For every entry the radical form and the equivalent character are checked
    against two ``CharacterLookup`` instances (created with ``'T'`` and
    ``'C'``). Afterwards, radical indices 1 to 214 that never appeared in the
    table are reported, and a summary line with all counters is printed.

    Messages are printed as text; the encoding of the output is left to
    ``sys.stdout``.
    """
    cjk = CharacterLookup('T')
    cjkSimplified = CharacterLookup('C')

    # Compiled once instead of on every input line.
    skipPattern = re.compile(r'\s*#|\s+$')
    entryPattern = re.compile(
        r"(\d{1,3})('?);\s+([1234567890ABCDEF]{4,5});"
        r"\s+([1234567890ABCDEF]{4,5})\s*$")
    radicalIndices = range(1, 215)

    fileEntryCount = 0
    databaseMissingEntryCount = 0
    noEntryCount = 0
    wrongEquivalentCount = 0
    seenRadicalFormIndices = set()
    seenRadicalVariantIndices = set()

    for line in sys.stdin:
        # A text-mode stdin already yields str, which has no decode();
        # only byte input (e.g. a binary stream) needs decoding.
        if isinstance(line, bytes):
            line = line.decode(default_encoding)

        # Skip comment lines and whitespace-only lines.
        if skipPattern.match(line):
            continue
        fileEntryCount += 1

        matchObj = entryPattern.match(line)
        if not matchObj:
            print("error reading line: '" + line + "'")
            continue

        index, variant, radicalCP, equivalentCP = matchObj.groups()
        radicalIdx = int(index)
        radicalForm = chr(int(radicalCP, 16))
        equivalentForm = chr(int(equivalentCP, 16))

        # Forms the library data offers for this kind of entry.
        if variant:
            seenRadicalVariantIndices.add(radicalIdx)
            targetForms = _simplifiedOnlyForms(cjk, cjkSimplified, radicalIdx)
        else:
            seenRadicalFormIndices.add(radicalIdx)
            targetForms = {cjk.getKangxiRadicalForm(radicalIdx)}

        if radicalForm not in targetForms:
            # cjklib is missing something
            print("No entry for radical form '%s' with index %d%s"
                % (radicalForm, radicalIdx, variant))
            databaseMissingEntryCount += 1

        missingFromTable = targetForms - {radicalForm}
        if missingFromTable:
            # CJKRadicals.txt is missing something
            for form in missingFromTable:
                print("Database entry '%s' with radical index %d%s"
                    % (form, radicalIdx, variant)
                    + " not included in table")
            # Counted once per table line, however many forms are missing.
            noEntryCount += 1

        # check equivalentForm
        libraryEquivalentForm \
            = cjk.getRadicalFormEquivalentCharacter(radicalForm)
        if libraryEquivalentForm != equivalentForm:
            print("Equivalent radical form '%s' with index %d%s"
                % (libraryEquivalentForm, radicalIdx, variant)
                + " not backed by table: '%s'" % equivalentForm)
            wrongEquivalentCount += 1

    # Radical indices without a plain (non-variant) table entry.
    for radicalIdx in radicalIndices:
        if radicalIdx in seenRadicalFormIndices:
            continue
        print("No table entry for radical index %d" % radicalIdx)
        noEntryCount += 1

    # Indices without a variant table entry for which the library data
    # nevertheless knows simplified-only forms.
    for radicalIdx in radicalIndices:
        if radicalIdx in seenRadicalVariantIndices:
            continue
        for form in _simplifiedOnlyForms(cjk, cjkSimplified, radicalIdx):
            print("No table entry for simplified radical '%s' with index %d"
                % (form, radicalIdx))
            noEntryCount += 1

    # Variant forms reported by the first lookup only.
    for radicalIdx in radicalIndices:
        otherVariants = set(cjk.getKangxiRadicalVariantForms(radicalIdx)) \
            - set(cjkSimplified.getKangxiRadicalVariantForms(radicalIdx))
        for form in otherVariants:
            print("No table entry for variant '%s' with index %d"
                % (form, radicalIdx))
            noEntryCount += 1

    print("Total %d entries" % fileEntryCount
        + ", %d missing from cjklib" % databaseMissingEntryCount
        + ", %d mismatches in equivalent forms" % wrongEquivalentCount
        + ", not found in source list: %d" % noEntryCount)
コード例 #2
0
ファイル: checkcjkradicals.py プロジェクト: KentVu/cjklib
def main():
    """
    Compare a radical table read from standard input with the library data.

    This variant relies on Python 2 behaviour: input lines are byte strings
    that get decoded with ``default_encoding``, code points are converted
    with ``unichr`` and every message is encoded with ``default_encoding``
    before it is printed.

    Each non-comment, non-blank input line must look like
    ``<index>['];  <hex code point>;  <hex code point>``: a radical index of
    one to three digits, an optional apostrophe marking a variant entry, the
    code point of the radical form and the code point of its equivalent
    character. Lines that do not match are reported and otherwise ignored
    (they still count towards the total number of entries).

    For every entry the radical form and the equivalent character are checked
    against two ``CharacterLookup`` instances (created with ``'T'`` and
    ``'C'``). Afterwards, radical indices 1 to 214 that never appeared in the
    table are reported, and a summary line with all counters is printed.
    """
    cjk = CharacterLookup('T')
    cjkSimplified = CharacterLookup('C')

    def simplifiedOnlyForms(radicalIdx):
        # Forms only cjkSimplified reports: its radical form when it differs
        # from cjk's, plus its variant forms that cjk does not list.
        forms = set()
        simplifiedForm = cjkSimplified.getKangxiRadicalForm(radicalIdx)
        if simplifiedForm != cjk.getKangxiRadicalForm(radicalIdx):
            forms.add(simplifiedForm)
        forms.update(
            set(cjkSimplified.getKangxiRadicalVariantForms(radicalIdx))
            - set(cjk.getKangxiRadicalVariantForms(radicalIdx)))
        return forms

    def report(message):
        # Single parenthesised argument: prints the encoded message exactly
        # like the Python 2 print statement does.
        print(message.encode(default_encoding))

    # Compiled once instead of on every input line.
    skipPattern = re.compile(r'\s*#|\s+$')
    entryPattern = re.compile(
        r"(\d{1,3})('?);\s+([1234567890ABCDEF]{4,5});"
        r"\s+([1234567890ABCDEF]{4,5})\s*$")
    radicalIndices = range(1, 215)

    fileEntryCount = 0
    databaseMissingEntryCount = 0
    noEntryCount = 0
    wrongEquivalentCount = 0
    seenRadicalFormIndices = set()
    seenRadicalVariantIndices = set()

    for line in sys.stdin:
        line = line.decode(default_encoding)

        # Skip comment lines and whitespace-only lines.
        if skipPattern.match(line):
            continue
        fileEntryCount += 1

        matchObj = entryPattern.match(line)
        if not matchObj:
            report("error reading line: '" + line + "'")
            continue

        index, variant, radicalCP, equivalentCP = matchObj.groups()
        radicalIdx = int(index)
        radicalForm = unichr(int(radicalCP, 16))
        equivalentForm = unichr(int(equivalentCP, 16))

        # Forms the library data offers for this kind of entry.
        if variant:
            seenRadicalVariantIndices.add(radicalIdx)
            targetForms = simplifiedOnlyForms(radicalIdx)
        else:
            seenRadicalFormIndices.add(radicalIdx)
            targetForms = set([cjk.getKangxiRadicalForm(radicalIdx)])

        if radicalForm not in targetForms:
            # cjklib is missing something
            report("No entry for radical form '%s' with index %d%s"
                % (radicalForm, radicalIdx, variant))
            databaseMissingEntryCount += 1

        missingFromTable = targetForms - set([radicalForm])
        if missingFromTable:
            # CJKRadicals.txt is missing something
            for form in missingFromTable:
                report("Database entry '%s' with radical index %d%s"
                    % (form, radicalIdx, variant)
                    + " not included in table")
            # Counted once per table line, however many forms are missing.
            noEntryCount += 1

        # check equivalentForm
        libraryEquivalentForm \
            = cjk.getRadicalFormEquivalentCharacter(radicalForm)
        if libraryEquivalentForm != equivalentForm:
            report("Equivalent radical form '%s' with index %d%s"
                % (libraryEquivalentForm, radicalIdx, variant)
                + " not backed by table: '%s'" % equivalentForm)
            wrongEquivalentCount += 1

    # Radical indices without a plain (non-variant) table entry.
    for radicalIdx in radicalIndices:
        if radicalIdx in seenRadicalFormIndices:
            continue
        report("No table entry for radical index %d" % radicalIdx)
        noEntryCount += 1

    # Indices without a variant table entry for which the library data
    # nevertheless knows simplified-only forms.
    for radicalIdx in radicalIndices:
        if radicalIdx in seenRadicalVariantIndices:
            continue
        for form in simplifiedOnlyForms(radicalIdx):
            report("No table entry for simplified radical '%s' with index %d"
                % (form, radicalIdx))
            noEntryCount += 1

    # Variant forms reported by the first lookup only.
    for radicalIdx in radicalIndices:
        otherVariants = set(cjk.getKangxiRadicalVariantForms(radicalIdx)) \
            - set(cjkSimplified.getKangxiRadicalVariantForms(radicalIdx))
        for form in otherVariants:
            report("No table entry for variant '%s' with index %d"
                % (form, radicalIdx))
            noEntryCount += 1

    # The summary is plain ASCII and is printed without explicit encoding.
    print("Total %d entries" % fileEntryCount
        + ", %d missing from cjklib" % databaseMissingEntryCount
        + ", %d mismatches in equivalent forms" % wrongEquivalentCount
        + ", not found in source list: %d" % noEntryCount)