Exemple #1
0
def ezechiele():
    """
    Compose the letter image for the Ezechiele verse and preview it.

    The image is produced by ``createLetter`` from ``ezechieleData.mapping``
    and ``ezechieleData.verse`` and displayed under the window title
    'lectera minaces'; the function then calls ``waitKey(0)`` and closes all
    windows. Finally a message is printed when ``ezechieleMinace`` is not yet
    present on disk.

    :return: None
    """
    letterImage = createLetter(ezechieleData.mapping, ezechieleData.verse)

    # Preview the composed image.
    imshow('lectera minaces', letterImage)
    waitKey(0)
    destroyAllWindows()

    # NOTE(review): only the message is printed; no write call is visible in
    # this block -- confirm whether the image is meant to be saved here.
    alreadyOnDisk = path.exists(ezechieleMinace)
    if not alreadyOnDisk:
        print('writing image to ', ezechieleMinace)
Exemple #2
0
def testDatasetBuilderMinacesLittera_augmented():
    """
    Visual check of the substring augmentation on one annotated color word.

    Every contiguous window of at least two tokens is rendered next to the
    letter rebuilt from the index for the same tokens; the full word is the
    last window. A word of L tokens therefore yields L(L-1)/2 side-by-side
    images in total (full word included), which is what the final assertion
    checks on the set of distinct produced words.

    AnnotatedTks(C): ["e", "cc", "l", "i", "e"]
    "040v/599_532_32_88.png"
    #  2 (4)
            ecc [0, 1]
            ccl [1, 2]
            li [2, 3]
            ie [3, 4]
    #  3 (3)
            eccl [0, 2]
            ccli [1, 3]
            lie [2, 4]
    #  4 (2)
            eccli [0, 3]
            cclie [1, 4]
    #  5 (1)
            ecclie [0, 4]
    :return: None
    """
    wordImage = "040v/599_532_32_88.png"  # ecclie

    with open(images2ColorsBBxesJSON, "r") as annotations:
        entry = load(annotations)[wordImage]

    # The colors are read (as in the original flow) but not used afterwards.
    _colors = entry["col"]
    tokenBoxes = entry["tks"]
    tokenCount = len(tokenBoxes)

    print("###")
    pprint(tokenBoxes)
    seenWords = set()

    index = getIndex(indexName='baselineIndex')

    def _conditionFor(tokens):
        # Letter rebuilt from the index for the space-separated tokens.
        char2Images, orderedComps = query(index,
                                          text=" ".join(tokens),
                                          forceHead=len(tokens) > 1)
        return createLetter(char2Images,
                            orderedComps,
                            toWhitePaper=False,
                            is256=True,
                            separate=False)

    def _preview(title, pairImage):
        # Show one target|condition pair and wait on waitKey(0).
        imshow(title, pairImage)
        waitKey(0)
        destroyAllWindows()

    for windowLen in range(2, tokenCount + 1):
        print("\n# ", windowLen)

        if windowLen != tokenCount:
            # Proper substrings: slide the window over the token list.
            for start in range(tokenCount - windowLen + 1):
                tokens = [box[1] for box in tokenBoxes[start:start + windowLen]]
                word = "".join(tokens)
                print("       {}   [{}, {}]".format(word, start,
                                                    start + windowLen - 1))

                # TARGET
                target = createLetter({word: [wordImage, tokens]},
                                      [word],
                                      toWhitePaper=False,
                                      is256=True)

                # CONDITION
                condition = _conditionFor(tokens)
                a2b = hstack((target, condition))
                del target, condition

                # DISPLAY
                _preview(renameSubImage(wordImage, windowLen, start), a2b)

                seenWords.add(word)
            continue

        # Original color word: the single window covering every token.
        tokens = [box[1] for box in tokenBoxes]
        word = "".join(tokens)
        seenWords.add(word)
        print("       ", word)

        # TARGET
        target = createLetter({word: [wordImage, tokens]}, [word],
                              toWhitePaper=False,
                              is256=True,
                              separate=False)

        # CONDITION
        condition = _conditionFor(tokens)
        a2b = hstack((target, condition))
        del target, condition

        # DISPLAY
        _preview(renameSubImage(wordImage, tokenCount, 0), a2b)

    print("\n", seenWords)
    assert len(seenWords) == int(tokenCount * (tokenCount - 1) / 2.0)
def _partitionSymbols(sequence):
    """Split the symbol sequence into (lowercase, uppercase, special) lists, skipping " " entries."""
    lowerTks, upperTks, specialTks = [], [], []
    for el in sequence:
        if el == " ":
            continue
        if el.isupper():
            upperTks.append(el)
        # islower must be *called*: the bare bound method is always truthy,
        # which silently disabled the lowercase check.
        elif (len(el) == 1 and el.islower() and el.isalpha()) or (len(el) == 2 and el[0] == '_'):
            lowerTks.append(el)
        else:
            specialTks.append(el)
    return lowerTks, upperTks, specialTks


def symbolsClass():
    """
    Produces 3 different (vertical) threatening letters, one for each set of tokens:
    lowercases, simple uppercases and specials.

    The letters are vertical for convenience, since that is easier than a horizontal arrangement.

    eg.
            a       _split()
            b       _s
            c       _s

            ;       _s
            .       _s
            ,       _s

    The token sets come from the 'alphabetSeq' entry of the file at ``symbols``;
    the 'symbol' entry is passed to ``createLetter`` as the token mapping.
    Each letter is previewed (followed by a print of its token list) and then
    written to its ``symbolsCarolingian_*`` path unless that path already
    exists, in which case only the path is printed.

    :return: None
    """
    with open(symbols, "r") as s:
        _symbols = load(s)

    ccToTokens = _symbols['symbol']
    sequence = _symbols['alphabetSeq']

    # SIMPLE 1-grams
    simpleSeqLower, simpleSeqUpper, specialSeq = _partitionSymbols(sequence)

    litteraSimpleLowerTks = createLetter(ccToTokens, simpleSeqLower, vertical=True)
    litteraSimpleUpperTks = createLetter(ccToTokens, simpleSeqUpper, vertical=True)
    litteraSpecialTks = createLetter(ccToTokens, specialSeq, vertical=True)

    # Preview each letter, then print the tokens it contains.
    previews = (
        ('littera simple l', litteraSimpleLowerTks, simpleSeqLower),
        ('littera simple u', litteraSimpleUpperTks, simpleSeqUpper),
        ('littera specials', litteraSpecialTks, specialSeq),
    )
    for title, image, tokens in previews:
        imshow(title, image)
        waitKey(0)
        destroyAllWindows()
        print(tokens)

    # Saving images with simple/special tokens (existing files are never overwritten)
    outputs = (
        (symbolsCarolingian_simpleLower, litteraSimpleLowerTks),
        (symbolsCarolingian_simpleUpper, litteraSimpleUpperTks),
        (symbolsCarolingian_special, litteraSpecialTks),
    )
    for outPath, image in outputs:
        if not path.exists(outPath):
            print("writing ", outPath)
            imwrite(outPath, image)
        else:
            print(outPath)
def _placeOn256Canvas(target, targetBBxes):
    """Paste ``target`` onto a blank 256x256 uint8 canvas and return the canvas."""
    canvas = zeros((256, 256), dtype=uint8)
    # Largest (tk[0][4] - meanHeight) * goesBelowLine(token) over the tokens;
    # 0.0 is used only when the token list is empty.
    # presumably tk[0][4] is the token's box height -- TODO confirm
    hasBigChar = int(
        max([(tk[0][4] - meanHeight) * goesBelowLine(tk[1])
             for tk in targetBBxes] or [0.0]))
    # The bottom edge of the pasted image lands on row 129 + hasBigChar.
    yOff = 126 + round(3 - target.shape[0] + hasBigChar)
    # Horizontally centred.
    xOff = round((256 - target.shape[1]) / 2)

    rows = slice(yOff, yOff + target.shape[0])
    cols = slice(xOff, xOff + target.shape[1])
    canvas[rows, cols] = bitwise_or(canvas[rows, cols], target)
    return canvas


def _ensureWritten(imName, statuses):
    """Raise if any of the write statuses is falsy, printing the image name first."""
    # all() is required here: testing the tuple itself is always truthy, so a
    # failed write would never be noticed.
    if not all(statuses):
        print(imName)
        raise Exception('Not written\n')


def datasetBuilderMinaceLetter(trainSetProp=0.9):
    """
    Build the paired/unpaired train and test image sets for the menace-letter dataset.

    The hole-free word images listed in ``transcriptedWords_holesFree`` are
    shuffled and split; the first ``round(N * trainSetProp)`` go to the
    training set and the rest to the test set. For every image:

    * the annotated target is extracted with ``getAnnotatedBBxes`` and pasted
      onto a 256x256 canvas;
    * two conditions are built with ``query`` + ``createLetter`` ("attached"
      and "separate");
    * target|condition pairs (256x512), the bare conditions and the bare
      target are written with ``imwrite``.

    Training output goes to ``trainDirAttached``, ``trainDirAttachedUnpaired``,
    ``trainDirSeparate``, ``trainDirSeparateUnpaired`` and ``targetDirUnpaired``;
    test output goes to ``testDirAttached``, ``testDirAttachedUnpaired``,
    ``testDirSeparate``, ``testDirSeparateUnpaired`` and ``testDirUnpaire``.

    :param trainSetProp: fraction of the images assigned to the training set.
    :return: None
    :raises Exception: when any ``imwrite`` call reports a falsy status.
    :raises AssertionError: when an image does not have the expected shape.

    A ``SizeException`` raised while processing an image is printed and that
    image is skipped.
    """
    index = getIndex(indexName='baselineIndex')

    # Both files are consumed right here, so the handles are released before
    # the long-running loops (assumes ``load`` returns fully materialised data,
    # as json.load does -- confirm against the file's imports).
    with open(images2ColorsBBxesJSON,
              'r') as w, open(transcriptedWords_holesFree, 'r') as hf:
        wordsNColors = load(w)
        holesFree = set(load(hf))  # no holes between tokens

    totNumWords = len(holesFree)
    # A sequence is passed on purpose: random.sample no longer accepts sets
    # (TypeError since Python 3.11). Presumably ``sample`` is random.sample.
    imagesShuffled = sample(list(holesFree), totNumWords)
    trainingSetSize = round(totNumWords * trainSetProp)

    # testSetSize = totNumWords - trainingSetSize

    def len2Image(imageName):
        # "<page>/<image>" -> "<number of tokens>##<page>#<image>"
        page, image = imageName.split('/')
        return str(len(
            wordsNColors[imageName]['tks'])) + '##' + page + '#' + image

    print('\n\n#############################')
    print('TRAINING SET BUILDING')
    print('#############################\n')
    c = 1
    for imTrain in imagesShuffled[:trainingSetSize]:
        try:
            imName = len2Image(imTrain)
            # TARGET = manuscript/real image
            targetColors = wordsNColors[imTrain]["col"]
            targetBBxes = wordsNColors[imTrain]['tks']
            targetImage = imread(color_words + '/' + imTrain)
            target = getAnnotatedBBxes(targetImage,
                                       targetColors,
                                       targetBBxes,
                                       keepSize=True)
            #         fitting the 256x256 format
            target256 = _placeOn256Canvas(target, targetBBxes)

            tokensList = [t[1] for t in targetBBxes]
            #
            # CONDITION = sketch/letter
            #       CONDITION 1 = minaces littera with singly taken tokens attached one another
            char2Images, orederedComps = query(
                index,
                text=" ".join(tokensList),
                forceHead=len(tokensList) > 1)
            conditionAttached = createLetter(char2Images,
                                             orederedComps,
                                             toWhitePaper=False,
                                             is256=True,
                                             separate=False)
            a2bAttached = hstack((target256, conditionAttached))
            #
            #       CONDITION 2 = minaces littera with singly taken tokens separated by spaces
            conditionSeparate = createLetter(char2Images,
                                             orederedComps,
                                             toWhitePaper=False,
                                             is256=True,
                                             separate=True)
            a2bSeparate = hstack((target256, conditionSeparate))

            # check sizes
            assert conditionAttached.shape == (256, 256)
            assert conditionSeparate.shape == (256, 256)
            assert target256.shape == (256, 256)
            assert a2bAttached.shape == (256, 256 * 2)
            assert a2bSeparate.shape == (256, 256 * 2)

            # writing out
            writtenTrainAttached = imwrite(
                path.join(trainDirAttached, imName), a2bAttached)
            writtenTrainAttachedUnpaired = imwrite(
                path.join(trainDirAttachedUnpaired, imName),
                conditionAttached)

            writtenTrainSeparate = imwrite(
                path.join(trainDirSeparate, imName), a2bSeparate)
            writtenTrainSeparateUnpaired = imwrite(
                path.join(trainDirSeparateUnpaired, imName),
                conditionSeparate)

            writtenTargetUnpaired = imwrite(
                path.join(targetDirUnpaired, imName), target256)

            _ensureWritten(imName,
                           (writtenTrainSeparate, writtenTrainSeparateUnpaired,
                            writtenTrainAttached, writtenTrainAttachedUnpaired,
                            writtenTargetUnpaired))

            # freeing memory
            del a2bAttached, a2bSeparate, target, target256, conditionAttached, conditionSeparate

            if c % 50 == 0:
                print(' -----  TRAINIG SET number of processed images: ',
                      c)
            c += 1

        except SizeException as s:
            # The image is reported and skipped.
            print(s)

    print('\n\n#############################')
    print('TEST SET BUILDING')
    print('#############################\n')
    c = 1
    for imTest in imagesShuffled[trainingSetSize:]:
        try:
            imName = len2Image(imTest)

            # TARGET = manuscript/real image
            targetColors = wordsNColors[imTest]["col"]
            targetBBxes = wordsNColors[imTest]["tks"]
            targetImage = imread(color_words + '/' + imTest)
            # NOTE(review): unlike the training loop, keepSize is not passed here -- confirm intended.
            target = getAnnotatedBBxes(targetImage, targetColors,
                                       targetBBxes)
            #         fitting the 256x256 format
            target256 = _placeOn256Canvas(target, targetBBxes)

            tokensList = [t[1] for t in targetBBxes]
            #
            # CONDITION = sketch/letter
            #       CONDITION 1 = minaces littera with ligatures
            char2Images, orederedComps = query(index,
                                               text="".join(tokensList))
            conditionAttached = createLetter(char2Images,
                                             orederedComps,
                                             toWhitePaper=False,
                                             is256=True)
            a2bAttached = hstack((target256, conditionAttached))
            #
            #       CONDITION 2 = minaces littera without ligatures
            char2Images, orederedComps = query(
                index,
                text=" ".join(tokensList),
                forceHead=len(tokensList) > 1)
            conditionSeparate = createLetter(char2Images,
                                             orederedComps,
                                             toWhitePaper=False,
                                             is256=True,
                                             separate=True)
            a2bSeparate = hstack((target256, conditionSeparate))

            # check sizes
            assert conditionAttached.shape == (256, 256)
            assert conditionSeparate.shape == (256, 256)
            assert target256.shape == (256, 256)
            assert a2bAttached.shape == (256, 256 * 2)
            assert a2bSeparate.shape == (256, 256 * 2)

            # writing out
            writtenTestAttached = imwrite(
                path.join(testDirAttached, imName), a2bAttached)
            writtenTestAttachedUnpaired = imwrite(
                path.join(testDirAttachedUnpaired, imName),
                conditionAttached)

            writtenTestSeparate = imwrite(
                path.join(testDirSeparate, imName), a2bSeparate)
            writtenTestSeparateUnpaired = imwrite(
                path.join(testDirSeparateUnpaired, imName),
                conditionSeparate)

            writtenTargetUnpaired = imwrite(
                path.join(testDirUnpaire, imName), target256)

            _ensureWritten(imName,
                           (writtenTestAttached, writtenTestAttachedUnpaired,
                            writtenTestSeparate, writtenTestSeparateUnpaired,
                            writtenTargetUnpaired))

            # freeing memory
            del a2bAttached, a2bSeparate, target, target256, conditionAttached, conditionSeparate

            if c % 50 == 0:
                print(' -----  TEST SET number of processed images: ', c)
            c += 1

        except SizeException as s:
            # The image is reported and skipped.
            print(s)
def ezechieleTest256(ccToTokens=ezechieleData.mapping,
                     phrase=ezechieleData.verse):
    """
    Preview the letter built by ``createLetter`` with ``is256=True``.

    :param ccToTokens: token mapping handed to ``createLetter`` (defaults to ``ezechieleData.mapping``).
    :param phrase: text handed to ``createLetter`` (defaults to ``ezechieleData.verse``).
    :return: None
    """
    letter256 = createLetter(ccToTokens, phrase, is256=True)

    # Show the result under the usual window title, wait on waitKey(0), then close.
    imshow('lectera minaces', letter256)
    waitKey(0)
    destroyAllWindows()
def datasetBuilderMinacesLittera_augmented(trainSetProp=0.9):
    """
    Build the training images of the substring-augmented menace-letter dataset.

    For each training image with more than one token, and for each window
    length from 2 up to the number of tokens, a target|condition pair
    (256x512) is meant to be written into ``menaceLetterDatasetTRAIN`` under
    the name returned by ``renameSubImage``. In the code visible here only the
    full-word case (window length == number of tokens) is implemented.

    NOTE(review): as it stands this block is unfinished -- see the review
    notes inline (empty ``else`` branch, names that are never assigned,
    ``trainSetProp`` only referenced in a commented-out line).

    :param trainSetProp: intended training-set proportion (currently unused by live code).
    :return: None
    """
    index = getIndex(indexName='baselineIndex')

    with open(images2ColorsBBxesJSON, 'r') as w, open(transcriptedWords_holesFree, 'r') as hf:
        wordsNColors = load(w)
        holesFree = set(load(hf))  # no holes between tokens

        # TRAIN/TEST SET SPLITTING
        # One (image, length) pair per window length: lengths 2..n for words
        # with more than one token, and the single length 1 otherwise.
        # NOTE(review): imgIndices2len is not used anywhere below.
        imgIndices2len = [(hfImg, l) for hfImg in holesFree
                          for l in range(2 if len(wordsNColors[hfImg]["tks"]) > 1 else 1, len(wordsNColors[hfImg]["tks"])+1)]
        # NOTE(review): the three assignments below are commented out, yet the
        # training loop still reads imagesShuffled and trainingSetSize.
        # totNumWords = len(holesFree)
        # imagesShuffled = sample(holesFree, totNumWords)
        #trainingSetSize = round(totNumWords * trainSetProp)

        print('\n\n#############################')
        print('TRAINING SET BUILDING')
        print('#############################\n')
        c = 1  # counter of processed images, used for the progress message

        for imTrain in imagesShuffled[:trainingSetSize]:
            # NOTE(review): wordsNColors is indexed without the image key here;
            # the sibling builders use wordsNColors[<image>]["col"/"tks"] -- confirm.
            colors, tokensBBxs = wordsNColors["col"], wordsNColors["tks"]
            wlen = len(tokensBBxs)

            if wlen > 1:
                for substrLen in range(2, wlen + 1):
                    try:
                        # original color word
                        if substrLen == wlen:
                            chars = [t[1] for t in tokensBBxs]
                            targetWrd = "".join(chars)
                            # TARGET
                            #       tokens to crop (all)
                            targetChars2Imgs = {targetWrd: [imTrain, chars]}
                            #       menace letter
                            target = createLetter(targetChars2Imgs, [targetWrd], toWhitePaper=False, is256=True,
                                                  separate=False)

                            # CONDITION: letter rebuilt from the index for the space-separated tokens
                            condChar2Images, condOrderedComps = query(index, text=" ".join(chars), forceHead=len(chars) > 1)
                            condition = createLetter(condChar2Images, condOrderedComps, toWhitePaper=False, is256=True,
                                                     separate=False)
                            # target on the left, condition on the right
                            a2b = hstack((target, condition))
                            assert a2b.shape == (256, 256 * 2)

                            del target, condition
                            # WRITE OUT
                            thisImgName = renameSubImage(imTrain, wlen, 0)
                            a2bWriteStatus = imwrite(path.join(menaceLetterDatasetTRAIN, thisImgName), a2b)
                            assert  a2bWriteStatus

                            del a2b

                            if c % 50 == 0:
                                print(' -----  TRAINIG SET number of processed images: ', c)
                            c += 1

                        # SUBSTRINGS
                        else:
                            # NOTE(review): this branch has no statements, so the
                            # block does not parse as written; the substring
                            # handling still has to be filled in.


                    except SizeException as s:
                        # The failure is printed and the loop moves on.
                        print(s)
                        pass