def ezechiele():
    """
    Build the letter for the Ezechiele verse, preview it and save it once.

    The image is produced by ``createLetter`` from ``ezechieleData.mapping``
    and ``ezechieleData.verse`` and displayed in a window named
    'lectera minaces'.  Afterwards it is written to ``ezechieleMinace``,
    but only when no file exists at that path yet.

    :return: None
    """
    ccToTokens = ezechieleData.mapping
    phrase = ezechieleData.verse
    littera = createLetter(ccToTokens, phrase)

    imshow('lectera minaces', littera)
    waitKey(0)
    destroyAllWindows()

    if not path.exists(ezechieleMinace):
        print('writing image to ', ezechieleMinace)
        # Actually perform the write that the message announces (same
        # print-then-imwrite pattern used by symbolsClass).
        imwrite(ezechieleMinace, littera)
def testDatasetBuilderMinacesLittera_augmented():
    """
    Visual check of the substring augmentation on one annotated color word.

    Given a color word C made of L annotated tokens, every run of contiguous
    tokens of length 2..L is rendered next to its condition image.  That is
    L(L-1)/2 image pairs in total; the single run of length L is the
    original word itself.

    AnnotatedTks(C): ["e", "cc", "l", "i", "e"]    "040v/599_532_32_88.png"

        # 2  (4)   ecc [0, 1]    ccl [1, 2]    li [2, 3]    ie [3, 4]
        # 3  (3)   eccl [0, 2]   ccli [1, 3]   lie [2, 4]
        # 4  (2)   eccli [0, 3]  cclie [1, 4]
        # 5  (1)   ecclie [0, 4]

    Each pair is shown in a window and the function waits for a key press
    before moving on.  At the end it asserts that L(L-1)/2 distinct strings
    were produced.

    :return: None
    """
    inputImage = "040v/599_532_32_88.png"  # ecclie

    with open(images2ColorsBBxesJSON, "r") as f:
        tokensBBxs = load(f)[inputImage]["tks"]
    wlen = len(tokensBBxs)

    print("###")
    pprint(tokensBBxs)

    producedSubstrings = set()
    index = getIndex(indexName='baselineIndex')

    def showPair(chars, targetWrd, length, offset, **targetOpts):
        # TARGET: letter built from the tokens of the input image
        targetChars2Imgs = {targetWrd: [inputImage, chars]}
        target = createLetter(targetChars2Imgs, [targetWrd],
                              toWhitePaper=False, is256=True, **targetOpts)

        # CONDITION: letter built from the tokens returned by the index query
        condChar2Images, condOrderedComps = query(
            index, text=" ".join(chars), forceHead=len(chars) > 1)
        condition = createLetter(condChar2Images, condOrderedComps,
                                 toWhitePaper=False, is256=True, separate=False)

        a2b = hstack((target, condition))
        del target, condition

        # DISPLAY
        thisImgName = renameSubImage(inputImage, length, offset)
        imshow(thisImgName, a2b)
        waitKey(0)
        destroyAllWindows()

    for substrLen in range(2, wlen + 1):
        print("\n# ", substrLen)

        if substrLen == wlen:
            # original color word
            chars = [t[1] for t in tokensBBxs]
            targetWrd = "".join(chars)
            producedSubstrings.add(targetWrd)
            print(" ", targetWrd)
            # Only the whole-word target passes separate=False explicitly.
            showPair(chars, targetWrd, wlen, 0, separate=False)
        else:
            for offset in range(wlen - substrLen + 1):
                _tksBBxes = tokensBBxs[offset:offset + substrLen]
                chars = [t[1] for t in _tksBBxes]
                targetWrd = "".join(chars)
                print(" {} [{}, {}]".format(targetWrd, offset, offset + substrLen - 1))
                showPair(chars, targetWrd, substrLen, offset)
                producedSubstrings.add(targetWrd)

    print("\n", producedSubstrings)
    # NOTE: the set collapses equal strings found at different offsets, so
    # this count only holds for words whose substrings are all distinct
    # (true for the hard-coded "ecclie").
    assert len(producedSubstrings) == wlen * (wlen - 1) // 2
def _splitSymbolClasses(sequence):
    """Partition the symbol sequence into (lowercase, uppercase, special) lists, skipping " "."""
    lowerTks, upperTks, specialTks = [], [], []
    for el in sequence:
        if el == " ":
            continue
        if el.isupper():
            upperTks.append(el)
        elif (len(el) == 1 and el.islower() and el.isalpha()) or (len(el) == 2 and el[0] == '_'):
            # islower() must be *called*: the bare bound method is always
            # truthy, which let caseless alphabetic characters slip in here.
            lowerTks.append(el)
        else:
            specialTks.append(el)
    return lowerTks, upperTks, specialTks


def symbolsClass():
    """
    Produce 3 different (vertical) threatening letters, one for each set of
    tokens: simple lowercases, simple uppercases and specials.

    The letters are arranged vertically for convenience, since that is
    easier than a horizontal arrangement.

    The token mapping (key 'symbol') and the ordered token sequence
    (key 'alphabetSeq') are read from the JSON file ``symbols``.  A token
    counts as
      * uppercase when ``str.isupper()`` is true,
      * lowercase when it is a single lowercase letter, or a two-character
        token starting with '_',
      * special otherwise.
    The " " entry of the sequence is ignored.

    Each letter is previewed in its own window (waiting for a key press)
    and its token list printed; then each image is written to its
    ``symbolsCarolingian_*`` path unless a file already exists there, in
    which case only the path is printed.

    :return: None
    """
    with open(symbols, "r") as s:
        _symbols = load(s)
    ccToTokens = _symbols['symbol']
    sequence = _symbols['alphabetSeq']

    # SIMPLE 1-grams
    simpleSeqLower, simpleSeqUpper, specialSeq = _splitSymbolClasses(sequence)

    litteraSimpleLowerTks = createLetter(ccToTokens, simpleSeqLower, vertical=True)
    litteraSimpleUpperTks = createLetter(ccToTokens, simpleSeqUpper, vertical=True)
    litteraSpecialTks = createLetter(ccToTokens, specialSeq, vertical=True)

    groups = (
        ('littera simple l', litteraSimpleLowerTks, simpleSeqLower,
         symbolsCarolingian_simpleLower),
        ('littera simple u', litteraSimpleUpperTks, simpleSeqUpper,
         symbolsCarolingian_simpleUpper),
        ('littera specials', litteraSpecialTks, specialSeq,
         symbolsCarolingian_special),
    )

    # Preview every letter first ...
    for title, image, tokens, _outPath in groups:
        imshow(title, image)
        waitKey(0)
        destroyAllWindows()
        print(tokens)

    # ... then save the images with simple/special tokens.
    for _title, image, _tokens, outPath in groups:
        if not path.exists(outPath):
            print("writing ", outPath)
            imwrite(outPath, image)
        else:
            print(outPath)
def _fitTarget256(target, tokensBBxes):
    """Paste the cropped word image ``target`` onto a blank 256x256 uint8 canvas."""
    canvas = zeros((256, 256), dtype=uint8)
    # Largest (tk[0][4] - meanHeight) among the tokens flagged by
    # goesBelowLine, 0 when there are no tokens.
    # NOTE(review): tk[0][4] is presumably the token height -- confirm.
    hasBigChar = int(
        max([(tk[0][4] - meanHeight) * goesBelowLine(tk[1]) for tk in tokensBBxes] or [0.0]))
    height, width = target.shape[0], target.shape[1]
    yOff = 126 + round(3 - height + hasBigChar)
    xOff = round((256 - width) / 2)
    canvas[yOff:yOff + height, xOff:xOff + width] = \
        bitwise_or(canvas[yOff:yOff + height, xOff:xOff + width], target)
    return canvas


def _assertDatasetShapes(target256, conditionAttached, conditionSeparate,
                         a2bAttached, a2bSeparate):
    """Check that single images are 256x256 and paired images are 256x512."""
    assert conditionAttached.shape == (256, 256)
    assert conditionSeparate.shape == (256, 256)
    assert target256.shape == (256, 256)
    assert a2bAttached.shape == (256, 256 * 2)
    assert a2bSeparate.shape == (256, 256 * 2)


def _writeDatasetImages(imName, outputs):
    """Write every (directory, image) pair as directory/imName; raise if any write fails."""
    statuses = [imwrite(path.join(outDir, imName), image) for outDir, image in outputs]
    # The statuses must be tested one by one: a non-empty tuple of them is
    # always truthy, so "not (s1, s2, ...)" can never detect a failure.
    if not all(statuses):
        print(imName)
        raise Exception('Not written\n')


def datasetBuilderMinaceLetter(trainSetProp=0.9):
    """
    Build the paired/unpaired train and test image sets.

    The hole-free word images listed in ``transcriptedWords_holesFree`` are
    shuffled; the first ``round(N * trainSetProp)`` go to the training set,
    the rest to the test set.  For every word:

      * the target is obtained with ``getAnnotatedBBxes`` from the word
        image and placed on a 256x256 canvas;
      * two 256x256 condition images are built with ``createLetter``
        ("attached" and "separate");
      * the 256x512 target|condition pairs, the two conditions and the
        target alone are written to the corresponding output directories.

    Output files are named ``<numTokens>##<page>#<image>``.  Words for which
    a ``SizeException`` is raised are reported and skipped.

    :param trainSetProp: fraction of the words used for the training set
    :raises Exception: when any image write reports failure (assumes
        ``imwrite`` returns a falsy status on failure, as OpenCV's does)
    :return: None
    """
    index = getIndex(indexName='baselineIndex')

    with open(images2ColorsBBxesJSON, 'r') as w, open(transcriptedWords_holesFree, 'r') as hf:
        wordsNColors = load(w)
        holesFree = set(load(hf))  # no holes between tokens

    totNumWords = len(holesFree)
    # Sampling needs a sequence: random.sample rejects sets on Python >= 3.11.
    imagesShuffled = sample(sorted(holesFree), totNumWords)
    trainingSetSize = round(totNumWords * trainSetProp)

    def len2Image(imageName):
        page, image = imageName.split('/')
        return str(len(wordsNColors[imageName]['tks'])) + '##' + page + '#' + image

    print('\n\n#############################')
    print('TRAINING SET BUILDING')
    print('#############################\n')

    c = 1
    for imTrain in imagesShuffled[:trainingSetSize]:
        try:
            imName = len2Image(imTrain)

            # TARGET = manuscript/real image
            targetColors = wordsNColors[imTrain]["col"]
            targetBBxes = wordsNColors[imTrain]['tks']
            targetImage = imread(color_words + '/' + imTrain)
            target = getAnnotatedBBxes(targetImage, targetColors, targetBBxes, keepSize=True)
            # fitting the 256x256 format
            target256 = _fitTarget256(target, targetBBxes)

            tokensList = [t[1] for t in targetBBxes]

            # CONDITION = sketch/letter
            # CONDITION 1 = minaces littera with singly taken tokens attached one another
            char2Images, orderedComps = query(
                index, text=" ".join(tokensList), forceHead=len(tokensList) > 1)
            conditionAttached = createLetter(char2Images, orderedComps,
                                             toWhitePaper=False, is256=True, separate=False)
            a2bAttached = hstack((target256, conditionAttached))

            # CONDITION 2 = minaces littera with singly taken tokens separated by spaces
            conditionSeparate = createLetter(char2Images, orderedComps,
                                             toWhitePaper=False, is256=True, separate=True)
            a2bSeparate = hstack((target256, conditionSeparate))

            # check sizes
            _assertDatasetShapes(target256, conditionAttached, conditionSeparate,
                                 a2bAttached, a2bSeparate)

            # writing out
            _writeDatasetImages(imName, (
                (trainDirAttached, a2bAttached),
                (trainDirAttachedUnpaired, conditionAttached),
                (trainDirSeparate, a2bSeparate),
                (trainDirSeparateUnpaired, conditionSeparate),
                (targetDirUnpaired, target256),
            ))

            # freeing memory
            del a2bAttached, a2bSeparate, target, target256, conditionAttached, conditionSeparate

            if c % 50 == 0:
                print(' ----- TRAINIG SET number of processed images: ', c)
            c += 1
        except SizeException as s:
            print(s)

    print('\n\n#############################')
    print('TEST SET BUILDING')
    print('#############################\n')

    c = 1
    for imTest in imagesShuffled[trainingSetSize:]:
        try:
            imName = len2Image(imTest)

            # TARGET = manuscript/real image
            targetColors = wordsNColors[imTest]["col"]
            targetBBxes = wordsNColors[imTest]["tks"]
            targetImage = imread(color_words + '/' + imTest)
            # NOTE(review): unlike the training loop, keepSize is not passed
            # here -- confirm the asymmetry is intended.
            target = getAnnotatedBBxes(targetImage, targetColors, targetBBxes)
            # fitting the 256x256 format
            target256 = _fitTarget256(target, targetBBxes)

            tokensList = [t[1] for t in targetBBxes]

            # CONDITION = sketch/letter
            # CONDITION 1 = minaces littera with ligatures
            # NOTE(review): the training loop builds its "attached" condition
            # from the space-joined query with separate=False instead.
            char2Images, orderedComps = query(index, text="".join(tokensList))
            conditionAttached = createLetter(char2Images, orderedComps,
                                             toWhitePaper=False, is256=True)
            a2bAttached = hstack((target256, conditionAttached))

            # CONDITION 2 = minaces littera without ligatures
            char2Images, orderedComps = query(
                index, text=" ".join(tokensList), forceHead=len(tokensList) > 1)
            conditionSeparate = createLetter(char2Images, orderedComps,
                                             toWhitePaper=False, is256=True, separate=True)
            a2bSeparate = hstack((target256, conditionSeparate))

            # check sizes
            _assertDatasetShapes(target256, conditionAttached, conditionSeparate,
                                 a2bAttached, a2bSeparate)

            # writing out
            # NOTE(review): 'testDirUnpaire' is kept as spelled in the
            # original; verify it matches the name defined elsewhere.
            _writeDatasetImages(imName, (
                (testDirAttached, a2bAttached),
                (testDirAttachedUnpaired, conditionAttached),
                (testDirSeparate, a2bSeparate),
                (testDirSeparateUnpaired, conditionSeparate),
                (testDirUnpaire, target256),
            ))

            # freeing memory
            del a2bAttached, a2bSeparate, target, target256, conditionAttached, conditionSeparate

            if c % 50 == 0:
                print(' ----- TEST SET number of processed images: ', c)
            c += 1
        except SizeException as s:
            print(s)
def ezechieleTest256(ccToTokens=ezechieleData.mapping, phrase=ezechieleData.verse):
    """
    Preview the letter that ``createLetter`` builds with ``is256=True``.

    The image is shown in a window named 'lectera minaces'; the function
    waits for a key press and then closes all windows.

    :param ccToTokens: mapping handed to ``createLetter``
        (defaults to ``ezechieleData.mapping``)
    :param phrase: sequence handed to ``createLetter``
        (defaults to ``ezechieleData.verse``)
    :return: None
    """
    windowName = 'lectera minaces'
    imshow(windowName, createLetter(ccToTokens, phrase, is256=True))
    waitKey(0)
    destroyAllWindows()
def datasetBuilderMinacesLittera_augmented(trainSetProp=0.9):
    """
    Build the augmented training pairs (whole words plus their substrings).

    The hole-free word images listed in ``transcriptedWords_holesFree`` are
    shuffled and the first ``round(N * trainSetProp)`` of them are used.
    For each word with more than one token, every run of contiguous tokens
    of length 2..L is turned into a 256x512 target|condition image and
    written to ``menaceLetterDatasetTRAIN`` under the name returned by
    ``renameSubImage(image, length, offset)``.

    The substring branch follows the procedure exercised by
    ``testDatasetBuilderMinacesLittera_augmented``.  Only the training
    portion is produced here; the remaining images are not written.
    Runs for which a ``SizeException`` is raised are reported and skipped.

    :param trainSetProp: fraction of the words used for the training set
    :return: None
    """
    index = getIndex(indexName='baselineIndex')

    with open(images2ColorsBBxesJSON, 'r') as w, open(transcriptedWords_holesFree, 'r') as hf:
        wordsNColors = load(w)
        holesFree = set(load(hf))  # no holes between tokens

    # TRAIN/TEST SET SPLITTING
    # The loop below needs imagesShuffled/trainingSetSize, so they must be
    # defined; sampling from a sorted list because random.sample rejects
    # sets on Python >= 3.11.
    totNumWords = len(holesFree)
    imagesShuffled = sample(sorted(holesFree), totNumWords)
    trainingSetSize = round(totNumWords * trainSetProp)

    print('\n\n#############################')
    print('TRAINING SET BUILDING')
    print('#############################\n')

    c = 1
    for imTrain in imagesShuffled[:trainingSetSize]:
        # Token annotations are stored per image, keyed by the image name.
        tokensBBxs = wordsNColors[imTrain]["tks"]
        wlen = len(tokensBBxs)

        # Empty for single-token words, which are therefore skipped.
        for substrLen in range(2, wlen + 1):
            isWholeWord = substrLen == wlen

            for offset in range(wlen - substrLen + 1):
                try:
                    chars = [t[1] for t in tokensBBxs[offset:offset + substrLen]]
                    targetWrd = "".join(chars)

                    # TARGET
                    # tokens to crop
                    targetChars2Imgs = {targetWrd: [imTrain, chars]}
                    # menace letter; only the whole word passes
                    # separate=False explicitly, as in the test routine
                    if isWholeWord:
                        target = createLetter(targetChars2Imgs, [targetWrd],
                                              toWhitePaper=False, is256=True, separate=False)
                    else:
                        target = createLetter(targetChars2Imgs, [targetWrd],
                                              toWhitePaper=False, is256=True)

                    # CONDITION
                    condChar2Images, condOrderedComps = query(
                        index, text=" ".join(chars), forceHead=len(chars) > 1)
                    condition = createLetter(condChar2Images, condOrderedComps,
                                             toWhitePaper=False, is256=True, separate=False)

                    a2b = hstack((target, condition))
                    assert a2b.shape == (256, 256 * 2)
                    del target, condition

                    # WRITE OUT
                    thisImgName = renameSubImage(imTrain, substrLen, offset)
                    a2bWriteStatus = imwrite(
                        path.join(menaceLetterDatasetTRAIN, thisImgName), a2b)
                    assert a2bWriteStatus
                    del a2b

                    if c % 50 == 0:
                        print(' ----- TRAINIG SET number of processed images: ', c)
                    c += 1
                except SizeException as s:
                    print(s)