Python getText Exemples, checkwordsegmentations.getText Python Exemples

Exemple #1

0

Afficher le fichier

def calculateErrorRate(groundTruthPath="NewDataset/text/", predictedPath="OutputTextFiles/", statsFile="CER.txt"):
    """Write per-file and average character error rates (CER) to ``statsFile``.

    Every entry listed in ``predictedPath`` is compared with the entry of the
    same name in ``groundTruthPath``.  Both texts are obtained by joining the
    result of ``getText`` with single spaces, and the CER of a file is the
    Levenshtein distance between them divided by the reference length.

    Args:
        groundTruthPath: Directory holding the reference text files.
        predictedPath: Directory holding the predicted text files; its listing
            decides which files are scored.
        statsFile: Path of the report to (over)write.

    The report ends with a ``Total CER`` line holding the mean of the per-file
    rates, or ``nan`` when ``predictedPath`` contains no entries.
    """
    # List the predictions before opening the report so that an unreadable
    # directory does not leave a truncated stats file behind.
    files = os.listdir(predictedPath)
    totalError = 0
    with open(statsFile, 'w') as f:
        for file in files:
            # os.path.join gives the same result as plain concatenation when
            # the directory ends with a separator, and also works without one.
            realText = ' '.join(getText(os.path.join(groundTruthPath, file)))
            predictedText = ' '.join(getText(os.path.join(predictedPath, file)))
            # An empty reference would divide by zero; normalise by at least 1
            # so the rate then equals the raw edit distance.
            error = levenshtein_distance(realText, predictedText) / max(len(realText), 1)
            totalError += error
            f.write(f"file: {file}\t\tCER: {error}\n")
        # The mean over zero files is undefined; report nan rather than crash.
        averageError = totalError / len(files) if files else float('nan')
        f.write(f"Total CER: {averageError}")

Exemple #2

0

Afficher le fichier

Fichier : ocr.py Projet : omarsgalal/Arabic-OCR

def img2txt(imgName, txtPath, outfile):
    """Recognise the text in an image, save it and score it against a reference.

    The image is segmented with ``img2Chars``, every character image is turned
    into a feature vector with ``prepareCharImg`` and classified by the
    module-level ``model``.  Predicted codes are regrouped into words, decoded
    through ``chars_decode`` and written to ``outfile`` with ``writeText``.

    Args:
        imgName: Path of the image to read.
        txtPath: Path of the reference text, read with ``getText``.
        outfile: Path the recognised text is written to.

    Returns:
        A ``(accuracy, accuracy_ngram)`` tuple where each value is
        ``1 - edit_distance / len(reference)``.  Post-processing is currently
        a pass-through, so both values are equal.

    Raises:
        FileNotFoundError: If OpenCV could not read ``imgName``.
    """
    img = cv2.imread(imgName)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError(f"could not read image: {imgName}")

    print("Segmenting image to characters ...")
    segmentedWords = img2Chars(img)

    # Number of character images in each word; used below to cut the flat
    # prediction sequence back into words.
    spaces = [len(word[2]) for word in segmentedWords]

    # Flat list of character images across all words.
    segmentedChars = [c for word in segmentedWords for c in word[2]]

    print("\nExtracting features for characters ...")
    features = [prepareCharImg(c) for c in segmentedChars]

    print("\nPredicting characters ...")
    predictions = model.predict(features)

    # Regroup the predictions word by word using running offsets.
    wordsEncodes = []
    start = 0
    for space in spaces:
        wordsEncodes.append(predictions[start:start + space])
        start += space

    # Decode every word from character codes to text.
    textList = [
        ''.join(chars_decode[charCode] for charCode in wordEncoding)
        for wordEncoding in wordsEncodes
    ]
    finalText = ' '.join(textList)

    print("\npost processing ...")
    # No post-processing is applied at the moment; the "n-gram" text is the
    # raw prediction.
    withNGram = finalText

    writeText(outfile, finalText)

    originalTxt = getText(txtPath)
    # Other callers in this code base join getText's result with spaces, so it
    # appears to return a list of words.  Compare character sequences on both
    # sides instead of characters against whole words.
    if not isinstance(originalTxt, str):
        originalTxt = ' '.join(originalTxt)
    # Avoid dividing by zero on an empty reference.
    referenceLength = max(len(originalTxt), 1)
    error = editdistance.eval(finalText, originalTxt) / referenceLength
    errorNgram = editdistance.eval(withNGram, originalTxt) / referenceLength
    print('acc: ', 1-error, ' acc n gram: ', 1-errorNgram)
    return 1-error, 1-errorNgram

Exemple #3

0

Afficher le fichier

Fichier : helper_functions.py Projet : omarsgalal/Arabic-OCR

def collectAllWords(path):
    """Pickle the distinct words found in the files of ``path``.

    Each entry listed in ``path`` is read with ``getText`` (the name is
    appended to ``path`` directly, so ``path`` needs its trailing separator)
    and its items are merged into one set.  The set is converted to a list and
    dumped to ``all_words.pkl`` in the current working directory.
    """
    uniqueWords = set()
    fileNames = os.listdir(path)
    for name in tqdm(fileNames):
        uniqueWords.update(getText(path + name))
    with open('all_words.pkl', 'wb') as out:
        cPickle.dump(list(uniqueWords), out)

Exemple #4

0

Afficher le fichier

Fichier : make_letters_dataset.py Projet : omarsgalal/Arabic-OCR

def saveSeparateLetters(fileName, pathText, pathImg, pathToSave):
    """Save each segmented character image under the folder of its true letter.

    The image ``pathImg + fileName`` is segmented with ``img2Chars`` and the
    matching text (same name with the last four characters replaced by
    ``.txt``) is read with ``getText``.  Nothing is saved unless the number of
    segmented words equals the number of text words, and a word is skipped
    unless its letter count from ``word2Chars`` equals the count stored at
    index 1 of the segmented word.

    Images go to ``pathToSave + <letter code> + "/<n>.png"`` where ``n`` is
    the running counter kept per letter in the module-level ``lastChars``.
    """
    words = img2Chars(cv2.imread(pathImg + fileName))
    wordsText = getText(pathText + fileName[:-4] + ".txt")

    # Segmentation and text disagree on the word count: nothing can be aligned.
    if len(words) != len(wordsText):
        return

    print("Saving Segmented Character ...")
    for segmented, text in tqdm(list(zip(words, wordsText))):
        realLetters = word2Chars(text)
        if len(realLetters) != segmented[1]:
            continue
        for j, letImg in enumerate(segmented[2]):
            letter = realLetters[j]
            target = pathToSave + str(chars_codes[letter]) + f"/{lastChars[letter]}.png"
            cv2.imwrite(target, letImg)
            lastChars[letter] += 1

Exemple #5

0

Afficher le fichier

Fichier : ocr.py Projet : omarsgalal/Arabic-OCR

def img2txt2(imgName, textFile, outfile):
    """Recognise an image character by character and print two error rates.

    The image is segmented with ``img2Chars``; every character image is
    classified individually by the model loaded from ``MODEL_NAME`` and
    decoded through ``chars_decode``.  Each recognised word followed by a
    space is written to ``outfile``.  The reference and recognised texts are
    also dumped to ``hello.txt`` in the current working directory.

    Args:
        imgName: Path of the image to read.
        textFile: Path of the reference text, read with ``getText``.
        outfile: Path the recognised words are written to.

    Two numbers are printed, both normalised by the reference length in
    characters: the Levenshtein distance over the whole text, and the sum of
    per-word Levenshtein distances with words aligned by position.
    """
    img = cv2.imread(imgName)

    # Reference text as one space-separated string.
    realText = ' '.join(getText(textFile))

    print("Segmenting image to characters ...")
    segmentedChars = img2Chars(img)

    model = loadModel(MODEL_NAME)

    predictedWords = []
    # The context manager closes the file even if a prediction fails.
    with open(outfile, 'w') as f:
        for word in tqdm(segmentedChars):
            currentWord = ''.join(
                chars_decode[model.predict([prepareCharImg(c)])[0]]
                for c in word[2]
            )
            predictedWords.append(currentWord)
            f.write(currentWord)
            f.write(' ')

    # Join without a trailing space so the text is comparable with realText,
    # which is built the same way; a trailing space would add one spurious
    # edit to the distance.
    allText = ' '.join(predictedWords)

    with open("hello.txt", 'w') as f:
        f.write(realText)
        f.write("\n\n\n\n")
        f.write(allText)

    # An empty reference would divide by zero; normalise by at least 1.
    referenceLength = max(len(realText), 1)

    print("\n\n accuracy:")
    print(len(allText), len(realText))
    print(levenshtein_distance(realText, allText) / referenceLength)

    print("another accuracy")
    realList = realText.split(' ')
    errors = 0
    for i, realWord in enumerate(realList):
        # A reference word with no recognised counterpart is compared with
        # the empty string instead of raising IndexError.
        predictedWord = predictedWords[i] if i < len(predictedWords) else ""
        errors += levenshtein_distance(realWord, predictedWord)
    print(errors / referenceLength)

Exemple #6

0

Afficher le fichier

def printSegmentedWordChars(fileName, pathText, pathImg):
    """Sort segmented word images into ``correct`` and ``false`` folders.

    The image ``pathImg + fileName`` is segmented with ``img2Chars`` and the
    text file with the same base name is read with ``getText``.  A directory
    named after the base name is created in the current working directory
    with ``correct`` and ``false`` sub-folders.  For each segmented word, the
    image at index 0 is saved as ``<i>.png`` under ``correct`` when the count
    at index 1 equals the expected length of the i-th text word, otherwise
    under ``false``.  The text words are also written one per line to
    ``words.txt`` in that directory.

    Args:
        fileName: Image file name; its extension is stripped to get the base name.
        pathText: Prefix prepended to ``<base name>.txt``.
        pathImg: Prefix prepended to ``fileName``.
    """
    image = cv2.imread(pathImg + fileName)
    segmentedChars = img2Chars(image)
    # splitext handles extensions of any length, unlike slicing off 4 chars.
    baseName = os.path.splitext(fileName)[0]
    wordsText = getText(pathText + baseName + ".txt")

    # exist_ok avoids the check-then-create race and creates the parent too.
    os.makedirs(baseName + "/correct", exist_ok=True)
    os.makedirs(baseName + "/false", exist_ok=True)

    for i, word in enumerate(segmentedChars):
        if i < len(wordsText):
            wordLength = len(wordsText[i])
            # A word containing "لا" is expected to yield one segment fewer
            # (presumably the pair is segmented as a single glyph - confirm).
            # NOTE(review): only one is subtracted even if it occurs twice.
            if "لا" in wordsText[i]:
                wordLength -= 1
        else:
            # More segmented words than text words: nothing to match against,
            # so the image is filed as a mismatch instead of raising IndexError.
            wordLength = None
        bucket = "correct" if word[1] == wordLength else "false"
        cv2.imwrite(f"{baseName}/{bucket}/{i}.png", word[0])

    with open(f"{baseName}/words.txt", 'w') as file:
        for text in wordsText:
            file.write(f"{text}\n")

Exemple #7

0

Afficher le fichier

def postProcessFolder(srcFolder="OutputTextFiles/", dstFolder="LevTextFiles/"):
    """Post-process every file of ``srcFolder`` into ``dstFolder``.

    For each entry listed in ``srcFolder`` the result of ``getText`` is joined
    with single spaces, passed through ``postProcessTextWithLev`` and written
    with ``writeText`` under the same name in ``dstFolder``.  Names are
    appended to the folder strings directly, so both need their trailing
    separator.
    """
    for name in os.listdir(srcFolder):
        words = getText(srcFolder + name)
        cleaned = postProcessTextWithLev(' '.join(words))
        writeText(dstFolder + name, cleaned)