Example #1
0
def generatePairedTableOnlyOcr(imageTable):
    """Pair up the images of a single table using OCR text distance only.

    Every entry of ``imageTable`` is compared against every other entry of
    the same table with ``commonMethods.percentageEditDistance``. When the
    distance is at most 0.3, the other entry's key is added to this entry's
    ``"strictMatch"`` set. Keys that matched nothing are collected and passed
    to ``commonMethods.addSingulars``.

    Args:
        imageTable: dict mapping an image key to a record. Each record is
            read for ``"ocr_text"`` and must expose a ``"strictMatch"``
            container supporting ``.add``.

    Returns:
        The value returned by ``commonMethods.addSingulars(imageTable,
        singular)``.
    """
    singular = []
    for elemOneKey, elemOneValue in imageTable.items():
        one = elemOneValue["ocr_text"]
        foundOneMatch = False  # tracked per outer element

        # The original inner loop iterated ``imageTable2``, a name that is
        # not defined in this function (NameError). This variant takes one
        # table, so candidates come from ``imageTable`` itself.
        for elemTwoKey, elemTwoValue in imageTable.items():
            two = elemTwoValue["ocr_text"]
            if elemOneKey == elemTwoKey:
                continue  # never pair an image with itself

            ocrDiff = commonMethods.percentageEditDistance(one, two)
            # NOTE(review): the original also had
            # ``elif ocrDiff < 0.26: ...["matchNamesRemove"].add(...)``, but
            # that branch was unreachable because anything below 0.26 is
            # already caught by ``<= 0.3``. Behaviour is preserved here;
            # confirm the intended thresholds before populating
            # "matchNamesRemove" from this function.
            if ocrDiff <= 0.3:
                imageTable[elemOneKey]["strictMatch"].add(elemTwoKey)
                foundOneMatch = True

        if not foundOneMatch:
            singular.append(elemOneKey)

    # the singulars are taken care of
    imageTable = commonMethods.addSingulars(imageTable, singular)
    return imageTable
def findImagePairs(imageTable):
    """Sort each entry's ``matchNamesRemove`` candidates into match buckets.

    For every key in ``imageTable`` and every candidate listed in that
    entry's ``"matchNamesRemove"``, the OCR distance and hash distance to the
    candidate are computed and the candidate is then:

    * added to the entry's ``'strictMatch'`` set when both distances are
      within the strict thresholds, or, if either image is flagged
      ``"imageDominant"``, when the hash distance is below the
      image-dominant strict threshold;
    * queued for removal when it exceeds the applicable remove thresholds;
    * otherwise queued as ``(name, ocr_difference, hash_difference)`` for
      ``experimentGraph``.

    After each key's candidates are processed, ``experimentGraph`` is called
    with the queued additions and ``commonMethods.removeItemsFromSet`` with
    the queued removals.

    Args:
        imageTable: dict mapping an image key to a record that is read for
            ``"matchNamesRemove"``, ``"ocr_text"``, ``"hash_value"``,
            ``"imageDominant"`` and ``'strictMatch'``.

    Returns:
        The same ``imageTable`` object that was passed in.
    """
    # ``.items()`` instead of ``.iteritems()``: the latter does not exist on
    # Python 3 dicts, and the other functions in this file already use
    # ``.items()``.
    for key, value in imageTable.items():
        # Fresh scratch lists per key (the original reset them at loop end).
        addList = []  # ( name, ocr_difference, hash_difference )
        removeList = []

        for matchKey in value["matchNamesRemove"]:
            candidate = imageTable[matchKey]
            diffOCRdistance = commonMethods.percentageEditDistance(
                imageTable[key]["ocr_text"], candidate["ocr_text"])
            diffHashDistance = commonMethods.percentHashDifference(
                imageTable[key]["hash_value"], candidate["hash_value"])

            # Strict match: both distances within the strict thresholds.
            # NOTE(review): an older comment said "ocr within 3 percent and
            # image structure within 5 percent" - confirm against the
            # THRESH_* constants, which are defined elsewhere.
            if (diffHashDistance <= THRESH_HASH_DISTANCE_STRICT
                    and diffOCRdistance <= THRESH_OCR_DISTANCE_STRICT):
                imageTable[key]['strictMatch'].add(matchKey)

            # Either image is image-dominant: judge mostly by hash distance.
            # A candidate that is neither strict nor over a remove threshold
            # is deliberately left untouched in this branch.
            elif imageTable[key]["imageDominant"] or candidate["imageDominant"]:
                if diffHashDistance < THRESH_HASH_DISTANCE_STRICT_IMAGE_DOMINANT:
                    imageTable[key]['strictMatch'].add(matchKey)
                elif (diffHashDistance > THRESH_HASH_DISTANCE_REMOVE_IMAGE_DOMINANT
                      or diffOCRdistance > THRESH_OCR_DISTANCE_REMOVE_IMAGE_DOMINANT):
                    removeList.append(matchKey)

            # Dissimilar in both image structure and text: drop it.
            elif (diffHashDistance > THRESH_HASH_DISTANCE_REMOVE
                  and diffOCRdistance > THRESH_OCR_DISTANCE_REMOVE):
                removeList.append(matchKey)

            # Neither strict nor removable: hand it to experimentGraph, which
            # checks it against the other undecided candidates.
            else:
                addList.append((matchKey, diffOCRdistance, diffHashDistance))

        # populate the matchNamesAdd
        experimentGraph(imageTable, key, addList)
        # prune the matchNamesRemove
        commonMethods.removeItemsFromSet(imageTable, key, removeList)

    return imageTable
Example #3
0
def generatePairedTableOnlyOcr2(imageTable, imageTable2):
    """Match images across two tables by OCR text distance only.

    Each entry of ``imageTable`` is compared with each entry of
    ``imageTable2`` (pairs sharing the same key are skipped) using
    ``commonMethods.percentageEditDistance``. Matching pairs are reported as
    tab-separated lines ``key1<TAB>key2<TAB>confidence``, for example::

        UIC.CS108F17/L1709171030_Q5.jpg UIC.CS108F18/L1709101035_Q4.jpg 0

    Confidence values:
        0 -- (almost) positive match, distance < 0.26
        1 -- probable match, distance <= 0.3

    Args:
        imageTable: dict mapping image key -> record with ``"ocr_text"``.
        imageTable2: dict of the same shape to compare against.

    Returns:
        A string with one newline-terminated line per matching pair, in
        iteration order; ``''`` when nothing matches.
    """
    rows = []
    for elemOneKey, elemOneValue in imageTable.items():
        one = elemOneValue["ocr_text"]
        for elemTwoKey, elemTwoValue in imageTable2.items():
            two = elemTwoValue["ocr_text"]
            if elemOneKey == elemTwoKey:
                continue

            ocrDiff = commonMethods.percentageEditDistance(one, two)
            # The tighter threshold must be tested first. The original
            # checked ``<= 0.3`` before ``< 0.26``, which made the
            # confidence-0 branch unreachable.
            if ocrDiff < 0.26:
                confidence = 0
            elif ocrDiff <= 0.3:
                confidence = 1
            else:
                continue
            rows.append("{}\t{}\t{}\n".format(elemOneKey, elemTwoKey, confidence))

    return "".join(rows)
Example #4
0
def find_image_matches(class1, table1, class2, table2, htmlout=False):
    """Match the images in ``table1`` against the images in ``table2``.

    Every (filename1, filename2) pair is scored with
    ``percentageEditDistance`` on the images' ``.text`` and
    ``percentHashDifference`` on their ``.image_hash``. A pair and its mirror
    image (filename2, filename1) are only evaluated once. Summary counts are
    printed to stdout.

    Plain-text output has one line per match::

        filename1<TAB>filename2<TAB>match_score

    Possible match_score values:
        0 -- high confidence: text below THRESHOLD_TEXT and phash below
             THRESHOLD_PHASH
        1 -- medium-high confidence: text below THRESHOLD_TEXT only
        2 -- medium confidence: phash below THRESHOLD_PHASH and text below
             the looser THRESHOLD_TEXT_WITH_PHASH
    Anything else is left out.

    Args:
        class1: label for the first table, forwarded to ``tohtml``.
        table1: dict mapping filename -> image object with ``.text`` and
            ``.image_hash``.
        class2: label for the second table, forwarded to ``tohtml``.
        table2: dict of the same shape as ``table1``.
        htmlout: when true, return an HTML page for visual checking instead
            of the tab-separated text.

    Returns:
        The HTML page (``htmlout`` true) or the tab-separated match lines.
    """
    print("comparing {} with {}".format(len(table1), len(table2)))
    # Tuples rather than "f1-f2" strings: joined strings collide when a
    # filename itself contains "-" (("a-b", "c") vs ("a", "b-c")).
    done = set()
    result_rows = []
    html_rows = []
    both = 0
    neither = 0
    phash = 0
    text = 0
    for filename1, image1 in table1.items():
        for filename2, image2 in table2.items():
            if (filename1, filename2) in done:
                continue
            done.add((filename1, filename2))
            # also record the mirrored pair, so f1/f2 and f2/f1 count once
            done.add((filename2, filename1))

            diff_text = percentageEditDistance(image1.text, image2.text)
            diff_phash = percentHashDifference(image1.image_hash,
                                               image2.image_hash)
            text_hit = diff_text < THRESHOLD_TEXT
            phash_hit = diff_phash < THRESHOLD_PHASH

            if text_hit and phash_hit:
                # BEST: both phash and ocr text indicate a match
                both += 1
                label, score = 'both!', 0
            elif text_hit:
                # second best: text matches
                text += 1
                label, score = 'text only', 1
            elif phash_hit and diff_text < THRESHOLD_TEXT_WITH_PHASH:
                # third best: phash matches, text is below a less strict threshold
                phash += 1
                label, score = 'phash only', 2
            else:
                neither += 1
                continue

            html_rows.append(tohtml(label, diff_text, diff_phash, class1,
                                    filename1, class2, filename2))
            result_rows.append("{}\t{}\t{}\n".format(filename1, filename2, score))

    print("both: {} phash: {} text: {} neither: {}".format(
        both, phash, text, neither))
    if htmlout:
        return '''
<html>
<head><title></title></head>
<body>
<table>
<tr>
    <th>message</th>
    <th>diff text</th>
    <th>diff phash</th>
    <th>file1</th>
    <th>file2</th>
</tr>
{}
</table>
</body>
</html>
'''.format("".join(html_rows))
    return "".join(result_rows)