def test_image_text_mask_with_east(testdata_dir):
    """A text pattern mask is applied when the EAST OCR engine is used."""
    mask_file = testdata_dir / 'pattern_mask.json'
    img = CompareImage(
        testdata_dir / 'Beach_date.png',
        placeholder_file=mask_file,
        ocr_engine='east',
    )
    assert len(img.placeholders) >= 1
    masked = img.get_image_with_placeholders()
    # The masked image must differ from the original object.
    assert img != masked
# Example #2
    def get_text_from_document(self, image):
        """Gets Text Content from documents/images ``image``.

        Text content is returned as a list of strings. None if no text is identified.

        Examples:
        | ${text} | Get Text From Document| reference.pdf | #Gets Text Content from .pdf |
        | ${text} | Get Text From Document| reference.jpg | #Gets Text Content from .jpg |
        | List Should Contain Value | ${text} | Test String | #Checks if list contains a specific string |

        """

        img = CompareImage(image)
        if img.extension == '.pdf':
            # PDFs carry embedded text: read it page by page via MuPDF's
            # JSON text dump instead of running OCR.
            text = []
            for i in range(len(img.opencv_images)):
                tdict = json.loads(img.mupdfdoc[i].get_text("json"))
                for block in tdict['blocks']:
                    # Block type 0 is a text block (other types are images).
                    if block['type'] == 0:
                        for line in block['lines']:
                            if line['spans'][0]['text']:
                                text.append(line['spans'][0]['text'])
        else:
            # Plain images have no embedded text; fall back to OCR.
            try:
                img.get_ocr_text_data()
                text = [x for x in img.text_content[0]['text'] if x]
            except Exception:
                # Was a bare `except:`, which also swallowed SystemExit and
                # KeyboardInterrupt; narrowed to Exception. Any OCR failure
                # is reported as "no text identified".
                text = None
        return text
# Example #3
def test_multipage_pdf(testdata_dir):
    """A two-page PDF renders into a list of exactly two OpenCV images."""
    img = CompareImage(testdata_dir / 'sample.pdf')
    assert len(img.opencv_images) == 2
    # isinstance is the idiomatic type check (was `type(x) == T`);
    # the dead trailing `pass` was removed.
    assert isinstance(img.opencv_images, list)
    assert isinstance(img.opencv_images[0], numpy.ndarray)
    assert isinstance(img.opencv_images[1], numpy.ndarray)
# Example #4
def test_pdf_text_content(testdata_dir):
    """The single-page sample PDF exposes at least one word of embedded text."""
    document = CompareImage(testdata_dir / 'sample_1_page.pdf')
    words = document.mupdfdoc.get_page_text(0, "WORDS")
    assert len(words) > 0
# Example #5
def test_image_text_content(testdata_dir):
    """OCR finds both the date and the number printed on the beach image."""
    img = CompareImage(testdata_dir / 'Beach_date.png')
    img.get_ocr_text_data()
    recognized = img.text_content[0]['text']
    for expected in ("01-Jan-2021", "123456789"):
        assert expected in recognized
# Example #6
def test_single_pdf(testdata_dir):
    """A one-page PDF renders into a list with a single OpenCV image."""
    img = CompareImage(testdata_dir / 'sample_1_page.pdf')
    assert len(img.opencv_images) == 1
    # isinstance is the idiomatic type check (was `type(x) == T`);
    # the dead trailing `pass` was removed.
    assert isinstance(img.opencv_images, list)
    assert isinstance(img.opencv_images[0], numpy.ndarray)
def test_image_area_mask(testdata_dir):
    """An area mask file produces exactly one placeholder on the image."""
    image = CompareImage(
        testdata_dir / 'Beach_date.png',
        placeholder_file=testdata_dir / 'area_mask.json',
    )
    assert len(image.placeholders) == 1
    masked = image.get_image_with_placeholders()
    # Masking must yield something different from the original object.
    assert image != masked
# Example #8
def test_simple_text_from_pdf(testdata_dir):
    """OCR on the rendered PDF page finds the embedded marker string."""
    document = CompareImage(testdata_dir / 'sample_1_page.pdf')
    document.get_ocr_text_data()
    recognized = document.text_content[0]['text']
    assert 'FB1DES0A3D5EFE2A60B0B1AE616C653' in recognized
# Example #9
def test_single_png(testdata_dir):
    """A PNG loads into a one-element list of numpy arrays."""
    img = CompareImage(testdata_dir / 'text_big.png')
    assert len(img.opencv_images) == 1
    # isinstance is the idiomatic type check (was `type(x) == T`).
    assert isinstance(img.opencv_images, list)
    assert isinstance(img.opencv_images[0], numpy.ndarray)
def test_single_png_with_barcode(testdata_dir):
    """Barcode detection adds two placeholders for the data matrix image."""
    image = CompareImage(testdata_dir / 'datamatrix.png', contains_barcodes=True)
    placeholder_count = len(image.placeholders)
    assert placeholder_count == 2
def test_single_pdf_without_barcode(testdata_dir):
    """Barcode detection on a barcode-free PDF yields no placeholders."""
    document = CompareImage(
        testdata_dir / 'sample_1_page.pdf',
        contains_barcodes=True,
    )
    assert len(document.placeholders) == 0
# Example #12
    def check_for_differences(self, reference, candidate, i, detected_differences, compare_options, reference_pdf_content=None, candidate_pdf_content=None):
        """Compare page ``i`` of ``reference`` against ``candidate`` and record differences.

        Pipeline: grayscale conversion -> SSIM dissimilarity score -> when the
        score exceeds ``self.threshold``, contour extraction of the differing
        regions, followed by optional filters driven by ``compare_options``:
        watermark suppression (``ignore_watermarks`` / ``watermark_file``),
        text-content comparison (``check_text_content``) and a positional
        move-tolerance check (``move_tolerance``).  Appends ``True`` to
        ``detected_differences`` when a real difference survives the filters.

        NOTE(review): ``reference_pdf_content``/``candidate_pdf_content`` are
        used via ``.get_text("words")`` when ``get_pdf_content`` is True —
        presumably MuPDF page objects; confirm with the callers.
        Raises AssertionError when the page dimensions differ or the provided
        watermark_file value is unusable.
        """
        images_are_equal = True
        # Grayscale both pages concurrently; SSIM below needs single-channel input.
        with futures.ThreadPoolExecutor(max_workers=2) as parallel_executor:
            grayA_future = parallel_executor.submit(cv2.cvtColor, reference, cv2.COLOR_BGR2GRAY)
            grayB_future = parallel_executor.submit(cv2.cvtColor, candidate, cv2.COLOR_BGR2GRAY)
            grayA = grayA_future.result()
            grayB = grayB_future.result()

        # Pages of different size cannot be compared pixel-wise: log both and fail hard.
        if reference.shape[0] != candidate.shape[0] or reference.shape[1] != candidate.shape[1]:
            self.add_screenshot_to_log(reference, "_reference_page_" + str(i+1))
            self.add_screenshot_to_log(candidate, "_candidate_page_" + str(i+1))
            raise AssertionError(f'The compared images have different dimensions:\nreference:{reference.shape}\ncandidate:{candidate.shape}')
        
        # compute the Structural Similarity Index (SSIM) between the two
        # images, ensuring that the difference image is returned
        (score, diff) = metrics.structural_similarity(grayA, grayB, gaussian_weights=True, full=True)
        # Convert similarity (1.0 == identical) into a dissimilarity score.
        score = abs(1-score)
        
        if self.take_screenshots:
            # Not necessary to take screenshots for every successful comparison
            self.add_screenshot_to_log(np.concatenate((reference, candidate), axis=1), "_page_" + str(i+1) + "_compare_concat")
               
        # Dissimilarity above the configured threshold: locate the differing
        # regions and run the configured filters.
        if (score > self.threshold):
        
            diff = (diff * 255).astype("uint8")

            # Inverse binary Otsu threshold: white pixels mark differing areas.
            thresh = cv2.threshold(diff, 0, 255,
                cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1]
            
            reference_with_rect, candidate_with_rect , cnts= self.get_images_with_highlighted_differences(thresh, reference.copy(), candidate.copy(), extension=int(os.getenv('EXTENSION', 2)))
            blended_images = self.overlay_two_images(reference_with_rect, candidate_with_rect)
            
            # Label the annotated pages before logging them side by side.
            cv2.putText(reference_with_rect,self.REFERENCE_LABEL, self.BOTTOM_LEFT_CORNER_OF_TEXT, self.FONT, self.FONT_SCALE, self.FONT_COLOR, self.LINE_TYPE)
            cv2.putText(candidate_with_rect,self.CANDIDATE_LABEL, self.BOTTOM_LEFT_CORNER_OF_TEXT, self.FONT, self.FONT_SCALE, self.FONT_COLOR, self.LINE_TYPE)
            
            self.add_screenshot_to_log(np.concatenate((reference_with_rect, candidate_with_rect), axis=1), "_page_" + str(i+1) + "_rectangles_concat")
            self.add_screenshot_to_log(blended_images, "_page_" + str(i+1) + "_blended")

            if self.show_diff:
                self.add_screenshot_to_log(np.concatenate((diff, thresh), axis=1), "_page_" + str(i+1) + "_diff")

            images_are_equal=False

            # --- Watermark handling ---------------------------------------
            # Either ignore a single, centered, physically small difference as
            # a watermark, or subtract explicit watermark file(s) from the
            # diff mask.
            if (compare_options["ignore_watermarks"] == True and len(cnts)==1) or compare_options["watermark_file"] is not None:
                if (compare_options["ignore_watermarks"] == True and len(cnts)==1):
                    (x, y, w, h) = cv2.boundingRect(cnts[0])
                    # Offset of the difference's center from the page center.
                    diff_center_x = abs((x+w/2)-(reference.shape[1]/2))
                    diff_center_y = abs((y+h/2)-(reference.shape[0]/2))
                    # Horizontally centered and small in mm (pixels * 25.4 / DPI)
                    # -> treat as a watermark and return without recording a diff.
                    if (diff_center_x < reference.shape[1] * self.WATERMARK_CENTER_OFFSET) and (w * 25.4 / self.DPI < self.WATERMARK_WIDTH) and (h * 25.4 / self.DPI < self.WATERMARK_HEIGHT):
                        images_are_equal=True
                        print("A watermark position was identified. After ignoring watermark area, both images are equal")
                        return
                if compare_options["watermark_file"] is not None:
                    watermark_file = compare_options["watermark_file"]
                    # Accept a single path, a directory (expanded to the files
                    # it contains) or a list of paths.
                    if isinstance(watermark_file, str):
                        if os.path.isdir(watermark_file):
                            watermark_file = [str(os.path.join(watermark_file, f)) for f in os.listdir(watermark_file) if os.path.isfile(os.path.join(watermark_file, f))]
                        else:
                            watermark_file = [watermark_file]
                    if isinstance(watermark_file, list):
                        try:
                            for single_watermark in watermark_file:
                                try:
                                    watermark = CompareImage(single_watermark, DPI=self.DPI).opencv_images[0]
                                except:
                                    print(f'Watermark file {single_watermark} could not be loaded. Continue with next item.')
                                    continue
                                watermark_gray = cv2.cvtColor(watermark, cv2.COLOR_BGR2GRAY)
                                watermark_gray = (watermark_gray * 255).astype("uint8")
                                # Build the watermark mask; dilate to tolerate
                                # small positional deviations.
                                mask = cv2.threshold(watermark_gray, 10, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1]
                                mask = cv2.dilate(mask, None, iterations=1)
                                mask_inv = cv2.bitwise_not(mask)
                                if thresh.shape[0:2] == mask_inv.shape[0:2]:
                                    # Remove the watermark area from the diff mask.
                                    result = cv2.bitwise_and(thresh, thresh, mask=mask_inv)
                                else:
                                    print(f"The shape of watermark and image are different. Continue with next item")
                                    print(f"Document: {thresh.shape}\nMask: {mask_inv.shape}")
                                    continue
                                if self.show_diff:
                                    print(f"The diff after watermark removal")
                                    self.add_screenshot_to_log(result, "_page_" + str(i + 1) + "_watermark_diff")
                                # Nothing left after masking -> only the watermark differed.
                                if cv2.countNonZero(result) == 0:
                                    images_are_equal=True
                                    print("A watermark file was provided. After removing watermark area, both images are equal")
                                    return
                        except:
                            raise AssertionError('The provided watermark_file format is invalid. Please provide a path to a file or a list of files.')
                    else:
                        raise AssertionError('The provided watermark_file format is invalid. Please provide a path to a file or a list of files.')
                        

            # --- Text-content comparison ----------------------------------
            # Only the text inside each differing rectangle is compared:
            # via OCR for plain images, via embedded PDF words otherwise.
            if(compare_options["check_text_content"]==True) and images_are_equal is not True:
                if compare_options["get_pdf_content"] is not True:
                    #x, y, w, h = self.get_diff_rectangle(thresh)
                    images_are_equal=True
                    for c in range(len(cnts)):
                        (x, y, w, h) = cv2.boundingRect(cnts[c])
                        diff_area_reference = reference[y:y+h, x:x+w]
                        diff_area_candidate = candidate[y:y+h, x:x+w]

                        self.add_screenshot_to_log(diff_area_reference, "_page_" + str(i+1) + "_diff_area_reference_"+str(c))
                        self.add_screenshot_to_log(diff_area_candidate, "_page_" + str(i+1) + "_diff_area_test_"+str(c))

                        # OCR both crops; --psm 6 assumes a uniform text block.
                        text_reference = pytesseract.image_to_string(diff_area_reference, config='--psm 6').replace("\n\n", "\n")
                        text_candidate = pytesseract.image_to_string(diff_area_candidate, config='--psm 6').replace("\n\n", "\n")
                        if text_reference.strip()==text_candidate.strip():                           
                            print("Partial text content is the same")
                            print(text_reference)
                        else:
                            images_are_equal=False
                            detected_differences.append(True)
                            print("Partial text content is different")
                            print(text_reference + " is not equal to " + text_candidate)
                elif compare_options["get_pdf_content"] is True:
                
                    images_are_equal=True
                    ref_words = reference_pdf_content.get_text("words")
                    cand_words = candidate_pdf_content.get_text("words")
                    for c in range(len(cnts)):

                        (x, y, w, h) = cv2.boundingRect(cnts[c])
                        # Convert the pixel rectangle to PDF points (72 per inch).
                        rect = fitz.Rect(x*72/self.DPI, y*72/self.DPI, (x+w)*72/self.DPI, (y+h)*72/self.DPI)
                        diff_area_ref_words = [w for w in ref_words if fitz.Rect(w[:4]).intersects(rect)]
                        diff_area_cand_words = [w for w in cand_words if fitz.Rect(w[:4]).intersects(rect)]
                        diff_area_ref_words = make_text(diff_area_ref_words)
                        diff_area_cand_words = make_text(diff_area_cand_words)
                        diff_area_reference = reference[y:y+h, x:x+w]
                        diff_area_candidate = candidate[y:y+h, x:x+w]
                        
                        self.add_screenshot_to_log(diff_area_reference, "_page_" + str(i+1) + "_diff_area_reference_"+str(c))
                        self.add_screenshot_to_log(diff_area_candidate, "_page_" + str(i+1) + "_diff_area_test_"+str(c))
                                                                    
                        
                        if len(diff_area_ref_words)!=len(diff_area_cand_words):
                            images_are_equal=False
                            detected_differences.append(True)
                            print("The identified pdf layout elements are different", diff_area_ref_words, diff_area_cand_words)
                        else:

                            if diff_area_ref_words.strip() != diff_area_cand_words.strip():
                                images_are_equal=False
                                detected_differences.append(True)
                                print("Partial text content is different")
                                print(diff_area_ref_words.strip(), " is not equal to " ,diff_area_cand_words.strip())
                        if images_are_equal:
                            print("Partial text content of area is the same")
                            print(diff_area_ref_words)
                            pass

            # --- Move-tolerance check -------------------------------------
            # Differences caused by content that merely shifted by at most
            # ``move_tolerance`` pixels are accepted as equal.
            if(compare_options["move_tolerance"]!=None) and images_are_equal is not True:
                move_tolerance=int(compare_options["move_tolerance"])
                images_are_equal=True
                
                if compare_options["get_pdf_content"] is not True:
                    #Experimental, to solve a problem with small images
                    #wr, hr, _ = reference.shape
                    for c in range(len(cnts)):
                    
                        (x, y, w, h) = cv2.boundingRect(cnts[c])
                        diff_area_reference = reference[y:y+h, x:x+w]
                        diff_area_candidate = candidate[y:y+h, x:x+w]

                        #Experimental, to solve a problem with small images
                        #search_area_candidate = candidate[(y - self.BORDER_FOR_MOVE_TOLERANCE_CHECK) if y >= self.BORDER_FOR_MOVE_TOLERANCE_CHECK else 0:(y + h + self.BORDER_FOR_MOVE_TOLERANCE_CHECK) if hr >= (y + h + self.BORDER_FOR_MOVE_TOLERANCE_CHECK) else hr, (x - self.BORDER_FOR_MOVE_TOLERANCE_CHECK) if x >= self.BORDER_FOR_MOVE_TOLERANCE_CHECK else 0:(x + w + self.BORDER_FOR_MOVE_TOLERANCE_CHECK) if wr >= (x + w + self.BORDER_FOR_MOVE_TOLERANCE_CHECK) else wr]

                        # Search window: the diff rectangle grown by a fixed border.
                        # NOTE(review): negative start indices are possible near the
                        # page edge and would wrap -- relies on diffs not touching
                        # the border; confirm.
                        search_area_candidate = candidate[y - self.BORDER_FOR_MOVE_TOLERANCE_CHECK:y + h + self.BORDER_FOR_MOVE_TOLERANCE_CHECK, x - self.BORDER_FOR_MOVE_TOLERANCE_CHECK:x + w + self.BORDER_FOR_MOVE_TOLERANCE_CHECK]
                        search_area_reference = reference[y - self.BORDER_FOR_MOVE_TOLERANCE_CHECK:y + h + self.BORDER_FOR_MOVE_TOLERANCE_CHECK, x - self.BORDER_FOR_MOVE_TOLERANCE_CHECK:x + w + self.BORDER_FOR_MOVE_TOLERANCE_CHECK]                      
                        
                        # self.add_screenshot_to_log(search_area_candidate)
                        # self.add_screenshot_to_log(search_area_reference)
                        # self.add_screenshot_to_log(diff_area_candidate)
                        # self.add_screenshot_to_log(diff_area_reference)
                        try:
                            positions_in_compare_image = self.find_partial_image_position(search_area_candidate, diff_area_reference)
                        except:
                            print("Error in finding position in compare image")
                            images_are_equal=False
                            detected_differences.append(True)
                            continue
                        #positions_in_compare_image = self.find_partial_image_position(candidate, diff_area_reference)
                        # A mean of 255 means the crop is pure white: nothing to match.
                        if (np.mean(diff_area_reference) == 255) or (np.mean(diff_area_candidate) == 255):
                            images_are_equal=False
                            detected_differences.append(True)
                            
                            print("Image section contains only white background")

                            self.add_screenshot_to_log(np.concatenate((cv2.copyMakeBorder(diff_area_reference, top=2, bottom=2, left=2, right=2, borderType=cv2.BORDER_CONSTANT, value=[0,0,0]), cv2.copyMakeBorder(diff_area_candidate, top=2, bottom=2, left=2, right=2, borderType=cv2.BORDER_CONSTANT, value=[0,0,0])), axis=1), "_diff_area_concat")



                            #self.add_screenshot_to_log(np.concatenate((diff_area_reference, diff_area_candidate), axis=1), "_diff_area_concat")

                        else:
                            if positions_in_compare_image:
                                
                                # Euclidean distance between the matched positions
                                # decides whether the shift is within tolerance.
                                #pt_original = (x, y)
                                pt_original = positions_in_compare_image['pt1']
                                pt_compare = positions_in_compare_image['pt2']
                                x_moved = abs(pt_original[0]-pt_compare[0])
                                y_moved = abs(pt_original[1]-pt_compare[1])
                                move_distance = math.sqrt(x_moved** 2 +y_moved ** 2)
                                #cv2.arrowedLine(candidate_with_rect, pt_original, pt_compare, (255, 0, 0), 4)
                                if int(move_distance)>int(move_tolerance):
                                    print("Image section moved ",move_distance, " pixels")
                                    print("This is outside of the allowed range of ",move_tolerance, " pixels")
                                    images_are_equal=False
                                    detected_differences.append(True)
                                    self.add_screenshot_to_log(self.overlay_two_images(search_area_reference, search_area_candidate), "_diff_area_blended")
                                    
                                else:
                                    print("Image section moved ",move_distance, " pixels")
                                    print("This is within the allowed range of ",move_tolerance, " pixels")
                                    self.add_screenshot_to_log(self.overlay_two_images(search_area_reference, search_area_candidate), "_diff_area_blended")

                            else:
                                images_are_equal=False
                                detected_differences.append(True)
                                print("The reference image section was not found in test image (or vice versa)")
                                self.add_screenshot_to_log(np.concatenate((cv2.copyMakeBorder(diff_area_reference, top=2, bottom=2, left=2, right=2, borderType=cv2.BORDER_CONSTANT, value=[0,0,0]), cv2.copyMakeBorder(diff_area_candidate, top=2, bottom=2, left=2, right=2, borderType=cv2.BORDER_CONSTANT, value=[0,0,0])), axis=1), "_diff_area_concat")

                elif compare_options["get_pdf_content"] is True:
                    # PDF variant: compare word bounding boxes from the embedded
                    # text instead of pixel matching.
                    images_are_equal=True
                    ref_words = reference_pdf_content.get_text("words")
                    cand_words = candidate_pdf_content.get_text("words")
                    for c in range(len(cnts)):

                        (x, y, w, h) = cv2.boundingRect(cnts[c])
                        rect = fitz.Rect(x*72/self.DPI, y*72/self.DPI, (x+w)*72/self.DPI, (y+h)*72/self.DPI)
                        diff_area_ref_words = [w for w in ref_words if fitz.Rect(w[:4]).intersects(rect)]
                        diff_area_cand_words = [w for w in cand_words if fitz.Rect(w[:4]).intersects(rect)]
                        # diff_area_ref_words = make_text(diff_area_ref_words)
                        # diff_area_cand_words = make_text(diff_area_cand_words)
                        diff_area_reference = reference[y:y+h, x:x+w]
                        diff_area_candidate = candidate[y:y+h, x:x+w]
                        self.add_screenshot_to_log(diff_area_reference, "_page_" + str(i+1) + "_diff_area_reference_"+str(c))
                        self.add_screenshot_to_log(diff_area_candidate, "_page_" + str(i+1) + "_diff_area_test_"+str(c))

                        if len(diff_area_ref_words)!=len(diff_area_cand_words):
                            images_are_equal=False
                            detected_differences.append(True)
                            print("The identified pdf layout elements are different", diff_area_ref_words, diff_area_cand_words)
                        else:
                            for ref_Item, cand_Item in zip(diff_area_ref_words, diff_area_cand_words):
                                if ref_Item == cand_Item:
                                    pass

                                elif str(ref_Item[4]).strip() == str(cand_Item[4]).strip():
                                    # Same word text, different box: measure the shift
                                    # of each edge in pixels (points * DPI / 72).
                                    left_moved = abs(ref_Item[0]-cand_Item[0])*self.DPI/72
                                    top_moved = abs(ref_Item[1]-cand_Item[1])*self.DPI/72
                                    right_moved = abs(ref_Item[2]-cand_Item[2])*self.DPI/72
                                    bottom_moved = abs(ref_Item[3]-cand_Item[3])*self.DPI/72
                                    print("Checking pdf elements", ref_Item, cand_Item)


                                    if int(left_moved)>int(move_tolerance) or int(top_moved)>int(move_tolerance) or int(right_moved)>int(move_tolerance) or int(bottom_moved)>int(move_tolerance):
                                        print("Image section moved ",left_moved, top_moved, right_moved, bottom_moved, " pixels")
                                        print("This is outside of the allowed range of ",move_tolerance, " pixels")
                                        images_are_equal=False
                                        detected_differences.append(True)
                                        self.add_screenshot_to_log(self.overlay_two_images(diff_area_reference, diff_area_candidate), "_diff_area_blended")
                                    

                                    else:
                                        print("Image section moved ",left_moved, top_moved, right_moved, bottom_moved, " pixels")
                                        print("This is within the allowed range of ",move_tolerance, " pixels")
                                        self.add_screenshot_to_log(self.overlay_two_images(diff_area_reference, diff_area_candidate), "_diff_area_blended")
            # Record a difference when none of the filters cleared the page pair.
            if images_are_equal is not True:
                detected_differences.append(True)
# Example #13
def test_big_text_from_image(testdata_dir):
    """OCR recognizes the full uppercase alphabet in the large-text image."""
    img = CompareImage(testdata_dir / 'text_big.png')
    img.get_ocr_text_data()
    # Dead trailing `pass` removed.
    assert 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' in img.text_content[0]['text']
# Example #14
def test_white_text_on_dark_background(testdata_dir):
    """OCR handles inverted (white-on-black) text."""
    img = CompareImage(testdata_dir / 'whitetext_blackbackground.png')
    img.get_ocr_text_data()
    # Dead trailing `pass` removed.
    assert '0123456789' in img.text_content[0]['text']
# Example #15
def test_text_on_colored_background(testdata_dir):
    """OCR finds the date and the number despite the colored background."""
    img = CompareImage(testdata_dir / 'Beach_date.png')
    img.get_ocr_text_data()
    # Dead trailing `pass` removed.
    assert "01-Jan-2021" in img.text_content[0]['text']
    assert "123456789" in img.text_content[0]['text']
# Example #16
def test_non_existing_file(testdata_dir):
    """Loading a file that does not exist raises an AssertionError."""
    missing = testdata_dir / 'does_not_exist.png'
    with pytest.raises(AssertionError):
        CompareImage(missing)
# Example #17
def test_corrupt_pdf(testdata_dir):
    """A corrupt PDF is rejected with an AssertionError."""
    corrupt = testdata_dir / 'corrupt_pdf.pdf'
    with pytest.raises(AssertionError):
        CompareImage(corrupt)
def test_pdf_text_mask(testdata_dir):
    """A PDF pattern mask file yields exactly three placeholders."""
    document = CompareImage(
        testdata_dir / 'sample_1_page.pdf',
        placeholder_file=testdata_dir / 'pdf_pattern_mask.json',
    )
    assert len(document.placeholders) == 3
    masked = document.get_image_with_placeholders()
    assert document != masked
# Example #19
def test_image_text_content_with_east(testdata_dir):
    """EAST-based text detection finds the printed date somewhere in the output."""
    img = CompareImage(testdata_dir / 'Beach_date.png')
    img.get_text_content_with_east()
    detected = img.text_content[0]['text']
    assert any('01-Jan-2021' in snippet for snippet in detected)
# Example #20
def test_small_text_from_image(testdata_dir):
    """OCR recognizes the digit sequence in the small-text image."""
    img = CompareImage(testdata_dir / 'text_small.png')
    img.get_ocr_text_data()
    # Dead trailing `pass` removed.
    assert '1234567890' in img.text_content[0]['text']