def test_image_text_mask_with_east(testdata_dir):
    """A pattern mask resolved with the EAST OCR engine yields placeholders."""
    compare_image = CompareImage(
        testdata_dir / 'Beach_date.png',
        placeholder_file=testdata_dir / 'pattern_mask.json',
        ocr_engine='east',
    )
    assert len(compare_image.placeholders) >= 1
    masked_image = compare_image.get_image_with_placeholders()
    assert compare_image != masked_image
def get_text_from_document(self, image):
    """Gets Text Content from documents/images ``image``.

    Text content is returned as a list of strings. None if no text is identified.

    Examples:
    | ${text} | Get Text From Document| reference.pdf | #Gets Text Content from .pdf |
    | ${text} | Get Text From Document| reference.jpg | #Gets Text Content from .jpg |
    | List Should Contain Value | ${text} | Test String | #Checks if list contains a specific string |
    """
    img = CompareImage(image)
    if img.extension == '.pdf':
        # PDFs carry their text in the document structure — read it straight
        # from MuPDF's JSON page dump instead of running OCR.
        text = []
        for page_index in range(len(img.opencv_images)):
            tdict = json.loads(img.mupdfdoc[page_index].get_text("json"))
            for block in tdict['blocks']:
                # Block type 0 is a text block in PyMuPDF's JSON output.
                if block['type'] == 0:
                    for line in block['lines']:
                        if line['spans'][0]['text']:
                            text.append(line['spans'][0]['text'])
    else:
        # Plain images: fall back to OCR. Any OCR failure is treated as
        # "no text found" — but only catch Exception, not a bare except,
        # so KeyboardInterrupt/SystemExit still propagate.
        try:
            img.get_ocr_text_data()
            text = [x for x in img.text_content[0]['text'] if x]
        except Exception:
            text = None
    return text
def test_multipage_pdf(testdata_dir):
    """A two-page PDF is rendered into a list of two OpenCV images."""
    img = CompareImage(testdata_dir / 'sample.pdf')
    assert len(img.opencv_images) == 2
    # isinstance is the idiomatic type check (type(x) == T rejects subclasses).
    assert isinstance(img.opencv_images, list)
    assert isinstance(img.opencv_images[0], numpy.ndarray)
    assert isinstance(img.opencv_images[1], numpy.ndarray)
def test_pdf_text_content(testdata_dir):
    """A single-page PDF exposes extractable words through MuPDF."""
    document = CompareImage(testdata_dir / 'sample_1_page.pdf')
    words = document.mupdfdoc.get_page_text(0, "WORDS")
    assert len(words) > 0
def test_image_text_content(testdata_dir):
    """OCR on the beach image recognizes both the date and the number."""
    beach_image = CompareImage(testdata_dir / 'Beach_date.png')
    beach_image.get_ocr_text_data()
    recognized_words = beach_image.text_content[0]['text']
    assert "01-Jan-2021" in recognized_words
    assert "123456789" in recognized_words
def test_single_pdf(testdata_dir):
    """A one-page PDF is rendered into a list with exactly one OpenCV image."""
    img = CompareImage(testdata_dir / 'sample_1_page.pdf')
    assert len(img.opencv_images) == 1
    # isinstance is the idiomatic type check (type(x) == T rejects subclasses).
    assert isinstance(img.opencv_images, list)
    assert isinstance(img.opencv_images[0], numpy.ndarray)
def test_image_area_mask(testdata_dir):
    """An area mask file produces exactly one placeholder on an image."""
    compare_image = CompareImage(
        testdata_dir / 'Beach_date.png',
        placeholder_file=testdata_dir / 'area_mask.json',
    )
    assert len(compare_image.placeholders) == 1
    masked_image = compare_image.get_image_with_placeholders()
    assert compare_image != masked_image
def test_simple_text_from_pdf(testdata_dir):
    """OCR on a rendered PDF page finds the embedded hex marker string."""
    document = CompareImage(testdata_dir / 'sample_1_page.pdf')
    document.get_ocr_text_data()
    assert 'FB1DES0A3D5EFE2A60B0B1AE616C653' in document.text_content[0]['text']
def test_single_png(testdata_dir):
    """A PNG loads into a list with exactly one OpenCV image."""
    img = CompareImage(testdata_dir / 'text_big.png')
    assert len(img.opencv_images) == 1
    # isinstance is the idiomatic type check (type(x) == T rejects subclasses).
    assert isinstance(img.opencv_images, list)
    assert isinstance(img.opencv_images[0], numpy.ndarray)
def test_single_png_with_barcode(testdata_dir):
    """Barcode detection on a datamatrix image yields two placeholders."""
    barcode_image = CompareImage(
        testdata_dir / 'datamatrix.png',
        contains_barcodes=True,
    )
    assert len(barcode_image.placeholders) == 2
def test_single_pdf_without_barcode(testdata_dir):
    """Barcode detection on a barcode-free PDF yields no placeholders."""
    pdf_image = CompareImage(
        testdata_dir / 'sample_1_page.pdf',
        contains_barcodes=True,
    )
    assert len(pdf_image.placeholders) == 0
def check_for_differences(self, reference, candidate, i, detected_differences, compare_options, reference_pdf_content=None, candidate_pdf_content=None):
    """Compare one reference/candidate page pair and record differences.

    ``reference`` and ``candidate`` are OpenCV (BGR) images of page ``i``.
    The pages are compared via SSIM; if the dissimilarity exceeds
    ``self.threshold``, the differing regions are highlighted and logged,
    then re-evaluated against the tolerances configured in
    ``compare_options`` (watermark ignoring, text-content comparison,
    move tolerance). A confirmed difference appends ``True`` to the
    caller-owned ``detected_differences`` list; the method returns None.

    ``reference_pdf_content`` / ``candidate_pdf_content`` are the MuPDF
    page objects backing the images — only used when
    ``compare_options["get_pdf_content"]`` is True.

    Raises AssertionError if the two pages have different dimensions or
    if ``watermark_file`` is neither a path string nor a list of paths.
    """
    images_are_equal = True
    # Grayscale both pages in parallel — SSIM operates on single-channel data.
    with futures.ThreadPoolExecutor(max_workers=2) as parallel_executor:
        grayA_future = parallel_executor.submit(cv2.cvtColor, reference, cv2.COLOR_BGR2GRAY)
        grayB_future = parallel_executor.submit(cv2.cvtColor, candidate, cv2.COLOR_BGR2GRAY)
        grayA = grayA_future.result()
        grayB = grayB_future.result()
    # Different page dimensions make a pixel comparison meaningless — fail fast.
    if reference.shape[0] != candidate.shape[0] or reference.shape[1] != candidate.shape[1]:
        self.add_screenshot_to_log(reference, "_reference_page_" + str(i+1))
        self.add_screenshot_to_log(candidate, "_candidate_page_" + str(i+1))
        raise AssertionError(f'The compared images have different dimensions:\nreference:{reference.shape}\ncandidate:{candidate.shape}')
    # compute the Structural Similarity Index (SSIM) between the two
    # images, ensuring that the difference image is returned
    (score, diff) = metrics.structural_similarity(grayA, grayB, gaussian_weights=True, full=True)
    # Convert similarity (1.0 == identical) into a dissimilarity score so it
    # can be compared against self.threshold directly.
    score = abs(1-score)
    if self.take_screenshots:
        # Not necessary to take screenshots for every successful comparison
        self.add_screenshot_to_log(np.concatenate((reference, candidate), axis=1), "_page_" + str(i+1) + "_compare_concat")
    if (score > self.threshold):
        # Pages differ: build a binary mask of the differing pixels and draw
        # bounding rectangles around each differing region (contour).
        diff = (diff * 255).astype("uint8")
        thresh = cv2.threshold(diff, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1]
        reference_with_rect, candidate_with_rect , cnts= self.get_images_with_highlighted_differences(thresh, reference.copy(), candidate.copy(), extension=int(os.getenv('EXTENSION', 2)))
        blended_images = self.overlay_two_images(reference_with_rect, candidate_with_rect)
        cv2.putText(reference_with_rect,self.REFERENCE_LABEL, self.BOTTOM_LEFT_CORNER_OF_TEXT, self.FONT, self.FONT_SCALE, self.FONT_COLOR, self.LINE_TYPE)
        cv2.putText(candidate_with_rect,self.CANDIDATE_LABEL, self.BOTTOM_LEFT_CORNER_OF_TEXT, self.FONT, self.FONT_SCALE, self.FONT_COLOR, self.LINE_TYPE)
        self.add_screenshot_to_log(np.concatenate((reference_with_rect, candidate_with_rect), axis=1), "_page_" + str(i+1) + "_rectangles_concat")
        self.add_screenshot_to_log(blended_images, "_page_" + str(i+1) + "_blended")
        if self.show_diff:
            self.add_screenshot_to_log(np.concatenate((diff, thresh), axis=1), "_page_" + str(i+1) + "_diff")
        images_are_equal=False
        # --- Watermark handling -------------------------------------------
        # Either ignore a single centered difference that looks like a
        # watermark stamp, or mask out the areas covered by watermark file(s).
        if (compare_options["ignore_watermarks"] == True and len(cnts)==1) or compare_options["watermark_file"] is not None:
            if (compare_options["ignore_watermarks"] == True and len(cnts)==1):
                (x, y, w, h) = cv2.boundingRect(cnts[0])
                diff_center_x = abs((x+w/2)-(reference.shape[1]/2))
                diff_center_y = abs((y+h/2)-(reference.shape[0]/2))
                # Watermark heuristic: the single difference must sit near the
                # horizontal center and be small enough (mm via 25.4/DPI).
                if (diff_center_x < reference.shape[1] * self.WATERMARK_CENTER_OFFSET) and (w * 25.4 / self.DPI < self.WATERMARK_WIDTH) and (h * 25.4 / self.DPI < self.WATERMARK_HEIGHT):
                    images_are_equal=True
                    print("A watermark position was identified. After ignoring watermark area, both images are equal")
                    return
            if compare_options["watermark_file"] is not None:
                watermark_file = compare_options["watermark_file"]
                # A directory is expanded to all files inside; a single path
                # is wrapped so the loop below handles one uniform list.
                if isinstance(watermark_file, str):
                    if os.path.isdir(watermark_file):
                        watermark_file = [str(os.path.join(watermark_file, f)) for f in os.listdir(watermark_file) if os.path.isfile(os.path.join(watermark_file, f))]
                    else:
                        watermark_file = [watermark_file]
                if isinstance(watermark_file, list):
                    try:
                        for single_watermark in watermark_file:
                            try:
                                watermark = CompareImage(single_watermark, DPI=self.DPI).opencv_images[0]
                            # NOTE(review): bare except — deliberately best-effort
                            # (skip unloadable watermark files), but it also hides
                            # unexpected errors; consider `except Exception`.
                            except:
                                print(f'Watermark file {single_watermark} could not be loaded. Continue with next item.')
                                continue
                            watermark_gray = cv2.cvtColor(watermark, cv2.COLOR_BGR2GRAY)
                            # NOTE(review): multiplying an (presumably uint8)
                            # grayscale image by 255 and re-casting looks suspicious
                            # — confirm the expected dtype of opencv_images.
                            watermark_gray = (watermark_gray * 255).astype("uint8")
                            mask = cv2.threshold(watermark_gray, 10, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1]
                            mask = cv2.dilate(mask, None, iterations=1)
                            mask_inv = cv2.bitwise_not(mask)
                            # Remove the watermark area from the difference mask;
                            # shapes must match for the bitwise operation.
                            if thresh.shape[0:2] == mask_inv.shape[0:2]:
                                result = cv2.bitwise_and(thresh, thresh, mask=mask_inv)
                            else:
                                print(f"The shape of watermark and image are different. Continue with next item")
                                print(f"Document: {thresh.shape}\nMask: {mask_inv.shape}")
                                continue
                            if self.show_diff:
                                print(f"The diff after watermark removal")
                                self.add_screenshot_to_log(result, "_page_" + str(i + 1) + "_watermark_diff")
                            # If nothing differs outside the watermark area,
                            # the pages count as equal.
                            if cv2.countNonZero(result) == 0:
                                images_are_equal=True
                                print("A watermark file was provided. After removing watermark area, both images are equal")
                                return
                    # NOTE(review): bare except converts any failure into the
                    # format error below — consider `except Exception`.
                    except:
                        raise AssertionError('The provided watermark_file format is invalid. Please provide a path to a file or a list of files.')
                else:
                    raise AssertionError('The provided watermark_file format is invalid. Please provide a path to a file or a list of files.')
        # --- Text-content comparison --------------------------------------
        # Treat the pages as equal when the text inside every differing
        # region matches (OCR for images, MuPDF words for PDFs).
        if(compare_options["check_text_content"]==True) and images_are_equal is not True:
            if compare_options["get_pdf_content"] is not True:
                #x, y, w, h = self.get_diff_rectangle(thresh)
                images_are_equal=True
                for c in range(len(cnts)):
                    (x, y, w, h) = cv2.boundingRect(cnts[c])
                    diff_area_reference = reference[y:y+h, x:x+w]
                    diff_area_candidate = candidate[y:y+h, x:x+w]
                    self.add_screenshot_to_log(diff_area_reference, "_page_" + str(i+1) + "_diff_area_reference_"+str(c))
                    self.add_screenshot_to_log(diff_area_candidate, "_page_" + str(i+1) + "_diff_area_test_"+str(c))
                    # OCR both crops; --psm 6 assumes a uniform block of text.
                    text_reference = pytesseract.image_to_string(diff_area_reference, config='--psm 6').replace("\n\n", "\n")
                    text_candidate = pytesseract.image_to_string(diff_area_candidate, config='--psm 6').replace("\n\n", "\n")
                    if text_reference.strip()==text_candidate.strip():
                        print("Partial text content is the same")
                        print(text_reference)
                    else:
                        images_are_equal=False
                        detected_differences.append(True)
                        print("Partial text content is different")
                        print(text_reference + " is not equal to " + text_candidate)
            elif compare_options["get_pdf_content"] is True:
                images_are_equal=True
                ref_words = reference_pdf_content.get_text("words")
                cand_words = candidate_pdf_content.get_text("words")
                for c in range(len(cnts)):
                    (x, y, w, h) = cv2.boundingRect(cnts[c])
                    # Convert pixel coordinates into PDF points (72 dpi space).
                    rect = fitz.Rect(x*72/self.DPI, y*72/self.DPI, (x+w)*72/self.DPI, (y+h)*72/self.DPI)
                    diff_area_ref_words = [w for w in ref_words if fitz.Rect(w[:4]).intersects(rect)]
                    diff_area_cand_words = [w for w in cand_words if fitz.Rect(w[:4]).intersects(rect)]
                    diff_area_ref_words = make_text(diff_area_ref_words)
                    diff_area_cand_words = make_text(diff_area_cand_words)
                    diff_area_reference = reference[y:y+h, x:x+w]
                    diff_area_candidate = candidate[y:y+h, x:x+w]
                    self.add_screenshot_to_log(diff_area_reference, "_page_" + str(i+1) + "_diff_area_reference_"+str(c))
                    self.add_screenshot_to_log(diff_area_candidate, "_page_" + str(i+1) + "_diff_area_test_"+str(c))
                    if len(diff_area_ref_words)!=len(diff_area_cand_words):
                        images_are_equal=False
                        detected_differences.append(True)
                        print("The identified pdf layout elements are different", diff_area_ref_words, diff_area_cand_words)
                    else:
                        if diff_area_ref_words.strip() != diff_area_cand_words.strip():
                            images_are_equal=False
                            detected_differences.append(True)
                            print("Partial text content is different")
                            print(diff_area_ref_words.strip(), " is not equal to " ,diff_area_cand_words.strip())
                    if images_are_equal:
                        print("Partial text content of area is the same")
                        print(diff_area_ref_words)
                pass
        # --- Move-tolerance check -----------------------------------------
        # Treat the pages as equal when every differing region has merely
        # moved by at most `move_tolerance` pixels.
        if(compare_options["move_tolerance"]!=None) and images_are_equal is not True:
            move_tolerance=int(compare_options["move_tolerance"])
            images_are_equal=True
            if compare_options["get_pdf_content"] is not True:
                #Experimental, to solve a problem with small images
                #wr, hr, _ = reference.shape
                for c in range(len(cnts)):
                    (x, y, w, h) = cv2.boundingRect(cnts[c])
                    diff_area_reference = reference[y:y+h, x:x+w]
                    diff_area_candidate = candidate[y:y+h, x:x+w]
                    #Experimental, to solve a problem with small images
                    #search_area_candidate = candidate[(y - self.BORDER_FOR_MOVE_TOLERANCE_CHECK) if y >= self.BORDER_FOR_MOVE_TOLERANCE_CHECK else 0:(y + h + self.BORDER_FOR_MOVE_TOLERANCE_CHECK) if hr >= (y + h + self.BORDER_FOR_MOVE_TOLERANCE_CHECK) else hr, (x - self.BORDER_FOR_MOVE_TOLERANCE_CHECK) if x >= self.BORDER_FOR_MOVE_TOLERANCE_CHECK else 0:(x + w + self.BORDER_FOR_MOVE_TOLERANCE_CHECK) if wr >= (x + w + self.BORDER_FOR_MOVE_TOLERANCE_CHECK) else wr]
                    # Expand the crop by a border so the template search has
                    # room to find a shifted section.
                    search_area_candidate = candidate[y - self.BORDER_FOR_MOVE_TOLERANCE_CHECK:y + h + self.BORDER_FOR_MOVE_TOLERANCE_CHECK, x - self.BORDER_FOR_MOVE_TOLERANCE_CHECK:x + w + self.BORDER_FOR_MOVE_TOLERANCE_CHECK]
                    search_area_reference = reference[y - self.BORDER_FOR_MOVE_TOLERANCE_CHECK:y + h + self.BORDER_FOR_MOVE_TOLERANCE_CHECK, x - self.BORDER_FOR_MOVE_TOLERANCE_CHECK:x + w + self.BORDER_FOR_MOVE_TOLERANCE_CHECK]
                    # self.add_screenshot_to_log(search_area_candidate)
                    # self.add_screenshot_to_log(search_area_reference)
                    # self.add_screenshot_to_log(diff_area_candidate)
                    # self.add_screenshot_to_log(diff_area_reference)
                    try:
                        positions_in_compare_image = self.find_partial_image_position(search_area_candidate, diff_area_reference)
                    # NOTE(review): bare except — any lookup failure counts as
                    # a difference; consider `except Exception`.
                    except:
                        print("Error in finding position in compare image")
                        images_are_equal=False
                        detected_differences.append(True)
                        continue
                    #positions_in_compare_image = self.find_partial_image_position(candidate, diff_area_reference)
                    # All-white crops cannot be located reliably — count them
                    # as a difference outright.
                    if (np.mean(diff_area_reference) == 255) or (np.mean(diff_area_candidate) == 255):
                        images_are_equal=False
                        detected_differences.append(True)
                        print("Image section contains only white background")
                        self.add_screenshot_to_log(np.concatenate((cv2.copyMakeBorder(diff_area_reference, top=2, bottom=2, left=2, right=2, borderType=cv2.BORDER_CONSTANT, value=[0,0,0]), cv2.copyMakeBorder(diff_area_candidate, top=2, bottom=2, left=2, right=2, borderType=cv2.BORDER_CONSTANT, value=[0,0,0])), axis=1), "_diff_area_concat")
                        #self.add_screenshot_to_log(np.concatenate((diff_area_reference, diff_area_candidate), axis=1), "_diff_area_concat")
                    else:
                        if positions_in_compare_image:
                            #pt_original = (x, y)
                            pt_original = positions_in_compare_image['pt1']
                            pt_compare = positions_in_compare_image['pt2']
                            x_moved = abs(pt_original[0]-pt_compare[0])
                            y_moved = abs(pt_original[1]-pt_compare[1])
                            # Euclidean displacement of the section in pixels.
                            move_distance = math.sqrt(x_moved** 2 +y_moved ** 2)
                            #cv2.arrowedLine(candidate_with_rect, pt_original, pt_compare, (255, 0, 0), 4)
                            if int(move_distance)>int(move_tolerance):
                                print("Image section moved ",move_distance, " pixels")
                                print("This is outside of the allowed range of ",move_tolerance, " pixels")
                                images_are_equal=False
                                detected_differences.append(True)
                                self.add_screenshot_to_log(self.overlay_two_images(search_area_reference, search_area_candidate), "_diff_area_blended")
                            else:
                                print("Image section moved ",move_distance, " pixels")
                                print("This is within the allowed range of ",move_tolerance, " pixels")
                                self.add_screenshot_to_log(self.overlay_two_images(search_area_reference, search_area_candidate), "_diff_area_blended")
                        else:
                            images_are_equal=False
                            detected_differences.append(True)
                            print("The reference image section was not found in test image (or vice versa)")
                            self.add_screenshot_to_log(np.concatenate((cv2.copyMakeBorder(diff_area_reference, top=2, bottom=2, left=2, right=2, borderType=cv2.BORDER_CONSTANT, value=[0,0,0]), cv2.copyMakeBorder(diff_area_candidate, top=2, bottom=2, left=2, right=2, borderType=cv2.BORDER_CONSTANT, value=[0,0,0])), axis=1), "_diff_area_concat")
            elif compare_options["get_pdf_content"] is True:
                images_are_equal=True
                ref_words = reference_pdf_content.get_text("words")
                cand_words = candidate_pdf_content.get_text("words")
                for c in range(len(cnts)):
                    (x, y, w, h) = cv2.boundingRect(cnts[c])
                    # Convert pixel coordinates into PDF points (72 dpi space).
                    rect = fitz.Rect(x*72/self.DPI, y*72/self.DPI, (x+w)*72/self.DPI, (y+h)*72/self.DPI)
                    diff_area_ref_words = [w for w in ref_words if fitz.Rect(w[:4]).intersects(rect)]
                    diff_area_cand_words = [w for w in cand_words if fitz.Rect(w[:4]).intersects(rect)]
                    # diff_area_ref_words = make_text(diff_area_ref_words)
                    # diff_area_cand_words = make_text(diff_area_cand_words)
                    diff_area_reference = reference[y:y+h, x:x+w]
                    diff_area_candidate = candidate[y:y+h, x:x+w]
                    self.add_screenshot_to_log(diff_area_reference, "_page_" + str(i+1) + "_diff_area_reference_"+str(c))
                    self.add_screenshot_to_log(diff_area_candidate, "_page_" + str(i+1) + "_diff_area_test_"+str(c))
                    if len(diff_area_ref_words)!=len(diff_area_cand_words):
                        images_are_equal=False
                        detected_differences.append(True)
                        print("The identified pdf layout elements are different", diff_area_ref_words, diff_area_cand_words)
                    else:
                        # MuPDF word tuples: (x0, y0, x1, y1, text, ...).
                        # Identical tuples didn't move; same text with
                        # different coordinates is checked edge by edge.
                        for ref_Item, cand_Item in zip(diff_area_ref_words, diff_area_cand_words):
                            if ref_Item == cand_Item:
                                pass
                            elif str(ref_Item[4]).strip() == str(cand_Item[4]).strip():
                                left_moved = abs(ref_Item[0]-cand_Item[0])*self.DPI/72
                                top_moved = abs(ref_Item[1]-cand_Item[1])*self.DPI/72
                                right_moved = abs(ref_Item[2]-cand_Item[2])*self.DPI/72
                                bottom_moved = abs(ref_Item[3]-cand_Item[3])*self.DPI/72
                                print("Checking pdf elements", ref_Item, cand_Item)
                                if int(left_moved)>int(move_tolerance) or int(top_moved)>int(move_tolerance) or int(right_moved)>int(move_tolerance) or int(bottom_moved)>int(move_tolerance):
                                    print("Image section moved ",left_moved, top_moved, right_moved, bottom_moved, " pixels")
                                    print("This is outside of the allowed range of ",move_tolerance, " pixels")
                                    images_are_equal=False
                                    detected_differences.append(True)
                                    self.add_screenshot_to_log(self.overlay_two_images(diff_area_reference, diff_area_candidate), "_diff_area_blended")
                                else:
                                    print("Image section moved ",left_moved, top_moved, right_moved, bottom_moved, " pixels")
                                    print("This is within the allowed range of ",move_tolerance, " pixels")
                                    self.add_screenshot_to_log(self.overlay_two_images(diff_area_reference, diff_area_candidate), "_diff_area_blended")
    # Record the final verdict for the caller's accumulator.
    if images_are_equal is not True:
        detected_differences.append(True)
def test_big_text_from_image(testdata_dir):
    """OCR on a large-font image recognizes the full alphabet."""
    large_text_image = CompareImage(testdata_dir / 'text_big.png')
    large_text_image.get_ocr_text_data()
    assert 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' in large_text_image.text_content[0]['text']
def test_white_text_on_dark_background(testdata_dir):
    """OCR recognizes light text on a dark background."""
    inverted_image = CompareImage(testdata_dir / 'whitetext_blackbackground.png')
    inverted_image.get_ocr_text_data()
    assert '0123456789' in inverted_image.text_content[0]['text']
def test_text_on_colored_background(testdata_dir):
    """OCR recognizes text printed over a colored photo background."""
    colored_image = CompareImage(testdata_dir / 'Beach_date.png')
    colored_image.get_ocr_text_data()
    recognized_words = colored_image.text_content[0]['text']
    assert "01-Jan-2021" in recognized_words
    assert "123456789" in recognized_words
def test_non_existing_file(testdata_dir):
    """Loading a file that does not exist raises an AssertionError."""
    with pytest.raises(AssertionError):
        CompareImage(testdata_dir / 'does_not_exist.png')
def test_corrupt_pdf(testdata_dir):
    """Loading a corrupted PDF raises an AssertionError."""
    with pytest.raises(AssertionError):
        CompareImage(testdata_dir / 'corrupt_pdf.pdf')
def test_pdf_text_mask(testdata_dir):
    """A PDF pattern mask resolves to exactly three placeholders."""
    pdf_image = CompareImage(
        testdata_dir / 'sample_1_page.pdf',
        placeholder_file=testdata_dir / 'pdf_pattern_mask.json',
    )
    assert len(pdf_image.placeholders) == 3
    masked_image = pdf_image.get_image_with_placeholders()
    assert pdf_image != masked_image
def test_image_text_content_with_east(testdata_dir):
    """EAST text detection finds the date somewhere in the recognized strings."""
    beach_image = CompareImage(testdata_dir / 'Beach_date.png')
    beach_image.get_text_content_with_east()
    detected_strings = beach_image.text_content[0]['text']
    assert any('01-Jan-2021' in candidate for candidate in detected_strings)
def test_small_text_from_image(testdata_dir):
    """OCR on a small-font image recognizes the digit sequence."""
    small_text_image = CompareImage(testdata_dir / 'text_small.png')
    small_text_image.get_ocr_text_data()
    assert '1234567890' in small_text_image.text_content[0]['text']