def get_input_to_mapping(input_image_file, methods_to_account, dict_preparation):
    inputs_to_mapping_semantic_segmentation = []
    inputs_to_mapping_OCR = []
    inputs_to_mapping_scene = []
    for automatic_method in methods_to_account:
        if automatic_method == "semantic_segmentation":
            # Get the outputs of the semantic segmentation.
            print("TODO: choose whether to get the probability")
            list_img = []
            list_img_name = []
            list_img_ratio = []
            list_img_shapes = []
            list_output = []
            img, img_name, img_ratio, img_shapes = eval_u.prepare_sample(
                Path(input_image_file), 'deeplab', True)
            # Get the predictions.
            model_deeplab = eval_u.load_model('deeplab')
            output_pred_deeplab = eval_u.get_predictions([img], 'deeplab', model_deeplab)
            # Post-process the predictions into polygons.
            inputs = dl_u.deeplab_pred_to_output(output_pred_deeplab[0][1], False, True,
                                                 output_pred_deeplab[0][0], True, img_shapes)
            list_img.append(img)
            list_img_name.append(img_name)
            list_img_ratio.append(img_ratio)
            list_img_shapes.append(img_shapes)
            list_output.append(inputs)
            inputs_to_mapping_semantic_segmentation.append(inputs)
        elif automatic_method == "OCR":
            # Get the outputs of the Optical Character Recognition.
            output_pred_OCR = eval_u.get_predictions([str(input_image_file)], 'OCR',
                                                     True, False)
            # Correct for misspellings.
            inputs = OCR_u.accountForMisspellings(output_pred_OCR[0],
                                                  dict_preparation["words_dict"],
                                                  dict_preparation["ss"])
            inputs_to_mapping_OCR.append(inputs)
        elif automatic_method == "scene_recognition":
            output_pred = eval_u.get_predictions([str(input_image_file)], 'vgg_places365',
                                                 True, False, dict_preparation)
            inputs_to_mapping_scene.append(output_pred)
    return (inputs_to_mapping_semantic_segmentation, inputs_to_mapping_OCR,
            inputs_to_mapping_scene)
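# Illustrative usage sketch (not part of the pipeline): the image path below is
# hypothetical, and running it assumes the deeplab/OCR models loaded through
# eval_u are available locally.
def _demo_get_input_to_mapping(image_file="samples/office.jpg"):
    preparation = prepare_needed_elements(["OCR"])
    seg_inputs, ocr_inputs, scene_inputs = get_input_to_mapping(
        image_file, ["semantic_segmentation", "OCR"], preparation)
    print(len(seg_inputs), "segmentation outputs,", len(ocr_inputs), "OCR outputs")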
def decode_pdf(pdf_filepath, image_directory, image_format='PNG'):
    if Path(pdf_filepath).suffix != '.pdf':
        raise Exception("Must specify a '.pdf' file extension for input pdf_filepath.")
    image_filepaths = OCR_utils.pdf_pages_to_images(pdf_filepath, image_directory,
                                                    image_format=image_format)
    output = PdfFileWriter()
    pdf_page_fps = []
    for image_fp in image_filepaths:
        pdf_page_fps.append(OCR_utils.image_to_pdf(image_fp))  # Create a pdf from the image
        file = PdfFileReader(open(pdf_page_fps[-1], "rb"))  # Open the image's pdf
        output.addPage(file.getPage(0))  # Add the page to the new document
    new_filepath = str(Path(pdf_filepath).parent /
                       (Path(pdf_filepath).stem + '_decoded' + Path(pdf_filepath).suffix))
    with open(new_filepath, "wb") as outputStream:
        output.write(outputStream)
    # Delete the temporary image files created.
    for fp in image_filepaths:
        os.remove(fp)
    return new_filepath
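# Illustrative usage sketch: "decoding" rasterises every page and rebuilds the
# pdf from the page images, which strips the text layer and any embedded
# content. The input path is hypothetical.
def _demo_decode_pdf(pdf_path="samples/report.pdf"):
    decoded_path = decode_pdf(pdf_path, image_directory=str(Path(pdf_path).parent))
    print("Decoded copy written to", decoded_path)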
def remove_greyscale_watermark(PDF_file_path, to_black_upperbound, to_white_lowerbound,
                               compression_factor=1, replacement_watermark='',
                               replacement_watermark_font='Arial',
                               replacement_watermark_text_size=20,
                               replacement_watermark_colour=(50, 50, 50, 255),
                               replacement_watermark_text_center=(200, 200),
                               replacement_watermark_rotation_angle=0,
                               output_file_path='', jpg_quality=75):
    image_fps = OCR_utils.pdf_pages_to_images(PDF_file_path, str(Path(PDF_file_path).parent),
                                              'BMP', compression_factor=compression_factor)
    mod_image_fps = []
    for image_fp in image_fps:
        im = Image.open(image_fp)
        pix, s = im.load(), im.size
        # Threshold every greyscale pixel: push near-white pixels to pure white and
        # near-black pixels to pure black, erasing the mid-grey watermark in between.
        for i in range(s[0]):
            for j in range(s[1]):
                col = pix[i, j]
                if col[0] >= to_white_lowerbound:
                    pix[i, j] = (255, 255, 255)
                elif col[0] <= to_black_upperbound:
                    pix[i, j] = (0, 0, 0)
        if replacement_watermark:
            fp, im = np.add_text_line_to_image(
                im, replacement_watermark, replacement_watermark_text_center,
                text_size=replacement_watermark_text_size, text_box_pixel_width=0,
                RGBA=replacement_watermark_colour, text_background_RGBA=(0, 0, 0, 0),
                text_box_RGBA=(0, 0, 0, 0),
                rot_degrees=replacement_watermark_rotation_angle,
                font_name=replacement_watermark_font, show_result=False)
        im.save(image_fp[:-4] + '_mod.jpg', quality=jpg_quality)
        mod_image_fps.append(image_fp[:-4] + '_mod.jpg')
    OCR_utils.images_to_pdf(mod_image_fps, output_file_path=output_file_path)
    # Delete the temporary image files created.
    for fp in image_fps + mod_image_fps:
        os.remove(fp)
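# Illustrative usage sketch: the thresholds are assumptions to tune per
# document. If the watermark renders at grey ~170 and the real content at
# grey ~80 (values observed while probing a sample page), then
# to_white_lowerbound=150 bleaches the watermark to white while
# to_black_upperbound=100 keeps the content crisp black.
def _demo_remove_watermark(pdf_path="samples/watermarked.pdf"):
    remove_greyscale_watermark(
        pdf_path, to_black_upperbound=100, to_white_lowerbound=150,
        replacement_watermark="CONFIDENTIAL",
        output_file_path=pdf_path[:-4] + "_clean.pdf")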
def prepare_needed_elements(methods_to_account):
    if "OCR" in methods_to_account:
        ss, words_dict = OCR_u.prepareDictForMisspellings()
        return {"ss": ss, "words_dict": words_dict}
    # Return an empty mapping (rather than None) when OCR is not requested.
    return {}
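# Illustrative usage sketch: build the dictionary once per run and pass it to
# every OCR call, since preparing the misspelling dictionary is presumably the
# expensive step.
def _demo_prepare_needed_elements():
    preparation = prepare_needed_elements(["OCR"])
    print(sorted(preparation.keys()))  # expected: ['ss', 'words_dict']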
def ruleBasedMapping(type_semantic_seg_rule, type_OCR_rule, type_scene_rule,
                     list_semantic_segmentation, list_OCR, list_scene, needed_elements):
    polys_to_obfuscate = []
    if len(list_semantic_segmentation) > 0:
        if type_semantic_seg_rule == "simple_list":
            print("Dealing with the polygons from semantic segmentation.")
            # Label strings considered privacy-sensitive.
            list_private_deeplab_labels = [
                "person, individual, someone, somebody, mortal, soul",
                "car, auto, automobile, machine, motorcar",
                "bus, autobus, coach, charabanc, double-decker, jitney, motorbus, ",
                "motorcoach, omnibus, passenger vehicle",
                "truck, motortruck", "van",
                "conveyer belt, conveyor belt, conveyer, conveyor, transporter",
                "minibike, motorbike",
                "bicycle, bike, wheel, cycle",
                "poster, posting, placard, notice, bill, card",
                "signboard sign",
                "bulletin board, notice board",
                "screen door, screen",
                "screen, silver screen, projection screen",
                "crt screen", "plate",
                "monitor, monitoring device",
                "bookcase", "blind, screen", "book",
                "computer, computing machine, computing device, data processor ",
                "electronic computer, information processing system",
                "television receiver, television, television set, tv, tv set, idiot ",
                "trade name, brand name, brand, marque", "flag"]
            # TODO: add a filter on confidence score.
            for poly in list_semantic_segmentation:
                if poly[1] in list_private_deeplab_labels:
                    for poly_elem in poly[0]:
                        # Only keep polygons large enough to actually show
                        # anything on the images.
                        if poly_elem[0].area > 4.0:
                            polys_to_obfuscate.append(poly_elem[0])
    if len(list_OCR) > 0:
        if type_OCR_rule == "simple_rule":
            print("Dealing with the polygons from OCR.")
            for text_recognized in list_OCR:
                poly_text = text_recognized[0]
                possible_values = text_recognized[1]
                for potential_value in possible_values:
                    # Skip strings that are just one letter or whitespace.
                    string_without_space = potential_value.translate(
                        {ord(c): None for c in string.whitespace})
                    if len(string_without_space) > 1:
                        # Obfuscate any number: count the digits in the string.
                        nb_digit = sum(1 for x in set(potential_value) if x.isdigit())
                        # Threshold to tune: 4 digits matches a year; birth dates
                        # contain at least 6 and phone numbers even more.
                        if nb_digit > 3:
                            polys_to_obfuscate.append(
                                Polygon([(poly_text[0], poly_text[1]),
                                         (poly_text[2], poly_text[1]),
                                         (poly_text[2], poly_text[3]),
                                         (poly_text[0], poly_text[3])]))
                            break
                        # Obfuscate any element recognized as a location,
                        # organization or person.
                        continuous_chunk_1 = OCR_u.NERWithOldStanford(potential_value)
                        continuous_chunk_2 = OCR_u.NERNewVersion(potential_value)
                        list_recognized_entities_1 = [chunk[1] for chunk in continuous_chunk_1]
                        list_recognized_entities_2 = [chunk[1] for chunk in continuous_chunk_2]
                        list_recognized_entities = (list_recognized_entities_1 +
                                                    list_recognized_entities_2)
                        if ("LOCATION" in list_recognized_entities) or \
                           ("PERSON" in list_recognized_entities) or \
                           ("ORGANIZATION" in list_recognized_entities) or \
                           ("GPE" in list_recognized_entities):
                            polys_to_obfuscate.append(
                                Polygon([(poly_text[0], poly_text[1]),
                                         (poly_text[2], poly_text[1]),
                                         (poly_text[2], poly_text[3]),
                                         (poly_text[0], poly_text[3])]))
                            break
                        # Obfuscate elements appearing in a list of names or locations.
                        words = potential_value.split()
                        list_words = []
                        for value in words:
                            list_words += [value, value.upper(), value.lower(), value.title()]
                        for word in list_words:
                            # Check each word from the extracted strings for similarity.
                            similar_words = difflib.get_close_matches(
                                word, needed_elements["lines"], n=3, cutoff=0.9)
                            if len(similar_words) > 0:
                                polys_to_obfuscate.append(
                                    Polygon([(poly_text[0], poly_text[1]),
                                             (poly_text[2], poly_text[1]),
                                             (poly_text[2], poly_text[3]),
                                             (poly_text[0], poly_text[3])]))
                                break
                        if len(similar_words) > 0:
                            break
                        # TODO: obfuscate elements that are next to "name" or "date".
        elif type_OCR_rule == "simplest_rule":
            print("Dealing with the polygons from OCR.")
            # Obfuscate all text that is different from "".
            for text_recognized in list_OCR:
                poly_text = text_recognized[0]
                possible_values = text_recognized[1]
                for potential_value in possible_values:
                    if potential_value.strip():
                        polys_to_obfuscate.append(
                            Polygon([(poly_text[0], poly_text[1]),
                                     (poly_text[2], poly_text[1]),
                                     (poly_text[2], poly_text[3]),
                                     (poly_text[0], poly_text[3])]))
                        break
    return polys_to_obfuscate
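# Illustrative usage sketch tying the steps above together: run the automatic
# detectors, then map their outputs to polygons to obfuscate. The watchlist
# under "lines" is a stub; in practice it would be loaded from a file of
# sensitive names/locations.
def _demo_rule_based_mapping(image_file="samples/office.jpg"):
    preparation = prepare_needed_elements(["OCR"])
    seg, ocr, scene = get_input_to_mapping(
        image_file, ["semantic_segmentation", "OCR"], preparation)
    needed = {"lines": ["Alice", "Springfield"]}  # stub watchlist
    polys = ruleBasedMapping("simple_list", "simple_rule", None,
                             seg[0] if seg else [], ocr[0] if ocr else [],
                             scene, needed)
    print(len(polys), "regions to obfuscate")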
def search_in_presentations(self):
    ppt_instance = win32com.client.Dispatch('PowerPoint.Application')
    slide_counter = 0
    for index_file, file_path in enumerate(
            self.results['files_to_search_inside']['presentation']):
        print('Searching in presentation file {} of {}...'.format(
            index_file + 1,
            len(self.results['files_to_search_inside']['presentation'])))
        read_only, has_title, window = False, False, False
        prs = ppt_instance.Presentations.open(file_path, read_only, has_title, window)
        self.results['file_slide_sizes'][file_path] = (prs.PageSetup.SlideWidth,
                                                       prs.PageSetup.SlideHeight)
        for index_slide, Slide in enumerate(prs.Slides):
            for index_shape, Shape in enumerate(Slide.Shapes):
                slide_string = 'Slide ' + str(index_slide + 1)
                object_string = 'Object ' + str(index_shape + 1)
                if Shape.HasTextFrame and Shape.TextFrame.HasText:
                    # Drop paragraphs that only contain a carriage return.
                    paragraphs_specialchars_removed = [
                        p.Text for p in Shape.TextFrame.TextRange.Paragraphs()
                        if p.Text != '\r']
                    for index_paragraph, Paragraph in enumerate(
                            paragraphs_specialchars_removed):
                        for search_string in self.parameters['search_strings']:
                            occurrences = nm.count_text_occurrences(
                                Paragraph, search_string,
                                self.parameters['case_sensitive'],
                                self.parameters['whole_phrase_only'])
                            if occurrences > 0:
                                slide_counter += 1
                                file_store = self.results['containing_file_paths'][
                                    search_string]['presentation']
                                if str(file_path) not in file_store:
                                    file_store[str(file_path)] = {}
                                paragraph_string = 'Paragraph ' + str(index_paragraph + 1)
                                occurrences_string = '{} occurrence{}'.format(
                                    occurrences, '' if occurrences == 1 else 's')
                                combined_string = ', '.join(
                                    [object_string, paragraph_string, occurrences_string])
                                if slide_string in file_store[str(file_path)]:
                                    file_store[str(file_path)][slide_string].append(
                                        combined_string)
                                else:
                                    file_store[str(file_path)][slide_string] = [
                                        combined_string]
                # Shape types corresponding to pictures, charts and other
                # image-like objects: export them and run OCR on the result.
                if (Shape.Type in [3, 21, 28, 11, 13] and
                        self.parameters['search_in_doc_images'] and
                        self.parameters['allow_OCR']):
                    img_fp = str(self.temp_directory / '{}_{}_{}.jpg'.format(
                        str(Path(file_path).stem).replace('.', ''),
                        slide_string, object_string))
                    Shape.Export(img_fp, 3)
                    try:
                        image_text = OCR_utils.image_to_text(img_fp, language='eng')
                    except Exception:
                        image_text = ''
                    for search_string in self.parameters['search_strings']:
                        occurrences = nm.count_text_occurrences(
                            image_text, search_string,
                            self.parameters['case_sensitive'],
                            self.parameters['whole_phrase_only'])
                        if occurrences > 0:
                            occurrences_string = '{} occurrence{}'.format(
                                occurrences, '' if occurrences == 1 else 's')
                            combined_string = (object_string + ' (image), ' +
                                               occurrences_string)
                            file_store = self.results['containing_file_paths'][
                                search_string]['presentation']
                            if str(file_path) not in file_store:
                                file_store[str(file_path)] = {}
                            if slide_string in file_store[str(file_path)]:
                                file_store[str(file_path)][slide_string].append(
                                    combined_string)
                            else:
                                file_store[str(file_path)][slide_string] = [
                                    combined_string]
                    os.remove(img_fp)
def search_in_pdfs(self, file_path):
    # Read the text with tika and parse it into individual pages with BeautifulSoup.
    # TODO: test with a 1-page pdf.
    pages = pdf_utils.tika_read(file_path)
    # If any page's text contains 4 or more consecutive symbol characters, the
    # file is probably encrypted.
    file_probably_encrypted = False
    for page_text in pages:
        if page_text is None or pdf_utils.is_probably_encrypted(
                page_text, n_consecutive_symbols=4):
            file_probably_encrypted = True
            break
    # 1. If probably encrypted and OCR is allowed, analyse the pages with
    #    Tesseract OCR (Optical Character Recognition).
    if file_probably_encrypted and self.parameters['allow_OCR']:
        self.results['pdf_reading_steps'][file_path].append('encrypted: used page OCR')
        # Convert the pdf pages to image files and keep a list of their filepaths.
        page_image_filepaths = OCR_utils.pdf_pages_to_images(
            file_path, self.temp_directory, 'jpg')
        # Use OCR on each page to get a text string for each.
        for i, image_fp in enumerate(page_image_filepaths):
            page_text = OCR_utils.image_to_text(image_fp, language='eng')
            for search_string in self.parameters['search_strings']:
                line_numbers = nm.count_text_occurrences(
                    page_text, search_string,
                    self.parameters['case_sensitive'],
                    self.parameters['whole_phrase_only'],
                    get_line_numbers=True)
                if len(line_numbers) > 0:
                    file_store = self.results['containing_file_paths'][
                        search_string]['fancytext']
                    if str(file_path) not in file_store:
                        file_store[str(file_path)] = {}
                    file_store[str(file_path)]['page ' + str(i + 1)] = line_numbers
        # Delete the temporary image files created.
        for fp in page_image_filepaths:
            os.remove(fp)
    # 2. If probably encrypted but OCR cannot be used, record the failure.
    elif file_probably_encrypted and not self.parameters['allow_OCR']:
        self.results['failed_file_paths']['fancytext'][str(file_path)] = (
            'File appears to be encrypted and OCR has not been allowed/is not available.')
        self.results['pdf_reading_steps'][file_path].append('encrypted: OCR not allowed')
    # 3. If probably not encrypted, analyse the tika text of each page.
    else:
        self.results['pdf_reading_steps'][file_path].append('unencrypted: analyse tika text')
        for i, page_text in enumerate(pages):
            for search_string in self.parameters['search_strings']:
                line_numbers = nm.count_text_occurrences(
                    page_text, search_string,
                    self.parameters['case_sensitive'],
                    self.parameters['whole_phrase_only'],
                    get_line_numbers=True)
                if len(line_numbers) > 0:
                    file_store = self.results['containing_file_paths'][
                        search_string]['fancytext']
                    if str(file_path) not in file_store:
                        file_store[str(file_path)] = {}
                    file_store[str(file_path)]['page ' + str(i + 1)] = line_numbers
        # Check if the pdf has any images - if desired, these can be analysed
        # separately with OCR (not needed in the encrypted case).
        if self.parameters['search_in_doc_images'] and self.parameters['allow_OCR']:
            self.results['pdf_reading_steps'][file_path].append(
                'unencrypted: search in images')
            n_images, saved_image_filepaths = pdf_utils.count_extract_pdf_images(
                file_path, save_images=True)
            if n_images > 0:
                for j, image_fp in enumerate(saved_image_filepaths):
                    image_text = OCR_utils.image_to_text(image_fp, language='eng')
                    for search_string in self.parameters['search_strings']:
                        occurrences = nm.count_text_occurrences(
                            image_text, search_string,
                            self.parameters['case_sensitive'],
                            self.parameters['whole_phrase_only'])
                        if occurrences > 0:
                            page_number = Path(file_path).stem.split('_page_')[-1]
                            file_store = self.results['containing_file_paths'][
                                search_string]['fancytext']
                            if str(file_path) not in file_store:
                                file_store[str(file_path)] = {}
                            file_store[str(file_path)][
                                'image {} on page {}'.format(j + 1, page_number)] = (
                                '{} occurrences'.format(occurrences))
def postProcessOCROutputs(OCR_outputs, needed_elements):
    OCR_processed_output = []

    def bounding_polygon(poly_text):
        # Rectangle polygon from the (x_min, y_min, x_max, y_max) text box.
        return Polygon([(poly_text[0], poly_text[1]),
                        (poly_text[2], poly_text[1]),
                        (poly_text[2], poly_text[3]),
                        (poly_text[0], poly_text[3])])

    for text_recognized in OCR_outputs:
        poly_text = text_recognized[0]
        possible_values = list(set(text_recognized[1]))
        for potential_value in possible_values:
            # Skip strings that are just one letter or whitespace.
            string_without_space = potential_value.translate(
                {ord(c): None for c in string.whitespace})
            if len(string_without_space) > 1:
                # Flag any number: count the digits in the string.
                nb_digit = sum(1 for x in set(potential_value) if x.isdigit())
                # Threshold to tune: 4 digits matches a year; birth dates contain
                # at least 6 and phone numbers even more.
                if nb_digit > 3:
                    OCR_processed_output.append(
                        (bounding_polygon(poly_text), potential_value, "hasNumbers"))
                    break
                # Flag any element recognized as a location, organization or person.
                continuous_chunk_1 = OCR_u.NERWithOldStanford(potential_value)
                continuous_chunk_2 = OCR_u.NERNewVersion(potential_value)
                list_recognized_entities = (
                    [chunk[1] for chunk in continuous_chunk_1] +
                    [chunk[1] for chunk in continuous_chunk_2])
                entity_found = False
                for entity in ("LOCATION", "PERSON", "ORGANIZATION", "GPE"):
                    if entity in list_recognized_entities:
                        OCR_processed_output.append(
                            (bounding_polygon(poly_text), potential_value, entity))
                        entity_found = True
                        break
                if entity_found:
                    break
                # Flag elements appearing in a list of names or locations.
                words = potential_value.split()
                list_words = []
                for value in words:
                    list_words += [value, value.upper(), value.lower(), value.title()]
                for word in list_words:
                    similar_words = ""
                    # Design choice: skip words of 3 characters or fewer.
                    if len(word) > 3:
                        # Check each word from the extracted strings for similarity.
                        similar_words = difflib.get_close_matches(
                            word, needed_elements["lines_names"], n=3, cutoff=0.9)
                        if len(similar_words) > 0:
                            OCR_processed_output.append(
                                (bounding_polygon(poly_text), potential_value, "PERSON"))
                            break
                        similar_words = difflib.get_close_matches(
                            word, needed_elements["lines_location"], n=3, cutoff=0.9)
                        if len(similar_words) > 0:
                            OCR_processed_output.append(
                                (bounding_polygon(poly_text), potential_value, "LOCATION"))
                            break
                if len(similar_words) > 0:
                    break
    return OCR_processed_output
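# Illustrative usage sketch: each flagged text box comes back with the reason
# it was flagged, which makes the output easy to review. The OCR tuple format
# assumed here is ((x_min, y_min, x_max, y_max), [candidate strings]); the
# number-heavy string is caught by the digit rule before any NER model (which
# the other branches would require) is consulted.
def _demo_post_process_ocr():
    fake_ocr = [((10, 10, 120, 30), ["Call 0123 456 789"])]
    needed = {"lines_names": ["Alice"], "lines_location": ["Springfield"]}
    for poly, value, reason in postProcessOCROutputs(fake_ocr, needed):
        print(reason, value, poly.bounds)  # -> hasNumbers Call 0123 456 789 ...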