def _bboxPolygon(box):
    """Return an axis-aligned rectangular Polygon for an OCR text box.

    ``box`` holds the corner coordinates [x_min, y_min, x_max, y_max].
    """
    return Polygon([(box[0], box[1]), (box[2], box[1]),
                    (box[2], box[3]), (box[0], box[3])])


def ruleBasedMapping(type_semantic_seg_rule, type_OCR_rule, type_scene_rule,
                     list_semantic_segmentation, list_OCR, list_scene,
                     needed_elements):
    """Select the polygons to obfuscate from segmentation and OCR outputs.

    Parameters
    ----------
    type_semantic_seg_rule : str
        Rule for segmentation polygons; only "simple_list" is implemented.
    type_OCR_rule : str
        Rule for OCR polygons: "simple_rule" (digit count, named entities,
        fuzzy dictionary matches) or "simplest_rule" (any non-blank text).
    type_scene_rule, list_scene
        Currently unused; kept for interface compatibility.
    list_semantic_segmentation : list
        Items shaped (polygons, label); each element of ``polygons`` is a
        sequence whose first entry exposes an ``area`` attribute
        (shapely polygon).
    list_OCR : list
        Items shaped (box, candidate_strings) with ``box`` given as
        [x_min, y_min, x_max, y_max].
    needed_elements : dict
        Must provide a "lines" list of sensitive words for fuzzy matching
        when ``type_OCR_rule == "simple_rule"``.

    Returns
    -------
    list
        Shapely Polygon objects covering the regions to obfuscate.
    """
    polys_to_obfuscate = []

    if list_semantic_segmentation and type_semantic_seg_rule == "simple_list":
        print("Dealing with the polygons from semantic segmentation.")
        # ADE20K/DeepLab class names considered privacy sensitive.
        # NOTE(review): several labels were previously split across two
        # list entries (e.g. "bus, ... motorbus, " / "motorcoach, ...",
        # "computer ..." / "electronic computer ...", and the truncated
        # television label), so they could never equal the label emitted
        # by the segmentation model; they are re-joined here using
        # implicit string concatenation. "signboard sign" was also missing
        # its comma ("signboard, sign" in the ADE20K label list).
        list_private_deeplab_labels = [
            "person, individual, someone, somebody, mortal, soul",
            "car, auto, automobile, machine, motorcar",
            "bus, autobus, coach, charabanc, double-decker, jitney, motorbus, "
            "motorcoach, omnibus, passenger vehicle",
            "truck, motortruck",
            "van",
            "conveyer belt, conveyor belt, conveyer, conveyor, transporter",
            "minibike, motorbike",
            "bicycle, bike, wheel, cycle",
            "poster, posting, placard, notice, bill, card",
            "signboard, sign",
            "bulletin board, notice board",
            "screen door, screen",
            "screen, silver screen, projection screen",
            "crt screen",
            "plate",
            "monitor, monitoring device",
            "bookcase",
            "blind, screen",
            "book",
            "computer, computing machine, computing device, data processor, "
            "electronic computer, information processing system",
            "television receiver, television, television set, tv, tv set, "
            "idiot box, boob tube, telly, goggle box",
            "trade name, brand name, brand, marque",
            "flag",
        ]
        for poly in list_semantic_segmentation:
            # TODO: add filter per confidence score.
            if poly[1] in list_private_deeplab_labels:
                for poly_elem in poly[0]:
                    # Keep only polygons large enough to actually show
                    # anything on the image.
                    if poly_elem[0].area > 4.0:
                        polys_to_obfuscate.append(poly_elem[0])

    if list_OCR and type_OCR_rule == "simple_rule":
        print("Dealing with the polygons from OCR.")
        for text_recognized in list_OCR:
            poly_text = text_recognized[0]
            possible_values = text_recognized[1]
            for potential_value in possible_values:
                # Ignore strings that are just one character or whitespace.
                string_without_space = potential_value.translate(
                    {ord(c): None for c in string.whitespace})
                if len(string_without_space) <= 1:
                    continue
                # Obfuscate values containing many digits (years, birth
                # dates, phone numbers...).  The threshold is a parameter
                # to tune: 4 digits corresponds to a year.
                # NOTE(review): this was previously summed over
                # set(potential_value), i.e. it counted *distinct* digit
                # characters ("1111" -> 1), letting repeated-digit numbers
                # through; it now counts occurrences as the comment above
                # always intended.
                nb_digit = sum(1 for ch in potential_value if ch.isdigit())
                if nb_digit > 3:
                    polys_to_obfuscate.append(_bboxPolygon(poly_text))
                    break
                # Obfuscate anything recognized as a location,
                # organization, person or geopolitical entity.
                continuous_chunk_1 = OCR_u.NERWithOldStanford(potential_value)
                continuous_chunk_2 = OCR_u.NERNewVersion(potential_value)
                list_recognized_entities = (
                    [chunk[1] for chunk in continuous_chunk_1]
                    + [chunk[1] for chunk in continuous_chunk_2])
                if any(tag in list_recognized_entities
                       for tag in ("LOCATION", "PERSON", "ORGANIZATION", "GPE")):
                    polys_to_obfuscate.append(_bboxPolygon(poly_text))
                    break
                # Obfuscate words fuzzy-matching the provided sensitive
                # word list, in any capitalization.
                list_words = []
                for value in potential_value.split():
                    list_words += [value, value.upper(),
                                   value.lower(), value.title()]
                found_match = False
                for word in list_words:
                    similar_words = difflib.get_close_matches(
                        word, needed_elements["lines"], n=3, cutoff=0.9)
                    if len(similar_words) > 0:
                        polys_to_obfuscate.append(_bboxPolygon(poly_text))
                        found_match = True
                        break
                if found_match:
                    break
    elif list_OCR and type_OCR_rule == "simplest_rule":
        print("Dealing with the polygons from OCR.")
        # Obfuscate all text that is different from "".
        for text_recognized in list_OCR:
            poly_text = text_recognized[0]
            for potential_value in text_recognized[1]:
                if potential_value.strip():
                    polys_to_obfuscate.append(_bboxPolygon(poly_text))
                    break

    return polys_to_obfuscate
def _ocrBoxPolygon(box):
    """Return an axis-aligned rectangular Polygon for an OCR text box
    given as [x_min, y_min, x_max, y_max]."""
    return Polygon([(box[0], box[1]), (box[2], box[1]),
                    (box[2], box[3]), (box[0], box[3])])


def postProcessOCROutputs(OCR_outputs, needed_elements):
    """Tag OCR detections that look privacy sensitive.

    Parameters
    ----------
    OCR_outputs : list
        Items shaped (box, candidate_strings) with ``box`` given as
        [x_min, y_min, x_max, y_max].
    needed_elements : dict
        Must provide "lines_names" and "lines_location" word lists used
        for fuzzy matching.

    Returns
    -------
    list
        Tuples (Polygon, matched_string, reason) where ``reason`` is one
        of "hasNumbers", "LOCATION", "PERSON", "ORGANIZATION", "GPE".
    """
    OCR_processed_output = []
    for text_recognized in OCR_outputs:
        poly_text = text_recognized[0]
        # Deduplicate candidate readings of the same text box.
        possible_values = list(set(text_recognized[1]))
        for potential_value in possible_values:
            # Ignore strings that are just one character or whitespace.
            string_without_space = potential_value.translate(
                {ord(c): None for c in string.whitespace})
            if len(string_without_space) <= 1:
                continue
            # Flag values containing many digits (years, birth dates,
            # phone numbers...).  The threshold is a parameter to tune:
            # 4 digits corresponds to a year.
            # NOTE(review): this was previously summed over
            # set(potential_value), i.e. it counted *distinct* digit
            # characters ("1111" -> 1); it now counts occurrences as the
            # comment always intended.
            nb_digit = sum(1 for ch in potential_value if ch.isdigit())
            if nb_digit > 3:
                OCR_processed_output.append(
                    (_ocrBoxPolygon(poly_text), potential_value, "hasNumbers"))
                break
            # Flag values recognized as named entities by either NER
            # backend; tag priority preserved from the original elif
            # ladder: LOCATION > PERSON > ORGANIZATION > GPE.
            continuous_chunk_1 = OCR_u.NERWithOldStanford(potential_value)
            continuous_chunk_2 = OCR_u.NERNewVersion(potential_value)
            list_recognized_entities = (
                [chunk[1] for chunk in continuous_chunk_1]
                + [chunk[1] for chunk in continuous_chunk_2])
            entity_tag = next(
                (tag for tag in ("LOCATION", "PERSON", "ORGANIZATION", "GPE")
                 if tag in list_recognized_entities), None)
            if entity_tag is not None:
                OCR_processed_output.append(
                    (_ocrBoxPolygon(poly_text), potential_value, entity_tag))
                break
            # Flag words fuzzy-matching the provided name/location lists,
            # in any capitalization.
            list_words = []
            for value in potential_value.split():
                list_words += [value, value.upper(),
                               value.lower(), value.title()]
            matched = False
            for word in list_words:
                # Design choice: skip short words to avoid spurious
                # fuzzy matches.
                if len(word) <= 3:
                    continue
                if difflib.get_close_matches(
                        word, needed_elements["lines_names"],
                        n=3, cutoff=0.9):
                    OCR_processed_output.append(
                        (_ocrBoxPolygon(poly_text), potential_value, "PERSON"))
                    matched = True
                    break
                if difflib.get_close_matches(
                        word, needed_elements["lines_location"],
                        n=3, cutoff=0.9):
                    OCR_processed_output.append(
                        (_ocrBoxPolygon(poly_text), potential_value, "LOCATION"))
                    matched = True
                    break
            if matched:
                break
    return OCR_processed_output