def _bboxPolygon(box):
    """Return an axis-aligned rectangular Polygon for an OCR text box.

    ``box`` holds the corner coordinates [x_min, y_min, x_max, y_max].
    """
    return Polygon([(box[0], box[1]), (box[2], box[1]),
                    (box[2], box[3]), (box[0], box[3])])


def ruleBasedMapping(type_semantic_seg_rule, type_OCR_rule, type_scene_rule,
                     list_semantic_segmentation, list_OCR, list_scene,
                     needed_elements):
    """Select the polygons to obfuscate from segmentation and OCR outputs.

    Parameters
    ----------
    type_semantic_seg_rule : str
        Rule for segmentation polygons; only "simple_list" is implemented.
    type_OCR_rule : str
        Rule for OCR polygons: "simple_rule" (digit count, named entities,
        fuzzy dictionary matches) or "simplest_rule" (any non-blank text).
    type_scene_rule, list_scene
        Currently unused; kept for interface compatibility.
    list_semantic_segmentation : list
        Items shaped (polygons, label); each element of ``polygons`` is a
        sequence whose first entry exposes an ``area`` attribute
        (shapely polygon).
    list_OCR : list
        Items shaped (box, candidate_strings) with ``box`` given as
        [x_min, y_min, x_max, y_max].
    needed_elements : dict
        Must provide a "lines" list of sensitive words for fuzzy matching
        when ``type_OCR_rule == "simple_rule"``.

    Returns
    -------
    list
        Shapely Polygon objects covering the regions to obfuscate.
    """
    polys_to_obfuscate = []

    if list_semantic_segmentation and type_semantic_seg_rule == "simple_list":
        print("Dealing with the polygons from semantic segmentation.")
        # ADE20K/DeepLab class names considered privacy sensitive.
        # NOTE(review): several labels were previously split across two
        # list entries (e.g. "bus, ... motorbus, " / "motorcoach, ...",
        # "computer ..." / "electronic computer ...", and the truncated
        # television label), so they could never equal the label emitted
        # by the segmentation model; they are re-joined here using
        # implicit string concatenation. "signboard sign" was also missing
        # its comma ("signboard, sign" in the ADE20K label list).
        list_private_deeplab_labels = [
            "person, individual, someone, somebody, mortal, soul",
            "car, auto, automobile, machine, motorcar",
            "bus, autobus, coach, charabanc, double-decker, jitney, motorbus, "
            "motorcoach, omnibus, passenger vehicle",
            "truck, motortruck",
            "van",
            "conveyer belt, conveyor belt, conveyer, conveyor, transporter",
            "minibike, motorbike",
            "bicycle, bike, wheel, cycle",
            "poster, posting, placard, notice, bill, card",
            "signboard, sign",
            "bulletin board, notice board",
            "screen door, screen",
            "screen, silver screen, projection screen",
            "crt screen",
            "plate",
            "monitor, monitoring device",
            "bookcase",
            "blind, screen",
            "book",
            "computer, computing machine, computing device, data processor, "
            "electronic computer, information processing system",
            "television receiver, television, television set, tv, tv set, "
            "idiot box, boob tube, telly, goggle box",
            "trade name, brand name, brand, marque",
            "flag",
        ]
        for poly in list_semantic_segmentation:
            # TODO: add filter per confidence score.
            if poly[1] in list_private_deeplab_labels:
                for poly_elem in poly[0]:
                    # Keep only polygons large enough to actually show
                    # anything on the image.
                    if poly_elem[0].area > 4.0:
                        polys_to_obfuscate.append(poly_elem[0])

    if list_OCR and type_OCR_rule == "simple_rule":
        print("Dealing with the polygons from OCR.")
        for text_recognized in list_OCR:
            poly_text = text_recognized[0]
            possible_values = text_recognized[1]
            for potential_value in possible_values:
                # Ignore strings that are just one character or whitespace.
                string_without_space = potential_value.translate(
                    {ord(c): None for c in string.whitespace})
                if len(string_without_space) <= 1:
                    continue
                # Obfuscate values containing many digits (years, birth
                # dates, phone numbers...).  The threshold is a parameter
                # to tune: 4 digits corresponds to a year.
                # NOTE(review): this was previously summed over
                # set(potential_value), i.e. it counted *distinct* digit
                # characters ("1111" -> 1), letting repeated-digit numbers
                # through; it now counts occurrences as the comment above
                # always intended.
                nb_digit = sum(1 for ch in potential_value if ch.isdigit())
                if nb_digit > 3:
                    polys_to_obfuscate.append(_bboxPolygon(poly_text))
                    break
                # Obfuscate anything recognized as a location,
                # organization, person or geopolitical entity.
                continuous_chunk_1 = OCR_u.NERWithOldStanford(potential_value)
                continuous_chunk_2 = OCR_u.NERNewVersion(potential_value)
                list_recognized_entities = (
                    [chunk[1] for chunk in continuous_chunk_1]
                    + [chunk[1] for chunk in continuous_chunk_2])
                if any(tag in list_recognized_entities
                       for tag in ("LOCATION", "PERSON", "ORGANIZATION", "GPE")):
                    polys_to_obfuscate.append(_bboxPolygon(poly_text))
                    break
                # Obfuscate words fuzzy-matching the provided sensitive
                # word list, in any capitalization.
                list_words = []
                for value in potential_value.split():
                    list_words += [value, value.upper(),
                                   value.lower(), value.title()]
                found_match = False
                for word in list_words:
                    similar_words = difflib.get_close_matches(
                        word, needed_elements["lines"], n=3, cutoff=0.9)
                    if len(similar_words) > 0:
                        polys_to_obfuscate.append(_bboxPolygon(poly_text))
                        found_match = True
                        break
                if found_match:
                    break
    elif list_OCR and type_OCR_rule == "simplest_rule":
        print("Dealing with the polygons from OCR.")
        # Obfuscate all text that is different from "".
        for text_recognized in list_OCR:
            poly_text = text_recognized[0]
            for potential_value in text_recognized[1]:
                if potential_value.strip():
                    polys_to_obfuscate.append(_bboxPolygon(poly_text))
                    break

    return polys_to_obfuscate
def _ocrBoxPolygon(box):
    """Return an axis-aligned rectangular Polygon for an OCR text box
    given as [x_min, y_min, x_max, y_max]."""
    return Polygon([(box[0], box[1]), (box[2], box[1]),
                    (box[2], box[3]), (box[0], box[3])])


def postProcessOCROutputs(OCR_outputs, needed_elements):
    """Tag OCR detections that look privacy sensitive.

    Parameters
    ----------
    OCR_outputs : list
        Items shaped (box, candidate_strings) with ``box`` given as
        [x_min, y_min, x_max, y_max].
    needed_elements : dict
        Must provide "lines_names" and "lines_location" word lists used
        for fuzzy matching.

    Returns
    -------
    list
        Tuples (Polygon, matched_string, reason) where ``reason`` is one
        of "hasNumbers", "LOCATION", "PERSON", "ORGANIZATION", "GPE".
    """
    OCR_processed_output = []
    for text_recognized in OCR_outputs:
        poly_text = text_recognized[0]
        # Deduplicate candidate readings of the same text box.
        possible_values = list(set(text_recognized[1]))
        for potential_value in possible_values:
            # Ignore strings that are just one character or whitespace.
            string_without_space = potential_value.translate(
                {ord(c): None for c in string.whitespace})
            if len(string_without_space) <= 1:
                continue
            # Flag values containing many digits (years, birth dates,
            # phone numbers...).  The threshold is a parameter to tune:
            # 4 digits corresponds to a year.
            # NOTE(review): this was previously summed over
            # set(potential_value), i.e. it counted *distinct* digit
            # characters ("1111" -> 1); it now counts occurrences as the
            # comment always intended.
            nb_digit = sum(1 for ch in potential_value if ch.isdigit())
            if nb_digit > 3:
                OCR_processed_output.append(
                    (_ocrBoxPolygon(poly_text), potential_value, "hasNumbers"))
                break
            # Flag values recognized as named entities by either NER
            # backend; tag priority preserved from the original elif
            # ladder: LOCATION > PERSON > ORGANIZATION > GPE.
            continuous_chunk_1 = OCR_u.NERWithOldStanford(potential_value)
            continuous_chunk_2 = OCR_u.NERNewVersion(potential_value)
            list_recognized_entities = (
                [chunk[1] for chunk in continuous_chunk_1]
                + [chunk[1] for chunk in continuous_chunk_2])
            entity_tag = next(
                (tag for tag in ("LOCATION", "PERSON", "ORGANIZATION", "GPE")
                 if tag in list_recognized_entities), None)
            if entity_tag is not None:
                OCR_processed_output.append(
                    (_ocrBoxPolygon(poly_text), potential_value, entity_tag))
                break
            # Flag words fuzzy-matching the provided name/location lists,
            # in any capitalization.
            list_words = []
            for value in potential_value.split():
                list_words += [value, value.upper(),
                               value.lower(), value.title()]
            matched = False
            for word in list_words:
                # Design choice: skip short words to avoid spurious
                # fuzzy matches.
                if len(word) <= 3:
                    continue
                if difflib.get_close_matches(
                        word, needed_elements["lines_names"],
                        n=3, cutoff=0.9):
                    OCR_processed_output.append(
                        (_ocrBoxPolygon(poly_text), potential_value, "PERSON"))
                    matched = True
                    break
                if difflib.get_close_matches(
                        word, needed_elements["lines_location"],
                        n=3, cutoff=0.9):
                    OCR_processed_output.append(
                        (_ocrBoxPolygon(poly_text), potential_value, "LOCATION"))
                    matched = True
                    break
            if matched:
                break
    return OCR_processed_output