Exemple #1
0
class TableHandler(object):

    def __init__(self):
        """Load the configuration and prepare the conditional printer and word filter."""
        self.config = ConfigurationHandler(first_init=False).get_config()
        cfg = self.config
        self.cpr = ConditionalPrint(
            cfg.PRINT_TABLE_HANDLER,
            cfg.PRINT_EXCEPTION_LEVEL,
            cfg.PRINT_WARNING_LEVEL,
        )
        # when True, recognize_a_line appends accepted lines to "checkfile_tables.txt"
        self.PRINT_TO_CHECKFILE = False
        # a line whose first word is one of these can't be part of a table
        self.filter_start_words = [
            "Fernruf:", "Vorstand:", "Fernschreiber:",
            "von", "Gründung:", "Ordnungsnr.", "Ordnungsnr",
            "Grundkapital:", "Umstellung",
        ]
        # NOTE: the check file is not truncated here; it is only ever appended to.

    def recognize_a_line(self, line):
        """
        Decide heuristically whether a line looks like a table row.

        Character statistics of the whole line, per-word statistics and the
        horizontal distances between the word boxes are collected and compared
        against fixed thresholds.

        :param line: line object exposing ``textstr``, ``word['text']``,
            ``word['UID']``, ``data['word_x0']`` and ``data['word_x1']``
        :return: True if the line is a possible table entry, False otherwise
            (also for None/boolean placeholders and lines without characters)
        """
        # placeholders (None / booleans) and lines without text are never table rows
        if line is None or isinstance(line, bool) or line.textstr is None:
            return False

        whole_text = line.textstr
        self.cpr.print("recognizing line:", whole_text)

        # line-wide counters
        counter_special_chars = 0
        counter_alphanumerical_chars = 0
        counter_numbers = 0
        counter_chars = len(whole_text)
        counter_words = 0
        # per-word statistics, one entry per non-empty word
        counters_alphabetical_ratios = []
        counters_wordlengths = []
        counters_numbers = []

        # offset into the coordinate arrays of line.data; advanced by the
        # number of UIDs of every processed (non-empty) word
        character_index = 0
        # special conditions
        ultimo_is_first_word = False
        first_word_no_table_indicator = False
        starts_with_parenthesis = False
        ends_with_parenthesis = False

        last_xstop = 0
        x_box_sizes = []
        x_gaps = []
        words = line.word['text']
        last_key_index = len(words) - 1
        for key_index, key in enumerate(words):
            word = words[key]
            uid_info = line.word['UID'][key]
            word_xstart = line.data['word_x0'][character_index]
            word_xstop = line.data['word_x1'][character_index]
            x_box_sizes.append(word_xstop - word_xstart)

            if key_index >= 1:
                # NOTE(review): the distance is taken between the right edges of
                # consecutive boxes (not previous right edge -> current left edge);
                # the 115 threshold below relies on this definition - confirm.
                x_gaps.append(word_xstop - last_xstop)

            if word is None or word == "":
                continue

            if key_index == 0:
                if word in self.filter_start_words:
                    first_word_no_table_indicator = True
                if word.lower() == "ultimo":
                    ultimo_is_first_word = True
                if word[0] == "(":
                    starts_with_parenthesis = True

            if key_index == last_key_index and word[-1] == ")":
                ends_with_parenthesis = True

            counter_alphanumerical_chars_word = 0
            counter_numbers_word = 0
            counter_words += 1

            for char in word:
                if Random.is_special_character(char):
                    counter_special_chars += 1
                elif Random.is_alphanumerical_character(char):
                    counter_alphanumerical_chars += 1
                    counter_alphanumerical_chars_word += 1
                if char.isdigit():
                    counter_numbers += 1
                    counter_numbers_word += 1

            counter_alphabetical_word = counter_alphanumerical_chars_word - counter_numbers_word
            counters_alphabetical_ratios.append(np.round(counter_alphabetical_word / len(word), 2))
            counters_wordlengths.append(len(word))
            counters_numbers.append(counter_numbers_word)
            character_index += len(uid_info)
            last_xstop = word_xstop

        # number of spaces = all characters minus the non-space characters
        counter_spaces = counter_chars - len(whole_text.replace(" ", ""))
        # alphabetical characters = alphanumerical characters without the digits
        counter_alphabetical = counter_alphanumerical_chars - counter_numbers

        if counter_chars == 0:
            self.cpr.printw("no chars shouldn't happen, no recognizion")
            return False

        special_chars_ratio = counter_special_chars / counter_chars
        alphanumerical_chars_ratio = counter_alphanumerical_chars / counter_chars
        alphabetical_ratio = counter_alphabetical / counter_chars
        spaces_ratio = counter_spaces / counter_chars
        numbers_ratio = counter_numbers / counter_chars

        # the gap statistics stay None when the line has fewer than two words
        maximum_x_gap = None
        mean_x_gap = None
        median_x_gap = None
        if len(x_gaps) >= 1:
            maximum_x_gap = max(x_gaps)
            mean_x_gap = np.mean(x_gaps)
            median_x_gap = np.median(x_gaps)

        many_numbers_in_first_word = False
        many_alphabetical_in_middle_words = False
        many_alphabetical_in_last_word = False

        # check some first, middle and last word conditions
        last_counter_index = len(counters_wordlengths) - 1
        for counter_index, word_length in enumerate(counters_wordlengths):
            if counter_index == 0:
                if np.round(counters_numbers[counter_index] / word_length, 2) > 0.8:
                    many_numbers_in_first_word = True
                continue
            # only longer, mostly alphabetical words count for the other positions
            if word_length < 4 or counters_alphabetical_ratios[counter_index] < 0.75:
                continue
            if counter_index == last_counter_index:
                many_alphabetical_in_last_word = True
            else:
                many_alphabetical_in_middle_words = True

        self.cpr.print("alle cntr:", counter_chars)
        self.cpr.print("spec cntr:", counter_special_chars, "ratio", special_chars_ratio)
        self.cpr.print("alnr cntr:", counter_alphanumerical_chars, "ratio", alphanumerical_chars_ratio)
        self.cpr.print("albt cntr:", counter_alphabetical, "ratio", alphabetical_ratio)
        self.cpr.print("spce cntr:", counter_spaces, "ratio", spaces_ratio)
        self.cpr.print("nmbr cntr:", counter_numbers, "ratio", numbers_ratio)
        self.cpr.print("x_box_sizes", x_box_sizes)
        self.cpr.print("x_gaps", x_gaps)
        self.cpr.print("x_gap_max_size", maximum_x_gap)
        self.cpr.print("x_gaps_mean", mean_x_gap)
        self.cpr.print("x_gaps_median", median_x_gap)

        has_table_statistics = (alphabetical_ratio < 0.75
                                and numbers_ratio > 0.2
                                and counter_chars > 5
                                and counter_words >= 2)
        is_parenthesized = starts_with_parenthesis and ends_with_parenthesis
        if not ((has_table_statistics and not is_parenthesized) or ultimo_is_first_word):
            return False

        if first_word_no_table_indicator:
            return False
        # A single-word line (reachable via the "ultimo" rule) has no gaps at all;
        # comparing None with a number would raise TypeError, and without gaps the
        # spacing criterion cannot be fulfilled.
        if mean_x_gap is None or mean_x_gap <= 115:
            return False
        if many_alphabetical_in_last_word:
            return False
        if many_alphabetical_in_middle_words and many_numbers_in_first_word:
            return False

        self.cpr.print("possible entry:", whole_text)

        if self.PRINT_TO_CHECKFILE:
            with open("checkfile_tables.txt", "a") as myfile:
                myfile.write(whole_text + "||| max x_gap: " + str(maximum_x_gap)
                             + "||| mean x_gap: " + str(mean_x_gap)
                             + "||| median x_gap: " + str(median_x_gap) + "\n")

        return True
class EndobjectFactory(object):
    """
    Creates an object with the following structure and provides exporting methods:

    segment_tag_1: [                ---> this level is created by set_current_main_list
        {
            type: "Sitz"            ---> add entries on this level with add_to_my_obj, object_number=0
            city: "Neustadt"
        },
        {
            type: "Sitz"            ---> add entries on this level with add_to_my_obj, object_number=1
            city: "Neustadt"
        }

    ],
    segment_tag_2: [
        {
            ...
        }
        ...
    ]
    """
    def __init__(self):
        """Start with an empty result object and load the config-dependent helpers."""
        # segment_tag -> list of result dicts
        self.my_object = {}
        # shortcut to the list that add_to_my_obj currently writes to
        self.current_main_list = None
        self.pp = pprint.PrettyPrinter(indent=5)

        self.config = ConfigurationHandler(first_init=False).get_config()
        cfg = self.config
        self.cpr = ConditionalPrint(
            cfg.PRINT_OUTPUT_ANALYSIS,
            cfg.PRINT_EXCEPTION_LEVEL,
            cfg.PRINT_WARNING_LEVEL,
            leading_tag=type(self).__name__,
        )

        # the known-uncategories helper only exists when tags are subtracted in the diff
        if cfg.REMOVE_TAGS_IN_ORIG_DIFF:
            self.known_uc = KnownUncategories()

    def set_current_main_list(self, segment_tag):
        """
        Select the list stored under ``segment_tag`` as the current main list.

        The list is created (empty) on first use; all subsequent
        ``add_to_my_obj`` calls write into it.

        :param segment_tag: key of the segment in the result object
        """
        self.current_main_list = self.my_object.setdefault(segment_tag, [])

    def add_to_my_obj(self, key, value, object_number=0, only_filled=False):
        """
        Store ``value`` under ``key`` in one object of the current main list.

        :param key: key within the addressed object
        :param value: value to store
        :param object_number: index of the object in the current main list;
            missing objects up to this index are created as empty dicts
        :param only_filled: if True, empty values (None, "", [], {}) are not stored
        :return: True if the value was stored, False if it was skipped as empty
        """
        if only_filled is True and any(value == empty for empty in (None, "", [], {})):
            return False

        # pad the main list so that the requested object index exists
        main_list = self.current_main_list
        missing = object_number + 1 - len(main_list)
        if missing > 0:
            main_list.extend({} for _ in range(missing))

        self.cpr.print("Adding value to List,- ObjectNr.:", object_number,"Key:", key, "Value:", value)
        # add to or overwrite in the addressed object
        main_list[object_number][key] = value
        return True

    def print_me_and_return(self):
        """Pretty-print the complete result object to stdout and return it."""
        result = self.my_object
        print("my_object is:")
        self.pp.pprint(result)
        return result

    def print_current_main(self):
        """Pretty-print the currently selected main list to stdout."""
        current = self.current_main_list
        print("current_main:")
        self.pp.pprint(current)

    def export_as_json(self):
        """
        Serialize the complete result object.

        :return: JSON string (indent of 5, non-ASCII characters kept as they are)
        """
        return json.dumps(self.my_object, indent=5, ensure_ascii=False)

    def export_as_json_at_key(self, key, remove_first_object=False):
        """
        Serialize the list stored under one segment key.

        :param key: segment tag in the result object
        :param remove_first_object: if True, the first list element (which
            usually contains generic info) is left out of the export
        :return: JSON string, or None if the key is not in the result object
        """
        try:
            entries = self.my_object[key]
        except KeyError:
            return None

        if remove_first_object and len(entries) >= 1:
            entries = entries[1:]

        return json.dumps(entries, indent=5, ensure_ascii=False)

    @staticmethod
    def fetch_subentries_recursive_check(entry):
        """
        Fetches all subentries (values) from an entry and writes them to a list of texts
        This get's called recursively within the function until all subentries
        are found
        :param entry: entry to fetch the subentries from
        :return: list of subentries
        """
        final_texts = []

        for item in entry:
            if isinstance(entry, list):
                value = item
            else:
                # item is a key
                value = entry[item]
            if isinstance(value, str):
                final_texts.append(value)
            elif isinstance(value, int):
                final_texts.append(str(value))
            elif isinstance(value, object):
                obj_size = len(value)
                if obj_size > 0:
                    recursive_texts = EndobjectFactory.fetch_subentries_recursive_check(value)
                    final_texts.extend(recursive_texts)

        return final_texts

    @staticmethod
    def fetch_keys_recusive_check(entry, final_keys, create_multiple=True):
        """
        Fetches all keys in an object and it's sub-objects
        calls itself recursively until all keys are found
        writes final keys to final_keys array and returns this
        :param entry: object to fetch the sub-keys from
        :param final_keys: list of final keys (initial state)
        :param create_multiple: if the same key occurs multiple times it still gets added
        :return: final_keys with added keys from object
        """

        if isinstance(entry, list):
            for item in entry:
                final_keys = EndobjectFactory.fetch_keys_recusive_check(item, final_keys, create_multiple)
            return final_keys
        elif not isinstance(entry, dict):
            # just return if there are no keys (cause no dictionary)
            return final_keys

        for key in entry:
            value = entry[key]
            if create_multiple or key not in final_keys:
                if isinstance(key, int):
                    continue
                final_keys.append(key)
            final_keys = EndobjectFactory.fetch_keys_recusive_check(value, final_keys)
        return final_keys

    def diff_seg_to_orig_at_key(self, key):
        """
        Placeholder for the diff between the segmented texts and the original text.

        The actual computation is disabled (see the todo below), so apart from
        emitting warnings this method does nothing and always returns None.
        ``diff_parsed_to_orig_at_key`` provides a working subtraction of the
        parsed entries from the original text.

        :param key: segment tag in the result object
        :return: always None
        """
        if key not in self.my_object:
            return None

        my_data = self.my_object[key]

        # check if the orig-post property can exist, warn if not
        if not self.config.ADD_INFO_ENTRY_TO_OUTPUT:
            self.cpr.printw("trying to fetch original data, original data is not added to results")
            self.cpr.printw("toggle ADD_INFO_ENTRY_TO_OUTPUT in config to True")
        if len(my_data) <= 0:
            self.cpr.printw("no data to do returning")
            return

        # todo this seems to be wrong
        # The former implementation (subtracting all sub-entry texts, longest
        # first, from my_data[0]['origpost']) sat behind this return and was never
        # executed; it also called a helper that is not defined anywhere.
        return

    def diff_parsed_to_orig_at_key(self, key):
        """
        Subtract everything that was parsed for a segment from its original text.

        The first object stored under ``key`` provides the original text
        (``'origpost'``); the texts of all following objects - and, if
        REMOVE_TAGS_IN_ORIG_DIFF is set, also their keys - are removed from it
        one occurrence at a time, longest text first.

        :param key: segment tag in the result object
        :return: tuple ``(rest_text, original_text)``; None if the key is
            unknown or no data is stored under it
        """
        if key not in self.my_object:
            return None

        segment_data = self.my_object[key]

        # the origpost entry only exists if the info entry is written, so warn if not
        if not self.config.ADD_INFO_ENTRY_TO_OUTPUT:
            self.cpr.printw("trying to fetch original data, original data is not added to results")
            self.cpr.printw("toggle ADD_INFO_ENTRY_TO_OUTPUT in config to True")
        if len(segment_data) <= 0:
            self.cpr.printw("no data to do returning")
            return

        original_text = segment_data[0]['origpost']
        rest_text = original_text
        parsed_objects = segment_data[1:]

        # texts of all parsed entries
        pool_entries = []
        for parsed in parsed_objects:
            pool_entries.extend(EndobjectFactory.fetch_subentries_recursive_check(parsed))

        # spaces are often a problem when subtracting, optionally drop them on both sides
        strip_spaces = self.config.REMOVE_SPACES_IN_ORIGIN_DIFF is True
        if strip_spaces:
            rest_text = rest_text.replace(" ", "")
            pool_entries = [text.replace(" ", "") for text in pool_entries]

        # marker 1 = parsed text, marker 2 = key
        tagged_entries = [(text, 1) for text in pool_entries]

        if self.config.REMOVE_TAGS_IN_ORIG_DIFF:
            # a key is collected once per occurrence for the 1 by 1 subtraction
            pool_keys = []
            for parsed in parsed_objects:
                pool_keys = EndobjectFactory.fetch_keys_recusive_check(parsed, pool_keys, create_multiple=True)
            if strip_spaces:
                pool_keys = [pool_key.replace(" ", "") for pool_key in pool_keys]
            tagged_entries.extend((pool_key, 2) for pool_key in pool_keys)

        # longest first; stable sort followed by reverse, so entries of equal
        # length end up in reversed insertion order
        ordered_entries = sorted(tagged_entries, key=lambda pair: len(pair[0]))
        ordered_entries.reverse()

        for text, marker in ordered_entries:
            if marker == 2 and text in self.known_uc.unkeys:
                continue
            # stripped texts fit better; only one occurrence is removed per entry
            rest_text = rest_text.replace(text.strip(), "", 1).strip()

        return rest_text, original_text
Exemple #3
0
class SegmentParser(object):
    """
    Parse the classified segments one by one; each segment tag is
    mapped to the function that parses this kind of segment.
    """
    def __init__(self,
                 output_analyzer,
                 dictionary_handler,
                 ocromore_data=None):
        """
        Create the result factory and the tag -> parsing function map.

        :param output_analyzer: passed on to the function map
        :param dictionary_handler: stored and passed on to the function map
        :param ocromore_data: accepted but not used here
        """
        self.ef = EndobjectFactory()
        self.dictionary_handler = dictionary_handler

        # maps tags to parsing functions -> change this constructor for other projects
        function_mapper = FunctionMapAKF(self.ef, output_analyzer, dictionary_handler)

        self.config = ConfigurationHandler(first_init=False).get_config()
        cfg = self.config
        self.cpr = ConditionalPrint(cfg.PRINT_SEGMENT_PARSER,
                                    cfg.PRINT_EXCEPTION_LEVEL,
                                    cfg.PRINT_WARNING_LEVEL,
                                    leading_tag=type(self).__name__)

        self.function_map = function_mapper.get_function_map()
        self.result_root = cfg.OUTPUT_ROOT_PATH + "/results/"

    def clear_result(self,
                     output_analyzer,
                     dictionary_handler,
                     ocromore_data=None):
        """
        Drop the collected results and start over with a fresh result factory.

        The function map is rebuilt as well so that it refers to the new factory.

        :param output_analyzer: passed on to the function map
        :param dictionary_handler: passed on to the function map
        :param ocromore_data: accepted but not used here
        """
        fresh_factory = EndobjectFactory()
        self.ef = fresh_factory
        self.function_map = FunctionMapAKF(
            fresh_factory, output_analyzer, dictionary_handler).get_function_map()

    def parse_segments(self, ocromore_data):
        """
        Run the mapped parsing function for every recognized segment.

        Depending on the configuration the full text, additional information
        and the base data for the segment-to-original diff are added first.

        :param ocromore_data: file data; ``'segmentation'`` is always read,
            ``'lines'``, ``'additional_info'`` and ``'file_info'`` only if the
            corresponding options are active
        :return: the same ``ocromore_data`` with the result factory stored
            under ``'results'``
        """
        self.ocromore_data = ocromore_data
        cfg = self.config
        segmentation_classes = ocromore_data['segmentation'].my_classes

        # add all text from the original file if activated (i.e. for debugging purposes)
        if cfg.ADD_FULLTEXT_ENTRY:
            all_texts = self.get_all_text(ocromore_data)
            self.ef.set_current_main_list("overall_info")
            self.ef.add_to_my_obj("fulltexts", all_texts)

        # add additional info to the result
        if cfg.ADDITIONAL_INFORMATION and cfg.ADD_ADDITIONAL_INFO:
            if not cfg.ADD_FULLTEXT_ENTRY:
                self.ef.set_current_main_list("Information")
            self.ef.add_to_my_obj("additionals",
                                  ocromore_data["additional_info"])

        # keep a copy of the original text from which the analysis later subtracts
        if cfg.LOG_SEGMENTED_TO_ORIG_DIFF_PER_FILE:
            if not cfg.ADD_FULLTEXT_ENTRY:
                self.cpr.printw(
                    "activated segment to orig diff, but no saving of origin activate ADD_FULLTEXT_ENTRY "
                    "in config for this functionality")
            else:
                analysis = {}
                ocromore_data['analysis_to_orig'] = analysis
                original_rest, complete_text = self.get_all_text(
                    ocromore_data, join_separated_lines=True)
                analysis['original_rest'] = original_rest
                analysis['original_length_initial'] = len(complete_text)

        # init toolbox: load the image belonging to the file, if there is one
        snippet = None
        if cfg.USE_SNIPPET:
            if "./" in cfg.IMGPATH:
                # the image path is given relative to the directory of the file
                ipath = os.path.dirname(
                    ocromore_data["file_info"].path) + cfg.IMGPATH[1:]
            else:
                ipath = os.path.normcase(cfg.IMGPATH)
            base_name = ocromore_data["file_info"].name.split(".")[0].replace(
                "_msa_best", "")
            results = glob.glob(ipath + base_name + "*", recursive=True)
            if results:
                snippet = Snippet()
                snippet.imread(results[0])
            else:
                cfg.USE_TOOLBBOX = False

        # one handler dict is shared by all segments of this file
        info_handler = {}
        for segmentation_class in segmentation_classes:
            # only segments which were actually recognized get parsed
            if not segmentation_class.is_start_segmented():
                continue
            segmentation_class.snippet = snippet
            segmentation_class.info_handler = info_handler
            self.trigger_mapped_function(segmentation_class.get_segment_tag(),
                                         segmentation_class, ocromore_data)

        # add and return result
        ocromore_data['results'] = self.ef
        return ocromore_data

    def trigger_mapped_function(self, segment_tag, segmentation_class,
                                ocromore_data):
        """
        Call the parsing function which is mapped to a segment.

        :param segment_tag: tag used to check whether a function is mapped at all
        :param segmentation_class: segment to parse; its ``segment_tag``
            attribute selects the result list and the function
        :param ocromore_data: file data the segment content is taken from
        :return: None (also when no function is mapped to ``segment_tag``)
        """
        if segment_tag not in self.function_map:
            return
        #todo: fileinfo -> parsing
        parsing_info = self.prepare_parsing_info(segmentation_class, ocromore_data)
        real_start_tag, content_texts, content_lines, feature_lines = parsing_info

        # switch the result factory to the list of this segment
        segment_tag = segmentation_class.segment_tag
        self.ef.set_current_main_list(segment_tag)

        # the mapped function fills the result factory
        parse_function = self.function_map[segment_tag]
        parse_function(real_start_tag, content_texts, content_lines,
                       feature_lines, segmentation_class)

    def prepare_parsing_info(self, segmentation_class, ocromore_data):
        """
        Fetch the content of one segment via ``DataHelper.get_content``.

        :param segmentation_class: segment the content is fetched for
        :param ocromore_data: file data providing ``'lines'`` and ``'line_features'``
        :return: tuple (real_start_tag, content_texts, content_lines, feature_lines)
        """
        content = DataHelper.get_content(ocromore_data['lines'],
                                         ocromore_data['line_features'],
                                         segmentation_class)
        real_start_tag, content_texts, content_lines, feature_lines = content
        return real_start_tag, content_texts, content_lines, feature_lines

    def get_all_text(self, ocromore_data, join_separated_lines=False):
        """
        Get the texts of all lines in ocromore_data, as a list and as one
        concatenated string (no separator between the lines).

        :param ocromore_data: data from which the text is extracted
        :param join_separated_lines: if True the list of texts is passed through
            ``dh.join_separated_lines`` and the string is built from its result
        :return: texts list, complete text
        """
        all_texts = [line['text'] for line in ocromore_data['lines']]
        complete_text = "".join(all_texts)

        if join_separated_lines:
            all_texts = dh.join_separated_lines(all_texts)
            complete_text = "".join(all_texts)

        return all_texts, complete_text

    def write_result_to_output(self, as_json, ocromore_data):
        """
        Write the collected results below the result root.

        :param as_json: only the value True triggers the JSON export, for
            anything else nothing is written
        :param ocromore_data: file data, passed on to ``dh.write_array_to_root``
        """
        if as_json is not True:
            return
        json_lines = self.ef.export_as_json().split("\n")
        dh.write_array_to_root("result_json/", json_lines,
                               ocromore_data, self.result_root)
class FeatureExtractor():
    def __init__(self):
        """Load the configuration and prepare the conditional printer and word filter."""
        self.config = ConfigurationHandler(first_init=False).get_config()
        cfg = self.config
        self.cpr = ConditionalPrint(cfg.PRINT_FEATURE_EXTRACTOR,
                                    cfg.PRINT_EXCEPTION_LEVEL,
                                    cfg.PRINT_WARNING_LEVEL,
                                    leading_tag=type(self).__name__)

        # first words which are checked for in extract_line_features
        self.filter_start_words = [
            "Fernruf:", "Vorstand:", "Fernschreiber:",
            "von", "Gründung:", "Ordnungsnr.", "Ordnungsnr",
            "Grundkapital:", "Umstellung",
        ]

    def extract_file_features(self, ocromore_data):
        """
        Extract the features of every line of a file.

        :param ocromore_data: file data providing ``'lines'``
        :return: the same ``ocromore_data`` with one feature entry per line
            stored under ``'line_features'``
        """
        ocromore_data['line_features'] = [
            self.extract_line_features(line) for line in ocromore_data['lines']
        ]
        return ocromore_data

    def extract_line_features(self, line):
        """
        Compute character, word and layout statistics for a single line.

        :param line: dict with the keys ``text``, ``words`` and ``line_index``;
            every word is a dict with ``word_index``, ``text`` and
            ``hocr_coordinates`` (index 0 is used as left, index 2 as right x)
        :return: a filled LineFeatures object, or False if the line text is empty
        """
        whole_text = line['text']

        self.cpr.print("recognizing text:", whole_text)

        # line-wide counters
        counter_special_chars = 0
        counter_alphanumerical_chars = 0
        counter_numbers = 0
        counter_chars = len(whole_text)
        counter_words = 0
        # per-word statistics, one entry per non-empty word
        counters_alphabetical_ratios = []
        counters_wordlengths = []
        counters_numbers = []

        # special conditions
        # NOTE(review): these four flags are computed but not stored on the
        # returned LineFeatures - confirm whether they should be exported.
        ultimo_is_first_word = False
        first_word_no_table_indicator = False
        starts_with_parenthesis = False
        ends_with_parenthesis = False

        last_xstop = 0
        x_box_sizes = []
        x_gaps = []
        words = line['words']
        # the last word is identified by the number of words, not of characters
        # (assumes word_index counts the words of the line from 0 - TODO confirm)
        last_word_index = len(words) - 1
        for word_obj in words:
            word_index = word_obj['word_index']
            word_text = word_obj['text']
            hocr_coordinates = word_obj['hocr_coordinates']

            word_xstart = hocr_coordinates[0]
            word_xstop = hocr_coordinates[2]
            x_box_sizes.append(word_xstop - word_xstart)

            if word_index >= 1:
                # NOTE(review): distance between the right edges of consecutive
                # boxes, not previous right edge -> current left edge - confirm.
                x_gaps.append(word_xstop - last_xstop)

            if word_text is None or word_text == "":
                continue

            if word_index == 0:
                if word_text in self.filter_start_words:
                    first_word_no_table_indicator = True
                if word_text.lower() == "ultimo":
                    ultimo_is_first_word = True
                if word_text[0] == "(":
                    starts_with_parenthesis = True

            if word_index == last_word_index and word_text[-1] == ")":
                ends_with_parenthesis = True

            counter_alphanumerical_chars_word = 0
            counter_numbers_word = 0
            counter_words += 1

            for char in word_text:
                if Random.is_special_character(char):
                    counter_special_chars += 1
                elif Random.is_alphanumerical_character(char):
                    counter_alphanumerical_chars += 1
                    counter_alphanumerical_chars_word += 1
                if char.isdigit():
                    counter_numbers += 1
                    counter_numbers_word += 1

            counter_alphabetical_word = counter_alphanumerical_chars_word - counter_numbers_word
            counters_alphabetical_ratios.append(
                np.round(counter_alphabetical_word / len(word_text), 2))
            counters_wordlengths.append(len(word_text))
            counters_numbers.append(counter_numbers_word)
            last_xstop = word_xstop

        # number of spaces = all characters minus the non-space characters
        counter_spaces = counter_chars - len(whole_text.replace(" ", ""))
        # alphabetical characters = alphanumerical characters without the digits
        counter_alphabetical = counter_alphanumerical_chars - counter_numbers

        if counter_chars == 0:
            self.cpr.printw("no chars in line:", str(line['line_index']),
                            "no features here")
            return False

        special_chars_ratio = counter_special_chars / counter_chars
        alphanumerical_chars_ratio = counter_alphanumerical_chars / counter_chars
        alphabetical_ratio = counter_alphabetical / counter_chars
        spaces_ratio = counter_spaces / counter_chars
        numbers_ratio = counter_numbers / counter_chars

        # the gap statistics stay None when no gap was measured
        maximum_x_gap = None
        mean_x_gap = None
        median_x_gap = None
        if len(x_gaps) >= 1:
            maximum_x_gap = max(x_gaps)
            mean_x_gap = np.mean(x_gaps)
            median_x_gap = np.median(x_gaps)

        many_numbers_in_first_word = False
        many_alphabetical_in_middle_words = False
        many_alphabetical_in_last_word = False

        # check some first, middle and last word conditions
        last_counter_index = len(counters_wordlengths) - 1
        for counter_index, word_length in enumerate(counters_wordlengths):
            if counter_index == 0:
                if np.round(counters_numbers[counter_index] / word_length, 2) > 0.8:
                    many_numbers_in_first_word = True
                continue
            # only longer, mostly alphabetical words count for the other positions
            if word_length < 4 or counters_alphabetical_ratios[counter_index] < 0.75:
                continue
            if counter_index == last_counter_index:
                many_alphabetical_in_last_word = True
            else:
                many_alphabetical_in_middle_words = True

        final_line_features = LineFeatures(cpr=self.cpr)

        final_line_features.counter_special_chars = counter_special_chars
        final_line_features.counter_chars = counter_chars
        final_line_features.counter_spaces = counter_spaces
        final_line_features.counter_numbers = counter_numbers
        final_line_features.counter_alphabetical = counter_alphabetical
        final_line_features.counter_alphanumerical_chars = counter_alphanumerical_chars
        final_line_features.counter_words = counter_words

        final_line_features.counters_numbers = counters_numbers
        final_line_features.counters_wordlengths = counters_wordlengths
        final_line_features.counters_alphabetical_ratios = counters_alphabetical_ratios

        final_line_features.numbers_ratio = numbers_ratio
        final_line_features.alphabetical_ratio = alphabetical_ratio
        final_line_features.alphanumerical_chars_ratio = alphanumerical_chars_ratio
        final_line_features.special_chars_ratio = special_chars_ratio
        final_line_features.spaces_ratio = spaces_ratio

        final_line_features.many_alphabetical_in_last_word = many_alphabetical_in_last_word
        final_line_features.many_alphabetical_in_middle_words = many_alphabetical_in_middle_words
        final_line_features.many_numbers_in_first_word = many_numbers_in_first_word
        final_line_features.x_box_sizes = x_box_sizes
        final_line_features.x_gaps = x_gaps

        final_line_features.maximum_x_gap = maximum_x_gap
        final_line_features.mean_x_gap = mean_x_gap
        final_line_features.median_x_gap = median_x_gap

        return final_line_features
Exemple #5
0
class VocabularyChecker():
    def __init__(self):
        """Set up configuration, conditional printing and the border regexes.

        The vocabulary (``dict_lines``) starts empty; ``max_edist``,
        ``suggenstion_verbosity`` and ``spellchecker`` stay None until
        ``initialize_spellchecker`` fills them in.
        """
        config_handler = ConfigurationHandler(first_init=False)

        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_VOCABULARY_CHECKER,
                                    self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL)
        self.dict_lines = []
        self.max_edist = None
        self.suggenstion_verbosity = None
        # Always define the attribute: initialize_spellchecker() only sets it
        # on success, and without this line it would not exist at all when the
        # initialization failed or was never called.
        self.spellchecker = None
        # characters treated as "borders" around a word
        self.special_chars_borders = "!¦1234567890,)(;.:\"-"

        # run of border characters at the very start of a text
        self.pattern_start = re.compile(r"^[" + self.special_chars_borders +
                                        "]+")
        # run of border characters at the very end of a text
        self.pattern_trail = re.compile(r"[" + self.special_chars_borders +
                                        "]+$")
        # a single dash at the end of a text
        self.pattern_trail_dash = re.compile(r"[-]$")
        # run of unaccented latin letters
        self.pattern_only_normal_chars = re.compile(r"[a-zA-Z]+")

    def _load_doc(self, filename):
        # open the file as read only
        file = open(filename, 'r')
        # read all text
        texts = file.readlines()
        # close the file
        file.close()
        return texts

    def without_special_chars(self, input_text):
        len_text = len(input_text)
        input_text_wo_sc = self.pattern_only_normal_chars.findall(input_text)
        if len(input_text_wo_sc) >= 1:
            len_text_wo_sc = len(input_text_wo_sc[0])
            ratio = len_text_wo_sc / len_text
            return input_text_wo_sc[0], ratio
        else:
            # there are only special characters
            return input_text, 0

    def get_accumulated_confidence_rate(self, word, word_acc_confs,
                                        wildcard_char):

        word_reduced, word_starting_borders, word_trailing_borders, change = self.remove_and_give_borders(
            word)
        wsplit = list(word)

        if change == False:
            acc_conf = 0
            for i in range(0, len(wsplit)):
                acc_conf += word_acc_confs[i]

            return acc_conf, acc_conf / len(
                wsplit
            ), False, word_starting_borders, word_trailing_borders, word
        else:
            acc_conf = 0

            len_start = len(word_starting_borders)
            len_trail = len(word_trailing_borders)
            for i in range(len_start, len(wsplit) - len_trail):
                acc_conf += word_acc_confs[i]

            return acc_conf, acc_conf / (
                len(wsplit) - len_start - len_trail
            ), True, word_starting_borders, word_trailing_borders, word_reduced

    def remove_and_give_borders(self, input_text):

        start_sc_text = ""
        stop_sc_text = ""

        if len(input_text) > 2:

            start_special_chars = self.pattern_start.findall(input_text)
            stop_special_chars = self.pattern_trail.findall(input_text)
            if len(start_special_chars) >= 1:
                start_sc_text = start_special_chars[0]
            if len(stop_special_chars) >= 1:
                stop_sc_text = stop_special_chars[0]

            if start_special_chars == None and stop_special_chars == None:
                return input_text, start_sc_text, stop_sc_text, False
            else:
                input_text_stripped = input_text.strip(
                    self.special_chars_borders)
                return input_text_stripped, start_sc_text, stop_sc_text, True
        else:
            return input_text, start_sc_text, stop_sc_text, False

    def word_trails_with_dash(self, input_text):
        trail_dash_res = self.pattern_trail_dash.findall(input_text)
        if len(trail_dash_res) >= 1:
            return True
        else:
            return False

    def initialize_lines(self, dict_file_path, remove_special_border_chars):
        # add the lines from a dictionary path to dict_lines
        doc = self._load_doc(dict_file_path)
        lines_doc = self._get_lines(doc, remove_special_border_chars)
        self.dict_lines.extend(lines_doc)

    def _get_lines(self, doc, remove_special_border_chars):
        lines_doc = []
        for line in doc:
            if "--------------" in line:
                continue

            line = line.replace('\n', "")

            if remove_special_border_chars:
                # print("lbef",line)
                line = line.strip(self.special_chars_borders)

            # print("laft",line)

            linelen = len(line)
            if linelen > 2:
                if linelen < self.config.KEYING_RESULT_VC_MIN_VOCAB_WORD_LENGTH:
                    continue  # filter out lengths which are shorter than minimum

                if self.config.KEYING_RESULT_VC_DOWNCAST_ALL_CASES:
                    line_low = line.lower()
                    if line_low != line:
                        lines_doc.append(line_low)

                lines_doc.append(line)

        return lines_doc

    def initialize_spellchecker(self):
        try:
            from pysymspell.symspell import SymSpell
            if self.dict_lines == None:
                self.cpr.printw(
                    "can't initialize spellchecker, please first call initialize_lines"
                )
                return

            # set paramters
            self.max_edist = self.config.KEYING_RESULT_VC_EDIT_DISTANCE_LEVEL
            self.suggenstion_verbosity = SymSpell.Verbosity.CLOSEST

            # initialize symspell as spellchecker
            sym_spell = SymSpell(self.max_edist)

            # load dictionary to spellchecker
            sym_spell.create_dictionary_by_list(self.dict_lines)
            self.spellchecker = sym_spell
        except:
            print(
                "To use the vocabulary checker you must pull PySymSpell from GitHub in the directory (AWARE: MIT License)"
                "by activate and initalize the submodule (delete the comment symbol: #):\n"
                ".gitmodule at line: 1-3")

    def correct_text_at_certain_indices_only(self, input_text,
                                             possible_error_indices):
        """Pick the suggestion that differs most at the suspicious positions.

        All suggestions for ``input_text`` are looked up. For each suggestion
        the characters it shares with the input are crossed out pairwise
        (regardless of position); the suspicious positions whose character was
        not crossed out are counted. The first suggestion with the highest
        count wins.

        :param input_text: text to correct
        :param possible_error_indices: indices into ``input_text`` that may
                                       hold a wrong character
        :return: the top suggestion when it equals ``input_text``; the winning
                 suggestion's term (first letter upper-cased when
                 ``correct_text`` reports ``first_letter_high``) when its
                 count is above zero; otherwise None
        """
        # Imported locally, as initialize_spellchecker does, so the name is
        # bound here regardless of how the module itself imports PySymSpell.
        from pysymspell.symspell import SymSpell

        replacement_char = "‖"
        return_term, suggestions, first_letter_high = self.correct_text(
            input_text, suggestion_verbosity=SymSpell.Verbosity.ALL)

        if input_text == return_term:
            return return_term

        input_text_array = list(input_text)

        # one entry per suggestion: number of suspicious positions it changes
        suggestion_number_error_correction_count = []

        for suggestion in suggestions:
            input_text_array_c = input_text_array[:]  # working copy
            sug_array = list(suggestion.term)

            # cross out every input character that has a not yet used
            # counterpart in the suggestion
            for char_index_it in range(len(input_text_array_c)):
                for char_index_sug in range(len(sug_array)):
                    if input_text_array_c[char_index_it] == sug_array[
                            char_index_sug]:
                        input_text_array_c[char_index_it] = replacement_char
                        sug_array[char_index_sug] = replacement_char
                        # once crossed out, further comparisons for this
                        # character can only hit crossed-out entries
                        break

            # a suspicious position that was not crossed out holds a character
            # the suggestion does not contain, i.e. the suggestion changes it
            number_of_possible_errors_corrected = 0
            for index in possible_error_indices:
                if input_text_array_c[index] == input_text_array[index]:
                    number_of_possible_errors_corrected += 1

            suggestion_number_error_correction_count.append(
                number_of_possible_errors_corrected)

        if len(suggestion_number_error_correction_count) <= 0:
            return None

        best_suggestion_index = np.argmax(
            suggestion_number_error_correction_count)
        best_suggestion_ecccount = suggestion_number_error_correction_count[
            best_suggestion_index]
        if best_suggestion_ecccount <= 0:
            return None

        best_suggestion_value = suggestions[best_suggestion_index].term
        if first_letter_high:
            best_suggestion_value = best_suggestion_value[0].upper(
            ) + best_suggestion_value[1:]
        return best_suggestion_value

    def correct_text(self, input_text, suggestion_verbosity=None):

        first_letter_high = False
        if self.config.KEYING_RESULT_VC_DOWNCAST_ALL_CASES:
            first_letter = input_text[0]
            first_letter_high = first_letter.islower() == False
        #    input_text = input_text.lower()

        suggestion_verbosity_used = self.suggenstion_verbosity
        if suggestion_verbosity != None:
            suggestion_verbosity_used = suggestion_verbosity

        suggestions = self.spellchecker.lookup(input_text,
                                               suggestion_verbosity_used,
                                               self.max_edist)

        if len(suggestions) >= 1:
            return_term = suggestions[0]._term
            if self.config.KEYING_RESULT_VC_DOWNCAST_ALL_CASES and first_letter_high:
                return_term = return_term[0].upper() + return_term[1:]

            return return_term, suggestions, first_letter_high
        else:
            return None, suggestions, first_letter_high