Exemple #1
0
class VisualizationHandler(object):
    """Launches external diff viewers (PyCharm, Meld) to compare two files."""

    def __init__(self):
        """Store the lower-cased platform name and build the configured printer."""
        self.os = os.name.lower()
        self.config = ConfigurationHandler(first_init=False).get_config()
        settings = self.config
        self.cpr = ConditionalPrint(settings.PRINT_MSA_HANDLER,
                                    settings.PRINT_EXCEPTION_LEVEL,
                                    settings.PRINT_WARNING_LEVEL)

    def _launch(self, command, failure_message):
        """
        Start an external command on supported platforms.
        :param command: argument list handed to Popen
        :param failure_message: text printed together with the exception if the start fails
        :return: the Popen process, or None if the platform is unsupported or the start failed
        """
        if self.os not in ('linux', 'posix'):
            self.cpr.printex("Write code here for other os, or take other os")
            return None

        try:
            return Popen(command)
        except Exception as ex:
            self.cpr.printex(failure_message, ex)
            return None

    def show_file_comparison_pycharm(self, filepath_1, filepath_2):
        """
        Open both files in the PyCharm diff view ("charm diff").
        :param filepath_1: first file to compare
        :param filepath_2: second file to compare
        :return: the started process or None
        """
        return self._launch(["charm", "diff", filepath_1, filepath_2],
                            "Exception calling pycharm")

    def show_file_comparison_meld(self,
                                  filepath_1,
                                  filepath_2,
                                  just_add_tab=False):
        """
        Open both files in Meld.
        :param filepath_1: first file to compare
        :param filepath_2: second file to compare
        :param just_add_tab: pass "--newtab" to meld instead of a plain call
        :return: the started process or None
        """
        command = ["meld"]
        if just_add_tab:
            command.append("--newtab")
        command.extend([filepath_1, filepath_2])
        return self._launch(command, "Exception calling meld")
Exemple #2
0
class OCRVoter(object):
    def __init__(self):
        """Load the configuration, create the printers and reset the voting state."""
        self.config = ConfigurationHandler(first_init=False).get_config()
        cfg = self.config

        # one printer per topic, all sharing the same exception/warning levels
        self.cpr = ConditionalPrint(
            cfg.PRINT_MSA_HANDLER,
            cfg.PRINT_EXCEPTION_LEVEL, cfg.PRINT_WARNING_LEVEL)
        self.cpr_vocab_check = ConditionalPrint(
            cfg.PRINT_VOCABULARY_CHECKER,
            cfg.PRINT_EXCEPTION_LEVEL, cfg.PRINT_WARNING_LEVEL)
        self.cpr_sc_predict = ConditionalPrint(
            cfg.PRINT_SPECIALCHAR_PREDICTOR,
            cfg.PRINT_EXCEPTION_LEVEL, cfg.PRINT_WARNING_LEVEL)

        # buffer of the most recently voted characters (read by the predictor code)
        self.filo_last_chars = Filo(250)

        # optional collaborators, injected through add_predictor / add_vocab_checker
        self.predictor = None
        self.vocab_checker = None

        # state flags toggled while voting
        self.use_aufsichtsrat_prediction = False
        self.previous_word_with_seperator = False

    def add_predictor(self, predictor):
        self.predictor = predictor

    def add_vocab_checker(self, vocab_checker):
        self.vocab_checker = vocab_checker

    def get_same_count(self, c1, c2, c3):
        same_ctr = 0
        if c1 == c2:
            same_ctr += 1

        if c1 == c3:
            same_ctr += 1

        return same_ctr

    def get_confidence_count(self,
                             char1,
                             char2,
                             char3,
                             cconf1,
                             cconf2,
                             cconf3,
                             wildcard_char='¦'):
        def get_other_char(char_first, char_sec, char_thrd, co1, co2, co3):
            if char_first != char_sec:
                return char_sec, float(co2)
            elif char_first != char_thrd:
                return char_thrd, float(co3)

        same_ctr = 0
        cconf_ctr = float(cconf1)

        if char1 == char2:
            same_ctr += 1
            cconf_ctr += float(cconf2)
        if char1 == char3:
            same_ctr += 1
            cconf_ctr += float(cconf3)

        # special cases space: ' ', ' ', 'x'
        # wildcard character : '¦', '¦', '¦'

        if char1 == ' ' and same_ctr == 1:
            # if the confidence of the other character is below that value, space gets the high put in confidence value
            return 1, 95.0  #todo j4t

            SPACE_TRESH = 50.0
            SPACE_PUT_IN_VALUE = 99.0
            otherchar, otherconf = get_other_char(char1, char2, char3, cconf1,
                                                  cconf2, cconf3)
            #print("otherchar",otherchar,"otherconf",otherconf)
            if otherconf < SPACE_TRESH:
                return 1, SPACE_PUT_IN_VALUE

        elif char1 == wildcard_char and same_ctr == 1:  #todo: differentiate type of character ??
            # if there is two wildcards and one characters, characters confidence has to be higher than
            # WILDCARD_TRESH to be taken

            wildcard_tresh = 98.5
            if self.config.MSA_BEST_CHANGE_VOTING_TRESHS_ON_EMPTY_LINE:
                wildcard_tresh -= 10  # 0:99,19%, 20:99.16%, 10:99.27%

            return 1, wildcard_tresh

        elif char1 == wildcard_char and same_ctr == 0:
            pass  # todo maybe cover this case (cause wildcard has no confidence i.e if the two otherchars are very low prob, take wildcard)
        elif char1 == '' and same_ctr == 0:
            pass  # todo maybe cover this case (cause space has no confidence ...
        elif self.config.MSA_BEST_VOTING_DOWNSCALE_ONLY_SC \
            and Random.is_special_character(char1) and same_ctr == 0 \
            and char2 == wildcard_char and char3 == wildcard_char:
            # lower the confidence of special characters which stand without any other chars
            return same_ctr, cconf_ctr * 0.9

        return same_ctr, cconf_ctr

    def vote_best_of_three_simple(self,
                                  text_1,
                                  text_2,
                                  text_3,
                                  index_best,
                                  wildcard_character='¦'):
        list_line_1 = list(text_1)
        list_line_2 = list(text_2)
        list_line_3 = list(text_3)

        accumulated_chars = ""
        accumulated_confs = Filo
        for character_index, character_1 in enumerate(list_line_1):
            character_2 = list_line_2[character_index]
            character_3 = list_line_3[character_index]

            clist = [character_1, character_2, character_3]
            # get the character which occurs the most
            sc1 = self.get_same_count(character_1, character_2, character_3)
            sc2 = self.get_same_count(character_2, character_1, character_3)
            sc3 = self.get_same_count(character_3, character_2, character_1)
            maxindices = np.argmax([sc2, sc1, sc3])
            if maxindices == 0:
                accumulated_chars += character_2
            elif maxindices == 1:
                accumulated_chars += character_1
            else:
                accumulated_chars += character_3

        accumulated_chars_stripped = accumulated_chars.replace(
            wildcard_character, '')

        return accumulated_chars, accumulated_chars_stripped

    def vote_best_of_three_charconfs(self,
                                     line_1,
                                     line_2,
                                     line_3,
                                     index_best,
                                     wildcard_character='¦'):
        try:

            def try_obtain_charconf(value, undef_value=0):
                if value is None or value is False or value is True:
                    return undef_value
                return value

            def try_obtain_char(charlist, index):
                if index >= len(charlist):
                    return False  #j4t means not defined
                else:
                    return charlist[index]

            key_confs_mapping = 'UID'
            key_confs = 'x_confs'
            key_char = 'calc_char'
            self.cpr.print("vote_text1", line_1.textstr)
            self.cpr.print("vote_text2", line_2.textstr)
            self.cpr.print("vote_text3", line_3.textstr)
            #if "¦¦lt.H" in line_1.textstr:
            #    self.cpr.print("asd")

            maximum_char_number = max(len(line_1.textstr), len(line_2.textstr),
                                      len(line_3.textstr))

            accumulated_chars = ""

            for character_index in range(
                    0, maximum_char_number
            ):  # check: is list 1 always best reference?

                character_1 = line_1.value(key_char, character_index)
                character_2 = line_2.value(key_char, character_index)
                character_3 = line_3.value(key_char, character_index)

                charconf_1 = try_obtain_charconf(
                    line_1.value(key_confs, character_index, wsval=50.0))
                charconf_2 = try_obtain_charconf(
                    line_2.value(key_confs, character_index, wsval=50.0))
                charconf_3 = try_obtain_charconf(
                    line_3.value(key_confs, character_index, wsval=50.0))

                clist = [character_1, character_2, character_3]
                # get the character which occurs the most
                sc1, acc_conf_1 = self.get_confidence_count(
                    character_1, character_2, character_3, charconf_1,
                    charconf_2, charconf_3)
                sc2, acc_conf_2 = self.get_confidence_count(
                    character_2, character_1, character_3, charconf_2,
                    charconf_1, charconf_3)
                sc3, acc_conf_3 = self.get_confidence_count(
                    character_3, character_2, character_1, charconf_3,
                    charconf_2, charconf_1)
                maxindices = np.argmax([
                    acc_conf_2, acc_conf_1, acc_conf_3
                ])  # this takes in priorisation in case the chars are same
                #todo:import to config
                if character_index == maximum_char_number - 1 and character_2 == "¦" and character_3 == "¦" and character_1 == "I":
                    continue

                if self.config.MSA_BEST_VOTER_DROP_CHARS_BELOW_TRESH == True:
                    tresh = self.config.MSA_BEST_VOTER_DROPPING_TRESH
                    maximum_conf = max(acc_conf_1, acc_conf_2, acc_conf_3)
                    if maximum_conf < tresh:
                        if [character_2, character_1, character_3
                            ][maxindices] != '¦':
                            continue

                if maxindices == 0:
                    accumulated_chars += character_2
                elif maxindices == 1:
                    accumulated_chars += character_1
                else:
                    accumulated_chars += character_3

            accumulated_chars_stripped = accumulated_chars.replace(
                wildcard_character, '')

            return accumulated_chars, accumulated_chars_stripped
        except Exception as ex:
            tr = inspect.trace()

            self.cpr.printex("ocr_voter.py Exception during confidence vote:",
                             ex)
            self.cpr.printex("trace is:", tr)

    def increase_umlaut_confidence(self, chars, charconfs):

        charconfs_adapted = []

        for char_index, char in enumerate(chars):
            if char in SpecialChars.umlauts_caps or char in SpecialChars.umlauts:
                cconf_to_add = charconfs[
                    char_index] + SpecialChars.umlaut_increment
            elif char in SpecialChars.special_chars:
                cconf_to_add = charconfs[
                    char_index] + SpecialChars.special_char_increment
            else:
                cconf_to_add = charconfs[char_index]

            charconfs_adapted.append(cconf_to_add)

        return charconfs_adapted

    def vote_best_of_three_charconfs_searchspaces(self,
                                                  line_1,
                                                  line_2,
                                                  line_3,
                                                  index_best,
                                                  wildcard_character='¦'):
        """
        Vote character by character over three lines by accumulated confidence,
        after passing every column of characters and confidences through a
        sliding search space whose middle window may be rewritten by the
        SearchSpaceProcessor.

        After the vote the optional predictor may replace the voted character,
        and vocabulary_related_corrections is applied to the whole voted text.

        :param line_1: line object providing textstr, name and value(key, index, wsval=...)
        :param line_2: second line object
        :param line_3: third line object
        :param index_best: not used in this method
        :param wildcard_character: gap character of the alignment
        :return: tuple (voted text, voted text without wildcard characters); if an
                 exception occurs it is printed and the method returns None implicitly
        """
        try:

            # NOTE(review): key_confs_mapping is never read in this method
            key_confs_mapping = 'UID'
            key_confs = 'x_confs'
            key_char = 'calc_char'
            self.cpr.print("vote_text1", line_1.textstr)
            self.cpr.print("vote_text2", line_2.textstr)
            self.cpr.print("vote_text3", line_3.textstr)

            maximum_char_number = max(len(line_1.textstr), len(line_2.textstr),
                                      len(line_3.textstr))

            accumulated_chars = ""
            accumulated_confs = Filo(300)

            # search space settings
            # NOTE(review): SEARCH_RANGE is never read in this method
            SEARCH_SPACE_Y_SIZE = 3
            SEARCH_SPACE_X_SIZE_OUTER = 7
            SEARCH_SPACE_X_SIZE_INNER = 3
            SEARCH_SPACE_X_SEARCH_RANGE = 1
            SEARCH_SPACE_PROCESSING_SUBSTITUTION_CHAR = '¦'
            SEARCH_SPACE_PROCESSING_USE_SIMILAR_CHARS = True
            SEARCH_RANGE = 1
            PRINT_MATRICES = self.config.PRINT_SEARCH_SPACE_MATRICES

            # initialize search space processor and search spaces
            search_space_processor = SearchSpaceProcessor(SEARCH_SPACE_Y_SIZE, SEARCH_SPACE_X_SIZE_INNER, \
                                                          wildcard_character, SEARCH_SPACE_PROCESSING_SUBSTITUTION_CHAR)

            # one search space for the characters, one with the same geometry for their confidences
            ssp_chars = SearchSpace(SEARCH_SPACE_Y_SIZE,
                                    SEARCH_SPACE_X_SIZE_OUTER,
                                    SEARCH_SPACE_X_SEARCH_RANGE, True)
            ssp_confs = SearchSpace(SEARCH_SPACE_Y_SIZE,
                                    SEARCH_SPACE_X_SIZE_OUTER,
                                    SEARCH_SPACE_X_SEARCH_RANGE, True)

            # check if one of the lines is empty for certain settings
            one_line_empty = False
            if self.config.MSA_BEST_VOTER_PUSH_LESS_LINES_WHITESPACE_CONFS or \
                self.config.MSA_BEST_CHANGE_VOTING_TRESHS_ON_EMPTY_LINE:
                one_line_empty = self.check_if_one_line_empty(
                    [line_1, line_2, line_3], wildcard_character)

            # loop through the maximum character range of the lines; the range is
            # extended so that padding columns are pushed after the last real column
            # (presumably to flush the remaining columns out of the search space - TODO confirm)
            range_extension = SEARCH_SPACE_X_SIZE_INNER
            for character_index in range(
                    0, maximum_char_number + range_extension +
                    2):  # check: is list 1 always best reference?

                if character_index < maximum_char_number:
                    # if there is a character within range (no padding char from extension)
                    # get character values and obtain corresponding confidences (from searchspace because they might
                    # be different to normal values because of swapping)
                    line_vals = [line_1.value(key_char, character_index), line_2.value(key_char, character_index), \
                                 line_3.value(key_char, character_index)]

                    line_1_conf = line_1.value(key_confs,
                                               character_index,
                                               wsval=50.0)
                    line_2_conf = line_2.value(key_confs,
                                               character_index,
                                               wsval=50.0)
                    line_3_conf = line_3.value(key_confs,
                                               character_index,
                                               wsval=50.0)

                    # normalize the raw confidences; the first element of line.name is
                    # passed as engine key
                    charconf_1 = self.try_obtain_charconf_searchspace(
                        line_1_conf,
                        line_vals[0],
                        engine_key=line_1.name[0],
                        one_line_empty=one_line_empty)
                    charconf_2 = self.try_obtain_charconf_searchspace(
                        line_2_conf,
                        line_vals[1],
                        engine_key=line_2.name[0],
                        one_line_empty=one_line_empty)
                    charconf_3 = self.try_obtain_charconf_searchspace(
                        line_3_conf,
                        line_vals[2],
                        engine_key=line_3.name[0],
                        one_line_empty=one_line_empty)
                    charconf_vals = [charconf_1, charconf_2, charconf_3]
                else:
                    # if the character is within padding range just give none values for characters and confidences
                    line_vals = [None, None, None]
                    charconf_vals = [None, None, None]

                # fill searchspace with the chars and confidences
                ssp_chars.push_column(line_vals)
                ssp_confs.push_column(charconf_vals)

                # update the mid-window of the search space (this is the actual search space processing step)
                mid_chars = ssp_chars.get_middle_matrix(PRINT_MATRICES)
                mid_confs = ssp_confs.get_middle_matrix(PRINT_MATRICES)
                mid_chars_processed, mid_confs_processed, change_done = \
                    search_space_processor.process_search_space(mid_chars, mid_confs,SEARCH_SPACE_PROCESSING_USE_SIMILAR_CHARS)
                if change_done is True:
                    ssp_chars.update_middle_matrix(mid_chars_processed)
                    ssp_confs.update_middle_matrix(mid_confs_processed)

                # extract changed values from search space; the column is read at a
                # negative offset relative to the middle
                character_offset = -(SEARCH_SPACE_X_SEARCH_RANGE + 1)
                character_1 = ssp_chars.get_value_around_middle(
                    0, character_offset)
                character_2 = ssp_chars.get_value_around_middle(
                    1, character_offset)
                character_3 = ssp_chars.get_value_around_middle(
                    2, character_offset)
                charconf_1 = ssp_confs.get_value_around_middle(
                    0, character_offset)
                charconf_2 = ssp_confs.get_value_around_middle(
                    1, character_offset)
                charconf_3 = ssp_confs.get_value_around_middle(
                    2, character_offset)
                # skip columns for which the search space delivers no character
                if character_1 is None or character_2 is None or character_3 is None:
                    continue

                # in case umlaut confidence increment is active change charconfs otherwise same charconfs
                charconf_1, charconf_2, charconf_3 = self.increase_umlaut_confidence_searchspace(
                    character_1, character_2, character_3, charconf_1,
                    charconf_2, charconf_3)

                # get the previous characters from other lines as string (mainly for predictor)
                filo_content = self.filo_last_chars.get_content_as_string()

                # trigger predicted section for aufsichtsrat predictor
                self.toggle_predictor(filo_content)

                # predict_char if predictor is enabled
                predicted_char = self.predict_char(filo_content)

                # get the character which occurs the most by accumulating confidence scores
                # (the counts sc1..sc3 are not used afterwards)
                # NOTE(review): wildcard_character is not forwarded here, so
                # get_confidence_count works with its default wildcard
                sc1, acc_conf_1 = self.get_confidence_count(
                    character_1, character_2, character_3, charconf_1,
                    charconf_2, charconf_3)
                sc2, acc_conf_2 = self.get_confidence_count(
                    character_2, character_1, character_3, charconf_2,
                    charconf_1, charconf_3)
                sc3, acc_conf_3 = self.get_confidence_count(
                    character_3, character_2, character_1, charconf_3,
                    charconf_2, charconf_1)
                maxindices = np.argmax([
                    acc_conf_2, acc_conf_1, acc_conf_3
                ])  # this takes in priorisation in case the chars are same

                # in the last loop iteration an 'I' standing against two wildcards is dropped
                # NOTE(review): compares against the literal '¦' instead of wildcard_character
                if character_index == maximum_char_number + range_extension + 1 and character_2 == "¦" and character_3 == "¦" and character_1 == "I":
                    continue

                # drop chars completely if they fall below a certain dropping treshhold and the setting is active
                # NOTE(review): the winner is compared against the literal '¦' instead of wildcard_character
                if self.config.MSA_BEST_VOTER_DROP_CHARS_BELOW_TRESH == True:
                    tresh = self.config.MSA_BEST_VOTER_DROPPING_TRESH
                    maximum_conf = max(acc_conf_1, acc_conf_2, acc_conf_3)
                    if maximum_conf < tresh:
                        if [character_2, character_1, character_3
                            ][maxindices] != '¦':
                            continue

                # determine character with the best accumulated confidence
                # (index order matches the argmax list above: 0 -> line 2, 1 -> line 1, 2 -> line 3)
                voted_char = None
                voted_acc_conf = None
                if maxindices == 0:
                    voted_char = character_2
                    voted_acc_conf = acc_conf_2
                elif maxindices == 1:
                    voted_char = character_1
                    voted_acc_conf = acc_conf_1
                else:
                    voted_char = character_3
                    voted_acc_conf = acc_conf_3

                # if predictor is active, check if there is a better char predicted which can replace  voted character
                voted_char = self.maybe_replace_voted_by_predicted_char(
                    voted_char, self.use_aufsichtsrat_prediction,
                    predicted_char, wildcard_character, voted_acc_conf,
                    character_1, character_2, character_3)
                # push the voted char and the accumulated confidence of this char to results
                accumulated_confs.push(voted_acc_conf)
                accumulated_chars += voted_char

                # if the predictor is enabled fill the filo with the voted_char
                self.fill_filo_last_chars(voted_char)

            # do vocabulary related steps, if activated
            accumulated_chars = self.vocabulary_related_corrections(
                accumulated_chars, wildcard_character, accumulated_confs)

            # remove the wilcard characters and return result
            accumulated_chars_stripped = accumulated_chars.replace(
                wildcard_character, '')
            return accumulated_chars, accumulated_chars_stripped

        except Exception as ex:
            # the exception is only printed; the method then ends without a return value
            tr = inspect.trace()

            self.cpr.printex("ocr_voter.py Exception during confidence vote",
                             ex)
            self.cpr.printex("trace", tr)

    def vocabulary_related_corrections(self, accumulated_chars,
                                       wildcard_character, accumulated_confs):
        """
        Correct the words of the voted text with the vocabulary checker, if
        KEYING_RESULT_VOCABULARY_CORRECTION_VOTE is active; otherwise the text is
        returned unchanged.

        The text is split at whitespace and rebuilt word by word, every word
        followed by a single space (so the rebuilt text ends with a space and
        whitespace runs of the input are not kept).

        :param accumulated_chars: voted text, may contain wildcard characters
        :param wildcard_character: gap character of the alignment
        :param accumulated_confs: container with the accumulated confidences of the
               voted characters; read through pop_multi(number_of_chars)
        :return: the corrected text, or accumulated_chars itself if the feature is off
        """

        if self.config.KEYING_RESULT_VOCABULARY_CORRECTION_VOTE:
            accumulated_chars_final = ""
            acc_split = accumulated_chars.split()
            len_split = len(acc_split)

            for word_index, word in enumerate(acc_split):

                if self.config.KEYING_RESULT_VC_IGNORE_SEPERATE_WRITING_CORRECTION:
                    # a last word ending with '-' is taken over uncorrected and remembered,
                    # so that the first word of the next call is taken over uncorrected as well
                    # NOTE(review): both 'continue' paths skip pop_multi, so the confidences
                    # of these words are not consumed - verify this is intended
                    if word_index == len_split - 1 and word.replace(
                            wildcard_character, "").endswith('-'):
                        self.previous_word_with_seperator = True
                        accumulated_chars_final += word + " "
                        continue
                    if word_index == 0:
                        if self.previous_word_with_seperator is True:
                            self.previous_word_with_seperator = False
                            accumulated_chars_final += word + " "
                            continue

                # fetch one confidence per character of the word and let the vocab checker rate the word
                acc_confs_word = accumulated_confs.pop_multi(len(word))
                acc_conf, rate, change, word_starting_borders, word_trailing_borders, word_reduced = \
                    self.vocab_checker.get_accumulated_confidence_rate(word, acc_confs_word, wildcard_character)
                self.cpr_vocab_check.print("w:", word, "wr:", word_reduced,
                                           "accr:", acc_conf, "rate", rate)

                # don't correct words below min vocab length ( mind that special chars in dict are toggled)
                check_len = len(word)
                if self.config.KEYING_RESULT_VC_DICT_REMOVE_SPECIAL_BORDER_CHARS:
                    check_len = len(word_reduced)
                if check_len < self.config.KEYING_RESULT_VC_MIN_VOCAB_WORD_LENGTH:
                    accumulated_chars_final += word + " "
                    continue

                # mode 1: only characters with a low accumulated confidence may be swapped
                if self.config.KEYING_RESULT_VC_CORRECT_ONLY_ERRONOUS_CHARS:
                    swappable_char_indices = []

                    acc_confs_used = None
                    word_used = None

                    if self.config.KEYING_RESULT_VC_CORRECT_ERRONOUS_SPECIAL_CHARS:
                        # use the full length confidences array including trailing and leading special characters
                        acc_confs_used = acc_confs_word
                        word_used = word
                    else:
                        # don't use trailing and starting special characters if no special chars needed
                        acc_confs_used = acc_confs_word[
                            len(word_starting_borders):(
                                len(acc_confs_word) -
                                len(word_trailing_borders))]
                        word_used = word_reduced

                    # collect the indices of the characters which may be swapped
                    for conf_index, conf in enumerate(acc_confs_used):
                        if self.config.KEYING_RESULT_VC_CORRECT_ERRONOUS_SPECIAL_CHARS:
                            # special characters (except the wildcard) with a confidence of at most 250
                            if conf <= 250:
                                character_related = word_used[conf_index]
                                is_special_char = Random.is_special_character(
                                    character_related)
                                if is_special_char and character_related != wildcard_character:
                                    # only swap special character indices
                                    swappable_char_indices.append(conf_index)
                        else:
                            # any character with a confidence of at most 215
                            if conf <= 215:
                                swappable_char_indices.append(conf_index)

                    if len(swappable_char_indices) >= 1:
                        word_reduced_correct = self.vocab_checker.correct_text_at_certain_indices_only(
                            word_used, swappable_char_indices)
                        if word_reduced_correct != None:
                            word_correct_withtrails = None

                            if self.config.KEYING_RESULT_VC_CORRECT_ERRONOUS_SPECIAL_CHARS:
                                if Random.has_special_character(
                                        word_reduced_correct):
                                    # if special character was replaced with special character
                                    word_correct_withtrails = word_reduced_correct
                                else:
                                    # if special character was replaced by alphanumerical character
                                    # the original word is kept
                                    word_correct_withtrails = word
                            else:
                                # re-attach the border characters which were cut off before the correction
                                word_correct_withtrails = word_starting_borders + word_reduced_correct + word_trailing_borders

                            # only print the changed results
                            if word != word_correct_withtrails:
                                self.cpr_vocab_check.print(
                                    "w:", word, "wc:", word_correct_withtrails,
                                    "accr:", acc_conf, "rate", rate)

                            accumulated_chars_final += word_correct_withtrails + " "
                        else:
                            accumulated_chars_final += word + " "
                    else:
                        accumulated_chars_final += word + " "

                    # mode 1 is done with this word, mode 2 below is skipped
                    continue

                # mode 2: correct the whole reduced word if its rate is below the configured threshold
                if rate < self.config.KEYING_RESULT_VOCABULARY_CORRECTION_VOTE_TRESH \
                        and len(word_reduced) > 2:
                    # if the rate drops below tresh, try to fetch vocab entry
                    word_reduced_correct, suggestions, flh = self.vocab_checker.correct_text(
                        word_reduced)
                    if word_reduced_correct != None and word_reduced_correct != word_reduced:

                        word_correct_withtrails = word_starting_borders + word_reduced_correct + word_trailing_borders

                        self.cpr_vocab_check.print("w:", word, "wc:",
                                                   word_correct_withtrails,
                                                   "accr:", acc_conf, "rate",
                                                   rate)

                        accumulated_chars_final += word_correct_withtrails + " "
                    else:
                        accumulated_chars_final += word + " "
                else:
                    accumulated_chars_final += word + " "

            accumulated_chars = accumulated_chars_final

        return accumulated_chars

    def try_obtain_charconf_searchspace(
        self,
        value_confidence,
        value,
        undef_value=0,
        engine_key=None,
        one_line_empty=False,
    ):
        if value_confidence is None or value_confidence is False or value_confidence is True:
            return undef_value

        returnvalue = value_confidence

        if self.config.MSA_BEST_VOTER_SCALE_ENGINE_CONFIDENCES and engine_key is not None:
            if engine_key == 'Abbyy':
                if self.config.MSA_BEST_INCREASE_CONFIDENCE_OF_SOME_ABBYY_CHARS:
                    if value == "%":  # improve ocropus in confidence of % because it was trained
                        value_confidence = value_confidence + 80

                returnvalue = ConfidenceModifications.abby_factor * value_confidence
            elif engine_key == 'Tess':
                returnvalue = ConfidenceModifications.tesseract_factor * value_confidence

            elif engine_key == 'Ocro':

                returnvalue = ConfidenceModifications.ocropus_factor * value_confidence

        if (self.config.MSA_BEST_VOTER_PUSH_LESS_LINES_WHITESPACE_CONFS and one_line_empty and value == " ") \
            or (self.config.MSA_BEST_VOTER_PUSH_WHITESPACE_IF_MOSTLY_WILDCARD and one_line_empty \
                and value == " "):

            returnvalue += ConfidenceModifications.whitespace_push

        return returnvalue

    def check_if_one_line_empty(self, lines, wildcard_character):
        for line in lines:
            text_wo_wildcards = line.textstr.replace(wildcard_character, '')
            if text_wo_wildcards == "":
                return True
            if self.config.MSA_BEST_VOTER_PUSH_WHITESPACE_IF_MOSTLY_WILDCARD:
                # also count in high whitecard ratios as empty line
                wildcard_ratio = 1 - (len(text_wo_wildcards) /
                                      len(line.textstr))
                if wildcard_ratio > 0.70:
                    return True

    def toggle_predictor(self, filo_content):
        if self.config.PREDICTOR_AUFSICHTSRAT_ENABLED:
            if "Aufsichtsrat" in filo_content:
                self.use_aufsichtsrat_prediction = True
            if "Gründung:" in filo_content:
                self.use_aufsichtsrat_prediction = False

    def predict_char(self, filo_content):
        predicted_char = None
        if self.use_aufsichtsrat_prediction:
            if len(filo_content
                   ) >= 19:  # if filo_content bigger than one prediction chunk
                len_aufsichtsrat = 19
                predicted_char = self.predictor.predict_next_aufsichtsrat_chars(
                    len_aufsichtsrat, filo_content)
                # print("filo", filo_content,"predict:", predicted_char)
                # print("dd")
        return predicted_char

    def fill_filo_last_chars(self, voted_char):
        """
        fill filo for predictor usage with voted_char some additional chars around this char
        :param voted_char:
        :return:
        """

        if self.config.PREDICTOR_AUFSICHTSRAT_ENABLED:
            # create pre semi-tokenized input strings in the filos from the voted characters for prediction
            if voted_char == ' ':
                # the models usally use the 'ƿ' char in substitution for spaces
                self.filo_last_chars.push(' ', filterchar='¦')
                self.filo_last_chars.push('ƿ', filterchar='¦')
                self.filo_last_chars.push(' ', filterchar='¦')
            elif Random.is_special_character(voted_char):
                self.filo_last_chars.push(' ', filterchar='¦')
                self.filo_last_chars.push(voted_char, filterchar='¦')
                self.filo_last_chars.push(' ', filterchar='¦')

            else:
                self.filo_last_chars.push(voted_char, filterchar='¦')

    def increase_umlaut_confidence_searchspace(self, character_1, character_2,
                                               character_3, charconf_1,
                                               charconf_2, charconf_3):

        if self.config.MSA_BEST_SEARCHSPACE_INCREASE_UMLAUT_CONFIDENCE:
            clist = [character_1, character_2, character_3]
            conflist = [charconf_1, charconf_2, charconf_3]
            conflist_new = self.increase_umlaut_confidence(clist, conflist)
            charconf_1 = conflist_new[0]
            charconf_2 = conflist_new[1]
            charconf_3 = conflist_new[2]
            return charconf_1, charconf_2, charconf_3
        return charconf_1, charconf_2, charconf_3

    def maybe_replace_voted_by_predicted_char(self, voted_char, aufsichtsrat_prediction_toggled, predicted_char, \
                                              wildcard_character, voted_acc_conf, character_1, character_2, character_3):
        if aufsichtsrat_prediction_toggled:
            if Random.is_special_character(predicted_char):
                one_char_sc = Random.is_special_character(character_1) \
                              or Random.is_special_character(character_2) or Random.is_special_character(
                    character_3)
                voted_char_sc = Random.is_special_character(voted_char)

                if predicted_char != voted_char and (
                        one_char_sc
                        or voted_char_sc) and voted_char != wildcard_character:
                    # print("FiloContent:", filo_content)
                    self.cpr_sc_predict.print("pc:", predicted_char, "vc:",
                                              voted_char, "vc_acc",
                                              voted_acc_conf)
                    if voted_acc_conf <= 90.0:
                        if voted_char != '\f':  # don't swap formfeeds, they don't get predicted at all
                            self.cpr_sc_predict.print("swap")
                            voted_char = predicted_char

        return voted_char
class AdditionalInfoHandler(object):
    """
    Looks up an additional information file (Excel or JSON) which belongs to
    the database of a processed file and returns its content as a dictionary.
    """

    def __init__(self):
        config_handler = ConfigurationHandler(first_init=False)

        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_ADDITIONAL_INFO_HANDLER,
                                    self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL,
                                    leading_tag=self.__class__.__name__)
        self.cpr.print("init additional info handler")

    def write_excel_to_json(self,
                            fileinfo,
                            filepath,
                            filetype,
                            idxcol=None,
                            parse_cols=None,
                            page=0):
        """
        At the moment a little helper script for the Aktienführer-Project.
        Be free to modify as you wish.

        Searches below filepath for exactly one file ending with
        "<dbname>.<filetype>". If it is an Excel file it is indexed by the
        "ProfileID" column and dumped as JSON next to the Excel file (same
        name, extension replaced by ".json").
        :param fileinfo: object providing the attribute dbname
        :param filepath: root directory which is searched recursively
        :param filetype: file extension without dot, only "xlsx" and "xls" are converted
        :param idxcol: currently unused
        :param parse_cols: currently unused
        :param page: currently unused
        :return: always None
        """
        additional_filepath = path.normpath(
            f"{filepath}/**/*{fileinfo.dbname}.{filetype}")
        matches = glob.glob(additional_filepath, recursive=True)
        if len(matches) != 1:
            return None

        if filetype in ["xlsx", "xls"]:
            excel_path = matches[0]
            df = pd.read_excel(excel_path).set_index("ProfileID")
            jsondata = {fileinfo.dbname: {"Year": fileinfo.dbname}}
            jsondata.update(df.to_dict(orient="index"))
            # Only swap the extension: a plain replace("xlsx", "json") left
            # ".xls" paths untouched (overwriting the Excel file itself) and
            # also rewrote directory names containing "xlsx".
            json_path = path.splitext(excel_path)[0] + ".json"
            with open(json_path, "w") as output:
                json.dump(jsondata, output, indent=4)
        return None

    def fetch_additional_information_simple(self, file):
        """
        Same as fetch additional information, but config related info is already included in given
        parameters
        :param file: fileinfo object, handed over to fetch_additional_information
        :return: additional info, None if ADDITIONAL_INFORMATION is disabled in the config
        """
        if not self.config.ADDITIONAL_INFORMATION:
            return None

        return self.fetch_additional_information(
            file,
            self.config.INPUT_ADDINFOPATH,
            idxcol=self.config.IDXCOL,
            parse_cols=self.config.PARSE_COLS,
            filetype=self.config.INPUT_ADDINFOFILETPYE)

    def fetch_additional_information(self,
                                     fileinfo,
                                     filepath,
                                     filetype,
                                     idxcol=None,
                                     parse_cols=None,
                                     page=0):
        """
        Reads an additional file with information
        It searches the file where the index_name matches tablename or dbname
        :param fileinfo: object providing the attributes dbname and tablename
        :param filepath: root directory which is searched recursively for "*<dbname>.<filetype>"
        :param filetype: "xlsx", "xls" or "json", every other value returns None
        :param idxcol: (Excel only) column which holds the db- or tablename
        :param parse_cols: (Excel only) columns to read; the list is not modified,
                           None selects every column except idxcol
        :param page: currently unused
        :return: additional info as dict with the keys "db" and/or "table",
                 None if not exactly one file was found or the filetype is unknown
        """
        additional_filepath = path.normpath(
            f"{filepath}/**/*{fileinfo.dbname}.{filetype}")
        matches = glob.glob(additional_filepath, recursive=True)

        if len(matches) > 1:
            self.cpr.printex(
                "More than one additional information file was found!")
            return None
        if not matches:
            self.cpr.printex("No additional information file was found!")
            return None

        file = matches[0]
        current_db_and_table = {
            "db": fileinfo.dbname,
            "table": fileinfo.tablename
        }
        if filetype in ["xlsx", "xls"]:
            return self._read_excel_infos(file, current_db_and_table, idxcol,
                                          parse_cols)
        if filetype == "json":
            return self._read_json_infos(file, current_db_and_table)
        return None

    def _read_excel_infos(self, file, current_db_and_table, idxcol,
                          parse_cols):
        """Collects per db/table the column contents of all matching Excel rows."""
        info_df = pd.read_excel(file)  #.set_index("ProfileID")
        if parse_cols is None:
            parse_cols = list(info_df.columns)
        # Build a new list instead of parse_cols.remove(idxcol): the given
        # list usually is the shared config value and must stay intact,
        # otherwise the second call fails because idxcol is already gone.
        data_cols = [col for col in parse_cols if col != idxcol]

        infos = {}
        for db_and_table_id, current_name in current_db_and_table.items():
            collected = {}
            matching_rows = info_df.loc[info_df[idxcol] == current_name][
                data_cols].to_dict(orient="index")
            for rubric_content in matching_rows.values():
                for rubric, content in rubric_content.items():
                    existing = collected.get(rubric, None)
                    if existing is None:
                        collected[rubric] = content
                    elif existing != content:
                        # differing values of several rows are gathered in a list
                        if not isinstance(existing, list):
                            collected[rubric] = [existing]
                        collected[rubric].append(content)
            infos[db_and_table_id] = collected
        return infos

    def _read_json_infos(self, file, current_db_and_table):
        """Loads the JSON file and keeps only entries which fit the current db/table."""
        with open(file, "r", encoding="utf-8") as add_info_file:
            infos = json.load(add_info_file)

        for original_key in reversed(list(infos.keys())):
            compare_key = original_key
            if self.config.ADD_INFO_SIMPLIFIED_NAME_COMPARISON:
                compare_key = original_key.split("-")[0]

            # entries whose name is not even part of the tablename are dropped
            if compare_key not in current_db_and_table['table']:
                del infos[original_key]
                continue

            matching_ids = [
                db_and_table_id
                for db_and_table_id, current_name in current_db_and_table.items()
                if current_name == compare_key
            ]
            if matching_ids:
                # Pop once and assign to every match, deleting inside the loop
                # raised a KeyError when dbname and tablename are identical.
                content = infos.pop(original_key)
                for db_and_table_id in matching_ids:
                    infos[db_and_table_id] = content
        return infos
Exemple #4
0
class IsriHandler(object):
    """
    Thin wrapper which calls command line programs (accuracy, synctext,
    accsum, wordacc, editop, ...) through subprocess; judging by the class
    name these are the ISRI analytic tools.

    Every wrapper method is best effort: exceptions raised while building or
    running the command are not propagated but reported through self.cpr.
    """

    def __init__(self):
        """
        Reads the configuration and sets up conditional printing.
        :raises OSError: if os.name is neither 'linux' nor 'posix'
        """
        self.os = os.name.lower()
        config_handler = ConfigurationHandler(first_init=False)
        self.config = config_handler.get_config()

        if 'ExceptionInitializing' in self.config:
            print("Exception initializing config, don't print")
            self.cpr = ConditionalPrint(False, False, False)
        else:
            self.cpr = ConditionalPrint(self.config.PRINT_MSA_HANDLER,
                                        self.config.PRINT_EXCEPTION_LEVEL,
                                        self.config.PRINT_WARNING_LEVEL)

        if self.os not in ('linux', 'posix'):
            raise OSError(
                "Untested operating system adapt code and continue at own risk"
            )

    def _call_tool(self, calls, path_stdout=None):
        """
        Runs the command given as argument list.
        :param calls: program name followed by its arguments
        :param path_stdout: if given, stdout of the program is written to this
                            file (created or emptied first); the file is closed
                            even if the call fails
        """
        if path_stdout is None:
            call(calls)
            return

        with self.create_file_if_doesnt_exist(path_stdout, True) as filehandle:
            call(calls, stdout=filehandle)

    def accuracy(self,
                 path_correctfile,
                 path_generatedfile,
                 path_accuracy_report=""):
        """
        Calls 'accuracy' with the correct file and the generated file.
        :param path_accuracy_report: optional third argument, left out of the
                                     command when empty or None
        """
        try:
            calls = ["accuracy", path_correctfile, path_generatedfile]
            # an empty report path must not be passed as an (empty) argument
            if path_accuracy_report:
                calls.append(path_accuracy_report)
            call(calls)
        except Exception as ex:
            self.cpr.printex("Exception calling accuracy", ex)

    class SynctextConfig(object):
        """Accumulates command line flags for the 'synctext' call."""

        def __init__(self):
            self._used_config_acc = []

        def use_T_algorithm(self):
            self._used_config_acc.append("-T")

        def use_H_algorithm(self):
            self._used_config_acc.append("-H")

        def use_case_insensitive(self):
            self._used_config_acc.append("-i")

        def use_display_suspect_markers_in_output(self):
            self._used_config_acc.append("-s")

        def get_used_config(self):
            return self._used_config_acc

        def clear_used_config(self):
            self._used_config_acc = []

    def synctext(self,
                 filepaths,
                 path_generatedfile=None,
                 synctext_config=None):
        """
        Calls 'synctext' with the configured flags and the given files.
        :param filepaths: iterable of input file paths
        :param path_generatedfile: optional file which receives stdout
        :param synctext_config: SynctextConfig, None means no flags
        """
        try:
            # a fresh config per call instead of one instance shared as default
            if synctext_config is None:
                synctext_config = self.SynctextConfig()
            calls = ["synctext"]
            calls.extend(synctext_config.get_used_config())
            calls.extend(filepaths)
            self._call_tool(calls, path_generatedfile)
        except Exception as ex:
            self.cpr.printex("Exception calling synctext", ex)

    def accsum(self, filepaths_accreports, path_generatedfile=None):
        """
        Calls 'accsum' with the given report files.
        :param path_generatedfile: optional file which receives stdout
        """
        try:
            calls = ["accsum"]
            calls.extend(filepaths_accreports)
            self._call_tool(calls, path_generatedfile)
        except Exception as ex:
            self.cpr.printex("Exception calling accsum", ex)

    def groupacc(self,
                 path_groupfile,
                 path_accuracy_report,
                 path_groupacc_report=None):
        """
        Calls 'groupacc' with the group file and the accuracy report.
        :param path_groupacc_report: optional third argument; the file is
                                     created or emptied before the call
        """
        try:
            calls = ["groupacc", path_groupfile, path_accuracy_report]
            if path_groupacc_report is not None:
                # the report is passed as argument, not filled through stdout
                self.create_file_if_doesnt_exist(path_groupacc_report,
                                                 True).close()
                calls.append(path_groupacc_report)
            call(calls)
        except Exception as ex:
            self.cpr.printex("Exception calling groupacc", ex)

    def accdist(self, filepaths_accreports, path_generated_xyfile=None):
        """
        Calls 'accdist' with the given report files.
        :param path_generated_xyfile: optional file which receives stdout
        """
        try:
            calls = ["accdist"]
            calls.extend(filepaths_accreports)
            self._call_tool(calls, path_generated_xyfile)
        except Exception as ex:
            self.cpr.printex("Exception calling accdist", ex)

    class NGramConfig(object):
        """Accumulates command line flags for the 'ngram' call."""

        def __init__(self):
            self._used_config_acc = []

        def set_ngram_size(self, number):
            # sizes outside of 1..3 are silently ignored
            if 1 <= number <= 3:
                self._used_config_acc.append("-n")
                self._used_config_acc.append(str(number))

        def clear_used_config(self):
            self._used_config_acc = []

        def get_used_config(self):
            return self._used_config_acc

    def ngram(self,
              filepaths,
              path_generatedfile=None,
              ngram_config=None):
        """
        Calls 'ngram' with the configured flags and the given files.
        :param filepaths: iterable of input file paths
        :param path_generatedfile: optional file which receives stdout
        :param ngram_config: NGramConfig, None means no flags
        """
        try:
            if ngram_config is None:
                ngram_config = self.NGramConfig()
            calls = ["ngram"]
            calls.extend(ngram_config.get_used_config())
            calls.extend(filepaths)
            self._call_tool(calls, path_generatedfile)
        except Exception as ex:
            self.cpr.printex("Exception calling ngram", ex)

    class VoteConfig(object):
        """Accumulates command line flags for the 'vote' call."""

        def __init__(self):
            self._used_config_acc = []

        def enable_O_optimization(self):
            self._used_config_acc.append("-O")

        def set_s(self, fraction_counter, fraction_denominator):
            # formatted, so numbers are accepted as well as strings
            self._used_config_acc.append("-s")
            self._used_config_acc.append(
                f"{fraction_counter}/{fraction_denominator}")

        def set_w(self, fraction_counter, fraction_denominator):
            # formatted, so numbers are accepted as well as strings
            self._used_config_acc.append("-w")
            self._used_config_acc.append(
                f"{fraction_counter}/{fraction_denominator}")

        def set_output_file(self, path_outputfile):
            self._used_config_acc.append("-o")
            self._used_config_acc.append(path_outputfile)  #ok?

        def clear_used_config(self):
            self._used_config_acc = []

        def get_used_config(self):
            return self._used_config_acc

    def vote(self, filepaths, ngram_config=None):
        """
        Calls 'vote' with the configured flags and the given files.
        :param filepaths: iterable of input file paths
        :param ngram_config: VoteConfig (parameter name kept for
                             compatibility), None means no flags
        """
        try:
            if ngram_config is None:
                ngram_config = self.VoteConfig()
            calls = ["vote"]
            calls.extend(ngram_config.get_used_config())
            calls.extend(filepaths)
            call(calls)
        except Exception as ex:
            self.cpr.printex("Exception calling vote", ex)

    def wordacc(self,
                path_correctfile,
                path_comparison_file,
                path_stopwordfile=None,
                path_wordacc_report=None):
        """
        Calls 'wordacc' with the correct file and the comparison file.
        :param path_stopwordfile: optional, passed with the flag -S
        :param path_wordacc_report: optional trailing argument
        """
        try:
            calls = ["wordacc"]
            if path_stopwordfile is not None:
                calls.extend(["-S", path_stopwordfile])
            calls.extend([path_correctfile, path_comparison_file])
            if path_wordacc_report is not None:
                calls.append(path_wordacc_report)
            call(calls)
        except Exception as ex:
            self.cpr.printex("Exception calling wordacc", ex)

    def wordaccsum(self, filepaths_wordacc_reports, path_accsumreport=None):
        """
        Calls 'wordaccsum' with the given report files.
        :param path_accsumreport: optional file which receives stdout
        """
        try:
            calls = ["wordaccsum"]
            calls.extend(filepaths_wordacc_reports)
            self._call_tool(calls, path_accsumreport)
        except Exception as ex:
            self.cpr.printex("Exception calling wordaccsum", ex)

    def nonstopacc(self,
                   path_stopwordfile,
                   path_wordacc_report,
                   path_output_xyfile=None):
        """
        Calls 'nonstopacc' with the stopword file and the wordacc report.
        :param path_output_xyfile: optional file which receives stdout
        """
        try:
            calls = ["nonstopacc", path_stopwordfile, path_wordacc_report]
            self._call_tool(calls, path_output_xyfile)
        except Exception as ex:
            self.cpr.printex("Exception calling nonstopacc", ex)

    def wordaccci(self, filepaths_wordacc_reports, path_outputfile=None):
        """
        Calls 'wordaccci' with the given report files.
        :param path_outputfile: optional file which receives stdout
        """
        try:
            calls = ["wordaccci"]
            calls.extend(filepaths_wordacc_reports)
            self._call_tool(calls, path_outputfile)
        except Exception as ex:
            self.cpr.printex("Exception calling wordaccci", ex)

    def wordaccdist(self, filepaths_wordacc_reports, path_output_xyfile=None):
        """
        Calls 'wordaccdist' with the given report files.
        :param path_output_xyfile: optional file which receives stdout
        """
        try:
            calls = ["wordaccdist"]
            calls.extend(filepaths_wordacc_reports)
            self._call_tool(calls, path_output_xyfile)
        except Exception as ex:
            self.cpr.printex("Exception calling wordaccdist", ex)

    def wordfreq(self, filepaths_inputtext, path_resultfile=None):
        """
        Calls 'wordfreq' with the given text files.
        :param path_resultfile: optional file which receives stdout
        """
        try:
            calls = ["wordfreq"]
            calls.extend(filepaths_inputtext)
            self._call_tool(calls, path_resultfile)
        except Exception as ex:
            self.cpr.printex("Exception calling wordfreq", ex)

    #todo add the zoning programs some day: point 4 in doc
    def editop(self,
               path_correctfile,
               path_comparison_file,
               path_editop_report=None):
        """
        Calls 'editop' with the correct file and the comparison file.
        :param path_editop_report: optional trailing argument
        """
        try:
            calls = ["editop", path_correctfile, path_comparison_file]
            if path_editop_report is not None:
                calls.append(path_editop_report)
            call(calls)
        except Exception as ex:
            self.cpr.printex("Exception calling editop", ex)

    def editopsum(self, filepaths_editopreports, path_summed_report=None):
        """
        Calls 'editopsum' with the given report files.
        :param path_summed_report: optional file which receives stdout
        """
        try:
            calls = ["editopsum"]
            calls.extend(filepaths_editopreports)
            self._call_tool(calls, path_summed_report)
        except Exception as ex:
            self.cpr.printex("Exception calling editopsum", ex)

    def editopcost(self,
                   path_editop_report,
                   path_editop_report2=None,
                   path_output_xyfile=None):
        """
        Calls 'editopcost' with one or two editop reports.
        :param path_editop_report2: optional second report
        :param path_output_xyfile: optional file which receives stdout
        """
        try:
            calls = ["editopcost", path_editop_report]
            if path_editop_report2 is not None:
                calls.append(path_editop_report2)
            self._call_tool(calls, path_output_xyfile)
        except Exception as ex:
            self.cpr.printex("Exception calling editopcost", ex)

    def create_file_if_doesnt_exist(self, filepath, overwrite=False):
        """
        Opens filepath for reading and writing, the file is created if missing.
        :param overwrite: True empties an existing file, False keeps its
                          content (writes are appended)
        :return: the open file object, the caller has to close it
        """
        # 'w+' always truncates, so it may only be used if overwriting is wanted
        mode = 'w+' if overwrite else 'a+'
        return open(filepath, mode)

    def delete_file_content(self, pfile):
        """Empties the given open file object."""
        pfile.seek(0)
        pfile.truncate()
class DictionaryHandler(object):
    """
    Loads the json dictionaries for titles and functions from
    <current working directory>/additionals/dictionaries and offers a lookup
    which separates a known title from a name.
    """

    def __init__(self):
        config_handler = ConfigurationHandler(first_init=False)

        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_DICTIONARY_HANDLER,
                                    self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL,
                                    leading_tag=self.__class__.__name__)

        self.cpr.print("init dictionary handler")
        self.data_functs = None  # storage for json object
        self.data_titles = None  # storage for json object
        self.texts_functs = None  # (text, length) tuples, longest first
        self.texts_titles = None  # (text, length) tuples, longest first
        if self.config.USE_DICTIONARIES_FOR_PERSON_PARSING:
            self.load_dictionaries()
            # get the rows as sorted list of texts longest first
            if self.data_functs is not None:
                self.texts_functs = self.sort_rows(
                    self.get_rows(self.data_functs))
            if self.data_titles is not None:
                self.texts_titles = self.sort_rows(
                    self.get_rows(self.data_titles))

    def diff_name_title(self, text_to_check):
        """
        Splits a text into name and title with the help of the titles dictionary.

        The first dictionary title contained in the text (the list is sorted
        longest first) is cut out once, the stripped remainder is the name.
        :param text_to_check: text which may contain a title
        :return: tuple (name, title); (text_to_check, "") if no title matches
                 or no titles dictionary is loaded
        """
        name_found = text_to_check
        title_found = ""

        # the titles are missing if dictionaries are disabled or the file
        # could not be found, treat this like "no title matches"
        if not self.texts_titles:
            return name_found, title_found

        len_text_to_check = len(text_to_check)
        for title, tlen in self.texts_titles:
            # accelerate the process, by skipping comparisons which have longer texts
            if tlen > len_text_to_check:
                continue
            # compare the texts
            if title in text_to_check:
                name_found = text_to_check.replace(title, "", 1).strip()
                title_found = title
                break

        return name_found, title_found

    def load_dictionaries(self):
        """
        Loads dict_titles.json and dict_functs.json from the dictionary path
        into data_titles and data_functs. A missing file is reported through
        the conditional printer and leaves the attribute unchanged.
        """
        base_dict_path = self.get_dict_path()

        filepath_titles_dict = os.path.join(base_dict_path, "dict_titles.json")
        filepath_functs_dict = os.path.join(base_dict_path, "dict_functs.json")

        # load titles (json is read as utf-8, independent of the platform default)
        if os.path.exists(filepath_titles_dict):
            with open(filepath_titles_dict, encoding="utf-8") as f:
                self.data_titles = json.load(f)
        else:
            self.cpr.printex(
                "dictionary dict_titles.json missing at specificied path",
                filepath_titles_dict)

        # load functs
        if os.path.exists(filepath_functs_dict):
            with open(filepath_functs_dict, encoding="utf-8") as f:
                self.data_functs = json.load(f)
        else:
            self.cpr.printex(
                "dictionary dict_functs.json missing at specificied path",
                filepath_functs_dict)

    def get_rows(self, dict_data):
        """
        :param dict_data: loaded dictionary with the key 'rows', the first
                          element of every row is the text
        :return: list of tuples (text, length of text)
        """
        return [(entry[0], len(entry[0])) for entry in dict_data['rows']]

    def sort_rows(self, rows):
        """
        Sorts the rows in place, longest text first.
        :return: the same (now sorted) list
        """
        #itemgetter(1),
        rows.sort(key=lambda t: len(t[0]), reverse=True)
        return rows

    def path(self):
        """:return: the current working directory"""
        return os.getcwd()

    def get_dict_path(self):
        """:return: path of the dictionary folder below the working directory"""
        return os.path.join(self.path(), "additionals", "dictionaries")
Exemple #6
0
class OCRset:
    """
        A storage class for a y_mean value
        and a set of lines which were assigned to each other.
        If the lineset values were not edited, they are initialized with False.
    """
    N_DISTANCE_SHORTEST_TAG = "n_distance_shortest"

    def __init__(self, lines_size, y_mean, msa_handler):
        """
        :param lines_size: number of line slots in this set
        :param y_mean: mean y coordinate of all lines referenced in this set
        :param msa_handler: handler used later on to calculate the best msa text
        """
        # one slot per line, False marks a slot which was not edited yet
        self._set_lines = [False] * lines_size
        self._size = lines_size
        self._y_mean = y_mean
        self.shortest_distance_line_index = -1
        self._unspaced = False  # indicates the set_lines was unspaced
        self._refspaced = False  # indicates the set_lines was reference spaced
        self._text_unspacer = TextUnspacer()
        self.shortest_distance_line = None  # holder element for recognized shortest distance line
        self._best_msa_text = ""
        self._text_seg = None
        self._is_origin_database = False
        self._database_handler = None
        self._config = ConfigurationHandler(first_init=False).get_config()

        if 'ExceptionInitializing' in self._config:
            # without a usable config nothing is printed at all
            print("Exception initializing config, don't print")
            print_flags = (False, False, False)
        else:
            print_flags = (self._config.PRINT_MSA_HANDLER,
                           self._config.PRINT_EXCEPTION_LEVEL,
                           self._config.PRINT_WARNING_LEVEL)
        self._cpr = ConditionalPrint(*print_flags)

        self._msa_handler = msa_handler

    def add_predictor(self, predictor):
        """Stores the predictor on this set and forwards it to the msa handler."""
        self.predictor = predictor
        self._msa_handler.add_predictor(predictor)

    def is_database_set(self, enabled, database_handler):
        """
        Despite its name this is a setter: it stores whether the set
        originates from a database and the database handler to use.
        """
        self._is_origin_database = enabled
        self._database_handler = database_handler

    def edit_line_set_value(self, set_index, new_value):
        """Replaces the line stored at position set_index with new_value."""
        self._set_lines[set_index] = new_value

    def get_line_set_value_line(self, set_index):
        """:return: the line stored at position set_index (False if it was never edited)"""
        return self._set_lines[set_index]

    def get_line_set_value_text(self, set_index):
        value_line = self.get_line_set_value_line(set_index)
        value_text = self.get_line_content(value_line)
        return value_text

    def get_msa_best_text(self):
        """:return: the stored best msa text ("" until one was set or calculated)"""
        return self._best_msa_text

    def set_msa_best_text(self, value):
        """Overwrites the stored best msa text with value."""
        self._best_msa_text = value

    @property
    def size(self):
        """Number of line slots of this set, as stored in _size."""
        return self._size

    @size.setter
    def size(self, value):
        # only the stored number changes, _set_lines is not resized here
        self._size = value

    @property
    def y_mean(self):
        return self._y_mean

    @y_mean.setter
    def y_mean(self, value):
        self.y_mean = value

    def calculate_y_mean(self):
        """
        Goes through set elements and calculates y_mean for y_start and y_stop values
        :return:
        """

        acc_counter = 0
        y_start_final = 0
        y_stop_final = 0

        for line in self._set_lines:
            # don't count undefined values for means
            if line is False or line is None:
                continue
            # accumulate y-values
            (x_start, y_start, x_stop, y_stop) = line.coordinates
            y_start_final = y_start_final + y_start
            y_stop_final = y_stop_final + y_stop
            # add number of accumulation count
            acc_counter = acc_counter + 1

        y_start_mean = y_start_final / acc_counter
        y_stop_mean = y_stop_final / acc_counter
        y_mean = (y_start_mean + y_stop_mean) / 2

        self._y_mean = round(y_mean)

    def is_full(self):
        """
        Checks if all lines are defined in the lineset
        :return: True or False
        """
        for line in self._set_lines:
            if line is False:
                return False

        return True

    def print_me(self, diff_only=False):

        lineset_acc = ""
        one_line_is_false = False

        for line in self._set_lines:
            try:
                ocr_text = self.get_line_content(line)
                if ocr_text is False:
                    one_line_is_false = True
                    lineset_acc = lineset_acc + str(ocr_text) + "||"
                else:
                    lineset_acc = lineset_acc + ocr_text + "||"

            except:
                self._cpr.print("problem creating printable lineset ")

        lineset_acc = lineset_acc + "||"
        msa_str = str(self._best_msa_text)
        if diff_only is True:
            if one_line_is_false is True:
                self._cpr.print(
                    str(self.y_mean) + "||" + msa_str + "||" +
                    str(self.shortest_distance_line_index) + "||" +
                    lineset_acc)
        else:
            self._cpr.print(
                str(self.y_mean) + "||" + msa_str + "||" +
                str(self.shortest_distance_line_index) + "||" + lineset_acc)

    def calculate_n_distance_keying(self):
        """
        Lets an NDistanceVoter compare the texts of all lines and stores the
        index it returns in shortest_distance_line_index and the line at that
        index in shortest_distance_line.
        """
        texts = [self.get_line_content(line) for line in self._set_lines]
        self._n_distance_voter = NDistanceVoter(texts)

        if "ExceptionInitializing" in self._config.keys():
            # config is not usable, fall back to fixed voter settings
            print("Exception in initializing config using default in c")
            longest_on_empty = True
            without_spaces = False
        else:
            longest_on_empty = self._config.NDIST_VOTE_LONGEST_IF_EMPTY_STRINGS
            without_spaces = self._config.NDIST_VOTE_WITHOUT_SPACES

        winner_index = self._n_distance_voter.compare_texts(
            take_longest_on_empty_lines=longest_on_empty,
            vote_without_spaces=without_spaces)

        # save the result
        self.shortest_distance_line_index = winner_index
        self.shortest_distance_line = self._set_lines[winner_index]

    def calculate_n_distance_keying_wordwise(self):
        """
        Word by word n-distance voting, only allowed for sets which were
        marked as database originated.

        NOTE(review): this looks like work in progress. The voting loop runs
        on hard coded mock words, the code collecting the real words is
        disabled, and the results are only printed, nothing is stored.
        :raises Exception: if the set was not marked as database originated
        :return: None
        """
        if self._is_origin_database is False:
            self._cpr.printex(
                "Wordwise keying only possible with database originated ocr_sets"
            )
            raise Exception

        # get maximum word index todo probably will be refactored
        max_word_indices = []
        for line in self._set_lines:
            # undefined or empty lines count as having no words
            if line is False or line is None or line.textstr == '':
                max_word_indices.append(0)
            else:
                max_word_index = int(max(line.data["word_idx"]))
                max_word_indices.append(max_word_index)

        max_word_index = max(max_word_indices)
        self._cpr.print("mwi", max_word_index)

        # helper which concatenates all chars of a line carrying the given
        # calculated word index; only referenced by the disabled code below
        def get_word_at_calc_wordindex(line, word_index):
            accumulated_word = ""
            word_indices = line.data["calc_word_idx"]

            for char_index, char in enumerate(line.data["char"]):
                current_word_index = word_indices[char_index]
                if current_word_index == word_index:
                    accumulated_word += char
                # stops at the first larger index, presumably the indices are ascending
                if current_word_index > word_index:
                    break
            return accumulated_word

        # NOTE(review): the maximum calculated above is overwritten here and
        # mock words are used instead of the words of the lines
        max_word_index = 2
        words_mock = [["hallo", "h4llo", "hallo"], ["zwei", None, "2wei"]]
        ndist_voter = NDistanceVoter(None)

        # get corresponding words
        for current_word_index in range(0, max_word_index):
            words = []
            """
            for line in self._set_lines:
                if line is False or line is None:
                    words.append(False)
                else:
                    if current_word_index < int(max(line.data["calc_word_idx"])):
                        current_word = get_word_at_calc_wordindex(line, current_word_index)
                        words.append(current_word)
                    else:
                        words.append(False)
            """

            # the string above is the disabled word collection, the mock
            # entry of this word index is voted instead
            words = words_mock[current_word_index]
            ndist_voter.set_texts(words)
            wordindex_result = ndist_voter.compare_texts( \
                take_longest_on_empty_lines = self._config.NDIST_VOTE_LONGEST_IF_EMPTY_STRINGS, \
                vote_without_spaces=self._config.NDIST_VOTE_WITHOUT_SPACES)

            ndist_voter.reset()
            self._cpr.print(words[wordindex_result])
            self._cpr.print("--")
            # just assume words is filled here and a 3 word list

        return

    def get_longest_index(self):
        def if_notdef_set_emptystring(value):
            if value is True or value is False or value is None:
                return ""

            return value

        lsval_1 = if_notdef_set_emptystring(
            self.get_line_content(self.get_line_set_value_line(0)))
        lsval_2 = if_notdef_set_emptystring(
            self.get_line_content(self.get_line_set_value_line(1)))
        lsval_3 = if_notdef_set_emptystring(
            self.get_line_content(self.get_line_set_value_line(2)))

        len_pline_1 = len(lsval_1)
        len_pline_2 = len(lsval_2)
        len_pline_3 = len(lsval_3)
        # max_index_value = max([len_pline_1, len_pline_2, len_pline_3])
        max_index = np.argmax([len_pline_1, len_pline_2, len_pline_3])
        self._cpr.print(max_index)
        return max_index

    def calculate_msa_best(self,
                           take_n_dist_best_index=False,
                           take_longest_as_pivot=False):
        """
        Determines the best text of the three lines and stores it in _best_msa_text.

        With three usable texts the msa handler picks the best of three, with
        one or two usable texts the first usable one is taken and without any
        usable text the result is None. If anything fails the content of the
        line at index 1 is stored instead.
        :param take_n_dist_best_index: use the shortest n-distance index as pivot
        :param take_longest_as_pivot: use the longest line as pivot, wins over
                                      take_n_dist_best_index
        """
        # do a preselection of best element, without any option line 1 is the pivot
        if take_longest_as_pivot is True:
            pivot_index = self.get_longest_index()
        elif take_n_dist_best_index is True:
            pivot_index = self.get_shortest_n_distance_index()
        else:
            pivot_index = 1

        remaining = [0, 1, 2]
        remaining.remove(pivot_index)
        first_other, second_other = remaining

        self._cpr.print("msa selection taking best:", pivot_index, "others:(",
                        first_other, "and", second_other, ")")

        try:
            text_a = self.get_line_content(self._set_lines[first_other])
            text_b = self.get_line_content(
                self._set_lines[pivot_index])  # should be best
            text_c = self.get_line_content(self._set_lines[second_other])

            self._cpr.print("ocr_set:")
            self._cpr.print("text_A", text_a)
            self._cpr.print("text_B", text_b)
            self._cpr.print("text_C", text_c)

            usable_texts = [
                text for text in (text_a, text_b, text_c)
                if not Random.is_false_true_or_none(text)
            ]

            if len(usable_texts) == 3:
                result = self._msa_handler.get_best_of_three(
                    text_a, text_b, text_c)
            elif usable_texts:
                # one or two usable texts: no voting, the first one is taken
                result = usable_texts[0]
            else:
                result = None

            self._best_msa_text = result
        except Exception as e:
            self._cpr.printex(
                "ocr_set.py Exception in MSA, just taking line prio exception:",
                e)
            self._cpr.printex("trace is:", inspect.trace())

            self._best_msa_text = self.get_line_content(self._set_lines[1])

    def obtain_best_index(self,
                          use_n_dist_pivot,
                          use_longest_pivot,
                          default_best_index=1):
        """
        Determine the pivot index and the two remaining indices of the set.
        :param use_n_dist_pivot: take the shortest-n-distance index as pivot
        :param use_longest_pivot: take the index of the longest line as pivot,
         overrides the n-distance pivot when both are set
        :param default_best_index: pivot used when no preselection mode
         provides an index
        :return: tuple of best_index and a list with the two other indices
        """
        # start from the caller's preference instead of a hard-coded index
        best_index = default_best_index

        if use_n_dist_pivot is True:
            # this doesn't work in all cases atm: the getter returns None when
            # no shortest distance line is known, the default is kept then
            ldist_best_index = self.get_shortest_n_distance_index()
            if ldist_best_index is not None:
                best_index = ldist_best_index
        if use_longest_pivot is True:
            best_index = self.get_longest_index()

        # remove() raises ValueError for a pivot outside of 0..2
        other_indices = [0, 1, 2]
        other_indices.remove(best_index)
        return best_index, other_indices

    def obtain_line_info(self, best_index, other_indices):
        """
        Collect the three lines of the set with the pivot line in the middle,
        together with their text contents and a flag per line telling if the
        line object is usable (not False, True or None).
        :param best_index: index of the pivot line in self._set_lines
        :param other_indices: the two remaining indices
        :return: texts, lines, ok-flags and the number of usable lines
        """

        # the pivot is placed between the two other lines
        ordered_indices = (other_indices[0], best_index, other_indices[1])
        lines_return = [self._set_lines[index] for index in ordered_indices]
        texts_return = [self.get_line_content(line) for line in lines_return]

        self._cpr.print("ocr_set:")
        for label, text in zip(("text_A", "text_B", "text_C"), texts_return):
            self._cpr.print(label, text)

        # the flags are derived from the line objects, not from the texts
        lines_return_ok = [
            not Random.is_false_true_or_none(line) for line in lines_return
        ]
        ok_len = sum(lines_return_ok)

        return texts_return, lines_return, lines_return_ok, ok_len

    def calculate_msa_best_all(self,
                               use_ndist_pivot,
                               use_longest_pivot,
                               use_charconfs,
                               use_wordwise,
                               use_searchspaces,
                               prefered_index=1):
        """
        Run the configured msa variant on the set and store the result in
        self._best_msa_text (None when no line is ok).
        :param use_ndist_pivot: forwarded to obtain_best_index
        :param use_longest_pivot: forwarded to obtain_best_index
        :param use_charconfs: forwarded to the msa handler
        :param use_wordwise: use the wordwise best-of-three, which additionally
         fills self._text_seg
        :param use_searchspaces: forwarded to the msa handler
        :param prefered_index: forwarded to obtain_best_index as default index
        :return: None
        """

        # pivot index and the remaining indices
        best_index, other_indices = self.obtain_best_index(
            use_ndist_pivot, use_longest_pivot, prefered_index)
        self._cpr.print("msa selection taking best:", best_index, "others:(",
                        other_indices[0], "and", other_indices[1], ")")

        # lines to process plus the information how many of them are ok
        _, lines, _, number_lines_ok = self.obtain_line_info(
            best_index, other_indices)

        # one ok line is enough to go on (confidence vote can be done with one line also)
        if number_lines_ok == 0:
            result = None
        elif use_wordwise is True:
            result, self._text_seg = self._msa_handler.get_best_of_three_wordwise(
                lines[0], lines[1], lines[2], use_charconfs,
                use_searchspaces)
        else:
            first_text = self.get_line_content(lines[0])
            pivot_text = self.get_line_content(lines[1])  # should be best
            last_text = self.get_line_content(lines[2])

            result = self._msa_handler.get_best_of_three(
                first_text,
                pivot_text,
                last_text,
                line_1=lines[0],
                line_2=lines[1],
                line_3=lines[2],
                use_charconfs=use_charconfs,
                use_searchspaces=use_searchspaces)

        self._best_msa_text = result

    def calculate_msa_best_charconf(self,
                                    take_n_dist_best_index=False,
                                    take_longest_as_pivot=True):
        """
        Select a pivot line, run the msa handler's best-of-three with
        character confidences on the set and store the outcome in
        self._best_msa_text. The result is None when none of the three texts
        is usable. If anything in that step raises, the content of a fallback
        line is stored instead: the n-distance line when one was obtained,
        otherwise the pivot line.
        :param take_n_dist_best_index: use the shortest-n-distance index as pivot
        :param take_longest_as_pivot: use the index of the longest line as
         pivot, overrides the n-distance pivot when both are set
        :return: None, the result is written to self._best_msa_text
        """

        # index 1 stays the pivot unless a preselection mode provides another one
        best_index = 1
        ldist_best_index = None

        if take_n_dist_best_index is True:
            # this doesn't work in all cases atm: the getter returns None when
            # no shortest distance line is known, the current pivot is kept then
            ldist_best_index = self.get_shortest_n_distance_index()
            if ldist_best_index is not None:
                best_index = ldist_best_index
        if take_longest_as_pivot is True:
            best_index = self.get_longest_index()

        indices = [0, 1, 2]
        indices.remove(best_index)
        index1, index2 = indices

        self._cpr.print("msa selection taking best:", best_index, "others:(",
                        index1, "and", index2, ")")

        try:
            line_1 = self._set_lines[index1]
            line_2 = self._set_lines[best_index]
            line_3 = self._set_lines[index2]

            text_1 = self.get_line_content(line_1)
            text_2 = self.get_line_content(line_2)  # should be best
            text_3 = self.get_line_content(line_3)

            self._cpr.print("ocr_set:")
            self._cpr.print("text_A", text_1)
            self._cpr.print("text_B", text_2)
            self._cpr.print("text_C", text_3)

            # a single usable text is sufficient to call the msa handler
            any_text_ok = any(
                not Random.is_false_true_or_none(text)
                for text in (text_1, text_2, text_3))

            if any_text_ok:
                result = self._msa_handler.get_best_of_three(
                    text_1,
                    text_2,
                    text_3,
                    use_charconfs=True,
                    line_1=line_1,
                    line_2=line_2,
                    line_3=line_3)
            else:
                result = None

            self._best_msa_text = result
        except Exception as e:
            self._cpr.printex(
                "ocr_set.py Exception in MSA, just taking line prio exception:",
                e)
            tr = inspect.trace()
            self._cpr.printex("trace is:", tr)

            # prefer the n-distance line when one was obtained, never index with None
            if ldist_best_index is not None:
                fallback_index = ldist_best_index
            else:
                fallback_index = best_index
            self._best_msa_text = self.get_line_content(
                self._set_lines[fallback_index])

    def get_shortest_n_distance_text(self):
        """
        Get the text content of the stored shortest distance line.
        :return: line content, or None if shortest_distance_line_index is negative
        """
        has_line = self.shortest_distance_line_index >= 0
        return self.get_line_content(
            self.shortest_distance_line) if has_line else None

    def set_shortest_n_distance_text(self, value):
        """
        Write a new text into the stored shortest distance line and store the
        line returned by set_line_content as the new shortest distance line.
        Nothing happens if shortest_distance_line_index is negative.
        :param value: text passed on to set_line_content
        :return: None
        """
        if not self.shortest_distance_line_index >= 0:
            return None

        updated_line = self.set_line_content(self.shortest_distance_line,
                                             value)
        self.set_shortest_n_distance_line(updated_line)

    def get_shortest_n_distance_line(self):
        """
        Get the stored shortest distance line.
        :return: the line, or None if shortest_distance_line_index is negative
        """
        has_line = self.shortest_distance_line_index >= 0
        return self.shortest_distance_line if has_line else None

    def set_shortest_n_distance_line(self, value):
        """
        Store a line as the shortest distance line of this set.
        :param value: line to store in shortest_distance_line
        :return: None
        """
        self.shortest_distance_line = value

    def get_shortest_n_distance_index(self):
        """
        Get the index of the shortest distance line.
        :return: shortest_distance_line_index, or None if that index is negative
        """
        current_index = self.shortest_distance_line_index
        return self.shortest_distance_line_index if current_index >= 0 else None

    def print_shortest_n_distance_line(self):
        """
        Print the text of the shortest distance line through the conditional
        printer; nothing is printed when the text is None or False.
        :return: None
        """
        text = self.get_shortest_n_distance_text()
        if text is None or text is False:
            return
        self._cpr.print(text)

    def print_msa_best_line(self):
        """
        Print the stored best msa text. A defined text goes to the builtin
        print, None or False are passed as string to the conditional printer.
        :return: None
        """
        best_text = self._best_msa_text
        if best_text is None or best_text is False:
            self._cpr.print(str(best_text))
            return
        print(best_text)

    def get_line_content(self, line):
        """
        Helper method to read the text of a line element. Sets which do not
        originate from a database provide 'ocr_text_normalized' (preferred)
        and 'ocr_text', database sets provide 'textstr'.
        :param line: line element to read the text from
        :return: string with line content, or False if line isn't defined
        """

        # an undefined line is represented by False
        if line is False:
            return False

        if self._is_origin_database is not False:
            return line.textstr

        # standard behaviour
        # hint from the original code: the normalized attribute is created by hocr_line_normalizer
        normalized_text = line.ocr_text_normalized
        return line.ocr_text if normalized_text is None else normalized_text

    def set_line_content(self, line, value):
        """
        Helper method to write the text of a line element into its
        'ocr_text_normalized' property.
        :param line: line element which receives the value
        :param value: value assigned to 'ocr_text_normalized'
        :return: the line, or False if line isn't defined
        """

        # an undefined line (False) is handed back untouched
        if line is not False:
            # hint from the original code: this attribute is created by hocr_line_normalizer
            line.ocr_text_normalized = value

        return line

    def unspace_lines(self, list_index_to_unspace, unspaced_list_index):
        """
        Replace the set lines by the result of the text unspacer's
        unspace_texts and mark the set as unspaced.
        :param list_index_to_unspace: forwarded to unspace_texts
        :param unspaced_list_index: forwarded to unspace_texts
        :return: None
        """
        # state is only touched after the unspacer call went through
        result_lines = self._text_unspacer.unspace_texts(
            self._set_lines, list_index_to_unspace, unspaced_list_index)

        self._unspaced, self._refspaced = True, False
        self._set_lines = result_lines

    def refspace_lines(self, list_index_to_adapt, list_index_reference):
        """
        Replace the set lines by the result of the text unspacer's
        refspace_texts and mark the set as refspaced.
        :param list_index_to_adapt: forwarded to refspace_texts
        :param list_index_reference: forwarded to refspace_texts
        :return: None
        """
        # state is only touched after the unspacer call went through
        result_lines = self._text_unspacer.refspace_texts(
            self._set_lines, list_index_to_adapt, list_index_reference)

        self._unspaced, self._refspaced = False, True
        self._set_lines = result_lines