Example #1
    def __init__(self, lines_size, y_mean, msa_handler):
        lineset = [False] * lines_size  # one "unset" flag per line

        self._set_lines = lineset
        self._size = lines_size
        self._y_mean = y_mean  # mean y coordinate of all lines referenced in this set
        self.shortest_distance_line_index = -1
        self._unspaced = False  # indicates the set_lines was unspaced
        self._refspaced = False  # indicates the set_lines was reference spaced
        self._text_unspacer = TextUnspacer()
        self.shortest_distance_line = None  # holder element for recognized shortest distance line
        self._best_msa_text = ""
        self._text_seg = None
        self._is_origin_database = False
        self._database_handler = None
        config_handler = ConfigurationHandler(first_init=False)
        self._config = config_handler.get_config()

        if 'ExceptionInitializing' in self._config:
            print("Exception initializing config, don't print")
            self._cpr = ConditionalPrint(False, False, False)
        else:

            self._cpr = ConditionalPrint(self._config.PRINT_MSA_HANDLER,
                                         self._config.PRINT_EXCEPTION_LEVEL,
                                         self._config.PRINT_WARNING_LEVEL)

        self._msa_handler = msa_handler
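
A note on the pattern above: ConditionalPrint is constructed disabled whenever the configuration failed to initialize. Below is a minimal, self-contained sketch of that fallback; the _ConditionalPrintStub is hypothetical and only mirrors the (do_print, exception_level, warning_level) constructor seen in these examples, and a plain dict stands in for the real config object.

    class _ConditionalPrintStub:
        # Hypothetical stand-in for ConditionalPrint: models only the
        # constructor shape and the print() method.
        def __init__(self, do_print, exception_level, warning_level, leading_tag=""):
            self.do_print = do_print
            self.leading_tag = leading_tag

        def print(self, *args):
            if self.do_print:
                print(self.leading_tag, *args)

    def make_printer(config):
        # Fall back to a silenced printer when config loading failed.
        if 'ExceptionInitializing' in config:
            return _ConditionalPrintStub(False, False, False)
        return _ConditionalPrintStub(config['PRINT_MSA_HANDLER'],
                                     config['PRINT_EXCEPTION_LEVEL'],
                                     config['PRINT_WARNING_LEVEL'])

    make_printer({'PRINT_MSA_HANDLER': True,
                  'PRINT_EXCEPTION_LEVEL': 'error',
                  'PRINT_WARNING_LEVEL': 'warn'}).print("hello")
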
Example #2
    def __init__(self):
        self.os = os.name.lower()
        config_handler = ConfigurationHandler(first_init=False)
        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_MSA_HANDLER,
                                    self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL)
Example #3
    def __init__(self):

        config_handler = ConfigurationHandler(first_init=False)

        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_SEGMENT_CLASSIFIER, self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL, leading_tag=self.__class__.__name__)
        self.cpr.print("init segment classifier")
Example #4
    def __init__(self):
        config_handler = ConfigurationHandler(first_init=False)

        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_ADDITIONAL_INFO_HANDLER,
                                    self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL,
                                    leading_tag=self.__class__.__name__)
        self.cpr.print("init additional info handler")
Example #5
    def __init__(self):
        config_handler = ConfigurationHandler(first_init=False)

        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_TABLE_HANDLER, self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL)
        self.PRINT_TO_CHECKFILE = False
        # a line starting with these words can't be in a table
        self.filter_start_words = ["Fernruf:", "Vorstand:", "Fernschreiber:",
                                   "von","Gründung:", "Ordnungsnr.", "Ordnungsnr",
                                   "Grundkapital:","Umstellung"]
Example #6
    def __init__(self, endobject_factory, output_analyzer, dictionary_handler):
        config_handler = ConfigurationHandler(first_init=False)

        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_SEGMENT_PARSER_AKF_FN_TWO, self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL, leading_tag=self.__class__.__name__)

        self.cpr.print("init akf parsing functions two")

        self.ef = endobject_factory
        self.output_analyzer = output_analyzer
        self.dictionary_handler = dictionary_handler
Example #7
    def __init__(self):
        self.my_object = {}
        self.current_main_list = None
        self.pp = pprint.PrettyPrinter(indent=5)

        config_handler = ConfigurationHandler(first_init=False)

        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_OUTPUT_ANALYSIS, self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL, leading_tag=self.__class__.__name__)

        if self.config.REMOVE_TAGS_IN_ORIG_DIFF:
            self.known_uc = KnownUncategories()
Example #8
    def __init__(self):
        config_handler = ConfigurationHandler(first_init=False)

        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_FEATURE_EXTRACTOR,
                                    self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL,
                                    leading_tag=self.__class__.__name__)

        self.filter_start_words = [
            "Fernruf:", "Vorstand:", "Fernschreiber:", "von", "Gründung:",
            "Ordnungsnr.", "Ordnungsnr", "Grundkapital:", "Umstellung"
        ]
Example #9
class AkfParsingFunctionsTablesOne(object):
    def __init__(self, endobject_factory, output_analyzer, dictionary_handler):
        config_handler = ConfigurationHandler(first_init=False)

        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(
            self.config.PRINT_SEGMENT_PARSER_AKF_FN_TABLES_ONE,
            self.config.PRINT_EXCEPTION_LEVEL,
            self.config.PRINT_WARNING_LEVEL,
            leading_tag=self.__class__.__name__)

        self.cpr.print("init akf parsing functions tables one")

        self.ef = endobject_factory
        self.output_analyzer = output_analyzer
        self.dictionary_handler = dictionary_handler

    def parse_aktienkurse(self, real_start_tag, content_texts, content_lines,
                          feature_lines, segmentation_class):
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        # logme
        self.output_analyzer.log_segment_information(
            segmentation_class.segment_tag, content_texts, real_start_tag)

    def parse_dividenden(self, real_start_tag, content_texts, content_lines,
                         feature_lines, segmentation_class):
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        # logme
        self.output_analyzer.log_segment_information(
            segmentation_class.segment_tag, content_texts, real_start_tag)

    def parse_dividenden_auf_xyaktien(self, real_start_tag, content_texts,
                                      content_lines, feature_lines,
                                      segmentation_class):
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        # logme
        self.output_analyzer.log_segment_information(
            segmentation_class.segment_tag, content_texts, real_start_tag)
Example #10
    def __init__(self):

        config_handler = ConfigurationHandler(first_init=False)
        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_SPECIALCHAR_PREDICTOR,
                                    self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL)
Example #11
    def __init__(self, y_size, x_size, wildcard_character,
                 substitution_character):
        self._y_size = y_size
        self._x_size = x_size
        self._middle_index = Random.find_middle(self._x_size, True)
        self._pre_middle_index = self.get_middle_index() - 1
        self._nex_middle_index = self.get_middle_index() + 1

        self._wildcard_character = wildcard_character
        self._substitution_character = substitution_character
        self.similar_chars = []
        self.similar_chars.append(['o', 'ö'])
        self.similar_chars.append(['<', 'o'])  # untested: is this really better?
        self.similar_chars.append(['O', 'Ö'])
        self.similar_chars.append(['0', 'O', '9'])
        self.similar_chars.append(['d', 'ö'])
        #self.similar_chars.append(['1', 'l'])
        self.similar_chars.append(['l', 'j', '1'])
        self.similar_chars.append(['I', 'l'])
        self.similar_chars.append(['u', 'ü'])
        self.similar_chars.append(['U', 'Ü', 'O'])
        self.similar_chars.append(['a', 'ä'])
        self.similar_chars.append(['A', 'Ä'])
        self.similar_chars.append([':', ';'])
        self.similar_chars.append(['-', '¬'])
        self.similar_chars.append(['"', "'"])
        self.similar_chars.append(['C', "G", "c"])
        # just for testing ...
        self.similar_chars.append(['.', ','])
        self.similar_chars.append([',', ';'])
        self.similar_chars.append(['v', 'V'])
        self.similar_chars.append(['w', 'W'])

        self.similar_chars.append(['i', 'l', 't', '1',
                                   '.'])  # 1 l i also possible
        self.similar_chars.append(['r', 'n'])
        self.similar_chars.append(['%', 'm'])
        self.similar_chars.append(['&', 'é'])
        self.similar_chars.append(['e', 'é'])

        config_handler = ConfigurationHandler(first_init=False)
        self._config = config_handler.get_config()
        self._cpr = ConditionalPrint(self._config.PRINT_SEARCH_SPACE_PROCESSOR,
                                     self._config.PRINT_EXCEPTION_LEVEL,
                                     self._config.PRINT_WARNING_LEVEL)
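
similar_chars groups characters that OCR engines tend to confuse. How the table is consumed is not shown in this snippet, so the lookup helper below is purely illustrative of how such groups could be queried when building a substitution search space:

    def get_confusable(similar_chars, char):
        # Collect every character that shares a confusion group with `char`.
        alternatives = set()
        for group in similar_chars:
            if char in group:
                alternatives.update(group)
        alternatives.discard(char)
        return sorted(alternatives)

    table = [['o', 'ö'], ['0', 'O', '9'], ['O', 'Ö']]
    print(get_confusable(table, 'O'))  # -> ['0', '9', 'Ö']
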
Example #12
    def __init__(self,
                 predictor=None,
                 vocabulary_checker=None,
                 first_config_init=False):
        self.ocr_sets = []
        self.line_height_information = []
        config_handler = ConfigurationHandler(first_init=first_config_init)
        self.config = config_handler.get_config()

        if 'ExceptionInitializing' in self.config:
            self.cpr = ConditionalPrint(False, False, False)
        else:
            self.cpr = ConditionalPrint(self.config.PRINT_MSA_HANDLER,
                                        self.config.PRINT_EXCEPTION_LEVEL,
                                        self.config.PRINT_WARNING_LEVEL)

        self.predictor = predictor
        self.vocabulary_checker = vocabulary_checker
Example #13
    def __init__(self):
        self.os = os.name.lower()
        config_handler = ConfigurationHandler(first_init=False)
        self.config = config_handler.get_config()

        if 'ExceptionInitializing' in self.config:
            print("Exception initializing config, don't print")
            self.cpr = ConditionalPrint(False, False, False)
        else:

            self.cpr = ConditionalPrint(self.config.PRINT_MSA_HANDLER,
                                        self.config.PRINT_EXCEPTION_LEVEL,
                                        self.config.PRINT_WARNING_LEVEL)

        if self.os != 'linux' and self.os != 'posix':
            raise OSError(
                "Untested operating system adapt code and continue at own risk"
            )
Example #14
    def __init__(self):
        config_handler = ConfigurationHandler(first_init=False)

        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_VOCABULARY_CHECKER,
                                    self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL)
        self.dict_lines = []
        self.max_edist = None
        self.suggenstion_verbosity = None
        #self.spellchecker = None
        self.special_chars_borders = "!¦1234567890,)(;.:\"-"

        self.pattern_start = re.compile(r"^[" + self.special_chars_borders +
                                        "]+")
        self.pattern_trail = re.compile(r"[" + self.special_chars_borders +
                                        "]+$")
        self.pattern_trail_dash = re.compile(r"[-]$")
        self.pattern_only_normal_chars = re.compile(r"[a-zA-Z]+")
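
pattern_start and pattern_trail strip leading and trailing noise characters from a token before vocabulary lookup. A self-contained sketch of that normalization step; the character set is copied from the example, while strip_borders is a hypothetical helper:

    import re

    special_chars_borders = "!¦1234567890,)(;.:\"-"
    pattern_start = re.compile(r"^[" + special_chars_borders + "]+")
    pattern_trail = re.compile(r"[" + special_chars_borders + "]+$")

    def strip_borders(token):
        token = pattern_start.sub("", token)
        return pattern_trail.sub("", token)

    print(strip_borders("\"Grundkapital:,"))  # -> Grundkapital
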
Example #15
    def __init__(self,
                 output_analyzer,
                 dictionary_handler,
                 ocromore_data=None):

        self.ef = EndobjectFactory()
        self.dictionary_handler = dictionary_handler

        # map which maps tags to parsing functions -> change constructor for other projects
        fmap = FunctionMapAKF(self.ef, output_analyzer, dictionary_handler)

        config_handler = ConfigurationHandler(first_init=False)

        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_SEGMENT_PARSER,
                                    self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL,
                                    leading_tag=self.__class__.__name__)

        self.function_map = fmap.get_function_map()
        self.result_root = self.config.OUTPUT_ROOT_PATH + "/results/"
Example #16
    def __init__(self):
        config_handler = ConfigurationHandler(first_init=False)

        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_DICTIONARY_HANDLER,
                                    self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL,
                                    leading_tag=self.__class__.__name__)

        self.cpr.print("init dictionary handler")
        self.data_functs = None  # storage for json object
        self.data_titles = None  # storage for json object
        self.texts_functs = None
        self.texts_titles = None
        if self.config.USE_DICTIONARIES_FOR_PERSON_PARSING:
            self.load_dictionaries()
            # get the rows as sorted list of texts longest first
            if self.data_functs is not None:
                check_tf = self.sort_rows(self.get_rows(self.data_functs))
                self.texts_functs = check_tf
            if self.data_titles is not None:
                check_tt = self.sort_rows(self.get_rows(self.data_titles))
                self.texts_titles = check_tt
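
load_dictionaries and sort_rows are not part of this snippet; going by the comment, sort_rows orders the dictionary texts longest first so that greedy matching prefers the most specific entry. A hypothetical sketch of that ordering:

    def sort_rows(rows):
        # Longest texts first, so "Dipl.-Ing." is tried before "Dr.".
        return sorted(rows, key=len, reverse=True)

    print(sort_rows(["Dr.", "Dipl.-Ing.", "Prof."]))
    # -> ['Dipl.-Ing.', 'Prof.', 'Dr.']
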
Example #17
class VisualizationHandler(object):
    def __init__(self):
        self.os = os.name.lower()
        config_handler = ConfigurationHandler(first_init=False)
        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_MSA_HANDLER,
                                    self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL)

    def show_file_comparison_pycharm(self, filepath_1, filepath_2):
        if self.os == 'linux' or self.os == 'posix':
            try:
                process = Popen(["charm", "diff", filepath_1, filepath_2])
                return process

            except Exception as ex:
                self.cpr.printex("Exception calling pycharm", ex)
        else:
            self.cpr.printex("Write code here for other os, or take other os")

        return None

    def show_file_comparison_meld(self,
                                  filepath_1,
                                  filepath_2,
                                  just_add_tab=False):
        if self.os == 'linux' or self.os == 'posix':
            try:
                if just_add_tab:
                    process = Popen(
                        ["meld", "--newtab", filepath_1, filepath_2])
                else:
                    process = Popen(["meld", filepath_1, filepath_2])
                return process

            except Exception as ex:
                self.cpr.printex("Exception calling meld", ex)
        else:
            self.cpr.printex("Write code here for other os, or take other os")

        return None
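
Both helpers assume the diff tool is installed and only guard by operating system. A small variant (an assumption, not part of the original class) that additionally checks availability with the standard-library shutil.which before spawning:

    import shutil
    from subprocess import Popen

    def open_diff(tool, filepath_1, filepath_2):
        # Return None instead of raising when the viewer is not on PATH.
        if shutil.which(tool) is None:
            return None
        return Popen([tool, filepath_1, filepath_2])

    # open_diff("meld", "results/a.txt", "results/b.txt")
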
Example #18
    def __init__(self):
        config_handler = ConfigurationHandler(first_init=False)
        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_MSA_HANDLER,
                                    self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL)
        self.cpr_vocab_check = ConditionalPrint(
            self.config.PRINT_VOCABULARY_CHECKER,
            self.config.PRINT_EXCEPTION_LEVEL, self.config.PRINT_WARNING_LEVEL)
        self.cpr_sc_predict = ConditionalPrint(
            self.config.PRINT_SPECIALCHAR_PREDICTOR,
            self.config.PRINT_EXCEPTION_LEVEL, self.config.PRINT_WARNING_LEVEL)

        self.filo_last_chars = Filo(250)
        self.predictor = None
        self.use_aufsichtsrat_prediction = False
        self.vocab_checker = None
        self.previous_word_with_seperator = False
Example #19
class AkfParsingFunctionsOne(object):
    def __init__(self, endobject_factory, output_analyzer, dictionary_handler):
        config_handler = ConfigurationHandler(first_init=False)

        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(
            self.config.PRINT_SEGMENT_PARSER_AKF_FN_ONE,
            self.config.PRINT_EXCEPTION_LEVEL,
            self.config.PRINT_WARNING_LEVEL,
            leading_tag=self.__class__.__name__)

        self.cpr.print("init akf parsing functions one")

        self.ef = endobject_factory
        self.output_analyzer = output_analyzer
        self.dictionary_handler = dictionary_handler

    def parse_firmenname(self, real_start_tag, content_texts, content_lines,
                         feature_lines, segmentation_class):
        # get basic data
        element_counter = 0

        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        # get relevant info
        accumulated_text = ""
        for text in content_texts:
            accumulated_text += " " + text

        only_add_if_value = False
        accumulated_text = accumulated_text.strip()
        self.ef.add_to_my_obj("Firmenname",
                              accumulated_text,
                              object_number=element_counter,
                              only_filled=only_add_if_value)

    def parse_sitz(self, real_start_tag, content_texts, content_lines,
                   feature_lines, segmentation_class):
        """
         "Sitz": [
                {
                  "origpost": "Mergenthalerallee 79-81, 65760 Eschborn Telefon:(069) 7 50 06-0 Telefax:(069) 7 50 06-111 e-mail:[email protected] Internetseite:http://www.3u.net ",
                  "type": "Sitz",
                  "street": "Mergenthalerallee",
                  "street_number": "79-81",
                  "zip": "65760",
                  "city": "Eschborn",
                  "phone": "(069) 7 50 06-0",
                  "fax": "(069) 7 50 06-111",
                  "email": [
                    "*****@*****.**"
                  ],
                  "www": [
                    "http://www.3u.net"
                  ]
                }
              ],
        """
        # get basic data
        element_counter = 0

        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        # get relevant info
        num_id, city, street, street_number, additional_info = cf.parse_id_location(
            origpost_red)

        # add stuff to ef
        only_add_if_value = True
        self.ef.add_to_my_obj("numID",
                              num_id,
                              object_number=element_counter,
                              only_filled=only_add_if_value)
        self.ef.add_to_my_obj("city",
                              city,
                              object_number=element_counter,
                              only_filled=only_add_if_value)
        self.ef.add_to_my_obj("street",
                              street,
                              object_number=element_counter,
                              only_filled=only_add_if_value)
        self.ef.add_to_my_obj("street_number",
                              street_number,
                              object_number=element_counter,
                              only_filled=only_add_if_value)
        self.ef.add_to_my_obj("additional_info",
                              additional_info,
                              object_number=element_counter,
                              only_filled=only_add_if_value)

        return True

    def parse_verwaltung(self, real_start_tag, content_texts, content_lines,
                         feature_lines, segmentation_class):
        # kmy_obj_2 = self.ef.print_me_and_return()
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        # logme
        # self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)

        if "srat" in real_start_tag:
            # Verwaltungsrat ..
            persons_final = cf.parse_persons(
                origpost_red, self.dictionary_handler,
                self.config.USE_DICTIONARIES_FOR_PERSON_PARSING)
            only_add_if_filed = True
            for entry in persons_final:
                name, first_name, last_name, city, title, funct, rest_info = entry
                self.ef.add_to_my_obj("name",
                                      name,
                                      object_number=element_counter,
                                      only_filled=only_add_if_filed)
                self.ef.add_to_my_obj("first_name",
                                      first_name,
                                      object_number=element_counter,
                                      only_filled=only_add_if_filed)
                self.ef.add_to_my_obj("last_name",
                                      last_name,
                                      object_number=element_counter,
                                      only_filled=only_add_if_filed)

                self.ef.add_to_my_obj("city",
                                      city,
                                      object_number=element_counter,
                                      only_filled=only_add_if_filed)
                self.ef.add_to_my_obj("title",
                                      title,
                                      object_number=element_counter,
                                      only_filled=only_add_if_filed)
                self.ef.add_to_my_obj("rest",
                                      rest_info,
                                      object_number=element_counter,
                                      only_filled=only_add_if_filed)
                self.ef.add_to_my_obj("funct",
                                      funct,
                                      object_number=element_counter,
                                      only_filled=only_add_if_filed)

                element_counter += 1
            return True
        elif "Verw." in real_start_tag:
            # Verw.
            num_id, city, street, street_number, additional_info = cf.parse_id_location(
                origpost_red)

            # add stuff to ef
            only_add_if_value = True
            self.ef.add_to_my_obj("numID",
                                  num_id,
                                  object_number=element_counter,
                                  only_filled=only_add_if_value)
            self.ef.add_to_my_obj("city",
                                  city,
                                  object_number=element_counter,
                                  only_filled=only_add_if_value)
            self.ef.add_to_my_obj("street",
                                  street,
                                  object_number=element_counter,
                                  only_filled=only_add_if_value)
            self.ef.add_to_my_obj("street_number",
                                  street_number,
                                  object_number=element_counter,
                                  only_filled=only_add_if_value)
            self.ef.add_to_my_obj("additional_info",
                                  additional_info,
                                  object_number=element_counter,
                                  only_filled=only_add_if_value)

            return True
        else:
            # Verwaltung
            final_items = cf.parse_general_and_keys(
                content_texts,
                join_separated_lines=False,
                current_key_initial_value="General_Info")
            for key in final_items.keys():
                value = final_items[key]
                if value is None or value == "":
                    continue
                self.ef.add_to_my_obj(key,
                                      value,
                                      object_number=element_counter,
                                      only_filled=True)
                element_counter += 1
            return True

    def parse_telefon_fernruf(self, real_start_tag, content_texts,
                              content_lines, feature_lines,
                              segmentation_class):

        # get basic data
        origpost, origpost_red, element_counter, content_texts = cf.add_check_element(
            self, content_texts, real_start_tag, segmentation_class, 0)
        # do special match: Verwaltung und Betriebshof
        split_post = []

        match_special = regex.match(
            r"(?<Verw>Verwaltung.*)"
            r"(?<Betr>Betriebshof.*)", origpost_red)
        if match_special:
            betriebshof = match_special.group("Betr")
            verwaltung = match_special.group("Verw")
            origpost_red = origpost_red.replace(betriebshof, "")
            origpost_red = origpost_red.replace(verwaltung, "")
            split_post.append(betriebshof)
            split_post.append(verwaltung)
        # do special match: Ortsgespräche and Ferngespräche

        match_special2 = regex.match(
            r"(?<og>Ortsgespräche.*)"
            r"(?<fg>Ferngespräche.*)", origpost_red)
        if match_special2:
            ortsgespr = match_special2.group("og")
            ferngespr = match_special2.group("fg")
            origpost_red = origpost_red.replace(ortsgespr, "")
            origpost_red = origpost_red.replace(ferngespr, "")
            split_post.append(ortsgespr)
            split_post.append(ferngespr)

        # do special match: Ortsverkehr and Fernverkehr

        match_special3 = regex.match(
            r"(?<ov>Ortsverkehr.*)"
            r"(?<fv>Fernverkehr.*)", origpost_red)
        if match_special3:
            ortsverkehr = match_special3.group("ov")
            fernverkehr = match_special3.group("fv")
            origpost_red = origpost_red.replace(ortsverkehr, "")
            origpost_red = origpost_red.replace(fernverkehr, "")
            split_post.append(ortsverkehr)
            split_post.append(fernverkehr)

        # do special match: check if only numbers
        origpost_red_new = origpost_red
        #only_num_check = origpost_red.replace("und", "").replace(",", "").replace(" ", "")
        test_split = regex.split(r"\su\.|\sund\s|,|;", origpost_red)
        for number in test_split:
            # additional parenthesis block
            match_parenthesis = regex.search(r"\(.*\)", number)
            parenthesis = None
            if match_parenthesis:
                parenthesis = match_parenthesis.group()
                number = number.replace(parenthesis, "")  # remove number
                self.ef.add_to_my_obj("vorwahl",
                                      parenthesis,
                                      object_number=element_counter,
                                      only_filled=True)

            match_word_num = regex.search(r"(?<word>[^\d]*)(?<num>[\d\s\-/]*)",
                                          number)
            if match_word_num is None:
                continue

            word = match_word_num.group("word")
            num = match_word_num.group("num")
            if "Sa." in word and "Nr" in word:
                continue
            number_stripped = num.strip(" ./").replace("/", "").replace(
                "-", "").replace(" ", "")
            if number_stripped.isdigit():
                origpost_red_new = origpost_red_new.replace(
                    number, "")  # remove number
                origpost_red_new = origpost_red_new.replace(
                    word, "")  # remove word found

                change1 = self.ef.add_to_my_obj("number_Sa.-Nr.",
                                                num.strip(),
                                                object_number=element_counter,
                                                only_filled=True)
                change2 = self.ef.add_to_my_obj("location",
                                                word.strip(),
                                                object_number=element_counter,
                                                only_filled=True)
                if change1 or change2:
                    element_counter += 1

        #if "32 20 47" in origpost_red:
        #    print("asd")

        origpost_red = origpost_red_new
        # substitute in a separator char to integrate delimiters in next step
        origpost_red = regex.sub(r"(\d\.)", r"\1~~~~", origpost_red)

        # do  further matches (sc-separated)
        split_post.extend(regex.split(r';|~~~~|\su\.', origpost_red))

        for index, entry in enumerate(split_post):
            if entry is None:
                continue
            entry_stripped = entry.strip()
            if entry_stripped == "":
                continue

            # additional parenthesis block
            match_parenthesis = regex.search(r"\(.*\)", entry_stripped)
            parenthesis = None
            if match_parenthesis:
                parenthesis = match_parenthesis.group()
                entry_stripped = entry_stripped.replace(parenthesis,
                                                        "")  # remove entry
                self.ef.add_to_my_obj("vorwahl",
                                      parenthesis,
                                      object_number=element_counter,
                                      only_filled=True)

            match_word = regex.match(r"(?<Tag>\D*)"
                                     r"(?<Numbers>[\d\s\W]*)", entry_stripped)
            if match_word is not None:
                # fetch match results
                tag_match = match_word.group("Tag")
                numbers_match = match_word.group("Numbers")
                rest_from_entry_str = entry_stripped.replace(tag_match, "", 1)
                rest_from_entry_str = rest_from_entry_str.replace(
                    numbers_match, "", 1)

                tag = dh.strip_if_not_none(tag_match, "")
                match_tag = regex.match(
                    r"(?<rest_bef>.*)(?<sanr>Sa\.?\-Nr\.?)(?<rest_end>.*)",
                    tag)
                location = ""
                if match_tag is not None:
                    rest_tag = match_tag.group('rest_bef')
                    rest_tag_2 = match_tag.group('rest_end')
                    # sanr = match_tag.group('sanr') # this is the filtered group
                    location = dh.strip_if_not_none(
                        rest_tag + " " + rest_tag_2, ":., ")
                else:
                    # if there are no real descriptors in tag then tag is usually location  (like Düsseldorf 1 36 62.)
                    location = tag

                if "und" in location:
                    location = regex.sub(r"[^\w]und[^\w]", "", location)

                number = dh.strip_if_not_none(numbers_match, "., ")
                self.ef.add_to_my_obj("number_Sa.-Nr.",
                                      number.strip(),
                                      object_number=element_counter,
                                      only_filled=True)
                self.ef.add_to_my_obj("location",
                                      location.strip(),
                                      object_number=element_counter,
                                      only_filled=True)
                additional_info_entry_level = dh.strip_if_not_none(
                    rest_from_entry_str, ",. ")
                self.ef.add_to_my_obj("additional_info",
                                      additional_info_entry_level.strip(),
                                      object_number=element_counter,
                                      only_filled=True)
                element_counter += 1

                origpost_red = origpost_red.replace(number, "", 1)
                origpost_red = origpost_red.replace(location, "", 1)

        origpost_red = origpost_red.replace("Sa.-Nr", "").replace("~~~~", "")
        origpost_red_end = dh.remove_multiple_outbound_chars(origpost_red)

        if len(origpost_red_end) > 3:
            self.ef.add_to_my_obj("additional_info_unparsed",
                                  origpost_red_end.strip(),
                                  object_number=element_counter)

    def parse_vorstand(self, real_start_tag, content_texts, content_lines,
                       feature_lines, segmentation_class):

        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        persons_final = cf.parse_persons(
            origpost_red, self.dictionary_handler,
            self.config.USE_DICTIONARIES_FOR_PERSON_PARSING)

        only_add_if_filed = True
        for entry in persons_final:
            name, first_name, last_name, city, title, funct, rest_info = entry
            self.ef.add_to_my_obj("name",
                                  name,
                                  object_number=element_counter,
                                  only_filled=only_add_if_filed)
            self.ef.add_to_my_obj("first_name",
                                  first_name,
                                  object_number=element_counter,
                                  only_filled=only_add_if_filed)
            self.ef.add_to_my_obj("last_name",
                                  last_name,
                                  object_number=element_counter,
                                  only_filled=only_add_if_filed)
            self.ef.add_to_my_obj("city",
                                  city,
                                  object_number=element_counter,
                                  only_filled=only_add_if_filed)
            self.ef.add_to_my_obj("title",
                                  title,
                                  object_number=element_counter,
                                  only_filled=only_add_if_filed)
            self.ef.add_to_my_obj("rest",
                                  rest_info,
                                  object_number=element_counter,
                                  only_filled=only_add_if_filed)
            self.ef.add_to_my_obj("funct",
                                  funct,
                                  object_number=element_counter,
                                  only_filled=only_add_if_filed)
            element_counter += 1
        """
        # do  matches (;-separated)
        split_post = origpost_red.split(';')

        for index, entry in enumerate(split_post):
            entry_stripped = entry.strip()

            if index == len(split_post)-1:
                matchend = regex.match("^[Aa]lle", entry_stripped)
                if matchend:
                    self.ef.add_to_my_obj("additional_info", entry_stripped, object_number=element_counter)
                    element_counter += 1
                    continue

            match = regex.match(r"(?<Name>.*)[,]"             # find location string
                                r"(?<Rest>.*+)",              # just get the rest which is usually streetname and number, but has other possibilities
                                entry_stripped)
            if match is None:
                name = dh.strip_if_not_none(entry_stripped, ", ")
                self.ef.add_to_my_obj("name", name, object_number=element_counter)
                element_counter += 1
                continue

            name = dh.strip_if_not_none(match.group("Name"), ", ")
            rest = dh.strip_if_not_none(match.group("Rest"), ",. ")
            name_split = name.split(',')
            if len(name_split) > 1:
                position = rest
                name = name_split[0]
                city = name_split[1]
            else:
                city = rest
                position = ""

            self.ef.add_to_my_obj("name", name, object_number=element_counter)
            self.ef.add_to_my_obj("city", city, object_number=element_counter)
            self.ef.add_to_my_obj("position", position, object_number=element_counter)
            element_counter += 1
            """

        return True

    def parse_aufsichtsrat(self, real_start_tag, content_texts, content_lines,
                           feature_lines, segmentation_class):

        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        # Try to fix +) problems
        origpost_red = origpost_red.replace("; +)", "+);").replace(
            ";+)", "+);").replace("')", "").replace("*)", "")

        persons_final = cf.parse_persons(
            origpost_red, self.dictionary_handler,
            self.config.USE_DICTIONARIES_FOR_PERSON_PARSING)

        only_add_if_filed = True
        for entry in persons_final:
            name, first_name, last_name, city, title, funct, rest_info = entry
            self.ef.add_to_my_obj("name",
                                  name,
                                  object_number=element_counter,
                                  only_filled=only_add_if_filed)
            self.ef.add_to_my_obj("first_name",
                                  first_name,
                                  object_number=element_counter,
                                  only_filled=only_add_if_filed)
            self.ef.add_to_my_obj("last_name",
                                  last_name,
                                  object_number=element_counter,
                                  only_filled=only_add_if_filed)
            self.ef.add_to_my_obj("city",
                                  city,
                                  object_number=element_counter,
                                  only_filled=only_add_if_filed)
            self.ef.add_to_my_obj("title",
                                  title,
                                  object_number=element_counter,
                                  only_filled=only_add_if_filed)
            self.ef.add_to_my_obj("rest",
                                  rest_info,
                                  object_number=element_counter,
                                  only_filled=only_add_if_filed)
            self.ef.add_to_my_obj("funct",
                                  funct,
                                  object_number=element_counter,
                                  only_filled=only_add_if_filed)
            element_counter += 1

        return True

    def parse_arbeitnehmervertreter(self, real_start_tag, content_texts,
                                    content_lines, feature_lines,
                                    segmentation_class):
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        persons_final = cf.parse_persons(
            origpost_red, self.dictionary_handler,
            self.config.USE_DICTIONARIES_FOR_PERSON_PARSING)
        only_add_if_filed = True
        for entry in persons_final:
            name, first_name, last_name, city, title, funct, rest_info = entry
            self.ef.add_to_my_obj("name",
                                  name,
                                  object_number=element_counter,
                                  only_filled=only_add_if_filed)
            self.ef.add_to_my_obj("first_name",
                                  first_name,
                                  object_number=element_counter,
                                  only_filled=only_add_if_filed)
            self.ef.add_to_my_obj("last_name",
                                  last_name,
                                  object_number=element_counter,
                                  only_filled=only_add_if_filed)
            self.ef.add_to_my_obj("city",
                                  city,
                                  object_number=element_counter,
                                  only_filled=only_add_if_filed)
            self.ef.add_to_my_obj("title",
                                  title,
                                  object_number=element_counter,
                                  only_filled=only_add_if_filed)
            self.ef.add_to_my_obj("rest",
                                  rest_info,
                                  object_number=element_counter,
                                  only_filled=only_add_if_filed)
            self.ef.add_to_my_obj("funct",
                                  funct,
                                  object_number=element_counter,
                                  only_filled=only_add_if_filed)

            element_counter += 1

        return True

    # Gruendung
    def parse_gruendung(self, real_start_tag, content_texts, content_lines,
                        feature_lines, segmentation_class):
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
        match_year = regex.search(r"^\d*", origpost_red.strip())
        if match_year:
            result = match_year.group()
            origpost_red_new = origpost_red.replace(result, "", 1)
            year = dh.strip_if_not_none(result, ".,() ")
            rest_info = dh.strip_if_not_none(origpost_red_new, ".,() ")
            self.ef.add_to_my_obj("rest_info",
                                  rest_info,
                                  object_number=element_counter,
                                  only_filled=True)
            self.ef.add_to_my_obj("year",
                                  year,
                                  object_number=element_counter,
                                  only_filled=True)
        else:
            rest_info = dh.strip_if_not_none(origpost_red, ".,() ")
            self.ef.add_to_my_obj("rest_info",
                                  rest_info,
                                  object_number=element_counter,
                                  only_filled=True)

    # Tätigkeitsgebiet
    def parse_taetigkeitsgebiet(self, real_start_tag, content_texts,
                                content_lines, feature_lines,
                                segmentation_class):
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        final_items = cf.parse_general_and_keys(
            content_texts,
            join_separated_lines=False,
            current_key_initial_value="General_Info")

        for key in final_items.keys():
            value = final_items[key]
            if value is None or len(value) == 0:
                continue
            self.ef.add_to_my_obj(key,
                                  value,
                                  object_number=element_counter,
                                  only_filled=True)
            element_counter += 1
Example #20
class FeatureExtractor():
    def __init__(self):
        config_handler = ConfigurationHandler(first_init=False)

        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_FEATURE_EXTRACTOR,
                                    self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL,
                                    leading_tag=self.__class__.__name__)

        self.filter_start_words = [
            "Fernruf:", "Vorstand:", "Fernschreiber:", "von", "Gründung:",
            "Ordnungsnr.", "Ordnungsnr", "Grundkapital:", "Umstellung"
        ]

    def extract_file_features(self, ocromore_data):
        all_line_features = []
        for line in ocromore_data['lines']:
            current_line_features = self.extract_line_features(line)
            all_line_features.append(current_line_features)

        ocromore_data['line_features'] = all_line_features

        return ocromore_data

    def extract_line_features(self, line):

        final_line_features = {}

        whole_text = line['text']

        self.cpr.print("recognizing text:", whole_text)

        # counters
        counter_special_chars = 0
        counter_alphanumerical_chars = 0
        counter_numbers = 0
        counter_chars = len(whole_text)
        counter_alphabetical = 0
        counter_words = 0
        counters_alphabetical_ratios = []
        counters_wordlengths = []
        counters_numbers = []

        character_index = 0
        # special conditions
        ultimo_is_first_word = False
        first_word_no_table_indicator = False
        starts_with_parenthesis = False
        ends_with_parenthesis = False

        last_xstop = 0
        x_box_sizes = []
        x_gaps = []
        for word_obj in line['words']:
            word_index = word_obj['word_index']
            word_text = word_obj['text']
            hocr_coordinates = word_obj['hocr_coordinates']

            word_xstart = hocr_coordinates[0]
            word_xstop = hocr_coordinates[2]
            word_box_size = word_xstop - word_xstart
            x_box_sizes.append(word_box_size)

            if word_index >= 1:
                x_gap = word_xstop - last_xstop
                x_gaps.append(x_gap)

            #line.data['word_x0']
            if word_text is None or word_text == "":
                continue

            if word_index == 0:
                if word_text in self.filter_start_words:
                    first_word_no_table_indicator = True
                if word_text.lower() == "ultimo":
                    ultimo_is_first_word = True
                if word_text[0] == "(":
                    starts_with_parenthesis = True

            if word_index == len(line['words']) - 1:
                if word_text[-1] == ")":
                    ends_with_parenthesis = True

            counter_alphabetical_chars_word = 0
            counter_alphanumerical_chars_word = 0
            counter_numbers_word = 0

            counter_words += 1

            word_list = list(word_text)
            for char in word_list:
                if Random.is_special_character(char):
                    counter_special_chars += 1
                elif Random.is_alphanumerical_character(char):
                    counter_alphanumerical_chars += 1
                    counter_alphanumerical_chars_word += 1
                if char.isdigit():
                    counter_numbers += 1
                    counter_numbers_word += 1

            counter_alphabetical_word = counter_alphanumerical_chars_word - counter_numbers_word
            ratio_alphabetical_word = np.round(
                counter_alphabetical_word / len(word_text), 2)
            counters_alphabetical_ratios.append(ratio_alphabetical_word)
            counters_wordlengths.append(len(word_text))
            counters_numbers.append(counter_numbers_word)
            character_index += len(word_text)
            last_xstop = word_xstop

        # get number of spaces
        len_whole_unspace = len(whole_text.replace(" ", ""))
        counter_spaces = counter_chars - len_whole_unspace
        # set alphabetical counter
        counter_alphabetical = counter_alphanumerical_chars - counter_numbers

        if counter_chars == 0:
            self.cpr.printw("no chars in line:", str(line['line_index']),
                            "no features here")
            return False

        special_chars_ratio = counter_special_chars / counter_chars
        alphanumerical_chars_ratio = counter_alphanumerical_chars / counter_chars
        alphabetical_ratio = counter_alphabetical / counter_chars
        spaces_ratio = counter_spaces / counter_chars
        numbers_ratio = counter_numbers / counter_chars

        maximum_x_gap = None
        mean_x_gap = None
        median_x_gap = None

        if len(x_gaps) >= 1:
            maximum_x_gap = max(x_gaps)
            mean_x_gap = np.mean(x_gaps)
            median_x_gap = np.median(x_gaps)

        many_numbers_in_first_word = False
        many_alphabetical_in_middle_words = False
        many_alphabetical_in_last_word = False

        # check some middle and last word conditions
        for counter_index, counter in enumerate(counters_wordlengths):
            if counter_index == 0:
                ctr_numbers = counters_numbers[counter_index]
                numbers_ratio_word = np.round(ctr_numbers / counter, 2)
                if numbers_ratio_word > 0.8:
                    many_numbers_in_first_word = True
            elif counter_index == len(counters_wordlengths) - 1:
                if counter >= 4:
                    alphabetical_ratio_word = counters_alphabetical_ratios[
                        counter_index]
                    if alphabetical_ratio_word >= 0.75:
                        many_alphabetical_in_last_word = True

            else:
                if counter >= 4:
                    alphabetical_ratio_word = counters_alphabetical_ratios[
                        counter_index]
                    if alphabetical_ratio_word >= 0.75:
                        many_alphabetical_in_middle_words = True

        final_line_features = LineFeatures(cpr=self.cpr)
        final_line_features.many_alphabetical_in_last_word = many_alphabetical_in_last_word

        final_line_features.counter_special_chars = counter_special_chars
        final_line_features.counter_chars = counter_chars
        final_line_features.counter_spaces = counter_spaces
        final_line_features.counter_numbers = counter_numbers
        final_line_features.counter_alphabetical = counter_alphabetical
        final_line_features.counter_alphanumerical_chars = counter_alphanumerical_chars
        final_line_features.counter_words = counter_words

        final_line_features.counters_numbers = counters_numbers
        final_line_features.counters_wordlengths = counters_wordlengths
        final_line_features.counters_alphabetical_ratios = counters_alphabetical_ratios

        final_line_features.numbers_ratio = numbers_ratio
        final_line_features.alphabetical_ratio = alphabetical_ratio
        final_line_features.alphanumerical_chars_ratio = alphanumerical_chars_ratio
        final_line_features.special_chars_ratio = special_chars_ratio
        final_line_features.spaces_ratio = spaces_ratio

        final_line_features.many_alphabetical_in_last_word = many_alphabetical_in_last_word
        final_line_features.many_alphabetical_in_middle_words = many_alphabetical_in_middle_words
        final_line_features.many_numbers_in_first_word = many_numbers_in_first_word
        final_line_features.x_box_sizes = x_box_sizes
        final_line_features.x_gaps = x_gaps

        final_line_features.maximum_x_gap = maximum_x_gap
        final_line_features.mean_x_gap = mean_x_gap
        final_line_features.median_x_gap = median_x_gap

        return final_line_features
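
The extractor derives its per-line ratios from simple character counts. A compact, hedged restatement of that arithmetic on a plain string, where str.isdigit and str.isalpha stand in for the Random.* character classifiers used above:

    def line_ratios(text):
        n = len(text)
        if n == 0:
            return None  # mirrors the "no chars in line" early exit above
        digits = sum(c.isdigit() for c in text)
        alpha = sum(c.isalpha() for c in text)
        spaces = text.count(" ")
        return {"numbers_ratio": digits / n,
                "alphabetical_ratio": alpha / n,
                "spaces_ratio": spaces / n}

    print(line_ratios("Umsatz 1950: 12 345"))
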
Example #21
class TableHandler(object):

    def __init__(self):
        config_handler = ConfigurationHandler(first_init=False)

        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_TABLE_HANDLER, self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL)
        self.PRINT_TO_CHECKFILE = False
        # a line starting with these words can't be in a table
        self.filter_start_words = ["Fernruf:", "Vorstand:", "Fernschreiber:",
                                   "von","Gründung:", "Ordnungsnr.", "Ordnungsnr",
                                   "Grundkapital:","Umstellung"]

        # with open("checkfile_tables.txt", "w") as myfile:
        #     myfile.write("----" + "\n")

    def recognize_a_line(self, line):

        if line is None or isinstance(line, bool) or line.textstr is None:
            return False

        whole_text = line.textstr
        self.cpr.print("recognizing line:", whole_text)

        # counters
        counter_special_chars = 0
        counter_alphanumerical_chars = 0
        counter_numbers = 0
        counter_chars = len(whole_text)
        counter_alphabetical = 0
        counter_words = 0
        counters_alphabetical_ratios = []
        counters_wordlengths = []
        counters_numbers = []

        character_index = 0
        # special conditions
        ultimo_is_first_word = False
        first_word_no_table_indicator = False
        starts_with_parenthesis = False
        ends_with_parenthesis = False

        last_xstop = 0
        x_box_sizes = []
        x_gaps = []
        for key_index, key in enumerate(line.word['text']):
            word = line.word['text'][key]
            uid_info = line.word['UID'][key]
            word_xstart = line.data['word_x0'][character_index]
            word_xstop = line.data['word_x1'][character_index]
            word_box_size = word_xstop - word_xstart
            x_box_sizes.append(word_box_size)

            if key_index >= 1:
                x_gap = word_xstop - last_xstop
                x_gaps.append(x_gap)

            #line.data['word_x0']
            if word is None or word == "":
                continue

            if key_index == 0:
                if word in self.filter_start_words:
                    first_word_no_table_indicator = True
                if word.lower() == "ultimo":
                    ultimo_is_first_word = True
                if word[0] == "(":
                    starts_with_parenthesis = True


            if key_index == len(line.word['text'])-1:
                if word[-1] == ")":
                    ends_with_parenthesis = True



            counter_alphabetical_chars_word = 0
            counter_alphanumerical_chars_word = 0
            counter_numbers_word = 0


            counter_words += 1

            word_list = list(word)
            for char in word_list:
                if Random.is_special_character(char):
                    counter_special_chars += 1
                elif Random.is_alphanumerical_character(char):
                    counter_alphanumerical_chars += 1
                    counter_alphanumerical_chars_word += 1
                if char.isdigit():
                    counter_numbers += 1
                    counter_numbers_word += 1

            counter_alphabetical_word = counter_alphanumerical_chars_word - counter_numbers_word
            ratio_alphabetical_word = np.round(counter_alphabetical_word/len(word), 2)
            counters_alphabetical_ratios.append(ratio_alphabetical_word)
            counters_wordlengths.append(len(word))
            counters_numbers.append(counter_numbers_word)
            character_index += len(uid_info)
            last_xstop = word_xstop


        # get number of spaces
        len_whole_unspace = len(whole_text.replace(" ", ""))
        counter_spaces = counter_chars - len_whole_unspace
        # set alphabetical counter
        counter_alphabetical = counter_alphanumerical_chars - counter_numbers


        if counter_chars == 0:
            self.cpr.printw("no chars shouldn't happen, no recognizion")
            return False

        special_chars_ratio = counter_special_chars / counter_chars
        alphanumerical_chars_ratio = counter_alphanumerical_chars / counter_chars
        alphabetical_ratio = counter_alphabetical / counter_chars
        spaces_ratio = counter_spaces / counter_chars
        numbers_ratio = counter_numbers / counter_chars


        maximum_x_gap = None
        mean_x_gap = None
        median_x_gap = None

        if len(x_gaps) >= 1:
            maximum_x_gap = max(x_gaps)
            mean_x_gap = np.mean(x_gaps)
            median_x_gap = np.median(x_gaps)

        many_numbers_in_first_word = False
        many_alphabetical_in_middle_words = False
        many_alphabetical_in_last_word = False

        # check some middle and last word conditions
        for counter_index, counter in enumerate(counters_wordlengths):
            if counter_index == 0:
                ctr_numbers = counters_numbers[counter_index]
                numbers_ratio_word = np.round(ctr_numbers/counter,2)
                if numbers_ratio_word > 0.8:
                    many_numbers_in_first_word = True
            elif counter_index == len(counters_wordlengths)-1:
                if counter >= 4:
                    alphabetical_ratio_word = counters_alphabetical_ratios[counter_index]
                    if alphabetical_ratio_word >= 0.75:
                        many_alphabetical_in_last_word = True

            else:
                if counter >= 4:
                    alphabetical_ratio_word = counters_alphabetical_ratios[counter_index]
                    if alphabetical_ratio_word >= 0.75:
                        many_alphabetical_in_middle_words = True



        self.cpr.print("alle cntr:", counter_chars)
        self.cpr.print("spec cntr:", counter_special_chars, "ratio", special_chars_ratio)
        self.cpr.print("alnr cntr:", counter_alphanumerical_chars, "ratio", alphanumerical_chars_ratio)
        self.cpr.print("albt cntr:", counter_alphabetical, "ratio", alphabetical_ratio)
        self.cpr.print("spce cntr:", counter_spaces, "ratio", spaces_ratio)
        self.cpr.print("nmbr cntr:", counter_numbers, "ratio", numbers_ratio)
        self.cpr.print("x_box_sizes", x_box_sizes)
        self.cpr.print("x_gaps", x_gaps)
        self.cpr.print("x_gap_max_size", maximum_x_gap)
        self.cpr.print("x_gaps_mean", mean_x_gap)
        self.cpr.print("x_gaps_median", median_x_gap)

        if "Gewinn nach Vortrag" in whole_text:
            print("")


        if ((alphabetical_ratio < 0.75 and \
            numbers_ratio > 0.2 and \
            counter_chars > 5 and \
            counter_words >= 2) and not \
            (starts_with_parenthesis and ends_with_parenthesis)) or ultimo_is_first_word:

            if first_word_no_table_indicator:
                return False

            if mean_x_gap is not None and mean_x_gap <= 115:
                return False
            if many_alphabetical_in_last_word:
                return False
            if many_alphabetical_in_middle_words and many_numbers_in_first_word:
                return False


            self.cpr.print("possible entry:", whole_text)

            if self.PRINT_TO_CHECKFILE:
                with open("checkfile_tables.txt", "a") as myfile:
                    myfile.write(whole_text+ "||| max x_gap: " + str(maximum_x_gap)+"||| mean x_gap: " + str(mean_x_gap) \
                             + "||| median x_gap: " + str(median_x_gap)+"\n")

            print("jab")
            return True

        return False
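
# Illustrative sketch (not part of the original listing): how the ratio
# heuristic in recognize_a_line classifies a typical table line. The input
# line is assumed; the thresholds are the ones hard-coded above.
#
#   whole_text = "Grundstücke 1.200 1.150"   # 23 chars, 3 words
#   counter_numbers = 8                      # numbers_ratio ~ 0.35 > 0.2
#   counter_alphabetical = 11                # alphabetical_ratio ~ 0.48 < 0.75
#   # -> table candidate, unless a veto fires: first word in
#   #    filter_start_words, mean_x_gap <= 115, or mostly alphabetical
#   #    middle/last words combined with a numeric first word.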
Ejemplo n.º 22
0
class OCRVoter(object):
    def __init__(self):
        config_handler = ConfigurationHandler(first_init=False)
        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_MSA_HANDLER,
                                    self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL)
        self.cpr_vocab_check = ConditionalPrint(
            self.config.PRINT_VOCABULARY_CHECKER,
            self.config.PRINT_EXCEPTION_LEVEL, self.config.PRINT_WARNING_LEVEL)
        self.cpr_sc_predict = ConditionalPrint(
            self.config.PRINT_SPECIALCHAR_PREDICTOR,
            self.config.PRINT_EXCEPTION_LEVEL, self.config.PRINT_WARNING_LEVEL)

        self.filo_last_chars = Filo(250)
        self.predictor = None
        self.use_aufsichtsrat_prediction = False
        self.vocab_checker = None
        self.previous_word_with_seperator = False

    def add_predictor(self, predictor):
        self.predictor = predictor

    def add_vocab_checker(self, vocab_checker):
        self.vocab_checker = vocab_checker

    def get_same_count(self, c1, c2, c3):
        same_ctr = 0
        if c1 == c2:
            same_ctr += 1

        if c1 == c3:
            same_ctr += 1

        return same_ctr
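
    # Illustrative sketch (call values assumed): the count is taken from
    # c1's perspective only, so the result is 0, 1 or 2.
    #
    #   self.get_same_count('a', 'a', 'a')  # -> 2 (both others agree)
    #   self.get_same_count('a', 'a', 'b')  # -> 1
    #   self.get_same_count('a', 'b', 'c')  # -> 0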

    def get_confidence_count(self,
                             char1,
                             char2,
                             char3,
                             cconf1,
                             cconf2,
                             cconf3,
                             wildcard_char='¦'):
        def get_other_char(char_first, char_sec, char_thrd, co1, co2, co3):
            if char_first != char_sec:
                return char_sec, float(co2)
            elif char_first != char_thrd:
                return char_thrd, float(co3)

        same_ctr = 0
        cconf_ctr = float(cconf1)

        if char1 == char2:
            same_ctr += 1
            cconf_ctr += float(cconf2)
        if char1 == char3:
            same_ctr += 1
            cconf_ctr += float(cconf3)

        # special cases space: ' ', ' ', 'x'
        # wildcard character : '¦', '¦', '¦'

        if char1 == ' ' and same_ctr == 1:
            # if the confidence of the other character is below SPACE_TRESH,
            # the space gets the high put-in confidence value
            return 1, 95.0  # todo j4t: this early return disables the threshold logic below

            SPACE_TRESH = 50.0
            SPACE_PUT_IN_VALUE = 99.0
            otherchar, otherconf = get_other_char(char1, char2, char3, cconf1,
                                                  cconf2, cconf3)
            #print("otherchar", otherchar, "otherconf", otherconf)
            if otherconf < SPACE_TRESH:
                return 1, SPACE_PUT_IN_VALUE

        elif char1 == wildcard_char and same_ctr == 1:  # todo: differentiate type of character?
            # if there are two wildcards and one character, the character's
            # confidence has to be higher than wildcard_tresh to be taken

            wildcard_tresh = 98.5
            if self.config.MSA_BEST_CHANGE_VOTING_TRESHS_ON_EMPTY_LINE:
                wildcard_tresh -= 10  # 0:99,19%, 20:99.16%, 10:99.27%

            return 1, wildcard_tresh

        elif char1 == wildcard_char and same_ctr == 0:
            pass  # todo: maybe cover this case (the wildcard has no confidence, i.e. if the two other chars are very improbable, take the wildcard)
        elif char1 == '' and same_ctr == 0:
            pass  # todo: maybe cover this case (the empty char has no confidence either)
        elif self.config.MSA_BEST_VOTING_DOWNSCALE_ONLY_SC \
            and Random.is_special_character(char1) and same_ctr == 0 \
            and char2 == wildcard_char and char3 == wildcard_char:
            # lower the confidence of special characters which stand without any other chars
            return same_ctr, cconf_ctr * 0.9

        return same_ctr, cconf_ctr
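
    # Illustrative sketch (values assumed): confidences of characters that
    # agree with char1 are accumulated onto char1's confidence.
    #
    #   self.get_confidence_count('e', 'e', 'c', 90.0, 80.0, 70.0)
    #   # -> (1, 170.0): only char2 agrees, so cconf_ctr = 90.0 + 80.0
    #
    # The space/wildcard branches above can override this with a fixed
    # put-in confidence (e.g. 95.0 for a lone space).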

    def vote_best_of_three_simple(self,
                                  text_1,
                                  text_2,
                                  text_3,
                                  index_best,
                                  wildcard_character='¦'):
        list_line_1 = list(text_1)
        list_line_2 = list(text_2)
        list_line_3 = list(text_3)

        accumulated_chars = ""
        accumulated_confs = Filo(300)  # instantiated (size as in the charconf variant); unused in this simple variant
        for character_index, character_1 in enumerate(list_line_1):
            character_2 = list_line_2[character_index]
            character_3 = list_line_3[character_index]

            clist = [character_1, character_2, character_3]
            # get the character which occurs the most
            sc1 = self.get_same_count(character_1, character_2, character_3)
            sc2 = self.get_same_count(character_2, character_1, character_3)
            sc3 = self.get_same_count(character_3, character_2, character_1)
            maxindices = np.argmax([sc2, sc1, sc3])  # order prioritizes line 2, then 1, then 3 on ties
            if maxindices == 0:
                accumulated_chars += character_2
            elif maxindices == 1:
                accumulated_chars += character_1
            else:
                accumulated_chars += character_3

        accumulated_chars_stripped = accumulated_chars.replace(
            wildcard_character, '')

        return accumulated_chars, accumulated_chars_stripped
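
    # Illustrative sketch (inputs assumed; the three texts must have equal
    # length, since lists 2 and 3 are indexed with list 1's indices):
    #
    #   acc, stripped = self.vote_best_of_three_simple(
    #       "B¦teiligung", "Beteiligung", "Bete¦ligung", index_best=0)
    #   # per index the majority character wins -> acc == "Beteiligung"
    #   # stripped additionally removes any surviving '¦' wildcards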

    def vote_best_of_three_charconfs(self,
                                     line_1,
                                     line_2,
                                     line_3,
                                     index_best,
                                     wildcard_character='¦'):
        try:

            def try_obtain_charconf(value, undef_value=0):
                if value is None or value is False or value is True:
                    return undef_value
                return value

            def try_obtain_char(charlist, index):
                if index >= len(charlist):
                    return False  #j4t means not defined
                else:
                    return charlist[index]

            key_confs_mapping = 'UID'
            key_confs = 'x_confs'
            key_char = 'calc_char'
            self.cpr.print("vote_text1", line_1.textstr)
            self.cpr.print("vote_text2", line_2.textstr)
            self.cpr.print("vote_text3", line_3.textstr)
            #if "¦¦lt.H" in line_1.textstr:
            #    self.cpr.print("asd")

            maximum_char_number = max(len(line_1.textstr), len(line_2.textstr),
                                      len(line_3.textstr))

            accumulated_chars = ""

            for character_index in range(
                    0, maximum_char_number
            ):  # check: is list 1 always best reference?

                character_1 = line_1.value(key_char, character_index)
                character_2 = line_2.value(key_char, character_index)
                character_3 = line_3.value(key_char, character_index)

                charconf_1 = try_obtain_charconf(
                    line_1.value(key_confs, character_index, wsval=50.0))
                charconf_2 = try_obtain_charconf(
                    line_2.value(key_confs, character_index, wsval=50.0))
                charconf_3 = try_obtain_charconf(
                    line_3.value(key_confs, character_index, wsval=50.0))

                clist = [character_1, character_2, character_3]
                # get the character which occurs the most
                sc1, acc_conf_1 = self.get_confidence_count(
                    character_1, character_2, character_3, charconf_1,
                    charconf_2, charconf_3)
                sc2, acc_conf_2 = self.get_confidence_count(
                    character_2, character_1, character_3, charconf_2,
                    charconf_1, charconf_3)
                sc3, acc_conf_3 = self.get_confidence_count(
                    character_3, character_2, character_1, charconf_3,
                    charconf_2, charconf_1)
                maxindices = np.argmax([
                    acc_conf_2, acc_conf_1, acc_conf_3
                ])  # the order prioritizes line 2, then 1, then 3 when confidences tie
                # todo: move this special case to config
                # drop a trailing 'I' seen only by line 1 while both other lines report wildcards
                if character_index == maximum_char_number - 1 and character_2 == "¦" and character_3 == "¦" and character_1 == "I":
                    continue

                if self.config.MSA_BEST_VOTER_DROP_CHARS_BELOW_TRESH:
                    tresh = self.config.MSA_BEST_VOTER_DROPPING_TRESH
                    maximum_conf = max(acc_conf_1, acc_conf_2, acc_conf_3)
                    if maximum_conf < tresh:
                        if [character_2, character_1, character_3
                            ][maxindices] != '¦':
                            continue

                if maxindices == 0:
                    accumulated_chars += character_2
                elif maxindices == 1:
                    accumulated_chars += character_1
                else:
                    accumulated_chars += character_3

            accumulated_chars_stripped = accumulated_chars.replace(
                wildcard_character, '')

            return accumulated_chars, accumulated_chars_stripped
        except Exception as ex:
            tr = inspect.trace()

            self.cpr.printex("ocr_voter.py Exception during confidence vote:",
                             ex)
            self.cpr.printex("trace is:", tr)

    def increase_umlaut_confidence(self, chars, charconfs):

        charconfs_adapted = []

        for char_index, char in enumerate(chars):
            if char in SpecialChars.umlauts_caps or char in SpecialChars.umlauts:
                cconf_to_add = charconfs[
                    char_index] + SpecialChars.umlaut_increment
            elif char in SpecialChars.special_chars:
                cconf_to_add = charconfs[
                    char_index] + SpecialChars.special_char_increment
            else:
                cconf_to_add = charconfs[char_index]

            charconfs_adapted.append(cconf_to_add)

        return charconfs_adapted
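
    # Illustrative sketch (increment values live in SpecialChars and are
    # not shown in this listing; membership of 'ä' and '%' is assumed):
    #
    #   self.increase_umlaut_confidence(['ä', 'b', '%'], [50, 50, 50])
    #   # -> [50 + SpecialChars.umlaut_increment,
    #   #     50,
    #   #     50 + SpecialChars.special_char_increment]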

    def vote_best_of_three_charconfs_searchspaces(self,
                                                  line_1,
                                                  line_2,
                                                  line_3,
                                                  index_best,
                                                  wildcard_character='¦'):
        try:

            key_confs_mapping = 'UID'
            key_confs = 'x_confs'
            key_char = 'calc_char'
            self.cpr.print("vote_text1", line_1.textstr)
            self.cpr.print("vote_text2", line_2.textstr)
            self.cpr.print("vote_text3", line_3.textstr)
            #if "Beteiligung:" in line_1.textstr:
            #     self.cpr.print("asd")

            maximum_char_number = max(len(line_1.textstr), len(line_2.textstr),
                                      len(line_3.textstr))

            accumulated_chars = ""
            accumulated_confs = Filo(300)

            # search space settings
            SEARCH_SPACE_Y_SIZE = 3
            SEARCH_SPACE_X_SIZE_OUTER = 7
            SEARCH_SPACE_X_SIZE_INNER = 3
            SEARCH_SPACE_X_SEARCH_RANGE = 1
            SEARCH_SPACE_PROCESSING_SUBSTITUTION_CHAR = '¦'
            SEARCH_SPACE_PROCESSING_USE_SIMILAR_CHARS = True
            SEARCH_RANGE = 1
            PRINT_MATRICES = self.config.PRINT_SEARCH_SPACE_MATRICES

            # initialize search space processor and search spaces
            search_space_processor = SearchSpaceProcessor(SEARCH_SPACE_Y_SIZE, SEARCH_SPACE_X_SIZE_INNER, \
                                                          wildcard_character, SEARCH_SPACE_PROCESSING_SUBSTITUTION_CHAR)

            ssp_chars = SearchSpace(SEARCH_SPACE_Y_SIZE,
                                    SEARCH_SPACE_X_SIZE_OUTER,
                                    SEARCH_SPACE_X_SEARCH_RANGE, True)
            ssp_confs = SearchSpace(SEARCH_SPACE_Y_SIZE,
                                    SEARCH_SPACE_X_SIZE_OUTER,
                                    SEARCH_SPACE_X_SEARCH_RANGE, True)

            # check if one of the lines is empty for certain settings
            one_line_empty = False
            if self.config.MSA_BEST_VOTER_PUSH_LESS_LINES_WHITESPACE_CONFS or \
                self.config.MSA_BEST_CHANGE_VOTING_TRESHS_ON_EMPTY_LINE:
                one_line_empty = self.check_if_one_line_empty(
                    [line_1, line_2, line_3], wildcard_character)

            # loop through the maximum character range of the lines
            range_extension = SEARCH_SPACE_X_SIZE_INNER
            for character_index in range(
                    0, maximum_char_number + range_extension +
                    2):  # check: is list 1 always best reference?

                if character_index < maximum_char_number:
                    # if there is a character within range (no padding char from extension),
                    # get the character values and obtain the corresponding confidences (taken
                    # from the search space, because they might differ from the normal values due to swapping)
                    line_vals = [line_1.value(key_char, character_index), line_2.value(key_char, character_index), \
                                 line_3.value(key_char, character_index)]

                    line_1_conf = line_1.value(key_confs,
                                               character_index,
                                               wsval=50.0)
                    line_2_conf = line_2.value(key_confs,
                                               character_index,
                                               wsval=50.0)
                    line_3_conf = line_3.value(key_confs,
                                               character_index,
                                               wsval=50.0)

                    charconf_1 = self.try_obtain_charconf_searchspace(
                        line_1_conf,
                        line_vals[0],
                        engine_key=line_1.name[0],
                        one_line_empty=one_line_empty)
                    charconf_2 = self.try_obtain_charconf_searchspace(
                        line_2_conf,
                        line_vals[1],
                        engine_key=line_2.name[0],
                        one_line_empty=one_line_empty)
                    charconf_3 = self.try_obtain_charconf_searchspace(
                        line_3_conf,
                        line_vals[2],
                        engine_key=line_3.name[0],
                        one_line_empty=one_line_empty)
                    charconf_vals = [charconf_1, charconf_2, charconf_3]
                else:
                    # if the character is within padding range just give none values for characters and confidences
                    line_vals = [None, None, None]
                    charconf_vals = [None, None, None]

                # fill searchspace with the chars and confidences
                ssp_chars.push_column(line_vals)
                ssp_confs.push_column(charconf_vals)

                # update the mid-window of the search space (this is the actual search space processing step)
                mid_chars = ssp_chars.get_middle_matrix(PRINT_MATRICES)
                mid_confs = ssp_confs.get_middle_matrix(PRINT_MATRICES)
                mid_chars_processed, mid_confs_processed, change_done = \
                    search_space_processor.process_search_space(mid_chars, mid_confs,SEARCH_SPACE_PROCESSING_USE_SIMILAR_CHARS)
                if change_done is True:
                    ssp_chars.update_middle_matrix(mid_chars_processed)
                    ssp_confs.update_middle_matrix(mid_confs_processed)

                # extract changed values from search space
                character_offset = -(SEARCH_SPACE_X_SEARCH_RANGE + 1)
                character_1 = ssp_chars.get_value_around_middle(
                    0, character_offset)
                character_2 = ssp_chars.get_value_around_middle(
                    1, character_offset)
                character_3 = ssp_chars.get_value_around_middle(
                    2, character_offset)
                charconf_1 = ssp_confs.get_value_around_middle(
                    0, character_offset)
                charconf_2 = ssp_confs.get_value_around_middle(
                    1, character_offset)
                charconf_3 = ssp_confs.get_value_around_middle(
                    2, character_offset)
                if character_1 is None or character_2 is None or character_3 is None:
                    # self.cpr.print("test")
                    continue

                # if the umlaut confidence increment is active, adapt the charconfs; otherwise they stay unchanged
                charconf_1, charconf_2, charconf_3 = self.increase_umlaut_confidence_searchspace(
                    character_1, character_2, character_3, charconf_1,
                    charconf_2, charconf_3)

                # get the previous characters from other lines as string (mainly for predictor)
                filo_content = self.filo_last_chars.get_content_as_string()

                # trigger predicted section for aufsichtsrat predictor
                self.toggle_predictor(filo_content)

                # predict_char if predictor is enabled
                predicted_char = self.predict_char(filo_content)

                # get the character which occurs the most by accumulating confidence scores
                sc1, acc_conf_1 = self.get_confidence_count(
                    character_1, character_2, character_3, charconf_1,
                    charconf_2, charconf_3)
                sc2, acc_conf_2 = self.get_confidence_count(
                    character_2, character_1, character_3, charconf_2,
                    charconf_1, charconf_3)
                sc3, acc_conf_3 = self.get_confidence_count(
                    character_3, character_2, character_1, charconf_3,
                    charconf_2, charconf_1)
                maxindices = np.argmax([
                    acc_conf_2, acc_conf_1, acc_conf_3
                ])  # the order prioritizes line 2, then 1, then 3 when confidences tie

                if character_index == maximum_char_number + range_extension + 1 and character_2 == "¦" and character_3 == "¦" and character_1 == "I":
                    continue

                # drop chars completely if they fall below a certain dropping threshold and the setting is active
                if self.config.MSA_BEST_VOTER_DROP_CHARS_BELOW_TRESH:
                    tresh = self.config.MSA_BEST_VOTER_DROPPING_TRESH
                    maximum_conf = max(acc_conf_1, acc_conf_2, acc_conf_3)
                    if maximum_conf < tresh:
                        if [character_2, character_1, character_3
                            ][maxindices] != '¦':
                            continue

                # determine character with the best accumulated confidence
                voted_char = None
                voted_acc_conf = None
                if maxindices == 0:
                    voted_char = character_2
                    voted_acc_conf = acc_conf_2
                elif maxindices == 1:
                    voted_char = character_1
                    voted_acc_conf = acc_conf_1
                else:
                    voted_char = character_3
                    voted_acc_conf = acc_conf_3

                # if the predictor is active, check whether a better predicted char can replace the voted character
                voted_char = self.maybe_replace_voted_by_predicted_char(
                    voted_char, self.use_aufsichtsrat_prediction,
                    predicted_char, wildcard_character, voted_acc_conf,
                    character_1, character_2, character_3)
                # push the voted char and the accumulated confidence of this char to results
                accumulated_confs.push(voted_acc_conf)
                accumulated_chars += voted_char

                # if the predictor is enabled fill the filo with the voted_char
                self.fill_filo_last_chars(voted_char)

            # do vocabulary related steps, if activated
            accumulated_chars = self.vocabulary_related_corrections(
                accumulated_chars, wildcard_character, accumulated_confs)

            # remove the wildcard characters and return the result
            accumulated_chars_stripped = accumulated_chars.replace(
                wildcard_character, '')
            return accumulated_chars, accumulated_chars_stripped

        except Exception as ex:
            tr = inspect.trace()

            self.cpr.printex("ocr_voter.py Exception during confidence vote",
                             ex)
            self.cpr.printex("trace", tr)

    def vocabulary_related_corrections(self, accumulated_chars,
                                       wildcard_character, accumulated_confs):

        if self.config.KEYING_RESULT_VOCABULARY_CORRECTION_VOTE:
            accumulated_chars_final = ""
            acc_split = accumulated_chars.split()
            len_split = len(acc_split)

            for word_index, word in enumerate(acc_split):

                if self.config.KEYING_RESULT_VC_IGNORE_SEPERATE_WRITING_CORRECTION:
                    if word_index == len_split - 1 and word.replace(
                            wildcard_character, "").endswith('-'):
                        self.previous_word_with_seperator = True
                        accumulated_chars_final += word + " "
                        continue
                    if word_index == 0:
                        if self.previous_word_with_seperator is True:
                            self.previous_word_with_seperator = False
                            accumulated_chars_final += word + " "
                            continue

                acc_confs_word = accumulated_confs.pop_multi(len(word))
                acc_conf, rate, change, word_starting_borders, word_trailing_borders, word_reduced = \
                    self.vocab_checker.get_accumulated_confidence_rate(word, acc_confs_word, wildcard_character)
                self.cpr_vocab_check.print("w:", word, "wr:", word_reduced,
                                           "accr:", acc_conf, "rate", rate)

                # don't correct words below min vocab length (mind that special chars in dict are toggled)
                check_len = len(word)
                if self.config.KEYING_RESULT_VC_DICT_REMOVE_SPECIAL_BORDER_CHARS:
                    check_len = len(word_reduced)
                if check_len < self.config.KEYING_RESULT_VC_MIN_VOCAB_WORD_LENGTH:
                    accumulated_chars_final += word + " "
                    continue

                if self.config.KEYING_RESULT_VC_CORRECT_ONLY_ERRONOUS_CHARS:
                    swappable_char_indices = []

                    acc_confs_used = None
                    word_used = None

                    if self.config.KEYING_RESULT_VC_CORRECT_ERRONOUS_SPECIAL_CHARS:
                        # use the full length confidences array including trailing and leading special characters
                        acc_confs_used = acc_confs_word
                        word_used = word
                    else:
                        # don't use trailing and starting special characters if no special chars needed
                        acc_confs_used = acc_confs_word[
                            len(word_starting_borders):(
                                len(acc_confs_word) -
                                len(word_trailing_borders))]
                        word_used = word_reduced

                    for conf_index, conf in enumerate(acc_confs_used):
                        if self.config.KEYING_RESULT_VC_CORRECT_ERRONOUS_SPECIAL_CHARS:
                            if conf <= 250:
                                character_related = word_used[conf_index]
                                is_special_char = Random.is_special_character(
                                    character_related)
                                if is_special_char and character_related != wildcard_character:
                                    # only swap special character indices
                                    swappable_char_indices.append(conf_index)
                        else:
                            if conf <= 215:
                                swappable_char_indices.append(conf_index)

                    if len(swappable_char_indices) >= 1:
                        word_reduced_correct = self.vocab_checker.correct_text_at_certain_indices_only(
                            word_used, swappable_char_indices)
                        if word_reduced_correct is not None:
                            word_correct_withtrails = None

                            if self.config.KEYING_RESULT_VC_CORRECT_ERRONOUS_SPECIAL_CHARS:
                                if Random.has_special_character(
                                        word_reduced_correct):
                                    # if special character was replaced with special character
                                    word_correct_withtrails = word_reduced_correct
                                else:
                                    # if special character was replaced by alphanumerical character
                                    word_correct_withtrails = word
                            else:
                                word_correct_withtrails = word_starting_borders + word_reduced_correct + word_trailing_borders

                            # only print the changed results
                            if word != word_correct_withtrails:
                                self.cpr_vocab_check.print(
                                    "w:", word, "wc:", word_correct_withtrails,
                                    "accr:", acc_conf, "rate", rate)

                            accumulated_chars_final += word_correct_withtrails + " "
                        else:
                            accumulated_chars_final += word + " "
                    else:
                        accumulated_chars_final += word + " "

                    continue

                if rate < self.config.KEYING_RESULT_VOCABULARY_CORRECTION_VOTE_TRESH \
                        and len(word_reduced) > 2:
                    # if the rate drops below tresh, try to fetch vocab entry
                    word_reduced_correct, suggestions, flh = self.vocab_checker.correct_text(
                        word_reduced)
                    if word_reduced_correct is not None and word_reduced_correct != word_reduced:

                        word_correct_withtrails = word_starting_borders + word_reduced_correct + word_trailing_borders

                        self.cpr_vocab_check.print("w:", word, "wc:",
                                                   word_correct_withtrails,
                                                   "accr:", acc_conf, "rate",
                                                   rate)

                        accumulated_chars_final += word_correct_withtrails + " "
                    else:
                        accumulated_chars_final += word + " "
                else:
                    accumulated_chars_final += word + " "

            accumulated_chars = accumulated_chars_final

        return accumulated_chars

    def try_obtain_charconf_searchspace(
        self,
        value_confidence,
        value,
        undef_value=0,
        engine_key=None,
        one_line_empty=False,
    ):
        if value_confidence is None or value_confidence is False or value_confidence is True:
            return undef_value

        returnvalue = value_confidence

        if self.config.MSA_BEST_VOTER_SCALE_ENGINE_CONFIDENCES and engine_key is not None:
            if engine_key == 'Abbyy':
                if self.config.MSA_BEST_INCREASE_CONFIDENCE_OF_SOME_ABBYY_CHARS:
                    if value == "%":  # improve ocropus in confidence of % because it was trained
                        value_confidence = value_confidence + 80

                returnvalue = ConfidenceModifications.abby_factor * value_confidence
            elif engine_key == 'Tess':
                returnvalue = ConfidenceModifications.tesseract_factor * value_confidence

            elif engine_key == 'Ocro':

                returnvalue = ConfidenceModifications.ocropus_factor * value_confidence

        if (self.config.MSA_BEST_VOTER_PUSH_LESS_LINES_WHITESPACE_CONFS and one_line_empty and value == " ") \
            or (self.config.MSA_BEST_VOTER_PUSH_WHITESPACE_IF_MOSTLY_WILDCARD and one_line_empty \
                and value == " "):

            returnvalue += ConfidenceModifications.whitespace_push

        return returnvalue
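
    # Illustrative sketch (the scaling factors live in
    # ConfidenceModifications and are not shown in this listing; the
    # relevant config flags are assumed enabled): an Abbyy '%' with
    # confidence 15 becomes
    #
    #   (15 + 80) * ConfidenceModifications.abby_factor
    #
    # and a ' ' additionally gets ConfidenceModifications.whitespace_push
    # when one of the three lines was detected as empty.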

    def check_if_one_line_empty(self, lines, wildcard_character):
        for line in lines:
            text_wo_wildcards = line.textstr.replace(wildcard_character, '')
            if text_wo_wildcards == "":
                return True
            if self.config.MSA_BEST_VOTER_PUSH_WHITESPACE_IF_MOSTLY_WILDCARD:
                # also count a high wildcard ratio as an empty line
                wildcard_ratio = 1 - (len(text_wo_wildcards) /
                                      len(line.textstr))
                if wildcard_ratio > 0.70:
                    return True
        return False
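
    # Illustrative sketch (line object assumed to expose textstr; the ratio
    # setting assumed active): a line that is mostly wildcards counts as empty.
    #
    #   textstr = "ab¦¦¦¦¦¦¦¦"             # 10 chars, 8 wildcards
    #   wildcard_ratio = 1 - (2 / 10)      # = 0.8 > 0.70 -> True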

    def toggle_predictor(self, filo_content):
        if self.config.PREDICTOR_AUFSICHTSRAT_ENABLED:
            if "Aufsichtsrat" in filo_content:
                self.use_aufsichtsrat_prediction = True
            if "Gründung:" in filo_content:
                self.use_aufsichtsrat_prediction = False

    def predict_char(self, filo_content):
        predicted_char = None
        if self.use_aufsichtsrat_prediction:
            if len(filo_content) >= 19:  # if filo_content is bigger than one prediction chunk
                len_aufsichtsrat = 19
                predicted_char = self.predictor.predict_next_aufsichtsrat_chars(
                    len_aufsichtsrat, filo_content)
                # print("filo", filo_content,"predict:", predicted_char)
                # print("dd")
        return predicted_char

    def fill_filo_last_chars(self, voted_char):
        """
        fill the FILO for predictor usage with voted_char and some additional chars around it
        :param voted_char:
        :return:
        """

        if self.config.PREDICTOR_AUFSICHTSRAT_ENABLED:
            # create pre semi-tokenized input strings in the filos from the voted characters for prediction
            if voted_char == ' ':
                # the models usually use the 'ƿ' char as a substitute for spaces
                self.filo_last_chars.push(' ', filterchar='¦')
                self.filo_last_chars.push('ƿ', filterchar='¦')
                self.filo_last_chars.push(' ', filterchar='¦')
            elif Random.is_special_character(voted_char):
                self.filo_last_chars.push(' ', filterchar='¦')
                self.filo_last_chars.push(voted_char, filterchar='¦')
                self.filo_last_chars.push(' ', filterchar='¦')

            else:
                self.filo_last_chars.push(voted_char, filterchar='¦')
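
    # Illustrative sketch (predictor config assumed enabled): voting the
    # characters 'a', 'b', ':', ' ' in sequence pushes the following onto
    # the FILO (wildcards filtered):
    #
    #   ['a', 'b', ' ', ':', ' ', ' ', 'ƿ', ' ']
    #
    # i.e. special characters are padded with spaces and a space is
    # additionally encoded with the models' space substitute 'ƿ'.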

    def increase_umlaut_confidence_searchspace(self, character_1, character_2,
                                               character_3, charconf_1,
                                               charconf_2, charconf_3):

        if self.config.MSA_BEST_SEARCHSPACE_INCREASE_UMLAUT_CONFIDENCE:
            clist = [character_1, character_2, character_3]
            conflist = [charconf_1, charconf_2, charconf_3]
            conflist_new = self.increase_umlaut_confidence(clist, conflist)
            charconf_1 = conflist_new[0]
            charconf_2 = conflist_new[1]
            charconf_3 = conflist_new[2]
            return charconf_1, charconf_2, charconf_3
        return charconf_1, charconf_2, charconf_3

    def maybe_replace_voted_by_predicted_char(self, voted_char, aufsichtsrat_prediction_toggled, predicted_char, \
                                              wildcard_character, voted_acc_conf, character_1, character_2, character_3):
        if aufsichtsrat_prediction_toggled:
            if Random.is_special_character(predicted_char):
                one_char_sc = Random.is_special_character(character_1) \
                              or Random.is_special_character(character_2) or Random.is_special_character(
                    character_3)
                voted_char_sc = Random.is_special_character(voted_char)

                if predicted_char != voted_char and (
                        one_char_sc
                        or voted_char_sc) and voted_char != wildcard_character:
                    # print("FiloContent:", filo_content)
                    self.cpr_sc_predict.print("pc:", predicted_char, "vc:",
                                              voted_char, "vc_acc",
                                              voted_acc_conf)
                    if voted_acc_conf <= 90.0:
                        if voted_char != '\f':  # don't swap formfeeds, they don't get predicted at all
                            self.cpr_sc_predict.print("swap")
                            voted_char = predicted_char

        return voted_char
class AdditionalInfoHandler(object):
    def __init__(self):
        config_handler = ConfigurationHandler(first_init=False)

        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_ADDITIONAL_INFO_HANDLER,
                                    self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL,
                                    leading_tag=self.__class__.__name__)
        self.cpr.print("init additional info handler")

    def write_excel_to_json(self,
                            fileinfo,
                            filepath,
                            filetype,
                            idxcol=None,
                            parse_cols=None,
                            page=0):
        """"
        At the moment a little helper script for the Aktienführer-Project.
        Be free to modify as you wish.
        """
        #if isinstance(parse_cols, list): parse_cols = [parse_cols],
        additional_filepath = path.normpath(
            f"{filepath}/**/*{fileinfo.dbname}.{filetype}")
        file = glob.glob(additional_filepath, recursive=True)
        if len(file) != 1: return None
        if filetype in ["xlsx", "xls"]:
            df = pd.read_excel(file[0]).set_index("ProfileID")
            jsondata = {fileinfo.dbname: {"Year": fileinfo.dbname}}
            jsondf = df.to_dict(orient="index")
            jsondata.update(jsondf)
            with open(file[0].replace("xlsx", "json"), "w") as output:
                json.dump(jsondata, output, indent=4)
        return None
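
    # Illustrative usage sketch (fileinfo is assumed to provide a dbname
    # attribute; the path is hypothetical):
    #
    #   handler = AdditionalInfoHandler()
    #   handler.write_excel_to_json(fileinfo, "/data/addinfo", "xlsx")
    #   # globs /data/addinfo/**/*<dbname>.xlsx, indexes the sheet on
    #   # "ProfileID" and dumps it as .json next to the source file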

    def fetch_additional_information_simple(self, file):
        """
        Same as fetch additional information, but config related info is already included in given
        parameters
        :return: additional info
        """
        if self.config.ADDITIONAL_INFORMATION:
            additional_info = self.fetch_additional_information(
                file,
                self.config.INPUT_ADDINFOPATH,
                idxcol=self.config.IDXCOL,
                parse_cols=self.config.PARSE_COLS,
                filetype=self.config.INPUT_ADDINFOFILETPYE)
            return additional_info

        return None

    def fetch_additional_information(self,
                                     fileinfo,
                                     filepath,
                                     filetype,
                                     idxcol=None,
                                     parse_cols=None,
                                     page=0):
        """
        Reads an additional file with information
        It searches the file where the index_name matches tablename or dbname
        :param file:
        :param index_name:
        :return: additional info
        """
        #if isinstance(parse_cols, list): parse_cols = [parse_cols]
        additional_filepath = path.normpath(
            f"{filepath}/**/*{fileinfo.dbname}.{filetype}")
        file = glob.glob(additional_filepath, recursive=True)

        len_files = len(file)
        if len_files > 1:
            self.cpr.printex(
                "More than one additional information file was found!")
            return None
        if len_files == 0:
            self.cpr.printex("No additional information file was found!")
            return None

        file = file[0]
        current_db_and_table = {
            "db": fileinfo.dbname,
            "table": fileinfo.tablename
        }
        if filetype in ["xlsx", "xls"]:
            infos = {}
            info_df = pd.read_excel(file)  #.set_index("ProfileID")
            parse_cols.remove(idxcol)
            for db_and_table_id, current_db_and_tablename in current_db_and_table.items(
            ):
                infos[db_and_table_id] = {}
                for line, rubric_content in info_df.loc[
                        info_df[idxcol] ==
                        current_db_and_tablename][parse_cols].to_dict(
                            orient="index").items():
                    for rubric, content in rubric_content.items():
                        if rubric != idxcol:
                            if infos[db_and_table_id].get(rubric,
                                                          None) is None:
                                infos[db_and_table_id][rubric] = content
                            elif infos[db_and_table_id].get(rubric,
                                                            None) != content:
                                if not isinstance(
                                        infos[db_and_table_id][rubric], list):
                                    infos[db_and_table_id][rubric] = [
                                        infos[db_and_table_id][rubric]
                                    ]
                                infos[db_and_table_id][rubric].append(content)
        elif filetype == "json":
            with open(file, "r") as add_info_file:
                infos = json.load(add_info_file)

            for possible_db_or_tablenames in reversed(list(infos.keys())):
                possible_db_or_tablenames_orig = possible_db_or_tablenames  # unchanged name

                if self.config.ADD_INFO_SIMPLIFIED_NAME_COMPARISON:
                    psplit = possible_db_or_tablenames.split("-")
                    possible_db_or_tablenames = psplit[0]

                if possible_db_or_tablenames not in current_db_and_table[
                        'table']:
                    del infos[possible_db_or_tablenames_orig]
                else:
                    for db_and_table_id, current_db_and_tablename in current_db_and_table.items(
                    ):
                        if possible_db_or_tablenames == current_db_and_tablename:
                            infos[db_and_table_id] = infos[
                                possible_db_or_tablenames_orig]
                            del infos[possible_db_or_tablenames_orig]
        else:
            return None
        return infos
Ejemplo n.º 24
0
class SegmentClassifier(object):
    """
    This is the basic handler for classification
    which gets accessed from root-/outside classes.
    """

    def __init__(self):

        config_handler = ConfigurationHandler(first_init=False)

        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_SEGMENT_CLASSIFIER, self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL, leading_tag=self.__class__.__name__)
        self.cpr.print("init segment classifier")

    def classify_file_segments(self, ocromore_data):
        lines = ocromore_data['lines']
        feats = ocromore_data['line_features']
        file_info = ocromore_data['file_info']
        all_file_segments = AllSegments(len(lines), self.cpr, self.config)

        prev_line = None
        prev_text = None
        for current_line_index, current_line in enumerate(lines):
            current_features = feats[current_line_index]
            current_text = current_line['text']
            current_index = current_line['line_index']
            # create a combined line object with optimized (removed) separation
            combined_line = None
            if prev_line is not None:
                combined_lines = dh.join_separated_lines([prev_text, current_text])
                combined_line = dh.join_joined_lines(combined_lines)
            else:
                combined_line = current_text
            # pass parameters to matching functions
            all_file_segments.match_my_segments(current_line, current_text, current_index, current_features, 
                                                prev_line, combined_line)
            prev_line = current_line
            prev_text = current_text

        if self.config.MATCH_UNTIL_NEXT_START_THEN_STOP_CONDITION:
            self.adapt_non_explicit_indices(all_file_segments)
        else:
            all_file_segments.correct_overlaps_index_field(only_start_tags=True)

        self.adapt_stop_index_in_last_segment(all_file_segments)


        # does the last steps in segment matching
        all_file_segments.finish_segment_matching(lines, feats, file_info)

        # do again after final step
        if self.config.MATCH_UNTIL_NEXT_START_THEN_STOP_CONDITION:
            self.adapt_non_explicit_indices(all_file_segments)
        else:
            all_file_segments.correct_overlaps_index_field(only_start_tags=True)

        self.adapt_stop_index_in_last_segment(all_file_segments)

        ocromore_data['segmentation'] = all_file_segments

        return ocromore_data


    def adapt_stop_index_in_last_segment(self, all_file_segments):
        """
        Sets the stop_index for the last recognized segment, which
        is a special case and is usually not filled beforehand, because
        there is no next start index
        :param all_file_segments: holder object for segment classes and other info
        :return: None
        """

        # search for last segment
        saved_start_index = -1
        saved_last_segment = None
        for segment in all_file_segments.my_classes:
            # only count segmented segments
            if segment.start_was_segmented is False:
                continue

            if segment.start_line_index >= saved_start_index:
                saved_start_index = segment.start_line_index
                saved_last_segment = segment

        if saved_last_segment is None:
            return

        # adapt the last stop index of last segment
        saved_last_segment.stop_line_index = all_file_segments.number_of_lines-1
        saved_last_segment.stop_was_segmented = True  # todo think about if this is necessary?

    def adapt_non_explicit_indices(self, all_file_segments):

        # update start and explicit stop tags first
        all_file_segments.correct_overlaps_index_field(only_start_tags=True)

        # fill undefined stop regions until next start region
        all_file_segments.fill_start_index_until_next_stop()
class EndobjectFactory(object):
    """
    Creates an object with the following structure and provides exporting methods:

    segment_tag_1: [                ---> this level is created by set_current_main_list
        {
            type: "Sitz"            ---> add this level entries with add_to_my_object object_number=0
            city: "Neustadt"
        },
        {
            type: "Sitz"            ---> add this level entries with add_to_my_object object_number=0
            city: "Neustadt"
        }

    ],
    segment_tag_2: [
        {
            ...
        }
        ...
    ]
    """
    def __init__(self):
        self.my_object = {}
        self.current_main_list = None
        self.pp = pprint.PrettyPrinter(indent=5)

        config_handler = ConfigurationHandler(first_init=False)

        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_OUTPUT_ANALYSIS, self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL, leading_tag=self.__class__.__name__)

        if self.config.REMOVE_TAGS_IN_ORIG_DIFF:
            self.known_uc = KnownUncategories()

    def set_current_main_list(self, segment_tag):
        if segment_tag not in self.my_object.keys():
            self.my_object[segment_tag] = []              # create the main list (all subsequent entries are stored here)

        self.current_main_list = self.my_object[segment_tag]  # create a short link on the main list

    def add_to_my_obj(self, key, value, object_number=0, only_filled=False):

        if only_filled is True and (value is None or value == "" or value == [] or value == {}):
            return False

        # fill the main list up to the requested object index
        len_list = len(self.current_main_list)
        if len_list < object_number+1:
            for _ in range(len_list, object_number+1):
                self.current_main_list.append({})

        self.cpr.print("Adding value to List,- ObjectNr.:", object_number,"Key:", key, "Value:", value)
        # add or insert to the main_list
        self.current_main_list[object_number][key] = value
        return True
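
    # Illustrative sketch reproducing the structure from the class
    # docstring (values assumed):
    #
    #   ef = EndobjectFactory()
    #   ef.set_current_main_list("segment_tag_1")
    #   ef.add_to_my_obj("type", "Sitz", object_number=0)
    #   ef.add_to_my_obj("city", "Neustadt", object_number=1)
    #   # -> my_object == {"segment_tag_1": [{"type": "Sitz"},
    #   #                                    {"city": "Neustadt"}]}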

    def print_me_and_return(self):
        print("my_object is:")
        self.pp.pprint(self.my_object)
        return self.my_object

    def print_current_main(self):
        print("current_main:")
        self.pp.pprint(self.current_main_list)

    def export_as_json(self):
        my_obj_json = json.dumps(self.my_object, indent=5, ensure_ascii=False)
        return my_obj_json

    def export_as_json_at_key(self, key, remove_first_object=False):

        if key not in self.my_object.keys():
            return None

        my_obj = self.my_object[key]
        if remove_first_object:
            if len(my_obj) >= 1:
                my_obj = my_obj[1:]  # remove the first object, which usually contains generic info

        my_obj_json = json.dumps(my_obj, indent=5, ensure_ascii=False)
        return my_obj_json

    @staticmethod
    def fetch_subentries_recursive_check(entry):
        """
        Fetches all subentries (values) from an entry and writes them to a list of texts.
        This gets called recursively within the function until all subentries
        are found
        :param entry: entry to fetch the subentries from
        :return: list of subentries
        """
        final_texts = []

        for item in entry:
            if isinstance(entry, list):
                value = item
            else:
                # item is a key
                value = entry[item]
            if isinstance(value, str):
                final_texts.append(value)
            elif isinstance(value, int):
                final_texts.append(str(value))
            elif isinstance(value, object):  # catch-all for nested containers (lists/dicts)
                obj_size = len(value)
                if obj_size > 0:
                    recursive_texts = EndobjectFactory.fetch_subentries_recursive_check(value)
                    final_texts.extend(recursive_texts)

        return final_texts
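
    # Illustrative sketch (entry assumed): all leaf strings and ints are
    # collected depth-first into one flat list of texts.
    #
    #   entry = {"type": "Sitz", "city": {"name": "Neustadt", "zip": 1234}}
    #   EndobjectFactory.fetch_subentries_recursive_check(entry)
    #   # -> ["Sitz", "Neustadt", "1234"]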

    @staticmethod
    def fetch_keys_recusive_check(entry, final_keys, create_multiple=True):
        """
        Fetches all keys in an object and its sub-objects.
        Calls itself recursively until all keys are found and
        writes the final keys to the final_keys list, which is returned
        :param entry: object to fetch the sub-keys from
        :param final_keys: list of final keys (initial state)
        :param create_multiple: if the same key occurs multiple times it still gets added
        :return: final_keys with added keys from object
        """

        if isinstance(entry, list):
            for item in entry:
                final_keys = EndobjectFactory.fetch_keys_recusive_check(item, final_keys, create_multiple)
            return final_keys
        elif not isinstance(entry, dict):
            # just return if there are no keys (because the entry is not a dictionary)
            return final_keys

        for key in entry:
            value = entry[key]
            if create_multiple or key not in final_keys:
                if isinstance(key, int):
                    continue
                final_keys.append(key)
            final_keys = EndobjectFactory.fetch_keys_recusive_check(value, final_keys)
        return final_keys

    def diff_seg_to_orig_at_key(self, key):
        """
        def fetch_subentries_recursive(entry):
            final_texts = []

            for item in entry:
                if isinstance(entry, list):
                    value = item
                else:
                    # item is a key
                    value = entry[item]
                if isinstance(value, str):
                    final_texts.append(value)
                elif isinstance(value, int):
                    final_texts.append(str(value))
                elif isinstance(value, object):
                    obj_size = len(value)
                    if obj_size > 0:
                        recursive_texts = fetch_subentries_recursive(value)
                        final_texts.extend(recursive_texts)

            return final_texts
        """
        if key not in self.my_object.keys():
            return None

        my_data = self.my_object[key]

        # check if the orig-post property can exist warn if not
        if not self.config.ADD_INFO_ENTRY_TO_OUTPUT:
            self.cpr.printw("trying to fetch original data, original data is not added to results")
            self.cpr.printw("toggle ADD_INFO_ENTRY_TO_OUTPUT in config to True")
        if len(my_data) <= 0:
            self.cpr.printw("no data to do returning")
            return

        # todo: the remainder of this method is disabled; it references
        # fetch_subentries_recursive, which only exists commented out in the
        # docstring above, and would raise a NameError if executed
        return
        # copy orig string
        original_text = my_data[0]['origpost']
        rest_text = original_text

        # fetch parsed entries for diff
        all_final_entries = []  # array of final entries
        for index in range(1, len(my_data)):
            entry = my_data[index]
            final_entries = fetch_subentries_recursive(entry)
            all_final_entries.extend(final_entries)

        # order diff data by length
        all_final_entries.sort(key=lambda x: len(x))
        all_final_entries.reverse()

        # subtract
        for text in all_final_entries:
            rest_text = rest_text.replace(text, "")

            rest_text = rest_text.strip()

        return rest_text, original_text

    def diff_parsed_to_orig_at_key(self, key):
        """
        def fetch_subentries_recursive(entry):
            final_texts = []

            for item in entry:
                if isinstance(entry, list):
                    value = item
                else:
                    # item is a key
                    value = entry[item]
                if isinstance(value, str):
                    final_texts.append(value)
                elif isinstance(value, int):
                    final_texts.append(str(value))
                elif isinstance(value, object):
                    obj_size = len(value)
                    if obj_size > 0:
                        recursive_texts = fetch_subentries_recursive(value)
                        final_texts.extend(recursive_texts)

            return final_texts

        def fetch_keys_recusive(entry, final_keys, create_multiple=True):
            # just return if there are no keys (cause no dictionary)
            if not isinstance(entry, dict):
                return final_keys

            for key in entry:
                value = entry[key]
                if create_multiple or key not in final_keys:
                    if isinstance(key, int):
                        continue
                    final_keys.append(key)
                final_keys = fetch_keys_recusive(value, final_keys)
            return final_keys
        """
        if key not in self.my_object.keys():
            return None

        #if key == "KursVonZuteilungsrechten":
        #   print("todo remove debug")

        my_data = self.my_object[key]

        # check if the orig-post property can exist warn if not
        if not self.config.ADD_INFO_ENTRY_TO_OUTPUT:
            self.cpr.printw("trying to fetch original data, original data is not added to results")
            self.cpr.printw("toggle ADD_INFO_ENTRY_TO_OUTPUT in config to True")
        if len(my_data) <= 0:
            self.cpr.printw("no data to do returning")
            return
        # copy orig string
        original_text = my_data[0]['origpost']
        rest_text = original_text

        # fetch parsed entries for diff
        pool_entries = []  # array of final entries
        for index in range(1, len(my_data)):
            entry = my_data[index]
            final_entries = EndobjectFactory.fetch_subentries_recursive_check(entry)
            pool_entries.extend(final_entries)

        if self.config.REMOVE_SPACES_IN_ORIGIN_DIFF:
            # remove all spaces from the rest text and the comparison values,
            # because spaces are a frequent cause of failed subtractions
            rest_text = rest_text.replace(" ", "")
            for index in range(0,len(pool_entries)):
                pool_entries[index] = pool_entries[index].replace(" ", "")

        all_final_entries = []

        # add the entries to the complete subtraction and tag them with '1'
        for pentry in pool_entries:
            all_final_entries.append((pentry, 1))

        # if keys shall be subtracted too, add them as well
        if self.config.REMOVE_TAGS_IN_ORIG_DIFF:
            pool_keys = []  # gets multiple of the same key for later 1 by 1 subtraction
            for index in range(1, len(my_data)):
                pool_keys = EndobjectFactory.fetch_keys_recusive_check(my_data[index], pool_keys, create_multiple=True)

            # also remove spaces in keys
            if self.config.REMOVE_SPACES_IN_ORIGIN_DIFF:
                for index in range(0, len(pool_keys)):
                    pool_keys[index] = pool_keys[index].replace(" ", "")

            final_keys = []
            for pkey in pool_keys:
                final_keys.append((pkey, 2))

            all_final_entries.extend(final_keys)

        # order diff data after length
        all_final_entries.sort(key=lambda x: len(x[0]))
        all_final_entries.reverse()

        # subtract
        for entry in all_final_entries:
            text = entry[0]
            text_or_key = entry[1]
            if text_or_key == 2:
                if text in self.known_uc.unkeys:
                    continue
            text_stripped = text.strip()  # remove spaces so texts better fit in
            rest_text = rest_text.replace(text_stripped, "", 1)
            rest_text = rest_text.strip()

        return rest_text, original_text
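
A minimal sketch of the subtraction technique used above, with hypothetical
data (diff_to_orig and its inputs are illustrative, not part of the source):
fragments are subtracted longest first, so short fragments cannot punch holes
into longer ones before those are removed.

def diff_to_orig(original_text, parsed_fragments):
    rest = original_text
    # longest fragments first, each removed at most once
    for fragment in sorted(parsed_fragments, key=len, reverse=True):
        rest = rest.replace(fragment.strip(), "", 1).strip()
    return rest

# diff_to_orig("Bank A, Berlin; Bank B", ["Bank A", "Bank B"])
# -> ", Berlin;" remains as the unparsed rest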
Example No. 26
0
class SegmentParser(object):
    """
    Parse the classified segments segment by segment; each segment
    tag is mapped to the parser function that handles it.
    """
    def __init__(self,
                 output_analyzer,
                 dictionary_handler,
                 ocromore_data=None):

        self.ef = EndobjectFactory()
        self.dictionary_handler = dictionary_handler

        # map which maps tags to parsing functions -> change the constructor for other projects
        fmap = FunctionMapAKF(self.ef, output_analyzer, dictionary_handler)

        config_handler = ConfigurationHandler(first_init=False)

        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_SEGMENT_PARSER,
                                    self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL,
                                    leading_tag=self.__class__.__name__)

        self.function_map = fmap.get_function_map()
        self.result_root = self.config.OUTPUT_ROOT_PATH + "/results/"

    def clear_result(self,
                     output_analyzer,
                     dictionary_handler,
                     ocromore_data=None):
        # create a new end object factory, new content
        self.ef = EndobjectFactory()
        # map to the new ef object which has been recreated
        fmap = FunctionMapAKF(self.ef, output_analyzer, dictionary_handler)
        self.function_map = fmap.get_function_map()

    def parse_segments(self, ocromore_data):
        self.ocromore_data = ocromore_data
        segmentation = ocromore_data['segmentation']
        segmentation_classes = segmentation.my_classes

        # add all text from original file if activated (i.e. for debugging purposes)
        if self.config.ADD_FULLTEXT_ENTRY:
            all_texts = self.get_all_text(ocromore_data)
            self.ef.set_current_main_list("overall_info")
            self.ef.add_to_my_obj("fulltexts", all_texts)
        # add additional info to result
        if self.config.ADDITIONAL_INFORMATION and self.config.ADD_ADDITIONAL_INFO:
            if not self.config.ADD_FULLTEXT_ENTRY:
                self.ef.set_current_main_list("Information")
            self.ef.add_to_my_obj("additionals",
                                  ocromore_data["additional_info"])
        # add a duplicate of the original text from which, in the analysis below, the parsed results get subtracted
        if self.config.LOG_SEGMENTED_TO_ORIG_DIFF_PER_FILE:
            if self.config.ADD_FULLTEXT_ENTRY:
                ocromore_data['analysis_to_orig'] = {}
                original_rest, complete_text = self.get_all_text(
                    ocromore_data, join_separated_lines=True)
                ocromore_data['analysis_to_orig'][
                    'original_rest'] = original_rest
                ocromore_data['analysis_to_orig'][
                    'original_length_initial'] = len(complete_text)
            else:
                self.cpr.printw(
                    "segment-to-orig diff is activated, but the original text is not saved; "
                    "activate ADD_FULLTEXT_ENTRY in config for this functionality")

        # Init toolbox
        snippet = None
        if self.config.USE_SNIPPET:
            if "./" in self.config.IMGPATH:
                ipath = os.path.dirname(
                    ocromore_data["file_info"].path) + self.config.IMGPATH[1:]
            else:
                ipath = os.path.normcase(self.config.IMGPATH)
            results = glob.glob(
                ipath + ocromore_data["file_info"].name.split(".")[0].replace(
                    "_msa_best", "") + "*",
                recursive=True)
            if results:
                snippet = Snippet()
                snippet.imread(results[0])
            else:
                self.config.USE_TOOLBBOX = False
        info_handler = {}
        # start parsing for each successfully segmented area
        for segmentation_class in segmentation_classes:

            # if the class segment was recognized ...
            if segmentation_class.is_start_segmented():
                # get the unique identifier for this class
                segment_tag = segmentation_class.get_segment_tag()
                segmentation_class.snippet = snippet
                segmentation_class.info_handler = info_handler
                self.trigger_mapped_function(segment_tag, segmentation_class,
                                             ocromore_data)

        # add and return result
        ocromore_data['results'] = self.ef
        return ocromore_data

    def trigger_mapped_function(self, segment_tag, segmentation_class,
                                ocromore_data):

        if segment_tag not in self.function_map.keys():
            return
        #todo: fileinfo -> parsing
        real_start_tag, content_texts, content_lines, feature_lines = self.prepare_parsing_info(
            segmentation_class, ocromore_data)

        # switch the object to save context
        segment_tag = segmentation_class.segment_tag
        self.ef.set_current_main_list(segment_tag)

        # call the mapped function, which fills the end-factory
        self.function_map[segment_tag].__call__(real_start_tag, content_texts,
                                                content_lines, feature_lines,
                                                segmentation_class)

    def prepare_parsing_info(self, segmentation_class, ocromore_data):
        lines = ocromore_data['lines']
        line_features = ocromore_data['line_features']
        real_start_tag, content_texts, content_lines, feature_lines = \
            DataHelper.get_content(lines,line_features, segmentation_class)

        return real_start_tag, content_texts, content_lines, feature_lines

    def get_all_text(self, ocromore_data, join_separated_lines=False):
        """
        Gets all text lines in ocromore_data as
        array and as joined string
        :param ocromore_data: data from which the text is extracted
        :return: texts list, complete text
        """
        all_texts = []
        complete_text = ""
        for line in ocromore_data['lines']:
            text = line['text']
            all_texts.append(text)
            complete_text += text

        if join_separated_lines:
            complete_text = ""
            all_texts = dh.join_separated_lines(all_texts)
            for text in all_texts:
                complete_text += text

        return all_texts, complete_text

    def write_result_to_output(self, as_json, ocromore_data):
        if as_json is True:
            my_json = self.ef.export_as_json()
            my_json_lines = my_json.split("\n")
            dh.write_array_to_root("result_json/", my_json_lines,
                                   ocromore_data, self.result_root)
Example No. 27
0
class AkfParsingFunctionsJK(object):
    def __init__(self,
                 endobject_factory,
                 output_analyzer,
                 dictionary_handler,
                 ocromore_data=None):
        config_handler = ConfigurationHandler(first_init=False)

        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(
            self.config.PRINT_SEGMENT_PARSER_AKF_FN_THREE,
            self.config.PRINT_EXCEPTION_LEVEL,
            self.config.PRINT_WARNING_LEVEL,
            leading_tag=self.__class__.__name__)

        self.cpr.print("init akf parsing functions three")

        self.ef = endobject_factory
        self.output_analyzer = output_analyzer
        self.ocromore_data = ocromore_data
        self.dictionary_handler = dictionary_handler

    def parse_bilanzen(self, real_start_tag, content_texts, content_lines,
                       feature_lines, segmentation_class):
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        # logme
        self.output_analyzer.log_segment_information(
            segmentation_class.segment_tag, content_texts, real_start_tag)

        # init
        only_add_if_string = True
        if self.config.LOG_SIMPLE:
            geschaeftslage = origpost_red.replace("- ", "")

            #parsing
            self.ef.add_to_my_obj("balances",
                                  geschaeftslage,
                                  object_number=element_counter,
                                  only_filled=only_add_if_string)
            return True
        #parsing
        table = Datatable(snippet=segmentation_class.snippet)
        table.analyse_structure(content_lines,
                                feature_lines,
                                template="datatable_balance")
        table.extract_content(content_lines,
                              feature_lines,
                              template="datatable_balance")

        # Write information for income table parsing
        segmentation_class.info_handler["income"] = {}
        segmentation_class.info_handler["income"]["amount"] = table.info.amount
        segmentation_class.info_handler["income"]["col"] = table.info.col
        segmentation_class.info_handler["income"][
            "separator"] = table.info.separator

        # Parse the tables based on whitespace and the count of numbers in each group
        # This should be the last resort for parsing (error-prone)
        self.ef.add_to_my_obj("balances",
                              table.content,
                              object_number=element_counter,
                              only_filled=only_add_if_string)

    def parse_gewinn_und_verlust(self, real_start_tag, content_texts,
                                 content_lines, feature_lines,
                                 segmentation_class):
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        # logme
        self.output_analyzer.log_segment_information(
            segmentation_class.segment_tag, content_texts, real_start_tag)

        # init
        only_add_if_string = True
        if self.config.LOG_SIMPLE:
            geschaeftslage = origpost_red.replace("- ", "")

            #parsing
            self.ef.add_to_my_obj("income",
                                  geschaeftslage,
                                  object_number=element_counter,
                                  only_filled=only_add_if_string)
            return True

        # parsing
        table = Datatable(snippet=segmentation_class.snippet)
        table.analyse_structure(content_lines,
                                feature_lines,
                                template="datatable_income")
        if segmentation_class.info_handler and "income" in set(
                segmentation_class.info_handler.keys()):
            table.info.col = segmentation_class.info_handler["income"]["col"]
            table.info.amount = segmentation_class.info_handler["income"][
                "amount"]
            table.info.separator = segmentation_class.info_handler["income"][
                "separator"]

        table.extract_content(content_lines,
                              feature_lines,
                              template="datatable_income")

        #parsing
        self.ef.add_to_my_obj("income",
                              table.content,
                              object_number=element_counter,
                              only_filled=only_add_if_string)

    def parse_aktienkurse(self, real_start_tag, content_texts, content_lines,
                          feature_lines, segmentation_class):
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        # logme
        self.output_analyzer.log_segment_information(
            segmentation_class.segment_tag, content_texts, real_start_tag)

        # init
        only_add_if_string = True
        if self.config.LOG_SIMPLE:
            skip = origpost_red.replace("- ", "")

            # parsing
            self.ef.add_to_my_obj("shares",
                                  skip,
                                  object_number=element_counter,
                                  only_filled=only_add_if_string)
            return True

        # parsing
        table = Sharetable(snippet=segmentation_class.snippet)
        table.analyse_structure(content_lines, feature_lines)
        table.extract_content(content_lines, feature_lines)
        # parsing
        self.ef.add_to_my_obj("shares",
                              table.content,
                              object_number=element_counter,
                              only_filled=only_add_if_string)

    def parse_dividend(self, real_start_tag, content_texts, content_lines,
                       feature_lines, segmentation_class):
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        # logme
        self.output_analyzer.log_segment_information(
            segmentation_class.segment_tag, content_texts, real_start_tag)

        # init
        only_add_if_string = True
        if self.config.LOG_SIMPLE:
            skip = origpost_red.replace("- ", "")

            # parsing
            self.ef.add_to_my_obj("dividende",
                                  skip,
                                  object_number=element_counter,
                                  only_filled=only_add_if_string)
            return True

        # parsing
        table = Dividendtable(snippet=segmentation_class.snippet)
        table.analyse_structure(content_lines, feature_lines)
        table.extract_content(content_lines, feature_lines)
        # parsing
        self.ef.add_to_my_obj("dividende",
                              table.content,
                              object_number=element_counter,
                              only_filled=only_add_if_string)
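
The parsing classes above are never called directly; SegmentParser resolves
each segment tag through a function map (see trigger_mapped_function). A
minimal sketch of that dispatch pattern with hypothetical names (MiniParser
and its tags are illustrative, not the project's actual map):

class MiniParser:
    def parse_balances(self, texts):
        print("balances:", texts)

    def parse_dividends(self, texts):
        print("dividends:", texts)

# segment tags mapped to bound parser methods
parser = MiniParser()
function_map = {
    "Bilanzen": parser.parse_balances,
    "Dividenden": parser.parse_dividends,
}

segment_tag = "Bilanzen"
# unknown tags are skipped, as in trigger_mapped_function
if segment_tag in function_map:
    function_map[segment_tag](["line 1", "line 2"])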
Example No. 28
0
class AkfParsingFunctionsTwo(object):

    def __init__(self, endobject_factory, output_analyzer, dictionary_handler):
        config_handler = ConfigurationHandler(first_init=False)

        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_SEGMENT_PARSER_AKF_FN_TWO, self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL, leading_tag=self.__class__.__name__)

        self.cpr.print("init akf parsing functions two")

        self.ef = endobject_factory
        self.output_analyzer = output_analyzer
        self.dictionary_handler = dictionary_handler

    def parse_zahlstellen(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        # logme
        self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)

        split_post = origpost_red.split(';')
        DEFAULT_ENTRY = 1
        ADDITIONAL_INFO_BOTH = 2      # 'beide' - refers to the two previous entries
        ADDITIONAL_INFO_ALL_PREV = 3  # 'sämtl.' - refers to all previous entries

        final_entries = []
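        # illustrative (hypothetical) input for the marker handling below:
        #   "Dresdner Bank, Hamburg; Commerzbank, Hamburg; beide auch Berlin"
        # 'beide' refers to the two previous entries, 'sämtl.' to all previous
        # entries; the marker text is split off via '##', tagged, and later
        # distributed to its entries in the reverse pass below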
        for index, entry in enumerate(split_post):
            entry_stripped = entry.strip()

            if "beide" in entry_stripped:
                entry_final = regex.sub(r"beide\s?\.?", "##", entry_stripped).strip()
                entry_final_split = entry_final.split('##')
                for index_fs, entry_fs in enumerate(entry_final_split):
                    if entry_fs.strip() == "" : continue
                    if index_fs < len(entry_final_split)-1:
                        final_entries.append((DEFAULT_ENTRY, entry_fs, "", "", ""))
                    else:
                        final_entries.append((ADDITIONAL_INFO_BOTH, entry_fs, "", "", ""))
                continue
            if regex.search("sämtl\s?\.?", entry_stripped):
                entry_final = regex.sub(r"sämtl\s?\.?", "##", entry_stripped).strip()
                entry_final_split = entry_final.split('##')
                for index_fs, entry_fs in enumerate(entry_final_split):
                    if entry_fs.strip() == "": continue
                    if index_fs < len(entry_final_split)-1:
                        final_entries.append((DEFAULT_ENTRY, entry_fs, "", "", ""))
                    else:
                        final_entries.append((ADDITIONAL_INFO_ALL_PREV, entry_fs, "", "", ""))
                continue

            entry_split = entry_stripped.split(',')
            bank = ""
            city = ""
            title = ""
            rest_info = []
            for fragment_index, fragment in enumerate(entry_split):
                if fragment_index == 0:
                    bank = fragment
                elif fragment_index == 1:
                    city = fragment
                elif fragment_index >= 2:
                    rest_info.append(fragment)
            if bank != "" or city != "" or title != "":
                final_entries.append((DEFAULT_ENTRY, bank, city, title, rest_info))

        # reverse the list for better processing
        reverse_fe = reversed(final_entries)
        current_additional_info = ""
        current_info_index = None
        current_entry_type = None
        final_list = []
        for item_index, item in enumerate(reverse_fe):
            entry_type, entryorbank, city, title, rest_info = item
            # change current additional info
            if entry_type == ADDITIONAL_INFO_BOTH or entry_type == ADDITIONAL_INFO_ALL_PREV:
                current_info_index = item_index
                current_additional_info = entryorbank
            elif entry_type == DEFAULT_ENTRY:
                templist = [(entryorbank, city, title, current_additional_info, rest_info)]
                templist.extend(final_list)
                final_list = templist

            # end 'beide'-entry because it's over after 2 iterations
            if current_entry_type == ADDITIONAL_INFO_BOTH and item_index-current_info_index >= 1:
                current_info_index = None
                current_additional_info = ""

        # finally note the entries to output
        only_add_if_value = True
        for entry in final_list:
            bank, city, title, add_info, rest_info = entry
            if add_info.strip() != "":
                rest_info_new = [add_info]
                rest_info_new.extend(rest_info)
            else:
                rest_info_new = rest_info

            #if add_info != "" and add_info != None and city =="":
            #    city += add_info
            self.ef.add_to_my_obj("bank", bank, object_number=element_counter, only_filled=only_add_if_value)
            self.ef.add_to_my_obj("city", city, object_number=element_counter, only_filled=only_add_if_value)
            self.ef.add_to_my_obj("title", title, object_number=element_counter, only_filled=only_add_if_value)
            #self.ef.add_to_my_obj("additional_info", add_info, object_number=element_counter, only_filled=only_add_if_value)
            #self.ef.add_to_my_obj("rest_info", rest_info, object_number=element_counter, only_filled=only_add_if_value)
            self.ef.add_to_my_obj("rest_info", rest_info_new, object_number=element_counter, only_filled=only_add_if_value)

            element_counter += 1

        return True

    def parse_grundkapital(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
        # todo validate other currencies than 'DM'
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        only_add_if_value = True

        # logme
        self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)



        # try to normalize ';' to ':' after the prefix '...apital' (e.g. Grundkapital)
        content_texts = [content_text.replace("apital;","apital:") for content_text in content_texts]

        gk = cf.parse_general_and_keys(content_texts,
                                       join_separated_lines=True,
                                       current_key_initial_value='start_value',
                                       abc_sections=True)
        # check the start value for 'normal' Grundkapital content and parse it if found
        start_value = gk.get('start_value',"")
        if len(gk.keys()) == 1:
            start_value = gk[list(gk.keys())[0]]
        if len(start_value) >= 1:
            #print("could be grundkapital")
            my_return_object, found_main_amount, element_counter, only_add_if_value, additional_info = \
                cf.parse_grundkapital_line(start_value[0], False, element_counter, only_add_if_value, [])
            currency = my_return_object.get('currency',"").strip()
            amount = my_return_object.get('amount',"").strip()
            if amount != "" and currency != "":
                self.ef.add_to_my_obj('Grundkapital', my_return_object, object_number=element_counter, only_filled=only_add_if_value)
            else:
                gk['additional_info'] = []
                gk['additional_info'].append(start_value[0].replace("↑", ":"))


        if len(start_value) >= 2:  # remaining entries in start_value are unrelated additional info
            if 'additional_info' not in gk.keys():
                gk['additional_info'] = []

            for index in range(1, len(start_value)):
                val = start_value[index]
                gk['additional_info'].append(val.replace("↑", ":"))

        """
        if 'additional_info' in gk.keys():
            gk_ai = cf.parse_general_and_keys(gk['additional_info'],
                                           join_separated_lines=True,
                                           current_key_initial_value='start_value_addinfo',
                                           abc_sections=True)

            print("lemme check")
        """


        for key in gk:
            if key is "start_value":
                continue
            entry = gk[key]
            # individual parsing here
            match_year = regex.search(r"\d\d\d\d", key)  # the key is a year
            year = None
            key_rest = ""
            if match_year:
                year = match_year.group()
                key_rest = key.replace(year, "").strip()

            accumulated_text = []
            if key_rest != "":
                accumulated_text.append(key_rest)

            for inner_entry in entry:
                accumulated_text.append(inner_entry)

            final_entry = None
            if year is None:
                final_entry = accumulated_text
            else:
                final_entry = {
                    "year": year,
                    "text": accumulated_text
                }

            if final_entry is not None and final_entry != "":
                self.ef.add_to_my_obj(key, final_entry, object_number=element_counter,
                                      only_filled=only_add_if_value)
                element_counter += 1

        # all year lines have been checked and parsed above
        return



        # old parsing style (unreachable legacy code, kept for reference)
        final_entries = []
        current_ref_index = -1
        found_main_amount = False
        additional_info = []
        only_add_if_value = True
        for text_index, text in enumerate(content_texts):
            text_stripped = text.strip()
            if text_stripped == "":
                continue

            # todo increment element ctr ?
            my_return_object, found_main_amount, element_counter, only_add_if_value, additional_info = \
                cf.parse_grundkapital_line(text_stripped, found_main_amount, element_counter, only_add_if_value, additional_info)

            for key in my_return_object:
                value = my_return_object[key]
                self.ef.add_to_my_obj(key, value, object_number=element_counter, only_filled=only_add_if_value)


        if len(additional_info) >= 1:
            add_lines_parsed = cf.parse_grundkapital_additional_lines(additional_info,element_counter,True, 0)
            self.ef.add_to_my_obj("additional_info", add_lines_parsed, object_number=element_counter,
                                     only_filled=only_add_if_value)

        return True

    def parse_ordnungsnrdaktien(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        # logme
        self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)

        only_add_if_value = True
        # example values - each line of content_texts list
        # '589300 (St.-Akt.)'
        # '589300.'
        first_number_match = True
        for entry in content_texts:
            entry_stripped = entry.strip()
            rest = entry_stripped
            if entry_stripped == "":
                continue

            match_number = regex.search(r"^([\d\s]*)", entry_stripped)
            match_parenth = regex.search(r"\(.*\)", entry_stripped)  # take the content in parentheses

            if match_number is not None and match_number.group(0).strip() != "":

                if not first_number_match:
                    element_counter += 1        # start a new element for every number match after the first
                number = match_number.group(0).strip()

                self.ef.add_to_my_obj("ord_number", number, object_number=element_counter, only_filled=only_add_if_value)
                rest = rest.replace(number, "", 1)
                first_number_match = False
            if match_parenth is not None:
                parenth = match_parenth.group(0)
                self.ef.add_to_my_obj("category", parenth, object_number=element_counter, only_filled=only_add_if_value)
                rest = rest.replace(parenth, "", 1)

            rest_stripped = rest.strip()
            if rest_stripped != "":
                self.ef.add_to_my_obj("additional_info", rest_stripped, object_number=element_counter, only_filled=only_add_if_value)

    def parse_grossaktionaer(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        # logme
        self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)

        lines_split = origpost_red.split(';')
        only_add_if_value = True
        for line in lines_split:
            # testline
            # line = "Société Sidérurgique de Participations et d’ Approvisionnement en Charbons, par abréviation (Sidechar), Paris (ca.60,2 %)."
            findings = regex.finditer(r"\([a-zü0-9\s\,\.]*%\).?",line)
            lof = list(findings)
            if lof:
                findings = []
                for finding in lof:
                    findings.append(finding.regs[0])
            else:
                findings = [(len(line),len(line))]
            start = 0
            for idx, finding in enumerate(findings):
                # shareholder, location, share
                item = line[start:finding[0]]
                if ":" in item:
                    self.ef.add_to_my_obj("additional_information", item[:item.index(":")],
                                          object_number=element_counter, only_filled=only_add_if_value)
                    if line.index(":")+2 > finding[0]:
                        continue
                    else:
                        item = item[item.index(":"):]
                item = item.rsplit(",",1)
                self.ef.add_to_my_obj("shareholder", item[0].strip(),
                                      object_number=element_counter, only_filled=only_add_if_value)
                if len(item) > 1 and item[1] != "":
                    if item[1][-1] == ".":
                        item[1] = item[1][:len(item[1])-1]
                    if "(" in item[1] and ")" in item[1]:
                        find = regex.search(r"(\([0-9\s\,]*|maßgeblich|Mehrheit|Majorität)\)", item[1])
                        if find:
                            self.ef.add_to_my_obj("share",
                                              item[1][find.regs[0][0]:find.regs[0][1]-1].strip(), object_number=element_counter,
                                              only_filled=only_add_if_value)
                            item[1] = item[1][:find.regs[0][0]-1]
                    self.ef.add_to_my_obj("location", item[1].strip(),
                                      object_number=element_counter, only_filled=only_add_if_value)
                if finding[0] != len(line):
                    self.ef.add_to_my_obj("share", line[finding[0]:finding[1]].replace(", ",",").replace("(","").replace(").","").replace(")","").strip(), object_number=element_counter,only_filled=only_add_if_value)

                start = finding[1]
                element_counter += 1
            """
            # find parenthesis with 2 or more characters inside
            #for item in line.split("%)"):
            match_parenth = regex.findall(r"(\(.{2,}\))", line)
            found_parenth = None
            parenth_is_used = False
            organization = None
            location = None
            # find additional info in  each line and subtract it
            if match_parenth:
                found_parenth = match_parenth[-1].strip("., ") # find the last parenthesis grounp
                # if the parenthesis are at the end of line
                if line.strip()[-1] == ")" and not(len(found_parenth.replace(" ", "")) <= 5 and "%" in found_parenth): # exclude percentages from parenthesis matches
                    line = line.replace(found_parenth, "", 1)
                    parenth_is_used = True

            split_line = line.split(',')
            len_split_line = len(split_line)
            if len_split_line == 1:
                organization = line.strip("., ")
            else:
                organization = line.replace(split_line[-1], "", 1).strip("., ")
                location = split_line[-1].strip("., ")  # town
            self.ef.add_to_my_obj("organization", organization, object_number=element_counter,only_filled=only_add_if_value)
            self.ef.add_to_my_obj("location", location, object_number=element_counter,only_filled=only_add_if_value)
            if parenth_is_used:
                self.ef.add_to_my_obj("additional_info", found_parenth, object_number=element_counter,only_filled=only_add_if_value)
            element_counter += 1
        """
        return True


    def parse_geschaeftsjahr(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        # logme
        self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)

        only_add_if_value = True
        final_jahr = []

        for text in content_texts:
            text_stripped = text.strip("., ")
            if text_stripped != "":
                if "bis" in text_stripped:
                    split_text = text_stripped.split('bis ')
                    # alternative: regex.split('\.bis|\sbis\s', text_stripped)
                    if len(split_text) == 1:
                        final_jahr.append(split_text[0].strip())
                        continue
                    gesch_jahr_start = split_text[0].strip("( ")
                    gesch_jahr_stop = split_text[1].strip(" )")
                    self.ef.add_to_my_obj('gesch_jahr_start', gesch_jahr_start, object_number=element_counter,
                                          only_filled=only_add_if_value)
                    self.ef.add_to_my_obj('gesch_jahr_stop', gesch_jahr_stop, object_number=element_counter,
                                          only_filled=only_add_if_value)

                    if len(split_text) >= 3:
                        for rest in split_text[2:]:  # everything after start/stop is rest
                            if rest.strip() != "":
                                final_jahr.append(rest)
                else:
                    final_jahr.append(text_stripped)

        self.ef.add_to_my_obj('year', final_jahr, object_number=element_counter,only_filled=only_add_if_value)
        return True

    def parse_stimmrechtaktien(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        # logme
        self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)

        # find last parenthesis and filter
        match_parenth = regex.findall(r"(\(.*?\))", origpost_red)
        found_parenth = None
        origpost_used = origpost_red
        # find additional info in each line and subtract it
        if match_parenth:
            found_parenth = match_parenth[-1].strip("., ")  # take the last parenthesis group
            origpost_used = origpost_red.replace(found_parenth, "")  # update the origpost used

        final_lines = []
        only_add_if_value = True
        skip = False
        final_text = ""
        for text_index, text in enumerate(content_texts):
            if text == "":
                continue
            text = text.replace("DM =", "DM 1 =").replace("DM=", "DM 1 =").replace("eine DM", "DM 1.-")
            if element_counter == 0 and "je nom" not in text.lower():
                self.ef.add_to_my_obj("additional_info", "".join(content_texts[text_index:]),
                                      object_number=element_counter,
                                      only_filled=only_add_if_value)
                break
            if skip:
                skip = False
                continue
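            # illustrative match (hypothetical input): "Je 100 Aktien = 1 Stimme"
            # -> nomvalue '100' (amount), vote '1'; an empty nomvalue is
            #    normalized to '1' further down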
            parse_aktie = regex.compile(r"(?P<nominal>[Jj]e[de]*?\s?(?P<nomvalue>[\d\s]*?)\s?[Aa]ktie[n]?)[^\d]*(?P<vote>[\d\s]*?)\s*?(?P<voteend>Stimme[n]*)")
            finding = parse_aktie.findall(text.replace("Stamm",""))
            if finding != []:
                finding = list(finding[0])
                if finding[1] == "":
                    finding[1] = "1"
                stck = {"kind": "Aktie",
                        "amount": finding[1],
                        "vote": finding[2].replace(" ", "").strip(),
                        "value": "",
                        "currency": "",
                        "rank": element_counter}
                self.ef.add_to_my_obj(element_counter, stck, object_number=element_counter,
                                      only_filled=only_add_if_value)
                element_counter += 1
                continue
            #text = 'Je nom. DM 50.- =1 Stimme.'
            parse_stimmrecht = regex.compile(r"(?P<nominal>[Jj]e[de]*?\s?(?P<nomvalue>[\d\s]*?)\s?nom\.)\s*?(?P<currency>[^\d]*)\s?(?P<value>[\d\s]*)\s*?(?P<waste>[^\dA-Za-z]*)\s{0,}(?P<kind>[A-Za-z.,\-\s]*)?[^\d\s]*\s{0,}(?P<vote>[\d]*)?\s{0,}(?P<voteend>Stimme[n]*)?")
            finding = parse_stimmrecht.findall(text.replace("DM", " DM").replace("RM"," RM"))
            # Special case "bzw."
            if finding and "bzw." in text:
                if "Stimm" not in text:
                    skip = True
                    text += content_texts[text_index+1]
                parse_bzw = regex.compile(r"(?P<nominal>[Jj]e[de]*?\s?(?P<nomvalue>[\d\s]*?)\s?nom\.)\s*?(?P<currency>[^\d]*)\s?(?P<value>[\d\s]*)\s*?[^\d]*\s*?(?P<value2>[\d\s]*)[^\dA-Za-z]*(?P<kind>[A-Za-z][A-Za-z.,\-\s]*)?[^\d\s]*\s{0,}(?P<vote>[\d]*)?\s{0,}[^\d]*\s{0,}(?P<vote2>[\d]*)\s{0,}(?P<voteend>Stimme[n]*)?")
                finding = parse_bzw.findall(text)
                finding = finding[0]
                if finding:
                    stck = {"kind": finding[5].strip(),
                            "amount": "1",
                            "vote": finding[6].replace(" ", "").strip(),
                            "value": finding[3].strip(),
                            "currency": finding[2].strip(),
                            "rank": element_counter}
                    self.ef.add_to_my_obj(element_counter, stck, object_number=element_counter,
                                          only_filled=only_add_if_value)
                    element_counter += 1
                    stck = {"kind": finding[5].strip(),
                            "amount": "1",
                            "vote": finding[7].replace(" ", "").strip(),
                            "value": finding[4].strip(),
                            "currency": finding[2].strip(),
                            "rank": element_counter}
                    self.ef.add_to_my_obj(element_counter, stck, object_number=element_counter,
                                          only_filled=only_add_if_value)
                    continue
            if not finding or finding[0][0] + finding[0][1] == "":
                final_text += text
                continue
            if final_text != "":
                self.ef.add_to_my_obj("additional_info", final_text, object_number=element_counter-1,
                                      only_filled=only_add_if_value)
                final_text = ""
            finding_next = None
            if finding[0][6] + finding[0][7] == "":
                if text_index == len(content_texts) - 1:
                    self.ef.add_to_my_obj("additional_info", text, object_number=element_counter,
                                          only_filled=only_add_if_value)
                    continue
                else:
                    finding_next = parse_stimmrecht.findall(text + " " + content_texts[text_index + 1])
            if finding_next:
                skip = True
                finding = finding_next
            finding = list(finding[0])
            if finding[5] == "":
                finding[5] = "nom."
            if finding[1] == "":
                finding[1] = "1"
            stck = {"kind": finding[5].strip(),
                    "amount": finding[1].strip(),
                    "vote": finding[6].replace(" ", "").strip(),
                    "value": finding[3].strip(),
                    "currency": finding[2].strip(),
                    "rank": element_counter}
            self.ef.add_to_my_obj(element_counter, stck, object_number=element_counter, only_filled=only_add_if_value)
            element_counter += 1
        if final_text != "":
            self.ef.add_to_my_obj("additional_info", final_text, object_number=element_counter,
                                  only_filled=only_add_if_value)
        return True
        """
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        # logme
        self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)

        # add extra splitting elements to each 'je' or 'Je'
        origpost_red_se = regex.sub(r"(Je |je )", r"~~~\1", origpost_red)

        split_text = origpost_red_se.split('~~~')
        # origpost_red = regex.sub(r"(\d\.)", r"\1~~~~", origpost_red)
        only_add_if_value = True

        for entry in split_text:
            if entry == "":
                continue
            match_sb = regex.search(r"Stimmrechtsbeschränkung:.*", entry)
            sbe = None
            if match_sb is not None:
                sbe = match_sb.group()
                sbe = sbe.replace("Stimmrechtsbeschränkung:", "", 1)
                entry = entry.replace(sbe, "").replace("Stimmrechtsbeschränkung:", "", 1)

            self.ef.add_to_my_obj("entry", entry, object_number=element_counter ,only_filled=only_add_if_value)
            self.ef.add_to_my_obj("Stimmrechtsbeschränkung", sbe, object_number=element_counter ,only_filled=only_add_if_value)
            element_counter += 1
        """

    def parse_boersennotiz(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
        # logme
        self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)

        found_parenth = None
        origpost_used = origpost_red

        # log all location elements
        only_add_if_value = True
        split_post = regex.split(r'u\.|und|,', origpost_used)
        for entry in split_post:
            entry_stripped = entry.strip("., ")

            # find additional info in  each line and subtract it
            # find last parenthesis and filter
            #match_parenth = regex.findall(r"(\(.*?\))", entry_stripped)
            #combined_ps = []
            #for res in match_parenth:
                #combined_ps.append(res.strip())
                #origpost_used = origpost_red.replace(found_parenth, "")  # update the orignpost used
                # log additional info in last parenthesis

            #self.ef.add_to_my_obj("additional_info", combined_ps, object_number=element_counter,
            #                          only_filled = only_add_if_value)

            #if entry_stripped is None or entry_stripped == "":
                #if match_parenth:
                #    element_counter += 1
            entry_stripped = entry.replace("im Freiverkehr", "").replace("(amtl.)", "").strip("., ")
            if entry_stripped == "":
                continue
            self.ef.add_to_my_obj("location", entry_stripped, object_number=element_counter, only_filled= only_add_if_value)
            element_counter += 1

        return True


    def preprocess_stueckelung_texts(self, content_texts):
        final_stueckelung_texts = []

        previous_text_stripped = ""
        for index, current_text in enumerate(content_texts):
            current_text_stripped = current_text.strip()
            if current_text_stripped == "":
                continue

            if current_text_stripped.startswith("zu je") or current_text_stripped.startswith("Zu je"):
                final_stueckelung_texts.append(previous_text_stripped + "  "+current_text_stripped)
                previous_text_stripped = ""
            elif "(" == current_text_stripped[0] and ")" == current_text_stripped[-1]:
                final_stueckelung_texts.append(previous_text_stripped + "  "+current_text_stripped)
                previous_text_stripped = ""
            else:
                final_stueckelung_texts.append(previous_text_stripped)
                previous_text_stripped = current_text_stripped
                if index == len(content_texts)-1:
                    final_stueckelung_texts.append(current_text_stripped)

        final_texts_filtered = []
        for text in final_stueckelung_texts:
            text_stripped = text.strip()
            if text_stripped != "":
                final_texts_filtered.append(text_stripped)

        return final_texts_filtered

    def parse_stueckelung(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        # logme
        self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)

        # find last parenthesis and filter
        match_parenth = regex.findall(r"(\(.*?\))", origpost_red)
        found_parenth = None
        origpost_used = origpost_red
        # find additional info in each line and subtract it
        if match_parenth:
            found_parenth = match_parenth[-1].strip("., ")  # take the last parenthesis group
            origpost_used = origpost_red.replace(found_parenth, "")  # update the origpost used

        final_lines = []
        additional_info_final = []
        only_add_if_value = True
        skip = False
        final_text = ""
        final_add_rest = ""
        content_texts = self.preprocess_stueckelung_texts(content_texts)
        for text_index, text in enumerate(content_texts):
            if text.strip() == "":
                continue
            if skip:
                skip = False
                continue
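            # illustrative match (hypothetical input): "10 000 Aktien zu je DM 100.-"
            # -> amount '10 000', kind 'Aktien', nominal 'zu je', currency 'DM',
            #    value '100'; incomplete matches fall through to the
            #    additional_info handling below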
            parse_stck = regex.compile(r"(?P<amount>[\d\s\.]*)\s*(?P<kind>[^\d]*?)[\s]?(?P<nominal>zu je|zuje|zu|je)\s{0,}(?P<currency>[^\d\s]*)\s{0,}(?P<value>[\d\s]*)")
            finding = parse_stck.findall(text.replace(" Stücke ", " Aktien ").replace(" Stück ", " Aktie ").replace("DM", " DM").replace("RM", " RM").replace("hfl"," hfl"))

            rest_finding = ""
            if len(finding) >= 1:
                rest_finding = text # get the rest of finding
                subtract_sorted = sorted(finding[0],key=len)
                subtract_sorted.reverse()
                for find_chunk in subtract_sorted:
                    rest_finding = rest_finding.replace(find_chunk, "", 1).strip()
                rest_finding = regex.sub(r"\s{2,}", " ", rest_finding)  # collapse redundant spaces for better subtraction

            if not finding or finding[0][0]+finding[0][1] == "" or finding[0][0]+finding[0][4] == "":
                match_akt = regex.search(r"\.\s?\-\s?Akt", text)
                match_saemtlsakt, err_saemtlsakt = regu.fuzzy_search(
                    r"([Ss]ämtliche [Ss]tammaktien.*|[Ss]ämtliche [Aa]ktien.*|[Ss]ämtliche Namens\-Stammaktien.*)", text, err_number=1)
                if match_saemtlsakt is not None: #and match_akt is not None: @jk is this second condition really necessary ?
                    saemtl_res = match_saemtlsakt.group()
                    self.ef.add_to_my_obj("additional_info", saemtl_res, object_number=element_counter,
                                          only_filled=only_add_if_value)
                    reduced_text = text.replace(saemtl_res, "")
                    final_lines.append(reduced_text)
                    rest_finding = rest_finding.replace(reduced_text,"")
                if "Börse" in text or "Besondere" in text:
                    addendum = "".join(content_texts[text_index:])
                    self.ef.add_to_my_obj("additional_info", addendum, object_number=element_counter,
                                          only_filled=only_add_if_value)
                    element_counter += 1
                    rest_finding = rest_finding.replace("".join(content_texts[text_index:]), "")
                    break
                if "(" in text:
                    self.ef.add_to_my_obj("additional_info", text, object_number=element_counter-1,
                                          only_filled=only_add_if_value)
                    rest_finding = rest_finding.replace(text, "")

                else:
                    rest_finding = rest_finding.replace(text, "")
                    final_text += text
                continue
            finding_next = None
            if finding[0][2] == "" or (("zu" in finding[0][2] or "je" in finding[0][2]) and finding[0][3] == ""):
                #test =  '2 638 514 Inh. - bzw. Namensaktien zuje FF 75.-'
                if text_index == len(content_texts) - 1:
                    self.ef.add_to_my_obj("additional_info", text, object_number=element_counter,
                                          only_filled=only_add_if_value)
                    continue
                else:
                    finding_next = parse_stck.findall(text + " " + content_texts[text_index + 1])
            if finding[0][3]+finding[0][4] == "":
                if text_index == len(content_texts) - 1:
                    self.ef.add_to_my_obj("additional_info", text, object_number=element_counter,
                                          only_filled=only_add_if_value)
                    continue
                else:
                    finding_next = parse_stck.findall(text + " " + content_texts[text_index + 1])
            if finding_next:
                skip = True
                finding = finding_next
            stck = {"amount": finding[0][0].replace("."," ").strip(),
                     "kind": finding[0][1].replace(" ","").strip(),
                     "nominal": "zu je",
                     "currency": finding[0][3],
                     "value": finding[0][4],
                     "rank": element_counter}
            self.ef.add_to_my_obj("entry", stck, object_number=element_counter, only_filled=only_add_if_value)
            if rest_finding != "":
                final_add_rest += rest_finding + " "
            element_counter += 1
        if final_text != "":
            self.ef.add_to_my_obj("additional_info", final_text.replace(final_add_rest.strip(".,- "),
                                                                        "", 1).strip(".,- "), object_number=element_counter,
                                  only_filled=only_add_if_value)
            element_counter += 1

        if final_add_rest != "":
            self.ef.add_to_my_obj("additional_info", final_add_rest.strip(".,- "), object_number=element_counter,
                                  only_filled=only_add_if_value)
        return True
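
A minimal sketch of the continuation-line merging performed by
preprocess_stueckelung_texts above, simplified and with hypothetical input
(the original additionally merges fully parenthesized lines into their
predecessor):

def merge_continuations(lines):
    merged, previous = [], ""
    for line in lines:
        line = line.strip()
        if line == "":
            continue
        # 'zu je ...' lines continue the preceding amount/kind line
        if line.lower().startswith("zu je"):
            merged.append((previous + "  " + line).strip())
            previous = ""
        else:
            if previous:
                merged.append(previous)
            previous = line
    if previous:
        merged.append(previous)
    return merged

# merge_continuations(["10 000 Aktien", "zu je DM 100.-"])
# -> ['10 000 Aktien  zu je DM 100.-']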
Example No. 29
0
class DictionaryHandler(object):
    def __init__(self):
        config_handler = ConfigurationHandler(first_init=False)

        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_DICTIONARY_HANDLER,
                                    self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL,
                                    leading_tag=self.__class__.__name__)

        self.cpr.print("init dictionary handler")
        self.data_functs = None  # storage for json object
        self.data_titles = None  # storage for json object
        self.texts_functs = None
        self.texts_titles = None
        if self.config.USE_DICTIONARIES_FOR_PERSON_PARSING:
            self.load_dictionaries()
            # get the rows as sorted list of texts longest first
            if self.data_functs is not None:
                check_tf = self.sort_rows(self.get_rows(self.data_functs))
                self.texts_functs = check_tf
            if self.data_titles is not None:
                check_tt = self.sort_rows(self.get_rows(self.data_titles))
                self.texts_titles = check_tt

    def diff_name_title(self, text_to_check):

        len_text_to_check = len(text_to_check)
        name_found = text_to_check
        title_found = ""

        for entry_index, entry in enumerate(self.texts_titles):
            title, tlen = entry
            # accelerate the process by skipping dictionary entries that are longer than the text
            if tlen > len_text_to_check:
                continue
            # compare the texts
            if title in text_to_check:
                name_found = text_to_check.replace(title, "", 1).strip()
                title_found = title
                break

        return name_found, title_found

    def load_dictionaries(self):
        base_dict_path = self.get_dict_path()

        filepath_titles_dict = os.path.join(base_dict_path, "dict_titles.json")
        filepath_functs_dict = os.path.join(base_dict_path, "dict_functs.json")

        # load titles
        if os.path.exists(filepath_titles_dict):
            with open(filepath_titles_dict) as f:
                self.data_titles = json.load(f)
        else:
            self.cpr.printex(
                "dictionary dict_titles.json missing at specificied path",
                filepath_titles_dict)

        # load functs
        if os.path.exists(filepath_functs_dict):
            with open(filepath_functs_dict) as f:
                self.data_functs = json.load(f)
        else:
            self.cpr.printex(
                "dictionary dict_functs.json missing at specificied path",
                filepath_functs_dict)

    def get_rows(self, dict_data):
        rows = dict_data['rows']
        final_rows = []
        for entry in rows:
            text = entry[0]
            final_rows.append((text, len(text)))
        return final_rows

    def sort_rows(self, rows):
        rows.sort(key=lambda t: len(t[0]), reverse=True)
        return rows

    def path(self):
        # base path: the current working directory
        return os.getcwd()

    def get_dict_path(self):
        complete = os.path.join(self.path(), "additionals", "dictionaries")
        return complete
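
For illustration, a minimal usage sketch (hypothetical input text and file
contents; it assumes dict_titles.json under ./additionals/dictionaries
contains the entry "Dr." and that USE_DICTIONARIES_FOR_PERSON_PARSING is
enabled in the configuration):

handler = DictionaryHandler()
name, title = handler.diff_name_title("Dr. Hans Meyer")
# with "Dr." present in dict_titles.json:
#   name == "Hans Meyer", title == "Dr."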
Example #30
class IsriHandler(object):
    def __init__(self):
        self.os = os.name.lower()
        config_handler = ConfigurationHandler(first_init=False)
        self.config = config_handler.get_config()

        if 'ExceptionInitializing' in self.config:
            print("Exception initializing config, don't print")
            self.cpr = ConditionalPrint(False, False, False)
        else:

            self.cpr = ConditionalPrint(self.config.PRINT_MSA_HANDLER,
                                        self.config.PRINT_EXCEPTION_LEVEL,
                                        self.config.PRINT_WARNING_LEVEL)

        # os.name reports 'posix' on Linux, so that is the only tested value
        if self.os != 'posix':
            raise OSError(
                "Untested operating system; adapt the code and continue at your own risk"
            )

    def accuracy(self,
                 path_correctfile,
                 path_generatedfile,
                 path_accuracy_report=""):

        try:
            call([
                "accuracy", path_correctfile, path_generatedfile,
                path_accuracy_report
            ])
        except Exception as ex:
            self.cpr.printex("Exception calling pycharm", ex)

    class SynctextConfig(object):
        def __init__(self):
            self._used_config_acc = []

        def use_T_algorithm(self):
            self._used_config_acc.append("-T")

        def use_H_algorithm(self):
            self._used_config_acc.append("-H")

        def use_case_insensitive(self):
            self._used_config_acc.append("-i")

        def use_display_suspect_markers_in_output(self):
            self._used_config_acc.append("-s")

        def get_used_config(self):
            return self._used_config_acc

        def clear_used_config(self):
            self._used_config_acc = []
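
    # The nested *Config helper classes all follow one pattern: each
    # use_*/set_* call appends a command-line flag for the underlying ISRI
    # tool, get_used_config() returns the accumulated flags, and
    # clear_used_config() resets the accumulator for reuse.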

    def synctext(self,
                 filepaths,
                 path_generatedfile=None,
                 synctext_config=None):

        try:
            # avoid a mutable default argument shared across calls
            if synctext_config is None:
                synctext_config = self.SynctextConfig()
            flags = synctext_config.get_used_config()
            calls = ["synctext"]
            calls.extend(flags)
            calls.extend(filepaths)

            if path_generatedfile is None:
                call(calls)
            else:
                filehandle = self.create_file_if_doesnt_exist(
                    path_generatedfile, True)
                call(calls, stdout=filehandle)
                filehandle.close()

        except Exception as ex:
            self.cpr.printex("Exception calling pycharm", ex)

    def accsum(self, filepaths_accreports, path_generatedfile=None):

        try:
            calls = ["accsum"]
            calls.extend(filepaths_accreports)

            if path_generatedfile is None:
                call(calls)
            else:
                filehandle = self.create_file_if_doesnt_exist(
                    path_generatedfile, True)
                call(calls, stdout=filehandle)
                filehandle.close()

        except Exception as ex:
            self.cpr.printex("Exception calling pycharm", ex)

    def groupacc(self,
                 path_groupfile,
                 path_accuracy_report,
                 path_groupacc_report=None):

        try:
            calls = ["groupacc"]
            calls.append(path_groupfile)
            calls.append(path_accuracy_report)

            if path_groupacc_report is None:
                call(calls)
            else:
                filehandle = self.create_file_if_doesnt_exist(
                    path_groupacc_report, True)
                filehandle.close()
                calls.append(path_groupacc_report)
                call(calls)

        except Exception as ex:
            self.cpr.printex("Exception calling pycharm", ex)

    def accdist(self, filepaths_accreports, path_generated_xyfile=None):

        try:
            calls = ["accdist"]
            calls.extend(filepaths_accreports)

            if path_generated_xyfile is None:
                call(calls)
            else:
                filehandle = self.create_file_if_doesnt_exist(
                    path_generated_xyfile, True)
                call(calls, stdout=filehandle)
                filehandle.close()

        except Exception as ex:
            self.cpr.printex("Exception calling pycharm", ex)

    class NGramConfig(object):
        def __init__(self):
            self._used_config_acc = []

        def set_ngram_size(self, number):
            # the guard reflects that only n-gram sizes 1 to 3 are supported
            if 1 <= number <= 3:
                self._used_config_acc.append("-n")
                self._used_config_acc.append(str(number))

        def clear_used_config(self):
            self._used_config_acc = []

        def get_used_config(self):
            return self._used_config_acc

    def ngram(self,
              filepaths,
              path_generatedfile=None,
              ngram_config=None):

        try:
            # avoid a mutable default argument shared across calls
            if ngram_config is None:
                ngram_config = self.NGramConfig()
            flags = ngram_config.get_used_config()
            calls = ["ngram"]
            calls.extend(flags)
            calls.extend(filepaths)

            if path_generatedfile is None:
                call(calls)
            else:
                filehandle = self.create_file_if_doesnt_exist(
                    path_generatedfile, True)
                call(calls, stdout=filehandle)
                filehandle.close()

        except Exception as ex:
            self.cpr.printex("Exception calling pycharm", ex)

    class VoteConfig(object):
        def __init__(self):
            self._used_config_acc = []

        def enable_O_optimization(self):
            self._used_config_acc.append("-O")

        def set_s(self, fraction_numerator, fraction_denominator):
            self._used_config_acc.append("-s")
            self._used_config_acc.append(str(fraction_numerator) + "/" +
                                         str(fraction_denominator))

        def set_w(self, fraction_numerator, fraction_denominator):
            self._used_config_acc.append("-w")
            self._used_config_acc.append(str(fraction_numerator) + "/" +
                                         str(fraction_denominator))

        def set_output_file(self, path_outputfile):
            self._used_config_acc.append("-o")
            self._used_config_acc.append(path_outputfile)

        def clear_used_config(self):
            self._used_config_acc = []

        def get_used_config(self):
            return self._used_config_acc

    def vote(self, filepaths, vote_config=None):

        try:
            # avoid a mutable default argument shared across calls
            if vote_config is None:
                vote_config = self.VoteConfig()
            flags = vote_config.get_used_config()
            calls = ["vote"]
            calls.extend(flags)
            calls.extend(filepaths)

            call(calls)

        except Exception as ex:
            self.cpr.printex("Exception calling pycharm", ex)

    def wordacc(self,
                path_correctfile,
                path_comparison_file,
                path_stopwordfile=None,
                path_wordacc_report=None):

        try:
            calls = ["wordacc"]

            if path_stopwordfile is not None:
                calls.append("-S")
                calls.append(path_stopwordfile)

            calls.append(path_correctfile)
            calls.append(path_comparison_file)

            if path_wordacc_report is not None:
                calls.append(path_wordacc_report)

            call(calls)

        except Exception as ex:
            self.cpr.printex("Exception calling pycharm", ex)

    def wordaccsum(self, filepaths_wordacc_reports, path_accsumreport=None):

        try:
            calls = ["wordaccsum"]
            calls.extend(filepaths_wordacc_reports)

            if path_accsumreport is None:
                call(calls)
            else:
                filehandle = self.create_file_if_doesnt_exist(
                    path_accsumreport, True)
                call(calls, stdout=filehandle)
                filehandle.close()

        except Exception as ex:
            self.cpr.printex("Exception calling pycharm", ex)

    def nonstopacc(self,
                   path_stopwordfile,
                   path_wordacc_report,
                   path_output_xyfile=None):

        try:
            calls = ["nonstopacc"]
            calls.append(path_stopwordfile)
            calls.append(path_wordacc_report)

            if path_output_xyfile is None:
                call(calls)
            else:
                filehandle = self.create_file_if_doesnt_exist(
                    path_output_xyfile, True)
                call(calls, stdout=filehandle)
                filehandle.close()

        except Exception as ex:
            self.cpr.printex("Exception calling pycharm", ex)

    def wordaccci(self, filepaths_wordacc_reports, path_outputfile=None):

        try:
            calls = ["wordaccci"]
            calls.extend(filepaths_wordacc_reports)

            if path_outputfile is None:
                call(calls)
            else:
                filehandle = self.create_file_if_doesnt_exist(
                    path_outputfile, True)
                call(calls, stdout=filehandle)
                filehandle.close()

        except Exception as ex:
            self.cpr.printex("Exception calling pycharm", ex)

    def wordaccdist(self, filepaths_wordacc_reports, path_output_xyfile=None):

        try:
            calls = ["wordaccdist"]
            calls.extend(filepaths_wordacc_reports)

            if path_output_xyfile is None:
                call(calls)
            else:
                filehandle = self.create_file_if_doesnt_exist(
                    path_output_xyfile, True)
                call(calls, stdout=filehandle)
                filehandle.close()

        except Exception as ex:
            self.cpr.printex("Exception calling pycharm", ex)

    def wordfreq(self, filepaths_inputtext, path_resultfile=None):

        try:
            calls = ["wordfreq"]
            calls.extend(filepaths_inputtext)

            if path_resultfile is None:
                call(calls)
            else:
                filehandle = self.create_file_if_doesnt_exist(
                    path_resultfile, True)
                call(calls, stdout=filehandle)
                filehandle.close()

        except Exception as ex:
            self.cpr.printex("Exception calling pycharm", ex)

    # TODO: add the zoning programs some day (point 4 in the doc)
    def editop(self,
               path_correctfile,
               path_comparison_file,
               path_editop_report=None):

        try:
            calls = ["editop"]

            calls.append(path_correctfile)
            calls.append(path_comparison_file)

            if path_editop_report is not None:
                calls.append(path_editop_report)

            call(calls)

        except Exception as ex:
            self.cpr.printex("Exception calling pycharm", ex)

    def editopsum(self, filepaths_editopreports, path_summed_report=None):

        try:
            calls = ["editopsum"]
            calls.extend(filepaths_editopreports)

            if path_summed_report is None:
                call(calls)
            else:
                filehandle = self.create_file_if_doesnt_exist(
                    path_summed_report, True)
                call(calls, stdout=filehandle)
                filehandle.close()

        except Exception as ex:
            self.cpr.printex("Exception calling pycharm", ex)

    def editopcost(self,
                   path_editop_report,
                   path_editop_report2=None,
                   path_output_xyfile=None):

        try:
            calls = ["editopcost"]

            calls.append(path_editop_report)

            if path_editop_report2 is not None:
                calls.append(path_editop_report2)

            if path_output_xyfile is None:
                call(calls)
            else:
                filehandle = self.create_file_if_doesnt_exist(
                    path_output_xyfile, True)
                call(calls, stdout=filehandle)
                filehandle.close()

        except Exception as ex:
            self.cpr.printex("Exception calling pycharm", ex)

    def create_file_if_doesnt_exist(self, filepath, overwrite=False):
        # note: mode 'w+' truncates an existing file in any case, so the
        # file is effectively recreated; the explicit overwrite step below
        # is kept for clarity
        file = open(filepath, 'w+')
        if overwrite:
            self.delete_file_content(file)
        return file

    def delete_file_content(self, pfile):
        pfile.seek(0)
        pfile.truncate()
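
A short usage sketch tying the wrappers together (hypothetical file names;
it assumes the ISRI OCR evaluation tools such as "accuracy" and "vote" are
installed and available on the PATH):

isri = IsriHandler()

# character-level accuracy of one OCR output against the ground truth
isri.accuracy("ground_truth.txt", "ocr_a.txt", "report_a.acc")

# majority-vote several OCR outputs into a single text; the wrapper's
# set_output_file() passes "-o" to name the result file
cfg = IsriHandler.VoteConfig()
cfg.set_output_file("voted.txt")
isri.vote(["ocr_a.txt", "ocr_b.txt", "ocr_c.txt"], vote_config=cfg)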