def __init__(self, lines_size, y_mean, msa_handler):
    """Set up an empty line set with ``lines_size`` slots.

    Every slot starts out as ``False``. Console output is disabled when the
    loaded configuration contains an 'ExceptionInitializing' entry.

    Args:
        lines_size: number of slots held by the set.
        y_mean: mean y coordinate of all lines referenced in this set.
        msa_handler: handler object stored for later use.
    """
    self._set_lines = [False] * lines_size
    self._size = lines_size
    # mean y coordinate of all lines referenced in this set
    self._y_mean = y_mean
    self.shortest_distance_line_index = -1
    # indicates the set_lines was unspaced
    self._unspaced = False
    # indicates the set_lines was reference spaced
    self._refspaced = False
    self._text_unspacer = TextUnspacer()
    # holder element for recognized shortest distance line
    self.shortest_distance_line = None
    self._best_msa_text = ""
    self._text_seg = None
    self._is_origin_database = False
    self._database_handler = None

    self._config = ConfigurationHandler(first_init=False).get_config()
    if 'ExceptionInitializing' not in self._config:
        cfg = self._config
        self._cpr = ConditionalPrint(cfg.PRINT_MSA_HANDLER,
                                     cfg.PRINT_EXCEPTION_LEVEL,
                                     cfg.PRINT_WARNING_LEVEL)
    else:
        print("Exception initializing config, don't print")
        self._cpr = ConditionalPrint(False, False, False)

    self._msa_handler = msa_handler
def __init__(self):
    """Record the lower-cased OS name and set up config and printer."""
    self.os = os.name.lower()
    self.config = ConfigurationHandler(first_init=False).get_config()
    cfg = self.config
    self.cpr = ConditionalPrint(
        cfg.PRINT_MSA_HANDLER,
        cfg.PRINT_EXCEPTION_LEVEL,
        cfg.PRINT_WARNING_LEVEL,
    )
def __init__(self):
    """Load the configuration and create the tagged printer."""
    self.config = ConfigurationHandler(first_init=False).get_config()
    cfg = self.config
    self.cpr = ConditionalPrint(
        cfg.PRINT_SEGMENT_CLASSIFIER,
        cfg.PRINT_EXCEPTION_LEVEL,
        cfg.PRINT_WARNING_LEVEL,
        leading_tag=self.__class__.__name__,
    )
    self.cpr.print("init segment classifier")
def __init__(self):
    """Load the configuration and create the tagged printer."""
    self.config = ConfigurationHandler(first_init=False).get_config()
    cfg = self.config
    self.cpr = ConditionalPrint(
        cfg.PRINT_ADDITIONAL_INFO_HANDLER,
        cfg.PRINT_EXCEPTION_LEVEL,
        cfg.PRINT_WARNING_LEVEL,
        leading_tag=self.__class__.__name__,
    )
    self.cpr.print("init additional info handler")
def __init__(self):
    """Load the configuration, create the printer and set table filters."""
    self.config = ConfigurationHandler(first_init=False).get_config()
    cfg = self.config
    self.cpr = ConditionalPrint(
        cfg.PRINT_TABLE_HANDLER,
        cfg.PRINT_EXCEPTION_LEVEL,
        cfg.PRINT_WARNING_LEVEL,
    )
    self.PRINT_TO_CHECKFILE = False
    # a line starting with these words can't be in a table
    self.filter_start_words = [
        "Fernruf:",
        "Vorstand:",
        "Fernschreiber:",
        "von",
        "Gründung:",
        "Ordnungsnr.",
        "Ordnungsnr",
        "Grundkapital:",
        "Umstellung",
    ]
def __init__(self, endobject_factory, output_analyzer, dictionary_handler):
    """Load the configuration, create the printer and keep the collaborators.

    Args:
        endobject_factory: stored as ``self.ef``.
        output_analyzer: stored as ``self.output_analyzer``.
        dictionary_handler: stored as ``self.dictionary_handler``.
    """
    self.config = ConfigurationHandler(first_init=False).get_config()
    cfg = self.config
    self.cpr = ConditionalPrint(
        cfg.PRINT_SEGMENT_PARSER_AKF_FN_TWO,
        cfg.PRINT_EXCEPTION_LEVEL,
        cfg.PRINT_WARNING_LEVEL,
        leading_tag=self.__class__.__name__,
    )
    self.cpr.print("init akf parsing functions two")

    self.ef = endobject_factory
    self.output_analyzer = output_analyzer
    self.dictionary_handler = dictionary_handler
def __init__(self):
    """Initialise empty result storage, a pretty printer and the config.

    ``self.known_uc`` is only created when the configuration flag
    ``REMOVE_TAGS_IN_ORIG_DIFF`` is truthy.
    """
    self.my_object = {}
    self.current_main_list = None
    self.pp = pprint.PrettyPrinter(indent=5)

    self.config = ConfigurationHandler(first_init=False).get_config()
    cfg = self.config
    self.cpr = ConditionalPrint(
        cfg.PRINT_OUTPUT_ANALYSIS,
        cfg.PRINT_EXCEPTION_LEVEL,
        cfg.PRINT_WARNING_LEVEL,
        leading_tag=self.__class__.__name__,
    )

    if cfg.REMOVE_TAGS_IN_ORIG_DIFF:
        self.known_uc = KnownUncategories()
def __init__(self):
    """Load the configuration, create the printer and set the start-word filter."""
    self.config = ConfigurationHandler(first_init=False).get_config()
    cfg = self.config
    self.cpr = ConditionalPrint(
        cfg.PRINT_FEATURE_EXTRACTOR,
        cfg.PRINT_EXCEPTION_LEVEL,
        cfg.PRINT_WARNING_LEVEL,
        leading_tag=self.__class__.__name__,
    )
    # first words that are checked against the first word of a line
    self.filter_start_words = [
        "Fernruf:",
        "Vorstand:",
        "Fernschreiber:",
        "von",
        "Gründung:",
        "Ordnungsnr.",
        "Ordnungsnr",
        "Grundkapital:",
        "Umstellung",
    ]
class AkfParsingFunctionsTablesOne(object):
    """Parsing entry points for table-like AKF segments.

    All three ``parse_*`` methods currently do the same thing: register the
    segment through ``cf.add_check_element`` and log it with the output
    analyzer. The shared work lives in ``_check_and_log_segment``.
    """

    def __init__(self, endobject_factory, output_analyzer, dictionary_handler):
        """Load the configuration, create the printer and keep the collaborators.

        Args:
            endobject_factory: stored as ``self.ef``.
            output_analyzer: stored as ``self.output_analyzer``; used for
                segment logging.
            dictionary_handler: stored as ``self.dictionary_handler``.
        """
        config_handler = ConfigurationHandler(first_init=False)
        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(
            self.config.PRINT_SEGMENT_PARSER_AKF_FN_TABLES_ONE,
            self.config.PRINT_EXCEPTION_LEVEL,
            self.config.PRINT_WARNING_LEVEL,
            leading_tag=self.__class__.__name__)
        self.cpr.print("init akf parsing functions tables one")

        self.ef = endobject_factory
        self.output_analyzer = output_analyzer
        self.dictionary_handler = dictionary_handler

    def _check_and_log_segment(self, real_start_tag, content_texts,
                               segmentation_class):
        """Register the segment and log its (possibly adapted) texts."""
        # add_check_element returns a 4-tuple; only the returned
        # content_texts is needed for logging
        _, _, _, content_texts = cf.add_check_element(
            self, content_texts, real_start_tag, segmentation_class, 0)
        self.output_analyzer.log_segment_information(
            segmentation_class.segment_tag, content_texts, real_start_tag)

    def parse_aktienkurse(self, real_start_tag, content_texts, content_lines,
                          feature_lines, segmentation_class):
        """Register and log the segment.

        ``content_lines`` and ``feature_lines`` are accepted but not used.
        """
        self._check_and_log_segment(real_start_tag, content_texts,
                                    segmentation_class)

    def parse_dividenden(self, real_start_tag, content_texts, content_lines,
                         feature_lines, segmentation_class):
        """Register and log the segment.

        ``content_lines`` and ``feature_lines`` are accepted but not used.
        """
        self._check_and_log_segment(real_start_tag, content_texts,
                                    segmentation_class)

    def parse_dividenden_auf_xyaktien(self, real_start_tag, content_texts,
                                      content_lines, feature_lines,
                                      segmentation_class):
        """Register and log the segment.

        ``content_lines`` and ``feature_lines`` are accepted but not used.
        """
        self._check_and_log_segment(real_start_tag, content_texts,
                                    segmentation_class)
def __init__(self):
    """Load the configuration and create the printer."""
    self.config = ConfigurationHandler(first_init=False).get_config()
    cfg = self.config
    self.cpr = ConditionalPrint(
        cfg.PRINT_SPECIALCHAR_PREDICTOR,
        cfg.PRINT_EXCEPTION_LEVEL,
        cfg.PRINT_WARNING_LEVEL,
    )
def __init__(self, y_size, x_size, wildcard_character, substitution_character):
    """Store the search-space geometry and the table of similar characters.

    Args:
        y_size: stored as ``self._y_size``.
        x_size: stored as ``self._x_size``; also used to derive the middle
            index through ``Random.find_middle``.
        wildcard_character: stored as ``self._wildcard_character``.
        substitution_character: stored as ``self._substitution_character``.
    """
    self._y_size = y_size
    self._x_size = x_size
    self._middle_index = Random.find_middle(self._x_size, True)
    self._pre_middle_index = self.get_middle_index() - 1
    self._nex_middle_index = self.get_middle_index() + 1

    self._wildcard_character = wildcard_character
    self._substitution_character = substitution_character

    # groups of characters that are treated as similar to each other
    self.similar_chars = [
        ['o', 'ö'],
        ['<', 'o'],  # untested is this really better?
        ['O', 'Ö'],
        ['0', 'O', '9'],
        ['d', 'ö'],
        # ['1', 'l'] is intentionally disabled
        ['l', 'j', '1'],
        ['I', 'l'],
        ['u', 'ü'],
        ['U', 'Ü', 'O'],
        ['a', 'ä'],
        ['A', 'Ä'],
        [':', ';'],
        ['-', '¬'],
        ['"', "'"],
        ['C', "G", "c"],
        # just for testing ...
        ['.', ','],
        [',', ';'],
        ['v', 'V'],
        ['w', 'W'],
        ['i', 'l', 't', '1', '.'],  # 1 l i also possible
        ['r', 'n'],
        ['%', 'm'],
        ['&', 'é'],
        ['e', 'é'],
    ]

    self._config = ConfigurationHandler(first_init=False).get_config()
    cfg = self._config
    self._cpr = ConditionalPrint(
        cfg.PRINT_SEARCH_SPACE_PROCESSOR,
        cfg.PRINT_EXCEPTION_LEVEL,
        cfg.PRINT_WARNING_LEVEL,
    )
def __init__(self, predictor=None, vocabulary_checker=None, first_config_init=False):
    """Create empty OCR-set storage and wire up config, printer and helpers.

    Printing is disabled when the loaded configuration contains an
    'ExceptionInitializing' entry.

    Args:
        predictor: stored as ``self.predictor``.
        vocabulary_checker: stored as ``self.vocabulary_checker``.
        first_config_init: forwarded to ``ConfigurationHandler`` as
            ``first_init``.
    """
    self.ocr_sets = []
    self.line_height_information = []

    self.config = ConfigurationHandler(first_init=first_config_init).get_config()
    if 'ExceptionInitializing' not in self.config:
        cfg = self.config
        self.cpr = ConditionalPrint(cfg.PRINT_MSA_HANDLER,
                                    cfg.PRINT_EXCEPTION_LEVEL,
                                    cfg.PRINT_WARNING_LEVEL)
    else:
        self.cpr = ConditionalPrint(False, False, False)

    self.predictor = predictor
    self.vocabulary_checker = vocabulary_checker
def __init__(self):
    """Set up config and printer, then reject operating systems other than linux/posix.

    Raises:
        OSError: when the lower-cased ``os.name`` is neither 'linux' nor
            'posix'. The check runs after config and printer are set up.
    """
    self.os = os.name.lower()

    self.config = ConfigurationHandler(first_init=False).get_config()
    if 'ExceptionInitializing' not in self.config:
        cfg = self.config
        self.cpr = ConditionalPrint(cfg.PRINT_MSA_HANDLER,
                                    cfg.PRINT_EXCEPTION_LEVEL,
                                    cfg.PRINT_WARNING_LEVEL)
    else:
        print("Exception initializing config, don't print")
        self.cpr = ConditionalPrint(False, False, False)

    if self.os not in ('linux', 'posix'):
        raise OSError(
            "Untested operating system adapt code and continue at own risk"
        )
def __init__(self):
    """Load config and printer, and precompile the border/character patterns."""
    self.config = ConfigurationHandler(first_init=False).get_config()
    cfg = self.config
    self.cpr = ConditionalPrint(
        cfg.PRINT_VOCABULARY_CHECKER,
        cfg.PRINT_EXCEPTION_LEVEL,
        cfg.PRINT_WARNING_LEVEL,
    )

    self.dict_lines = []
    self.max_edist = None
    self.suggenstion_verbosity = None
    # self.spellchecker is intentionally not initialised here

    # characters matched at the start / end of a string by the patterns below
    self.special_chars_borders = "!¦1234567890,)(;.:\"-"
    border_class = "[" + self.special_chars_borders + "]+"
    self.pattern_start = re.compile("^" + border_class)
    self.pattern_trail = re.compile(border_class + "$")
    self.pattern_trail_dash = re.compile(r"[-]$")
    self.pattern_only_normal_chars = re.compile(r"[a-zA-Z]+")
def __init__(self, output_analyzer, dictionary_handler, ocromore_data=None):
    """Create the endobject factory, the tag->function map and the result path.

    Args:
        output_analyzer: forwarded to ``FunctionMapAKF``.
        dictionary_handler: stored and forwarded to ``FunctionMapAKF``.
        ocromore_data: accepted but not used in this constructor.
    """
    self.ef = EndobjectFactory()
    self.dictionary_handler = dictionary_handler

    # map which maps tags to functions for parsing
    # -> change constructor for other project
    function_mapper = FunctionMapAKF(self.ef, output_analyzer, dictionary_handler)

    self.config = ConfigurationHandler(first_init=False).get_config()
    cfg = self.config
    self.cpr = ConditionalPrint(
        cfg.PRINT_SEGMENT_PARSER,
        cfg.PRINT_EXCEPTION_LEVEL,
        cfg.PRINT_WARNING_LEVEL,
        leading_tag=self.__class__.__name__,
    )

    self.function_map = function_mapper.get_function_map()
    self.result_root = cfg.OUTPUT_ROOT_PATH + "/results/"
def __init__(self):
    """Load config and printer, then optionally load and sort the dictionaries.

    Dictionaries are loaded only when the configuration flag
    ``USE_DICTIONARIES_FOR_PERSON_PARSING`` is truthy; the sorted text lists
    are built for whichever raw data object is not ``None``.
    """
    self.config = ConfigurationHandler(first_init=False).get_config()
    cfg = self.config
    self.cpr = ConditionalPrint(
        cfg.PRINT_DICTIONARY_HANDLER,
        cfg.PRINT_EXCEPTION_LEVEL,
        cfg.PRINT_WARNING_LEVEL,
        leading_tag=self.__class__.__name__,
    )
    self.cpr.print("init dictionary handler")

    self.data_functs = None  # storage for json object
    self.data_titles = None  # storage for json object
    self.texts_functs = None
    self.texts_titles = None

    if not cfg.USE_DICTIONARIES_FOR_PERSON_PARSING:
        return

    self.load_dictionaries()

    # get the rows as sorted list of texts longest first
    if self.data_functs is not None:
        self.texts_functs = self.sort_rows(self.get_rows(self.data_functs))
    if self.data_titles is not None:
        self.texts_titles = self.sort_rows(self.get_rows(self.data_titles))
class VisualizationHandler(object):
    """Launch external diff viewers (PyCharm, meld) for two files."""

    _SUPPORTED_OS = ('linux', 'posix')

    def __init__(self):
        """Record the lower-cased OS name and set up config and printer."""
        self.os = os.name.lower()
        self.config = ConfigurationHandler(first_init=False).get_config()
        cfg = self.config
        self.cpr = ConditionalPrint(cfg.PRINT_MSA_HANDLER,
                                    cfg.PRINT_EXCEPTION_LEVEL,
                                    cfg.PRINT_WARNING_LEVEL)

    def show_file_comparison_pycharm(self, filepath_1, filepath_2):
        """Start ``charm diff`` on both files.

        Returns:
            The started ``Popen`` object, or ``None`` when the OS is not
            linux/posix or starting the process raised an exception.
        """
        if self.os not in self._SUPPORTED_OS:
            self.cpr.printex("Write code here for other os, or take other os")
            return None

        try:
            return Popen(["charm", "diff", filepath_1, filepath_2])
        except Exception as ex:
            self.cpr.printex("Exception calling pycharm", ex)
        return None

    def show_file_comparison_meld(self, filepath_1, filepath_2, just_add_tab=False):
        """Start ``meld`` on both files, optionally with ``--newtab``.

        Returns:
            The started ``Popen`` object, or ``None`` when the OS is not
            linux/posix or starting the process raised an exception.
        """
        if self.os not in self._SUPPORTED_OS:
            self.cpr.printex("Write code here for other os, or take other os")
            return None

        if just_add_tab:
            command = ["meld", "--newtab", filepath_1, filepath_2]
        else:
            command = ["meld", filepath_1, filepath_2]

        try:
            return Popen(command)
        except Exception as ex:
            self.cpr.printex("Exception calling meld", ex)
        return None
def __init__(self):
    """Load the configuration and create the three printers and state fields."""
    self.config = ConfigurationHandler(first_init=False).get_config()
    cfg = self.config
    exception_level = cfg.PRINT_EXCEPTION_LEVEL
    warning_level = cfg.PRINT_WARNING_LEVEL

    self.cpr = ConditionalPrint(
        cfg.PRINT_MSA_HANDLER, exception_level, warning_level)
    self.cpr_vocab_check = ConditionalPrint(
        cfg.PRINT_VOCABULARY_CHECKER, exception_level, warning_level)
    self.cpr_sc_predict = ConditionalPrint(
        cfg.PRINT_SPECIALCHAR_PREDICTOR, exception_level, warning_level)

    self.filo_last_chars = Filo(250)
    self.predictor = None
    self.use_aufsichtsrat_prediction = False
    self.vocab_checker = None
    self.previous_word_with_seperator = False
class AkfParsingFunctionsOne(object):
    """Parsers for the first group of AKF segment types.

    Every ``parse_*`` method shares one signature. It registers the segment
    through ``cf.add_check_element`` and stores the extracted fields through
    the endobject factory (``self.ef.add_to_my_obj``). ``content_lines`` and
    ``feature_lines`` are accepted by all methods but not used in this class.
    """

    def __init__(self, endobject_factory, output_analyzer, dictionary_handler):
        """Load the configuration, create the printer and keep the collaborators.

        Args:
            endobject_factory: stored as ``self.ef``; receives all results.
            output_analyzer: stored as ``self.output_analyzer``.
            dictionary_handler: stored as ``self.dictionary_handler``;
                forwarded to ``cf.parse_persons``.
        """
        config_handler = ConfigurationHandler(first_init=False)
        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(
            self.config.PRINT_SEGMENT_PARSER_AKF_FN_ONE,
            self.config.PRINT_EXCEPTION_LEVEL,
            self.config.PRINT_WARNING_LEVEL,
            leading_tag=self.__class__.__name__)
        self.cpr.print("init akf parsing functions one")

        self.ef = endobject_factory
        self.output_analyzer = output_analyzer
        self.dictionary_handler = dictionary_handler

    # ------------------------------------------------------------------
    # shared helpers
    # ------------------------------------------------------------------
    def _add_persons(self, origpost_red, element_counter):
        """Parse persons from the text and store each as its own object.

        Returns the element counter after the last stored person.
        """
        persons_final = cf.parse_persons(
            origpost_red, self.dictionary_handler,
            self.config.USE_DICTIONARIES_FOR_PERSON_PARSING)

        for entry in persons_final:
            name, first_name, last_name, city, title, funct, rest_info = entry
            fields = (
                ("name", name),
                ("first_name", first_name),
                ("last_name", last_name),
                ("city", city),
                ("title", title),
                ("rest", rest_info),
                ("funct", funct),
            )
            for key, value in fields:
                self.ef.add_to_my_obj(key, value,
                                      object_number=element_counter,
                                      only_filled=True)
            element_counter += 1
        return element_counter

    def _add_location(self, origpost_red, element_counter):
        """Parse id/location parts from the text and store them in one object."""
        num_id, city, street, street_number, additional_info = \
            cf.parse_id_location(origpost_red)
        fields = (
            ("numID", num_id),
            ("city", city),
            ("street", street),
            ("street_number", street_number),
            ("additional_info", additional_info),
        )
        for key, value in fields:
            self.ef.add_to_my_obj(key, value,
                                  object_number=element_counter,
                                  only_filled=True)

    # ------------------------------------------------------------------
    # parsers
    # ------------------------------------------------------------------
    def parse_firmenname(self, real_start_tag, content_texts, content_lines,
                         feature_lines, segmentation_class):
        """Store the space-joined, stripped segment texts under 'Firmenname'.

        The value is stored even when it is empty (``only_filled=False``).
        """
        # get basic data
        _, _, element_counter, content_texts = cf.add_check_element(
            self, content_texts, real_start_tag, segmentation_class, 0)

        # get relevant info
        accumulated_text = " ".join(content_texts).strip()
        self.ef.add_to_my_obj("Firmenname", accumulated_text,
                              object_number=element_counter,
                              only_filled=False)

    def parse_sitz(self, real_start_tag, content_texts, content_lines,
                   feature_lines, segmentation_class):
        """Parse the 'Sitz' segment into id/location fields.

        Stores 'numID', 'city', 'street', 'street_number' and
        'additional_info' (each only when filled) and returns ``True``.

        Example of a target structure noted by the original author
        (NOTE(review): it lists more keys than this method produces):

            "Sitz": [{
                "origpost": "Mergenthalerallee 79-81, 65760 Eschborn ...",
                "type": "Sitz",
                "street": "Mergenthalerallee",
                "street_number": "79-81",
                "zip": "65760",
                "city": "Eschborn",
                "phone": "(069) 7 50 06-0",
                "fax": "(069) 7 50 06-111",
                "www": ["http://www.3u.net"]
            }]
        """
        # get basic data
        _, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag,
                                 segmentation_class, 0)

        self._add_location(origpost_red, element_counter)
        return True

    def parse_verwaltung(self, real_start_tag, content_texts, content_lines,
                         feature_lines, segmentation_class):
        """Parse a 'Verwaltung'-like segment depending on its start tag.

        * tag contains "srat" (e.g. Verwaltungsrat): parsed as persons
        * tag contains "Verw.": parsed as id/location
        * otherwise (Verwaltung): parsed as general key/value items

        Always returns ``True``.
        """
        # get basic data
        _, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag,
                                 segmentation_class, 0)

        if "srat" in real_start_tag:
            # Verwaltungsrat ..
            self._add_persons(origpost_red, element_counter)
            return True

        if "Verw." in real_start_tag:
            # Verw.
            self._add_location(origpost_red, element_counter)
            return True

        # Verwaltung
        final_items = cf.parse_general_and_keys(
            content_texts,
            join_separated_lines=False,
            current_key_initial_value="General_Info")
        for key in final_items.keys():
            value = final_items[key]
            if value is None or value == "":
                continue
            self.ef.add_to_my_obj(key, value,
                                  object_number=element_counter,
                                  only_filled=True)
            element_counter += 1
        return True

    def parse_telefon_fernruf(self, real_start_tag, content_texts,
                              content_lines, feature_lines,
                              segmentation_class):
        """Parse phone-number segments into number/location objects.

        Works in stages on the reduced original text:

        1. cut out known paired blocks (Verwaltung/Betriebshof,
           Ortsgespräche/Ferngespräche, Ortsverkehr/Fernverkehr) and queue
           them for stage 3,
        2. store plain "<word> <digits>" parts directly,
        3. split the remainder on separators and store number, location and
           leftover info for every entry,
        4. store what is still left as 'additional_info_unparsed' when it is
           longer than three characters.
        """
        # get basic data
        _, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag,
                                 segmentation_class, 0)

        # do special match: Verwaltung und Betriebshof
        split_post = []
        match_special = regex.match(
            r"(?<Verw>Verwaltung.*)"
            r"(?<Betr>Betriebshof.*)", origpost_red)
        if match_special:
            betriebshof = match_special.group("Betr")
            verwaltung = match_special.group("Verw")
            origpost_red = origpost_red.replace(betriebshof, "")
            origpost_red = origpost_red.replace(verwaltung, "")
            split_post.append(betriebshof)
            split_post.append(verwaltung)

        # do special match: Ortsgespräche and Ferngespräche
        match_special2 = regex.match(
            r"(?<og>Ortsgespräche.*)"
            r"(?<fg>Ferngespräche.*)", origpost_red)
        if match_special2:
            ortsgespr = match_special2.group("og")
            ferngespr = match_special2.group("fg")
            origpost_red = origpost_red.replace(ortsgespr, "")
            origpost_red = origpost_red.replace(ferngespr, "")
            split_post.append(ortsgespr)
            split_post.append(ferngespr)

        # do special match: Ortsverkehr and Fernverkehr
        match_special3 = regex.match(
            r"(?<ov>Ortsverkehr.*)"
            r"(?<fv>Fernverkehr.*)", origpost_red)
        if match_special3:
            ortsverkehr = match_special3.group("ov")
            fernverkehr = match_special3.group("fv")
            origpost_red = origpost_red.replace(ortsverkehr, "")
            origpost_red = origpost_red.replace(fernverkehr, "")
            split_post.append(ortsverkehr)
            split_post.append(fernverkehr)

        # do special match: check if only numbers
        origpost_red_new = origpost_red
        test_split = regex.split(r"\su\.|\sund\s|,|;", origpost_red)
        for number in test_split:
            # additional parenthesis block
            match_parenthesis = regex.search(r"\(.*\)", number)
            parenthesis = None
            if match_parenthesis:
                parenthesis = match_parenthesis.group()
                number = number.replace(parenthesis, "")  # remove number
                self.ef.add_to_my_obj("vorwahl", parenthesis,
                                      object_number=element_counter,
                                      only_filled=True)

            match_word_num = regex.search(
                r"(?<word>[^\d]*)(?<num>[\d\s\-/]*)", number)
            if match_word_num is None:
                continue

            word = match_word_num.group("word")
            num = match_word_num.group("num")
            # "Sa.-Nr." entries are handled by the separator-based stage below
            if "Sa." in word and "Nr" in word:
                continue

            number_stripped = num.strip(" ./").replace("/", "").replace(
                "-", "").replace(" ", "")
            if number_stripped.isdigit():
                origpost_red_new = origpost_red_new.replace(
                    number, "")  # remove number
                origpost_red_new = origpost_red_new.replace(
                    word, "")  # remove word found
                change1 = self.ef.add_to_my_obj("number_Sa.-Nr.",
                                                num.strip(),
                                                object_number=element_counter,
                                                only_filled=True)
                change2 = self.ef.add_to_my_obj("location",
                                                word.strip(),
                                                object_number=element_counter,
                                                only_filled=True)
                if change1 or change2:
                    element_counter += 1

        origpost_red = origpost_red_new
        # substitute in a separator char to integrate delimiters in next step
        origpost_red = regex.sub(r"(\d\.)", r"\1~~~~", origpost_red)

        # do further matches (sc-separated)
        split_post.extend(regex.split(r';|~~~~|\su\.', origpost_red))
        for entry in split_post:
            if entry is None:
                continue
            entry_stripped = entry.strip()
            if entry_stripped == "":
                continue

            # additional parenthesis block
            match_parenthesis = regex.search(r"\(.*\)", entry_stripped)
            parenthesis = None
            if match_parenthesis:
                parenthesis = match_parenthesis.group()
                entry_stripped = entry_stripped.replace(
                    parenthesis, "")  # remove entry
                self.ef.add_to_my_obj("vorwahl", parenthesis,
                                      object_number=element_counter,
                                      only_filled=True)

            match_word = regex.match(
                r"(?<Tag>\D*)"
                r"(?<Numbers>[\d\s\W]*)", entry_stripped)
            if match_word is not None:
                # fetch match results
                tag_match = match_word.group("Tag")
                numbers_match = match_word.group("Numbers")
                rest_from_entry_str = entry_stripped.replace(tag_match, "", 1)
                rest_from_entry_str = rest_from_entry_str.replace(
                    numbers_match, "", 1)

                tag = dh.strip_if_not_none(tag_match, "")
                match_tag = regex.match(
                    r"(?<rest_bef>.*)(?<sanr>Sa\.?\-Nr\.?)(?<rest_end>.*)",
                    tag)
                location = ""
                if match_tag is not None:
                    # the 'sanr' group itself is filtered out on purpose
                    rest_tag = match_tag.group('rest_bef')
                    rest_tag_2 = match_tag.group('rest_end')
                    location = dh.strip_if_not_none(
                        rest_tag + " " + rest_tag_2, ":., ")
                else:
                    # if there are no real descriptors in tag then tag is
                    # usually location (like Düsseldorf 1 36 62.)
                    location = tag

                if "und" in location:
                    location = regex.sub(r"[^\w]und[^\w]", "", location)

                number = dh.strip_if_not_none(numbers_match, "., ")
                self.ef.add_to_my_obj("number_Sa.-Nr.", number.strip(),
                                      object_number=element_counter,
                                      only_filled=True)
                self.ef.add_to_my_obj("location", location.strip(),
                                      object_number=element_counter,
                                      only_filled=True)
                additional_info_entry_level = dh.strip_if_not_none(
                    rest_from_entry_str, ",. ")
                self.ef.add_to_my_obj("additional_info",
                                      additional_info_entry_level.strip(),
                                      object_number=element_counter,
                                      only_filled=True)
                element_counter += 1

                origpost_red = origpost_red.replace(number, "", 1)
                origpost_red = origpost_red.replace(location, "", 1)

        origpost_red = origpost_red.replace("Sa.-Nr", "").replace("~~~~", "")
        origpost_red_end = dh.remove_multiple_outbound_chars(origpost_red)
        if len(origpost_red_end) > 3:
            self.ef.add_to_my_obj("additional_info_unparsed",
                                  origpost_red_end.strip(),
                                  object_number=element_counter)

    def parse_vorstand(self, real_start_tag, content_texts, content_lines,
                       feature_lines, segmentation_class):
        """Parse the 'Vorstand' segment as a list of persons; returns ``True``."""
        # get basic data
        _, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag,
                                 segmentation_class, 0)

        self._add_persons(origpost_red, element_counter)
        return True

    def parse_aufsichtsrat(self, real_start_tag, content_texts, content_lines,
                           feature_lines, segmentation_class):
        """Parse the 'Aufsichtsrat' segment as a list of persons; returns ``True``."""
        # get basic data
        _, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag,
                                 segmentation_class, 0)

        # Try to fix +) problems: move the marker in front of the separator
        # and drop the ') and *) markers
        origpost_red = origpost_red.replace("; +)", "+);").replace(
            ";+)", "+);").replace("')", "").replace("*)", "")

        self._add_persons(origpost_red, element_counter)
        return True

    def parse_arbeitnehmervertreter(self, real_start_tag, content_texts,
                                    content_lines, feature_lines,
                                    segmentation_class):
        """Parse the 'Arbeitnehmervertreter' segment as persons; returns ``True``."""
        # get basic data
        _, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag,
                                 segmentation_class, 0)

        self._add_persons(origpost_red, element_counter)
        return True

    # Gruendung
    def parse_gruendung(self, real_start_tag, content_texts, content_lines,
                        feature_lines, segmentation_class):
        """Split the 'Gründung' text into a leading year and the remaining info."""
        # get basic data
        _, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag,
                                 segmentation_class, 0)

        # NOTE(review): "^\d*" also matches the empty string, so the else
        # branch below is effectively never taken; kept to preserve behaviour.
        match_year = regex.search(r"^\d*", origpost_red.strip())
        if match_year:
            result = match_year.group()
            origpost_red_new = origpost_red.replace(result, "", 1)
            year = dh.strip_if_not_none(result, ".,() ")
            rest_info = dh.strip_if_not_none(origpost_red_new, ".,() ")
            self.ef.add_to_my_obj("rest_info", rest_info,
                                  object_number=element_counter,
                                  only_filled=True)
            self.ef.add_to_my_obj("year", year,
                                  object_number=element_counter,
                                  only_filled=True)
        else:
            rest_info = dh.strip_if_not_none(origpost_red, ".,() ")
            self.ef.add_to_my_obj("rest_info", rest_info,
                                  object_number=element_counter,
                                  only_filled=True)

    # Tätigkeitsgebiet
    def parse_taetigkeitsgebiet(self, real_start_tag, content_texts,
                                content_lines, feature_lines,
                                segmentation_class):
        """Store every non-empty general key/value item as its own object."""
        # get basic data
        _, _, element_counter, content_texts = cf.add_check_element(
            self, content_texts, real_start_tag, segmentation_class, 0)

        final_items = cf.parse_general_and_keys(
            content_texts,
            join_separated_lines=False,
            current_key_initial_value="General_Info")

        for key in final_items.keys():
            value = final_items[key]
            if value is None or len(value) == 0:
                continue
            self.ef.add_to_my_obj(key, value,
                                  object_number=element_counter,
                                  only_filled=True)
            element_counter += 1
class FeatureExtractor():
    """Compute character, word and layout statistics for OCR lines.

    The per-line results are ``LineFeatures`` objects which are stored in the
    ocromore data dictionary under the key ``'line_features'``.
    """

    def __init__(self):
        config_handler = ConfigurationHandler(first_init=False)
        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_FEATURE_EXTRACTOR,
                                    self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL,
                                    leading_tag=self.__class__.__name__)

        # first words which are compared against the first word of a line
        self.filter_start_words = [
            "Fernruf:", "Vorstand:", "Fernschreiber:", "von", "Gründung:",
            "Ordnungsnr.", "Ordnungsnr", "Grundkapital:", "Umstellung"
        ]

    def extract_file_features(self, ocromore_data):
        """Extract the features of every line and attach them to the data.

        :param ocromore_data: dictionary with a ``'lines'`` list
        :return: the same dictionary, extended by the key ``'line_features'``
            (one entry per line, in line order)
        """
        ocromore_data['line_features'] = [
            self.extract_line_features(line)
            for line in ocromore_data['lines']
        ]
        return ocromore_data

    def extract_line_features(self, line):
        """Count characters/words of one line and derive ratios and x-gaps.

        :param line: dictionary with the keys ``'text'``, ``'words'`` and
            ``'line_index'``; every word needs ``'word_index'``, ``'text'``
            and ``'hocr_coordinates'`` (index 0 and 2 are read as x-start
            and x-stop)
        :return: a filled ``LineFeatures`` object, or ``False`` if the line
            text is empty
        """
        whole_text = line['text']
        self.cpr.print("recognizing text:", whole_text)

        # counters over the whole line
        counter_special_chars = 0
        counter_alphanumerical_chars = 0
        counter_numbers = 0
        counter_chars = len(whole_text)
        counter_words = 0

        # per word statistics (only non-empty words get an entry)
        counters_alphabetical_ratios = []
        counters_wordlengths = []
        counters_numbers = []

        # special conditions
        # NOTE(review): these four flags are computed but not stored in the
        # returned LineFeatures - confirm whether they should be exported.
        ultimo_is_first_word = False
        first_word_no_table_indicator = False
        starts_with_parenthesis = False
        ends_with_parenthesis = False

        last_xstop = 0
        x_box_sizes = []
        x_gaps = []

        words = line['words']
        # assumes 'word_index' is the position of the word within
        # line['words'] - TODO confirm
        last_word_index = len(words) - 1

        for word_obj in words:
            word_index = word_obj['word_index']
            word_text = word_obj['text']
            hocr_coordinates = word_obj['hocr_coordinates']

            word_xstart = hocr_coordinates[0]
            word_xstop = hocr_coordinates[2]
            x_box_sizes.append(word_xstop - word_xstart)

            if word_index >= 1:
                # distance between the right edge of this word and the right
                # edge of the last non-empty word
                x_gaps.append(word_xstop - last_xstop)

            if word_text is None or word_text == "":
                continue

            if word_index == 0:
                if word_text in self.filter_start_words:
                    first_word_no_table_indicator = True
                if word_text.lower() == "ultimo":
                    ultimo_is_first_word = True
                if word_text[0] == "(":
                    starts_with_parenthesis = True

            # compare against the number of words, not the number of
            # characters in the line text
            if word_index == last_word_index:
                if word_text[-1] == ")":
                    ends_with_parenthesis = True

            counter_alphanumerical_chars_word = 0
            counter_numbers_word = 0
            counter_words += 1

            for char in word_text:
                if Random.is_special_character(char):
                    counter_special_chars += 1
                elif Random.is_alphanumerical_character(char):
                    counter_alphanumerical_chars += 1
                    counter_alphanumerical_chars_word += 1
                if char.isdigit():
                    counter_numbers += 1
                    counter_numbers_word += 1

            counter_alphabetical_word = counter_alphanumerical_chars_word - counter_numbers_word
            ratio_alphabetical_word = np.round(
                counter_alphabetical_word / len(word_text), 2)
            counters_alphabetical_ratios.append(ratio_alphabetical_word)
            counters_wordlengths.append(len(word_text))
            counters_numbers.append(counter_numbers_word)
            last_xstop = word_xstop

        # get number of spaces
        len_whole_unspace = len(whole_text.replace(" ", ""))
        counter_spaces = counter_chars - len_whole_unspace

        # set alphabetical counter
        counter_alphabetical = counter_alphanumerical_chars - counter_numbers

        if counter_chars == 0:
            self.cpr.printw("no chars in line:", str(line['line_index']),
                            "no features here")
            return False

        special_chars_ratio = counter_special_chars / counter_chars
        alphanumerical_chars_ratio = counter_alphanumerical_chars / counter_chars
        alphabetical_ratio = counter_alphabetical / counter_chars
        spaces_ratio = counter_spaces / counter_chars
        numbers_ratio = counter_numbers / counter_chars

        # gap statistics stay None if the line has no gaps
        maximum_x_gap = None
        mean_x_gap = None
        median_x_gap = None
        if len(x_gaps) >= 1:
            maximum_x_gap = max(x_gaps)
            mean_x_gap = np.mean(x_gaps)
            median_x_gap = np.median(x_gaps)

        many_numbers_in_first_word = False
        many_alphabetical_in_middle_words = False
        many_alphabetical_in_last_word = False

        # check some first, middle and last word conditions
        last_counter_index = len(counters_wordlengths) - 1
        for counter_index, counter in enumerate(counters_wordlengths):
            if counter_index == 0:
                numbers_ratio_word = np.round(
                    counters_numbers[counter_index] / counter, 2)
                if numbers_ratio_word > 0.8:
                    many_numbers_in_first_word = True
            elif counter_index == last_counter_index:
                if counter >= 4:
                    if counters_alphabetical_ratios[counter_index] >= 0.75:
                        many_alphabetical_in_last_word = True
            else:
                if counter >= 4:
                    if counters_alphabetical_ratios[counter_index] >= 0.75:
                        many_alphabetical_in_middle_words = True

        final_line_features = LineFeatures(cpr=self.cpr)

        final_line_features.counter_special_chars = counter_special_chars
        final_line_features.counter_chars = counter_chars
        final_line_features.counter_spaces = counter_spaces
        final_line_features.counter_numbers = counter_numbers
        final_line_features.counter_alphabetical = counter_alphabetical
        final_line_features.counter_alphanumerical_chars = counter_alphanumerical_chars
        final_line_features.counter_words = counter_words

        final_line_features.counters_numbers = counters_numbers
        final_line_features.counters_wordlengths = counters_wordlengths
        final_line_features.counters_alphabetical_ratios = counters_alphabetical_ratios

        final_line_features.numbers_ratio = numbers_ratio
        final_line_features.alphabetical_ratio = alphabetical_ratio
        final_line_features.alphanumerical_chars_ratio = alphanumerical_chars_ratio
        final_line_features.special_chars_ratio = special_chars_ratio
        final_line_features.spaces_ratio = spaces_ratio

        final_line_features.many_alphabetical_in_last_word = many_alphabetical_in_last_word
        final_line_features.many_alphabetical_in_middle_words = many_alphabetical_in_middle_words
        final_line_features.many_numbers_in_first_word = many_numbers_in_first_word

        final_line_features.x_box_sizes = x_box_sizes
        final_line_features.x_gaps = x_gaps
        final_line_features.maximum_x_gap = maximum_x_gap
        final_line_features.mean_x_gap = mean_x_gap
        final_line_features.median_x_gap = median_x_gap

        return final_line_features
class TableHandler(object):
    """Heuristically decide whether a recognized OCR line is a table line."""

    def __init__(self):
        config_handler = ConfigurationHandler(first_init=False)
        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_TABLE_HANDLER, self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL)

        # if True, every accepted line is appended to "checkfile_tables.txt"
        self.PRINT_TO_CHECKFILE = False
        # a line starting with these words can't be in a table
        self.filter_start_words = ["Fernruf:", "Vorstand:", "Fernschreiber:",
                                   "von", "Gründung:", "Ordnungsnr.", "Ordnungsnr",
                                   "Grundkapital:", "Umstellung"]

    def recognize_a_line(self, line):
        """Check if the given line looks like an entry of a table.

        :param line: line object providing ``textstr``, the dictionaries
            ``word['text']`` / ``word['UID']`` and the coordinate lists
            ``data['word_x0']`` / ``data['word_x1']``; ``None`` and booleans
            are accepted as "no line"
        :return: ``True`` if the line passes the table heuristics, otherwise
            ``False`` (also for missing lines and empty text)
        """
        if line is None or line is False or line is True or line.textstr is None:
            return False

        whole_text = line.textstr
        self.cpr.print("recognizing line:", whole_text)

        # counters over the whole line
        counter_special_chars = 0
        counter_alphanumerical_chars = 0
        counter_numbers = 0
        counter_chars = len(whole_text)
        counter_words = 0

        # per word statistics (only non-empty words get an entry)
        counters_alphabetical_ratios = []
        counters_wordlengths = []
        counters_numbers = []

        # position within the coordinate lists, advanced by the length of
        # the UID entry of every non-empty word
        character_index = 0

        # special conditions
        ultimo_is_first_word = False
        first_word_no_table_indicator = False
        starts_with_parenthesis = False
        ends_with_parenthesis = False

        last_xstop = 0
        x_box_sizes = []
        x_gaps = []

        word_texts = line.word['text']
        last_key_index = len(word_texts) - 1

        for key_index, key in enumerate(word_texts):
            word = word_texts[key]
            uid_info = line.word['UID'][key]

            word_xstart = line.data['word_x0'][character_index]
            word_xstop = line.data['word_x1'][character_index]
            x_box_sizes.append(word_xstop - word_xstart)

            if key_index >= 1:
                # distance between the right edge of this word and the right
                # edge of the last non-empty word
                x_gaps.append(word_xstop - last_xstop)

            if word is None or word == "":
                continue

            if key_index == 0:
                if word in self.filter_start_words:
                    first_word_no_table_indicator = True
                if word.lower() == "ultimo":
                    ultimo_is_first_word = True
                if word[0] == "(":
                    starts_with_parenthesis = True

            if key_index == last_key_index:
                if word[-1] == ")":
                    ends_with_parenthesis = True

            counter_alphanumerical_chars_word = 0
            counter_numbers_word = 0
            counter_words += 1

            for char in word:
                if Random.is_special_character(char):
                    counter_special_chars += 1
                elif Random.is_alphanumerical_character(char):
                    counter_alphanumerical_chars += 1
                    counter_alphanumerical_chars_word += 1
                if char.isdigit():
                    counter_numbers += 1
                    counter_numbers_word += 1

            counter_alphabetical_word = counter_alphanumerical_chars_word - counter_numbers_word
            ratio_alphabetical_word = np.round(counter_alphabetical_word / len(word), 2)
            counters_alphabetical_ratios.append(ratio_alphabetical_word)
            counters_wordlengths.append(len(word))
            counters_numbers.append(counter_numbers_word)
            character_index += len(uid_info)
            last_xstop = word_xstop

        # get number of spaces
        len_whole_unspace = len(whole_text.replace(" ", ""))
        counter_spaces = counter_chars - len_whole_unspace

        # set alphabetical counter
        counter_alphabetical = counter_alphanumerical_chars - counter_numbers

        if counter_chars == 0:
            self.cpr.printw("no chars shouldn't happen, no recognizion")
            return False

        special_chars_ratio = counter_special_chars / counter_chars
        alphanumerical_chars_ratio = counter_alphanumerical_chars / counter_chars
        alphabetical_ratio = counter_alphabetical / counter_chars
        spaces_ratio = counter_spaces / counter_chars
        numbers_ratio = counter_numbers / counter_chars

        # gap statistics stay None if the line has no gaps
        maximum_x_gap = None
        mean_x_gap = None
        median_x_gap = None
        if len(x_gaps) >= 1:
            maximum_x_gap = max(x_gaps)
            mean_x_gap = np.mean(x_gaps)
            median_x_gap = np.median(x_gaps)

        many_numbers_in_first_word = False
        many_alphabetical_in_middle_words = False
        many_alphabetical_in_last_word = False

        # check some first, middle and last word conditions
        last_counter_index = len(counters_wordlengths) - 1
        for counter_index, counter in enumerate(counters_wordlengths):
            if counter_index == 0:
                numbers_ratio_word = np.round(counters_numbers[counter_index] / counter, 2)
                if numbers_ratio_word > 0.8:
                    many_numbers_in_first_word = True
            elif counter_index == last_counter_index:
                if counter >= 4:
                    if counters_alphabetical_ratios[counter_index] >= 0.75:
                        many_alphabetical_in_last_word = True
            else:
                if counter >= 4:
                    if counters_alphabetical_ratios[counter_index] >= 0.75:
                        many_alphabetical_in_middle_words = True

        self.cpr.print("alle cntr:", counter_chars)
        self.cpr.print("spec cntr:", counter_special_chars, "ratio", special_chars_ratio)
        self.cpr.print("alnr cntr:", counter_alphanumerical_chars, "ratio", alphanumerical_chars_ratio)
        self.cpr.print("albt cntr:", counter_alphabetical, "ratio", alphabetical_ratio)
        self.cpr.print("spce cntr:", counter_spaces, "ratio", spaces_ratio)
        self.cpr.print("nmbr cntr:", counter_numbers, "ratio", numbers_ratio)
        self.cpr.print("x_box_sizes", x_box_sizes)
        self.cpr.print("x_gaps", x_gaps)
        self.cpr.print("x_gap_max_size", maximum_x_gap)
        self.cpr.print("x_gaps_mean", mean_x_gap)
        self.cpr.print("x_gaps_median", median_x_gap)

        looks_numeric = (alphabetical_ratio < 0.75
                         and numbers_ratio > 0.2
                         and counter_chars > 5
                         and counter_words >= 2)
        fully_parenthesized = starts_with_parenthesis and ends_with_parenthesis

        if not ((looks_numeric and not fully_parenthesized) or ultimo_is_first_word):
            return False

        if first_word_no_table_indicator:
            return False

        # a line without any gap (e.g. a single word "ultimo") has no mean
        # gap; treat it as not passing the gap criterion instead of comparing
        # None with a number
        if mean_x_gap is None or mean_x_gap <= 115:
            return False

        if many_alphabetical_in_last_word:
            return False

        if many_alphabetical_in_middle_words and many_numbers_in_first_word:
            return False

        self.cpr.print("possible entry:", whole_text)

        if self.PRINT_TO_CHECKFILE:
            with open("checkfile_tables.txt", "a") as myfile:
                myfile.write(whole_text + "||| max x_gap: " + str(maximum_x_gap)
                             + "||| mean x_gap: " + str(mean_x_gap)
                             + "||| median x_gap: " + str(median_x_gap) + "\n")

        return True
class OCRVoter(object):
    """Vote character by character between three aligned OCR line results.

    Besides plain majority voting the class offers confidence based voting,
    optionally combined with a search space preprocessing, a character
    predictor and a vocabulary based correction of the voted words.
    """

    def __init__(self):
        config_handler = ConfigurationHandler(first_init=False)
        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_MSA_HANDLER, self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL)
        self.cpr_vocab_check = ConditionalPrint(self.config.PRINT_VOCABULARY_CHECKER,
                                                self.config.PRINT_EXCEPTION_LEVEL,
                                                self.config.PRINT_WARNING_LEVEL)
        self.cpr_sc_predict = ConditionalPrint(self.config.PRINT_SPECIALCHAR_PREDICTOR,
                                               self.config.PRINT_EXCEPTION_LEVEL,
                                               self.config.PRINT_WARNING_LEVEL)

        # holds the last voted characters as input for the predictor
        self.filo_last_chars = Filo(250)
        self.predictor = None
        self.use_aufsichtsrat_prediction = False
        self.vocab_checker = None
        # remembers over calls that the last word of the previous text ended
        # with a hyphen
        self.previous_word_with_seperator = False

    def add_predictor(self, predictor):
        """Set the predictor used by :meth:`predict_char`."""
        self.predictor = predictor

    def add_vocab_checker(self, vocab_checker):
        """Set the vocabulary checker used for word corrections."""
        self.vocab_checker = vocab_checker

    def get_same_count(self, c1, c2, c3):
        """Return how many of ``c2`` and ``c3`` are equal to ``c1`` (0-2)."""
        same_ctr = 0
        if c1 == c2:
            same_ctr += 1
        if c1 == c3:
            same_ctr += 1
        return same_ctr

    def get_confidence_count(self, char1, char2, char3, cconf1, cconf2, cconf3, wildcard_char='¦'):
        """Accumulate the confidence for ``char1`` over all agreeing lines.

        :param char1: character the count is calculated for
        :param char2: character of the second line at the same position
        :param char3: character of the third line at the same position
        :param cconf1: confidence of ``char1``
        :param cconf2: confidence of ``char2``
        :param cconf3: confidence of ``char3``
        :param wildcard_char: character marking an alignment gap
        :return: tuple ``(same_count, accumulated_confidence)``
        """
        same_ctr = 0
        cconf_ctr = float(cconf1)

        if char1 == char2:
            same_ctr += 1
            cconf_ctr += float(cconf2)
        if char1 == char3:
            same_ctr += 1
            cconf_ctr += float(cconf3)

        # special cases space: ' ', ' ', 'x'
        # wildcard character : '¦', '¦', '¦'
        if char1 == ' ' and same_ctr == 1:
            # two spaces against one other character get a fixed confidence
            # TODO: an alternative which only pushes the space if the other
            # character's confidence is below a threshold was never active
            return 1, 95.0
        elif char1 == wildcard_char and same_ctr == 1:
            # if there are two wildcards and one character, the character's
            # confidence has to be higher than the wildcard threshold to win
            wildcard_tresh = 98.5
            if self.config.MSA_BEST_CHANGE_VOTING_TRESHS_ON_EMPTY_LINE:
                wildcard_tresh -= 10  # 0:99,19%, 20:99.16%, 10:99.27%
            return 1, wildcard_tresh
        elif char1 == wildcard_char and same_ctr == 0:
            # TODO maybe cover this case (wildcard has no confidence, i.e. if
            # the two other chars have very low confidence, take wildcard)
            pass
        elif char1 == '' and same_ctr == 0:
            # TODO maybe cover this case (space has no confidence)
            pass
        elif self.config.MSA_BEST_VOTING_DOWNSCALE_ONLY_SC \
                and Random.is_special_character(char1) and same_ctr == 0 \
                and char2 == wildcard_char and char3 == wildcard_char:
            # lower the confidence of special characters which stand without
            # any other chars
            return same_ctr, cconf_ctr * 0.9

        return same_ctr, cconf_ctr

    def vote_best_of_three_simple(self, text_1, text_2, text_3, index_best, wildcard_character='¦'):
        """Vote the most frequent character at every position of ``text_1``.

        On a tie the character of ``text_2`` wins, then ``text_1``, then
        ``text_3``. Positions missing in ``text_2`` or ``text_3`` are treated
        as wildcard characters; characters beyond the length of ``text_1``
        are ignored.

        :param text_1: first aligned text, defines the voted length
        :param text_2: second aligned text
        :param text_3: third aligned text
        :param index_best: unused, kept for interface compatibility
        :param wildcard_character: character marking an alignment gap
        :return: tuple ``(voted_text, voted_text_without_wildcards)``
        """
        accumulated_chars = ""

        for character_index, character_1 in enumerate(text_1):
            if character_index < len(text_2):
                character_2 = text_2[character_index]
            else:
                character_2 = wildcard_character
            if character_index < len(text_3):
                character_3 = text_3[character_index]
            else:
                character_3 = wildcard_character

            # get the character which occurs the most
            sc1 = self.get_same_count(character_1, character_2, character_3)
            sc2 = self.get_same_count(character_2, character_1, character_3)
            sc3 = self.get_same_count(character_3, character_2, character_1)
            # the order of the list defines the priority on equal counts
            maxindices = np.argmax([sc2, sc1, sc3])
            if maxindices == 0:
                accumulated_chars += character_2
            elif maxindices == 1:
                accumulated_chars += character_1
            else:
                accumulated_chars += character_3

        accumulated_chars_stripped = accumulated_chars.replace(wildcard_character, '')

        return accumulated_chars, accumulated_chars_stripped

    def vote_best_of_three_charconfs(self, line_1, line_2, line_3, index_best, wildcard_character='¦'):
        """Vote the characters of three lines by accumulated confidences.

        :param line_1: first line object (``textstr`` and ``value(...)``)
        :param line_2: second line object
        :param line_3: third line object
        :param index_best: unused, kept for interface compatibility
        :param wildcard_character: character removed from the stripped result
        :return: tuple ``(voted_text, voted_text_without_wildcards)``; if an
            exception occurs it is printed and ``None`` is returned
        """
        try:

            def try_obtain_charconf(value, undef_value=0):
                # missing or boolean confidences count as undefined
                if value is None or value is False or value is True:
                    return undef_value
                return value

            key_confs = 'x_confs'
            key_char = 'calc_char'
            self.cpr.print("vote_text1", line_1.textstr)
            self.cpr.print("vote_text2", line_2.textstr)
            self.cpr.print("vote_text3", line_3.textstr)

            maximum_char_number = max(len(line_1.textstr), len(line_2.textstr), len(line_3.textstr))

            accumulated_chars = ""

            for character_index in range(0, maximum_char_number):  # check: is list 1 always best reference?

                character_1 = line_1.value(key_char, character_index)
                character_2 = line_2.value(key_char, character_index)
                character_3 = line_3.value(key_char, character_index)

                charconf_1 = try_obtain_charconf(line_1.value(key_confs, character_index, wsval=50.0))
                charconf_2 = try_obtain_charconf(line_2.value(key_confs, character_index, wsval=50.0))
                charconf_3 = try_obtain_charconf(line_3.value(key_confs, character_index, wsval=50.0))

                # get the character with the highest accumulated confidence
                _, acc_conf_1 = self.get_confidence_count(character_1, character_2, character_3,
                                                          charconf_1, charconf_2, charconf_3)
                _, acc_conf_2 = self.get_confidence_count(character_2, character_1, character_3,
                                                          charconf_2, charconf_1, charconf_3)
                _, acc_conf_3 = self.get_confidence_count(character_3, character_2, character_1,
                                                          charconf_3, charconf_2, charconf_1)

                # the order of the list defines the priority in case the
                # accumulated confidences are the same
                maxindices = np.argmax([acc_conf_2, acc_conf_1, acc_conf_3])

                # TODO: import to config
                # skip an 'I' at the last position which only line 1 contains
                if character_index == maximum_char_number - 1 and character_2 == "¦" \
                        and character_3 == "¦" and character_1 == "I":
                    continue

                if self.config.MSA_BEST_VOTER_DROP_CHARS_BELOW_TRESH == True:
                    tresh = self.config.MSA_BEST_VOTER_DROPPING_TRESH
                    maximum_conf = max(acc_conf_1, acc_conf_2, acc_conf_3)
                    if maximum_conf < tresh:
                        # low confidence winners are dropped, wildcards stay
                        if [character_2, character_1, character_3][maxindices] != '¦':
                            continue

                if maxindices == 0:
                    accumulated_chars += character_2
                elif maxindices == 1:
                    accumulated_chars += character_1
                else:
                    accumulated_chars += character_3

            accumulated_chars_stripped = accumulated_chars.replace(wildcard_character, '')

            return accumulated_chars, accumulated_chars_stripped
        except Exception as ex:
            # best effort: the problem is reported and None is returned
            tr = inspect.trace()
            self.cpr.printex("ocr_voter.py Exception during confidence vote:", ex)
            self.cpr.printex("trace is:", tr)

    def increase_umlaut_confidence(self, chars, charconfs):
        """Return the confidences with increments for umlauts/special chars.

        :param chars: sequence of characters
        :param charconfs: confidences, same order and length as ``chars``
        :return: new list of confidences
        """
        charconfs_adapted = []

        for char_index, char in enumerate(chars):
            if char in SpecialChars.umlauts_caps or char in SpecialChars.umlauts:
                cconf_to_add = charconfs[char_index] + SpecialChars.umlaut_increment
            elif char in SpecialChars.special_chars:
                cconf_to_add = charconfs[char_index] + SpecialChars.special_char_increment
            else:
                cconf_to_add = charconfs[char_index]

            charconfs_adapted.append(cconf_to_add)

        return charconfs_adapted

    def vote_best_of_three_charconfs_searchspaces(self, line_1, line_2, line_3, index_best,
                                                  wildcard_character='¦'):
        """Vote by confidences after processing the chars in a search space.

        Characters and confidences of the three lines are pushed column by
        column into two search spaces; the voting uses the values read back
        from these search spaces. Depending on the configuration the voted
        character can be replaced by a predicted one and the voted text is
        corrected with the vocabulary checker.

        :param line_1: first line object (``textstr``, ``name``, ``value``)
        :param line_2: second line object
        :param line_3: third line object
        :param index_best: unused, kept for interface compatibility
        :param wildcard_character: character marking an alignment gap
        :return: tuple ``(voted_text, voted_text_without_wildcards)``; if an
            exception occurs it is printed and ``None`` is returned
        """
        try:
            key_confs = 'x_confs'
            key_char = 'calc_char'
            self.cpr.print("vote_text1", line_1.textstr)
            self.cpr.print("vote_text2", line_2.textstr)
            self.cpr.print("vote_text3", line_3.textstr)

            maximum_char_number = max(len(line_1.textstr), len(line_2.textstr), len(line_3.textstr))

            accumulated_chars = ""
            accumulated_confs = Filo(300)

            # search space settings
            SEARCH_SPACE_Y_SIZE = 3
            SEARCH_SPACE_X_SIZE_OUTER = 7
            SEARCH_SPACE_X_SIZE_INNER = 3
            SEARCH_SPACE_X_SEARCH_RANGE = 1
            SEARCH_SPACE_PROCESSING_SUBSTITUTION_CHAR = '¦'
            SEARCH_SPACE_PROCESSING_USE_SIMILAR_CHARS = True
            PRINT_MATRICES = self.config.PRINT_SEARCH_SPACE_MATRICES

            # initialize search space processor and search spaces
            search_space_processor = SearchSpaceProcessor(SEARCH_SPACE_Y_SIZE, SEARCH_SPACE_X_SIZE_INNER,
                                                          wildcard_character,
                                                          SEARCH_SPACE_PROCESSING_SUBSTITUTION_CHAR)

            ssp_chars = SearchSpace(SEARCH_SPACE_Y_SIZE, SEARCH_SPACE_X_SIZE_OUTER,
                                    SEARCH_SPACE_X_SEARCH_RANGE, True)
            ssp_confs = SearchSpace(SEARCH_SPACE_Y_SIZE, SEARCH_SPACE_X_SIZE_OUTER,
                                    SEARCH_SPACE_X_SEARCH_RANGE, True)

            # check if one of the lines is empty for certain settings
            one_line_empty = False
            if self.config.MSA_BEST_VOTER_PUSH_LESS_LINES_WHITESPACE_CONFS or \
                    self.config.MSA_BEST_CHANGE_VOTING_TRESHS_ON_EMPTY_LINE:
                one_line_empty = self.check_if_one_line_empty([line_1, line_2, line_3], wildcard_character)

            # loop through the maximum character range of the lines, extended
            # by padding columns so the last characters leave the search space
            range_extension = SEARCH_SPACE_X_SIZE_INNER
            for character_index in range(0, maximum_char_number + range_extension + 2):  # check: is list 1 always best reference?

                if character_index < maximum_char_number:
                    # there is a character within range (no padding char from
                    # extension): get character values and obtain
                    # corresponding confidences
                    line_vals = [line_1.value(key_char, character_index),
                                 line_2.value(key_char, character_index),
                                 line_3.value(key_char, character_index)]

                    line_1_conf = line_1.value(key_confs, character_index, wsval=50.0)
                    line_2_conf = line_2.value(key_confs, character_index, wsval=50.0)
                    line_3_conf = line_3.value(key_confs, character_index, wsval=50.0)

                    charconf_1 = self.try_obtain_charconf_searchspace(line_1_conf, line_vals[0],
                                                                      engine_key=line_1.name[0],
                                                                      one_line_empty=one_line_empty)
                    charconf_2 = self.try_obtain_charconf_searchspace(line_2_conf, line_vals[1],
                                                                      engine_key=line_2.name[0],
                                                                      one_line_empty=one_line_empty)
                    charconf_3 = self.try_obtain_charconf_searchspace(line_3_conf, line_vals[2],
                                                                      engine_key=line_3.name[0],
                                                                      one_line_empty=one_line_empty)
                    charconf_vals = [charconf_1, charconf_2, charconf_3]
                else:
                    # the index is within padding range: just give none
                    # values for characters and confidences
                    line_vals = [None, None, None]
                    charconf_vals = [None, None, None]

                # fill searchspace with the chars and confidences
                ssp_chars.push_column(line_vals)
                ssp_confs.push_column(charconf_vals)

                # update the mid-window of the search space (this is the
                # actual search space processing step)
                mid_chars = ssp_chars.get_middle_matrix(PRINT_MATRICES)
                mid_confs = ssp_confs.get_middle_matrix(PRINT_MATRICES)
                mid_chars_processed, mid_confs_processed, change_done = \
                    search_space_processor.process_search_space(mid_chars, mid_confs,
                                                                SEARCH_SPACE_PROCESSING_USE_SIMILAR_CHARS)
                if change_done is True:
                    ssp_chars.update_middle_matrix(mid_chars_processed)
                    ssp_confs.update_middle_matrix(mid_confs_processed)

                # extract (possibly changed) values from the search space
                character_offset = -(SEARCH_SPACE_X_SEARCH_RANGE + 1)
                character_1 = ssp_chars.get_value_around_middle(0, character_offset)
                character_2 = ssp_chars.get_value_around_middle(1, character_offset)
                character_3 = ssp_chars.get_value_around_middle(2, character_offset)
                charconf_1 = ssp_confs.get_value_around_middle(0, character_offset)
                charconf_2 = ssp_confs.get_value_around_middle(1, character_offset)
                charconf_3 = ssp_confs.get_value_around_middle(2, character_offset)
                if character_1 is None or character_2 is None or character_3 is None:
                    continue

                # in case umlaut confidence increment is active change
                # charconfs, otherwise same charconfs
                charconf_1, charconf_2, charconf_3 = self.increase_umlaut_confidence_searchspace(
                    character_1, character_2, character_3, charconf_1, charconf_2, charconf_3)

                # get the previously voted characters as string (mainly for
                # the predictor)
                filo_content = self.filo_last_chars.get_content_as_string()

                # trigger predicted section for aufsichtsrat predictor
                self.toggle_predictor(filo_content)

                # predict_char if predictor is enabled
                predicted_char = self.predict_char(filo_content)

                # get the character which occurs the most by accumulating
                # confidence scores
                _, acc_conf_1 = self.get_confidence_count(character_1, character_2, character_3,
                                                          charconf_1, charconf_2, charconf_3)
                _, acc_conf_2 = self.get_confidence_count(character_2, character_1, character_3,
                                                          charconf_2, charconf_1, charconf_3)
                _, acc_conf_3 = self.get_confidence_count(character_3, character_2, character_1,
                                                          charconf_3, charconf_2, charconf_1)

                # the order of the list defines the priority in case the
                # accumulated confidences are the same
                maxindices = np.argmax([acc_conf_2, acc_conf_1, acc_conf_3])

                # skip an 'I' at the last index which only line 1 contains
                if character_index == maximum_char_number + range_extension + 1 and character_2 == "¦" \
                        and character_3 == "¦" and character_1 == "I":
                    continue

                # drop chars completely if they fall below a certain dropping
                # treshhold and the setting is active
                if self.config.MSA_BEST_VOTER_DROP_CHARS_BELOW_TRESH == True:
                    tresh = self.config.MSA_BEST_VOTER_DROPPING_TRESH
                    maximum_conf = max(acc_conf_1, acc_conf_2, acc_conf_3)
                    if maximum_conf < tresh:
                        if [character_2, character_1, character_3][maxindices] != '¦':
                            continue

                # determine character with the best accumulated confidence
                if maxindices == 0:
                    voted_char = character_2
                    voted_acc_conf = acc_conf_2
                elif maxindices == 1:
                    voted_char = character_1
                    voted_acc_conf = acc_conf_1
                else:
                    voted_char = character_3
                    voted_acc_conf = acc_conf_3

                # if predictor is active, check if there is a better char
                # predicted which can replace the voted character
                voted_char = self.maybe_replace_voted_by_predicted_char(
                    voted_char, self.use_aufsichtsrat_prediction, predicted_char, wildcard_character,
                    voted_acc_conf, character_1, character_2, character_3)

                # push the voted char and the accumulated confidence of this
                # char to results
                accumulated_confs.push(voted_acc_conf)
                accumulated_chars += voted_char

                # if the predictor is enabled fill the filo with the voted_char
                self.fill_filo_last_chars(voted_char)

            # do vocabulary related steps, if activated
            accumulated_chars = self.vocabulary_related_corrections(accumulated_chars, wildcard_character,
                                                                    accumulated_confs)

            # remove the wildcard characters and return result
            accumulated_chars_stripped = accumulated_chars.replace(wildcard_character, '')
            return accumulated_chars, accumulated_chars_stripped

        except Exception as ex:
            # best effort: the problem is reported and None is returned
            tr = inspect.trace()
            self.cpr.printex("ocr_voter.py Exception during confidence vote", ex)
            self.cpr.printex("trace", tr)

    def vocabulary_related_corrections(self, accumulated_chars, wildcard_character, accumulated_confs):
        """Correct the voted words with the vocabulary checker, if activated.

        :param accumulated_chars: voted text (may contain wildcards)
        :param wildcard_character: character marking an alignment gap
        :param accumulated_confs: container with the accumulated confidences
            of the voted characters; ``pop_multi`` is called per word
        :return: the corrected text, words joined with a trailing space each;
            the unchanged input if the correction is deactivated
        """
        if self.config.KEYING_RESULT_VOCABULARY_CORRECTION_VOTE:
            accumulated_chars_final = ""
            acc_split = accumulated_chars.split()
            len_split = len(acc_split)

            for word_index, word in enumerate(acc_split):

                if self.config.KEYING_RESULT_VC_IGNORE_SEPERATE_WRITING_CORRECTION:
                    # a word hyphenated at the end of the text and its
                    # continuation at the start of the next text stay as is
                    if word_index == len_split - 1 and word.replace(wildcard_character, "").endswith('-'):
                        self.previous_word_with_seperator = True
                        accumulated_chars_final += word + " "
                        continue
                    if word_index == 0:
                        if self.previous_word_with_seperator is True:
                            self.previous_word_with_seperator = False
                            accumulated_chars_final += word + " "
                            continue

                acc_confs_word = accumulated_confs.pop_multi(len(word))
                acc_conf, rate, change, word_starting_borders, word_trailing_borders, word_reduced = \
                    self.vocab_checker.get_accumulated_confidence_rate(word, acc_confs_word, wildcard_character)
                self.cpr_vocab_check.print("w:", word, "wr:", word_reduced, "accr:", acc_conf, "rate", rate)

                # don't correct words below min vocab length (mind that
                # special chars in dict are toggled)
                check_len = len(word)
                if self.config.KEYING_RESULT_VC_DICT_REMOVE_SPECIAL_BORDER_CHARS:
                    check_len = len(word_reduced)
                if check_len < self.config.KEYING_RESULT_VC_MIN_VOCAB_WORD_LENGTH:
                    accumulated_chars_final += word + " "
                    continue

                if self.config.KEYING_RESULT_VC_CORRECT_ONLY_ERRONOUS_CHARS:
                    swappable_char_indices = []

                    if self.config.KEYING_RESULT_VC_CORRECT_ERRONOUS_SPECIAL_CHARS:
                        # use the full length confidences array including
                        # trailing and leading special characters
                        acc_confs_used = acc_confs_word
                        word_used = word
                    else:
                        # don't use trailing and starting special characters
                        # if no special chars needed
                        acc_confs_used = acc_confs_word[
                            len(word_starting_borders):(len(acc_confs_word) - len(word_trailing_borders))]
                        word_used = word_reduced

                    for conf_index, conf in enumerate(acc_confs_used):
                        if self.config.KEYING_RESULT_VC_CORRECT_ERRONOUS_SPECIAL_CHARS:
                            if conf <= 250:
                                character_related = word_used[conf_index]
                                is_special_char = Random.is_special_character(character_related)
                                if is_special_char and character_related != wildcard_character:
                                    # only swap special character indices
                                    swappable_char_indices.append(conf_index)
                        else:
                            if conf <= 215:
                                swappable_char_indices.append(conf_index)

                    if len(swappable_char_indices) >= 1:
                        word_reduced_correct = self.vocab_checker.correct_text_at_certain_indices_only(
                            word_used, swappable_char_indices)
                        if word_reduced_correct is not None:
                            if self.config.KEYING_RESULT_VC_CORRECT_ERRONOUS_SPECIAL_CHARS:
                                if Random.has_special_character(word_reduced_correct):
                                    # if special character was replaced with
                                    # special character
                                    word_correct_withtrails = word_reduced_correct
                                else:
                                    # if special character was replaced by
                                    # alphanumerical character
                                    word_correct_withtrails = word
                            else:
                                word_correct_withtrails = \
                                    word_starting_borders + word_reduced_correct + word_trailing_borders

                            # only print the changed results
                            if word != word_correct_withtrails:
                                self.cpr_vocab_check.print("w:", word, "wc:", word_correct_withtrails,
                                                           "accr:", acc_conf, "rate", rate)

                            accumulated_chars_final += word_correct_withtrails + " "
                        else:
                            accumulated_chars_final += word + " "
                    else:
                        accumulated_chars_final += word + " "

                    continue

                if rate < self.config.KEYING_RESULT_VOCABULARY_CORRECTION_VOTE_TRESH \
                        and len(word_reduced) > 2:
                    # if the rate drops below tresh, try to fetch vocab entry
                    word_reduced_correct, suggestions, flh = self.vocab_checker.correct_text(word_reduced)
                    if word_reduced_correct is not None and word_reduced_correct != word_reduced:
                        word_correct_withtrails = \
                            word_starting_borders + word_reduced_correct + word_trailing_borders
                        self.cpr_vocab_check.print("w:", word, "wc:", word_correct_withtrails,
                                                   "accr:", acc_conf, "rate", rate)
                        accumulated_chars_final += word_correct_withtrails + " "
                    else:
                        accumulated_chars_final += word + " "
                else:
                    accumulated_chars_final += word + " "

            accumulated_chars = accumulated_chars_final

        return accumulated_chars

    def try_obtain_charconf_searchspace(self, value_confidence, value, undef_value=0, engine_key=None,
                                        one_line_empty=False):
        """Return the (optionally engine-scaled) confidence of a character.

        :param value_confidence: raw confidence; ``None`` and booleans count
            as undefined
        :param value: the character the confidence belongs to
        :param undef_value: returned for undefined confidences
        :param engine_key: engine identifier ('Abbyy', 'Tess' or 'Ocro')
        :param one_line_empty: result of :meth:`check_if_one_line_empty`
        :return: the adapted confidence
        """
        if value_confidence is None or value_confidence is False or value_confidence is True:
            return undef_value

        returnvalue = value_confidence

        if self.config.MSA_BEST_VOTER_SCALE_ENGINE_CONFIDENCES and engine_key is not None:
            if engine_key == 'Abbyy':
                if self.config.MSA_BEST_INCREASE_CONFIDENCE_OF_SOME_ABBYY_CHARS:
                    if value == "%":
                        # raise the confidence of '%' before scaling
                        value_confidence = value_confidence + 80
                returnvalue = ConfidenceModifications.abby_factor * value_confidence
            elif engine_key == 'Tess':
                returnvalue = ConfidenceModifications.tesseract_factor * value_confidence
            elif engine_key == 'Ocro':
                returnvalue = ConfidenceModifications.ocropus_factor * value_confidence

        if (self.config.MSA_BEST_VOTER_PUSH_LESS_LINES_WHITESPACE_CONFS and one_line_empty and value == " ") \
                or (self.config.MSA_BEST_VOTER_PUSH_WHITESPACE_IF_MOSTLY_WILDCARD and one_line_empty
                    and value == " "):
            returnvalue += ConfidenceModifications.whitespace_push

        return returnvalue

    def check_if_one_line_empty(self, lines, wildcard_character):
        """Check if at least one line consists only of wildcards.

        With ``MSA_BEST_VOTER_PUSH_WHITESPACE_IF_MOSTLY_WILDCARD`` enabled a
        line with a wildcard ratio above 0.70 also counts as empty.

        :param lines: line objects providing ``textstr``
        :param wildcard_character: character marking an alignment gap
        :return: ``True`` if one line is (mostly) empty, otherwise ``False``
        """
        for line in lines:
            text_wo_wildcards = line.textstr.replace(wildcard_character, '')
            if text_wo_wildcards == "":
                return True
            if self.config.MSA_BEST_VOTER_PUSH_WHITESPACE_IF_MOSTLY_WILDCARD:
                # also count in high wildcard ratios as empty line
                wildcard_ratio = 1 - (len(text_wo_wildcards) / len(line.textstr))
                if wildcard_ratio > 0.70:
                    return True
        return False

    def toggle_predictor(self, filo_content):
        """Switch the aufsichtsrat prediction on or off by the last content.

        :param filo_content: the previously voted characters as string
        """
        if self.config.PREDICTOR_AUFSICHTSRAT_ENABLED:
            if "Aufsichtsrat" in filo_content:
                self.use_aufsichtsrat_prediction = True
            if "Gründung:" in filo_content:
                self.use_aufsichtsrat_prediction = False

    def predict_char(self, filo_content):
        """Predict the next character if the prediction is switched on.

        :param filo_content: the previously voted characters as string
        :return: the predicted character or ``None``
        """
        predicted_char = None
        if self.use_aufsichtsrat_prediction:
            if len(filo_content) >= 19:  # if filo_content bigger than one prediction chunk
                len_aufsichtsrat = 19
                predicted_char = self.predictor.predict_next_aufsichtsrat_chars(len_aufsichtsrat, filo_content)

        return predicted_char

    def fill_filo_last_chars(self, voted_char):
        """
        fill filo for predictor usage with voted_char some additional chars around this char
        :param voted_char:
        :return:
        """
        if self.config.PREDICTOR_AUFSICHTSRAT_ENABLED:
            # create pre semi-tokenized input strings in the filos from the
            # voted characters for prediction
            if voted_char == ' ':
                # the models usally use the 'ƿ' char in substitution for spaces
                self.filo_last_chars.push(' ', filterchar='¦')
                self.filo_last_chars.push('ƿ', filterchar='¦')
                self.filo_last_chars.push(' ', filterchar='¦')
            elif Random.is_special_character(voted_char):
                self.filo_last_chars.push(' ', filterchar='¦')
                self.filo_last_chars.push(voted_char, filterchar='¦')
                self.filo_last_chars.push(' ', filterchar='¦')
            else:
                self.filo_last_chars.push(voted_char, filterchar='¦')

    def increase_umlaut_confidence_searchspace(self, character_1, character_2, character_3,
                                               charconf_1, charconf_2, charconf_3):
        """Apply :meth:`increase_umlaut_confidence` to three characters.

        :return: the three confidences, adapted only if
            ``MSA_BEST_SEARCHSPACE_INCREASE_UMLAUT_CONFIDENCE`` is enabled
        """
        if self.config.MSA_BEST_SEARCHSPACE_INCREASE_UMLAUT_CONFIDENCE:
            clist = [character_1, character_2, character_3]
            conflist = [charconf_1, charconf_2, charconf_3]
            conflist_new = self.increase_umlaut_confidence(clist, conflist)
            return conflist_new[0], conflist_new[1], conflist_new[2]

        return charconf_1, charconf_2, charconf_3

    def maybe_replace_voted_by_predicted_char(self, voted_char, aufsichtsrat_prediction_toggled, predicted_char,
                                              wildcard_character, voted_acc_conf, character_1, character_2,
                                              character_3):
        """Replace the voted character by a predicted special character.

        The swap only happens if the prediction is toggled on, the predicted
        character is a special character different from the voted one, one of
        the involved characters is a special character, the voted character
        is neither wildcard nor formfeed and its accumulated confidence is at
        most 90.0.

        :return: the voted or the predicted character
        """
        if aufsichtsrat_prediction_toggled:
            if Random.is_special_character(predicted_char):
                one_char_sc = Random.is_special_character(character_1) \
                    or Random.is_special_character(character_2) \
                    or Random.is_special_character(character_3)
                voted_char_sc = Random.is_special_character(voted_char)

                if predicted_char != voted_char and (one_char_sc or voted_char_sc) \
                        and voted_char != wildcard_character:
                    self.cpr_sc_predict.print("pc:", predicted_char, "vc:", voted_char, "vc_acc", voted_acc_conf)
                    if voted_acc_conf <= 90.0:
                        if voted_char != '\f':  # don't swap formfeeds, they don't get predicted at all
                            self.cpr_sc_predict.print("swap")
                            voted_char = predicted_char

        return voted_char
class AdditionalInfoHandler(object):
    """Finds and loads a side file (Excel or JSON) holding additional information.

    The side file is searched recursively below a base path; its name has to
    end with ``<fileinfo.dbname>.<filetype>``.
    """

    def __init__(self):
        config_handler = ConfigurationHandler(first_init=False)
        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_ADDITIONAL_INFO_HANDLER,
                                    self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL,
                                    leading_tag=self.__class__.__name__)
        self.cpr.print("init additional info handler")

    @staticmethod
    def _json_output_path(excel_path):
        """Return ``excel_path`` with its file extension swapped for ``.json``."""
        # Only the extension is exchanged. A plain str.replace("xlsx", "json")
        # leaves "*.xls" names untouched (the workbook itself would then be
        # overwritten) and also rewrites "xlsx" inside directory names.
        return path.splitext(excel_path)[0] + ".json"

    @staticmethod
    def _find_additional_files(fileinfo, filepath, filetype):
        """Return all files below ``filepath`` named ``*<dbname>.<filetype>``."""
        pattern = path.normpath(f"{filepath}/**/*{fileinfo.dbname}.{filetype}")
        return glob.glob(pattern, recursive=True)

    def write_excel_to_json(self, fileinfo, filepath, filetype, idxcol=None, parse_cols=None, page=0):
        """Convert the additional-information workbook to a JSON file next to it.

        At the moment a little helper script for the Aktienführer-Project.
        Be free to modify as you wish.

        :param fileinfo: object providing ``dbname``
        :param filepath: base directory which is searched recursively
        :param filetype: file extension to look for; only "xlsx" and "xls" are converted
        :param idxcol: unused, kept for interface compatibility
        :param parse_cols: unused, kept for interface compatibility
        :param page: unused, kept for interface compatibility
        :return: always None
        """
        found = self._find_additional_files(fileinfo, filepath, filetype)
        if len(found) != 1:
            # nothing (or nothing unambiguous) to convert
            return None

        if filetype in ["xlsx", "xls"]:
            df = pd.read_excel(found[0]).set_index("ProfileID")
            jsondata = {fileinfo.dbname: {"Year": fileinfo.dbname}}
            jsondata.update(df.to_dict(orient="index"))
            with open(self._json_output_path(found[0]), "w") as output:
                json.dump(jsondata, output, indent=4)
        return None

    def fetch_additional_information_simple(self, file):
        """
        Same as fetch_additional_information, but the search path, columns and
        file type are taken from the configuration.

        :param file: file info object, passed on as ``fileinfo``
        :return: additional info, or None if ADDITIONAL_INFORMATION is disabled
        """
        if self.config.ADDITIONAL_INFORMATION:
            additional_info = self.fetch_additional_information(
                file, self.config.INPUT_ADDINFOPATH,
                idxcol=self.config.IDXCOL,
                parse_cols=self.config.PARSE_COLS,
                filetype=self.config.INPUT_ADDINFOFILETPYE)
            return additional_info
        return None

    def fetch_additional_information(self, fileinfo, filepath, filetype, idxcol=None, parse_cols=None, page=0):
        """
        Reads an additional file with information and extracts the entries
        which belong to the database name or table name of ``fileinfo``.

        :param fileinfo: object providing ``dbname`` and ``tablename``
        :param filepath: base directory which is searched recursively
        :param filetype: "xlsx"/"xls" or "json"; any other type yields None
        :param idxcol: (Excel only) column whose value is compared with db/table name
        :param parse_cols: (Excel only) columns to read; the list is not modified,
                           None selects every column except ``idxcol``
        :param page: unused, kept for interface compatibility
        :return: dict with additional info, or None if no unique file was found
                 or the file type is not supported
        """
        found = self._find_additional_files(fileinfo, filepath, filetype)
        len_files = len(found)
        if len_files > 1:
            self.cpr.printex("More than one additional information file was found!")
            return None
        if len_files == 0:
            self.cpr.printex("No additional information file was found!")
            return None

        file = found[0]
        current_db_and_table = {"db": fileinfo.dbname, "table": fileinfo.tablename}

        if filetype in ["xlsx", "xls"]:
            info_df = pd.read_excel(file)
            # Work on a copy: parse_cols usually is the list stored in the
            # shared config, removing idxcol in place would break the next call.
            source_cols = info_df.columns if parse_cols is None else parse_cols
            value_cols = [col for col in source_cols if col != idxcol]

            infos = {}
            for db_and_table_id, current_db_and_tablename in current_db_and_table.items():
                collected = infos[db_and_table_id] = {}
                matching_rows = info_df.loc[info_df[idxcol] == current_db_and_tablename][value_cols]
                for rubric_content in matching_rows.to_dict(orient="index").values():
                    for rubric, content in rubric_content.items():
                        known = collected.get(rubric, None)
                        if known is None:
                            collected[rubric] = content
                        elif known != content:
                            # differing values of one rubric are gathered in a list
                            if not isinstance(known, list):
                                collected[rubric] = [known]
                            collected[rubric].append(content)
        elif filetype == "json":
            with open(file, "r") as add_info_file:
                infos = json.load(add_info_file)

            # iterate over a snapshot of the keys, entries get removed/renamed below
            for original_key in reversed(list(infos.keys())):
                candidate = original_key
                if self.config.ADD_INFO_SIMPLIFIED_NAME_COMPARISON:
                    candidate = original_key.split("-")[0]

                if candidate not in current_db_and_table['table']:
                    del infos[original_key]
                    continue

                matching_ids = [ident for ident, name in current_db_and_table.items()
                                if candidate == name]
                if matching_ids:
                    # Take the entry out once and file it under every matching
                    # id; deleting it after the first match made the lookup
                    # fail when db name and table name are identical.
                    entry = infos.pop(original_key)
                    for ident in matching_ids:
                        infos[ident] = entry
        else:
            return None

        return infos
class SegmentClassifier(object):
    """
    Entry point for the classification of file segments; this is the class
    which gets used from root-/outside classes.
    """

    def __init__(self):
        self.config = ConfigurationHandler(first_init=False).get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_SEGMENT_CLASSIFIER,
                                    self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL,
                                    leading_tag=self.__class__.__name__)
        self.cpr.print("init segment classifier")

    def _adapt_indices(self, all_file_segments):
        """Run the index corrections selected by the config, then close the last segment."""
        if self.config.MATCH_UNTIL_NEXT_START_THEN_STOP_CONDITION:
            self.adapt_non_explicit_indices(all_file_segments)
        else:
            all_file_segments.correct_overlaps_index_field(only_start_tags=True)
        self.adapt_stop_index_in_last_segment(all_file_segments)

    def classify_file_segments(self, ocromore_data):
        """
        Matches every line of the file against the known segments and stores
        the resulting segmentation in ocromore_data['segmentation'].

        :param ocromore_data: dict with the entries 'lines', 'line_features' and 'file_info'
        :return: the same ocromore_data dict, extended by 'segmentation'
        """
        lines = ocromore_data['lines']
        feats = ocromore_data['line_features']
        file_info = ocromore_data['file_info']
        all_file_segments = AllSegments(len(lines), self.cpr, self.config)

        previous_line = None
        previous_text = None
        for position, line in enumerate(lines):
            text = line['text']
            if previous_line is None:
                # first line, nothing to combine with
                combined_line = text
            else:
                # previous and current text joined with the separation removed
                joined = dh.join_separated_lines([previous_text, text])
                combined_line = dh.join_joined_lines(joined)

            # pass parameters to matching functions
            all_file_segments.match_my_segments(line, text, line['line_index'],
                                                feats[position], previous_line,
                                                combined_line)
            previous_line, previous_text = line, text

        self._adapt_indices(all_file_segments)

        # does the last steps in segment matching
        all_file_segments.finish_segment_matching(lines, feats, file_info)

        # do again after final step
        self._adapt_indices(all_file_segments)

        ocromore_data['segmentation'] = all_file_segments
        return ocromore_data

    def adapt_stop_index_in_last_segment(self, all_file_segments):
        """
        The last recognized segment has no following start index, so its
        stop index usually is not filled beforehand; it is set to the last
        line of the file here.

        :param all_file_segments: holder object for segment classes and other info
        :return: None
        """
        last_segment = None
        highest_start = -1
        for candidate in all_file_segments.my_classes:
            # segments whose start was not segmented don't take part
            if candidate.start_was_segmented is False:
                continue
            if candidate.start_line_index < highest_start:
                continue
            # on equal start indices the later entry wins
            highest_start = candidate.start_line_index
            last_segment = candidate

        if last_segment is not None:
            last_segment.stop_line_index = all_file_segments.number_of_lines - 1
            last_segment.stop_was_segmented = True  # todo think about if this is necessary?

    def adapt_non_explicit_indices(self, all_file_segments):
        """
        Updates start and explicit stop tags, afterwards fills the undefined
        stop regions until the next start region.

        :param all_file_segments: holder object for segment classes and other info
        """
        all_file_segments.correct_overlaps_index_field(only_start_tags=True)
        all_file_segments.fill_start_index_until_next_stop()
class EndobjectFactory(object):
    """
    Creates an object with the following structure and provides exporting methods:

    segment_tag_1: [                ---> this level is created by set_current_main_list
        {
            type: "Sitz"            ---> add this level entries with add_to_my_obj object_number=0
            city: "Neustadt"
        },
        {
            type: "Sitz"            ---> add this level entries with add_to_my_obj object_number=1
            city: "Neustadt"
        }
    ],
    segment_tag_2: [
        {
            ...
        }
        ...
    ]
    """

    def __init__(self):
        self.my_object = {}
        self.current_main_list = None
        self.pp = pprint.PrettyPrinter(indent=5)
        config_handler = ConfigurationHandler(first_init=False)
        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_OUTPUT_ANALYSIS,
                                    self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL,
                                    leading_tag=self.__class__.__name__)
        if self.config.REMOVE_TAGS_IN_ORIG_DIFF:
            # only needed (and only read) when keys are subtracted in the diff
            self.known_uc = KnownUncategories()

    def set_current_main_list(self, segment_tag):
        """Select (and create if missing) the list which receives subsequent entries."""
        if segment_tag not in self.my_object:
            self.my_object[segment_tag] = []    # create the main list (all subsequent entries are stored here)
        self.current_main_list = self.my_object[segment_tag]  # create a short link on the main list

    def add_to_my_obj(self, key, value, object_number=0, only_filled=False):
        """
        Stores ``value`` under ``key`` in the object with index ``object_number``
        of the current main list; missing objects up to that index are created.

        :param key: key within the addressed object
        :param value: value to store
        :param object_number: index of the object within the current main list
        :param only_filled: if True, None and empty str/list/dict values are not stored
        :return: True if the value was stored, False if it was skipped
        """
        if only_filled is True:
            is_empty = value is None or (isinstance(value, (str, list, dict)) and not value)
            if is_empty:
                return False

        # fill main list if object index not in
        missing = object_number + 1 - len(self.current_main_list)
        if missing > 0:
            self.current_main_list.extend({} for _ in range(missing))

        self.cpr.print("Adding value to List,- ObjectNr.:", object_number,"Key:", key, "Value:", value)
        # add or insert to the main_list
        self.current_main_list[object_number][key] = value
        return True

    def print_me_and_return(self):
        """Pretty-print the complete object and return it."""
        print("my_object is:")
        self.pp.pprint(self.my_object)
        return self.my_object

    def print_current_main(self):
        """Pretty-print the currently selected main list."""
        print("current_main:")
        self.pp.pprint(self.current_main_list)

    def export_as_json(self):
        """Return the complete object as JSON string."""
        my_obj_json = json.dumps(self.my_object, indent=5, ensure_ascii=False)
        return my_obj_json

    def export_as_json_at_key(self, key, remove_first_object=False):
        """
        Return the list stored at ``key`` as JSON string.

        :param key: segment tag to export
        :param remove_first_object: leave out the first object, which usually contains generic info
        :return: JSON string, or None if ``key`` is unknown
        """
        if key not in self.my_object:
            return None

        my_obj = self.my_object[key]
        if remove_first_object and len(my_obj) >= 1:
            my_obj = my_obj[1:]
        return json.dumps(my_obj, indent=5, ensure_ascii=False)

    @staticmethod
    def fetch_subentries_recursive_check(entry):
        """
        Fetches all subentries (values) from an entry and writes them to a list of texts.
        This gets called recursively within the function until all subentries are found.

        Strings are taken as they are, numbers are converted with str(); dicts,
        lists and tuples are descended into. None and other values carry no
        text and are skipped.

        :param entry: dict or list to fetch the subentries from
        :return: list of subentries
        """
        final_texts = []
        values = entry.values() if isinstance(entry, dict) else entry
        for value in values:
            if isinstance(value, str):
                final_texts.append(value)
            elif isinstance(value, (int, float)):
                final_texts.append(str(value))
            elif isinstance(value, (dict, list, tuple)):
                final_texts.extend(EndobjectFactory.fetch_subentries_recursive_check(value))
        return final_texts

    @staticmethod
    def fetch_keys_recusive_check(entry, final_keys, create_multiple=True):
        """
        Fetches all keys in an object and its sub-objects,
        calls itself recursively until all keys are found,
        writes final keys to final_keys array and returns this.

        :param entry: object to fetch the sub-keys from
        :param final_keys: list of final keys (initial state), gets extended in place
        :param create_multiple: if the same key occurs multiple times it still gets added
        :return: final_keys with added keys from object
        """
        if isinstance(entry, list):
            for item in entry:
                final_keys = EndobjectFactory.fetch_keys_recusive_check(item, final_keys, create_multiple)
            return final_keys
        elif not isinstance(entry, dict):
            # just return if there are no keys (cause no dictionary)
            return final_keys

        for key in entry:
            value = entry[key]
            if create_multiple or key not in final_keys:
                if isinstance(key, int):
                    # integer keys are neither recorded nor descended into
                    continue
                final_keys.append(key)
            # hand create_multiple down, otherwise nested levels always add duplicates
            final_keys = EndobjectFactory.fetch_keys_recusive_check(value, final_keys, create_multiple)
        return final_keys

    def diff_seg_to_orig_at_key(self, key):
        """
        Currently disabled: emits the configuration/data warnings and always
        returns None.

        NOTE(review): the former subtraction logic behind the unconditional
        return was flagged as wrong by its author ("todo this seems to be
        wrong") and called a helper which no longer exists; it was unreachable
        and has been removed. Use diff_parsed_to_orig_at_key for the working
        variant.

        :param key: segment tag
        :return: None
        """
        if key not in self.my_object:
            return None

        my_data = self.my_object[key]

        # check if the orig-post property can exist warn if not
        if not self.config.ADD_INFO_ENTRY_TO_OUTPUT:
            self.cpr.printw("trying to fetch original data, original data is not added to results")
            self.cpr.printw("toggle ADD_INFO_ENTRY_TO_OUTPUT in config to True")
        if len(my_data) <= 0:
            self.cpr.printw("no data to do returning")
        return None

    def diff_parsed_to_orig_at_key(self, key):
        """
        Subtracts all parsed values (and, if REMOVE_TAGS_IN_ORIG_DIFF is set,
        all keys) of the segment from its original text.

        The original text is read from the 'origpost' entry of the first
        object in the list at ``key``; the objects after it provide the
        parsed values.

        :param key: segment tag
        :return: tuple (rest_text, original_text), or None if ``key`` is
                 unknown or holds no data
        """
        if key not in self.my_object:
            return None

        my_data = self.my_object[key]

        # check if the orig-post property can exist warn if not
        if not self.config.ADD_INFO_ENTRY_TO_OUTPUT:
            self.cpr.printw("trying to fetch original data, original data is not added to results")
            self.cpr.printw("toggle ADD_INFO_ENTRY_TO_OUTPUT in config to True")
        if len(my_data) <= 0:
            self.cpr.printw("no data to do returning")
            return None

        remove_spaces = self.config.REMOVE_SPACES_IN_ORIGIN_DIFF is True
        parsed_objects = my_data[1:]

        # copy orig string
        original_text = my_data[0]['origpost']
        rest_text = original_text

        # fetch parsed entries for diff
        pool_entries = []
        for entry in parsed_objects:
            pool_entries.extend(EndobjectFactory.fetch_subentries_recursive_check(entry))

        if remove_spaces:
            # removes all spaces from rest and comparison values because spaces are often
            # a problem in subtracting the rests
            rest_text = rest_text.replace(" ", "")
            pool_entries = [text.replace(" ", "") for text in pool_entries]

        # add the entries to the complete subtraction and tag them with '1'
        all_final_entries = [(text, 1) for text in pool_entries]

        # if keys shall be subtracted also, add them and tag them with '2'
        if self.config.REMOVE_TAGS_IN_ORIG_DIFF:
            pool_keys = []  # gets multiple of the same key for later 1 by 1 subtraction
            for entry in parsed_objects:
                pool_keys = EndobjectFactory.fetch_keys_recusive_check(entry, pool_keys, create_multiple=True)

            # also remove spaces in keys
            if remove_spaces:
                pool_keys = [pkey.replace(" ", "") for pkey in pool_keys]

            all_final_entries.extend((pkey, 2) for pkey in pool_keys)

        # order diff data after length, longest first (sort + reverse is kept on
        # purpose: it also reverses the order of equally long entries)
        all_final_entries.sort(key=lambda x: len(x[0]))
        all_final_entries.reverse()

        # subtract
        for text, text_or_key in all_final_entries:
            if text_or_key == 2 and text in self.known_uc.unkeys:
                continue
            # remove spaces so texts better fit in
            rest_text = rest_text.replace(text.strip(), "", 1)

        rest_text = rest_text.strip()
        return rest_text, original_text
class SegmentParser(object):
    """
    Parses the classified segments one by one; for each segment the
    function mapped to its tag is called.
    """

    def __init__(self, output_analyzer, dictionary_handler, ocromore_data=None):
        self.ef = EndobjectFactory()
        self.dictionary_handler = dictionary_handler

        # map which maps tags to functions for parsing -> change constructor for other project
        fmap = FunctionMapAKF(self.ef, output_analyzer, dictionary_handler)

        self.config = ConfigurationHandler(first_init=False).get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_SEGMENT_PARSER,
                                    self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL,
                                    leading_tag=self.__class__.__name__)

        self.function_map = fmap.get_function_map()
        self.result_root = self.config.OUTPUT_ROOT_PATH + "/results/"

    def clear_result(self, output_analyzer, dictionary_handler, ocromore_data=None):
        """Start over with a fresh end object factory and a function map bound to it."""
        self.ef = EndobjectFactory()
        self.function_map = FunctionMapAKF(self.ef, output_analyzer,
                                           dictionary_handler).get_function_map()

    def parse_segments(self, ocromore_data):
        """
        Calls the mapped parsing function for every segment whose start was
        recognized and stores the filled end object factory in
        ocromore_data['results'].

        :param ocromore_data: dict holding (among others) 'segmentation', 'lines' and 'file_info'
        :return: the same ocromore_data dict, extended by 'results'
        """
        self.ocromore_data = ocromore_data
        config = self.config
        segmentation_classes = ocromore_data['segmentation'].my_classes

        # add all text from original file if activated (i.e. for debugging purposes)
        if config.ADD_FULLTEXT_ENTRY:
            # NOTE(review): this stores the whole (texts, complete_text) pair
            # returned by get_all_text - confirm that this is intended
            fulltexts = self.get_all_text(ocromore_data)
            self.ef.set_current_main_list("overall_info")
            self.ef.add_to_my_obj("fulltexts", fulltexts)

        # add additional info to result
        if config.ADDITIONAL_INFORMATION and config.ADD_ADDITIONAL_INFO:
            if not config.ADD_FULLTEXT_ENTRY:
                self.ef.set_current_main_list("Information")
            self.ef.add_to_my_obj("additionals", ocromore_data["additional_info"])

        # add a duplicate of the original text from which in the below analysis case the files get subtracted
        if config.LOG_SEGMENTED_TO_ORIG_DIFF_PER_FILE:
            if not config.ADD_FULLTEXT_ENTRY:
                self.cpr.printw(
                    "activated segment to orig diff, but no saving of origin activate ADD_FULLTEXT_ENTRY "
                    "in config for this functionality")
            else:
                ocromore_data['analysis_to_orig'] = {}
                original_rest, complete_text = self.get_all_text(
                    ocromore_data, join_separated_lines=True)
                analysis = ocromore_data['analysis_to_orig']
                analysis['original_rest'] = original_rest
                analysis['original_length_initial'] = len(complete_text)

        # init toolbox
        snippet = None
        if config.USE_SNIPPET:
            file_info = ocromore_data["file_info"]
            if "./" in config.IMGPATH:
                ipath = os.path.dirname(file_info.path) + config.IMGPATH[1:]
            else:
                ipath = os.path.normcase(config.IMGPATH)
            image_stem = file_info.name.split(".")[0].replace("_msa_best", "")
            results = glob.glob(ipath + image_stem + "*", recursive=True)
            if results:
                snippet = Snippet()
                snippet.imread(results[0])
            else:
                self.config.USE_TOOLBBOX = False

        info_handler = {}
        # start parsing for each successfully segmented area
        for segmentation_class in segmentation_classes:
            if not segmentation_class.is_start_segmented():
                continue
            # get the unique identifier for this class
            segment_tag = segmentation_class.get_segment_tag()
            segmentation_class.snippet = snippet
            segmentation_class.info_handler = info_handler
            self.trigger_mapped_function(segment_tag, segmentation_class, ocromore_data)

        # add and return result
        ocromore_data['results'] = self.ef
        return ocromore_data

    def trigger_mapped_function(self, segment_tag, segmentation_class, ocromore_data):
        """
        Calls the function mapped to the segment; does nothing if
        ``segment_tag`` has no entry in the function map.
        """
        if segment_tag not in self.function_map:
            return
        # todo: fileinfo -> parsing
        real_start_tag, content_texts, content_lines, feature_lines = self.prepare_parsing_info(
            segmentation_class, ocromore_data)

        # switch the object to save context
        segment_tag = segmentation_class.segment_tag
        self.ef.set_current_main_list(segment_tag)

        # call the mapped function, which fills the end-factory
        mapped_function = self.function_map[segment_tag]
        mapped_function(real_start_tag, content_texts, content_lines,
                        feature_lines, segmentation_class)

    def prepare_parsing_info(self, segmentation_class, ocromore_data):
        """Fetch start tag, texts, lines and features of the segment via DataHelper.get_content."""
        return DataHelper.get_content(ocromore_data['lines'],
                                      ocromore_data['line_features'],
                                      segmentation_class)

    def get_all_text(self, ocromore_data, join_separated_lines=False):
        """
        Gets all text lines in ocromore_data as array and as joined string

        :param ocromore_data: data from which the text is extracted
        :param join_separated_lines: pass the texts through dh.join_separated_lines first
        :return: texts list, complete text
        """
        all_texts = [line['text'] for line in ocromore_data['lines']]
        if join_separated_lines:
            all_texts = dh.join_separated_lines(all_texts)
        complete_text = "".join(all_texts)
        return all_texts, complete_text

    def write_result_to_output(self, as_json, ocromore_data):
        """Write the result as JSON lines below 'result_json/'; nothing happens unless as_json is True."""
        if as_json is not True:
            return
        my_json_lines = self.ef.export_as_json().split("\n")
        dh.write_array_to_root("result_json/", my_json_lines, ocromore_data, self.result_root)
class AkfParsingFunctionsJK(object):
    """Parsing functions for the table-like segments: balances, income, shares and dividends."""

    def __init__(self, endobject_factory, output_analyzer, dictionary_handler, ocromore_data=None):
        self.config = ConfigurationHandler(first_init=False).get_config()
        self.cpr = ConditionalPrint(
            self.config.PRINT_SEGMENT_PARSER_AKF_FN_THREE,
            self.config.PRINT_EXCEPTION_LEVEL,
            self.config.PRINT_WARNING_LEVEL,
            leading_tag=self.__class__.__name__)
        self.cpr.print("init akf parsing functions three")

        self.ef = endobject_factory
        self.output_analyzer = output_analyzer
        self.ocromore_data = ocromore_data
        self.dictionary_handler = dictionary_handler

    def _open_segment(self, real_start_tag, content_texts, segmentation_class):
        """Run the common prologue: add/check the basic element and log the segment.

        :return: tuple (origpost_red, element_counter) as given by cf.add_check_element
        """
        _, origpost_red, element_counter, content_texts = cf.add_check_element(
            self, content_texts, real_start_tag, segmentation_class, 0)
        self.output_analyzer.log_segment_information(
            segmentation_class.segment_tag, content_texts, real_start_tag)
        return origpost_red, element_counter

    def _store(self, key, value, element_counter):
        """Add value under key to the end object, empty values are left out."""
        self.ef.add_to_my_obj(key, value, object_number=element_counter,
                              only_filled=True)

    def parse_bilanzen(self, real_start_tag, content_texts, content_lines,
                       feature_lines, segmentation_class):
        """
        Stores the balance segment under "balances".

        With LOG_SIMPLE only the reduced original text is stored and True is
        returned; otherwise the segment is parsed as data table and the table
        properties are remembered in the info handler for the income parsing.
        """
        origpost_red, element_counter = self._open_segment(
            real_start_tag, content_texts, segmentation_class)

        if self.config.LOG_SIMPLE:
            self._store("balances", origpost_red.replace("- ", ""), element_counter)
            return True

        table = Datatable(snippet=segmentation_class.snippet)
        table.analyse_structure(content_lines, feature_lines,
                                template="datatable_balance")
        table.extract_content(content_lines, feature_lines,
                              template="datatable_balance")

        # Write information for income table parsing
        income_info = {}
        segmentation_class.info_handler["income"] = income_info
        income_info["amount"] = table.info.amount
        income_info["col"] = table.info.col
        income_info["separator"] = table.info.separator

        # Parsing the tables based on whitespace and number of numbers of each group
        # This should be the last option to parse (error-prone)
        self._store("balances", table.content, element_counter)

    def parse_gewinn_und_verlust(self, real_start_tag, content_texts,
                                 content_lines, feature_lines,
                                 segmentation_class):
        """
        Stores the profit and loss segment under "income".

        With LOG_SIMPLE only the reduced original text is stored and True is
        returned; otherwise the segment is parsed as data table, reusing the
        table properties remembered under "income" in the info handler if present.
        """
        origpost_red, element_counter = self._open_segment(
            real_start_tag, content_texts, segmentation_class)

        if self.config.LOG_SIMPLE:
            self._store("income", origpost_red.replace("- ", ""), element_counter)
            return True

        table = Datatable(snippet=segmentation_class.snippet)
        table.analyse_structure(content_lines, feature_lines,
                                template="datatable_income")

        info_handler = segmentation_class.info_handler
        if info_handler and "income" in info_handler.keys():
            remembered = info_handler["income"]
            table.info.col = remembered["col"]
            table.info.amount = remembered["amount"]
            table.info.separator = remembered["separator"]

        table.extract_content(content_lines, feature_lines,
                              template="datatable_income")
        self._store("income", table.content, element_counter)

    def parse_aktienkurse(self, real_start_tag, content_texts, content_lines,
                          feature_lines, segmentation_class):
        """
        Stores the share price segment under "shares".

        With LOG_SIMPLE only the reduced original text is stored and True is
        returned; otherwise the segment is parsed as share table.
        """
        origpost_red, element_counter = self._open_segment(
            real_start_tag, content_texts, segmentation_class)

        if self.config.LOG_SIMPLE:
            self._store("shares", origpost_red.replace("- ", ""), element_counter)
            return True

        table = Sharetable(snippet=segmentation_class.snippet)
        table.analyse_structure(content_lines, feature_lines)
        table.extract_content(content_lines, feature_lines)
        self._store("shares", table.content, element_counter)

    def parse_dividend(self, real_start_tag, content_texts, content_lines,
                       feature_lines, segmentation_class):
        """
        Stores the dividend segment under "dividende".

        With LOG_SIMPLE only the reduced original text is stored and True is
        returned; otherwise the segment is parsed as dividend table.
        """
        origpost_red, element_counter = self._open_segment(
            real_start_tag, content_texts, segmentation_class)

        if self.config.LOG_SIMPLE:
            self._store("dividende", origpost_red.replace("- ", ""), element_counter)
            return True

        table = Dividendtable(snippet=segmentation_class.snippet)
        table.analyse_structure(content_lines, feature_lines)
        table.extract_content(content_lines, feature_lines)
        self._store("dividende", table.content, element_counter)
class AkfParsingFunctionsTwo(object):
    """Second group of AKF segment parsers.

    Every public ``parse_*`` method shares one signature
    ``(real_start_tag, content_texts, content_lines, feature_lines, segmentation_class)``.
    Each one runs the shared prologue (``cf.add_check_element`` plus logging through
    ``output_analyzer``), analyses the segment text and records its results through
    ``self.ef.add_to_my_obj``.
    """

    def __init__(self, endobject_factory, output_analyzer, dictionary_handler):
        """Load the configuration, set up conditional printing and store the collaborators."""
        config_handler = ConfigurationHandler(first_init=False)
        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_SEGMENT_PARSER_AKF_FN_TWO, self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL, leading_tag=self.__class__.__name__)

        self.cpr.print("init akf parsing functions two")

        self.ef = endobject_factory
        self.output_analyzer = output_analyzer
        self.dictionary_handler = dictionary_handler

    def _prepare_segment(self, content_texts, real_start_tag, segmentation_class):
        """Run the prologue shared by all ``parse_*`` methods.

        Calls ``cf.add_check_element`` with a start counter of 0 and logs the segment
        through the output analyzer.

        Returns:
            tuple: ``(origpost_red, element_counter, content_texts)`` exactly as handed
            back by ``cf.add_check_element``.
        """
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, 0)
        self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)
        return origpost_red, element_counter, content_texts

    def parse_zahlstellen(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
        """Parse the ';'-separated paying-agent entries into bank / city / title / rest_info.

        Entries containing 'beide' or 'sämtl' are split at that marker; the text after
        the marker is treated as additional info that is prepended to ``rest_info`` of
        the entries listed before it.

        Returns:
            bool: always ``True``.
        """
        origpost_red, element_counter, content_texts = \
            self._prepare_segment(content_texts, real_start_tag, segmentation_class)

        DEFAULT_ENTRY = 1
        ADDITIONAL_INFO_BOTH = 2        # beide - two previous
        ADDITIONAL_INFO_ALL_PREV = 3    # sämtl. - all previous

        final_entries = []
        for entry in origpost_red.split(';'):
            entry_stripped = entry.strip()

            # pick the marker (if any) this entry has to be split at
            if "beide" in entry_stripped:
                marker_pattern, marker_type = r"beide\s?\.?", ADDITIONAL_INFO_BOTH
            elif regex.search(r"sämtl\s?\.?", entry_stripped):
                marker_pattern, marker_type = r"sämtl\s?\.?", ADDITIONAL_INFO_ALL_PREV
            else:
                marker_pattern, marker_type = None, None

            if marker_pattern is not None:
                parts = regex.sub(marker_pattern, "##", entry_stripped).strip().split('##')
                last_part_index = len(parts) - 1
                for part_index, part in enumerate(parts):
                    if part.strip() == "":
                        continue
                    # only the text behind the last marker is the additional info itself
                    part_type = DEFAULT_ENTRY if part_index < last_part_index else marker_type
                    final_entries.append((part_type, part, "", "", ""))
                continue

            # plain entry: "<bank>, <city>, <rest>..."
            entry_split = entry_stripped.split(',')
            bank = entry_split[0]
            city = entry_split[1] if len(entry_split) > 1 else ""
            title = ""
            rest_info = entry_split[2:]
            if bank != "" or city != "" or title != "":
                final_entries.append((DEFAULT_ENTRY, bank, city, title, rest_info))

        # Walk the entries back to front so that an additional-info marker is seen
        # before the entries it refers to.
        # NOTE(review): the original carried a reset meant to end a 'beide' info
        # "after 2 iterations", but its guard variable was never assigned, so the
        # reset never ran. The effective behaviour is kept: a 'beide' info is
        # attached to *all* preceding entries, like 'sämtl.'. Confirm the intent.
        current_additional_info = ""
        collected_reversed = []
        for entry_type, entryorbank, city, title, rest_info in reversed(final_entries):
            if entry_type in (ADDITIONAL_INFO_BOTH, ADDITIONAL_INFO_ALL_PREV):
                current_additional_info = entryorbank
            elif entry_type == DEFAULT_ENTRY:
                collected_reversed.append((entryorbank, city, title, current_additional_info, rest_info))
        final_list = collected_reversed[::-1]

        # finally note the entries to output
        only_add_if_value = True
        for bank, city, title, add_info, rest_info in final_list:
            if add_info.strip() != "":
                rest_info_new = [add_info]
                rest_info_new.extend(rest_info)
            else:
                rest_info_new = rest_info

            self.ef.add_to_my_obj("bank", bank, object_number=element_counter, only_filled=only_add_if_value)
            self.ef.add_to_my_obj("city", city, object_number=element_counter, only_filled=only_add_if_value)
            self.ef.add_to_my_obj("title", title, object_number=element_counter, only_filled=only_add_if_value)
            self.ef.add_to_my_obj("rest_info", rest_info_new, object_number=element_counter,
                                  only_filled=only_add_if_value)
            element_counter += 1

        return True

    def parse_grundkapital(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
        """Parse the share-capital segment into a 'Grundkapital' object plus keyed entries.

        The texts are grouped by ``cf.parse_general_and_keys``; the first line of the
        'start_value' group is parsed with ``cf.parse_grundkapital_line``. Every other
        group is recorded under its key, as ``{"year", "text"}`` when the key holds a
        four-digit number and as a plain list otherwise.

        Returns:
            None. NOTE(review): the sibling parsers return ``True``; confirm whether
            callers inspect the result.
        """
        # todo validate other currencies than 'DM'
        origpost_red, element_counter, content_texts = \
            self._prepare_segment(content_texts, real_start_tag, segmentation_class)
        only_add_if_value = True

        # Try to normalize ; to : with prefix apital
        content_texts = [content_text.replace("apital;", "apital:") for content_text in content_texts]

        gk = cf.parse_general_and_keys(content_texts,
                                       join_separated_lines=True,
                                       current_key_initial_value='start_value',
                                       abc_sections=True)

        # check start value for 'normal' grundkapital content, if found parse
        start_value = gk.get('start_value', "")
        if len(gk.keys()) == 1:
            start_value = gk[list(gk.keys())[0]]

        if len(start_value) >= 1:
            my_return_object, _found_main_amount, element_counter, only_add_if_value, _additional_info = \
                cf.parse_grundkapital_line(start_value[0], False, element_counter, only_add_if_value, [])
            currency = my_return_object.get('currency', "").strip()
            amount = my_return_object.get('amount', "").strip()
            if amount != "" and currency != "":
                self.ef.add_to_my_obj('Grundkapital', my_return_object, object_number=element_counter,
                                      only_filled=only_add_if_value)
            else:
                gk['additional_info'] = [start_value[0].replace("↑", ":")]

        if len(start_value) >= 2:
            # the additional values which are in start_value, but have nothing to do with that
            # NOTE(review): as in the original, this replaces any 'additional_info'
            # set just above (or parsed as a key); the original also had a
            # "create only if missing" guard directly before the reset, so an
            # append may have been intended. Behaviour kept unchanged.
            gk['additional_info'] = [val.replace("↑", ":") for val in start_value[1:]]

        for key in gk:
            if key == "start_value":  # was an identity test (`is`) against a literal
                continue
            entry = gk[key]

            # a four-digit number inside the key is treated as the year
            match_year = regex.search(r"\d\d\d\d", key)
            year = None
            key_rest = ""
            if match_year:
                year = match_year.group()
                key_rest = key.replace(year, "").strip()

            accumulated_text = []
            if key_rest != "":
                accumulated_text.append(key_rest)
            accumulated_text.extend(entry)

            if year is None:
                final_entry = accumulated_text
            else:
                final_entry = {
                    "year": year,
                    "text": accumulated_text
                }

            self.ef.add_to_my_obj(key, final_entry, object_number=element_counter,
                                  only_filled=only_add_if_value)
            element_counter += 1

        # The former line-by-line "old parsing style" that followed this return was
        # unreachable and has been removed.
        return

    def parse_ordnungsnrdaktien(self, real_start_tag, content_texts, content_lines, feature_lines,
                                segmentation_class):
        """Parse security order numbers, e.g. '589300 (St.-Akt.)' or '589300.'.

        Per non-empty line: a leading run of digits/whitespace becomes 'ord_number',
        a parenthesised part becomes 'category', whatever is left becomes
        'additional_info'. A new element is started with every number after the first.
        """
        origpost_red, element_counter, content_texts = \
            self._prepare_segment(content_texts, real_start_tag, segmentation_class)
        only_add_if_value = True

        first_number_match = True
        for entry in content_texts:
            entry_stripped = entry.strip()
            if entry_stripped == "":
                continue
            rest = entry_stripped

            match_number = regex.search(r"^([\d\s]*)", entry_stripped)
            match_parenth = regex.search(r"\(.*\)", entry_stripped)  # take content in parenthesis

            if match_number is not None and match_number.group(0).strip() != "":
                if not first_number_match:
                    element_counter += 1  # every further number opens the next element
                number = match_number.group(0).strip()
                self.ef.add_to_my_obj("ord_number", number, object_number=element_counter,
                                      only_filled=only_add_if_value)
                rest = rest.replace(number, "", 1)
                first_number_match = False

            if match_parenth is not None:
                parenth = match_parenth.group(0)
                self.ef.add_to_my_obj("category", parenth, object_number=element_counter,
                                      only_filled=only_add_if_value)
                rest = rest.replace(parenth, "", 1)

            rest_stripped = rest.strip()
            if rest_stripped != "":
                self.ef.add_to_my_obj("additional_info", rest_stripped, object_number=element_counter,
                                      only_filled=only_add_if_value)

    def parse_grossaktionaer(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
        """Parse major shareholders into shareholder / location / share.

        Each ';'-separated line is cut at its percentage parentheses such as
        '(ca.60,2 %)'; the text before each one is split at its last comma into
        shareholder and location.

        Returns:
            bool: always ``True``.
        """
        origpost_red, element_counter, content_texts = \
            self._prepare_segment(content_texts, real_start_tag, segmentation_class)
        only_add_if_value = True

        for line in origpost_red.split(';'):
            # example line:
            # "Société Sidérurgique de Participations et d’ Approvisionnement en Charbons,
            #  par abréviation (Sidechar), Paris (ca.60,2 %)."
            # spans of all percentage groups; without any, one empty span at the line end
            spans = [match.regs[0] for match in regex.finditer(r"\([a-zü0-9\s\,\.]*%\).?", line)]
            if not spans:
                spans = [(len(line), len(line))]

            start = 0
            for finding in spans:
                # shareholder, location, share
                item = line[start:finding[0]]
                if ":" in item:
                    self.ef.add_to_my_obj("additional_information", item[:item.index(":")],
                                          object_number=element_counter, only_filled=only_add_if_value)
                    if line.index(":") + 2 > finding[0]:
                        continue
                    else:
                        # NOTE(review): the slice keeps the ':' as first character of
                        # the shareholder text - possibly an off-by-one; unchanged.
                        item = item[item.index(":"):]
                item = item.rsplit(",", 1)
                self.ef.add_to_my_obj("shareholder", item[0].strip(), object_number=element_counter,
                                      only_filled=only_add_if_value)
                if len(item) > 1 and item[1] != "":
                    if item[1][-1] == ".":
                        item[1] = item[1][:-1]
                    if "(" in item[1] and ")" in item[1]:
                        find = regex.search(r"(\([0-9\s\,]*|maßgeblich|Mehrheit|Majorität)\)", item[1])
                        if find:
                            share_start, share_end = find.regs[0]
                            self.ef.add_to_my_obj("share", item[1][share_start:share_end - 1].strip(),
                                                  object_number=element_counter, only_filled=only_add_if_value)
                            item[1] = item[1][:share_start - 1]
                    self.ef.add_to_my_obj("location", item[1].strip(), object_number=element_counter,
                                          only_filled=only_add_if_value)
                if finding[0] != len(line):
                    share_text = line[finding[0]:finding[1]]
                    share_text = share_text.replace(", ", ",").replace("(", "").replace(").", "").replace(")", "")
                    self.ef.add_to_my_obj("share", share_text.strip(), object_number=element_counter,
                                          only_filled=only_add_if_value)
                start = finding[1]
                element_counter += 1

        return True

    def parse_geschaeftsjahr(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
        """Parse the business-year segment into start / stop and remaining 'year' texts.

        A line containing 'bis ' is split there into 'gesch_jahr_start' and
        'gesch_jahr_stop'; all other non-empty lines are collected under 'year'.

        Returns:
            bool: always ``True``.
        """
        origpost_red, element_counter, content_texts = \
            self._prepare_segment(content_texts, real_start_tag, segmentation_class)
        only_add_if_value = True

        final_jahr = []
        for text in content_texts:
            text_stripped = text.strip("., ")
            if text_stripped == "":
                continue
            if "bis" not in text_stripped:
                final_jahr.append(text_stripped)
                continue

            split_text = text_stripped.split('bis ')
            if len(split_text) == 1:
                # 'bis' occurred without the trailing blank - keep the line as it is
                final_jahr.append(split_text[0].strip())
                continue

            gesch_jahr_start = split_text[0].strip("( ")
            gesch_jahr_stop = split_text[1].strip(" )")
            self.ef.add_to_my_obj('gesch_jahr_start', gesch_jahr_start, object_number=element_counter,
                                  only_filled=only_add_if_value)
            self.ef.add_to_my_obj('gesch_jahr_stop', gesch_jahr_stop, object_number=element_counter,
                                  only_filled=only_add_if_value)
            # everything behind start and stop; the original sliced [3:] under a
            # `len >= 3` guard and thereby dropped the first remaining part
            for rest in split_text[2:]:
                if rest.strip() != "":
                    final_jahr.append(rest)

        self.ef.add_to_my_obj('year', final_jahr, object_number=element_counter, only_filled=only_add_if_value)
        return True

    def parse_stimmrechtaktien(self, real_start_tag, content_texts, content_lines, feature_lines,
                               segmentation_class):
        """Parse voting-right lines such as 'Je nom. DM 50.- =1 Stimme.'.

        Each recognised line yields a dict with kind / amount / vote / value /
        currency / rank, recorded under the running element counter. Lines that do not
        match are accumulated and recorded as 'additional_info'.

        Returns:
            bool: always ``True``.
        """
        origpost_red, element_counter, content_texts = \
            self._prepare_segment(content_texts, real_start_tag, segmentation_class)

        only_add_if_value = True
        skip = False
        final_text = ""

        # patterns are loop-invariant, so they are compiled once up front
        parse_aktie = regex.compile(r"(?P<nominal>[Jj]e[de]*?\s?(?P<nomvalue>[\d\s]*?)\s?[Aa]ktie[n]?)[^\d]*(?P<vote>[\d\s]*?)\s*?(?P<voteend>Stimme[n]*)")
        parse_stimmrecht = regex.compile(r"(?P<nominal>[Jj]e[de]*?\s?(?P<nomvalue>[\d\s]*?)\s?nom\.)\s*?(?P<currency>[^\d]*)\s?(?P<value>[\d\s]*)\s*?(?P<waste>[^\dA-Za-z]*)\s{0,}(?P<kind>[A-Za-z.,\-\s]*)?[^\d\s]*\s{0,}(?P<vote>[\d]*)?\s{0,}(?P<voteend>Stimme[n]*)?")
        parse_bzw = regex.compile(r"(?P<nominal>[Jj]e[de]*?\s?(?P<nomvalue>[\d\s]*?)\s?nom\.)\s*?(?P<currency>[^\d]*)\s?(?P<value>[\d\s]*)\s*?[^\d]*\s*?(?P<value2>[\d\s]*)[^\dA-Za-z]*(?P<kind>[A-Za-z][A-Za-z.,\-\s]*)?[^\d\s]*\s{0,}(?P<vote>[\d]*)?\s{0,}[^\d]*\s{0,}(?P<vote2>[\d]*)\s{0,}(?P<voteend>Stimme[n]*)?")

        for text_index, text in enumerate(content_texts):
            if text == "":
                continue
            text = text.replace("DM =", "DM 1 =").replace("DM=", "DM 1 =").replace("eine DM", "DM 1.-")

            # nothing parsed yet and no 'je nom' in sight: the rest is free text
            if element_counter == 0 and "je nom" not in text.lower():
                self.ef.add_to_my_obj("additional_info", "".join(content_texts[text_index:]),
                                      object_number=element_counter, only_filled=only_add_if_value)
                break

            if skip:
                # this line was already consumed together with the previous one
                skip = False
                continue

            # "Je <n> Aktie(n) ... <m> Stimme(n)"
            finding = parse_aktie.findall(text.replace("Stamm", ""))
            if finding != []:
                finding = list(finding[0])
                if finding[1] == "":
                    finding[1] = "1"
                stck = {"kind": "Aktie",
                        "amount": finding[1],
                        "vote": finding[2].replace(" ", "").strip(),
                        "value": "",
                        "currency": "",
                        "rank": element_counter}
                self.ef.add_to_my_obj(element_counter, stck, object_number=element_counter,
                                      only_filled=only_add_if_value)
                element_counter += 1
                continue

            # example: 'Je nom. DM 50.- =1 Stimme.'
            finding = parse_stimmrecht.findall(text.replace("DM", " DM").replace("RM", " RM"))

            # Special case "bzw.": one line describes two value/vote pairs
            if finding and "bzw." in text:
                # pull in the following line when the votes are not on this one;
                # the bounds check keeps a trailing 'bzw.' line from raising IndexError
                if "Stimm" not in text and text_index + 1 < len(content_texts):
                    skip = True
                    text += content_texts[text_index + 1]
                bzw_findings = parse_bzw.findall(text)
                if bzw_findings:
                    bzw = bzw_findings[0]
                    stck = {"kind": bzw[5].strip(),
                            "amount": "1",
                            "vote": bzw[6].replace(" ", "").strip(),
                            "value": bzw[3].strip(),
                            "currency": bzw[2].strip(),
                            "rank": element_counter}
                    self.ef.add_to_my_obj(element_counter, stck, object_number=element_counter,
                                          only_filled=only_add_if_value)
                    element_counter += 1
                    stck = {"kind": bzw[5].strip(),
                            "amount": "1",
                            "vote": bzw[7].replace(" ", "").strip(),
                            "value": bzw[4].strip(),
                            "currency": bzw[2].strip(),
                            "rank": element_counter}
                    self.ef.add_to_my_obj(element_counter, stck, object_number=element_counter,
                                          only_filled=only_add_if_value)
                    # NOTE(review): the counter is not advanced after the second
                    # entry (as in the original), so the next parsed line reuses
                    # this number - confirm whether that is intended.
                    continue

            if not finding or finding[0][0] + finding[0][1] == "":
                final_text += text
                continue

            # flush free text collected so far onto the previous element
            if final_text != "":
                self.ef.add_to_my_obj("additional_info", final_text, object_number=element_counter - 1,
                                      only_filled=only_add_if_value)
                final_text = ""

            finding_next = None
            if finding[0][6] + finding[0][7] == "":
                # no vote information on this line: retry joined with the next line
                if text_index == len(content_texts) - 1:
                    self.ef.add_to_my_obj("additional_info", text, object_number=element_counter,
                                          only_filled=only_add_if_value)
                    continue
                finding_next = parse_stimmrecht.findall(text + " " + content_texts[text_index + 1])
            if finding_next:
                skip = True
                finding = finding_next

            finding = list(finding[0])
            if finding[5] == "":
                finding[5] = "nom."
            if finding[1] == "":
                finding[1] = "1"
            stck = {"kind": finding[5].strip(),
                    "amount": finding[1].strip(),
                    "vote": finding[6].replace(" ", "").strip(),
                    "value": finding[3].strip(),
                    "currency": finding[2].strip(),
                    "rank": element_counter}
            self.ef.add_to_my_obj(element_counter, stck, object_number=element_counter,
                                  only_filled=only_add_if_value)
            element_counter += 1

        if final_text != "":
            self.ef.add_to_my_obj("additional_info", final_text, object_number=element_counter,
                                  only_filled=only_add_if_value)
        return True

    def parse_boersennotiz(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
        """Parse the stock-exchange listing into one 'location' element per place.

        The text is split at 'u.', 'und' and ','; 'im Freiverkehr' and '(amtl.)' are
        removed from each part before it is recorded.

        Returns:
            bool: always ``True``.
        """
        origpost_red, element_counter, content_texts = \
            self._prepare_segment(content_texts, real_start_tag, segmentation_class)

        # log all location elements
        only_add_if_value = True
        for entry in regex.split(r'u\.|und|,', origpost_red):
            entry_stripped = entry.replace("im Freiverkehr", "").replace("(amtl.)", "").strip("., ")
            if entry_stripped is None or entry_stripped == "":
                continue
            self.ef.add_to_my_obj("location", entry_stripped, object_number=element_counter,
                                  only_filled=only_add_if_value)
            element_counter += 1

        return True

    def preprocess_stueckelung_texts(self, content_texts):
        """Merge continuation lines of the denomination segment with their predecessor.

        A line starting with 'zu je' / 'Zu je', or one fully enclosed in parentheses,
        is appended to the pending previous line (separated by one blank). Blank lines
        are ignored.

        Args:
            content_texts: list of raw text lines.

        Returns:
            list: the merged, stripped, non-empty texts in their original order.
        """
        merged_texts = []
        pending_text = ""
        for current_text in content_texts:
            current_text_stripped = current_text.strip()
            if current_text_stripped == "":
                continue

            is_continuation = current_text_stripped.startswith(("zu je", "Zu je")) or \
                (current_text_stripped[0] == "(" and current_text_stripped[-1] == ")")
            if is_continuation:
                merged_texts.append(pending_text + " " + current_text_stripped)
                pending_text = ""
            else:
                merged_texts.append(pending_text)
                pending_text = current_text_stripped

        # Flush the last pending line. The original tied this to "index is the last
        # index" and therefore lost the line whenever blank lines followed it.
        merged_texts.append(pending_text)

        return [text.strip() for text in merged_texts if text.strip() != ""]

    def parse_stueckelung(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
        """Parse share denominations such as '2 638 514 Namensaktien zu je FF 75.-'.

        Lines are first merged by ``preprocess_stueckelung_texts``. Each recognised
        line yields an 'entry' dict with amount / kind / nominal / currency / value /
        rank; unmatched text and the unparsed remainders are recorded as
        'additional_info'.

        Returns:
            bool: always ``True``.
        """
        origpost_red, element_counter, content_texts = \
            self._prepare_segment(content_texts, real_start_tag, segmentation_class)

        only_add_if_value = True
        skip = False
        final_text = ""
        final_add_rest = ""
        content_texts = self.preprocess_stueckelung_texts(content_texts)

        # loop-invariant pattern, compiled once
        parse_stck = regex.compile(r"(?P<amount>[\d\s\.]*)\s*(?P<kind>[^\d]*?)[\s]?(?P<nominal>zu je|zuje|zu|je)\s{0,}(?P<currency>[^\d\s]*)\s{0,}(?P<value>[\d\s]*)")

        for text_index, text in enumerate(content_texts):
            if text.strip() == "":
                continue
            if skip:
                # this line was already consumed together with the previous one
                skip = False
                continue

            finding = parse_stck.findall(text.replace(" Stücke ", " Aktien ").replace(" Stück ", " Aktie ")
                                         .replace("DM", " DM").replace("RM", " RM").replace("hfl", " hfl"))

            if not finding or finding[0][0] + finding[0][1] == "" or finding[0][0] + finding[0][4] == "":
                # no usable denomination on this line
                match_saemtlsakt, err_saemtlsakt = regu.fuzzy_search(
                    r"([Ss]ämtliche [Ss]tammaktien.*|[Ss]ämtliche [Aa]ktien.*|[Ss]ämtliche Namens\-Stammaktien.*)",
                    text, err_number=1)
                if match_saemtlsakt is not None:
                    self.ef.add_to_my_obj("additional_info", match_saemtlsakt.group(),
                                          object_number=element_counter, only_filled=only_add_if_value)
                if "Börse" in text or "Besondere" in text:
                    # from here on everything is recorded as one block of free text
                    addendum = "".join(content_texts[text_index:])
                    self.ef.add_to_my_obj("additional_info", addendum, object_number=element_counter,
                                          only_filled=only_add_if_value)
                    element_counter += 1
                    break
                if "(" in text:
                    self.ef.add_to_my_obj("additional_info", text, object_number=element_counter - 1,
                                          only_filled=only_add_if_value)
                else:
                    final_text += text
                continue

            # what is left of the line once all matched groups (longest first) are removed
            rest_finding = text
            for find_chunk in sorted(finding[0], key=len)[::-1]:
                rest_finding = rest_finding.replace(find_chunk, "", 1).strip()
            rest_finding = regex.sub(r"\s{2,}", " ", rest_finding)  # collapse redundant spaces

            finding_next = None
            if finding[0][2] == "" or (("zu" in finding[0][2] or "je" in finding[0][2]) and finding[0][3] == ""):
                # example: '2 638 514 Inh. - bzw. Namensaktien zuje FF 75.-'
                if text_index == len(content_texts) - 1:
                    self.ef.add_to_my_obj("additional_info", text, object_number=element_counter,
                                          only_filled=only_add_if_value)
                    continue
                finding_next = parse_stck.findall(text + " " + content_texts[text_index + 1])
            if finding[0][3] + finding[0][4] == "":
                # neither currency nor value found: retry joined with the next line
                if text_index == len(content_texts) - 1:
                    self.ef.add_to_my_obj("additional_info", text, object_number=element_counter,
                                          only_filled=only_add_if_value)
                    continue
                finding_next = parse_stck.findall(text + " " + content_texts[text_index + 1])
            if finding_next:
                skip = True
                finding = finding_next

            stck = {"amount": finding[0][0].replace(".", " ").strip(),
                    "kind": finding[0][1].replace(" ", "").strip(),
                    "nominal": "zu je",
                    "currency": finding[0][3],
                    "value": finding[0][4],
                    "rank": element_counter}
            self.ef.add_to_my_obj("entry", stck, object_number=element_counter, only_filled=only_add_if_value)
            if rest_finding != "":
                final_add_rest += rest_finding + " "
            element_counter += 1

        if final_text != "":
            self.ef.add_to_my_obj("additional_info",
                                  final_text.replace(final_add_rest.strip(".,- "), "", 1).strip(".,- "),
                                  object_number=element_counter, only_filled=only_add_if_value)
            element_counter += 1

        if final_add_rest != "":
            self.ef.add_to_my_obj("additional_info", final_add_rest.strip(".,- "),
                                  object_number=element_counter, only_filled=only_add_if_value)
        return True
class DictionaryHandler(object):
    """Loads the title and function dictionaries and matches titles inside texts.

    The dictionaries are JSON files (``dict_titles.json`` / ``dict_functs.json``) in
    ``<cwd>/additionals/dictionaries``. They are only loaded when the configuration
    flag ``USE_DICTIONARIES_FOR_PERSON_PARSING`` is set.
    """

    def __init__(self):
        """Read the configuration and, if enabled, load and pre-sort the dictionaries."""
        config_handler = ConfigurationHandler(first_init=False)
        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_DICTIONARY_HANDLER, self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL, leading_tag=self.__class__.__name__)

        self.cpr.print("init dictionary handler")
        self.data_functs = None  # storage for json object
        self.data_titles = None  # storage for json object
        self.texts_functs = None  # (text, length) tuples, longest first; None until loaded
        self.texts_titles = None  # (text, length) tuples, longest first; None until loaded

        if self.config.USE_DICTIONARIES_FOR_PERSON_PARSING:
            self.load_dictionaries()
            # get the rows as sorted list of texts longest first
            if self.data_functs is not None:
                self.texts_functs = self.sort_rows(self.get_rows(self.data_functs))
            if self.data_titles is not None:
                self.texts_titles = self.sort_rows(self.get_rows(self.data_titles))

    def diff_name_title(self, text_to_check):
        """Split the first known title out of ``text_to_check``.

        The titles are tried in stored order (longest first after ``sort_rows``); the
        first one contained in the text is removed once.

        Args:
            text_to_check: text that may contain a title.

        Returns:
            tuple: ``(name_found, title_found)``. Without a match - or when no title
            dictionary is loaded - the unchanged text and an empty title.
        """
        len_text_to_check = len(text_to_check)
        name_found = text_to_check
        title_found = ""

        # texts_titles stays None when dictionaries are disabled or the file is
        # missing; iterate over nothing instead of raising TypeError
        for title, tlen in self.texts_titles or ():
            # accelerate the process, by skipping comparisons which have longer texts
            if tlen > len_text_to_check:
                continue
            if title in text_to_check:
                name_found = text_to_check.replace(title, "", 1).strip()
                title_found = title
                break

        return name_found, title_found

    def load_dictionaries(self):
        """Load both JSON dictionaries; a missing file is reported through ``cpr.printex``."""
        base_dict_path = self.get_dict_path()
        filepath_titles_dict = os.path.join(base_dict_path, "dict_titles.json")
        filepath_functs_dict = os.path.join(base_dict_path, "dict_functs.json")

        # load titles
        if os.path.exists(filepath_titles_dict):
            # JSON is UTF-8 by specification; do not depend on the locale default
            with open(filepath_titles_dict, encoding="utf-8") as f:
                self.data_titles = json.load(f)
        else:
            self.cpr.printex(
                "dictionary dict_titles.json missing at specificied path",
                filepath_titles_dict)

        # load functs
        if os.path.exists(filepath_functs_dict):
            with open(filepath_functs_dict, encoding="utf-8") as f:
                self.data_functs = json.load(f)
        else:
            self.cpr.printex(
                "dictionary dict_functs.json missing at specificied path",
                filepath_functs_dict)

    def get_rows(self, dict_data):
        """Return ``(text, len(text))`` for the first column of every row in ``dict_data['rows']``."""
        return [(entry[0], len(entry[0])) for entry in dict_data['rows']]

    def sort_rows(self, rows):
        """Sort ``rows`` in place by text length, longest first, and return the same list."""
        rows.sort(key=lambda t: len(t[0]), reverse=True)
        return rows

    def path(self):
        """Return the current working directory."""
        return os.getcwd()

    def get_dict_path(self):
        """Return the dictionary folder ``<cwd>/additionals/dictionaries``."""
        return os.path.join(self.path(), "additionals", "dictionaries")
class IsriHandler(object):
    """Wrapper that runs external evaluation programs through ``call``.

    Each public method assembles the argument list for one command line
    program (``accuracy``, ``synctext``, ``accsum``, ...) and executes it.
    Where a method takes an optional output path, the program's stdout is
    redirected into that file. Any exception raised while building or
    running a command is caught and reported through ``self.cpr.printex``;
    the methods themselves return ``None``.

    NOTE(review): the program names look like the ISRI analytic tools for
    OCR evaluation - confirm against the project documentation.
    """

    def __init__(self):
        """Read the configuration and set up conditional printing.

        Raises:
            OSError: when ``os.name`` is neither ``'linux'`` nor ``'posix'``.
        """
        self.os = os.name.lower()
        config_handler = ConfigurationHandler(first_init=False)
        self.config = config_handler.get_config()

        if 'ExceptionInitializing' in self.config:
            print("Exception initializing config, don't print")
            self.cpr = ConditionalPrint(False, False, False)
        else:
            self.cpr = ConditionalPrint(self.config.PRINT_MSA_HANDLER,
                                        self.config.PRINT_EXCEPTION_LEVEL,
                                        self.config.PRINT_WARNING_LEVEL)

        if self.os != 'linux' and self.os != 'posix':
            raise OSError(
                "Untested operating system adapt code and continue at own risk"
            )

    def _call_tool(self, calls, path_stdout=None):
        """Run ``calls``; redirect stdout into ``path_stdout`` when given.

        The output file is closed even when ``call`` raises, so a missing
        program no longer leaks the file handle. Exceptions propagate to
        the calling method, which reports them.
        """
        if path_stdout is None:
            call(calls)
            return
        with self.create_file_if_doesnt_exist(path_stdout, True) as filehandle:
            call(calls, stdout=filehandle)

    def accuracy(self, path_correctfile, path_generatedfile,
                 path_accuracy_report=""):
        """Run ``accuracy`` on a correct file and a generated file.

        ``path_accuracy_report`` is only passed to the program when it is
        non-empty; an empty string would otherwise be handed over as an
        (invalid) third argument.
        """
        try:
            calls = ["accuracy", path_correctfile, path_generatedfile]
            if path_accuracy_report:
                calls.append(path_accuracy_report)
            call(calls)
        except Exception as ex:
            self.cpr.printex("Exception calling accuracy", ex)

    class SynctextConfig(object):
        """Collects command line flags for ``synctext``."""

        def __init__(self):
            self._used_config_acc = []

        def use_T_algorithm(self):
            self._used_config_acc.append("-T")

        def use_H_algorithm(self):
            self._used_config_acc.append("-H")

        def use_case_insensitive(self):
            self._used_config_acc.append("-i")

        def use_display_suspect_markers_in_output(self):
            self._used_config_acc.append("-s")

        def get_used_config(self):
            return self._used_config_acc

        def clear_used_config(self):
            self._used_config_acc = []

    def synctext(self, filepaths, path_generatedfile=None,
                 synctext_config=None):
        """Run ``synctext`` with the configured flags on ``filepaths``.

        Args:
            filepaths: iterable of input file paths.
            path_generatedfile: optional file receiving the program's stdout.
            synctext_config: optional ``SynctextConfig``; a fresh, empty
                configuration is used when omitted.
        """
        try:
            if synctext_config is None:
                # created per call instead of once at definition time
                synctext_config = self.SynctextConfig()
            calls = ["synctext"]
            calls.extend(synctext_config.get_used_config())
            calls.extend(filepaths)
            self._call_tool(calls, path_generatedfile)
        except Exception as ex:
            self.cpr.printex("Exception calling synctext", ex)

    def accsum(self, filepaths_accreports, path_generatedfile=None):
        """Run ``accsum`` on the given accuracy reports."""
        try:
            calls = ["accsum"]
            calls.extend(filepaths_accreports)
            self._call_tool(calls, path_generatedfile)
        except Exception as ex:
            self.cpr.printex("Exception calling accsum", ex)

    def groupacc(self, path_groupfile, path_accuracy_report,
                 path_groupacc_report=None):
        """Run ``groupacc``; the optional report path is passed as argument.

        Unlike the stdout-redirecting methods, the report file is created
        (and emptied) up front and its path is appended to the arguments.
        """
        try:
            calls = ["groupacc", path_groupfile, path_accuracy_report]
            if path_groupacc_report is not None:
                self.create_file_if_doesnt_exist(
                    path_groupacc_report, True).close()
                calls.append(path_groupacc_report)
            call(calls)
        except Exception as ex:
            self.cpr.printex("Exception calling groupacc", ex)

    def accdist(self, filepaths_accreports, path_generated_xyfile=None):
        """Run ``accdist`` on the given accuracy reports."""
        try:
            calls = ["accdist"]
            calls.extend(filepaths_accreports)
            self._call_tool(calls, path_generated_xyfile)
        except Exception as ex:
            self.cpr.printex("Exception calling accdist", ex)

    class NGramConfig(object):
        """Collects command line flags for ``ngram``."""

        def __init__(self):
            self._used_config_acc = []

        def set_ngram_size(self, number):
            """Add ``-n <number>``; values outside 1..3 are ignored."""
            if 1 <= number <= 3:
                self._used_config_acc.append("-n")
                self._used_config_acc.append(str(number))

        def clear_used_config(self):
            self._used_config_acc = []

        def get_used_config(self):
            return self._used_config_acc

    def ngram(self, filepaths, path_generatedfile=None, ngram_config=None):
        """Run ``ngram`` with the configured flags on ``filepaths``.

        A fresh, empty ``NGramConfig`` is used when ``ngram_config`` is
        omitted.
        """
        try:
            if ngram_config is None:
                ngram_config = self.NGramConfig()
            calls = ["ngram"]
            calls.extend(ngram_config.get_used_config())
            calls.extend(filepaths)
            self._call_tool(calls, path_generatedfile)
        except Exception as ex:
            self.cpr.printex("Exception calling ngram", ex)

    class VoteConfig(object):
        """Collects command line flags for ``vote``."""

        def __init__(self):
            self._used_config_acc = []

        def enable_O_optimization(self):
            self._used_config_acc.append("-O")

        def set_s(self, fraction_counter, fraction_denominator):
            """Add ``-s <counter>/<denominator>``; accepts str or int parts."""
            self._used_config_acc.append("-s")
            self._used_config_acc.append(
                str(fraction_counter) + "/" + str(fraction_denominator))

        def set_w(self, fraction_counter, fraction_denominator):
            """Add ``-w <counter>/<denominator>``; accepts str or int parts."""
            self._used_config_acc.append("-w")
            self._used_config_acc.append(
                str(fraction_counter) + "/" + str(fraction_denominator))

        def set_output_file(self, path_outputfile):
            self._used_config_acc.append("-o")
            self._used_config_acc.append(path_outputfile)  # ok?

        def clear_used_config(self):
            self._used_config_acc = []

        def get_used_config(self):
            return self._used_config_acc

    def vote(self, filepaths, ngram_config=None):
        """Run ``vote`` with the configured flags on ``filepaths``.

        ``ngram_config`` is expected to be a ``VoteConfig`` (the parameter
        name is kept for backward compatibility); a fresh, empty one is
        used when omitted.
        """
        try:
            if ngram_config is None:
                ngram_config = self.VoteConfig()
            calls = ["vote"]
            calls.extend(ngram_config.get_used_config())
            calls.extend(filepaths)
            call(calls)
        except Exception as ex:
            self.cpr.printex("Exception calling vote", ex)

    def wordacc(self, path_correctfile, path_comparison_file,
                path_stopwordfile=None, path_wordacc_report=None):
        """Run ``wordacc``; stopword file and report path are optional."""
        try:
            calls = ["wordacc"]
            if path_stopwordfile is not None:
                calls.append("-S")
                calls.append(path_stopwordfile)
            calls.append(path_correctfile)
            calls.append(path_comparison_file)
            if path_wordacc_report is not None:
                calls.append(path_wordacc_report)
            call(calls)
        except Exception as ex:
            self.cpr.printex("Exception calling wordacc", ex)

    def wordaccsum(self, filepaths_wordacc_reports, path_accsumreport=None):
        """Run ``wordaccsum`` on the given word accuracy reports."""
        try:
            calls = ["wordaccsum"]
            calls.extend(filepaths_wordacc_reports)
            self._call_tool(calls, path_accsumreport)
        except Exception as ex:
            self.cpr.printex("Exception calling wordaccsum", ex)

    def nonstopacc(self, path_stopwordfile, path_wordacc_report,
                   path_output_xyfile=None):
        """Run ``nonstopacc`` with a stopword file and a wordacc report."""
        try:
            calls = ["nonstopacc", path_stopwordfile, path_wordacc_report]
            self._call_tool(calls, path_output_xyfile)
        except Exception as ex:
            self.cpr.printex("Exception calling nonstopacc", ex)

    def wordaccci(self, filepaths_wordacc_reports, path_outputfile=None):
        """Run ``wordaccci`` on the given word accuracy reports."""
        try:
            calls = ["wordaccci"]
            calls.extend(filepaths_wordacc_reports)
            self._call_tool(calls, path_outputfile)
        except Exception as ex:
            self.cpr.printex("Exception calling wordaccci", ex)

    def wordaccdist(self, filepaths_wordacc_reports, path_output_xyfile=None):
        """Run ``wordaccdist`` on the given word accuracy reports."""
        try:
            calls = ["wordaccdist"]
            calls.extend(filepaths_wordacc_reports)
            self._call_tool(calls, path_output_xyfile)
        except Exception as ex:
            self.cpr.printex("Exception calling wordaccdist", ex)

    def wordfreq(self, filepaths_inputtext, path_resultfile=None):
        """Run ``wordfreq`` on the given input text files."""
        try:
            calls = ["wordfreq"]
            calls.extend(filepaths_inputtext)
            self._call_tool(calls, path_resultfile)
        except Exception as ex:
            self.cpr.printex("Exception calling wordfreq", ex)

    # todo add the zoning programs some day: point 4 in doc

    def editop(self, path_correctfile, path_comparison_file,
               path_editop_report=None):
        """Run ``editop``; the report path is passed as optional argument."""
        try:
            calls = ["editop", path_correctfile, path_comparison_file]
            if path_editop_report is not None:
                calls.append(path_editop_report)
            call(calls)
        except Exception as ex:
            self.cpr.printex("Exception calling editop", ex)

    def editopsum(self, filepaths_editopreports, path_summed_report=None):
        """Run ``editopsum`` on the given editop reports."""
        try:
            calls = ["editopsum"]
            calls.extend(filepaths_editopreports)
            self._call_tool(calls, path_summed_report)
        except Exception as ex:
            self.cpr.printex("Exception calling editopsum", ex)

    def editopcost(self, path_editop_report, path_editop_report2=None,
                   path_output_xyfile=None):
        """Run ``editopcost`` on one or two editop reports."""
        try:
            calls = ["editopcost", path_editop_report]
            if path_editop_report2 is not None:
                calls.append(path_editop_report2)
            self._call_tool(calls, path_output_xyfile)
        except Exception as ex:
            self.cpr.printex("Exception calling editopcost", ex)

    def create_file_if_doesnt_exist(self, filepath, overwrite=False):
        """Open ``filepath`` for reading and writing, creating it if needed.

        Args:
            filepath: path of the file to open.
            overwrite: when true the file is emptied; otherwise existing
                content is kept and writes are appended.

        Returns:
            The open file object; the caller is responsible for closing it.
        """
        # 'w+' truncates on open, 'a+' creates the file without touching
        # content that is already there
        mode = 'w+' if overwrite else 'a+'
        return open(filepath, mode)

    def delete_file_content(self, pfile):
        """Empty the already opened file ``pfile``."""
        pfile.seek(0)
        pfile.truncate()