class AkfParsingFunctionsTablesOne(object):
    """Parsing hooks for table-type AKF segments (share prices, dividends).

    All three public ``parse_*`` hooks currently do the same two things:
    register the segment start element via ``cf.add_check_element`` and
    forward the content texts to the output analyzer.  The shared body
    lives in :meth:`_register_and_log`; the public signatures are kept
    identical so the segment-parser dispatch keeps working unchanged.
    """

    def __init__(self, endobject_factory, output_analyzer, dictionary_handler):
        # pull the already-initialized global configuration
        config_handler = ConfigurationHandler(first_init=False)
        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_SEGMENT_PARSER_AKF_FN_TABLES_ONE,
                                    self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL,
                                    leading_tag=self.__class__.__name__)
        self.cpr.print("init akf parsing functions tables one")
        self.ef = endobject_factory
        self.output_analyzer = output_analyzer
        self.dictionary_handler = dictionary_handler

    def _register_and_log(self, real_start_tag, content_texts, segmentation_class):
        """Shared body of the parse hooks: register element, then log texts.

        ``cf.add_check_element`` may rewrite ``content_texts``; the logged
        texts are the rewritten ones, matching the original per-method code.
        """
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag,
                                 segmentation_class, element_counter)
        # logme
        self.output_analyzer.log_segment_information(
            segmentation_class.segment_tag, content_texts, real_start_tag)

    def parse_aktienkurse(self, real_start_tag, content_texts, content_lines,
                          feature_lines, segmentation_class):
        """Parse an 'Aktienkurse' (share price) table segment."""
        self._register_and_log(real_start_tag, content_texts, segmentation_class)

    def parse_dividenden(self, real_start_tag, content_texts, content_lines,
                         feature_lines, segmentation_class):
        """Parse a 'Dividenden' (dividends) table segment."""
        self._register_and_log(real_start_tag, content_texts, segmentation_class)

    def parse_dividenden_auf_xyaktien(self, real_start_tag, content_texts,
                                      content_lines, feature_lines,
                                      segmentation_class):
        """Parse a 'Dividenden auf XY-Aktien' (dividends per share class) segment."""
        self._register_and_log(real_start_tag, content_texts, segmentation_class)
class AkfParsingFunctionsOne(object):
    """First batch of AKF segment parsers (company name, seat, boards, phone...).

    Each ``parse_*`` hook receives the raw segment texts, registers the
    segment via ``cf.add_check_element`` and writes the extracted fields
    into the end-object factory ``self.ef``.
    """

    def __init__(self, endobject_factory, output_analyzer, dictionary_handler):
        # pull the already-initialized global configuration
        config_handler = ConfigurationHandler(first_init=False)
        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_SEGMENT_PARSER_AKF_FN_ONE, self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL, leading_tag=self.__class__.__name__)
        self.cpr.print("init akf parsing functions one")
        self.ef = endobject_factory
        self.output_analyzer = output_analyzer
        self.dictionary_handler = dictionary_handler

    def parse_firmenname(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
        """Join all content texts into one company-name string ('Firmenname')."""
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        # get relevant info: accumulate all text fragments into one string
        accumulated_text = ""
        for text in content_texts:
            accumulated_text += " " + text

        # NOTE: the name is stored even when empty (only_filled=False)
        only_add_if_value = False
        accumulated_text = accumulated_text.strip()
        self.ef.add_to_my_obj("Firmenname", accumulated_text,
                              object_number=element_counter, only_filled=only_add_if_value)

    def parse_sitz(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
        """Parse the 'Sitz' (registered seat/address) segment.

        Example of the produced structure::

            "Sitz": [
                {
                    "origpost": "Mergenthalerallee 79-81, 65760 Eschborn Telefon:(069) 7 50 06-0 ...",
                    "type": "Sitz",
                    "street": "Mergenthalerallee",
                    "street_number": "79-81",
                    "zip": "65760",
                    "city": "Eschborn",
                    "phone": "(069) 7 50 06-0",
                    "fax": "(069) 7 50 06-111",
                    "email": ["*****@*****.**"],
                    "www": ["http://www.3u.net"]
                }
            ],
        """
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        # get relevant info: split reduced post into id/location parts
        num_id, city, street, street_number, additional_info = cf.parse_id_location(origpost_red)

        # add stuff to ef; empty fields are skipped (only_filled=True)
        only_add_if_value = True
        self.ef.add_to_my_obj("numID", num_id, object_number=element_counter, only_filled=only_add_if_value)
        self.ef.add_to_my_obj("city", city, object_number=element_counter, only_filled=only_add_if_value)
        self.ef.add_to_my_obj("street", street, object_number=element_counter, only_filled=only_add_if_value)
        self.ef.add_to_my_obj("street_number", street_number, object_number=element_counter, only_filled=only_add_if_value)
        self.ef.add_to_my_obj("additional_info", additional_info, object_number=element_counter, only_filled=only_add_if_value)
        return True

    def parse_verwaltung(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
        """Parse the 'Verwaltung' family of segments.

        Dispatches on the start tag: 'Verwaltungsrat' (person list),
        'Verw.' (address) or plain 'Verwaltung' (generic key/value info).
        """
        # kmy_obj_2 = self.ef.print_me_and_return()
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        # logme
        # self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)

        if "srat" in real_start_tag:
            # Verwaltungsrat .. board of directors: parse a list of persons
            persons_final = cf.parse_persons(origpost_red, self.dictionary_handler,
                                             self.config.USE_DICTIONARIES_FOR_PERSON_PARSING)
            only_add_if_filed = True
            for entry in persons_final:
                name, first_name, last_name, city, title, funct, rest_info = entry
                self.ef.add_to_my_obj("name", name, object_number=element_counter, only_filled=only_add_if_filed)
                self.ef.add_to_my_obj("first_name", first_name, object_number=element_counter, only_filled=only_add_if_filed)
                self.ef.add_to_my_obj("last_name", last_name, object_number=element_counter, only_filled=only_add_if_filed)
                self.ef.add_to_my_obj("city", city, object_number=element_counter, only_filled=only_add_if_filed)
                self.ef.add_to_my_obj("title", title, object_number=element_counter, only_filled=only_add_if_filed)
                self.ef.add_to_my_obj("rest", rest_info, object_number=element_counter, only_filled=only_add_if_filed)
                self.ef.add_to_my_obj("funct", funct, object_number=element_counter, only_filled=only_add_if_filed)
                element_counter += 1
            return True
        elif "Verw." in real_start_tag:
            # Verw. -- abbreviated tag carries an address, like 'Sitz'
            num_id, city, street, street_number, additional_info = cf.parse_id_location(origpost_red)

            # add stuff to ef
            only_add_if_value = True
            self.ef.add_to_my_obj("numID", num_id, object_number=element_counter, only_filled=only_add_if_value)
            self.ef.add_to_my_obj("city", city, object_number=element_counter, only_filled=only_add_if_value)
            self.ef.add_to_my_obj("street", street, object_number=element_counter, only_filled=only_add_if_value)
            self.ef.add_to_my_obj("street_number", street_number, object_number=element_counter, only_filled=only_add_if_value)
            self.ef.add_to_my_obj("additional_info", additional_info, object_number=element_counter, only_filled=only_add_if_value)
            return True
        else:
            # Verwaltung -- generic fallback: store key/value pairs as found
            final_items = cf.parse_general_and_keys(content_texts,
                                                    join_separated_lines=False,
                                                    current_key_initial_value="General_Info")
            for key in final_items.keys():
                value = final_items[key]
                if value is None or value == "":
                    continue
                self.ef.add_to_my_obj(key, value, object_number=element_counter, only_filled=True)
                element_counter += 1
            return True

    def parse_telefon_fernruf(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
        """Parse the 'Telefon/Fernruf' segment into (location, number) pairs.

        Strategy: first cut out known special layouts (Verwaltung/Betriebshof,
        Orts-/Ferngespräche, Orts-/Fernverkehr), then pick plain numbers out of
        comma/'u.'/'und'-separated pieces, then split the remainder on
        separators and match '<location> <numbers>' per piece.  Whatever text
        survives all passes is stored as 'additional_info_unparsed'.
        """
        # get basic data
        origpost, origpost_red, element_counter, content_texts = cf.add_check_element(
            self, content_texts, real_start_tag, segmentation_class, 0)

        # do special match: Verwaltung und Betriebshof
        split_post = []
        match_special = regex.match(r"(?<Verw>Verwaltung.*)"
                                    r"(?<Betr>Betriebshof.*)", origpost_red)
        if match_special:
            betriebshof = match_special.group("Betr")
            verwaltung = match_special.group("Verw")
            origpost_red = origpost_red.replace(betriebshof, "")
            origpost_red = origpost_red.replace(verwaltung, "")
            split_post.append(betriebshof)
            split_post.append(verwaltung)

        # do special match: Ortsgespräche and Ferngespräche (local/long-distance calls)
        match_special2 = regex.match(r"(?<og>Ortsgespräche.*)"
                                     r"(?<fg>Ferngespräche.*)", origpost_red)
        if match_special2:
            ortsgespr = match_special2.group("og")
            ferngespr = match_special2.group("fg")
            origpost_red = origpost_red.replace(ortsgespr, "")
            origpost_red = origpost_red.replace(ferngespr, "")
            split_post.append(ortsgespr)
            split_post.append(ferngespr)

        # do special match: Ortsverkehr and Fernverkehr (local/long-distance traffic)
        match_special3 = regex.match(r"(?<ov>Ortsverkehr.*)"
                                     r"(?<fv>Fernverkehr.*)", origpost_red)
        if match_special3:
            ortsverkehr = match_special3.group("ov")
            fernverkehr = match_special3.group("fv")
            origpost_red = origpost_red.replace(ortsverkehr, "")
            origpost_red = origpost_red.replace(fernverkehr, "")
            split_post.append(ortsverkehr)
            split_post.append(fernverkehr)

        # do special match: check if only numbers
        origpost_red_new = origpost_red
        #only_num_check = origpost_red.replace("und", "").replace(",", "").replace(" ", "")
        test_split = regex.split("\su\.|\sund\s|,|;", origpost_red)
        for number in test_split:
            # additional parenthesis block: '(...)' is treated as an area code ('vorwahl')
            match_parenthesis = regex.search("\(.*\)", number)
            parenthesis = None
            if match_parenthesis:
                parenthesis = match_parenthesis.group()
                number = number.replace(parenthesis, "")  # remove number
                self.ef.add_to_my_obj("vorwahl", parenthesis, object_number=element_counter, only_filled=True)

            match_word_num = regex.search("(?<word>[^\d]*)(?<num>[\d\s\-/]*)", number)
            if match_word_num is None:
                continue
            word = match_word_num.group("word")
            num = match_word_num.group("num")
            # skip 'Sa.-Nr.' label fragments here; they are handled in the second pass
            if "Sa." in word and "Nr" in word:
                continue
            number_stripped = num.strip(" ./").replace("/", "").replace("-", "").replace(" ", "")
            if number_stripped.isdigit():
                origpost_red_new = origpost_red_new.replace(number, "")  # remove number
                origpost_red_new = origpost_red_new.replace(word, "")  # remove word found
                change1 = self.ef.add_to_my_obj("number_Sa.-Nr.", num.strip(),
                                                object_number=element_counter, only_filled=True)
                change2 = self.ef.add_to_my_obj("location", word.strip(),
                                                object_number=element_counter, only_filled=True)
                if change1 or change2:
                    element_counter += 1

        #if "32 20 47" in origpost_red:
        #    print("asd")

        origpost_red = origpost_red_new
        # substitute in a separator char to integrate delimiters in next step
        origpost_red = regex.sub(r"(\d\.)", r"\1~~~~", origpost_red)

        # do further matches (sc-separated)
        split_post.extend(regex.split(';|~~~~|\su\.', origpost_red))

        for index, entry in enumerate(split_post):
            if entry is None:
                continue
            entry_stripped = entry.strip()
            if entry_stripped == "":
                continue

            # additional parenthesis block: area code again
            match_parenthesis = regex.search("\(.*\)", entry_stripped)
            parenthesis = None
            if match_parenthesis:
                parenthesis = match_parenthesis.group()
                entry_stripped = entry_stripped.replace(parenthesis, "")  # remove entry
                self.ef.add_to_my_obj("vorwahl", parenthesis, object_number=element_counter, only_filled=True)

            # '<non-digits (Tag)><digits and separators (Numbers)>'
            match_word = regex.match(r"(?<Tag>\D*)"
                                     r"(?<Numbers>[\d\s\W]*)", entry_stripped)
            if match_word is not None:
                # fetch match results
                tag_match = match_word.group("Tag")
                numbers_match = match_word.group("Numbers")
                rest_from_entry_str = entry_stripped.replace(tag_match, "", 1)
                rest_from_entry_str = rest_from_entry_str.replace(numbers_match, "", 1)
                tag = dh.strip_if_not_none(tag_match, "")
                # strip an embedded 'Sa.-Nr.' label out of the tag, keep the rest as location
                match_tag = regex.match(r"(?<rest_bef>.*)(?<sanr>Sa\.?\-Nr\.?)(?<rest_end>.*)", tag)
                location = ""
                if match_tag is not None:
                    rest_tag = match_tag.group('rest_bef')
                    rest_tag_2 = match_tag.group('rest_end')
                    # sanr = match_tag.group('sanr')  # this is the filtered group
                    location = dh.strip_if_not_none(rest_tag + " " + rest_tag_2, ":., ")
                else:
                    # if there are no real descriptors in tag then tag is usually location (like Düsseldorf 1 36 62.)
                    location = tag

                if "und" in location:
                    location = regex.sub("[^\w]und[^\w]", "", location)

                number = dh.strip_if_not_none(numbers_match, "., ")
                self.ef.add_to_my_obj("number_Sa.-Nr.", number.strip(),
                                      object_number=element_counter, only_filled=True)
                self.ef.add_to_my_obj("location", location.strip(),
                                      object_number=element_counter, only_filled=True)
                additional_info_entry_level = dh.strip_if_not_none(rest_from_entry_str, ",. ")
                self.ef.add_to_my_obj("additional_info", additional_info_entry_level.strip(),
                                      object_number=element_counter, only_filled=True)
                element_counter += 1
                # remove the consumed parts from the running remainder
                origpost_red = origpost_red.replace(number, "", 1)
                origpost_red = origpost_red.replace(location, "", 1)

        # whatever survived all passes (minus labels/markers) is unparsed info
        origpost_red = origpost_red.replace("Sa.-Nr", "").replace("~~~~", "")
        origpost_red_end = dh.remove_multiple_outbound_chars(origpost_red)

        if len(origpost_red_end) > 3:
            self.ef.add_to_my_obj("additional_info_unparsed", origpost_red_end.strip(),
                                  object_number=element_counter)

    def parse_vorstand(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
        """Parse the 'Vorstand' (executive board) segment as a person list."""
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        persons_final = cf.parse_persons(origpost_red, self.dictionary_handler,
                                         self.config.USE_DICTIONARIES_FOR_PERSON_PARSING)
        only_add_if_filed = True
        for entry in persons_final:
            name, first_name, last_name, city, title, funct, rest_info = entry
            self.ef.add_to_my_obj("name", name, object_number=element_counter, only_filled=only_add_if_filed)
            self.ef.add_to_my_obj("first_name", first_name, object_number=element_counter, only_filled=only_add_if_filed)
            self.ef.add_to_my_obj("last_name", last_name, object_number=element_counter, only_filled=only_add_if_filed)
            self.ef.add_to_my_obj("city", city, object_number=element_counter, only_filled=only_add_if_filed)
            self.ef.add_to_my_obj("title", title, object_number=element_counter, only_filled=only_add_if_filed)
            self.ef.add_to_my_obj("rest", rest_info, object_number=element_counter, only_filled=only_add_if_filed)
            self.ef.add_to_my_obj("funct", funct, object_number=element_counter, only_filled=only_add_if_filed)
            element_counter += 1

        # NOTE(review): dead code kept below as a string literal (previous
        # ';'-separated parsing approach, superseded by cf.parse_persons)
        """
        # do matches (;-separated)
        split_post = origpost_red.split(';')
        for index, entry in enumerate(split_post):
            entry_stripped = entry.strip()
            if index == len(split_post)-1:
                matchend = regex.match("^[Aa]lle", entry_stripped)
                if matchend:
                    self.ef.add_to_my_obj("additional_info", entry_stripped, object_number=element_counter)
                    element_counter += 1
                    continue
            match = regex.match(r"(?<Name>.*)[,]"      # find location string
                                r"(?<Rest>.*+)",       # just get the rest which is usually streetname and number, but has other possibilities
                                entry_stripped)
            if match is None:
                name = dh.strip_if_not_none(entry_stripped, ", ")
                self.ef.add_to_my_obj("name", name, object_number=element_counter)
                element_counter += 1
                continue
            name = dh.strip_if_not_none(match.group("Name"), ", ")
            rest = dh.strip_if_not_none(match.group("Rest"), ",. ")
            name_split = name.split(',')
            if len(name_split) > 1:
                position = rest
                name = name_split[0]
                city = name_split[1]
            else:
                city = rest
                position = ""
            self.ef.add_to_my_obj("name", name, object_number=element_counter)
            self.ef.add_to_my_obj("city", city, object_number=element_counter)
            self.ef.add_to_my_obj("position", position, object_number=element_counter)
            element_counter += 1
        """
        return True

    def parse_aufsichtsrat(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
        """Parse the 'Aufsichtsrat' (supervisory board) segment as a person list."""
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        # Try to fix +) problems (OCR artifacts around footnote markers)
        origpost_red = origpost_red.replace("; +)", "+);").replace(";+)", "+);").replace("')", "").replace("*)", "")

        persons_final = cf.parse_persons(origpost_red, self.dictionary_handler,
                                         self.config.USE_DICTIONARIES_FOR_PERSON_PARSING)
        only_add_if_filed = True
        for entry in persons_final:
            name, first_name, last_name, city, title, funct, rest_info = entry
            self.ef.add_to_my_obj("name", name, object_number=element_counter, only_filled=only_add_if_filed)
            self.ef.add_to_my_obj("first_name", first_name, object_number=element_counter, only_filled=only_add_if_filed)
            self.ef.add_to_my_obj("last_name", last_name, object_number=element_counter, only_filled=only_add_if_filed)
            self.ef.add_to_my_obj("city", city, object_number=element_counter, only_filled=only_add_if_filed)
            self.ef.add_to_my_obj("title", title, object_number=element_counter, only_filled=only_add_if_filed)
            self.ef.add_to_my_obj("rest", rest_info, object_number=element_counter, only_filled=only_add_if_filed)
            self.ef.add_to_my_obj("funct", funct, object_number=element_counter, only_filled=only_add_if_filed)
            element_counter += 1
        return True

    def parse_arbeitnehmervertreter(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
        """Parse the 'Arbeitnehmervertreter' (employee representatives) segment."""
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        persons_final = cf.parse_persons(origpost_red, self.dictionary_handler,
                                         self.config.USE_DICTIONARIES_FOR_PERSON_PARSING)
        only_add_if_filed = True
        for entry in persons_final:
            name, first_name, last_name, city, title, funct, rest_info = entry
            self.ef.add_to_my_obj("name", name, object_number=element_counter, only_filled=only_add_if_filed)
            self.ef.add_to_my_obj("first_name", first_name, object_number=element_counter, only_filled=only_add_if_filed)
            self.ef.add_to_my_obj("last_name", last_name, object_number=element_counter, only_filled=only_add_if_filed)
            self.ef.add_to_my_obj("city", city, object_number=element_counter, only_filled=only_add_if_filed)
            self.ef.add_to_my_obj("title", title, object_number=element_counter, only_filled=only_add_if_filed)
            self.ef.add_to_my_obj("rest", rest_info, object_number=element_counter, only_filled=only_add_if_filed)
            self.ef.add_to_my_obj("funct", funct, object_number=element_counter, only_filled=only_add_if_filed)
            element_counter += 1
        return True

    # Gruendung (founding year)
    def parse_gruendung(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
        """Parse the 'Gründung' segment: leading digits become 'year', rest is 'rest_info'."""
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        match_year = regex.search("^\d*", origpost_red.strip())
        if match_year:
            result = match_year.group()
            origpost_red_new = origpost_red.replace(result, "", 1)
            year = dh.strip_if_not_none(result, ".,() ")
            rest_info = dh.strip_if_not_none(origpost_red_new, ".,() ")
            self.ef.add_to_my_obj("rest_info", rest_info, object_number=element_counter, only_filled=True)
            self.ef.add_to_my_obj("year", year, object_number=element_counter, only_filled=True)
        else:
            rest_info = dh.strip_if_not_none(origpost_red, ".,() ")
            self.ef.add_to_my_obj("rest_info", rest_info, object_number=element_counter, only_filled=True)

    # Tätigkeitsgebiet (field of activity)
    def parse_taetigkeitsgebiet(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
        """Parse the 'Tätigkeitsgebiet' segment as generic key/value info."""
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        final_items = cf.parse_general_and_keys(content_texts,
                                                join_separated_lines=False,
                                                current_key_initial_value="General_Info")
        for key in final_items.keys():
            value = final_items[key]
            if value is None or len(value) == 0:
                continue
            self.ef.add_to_my_obj(key, value, object_number=element_counter, only_filled=True)
            element_counter += 1
class TableHandler(object):
    """Heuristic recognizer that decides whether an OCR line belongs to a table.

    The decision is based on character-class ratios (digits vs. letters vs.
    special characters), word statistics and the horizontal gaps between the
    word bounding boxes of the line.
    """

    def __init__(self):
        config_handler = ConfigurationHandler(first_init=False)
        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_TABLE_HANDLER, self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL)
        self.PRINT_TO_CHECKFILE = False
        # a line starting with these words can't be in a table
        self.filter_start_words = ["Fernruf:", "Vorstand:", "Fernschreiber:", "von", "Gründung:",
                                   "Ordnungsnr.", "Ordnungsnr", "Grundkapital:", "Umstellung"]
        # with open("checkfile_tables.txt", "w") as myfile:
        #     myfile.write("----" + "\n")

    def recognize_a_line(self, line):
        """Return True when *line* looks like a table row, else False.

        *line* is expected to expose ``textstr``, ``word`` and ``data``
        mappings (project OCR line object — verify against caller).
        """
        # guard against sentinel values; identity checks per PEP 8
        if line is None or line is False or line is True or line.textstr is None:
            return False

        whole_text = line.textstr
        self.cpr.print("recognizing line:", whole_text)

        # counters
        counter_special_chars = 0
        counter_alphanumerical_chars = 0
        counter_numbers = 0
        counter_chars = len(whole_text)
        counter_alphabetical = 0
        counter_words = 0
        counters_alphabetical_ratios = []
        counters_wordlengths = []
        counters_numbers = []
        character_index = 0

        # special conditions
        ultimo_is_first_word = False
        first_word_no_table_indicator = False
        starts_with_parenthesis = False
        ends_with_parenthesis = False

        last_xstop = 0
        x_box_sizes = []
        x_gaps = []

        for key_index, key in enumerate(line.word['text']):
            word = line.word['text'][key]
            uid_info = line.word['UID'][key]
            word_xstart = line.data['word_x0'][character_index]
            word_xstop = line.data['word_x1'][character_index]
            word_box_size = word_xstop - word_xstart
            x_box_sizes.append(word_box_size)

            if key_index >= 1:
                # NOTE(review): gap measured stop-to-stop, not start-to-stop —
                # kept as in the original implementation
                x_gap = word_xstop - last_xstop
                x_gaps.append(x_gap)

            # line.data['word_x0']
            if word is None or word == "":
                continue

            if key_index == 0:
                if word in self.filter_start_words:
                    first_word_no_table_indicator = True
                if word.lower() == "ultimo":
                    ultimo_is_first_word = True
                if word[0] == "(":
                    starts_with_parenthesis = True

            if key_index == len(line.word['text']) - 1:
                if word[-1] == ")":
                    ends_with_parenthesis = True

            # per-word counters
            counter_alphabetical_chars_word = 0
            counter_alphanumerical_chars_word = 0
            counter_numbers_word = 0
            counter_words += 1

            for char in list(word):
                if Random.is_special_character(char):
                    counter_special_chars += 1
                elif Random.is_alphanumerical_character(char):
                    counter_alphanumerical_chars += 1
                    counter_alphanumerical_chars_word += 1
                if char.isdigit():
                    counter_numbers += 1
                    counter_numbers_word += 1

            counter_alphabetical_word = counter_alphanumerical_chars_word - counter_numbers_word
            ratio_alphabetical_word = np.round(counter_alphabetical_word / len(word), 2)
            counters_alphabetical_ratios.append(ratio_alphabetical_word)
            counters_wordlengths.append(len(word))
            counters_numbers.append(counter_numbers_word)
            character_index += len(uid_info)
            last_xstop = word_xstop

        # get number of spaces
        len_whole_unspace = len(whole_text.replace(" ", ""))
        counter_spaces = counter_chars - len_whole_unspace
        # set alphabetical counter
        counter_alphabetical = counter_alphanumerical_chars - counter_numbers

        if counter_chars == 0:
            self.cpr.printw("no chars shouldn't happen, no recognizion")
            return False

        special_chars_ratio = counter_special_chars / counter_chars
        alphanumerical_chars_ratio = counter_alphanumerical_chars / counter_chars
        alphabetical_ratio = counter_alphabetical / counter_chars
        spaces_ratio = counter_spaces / counter_chars
        numbers_ratio = counter_numbers / counter_chars

        # gap statistics stay None for lines with fewer than two words
        maximum_x_gap = None
        mean_x_gap = None
        median_x_gap = None
        if len(x_gaps) >= 1:
            maximum_x_gap = max(x_gaps)
            mean_x_gap = np.mean(x_gaps)
            median_x_gap = np.median(x_gaps)

        many_numbers_in_first_word = False
        many_alphabetical_in_middle_words = False
        many_alphabetical_in_last_word = False

        # check some middle and last word conditions
        for counter_index, counter in enumerate(counters_wordlengths):
            if counter_index == 0:
                ctr_numbers = counters_numbers[counter_index]
                numbers_ratio_word = np.round(ctr_numbers / counter, 2)
                if numbers_ratio_word > 0.8:
                    many_numbers_in_first_word = True
            elif counter_index == len(counters_wordlengths) - 1:
                if counter >= 4:
                    alphabetical_ratio_word = counters_alphabetical_ratios[counter_index]
                    if alphabetical_ratio_word >= 0.75:
                        many_alphabetical_in_last_word = True
            else:
                if counter >= 4:
                    alphabetical_ratio_word = counters_alphabetical_ratios[counter_index]
                    if alphabetical_ratio_word >= 0.75:
                        many_alphabetical_in_middle_words = True

        self.cpr.print("alle cntr:", counter_chars)
        self.cpr.print("spec cntr:", counter_special_chars, "ratio", special_chars_ratio)
        self.cpr.print("alnr cntr:", counter_alphanumerical_chars, "ratio", alphanumerical_chars_ratio)
        self.cpr.print("albt cntr:", counter_alphabetical, "ratio", alphabetical_ratio)
        self.cpr.print("spce cntr:", counter_spaces, "ratio", spaces_ratio)
        self.cpr.print("nmbr cntr:", counter_numbers, "ratio", numbers_ratio)
        self.cpr.print("x_box_sizes", x_box_sizes)
        self.cpr.print("x_gaps", x_gaps)
        self.cpr.print("x_gap_max_size", maximum_x_gap)
        self.cpr.print("x_gaps_mean", mean_x_gap)
        self.cpr.print("x_gaps_median", median_x_gap)

        # main decision: number-heavy multi-word line that is not fully
        # parenthesized, or a line starting with 'ultimo'
        if ((alphabetical_ratio < 0.75 and
             numbers_ratio > 0.2 and
             counter_chars > 5 and
             counter_words >= 2) and not
            (starts_with_parenthesis and ends_with_parenthesis)) or ultimo_is_first_word:

            if first_word_no_table_indicator:
                return False
            # BUGFIX: mean_x_gap is None for one-word lines (reachable via
            # ultimo_is_first_word); comparing None raised TypeError before
            if mean_x_gap is not None and mean_x_gap <= 115:
                return False
            if many_alphabetical_in_last_word:
                return False
            if many_alphabetical_in_middle_words and many_numbers_in_first_word:
                return False

            self.cpr.print("possible entry:", whole_text)
            if self.PRINT_TO_CHECKFILE:
                with open("checkfile_tables.txt", "a") as myfile:
                    myfile.write(whole_text + "||| max x_gap: " + str(maximum_x_gap) +
                                 "||| mean x_gap: " + str(mean_x_gap) +
                                 "||| median x_gap: " + str(median_x_gap) + "\n")
            # (leftover debug prints removed)
            return True

        return False
class OCRVoter(object): def __init__(self): config_handler = ConfigurationHandler(first_init=False) self.config = config_handler.get_config() self.cpr = ConditionalPrint(self.config.PRINT_MSA_HANDLER, self.config.PRINT_EXCEPTION_LEVEL, self.config.PRINT_WARNING_LEVEL) self.cpr_vocab_check = ConditionalPrint( self.config.PRINT_VOCABULARY_CHECKER, self.config.PRINT_EXCEPTION_LEVEL, self.config.PRINT_WARNING_LEVEL) self.cpr_sc_predict = ConditionalPrint( self.config.PRINT_SPECIALCHAR_PREDICTOR, self.config.PRINT_EXCEPTION_LEVEL, self.config.PRINT_WARNING_LEVEL) self.filo_last_chars = Filo(250) self.predictor = None self.use_aufsichtsrat_prediction = False self.vocab_checker = None self.previous_word_with_seperator = False def add_predictor(self, predictor): self.predictor = predictor def add_vocab_checker(self, vocab_checker): self.vocab_checker = vocab_checker def get_same_count(self, c1, c2, c3): same_ctr = 0 if c1 == c2: same_ctr += 1 if c1 == c3: same_ctr += 1 return same_ctr def get_confidence_count(self, char1, char2, char3, cconf1, cconf2, cconf3, wildcard_char='¦'): def get_other_char(char_first, char_sec, char_thrd, co1, co2, co3): if char_first != char_sec: return char_sec, float(co2) elif char_first != char_thrd: return char_thrd, float(co3) same_ctr = 0 cconf_ctr = float(cconf1) if char1 == char2: same_ctr += 1 cconf_ctr += float(cconf2) if char1 == char3: same_ctr += 1 cconf_ctr += float(cconf3) # special cases space: ' ', ' ', 'x' # wildcard character : '¦', '¦', '¦' if char1 == ' ' and same_ctr == 1: # if the confidence of the other character is below that value, space gets the high put in confidence value return 1, 95.0 #todo j4t SPACE_TRESH = 50.0 SPACE_PUT_IN_VALUE = 99.0 otherchar, otherconf = get_other_char(char1, char2, char3, cconf1, cconf2, cconf3) #print("otherchar",otherchar,"otherconf",otherconf) if otherconf < SPACE_TRESH: return 1, SPACE_PUT_IN_VALUE elif char1 == wildcard_char and same_ctr == 1: #todo: differentiate type of character ?? 
# if there is two wildcards and one characters, characters confidence has to be higher than # WILDCARD_TRESH to be taken wildcard_tresh = 98.5 if self.config.MSA_BEST_CHANGE_VOTING_TRESHS_ON_EMPTY_LINE: wildcard_tresh -= 10 # 0:99,19%, 20:99.16%, 10:99.27% return 1, wildcard_tresh elif char1 == wildcard_char and same_ctr == 0: pass # todo maybe cover this case (cause wildcard has no confidence i.e if the two otherchars are very low prob, take wildcard) elif char1 == '' and same_ctr == 0: pass # todo maybe cover this case (cause space has no confidence ... elif self.config.MSA_BEST_VOTING_DOWNSCALE_ONLY_SC \ and Random.is_special_character(char1) and same_ctr == 0 \ and char2 == wildcard_char and char3 == wildcard_char: # lower the confidence of special characters which stand without any other chars return same_ctr, cconf_ctr * 0.9 return same_ctr, cconf_ctr def vote_best_of_three_simple(self, text_1, text_2, text_3, index_best, wildcard_character='¦'): list_line_1 = list(text_1) list_line_2 = list(text_2) list_line_3 = list(text_3) accumulated_chars = "" accumulated_confs = Filo for character_index, character_1 in enumerate(list_line_1): character_2 = list_line_2[character_index] character_3 = list_line_3[character_index] clist = [character_1, character_2, character_3] # get the character which occurs the most sc1 = self.get_same_count(character_1, character_2, character_3) sc2 = self.get_same_count(character_2, character_1, character_3) sc3 = self.get_same_count(character_3, character_2, character_1) maxindices = np.argmax([sc2, sc1, sc3]) if maxindices == 0: accumulated_chars += character_2 elif maxindices == 1: accumulated_chars += character_1 else: accumulated_chars += character_3 accumulated_chars_stripped = accumulated_chars.replace( wildcard_character, '') return accumulated_chars, accumulated_chars_stripped def vote_best_of_three_charconfs(self, line_1, line_2, line_3, index_best, wildcard_character='¦'): try: def try_obtain_charconf(value, undef_value=0): 
if value is None or value is False or value is True: return undef_value return value def try_obtain_char(charlist, index): if index >= len(charlist): return False #j4t means not defined else: return charlist[index] key_confs_mapping = 'UID' key_confs = 'x_confs' key_char = 'calc_char' self.cpr.print("vote_text1", line_1.textstr) self.cpr.print("vote_text2", line_2.textstr) self.cpr.print("vote_text3", line_3.textstr) #if "¦¦lt.H" in line_1.textstr: # self.cpr.print("asd") maximum_char_number = max(len(line_1.textstr), len(line_2.textstr), len(line_3.textstr)) accumulated_chars = "" for character_index in range( 0, maximum_char_number ): # check: is list 1 always best reference? character_1 = line_1.value(key_char, character_index) character_2 = line_2.value(key_char, character_index) character_3 = line_3.value(key_char, character_index) charconf_1 = try_obtain_charconf( line_1.value(key_confs, character_index, wsval=50.0)) charconf_2 = try_obtain_charconf( line_2.value(key_confs, character_index, wsval=50.0)) charconf_3 = try_obtain_charconf( line_3.value(key_confs, character_index, wsval=50.0)) clist = [character_1, character_2, character_3] # get the character which occurs the most sc1, acc_conf_1 = self.get_confidence_count( character_1, character_2, character_3, charconf_1, charconf_2, charconf_3) sc2, acc_conf_2 = self.get_confidence_count( character_2, character_1, character_3, charconf_2, charconf_1, charconf_3) sc3, acc_conf_3 = self.get_confidence_count( character_3, character_2, character_1, charconf_3, charconf_2, charconf_1) maxindices = np.argmax([ acc_conf_2, acc_conf_1, acc_conf_3 ]) # this takes in priorisation in case the chars are same #todo:import to config if character_index == maximum_char_number - 1 and character_2 == "¦" and character_3 == "¦" and character_1 == "I": continue if self.config.MSA_BEST_VOTER_DROP_CHARS_BELOW_TRESH == True: tresh = self.config.MSA_BEST_VOTER_DROPPING_TRESH maximum_conf = max(acc_conf_1, acc_conf_2, acc_conf_3) if 
def increase_umlaut_confidence(self, chars, charconfs):
    """Return a copy of *charconfs* where umlauts and special characters get a
    configured confidence boost.

    :param chars: sequence of characters (parallel to charconfs)
    :param charconfs: sequence of per-character confidence values
    :return: new list of adapted confidence values
    """
    charconfs_adapted = []
    for char_index, char in enumerate(chars):
        if char in SpecialChars.umlauts_caps or char in SpecialChars.umlauts:
            # umlauts get their own increment
            cconf_to_add = charconfs[char_index] + SpecialChars.umlaut_increment
        elif char in SpecialChars.special_chars:
            cconf_to_add = charconfs[char_index] + SpecialChars.special_char_increment
        else:
            # unchanged confidence for ordinary characters
            cconf_to_add = charconfs[char_index]
        charconfs_adapted.append(cconf_to_add)
    return charconfs_adapted

def vote_best_of_three_charconfs_searchspaces(self, line_1, line_2, line_3,
                                              index_best, wildcard_character='¦'):
    """Vote a result line character-by-character from three OCR engine lines,
    using sliding search spaces over characters and confidences.

    :param line_1, line_2, line_3: engine line objects providing .textstr,
        .value(key, index) and .name (engine key at name[0])
    :param index_best: unused here (kept for interface compatibility)
    :param wildcard_character: padding char ignored in the stripped result
    :return: tuple (accumulated_chars, accumulated_chars_stripped), or None
        implicitly if an exception was logged
    """
    try:
        key_confs_mapping = 'UID'  # NOTE(review): unused in this method
        key_confs = 'x_confs'
        key_char = 'calc_char'
        self.cpr.print("vote_text1", line_1.textstr)
        self.cpr.print("vote_text2", line_2.textstr)
        self.cpr.print("vote_text3", line_3.textstr)
        #if "Beteiligung:" in line_1.textstr:
        #    self.cpr.print("asd")
        maximum_char_number = max(len(line_1.textstr), len(line_2.textstr),
                                  len(line_3.textstr))
        accumulated_chars = ""
        accumulated_confs = Filo(300)

        # search space settings
        SEARCH_SPACE_Y_SIZE = 3
        SEARCH_SPACE_X_SIZE_OUTER = 7
        SEARCH_SPACE_X_SIZE_INNER = 3
        SEARCH_SPACE_X_SEARCH_RANGE = 1
        SEARCH_SPACE_PROCESSING_SUBSTITUTION_CHAR = '¦'
        SEARCH_SPACE_PROCESSING_USE_SIMILAR_CHARS = True
        SEARCH_RANGE = 1  # NOTE(review): unused in this method
        PRINT_MATRICES = self.config.PRINT_SEARCH_SPACE_MATRICES

        # initialize search space processor and search spaces
        search_space_processor = SearchSpaceProcessor(
            SEARCH_SPACE_Y_SIZE, SEARCH_SPACE_X_SIZE_INNER,
            wildcard_character, SEARCH_SPACE_PROCESSING_SUBSTITUTION_CHAR)
        ssp_chars = SearchSpace(SEARCH_SPACE_Y_SIZE, SEARCH_SPACE_X_SIZE_OUTER,
                                SEARCH_SPACE_X_SEARCH_RANGE, True)
        ssp_confs = SearchSpace(SEARCH_SPACE_Y_SIZE, SEARCH_SPACE_X_SIZE_OUTER,
                                SEARCH_SPACE_X_SEARCH_RANGE, True)

        # check if one of the lines is empty for certain settings
        one_line_empty = False
        if self.config.MSA_BEST_VOTER_PUSH_LESS_LINES_WHITESPACE_CONFS or \
                self.config.MSA_BEST_CHANGE_VOTING_TRESHS_ON_EMPTY_LINE:
            one_line_empty = self.check_if_one_line_empty(
                [line_1, line_2, line_3], wildcard_character)

        # loop through the maximum character range of the lines; the extra
        # range_extension + 2 iterations flush the search space pipeline
        range_extension = SEARCH_SPACE_X_SIZE_INNER
        for character_index in range(0, maximum_char_number + range_extension + 2):
            if character_index < maximum_char_number:
                # if there is a character within range (no padding char from
                # extension) get character values and obtain corresponding
                # confidences (from searchspace because they might be different
                # to normal values because of swapping)
                line_vals = [line_1.value(key_char, character_index),
                             line_2.value(key_char, character_index),
                             line_3.value(key_char, character_index)]
                line_1_conf = line_1.value(key_confs, character_index, wsval=50.0)
                line_2_conf = line_2.value(key_confs, character_index, wsval=50.0)
                line_3_conf = line_3.value(key_confs, character_index, wsval=50.0)
                charconf_1 = self.try_obtain_charconf_searchspace(
                    line_1_conf, line_vals[0], engine_key=line_1.name[0],
                    one_line_empty=one_line_empty)
                charconf_2 = self.try_obtain_charconf_searchspace(
                    line_2_conf, line_vals[1], engine_key=line_2.name[0],
                    one_line_empty=one_line_empty)
                charconf_3 = self.try_obtain_charconf_searchspace(
                    line_3_conf, line_vals[2], engine_key=line_3.name[0],
                    one_line_empty=one_line_empty)
                charconf_vals = [charconf_1, charconf_2, charconf_3]
            else:
                # if the character is within padding range just give none
                # values for characters and confidences
                line_vals = [None, None, None]
                charconf_vals = [None, None, None]

            # fill searchspace with the chars and confidences
            ssp_chars.push_column(line_vals)
            ssp_confs.push_column(charconf_vals)

            # update the mid-window of the search space (this is the actual
            # search space processing step)
            mid_chars = ssp_chars.get_middle_matrix(PRINT_MATRICES)
            mid_confs = ssp_confs.get_middle_matrix(PRINT_MATRICES)
            mid_chars_processed, mid_confs_processed, change_done = \
                search_space_processor.process_search_space(
                    mid_chars, mid_confs, SEARCH_SPACE_PROCESSING_USE_SIMILAR_CHARS)
            if change_done is True:
                ssp_chars.update_middle_matrix(mid_chars_processed)
                ssp_confs.update_middle_matrix(mid_confs_processed)

            # extract changed values from search space
            character_offset = -(SEARCH_SPACE_X_SEARCH_RANGE + 1)
            character_1 = ssp_chars.get_value_around_middle(0, character_offset)
            character_2 = ssp_chars.get_value_around_middle(1, character_offset)
            character_3 = ssp_chars.get_value_around_middle(2, character_offset)
            charconf_1 = ssp_confs.get_value_around_middle(0, character_offset)
            charconf_2 = ssp_confs.get_value_around_middle(1, character_offset)
            charconf_3 = ssp_confs.get_value_around_middle(2, character_offset)
            if character_1 is None or character_2 is None or character_3 is None:
                # pipeline not yet filled at this offset
                # self.cpr.print("test")
                continue

            # in case umlaut confidence increment is active change charconfs
            # otherwise same charconfs
            charconf_1, charconf_2, charconf_3 = \
                self.increase_umlaut_confidence_searchspace(
                    character_1, character_2, character_3,
                    charconf_1, charconf_2, charconf_3)

            # get the previous characters from other lines as string
            # (mainly for predictor)
            filo_content = self.filo_last_chars.get_content_as_string()
            # trigger predicted section for aufsichtsrat predictor
            self.toggle_predictor(filo_content)
            # predict_char if predictor is enabled
            predicted_char = self.predict_char(filo_content)

            # get the character which occurs the most by accumulating
            # confidence scores
            sc1, acc_conf_1 = self.get_confidence_count(
                character_1, character_2, character_3,
                charconf_1, charconf_2, charconf_3)
            sc2, acc_conf_2 = self.get_confidence_count(
                character_2, character_1, character_3,
                charconf_2, charconf_1, charconf_3)
            sc3, acc_conf_3 = self.get_confidence_count(
                character_3, character_2, character_1,
                charconf_3, charconf_2, charconf_1)
            # argmax order [2, 1, 3] takes in priorisation in case the chars
            # are the same
            maxindices = np.argmax([acc_conf_2, acc_conf_1, acc_conf_3])

            # skip a specific trailing artifact on the very last flush step
            if character_index == maximum_char_number + range_extension + 1 \
                    and character_2 == "¦" and character_3 == "¦" \
                    and character_1 == "I":
                continue

            # drop chars completely if they fall below a certain dropping
            # treshhold and the setting is active
            if self.config.MSA_BEST_VOTER_DROP_CHARS_BELOW_TRESH == True:
                tresh = self.config.MSA_BEST_VOTER_DROPPING_TRESH
                maximum_conf = max(acc_conf_1, acc_conf_2, acc_conf_3)
                if maximum_conf < tresh:
                    # wildcard winners are kept even below the threshold
                    if [character_2, character_1, character_3][maxindices] != '¦':
                        continue

            # determine character with the best accumulated confidence
            voted_char = None
            voted_acc_conf = None
            if maxindices == 0:
                voted_char = character_2
                voted_acc_conf = acc_conf_2
            elif maxindices == 1:
                voted_char = character_1
                voted_acc_conf = acc_conf_1
            else:
                voted_char = character_3
                voted_acc_conf = acc_conf_3

            # if predictor is active, check if there is a better char
            # predicted which can replace voted character
            voted_char = self.maybe_replace_voted_by_predicted_char(
                voted_char, self.use_aufsichtsrat_prediction, predicted_char,
                wildcard_character, voted_acc_conf,
                character_1, character_2, character_3)

            # push the voted char and the accumulated confidence of this char
            # to results
            accumulated_confs.push(voted_acc_conf)
            accumulated_chars += voted_char
            # if the predictor is enabled fill the filo with the voted_char
            self.fill_filo_last_chars(voted_char)

        # do vocabulary related steps, if activated
        accumulated_chars = self.vocabulary_related_corrections(
            accumulated_chars, wildcard_character, accumulated_confs)
        # remove the wilcard characters and return result
        accumulated_chars_stripped = accumulated_chars.replace(
            wildcard_character, '')
        return accumulated_chars, accumulated_chars_stripped
    except Exception as ex:
        tr = inspect.trace()
        self.cpr.printex("ocr_voter.py Exception during confidence vote", ex)
        self.cpr.printex("trace", tr)
def vocabulary_related_corrections(self, accumulated_chars, wildcard_character,
                                   accumulated_confs):
    """Optionally correct the voted text word-by-word against a vocabulary.

    Active only when KEYING_RESULT_VOCABULARY_CORRECTION_VOTE is set; pops one
    confidence per character from *accumulated_confs* for each processed word.

    :param accumulated_chars: voted text (may contain wildcard characters)
    :param wildcard_character: wildcard used during voting
    :param accumulated_confs: Filo of per-character confidences
    :return: possibly corrected text (words re-joined with single spaces)
    """
    if self.config.KEYING_RESULT_VOCABULARY_CORRECTION_VOTE:
        accumulated_chars_final = ""
        acc_split = accumulated_chars.split()
        len_split = len(acc_split)
        for word_index, word in enumerate(acc_split):
            if self.config.KEYING_RESULT_VC_IGNORE_SEPERATE_WRITING_CORRECTION:
                # words hyphenated across line breaks are passed through;
                # the flag carries the state to the next call's first word
                if word_index == len_split - 1 and word.replace(
                        wildcard_character, "").endswith('-'):
                    self.previous_word_with_seperator = True
                    accumulated_chars_final += word + " "
                    continue
                if word_index == 0:
                    if self.previous_word_with_seperator is True:
                        self.previous_word_with_seperator = False
                        accumulated_chars_final += word + " "
                        continue
            acc_confs_word = accumulated_confs.pop_multi(len(word))
            acc_conf, rate, change, word_starting_borders, word_trailing_borders, word_reduced = \
                self.vocab_checker.get_accumulated_confidence_rate(
                    word, acc_confs_word, wildcard_character)
            self.cpr_vocab_check.print("w:", word, "wr:", word_reduced,
                                       "accr:", acc_conf, "rate", rate)
            # don't correct words below min vocab length (mind that special
            # chars in dict are toggled)
            check_len = len(word)
            if self.config.KEYING_RESULT_VC_DICT_REMOVE_SPECIAL_BORDER_CHARS:
                check_len = len(word_reduced)
            if check_len < self.config.KEYING_RESULT_VC_MIN_VOCAB_WORD_LENGTH:
                accumulated_chars_final += word + " "
                continue
            if self.config.KEYING_RESULT_VC_CORRECT_ONLY_ERRONOUS_CHARS:
                swappable_char_indices = []
                acc_confs_used = None
                word_used = None
                if self.config.KEYING_RESULT_VC_CORRECT_ERRONOUS_SPECIAL_CHARS:
                    # use the full length confidences array including trailing
                    # and leading special characters
                    acc_confs_used = acc_confs_word
                    word_used = word
                else:
                    # don't use trailing and starting special characters if no
                    # special chars needed
                    acc_confs_used = acc_confs_word[
                        len(word_starting_borders):(
                            len(acc_confs_word) - len(word_trailing_borders))]
                    word_used = word_reduced
                for conf_index, conf in enumerate(acc_confs_used):
                    if self.config.KEYING_RESULT_VC_CORRECT_ERRONOUS_SPECIAL_CHARS:
                        if conf <= 250:
                            character_related = word_used[conf_index]
                            is_special_char = Random.is_special_character(
                                character_related)
                            if is_special_char and character_related != wildcard_character:
                                # only swap special character indices
                                swappable_char_indices.append(conf_index)
                    else:
                        if conf <= 215:
                            swappable_char_indices.append(conf_index)
                if len(swappable_char_indices) >= 1:
                    word_reduced_correct = \
                        self.vocab_checker.correct_text_at_certain_indices_only(
                            word_used, swappable_char_indices)
                    if word_reduced_correct != None:
                        word_correct_withtrails = None
                        if self.config.KEYING_RESULT_VC_CORRECT_ERRONOUS_SPECIAL_CHARS:
                            if Random.has_special_character(word_reduced_correct):
                                # if special character was replaced with
                                # special character
                                word_correct_withtrails = word_reduced_correct
                            else:
                                # if special character was replaced by
                                # alphanumerical character
                                word_correct_withtrails = word
                        else:
                            word_correct_withtrails = word_starting_borders + \
                                word_reduced_correct + word_trailing_borders
                        # only print the changed results
                        if word != word_correct_withtrails:
                            self.cpr_vocab_check.print(
                                "w:", word, "wc:", word_correct_withtrails,
                                "accr:", acc_conf, "rate", rate)
                        accumulated_chars_final += word_correct_withtrails + " "
                    else:
                        accumulated_chars_final += word + " "
                else:
                    accumulated_chars_final += word + " "
                continue
            if rate < self.config.KEYING_RESULT_VOCABULARY_CORRECTION_VOTE_TRESH \
                    and len(word_reduced) > 2:
                # if the rate drops below tresh, try to fetch vocab entry
                word_reduced_correct, suggestions, flh = \
                    self.vocab_checker.correct_text(word_reduced)
                if word_reduced_correct != None and word_reduced_correct != word_reduced:
                    word_correct_withtrails = word_starting_borders + \
                        word_reduced_correct + word_trailing_borders
                    self.cpr_vocab_check.print(
                        "w:", word, "wc:", word_correct_withtrails,
                        "accr:", acc_conf, "rate", rate)
                    accumulated_chars_final += word_correct_withtrails + " "
                else:
                    accumulated_chars_final += word + " "
            else:
                accumulated_chars_final += word + " "
        accumulated_chars = accumulated_chars_final
    return accumulated_chars

def try_obtain_charconf_searchspace(
        self,
        value_confidence,
        value,
        undef_value=0,
        engine_key=None,
        one_line_empty=False,
):
    """Return a possibly engine-scaled confidence for a single character.

    :param value_confidence: raw confidence (None/bool yields *undef_value*)
    :param value: the character the confidence belongs to
    :param undef_value: fallback for undefined confidences
    :param engine_key: 'Abbyy', 'Tess' or 'Ocro' to select a scaling factor
    :param one_line_empty: if True, whitespace confidences may get a push
    :return: adapted confidence value
    """
    if value_confidence is None or value_confidence is False or value_confidence is True:
        return undef_value
    returnvalue = value_confidence
    if self.config.MSA_BEST_VOTER_SCALE_ENGINE_CONFIDENCES and engine_key is not None:
        if engine_key == 'Abbyy':
            if self.config.MSA_BEST_INCREASE_CONFIDENCE_OF_SOME_ABBYY_CHARS:
                if value == "%":
                    # improve ocropus in confidence of % because it was trained
                    value_confidence = value_confidence + 80
            returnvalue = ConfidenceModifications.abby_factor * value_confidence
        elif engine_key == 'Tess':
            returnvalue = ConfidenceModifications.tesseract_factor * value_confidence
        elif engine_key == 'Ocro':
            returnvalue = ConfidenceModifications.ocropus_factor * value_confidence
    # NOTE(review): both sides of this `or` test the same `value == " "`
    # condition; only the config flags differ
    if (self.config.MSA_BEST_VOTER_PUSH_LESS_LINES_WHITESPACE_CONFS and one_line_empty and value == " ") \
            or (self.config.MSA_BEST_VOTER_PUSH_WHITESPACE_IF_MOSTLY_WILDCARD and one_line_empty
                and value == " "):
        returnvalue += ConfidenceModifications.whitespace_push
    return returnvalue

def check_if_one_line_empty(self, lines, wildcard_character):
    """Return True if any given line is empty after wildcard removal (or, with
    the 'mostly wildcard' setting, more than 70% wildcards).

    NOTE(review): implicitly returns None when no line matches; callers appear
    to rely on truthiness only.
    """
    for line in lines:
        text_wo_wildcards = line.textstr.replace(wildcard_character, '')
        if text_wo_wildcards == "":
            return True
        if self.config.MSA_BEST_VOTER_PUSH_WHITESPACE_IF_MOSTLY_WILDCARD:
            # also count in high whitecard ratios as empty line
            wildcard_ratio = 1 - (len(text_wo_wildcards) / len(line.textstr))
            if wildcard_ratio > 0.70:
                return True
def toggle_predictor(self, filo_content):
    """Switch the Aufsichtsrat predictor on/off from markers in recent text.

    Seeing "Aufsichtsrat" enables prediction, seeing "Gründung:" disables it;
    no-op unless the predictor feature is configured on.
    """
    if not self.config.PREDICTOR_AUFSICHTSRAT_ENABLED:
        return
    if "Aufsichtsrat" in filo_content:
        self.use_aufsichtsrat_prediction = True
    if "Gründung:" in filo_content:
        self.use_aufsichtsrat_prediction = False

def predict_char(self, filo_content):
    """Return the next predicted character, or None.

    None is returned when prediction is disabled or *filo_content* is shorter
    than one prediction chunk (19 characters).
    """
    chunk_length = 19  # one prediction chunk
    if not self.use_aufsichtsrat_prediction:
        return None
    if len(filo_content) < chunk_length:
        return None
    return self.predictor.predict_next_aufsichtsrat_chars(
        chunk_length, filo_content)

def fill_filo_last_chars(self, voted_char):
    """
    fill filo for predictor usage with voted_char some additional chars
    around this char
    :param voted_char:
    :return:
    """
    if not self.config.PREDICTOR_AUFSICHTSRAT_ENABLED:
        return
    # create pre semi-tokenized input strings in the filos from the voted
    # characters for prediction
    if voted_char == ' ':
        # the models usally use the 'ƿ' char in substitution for spaces
        pushables = (' ', 'ƿ', ' ')
    elif Random.is_special_character(voted_char):
        pushables = (' ', voted_char, ' ')
    else:
        pushables = (voted_char,)
    for pushable in pushables:
        self.filo_last_chars.push(pushable, filterchar='¦')

def increase_umlaut_confidence_searchspace(self, character_1, character_2,
                                           character_3, charconf_1,
                                           charconf_2, charconf_3):
    """Boost the three confidences for umlauts/special chars when the
    corresponding searchspace setting is active; otherwise pass through."""
    if not self.config.MSA_BEST_SEARCHSPACE_INCREASE_UMLAUT_CONFIDENCE:
        return charconf_1, charconf_2, charconf_3
    characters = [character_1, character_2, character_3]
    confidences = [charconf_1, charconf_2, charconf_3]
    adapted = self.increase_umlaut_confidence(characters, confidences)
    return adapted[0], adapted[1], adapted[2]

def maybe_replace_voted_by_predicted_char(self, voted_char,
                                          aufsichtsrat_prediction_toggled,
                                          predicted_char, wildcard_character,
                                          voted_acc_conf, character_1,
                                          character_2, character_3):
    """Return *predicted_char* instead of *voted_char* when the predictor is
    active, predicts a special character, and the vote looks weak enough.

    Swaps only if the prediction differs from the vote, some involved char is
    special, the vote is not the wildcard, its accumulated confidence is at
    most 90.0 and it is not a formfeed (formfeeds don't get predicted at all).
    """
    if not aufsichtsrat_prediction_toggled:
        return voted_char
    if not Random.is_special_character(predicted_char):
        return voted_char
    one_char_sc = Random.is_special_character(character_1) \
        or Random.is_special_character(character_2) \
        or Random.is_special_character(character_3)
    voted_char_sc = Random.is_special_character(voted_char)
    swap_candidate = predicted_char != voted_char \
        and (one_char_sc or voted_char_sc) \
        and voted_char != wildcard_character
    if not swap_candidate:
        return voted_char
    # print("FiloContent:", filo_content)
    self.cpr_sc_predict.print("pc:", predicted_char, "vc:", voted_char,
                              "vc_acc", voted_acc_conf)
    if voted_acc_conf <= 90.0 and voted_char != '\f':
        self.cpr_sc_predict.print("swap")
        voted_char = predicted_char
    return voted_char
class AdditionalInfoHandler(object):
    """Finds and loads per-database additional information files (xlsx/xls or
    json) that accompany the OCR input, keyed by db and table name."""

    def __init__(self):
        # pull shared configuration and a tagged conditional printer
        config_handler = ConfigurationHandler(first_init=False)
        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_ADDITIONAL_INFO_HANDLER,
                                    self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL,
                                    leading_tag=self.__class__.__name__)
        self.cpr.print("init additional info handler")

    def write_excel_to_json(self, fileinfo, filepath, filetype, idxcol=None,
                            parse_cols=None, page=0):
        """
        At the moment a little helper script for the Aktienführer-Project.
        Be free to modify as you wish.

        Converts the single matching Excel file to a JSON file next to it;
        returns None in all cases. *idxcol*, *parse_cols* and *page* are
        currently unused here.
        """
        #if isinstance(parse_cols, list): parse_cols = [parse_cols],
        # recursive glob for exactly one file named after the database
        additional_filepath = path.normpath(
            f"{filepath}/**/*{fileinfo.dbname}.{filetype}")
        file = glob.glob(additional_filepath, recursive=True)
        if len(file) != 1:
            # zero or ambiguous matches: nothing to convert
            return None
        if filetype in ["xlsx", "xls"]:
            df = pd.read_excel(file[0]).set_index("ProfileID")
            jsondata = {fileinfo.dbname: {"Year": fileinfo.dbname}}
            jsondf = df.to_dict(orient="index")
            jsondata.update(jsondf)
            with open(file[0].replace("xlsx", "json"), "w") as output:
                json.dump(jsondata, output, indent=4)
        return None

    def fetch_additional_information_simple(self, file):
        """
        Same as fetch additional information, but config related info is
        already included in given parameters
        :return: additional info
        """
        if self.config.ADDITIONAL_INFORMATION:
            additional_info = self.fetch_additional_information(
                file,
                self.config.INPUT_ADDINFOPATH,
                idxcol=self.config.IDXCOL,
                parse_cols=self.config.PARSE_COLS,
                filetype=self.config.INPUT_ADDINFOFILETPYE)
            return additional_info
        return None

    def fetch_additional_information(self, fileinfo, filepath, filetype,
                                     idxcol=None, parse_cols=None, page=0):
        """
        Reads an additional file with information.
        It searches the file where the index_name matches tablename or dbname.

        :param fileinfo: object providing .dbname and .tablename
        :param filepath: root folder to search (recursively)
        :param filetype: "xlsx"/"xls" or "json"
        :param idxcol: name of the index column (excel path only)
        :param parse_cols: columns to read (excel path only)
        :param page: unused
        :return: additional info dict keyed by "db"/"table", or None
        """
        #if isinstance(parse_cols, list): parse_cols = [parse_cols]
        additional_filepath = path.normpath(
            f"{filepath}/**/*{fileinfo.dbname}.{filetype}")
        file = glob.glob(additional_filepath, recursive=True)
        len_files = len(file)
        if len_files > 1:
            self.cpr.printex(
                "More than one additional information file was found!")
            return None
        if len_files == 0:
            self.cpr.printex("No additional information file was found!")
            return None
        file = file[0]
        current_db_and_table = {
            "db": fileinfo.dbname,
            "table": fileinfo.tablename
        }
        if filetype in ["xlsx", "xls"]:
            infos = {}
            info_df = pd.read_excel(file)  #.set_index("ProfileID")
            # NOTE(review): this mutates the caller's parse_cols list —
            # confirm callers don't reuse it afterwards
            parse_cols.remove(idxcol)
            for db_and_table_id, current_db_and_tablename in current_db_and_table.items():
                infos[db_and_table_id] = {}
                # rows whose idxcol matches the current db/table name,
                # restricted to the requested columns
                for line, rubric_content in info_df.loc[
                        info_df[idxcol] == current_db_and_tablename][parse_cols].to_dict(
                            orient="index").items():
                    for rubric, content in rubric_content.items():
                        if rubric != idxcol:
                            if infos[db_and_table_id].get(rubric, None) is None:
                                # first value for this rubric
                                infos[db_and_table_id][rubric] = content
                            elif infos[db_and_table_id].get(rubric, None) != content:
                                # differing values collect into a list
                                if not isinstance(
                                        infos[db_and_table_id][rubric], list):
                                    infos[db_and_table_id][rubric] = [
                                        infos[db_and_table_id][rubric]
                                    ]
                                infos[db_and_table_id][rubric].append(content)
        elif filetype == "json":
            with open(file, "r") as add_info_file:
                infos = json.load(add_info_file)
            # iterate a reversed snapshot of the keys because entries get
            # deleted from infos during the loop
            for possible_db_or_tablenames in reversed(list(infos.keys())):
                possible_db_or_tablenames_orig = possible_db_or_tablenames  # unchanged name
                if self.config.ADD_INFO_SIMPLIFIED_NAME_COMPARISON:
                    # compare only the part before the first '-'
                    psplit = possible_db_or_tablenames.split("-")
                    possible_db_or_tablenames = psplit[0]
                if possible_db_or_tablenames not in current_db_and_table['table']:
                    del infos[possible_db_or_tablenames_orig]
                else:
                    for db_and_table_id, current_db_and_tablename in current_db_and_table.items():
                        if possible_db_or_tablenames == current_db_and_tablename:
                            infos[db_and_table_id] = infos[
                                possible_db_or_tablenames_orig]
                            # NOTE(review): if a name matched both the db and
                            # the table entry this del would raise KeyError on
                            # the second match — confirm names are distinct
                            del infos[possible_db_or_tablenames_orig]
        else:
            # unsupported filetype
            return None
        return infos
class EndobjectFactory(object):
    """
    Creates an object with the following structure and provides exporting
    methods:

    segment_tag_1: [        ---> this level is created by set_current_main_list
         {
           type: "Sitz",    ---> add this level entries with add_to_my_obj, object_number=0
           city: "Neustadt"
         },
         {
           type: "Sitz",    ---> add this level entries with add_to_my_obj, object_number=1
           city: "Neustadt"
         }
    ],
    segment_tag_2: [
        {
          ...
        }
        ...
    ]
    """

    def __init__(self):
        self.my_object = {}            # segment_tag -> list of entry dicts
        self.current_main_list = None  # short link to the active segment list
        self.pp = pprint.PrettyPrinter(indent=5)
        config_handler = ConfigurationHandler(first_init=False)
        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_OUTPUT_ANALYSIS,
                                    self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL,
                                    leading_tag=self.__class__.__name__)
        if self.config.REMOVE_TAGS_IN_ORIG_DIFF:
            self.known_uc = KnownUncategories()

    def set_current_main_list(self, segment_tag):
        """Select (and create if missing) the entry list for *segment_tag*."""
        if segment_tag not in self.my_object.keys():
            # create the main list (all subsequent entries are stored here)
            self.my_object[segment_tag] = []
        # create a short link on the main list
        self.current_main_list = self.my_object[segment_tag]

    def add_to_my_obj(self, key, value, object_number=0, only_filled=False):
        """Store *value* under *key* in entry *object_number* of the current
        main list, padding the list with empty dicts as needed.

        :param only_filled: if True, skip empty values (None/""/[]/{})
        :return: True if the value was stored, False if skipped
        """
        if only_filled is True and (value == None or value == ""
                                    or value == [] or value == {}):
            return False
        # fill main list if object index not in
        len_list = len(self.current_main_list)
        if len_list < object_number + 1:
            for index in range(len_list, object_number + 1):
                self.current_main_list.append({})
        self.cpr.print("Adding value to List,- ObjectNr.:", object_number,
                       "Key:", key, "Value:", value)
        # add or insert to the main_list
        self.current_main_list[object_number][key] = value
        return True

    def print_me_and_return(self):
        """Pretty-print the accumulated object and return it."""
        print("my_object is:")
        self.pp.pprint(self.my_object)
        return self.my_object

    def print_current_main(self):
        """Pretty-print the currently selected main list."""
        print("current_main:")
        self.pp.pprint(self.current_main_list)

    def export_as_json(self):
        """Return the whole accumulated object as a JSON string."""
        my_obj_json = json.dumps(self.my_object, indent=5, ensure_ascii=False)
        return my_obj_json

    def export_as_json_at_key(self, key, remove_first_object=False):
        """Return the entries at *key* as JSON, or None if the key is absent.

        :param remove_first_object: drop the first entry, which usually
            contains generic info
        """
        if key not in self.my_object.keys():
            return None
        my_obj = self.my_object[key]
        if remove_first_object:
            if len(my_obj) >= 1:
                # remove the first object which usally contains generic info
                my_obj = my_obj[1:]
        my_obj_json = json.dumps(my_obj, indent=5, ensure_ascii=False)
        return my_obj_json

    @staticmethod
    def fetch_subentries_recursive_check(entry):
        """
        Fetches all subentries (values) from an entry and writes them to a
        list of texts. This get's called recursively within the function
        until all subentries are found.
        :param entry: entry to fetch the subentries from
        :return: list of subentries
        """
        final_texts = []
        for item in entry:
            if isinstance(entry, list):
                value = item
            else:
                # item is a key
                value = entry[item]
            if isinstance(value, str):
                final_texts.append(value)
            elif isinstance(value, int):
                final_texts.append(str(value))
            elif isinstance(value, object):
                # containers (list/dict/...) with content are recursed into
                obj_size = len(value)
                if obj_size > 0:
                    recursive_texts = EndobjectFactory.fetch_subentries_recursive_check(value)
                    final_texts.extend(recursive_texts)
        return final_texts

    @staticmethod
    def fetch_keys_recusive_check(entry, final_keys, create_multiple=True):
        """
        Fetches all keys in an object and it's sub-objects, calls itself
        recursively until all keys are found, writes final keys to final_keys
        array and returns this.
        :param entry: object to fetch the sub-keys from
        :param final_keys: list of final keys (initial state)
        :param create_multiple: if the same key occurs multiple times it still
            gets added
        :return: final_keys with added keys from object
        """
        if isinstance(entry, list):
            for item in entry:
                final_keys = EndobjectFactory.fetch_keys_recusive_check(item, final_keys, create_multiple)
            return final_keys
        elif not isinstance(entry, dict):
            # just return if there are no keys (cause no dictionary)
            return final_keys
        for key in entry:
            value = entry[key]
            if create_multiple or key not in final_keys:
                if isinstance(key, int):
                    continue
                final_keys.append(key)
            final_keys = EndobjectFactory.fetch_keys_recusive_check(value, final_keys)
        return final_keys

    def diff_seg_to_orig_at_key(self, key):
        """Subtract every parsed sub-entry text at *key* from the stored
        original text ('origpost' of the first entry).

        :return: (rest_text, original_text) tuple, or None if *key* is absent
        """
        if key not in self.my_object.keys():
            return None
        my_data = self.my_object[key]
        # check if the orig-post property can exist warn if not
        if not self.config.ADD_INFO_ENTRY_TO_OUTPUT:
            self.cpr.printw("trying to fetch original data, original data is not added to results")
            self.cpr.printw("toggle ADD_INFO_ENTRY_TO_OUTPUT in config to True")
        if len(my_data) <= 0:
            self.cpr.printw("no data to do returning")
            return  # fix: was a doubled bare 'return return' in the original
        # copy orig string
        original_text = my_data[0]['origpost']
        rest_text = original_text
        # fetch parsed entries for diff
        all_final_entries = []  # array of final entries
        for index in range(1, len(my_data)):
            entry = my_data[index]
            # fix: the original called fetch_subentries_recursive, which is
            # only defined inside a docstring -> NameError; use the static
            # helper with identical logic instead
            final_entries = EndobjectFactory.fetch_subentries_recursive_check(entry)
            all_final_entries.extend(final_entries)
        # order diff data after length, longest first, so long texts are
        # subtracted before their own substrings
        all_final_entries.sort(key=lambda x: len(x))
        all_final_entries.reverse()
        # subtract
        for text in all_final_entries:
            rest_text = rest_text.replace(text, "")
        rest_text = rest_text.strip()
        return rest_text, original_text

    def diff_parsed_to_orig_at_key(self, key):
        """Subtract parsed sub-entry texts (and optionally their keys/tags)
        at *key* from the stored original text.

        Honors REMOVE_SPACES_IN_ORIGIN_DIFF (compare without spaces) and
        REMOVE_TAGS_IN_ORIG_DIFF (also subtract keys, except known
        uncategorized ones).

        :return: (rest_text, original_text) tuple, or None if *key* is absent
        """
        if key not in self.my_object.keys():
            return None
        #if key == "KursVonZuteilungsrechten":
        #    print("todo remove debug")
        my_data = self.my_object[key]
        # check if the orig-post property can exist warn if not
        if not self.config.ADD_INFO_ENTRY_TO_OUTPUT:
            self.cpr.printw("trying to fetch original data, original data is not added to results")
            self.cpr.printw("toggle ADD_INFO_ENTRY_TO_OUTPUT in config to True")
        if len(my_data) <= 0:
            self.cpr.printw("no data to do returning")
            return
        # copy orig string
        original_text = my_data[0]['origpost']
        rest_text = original_text
        # fetch parsed entries for diff
        pool_entries = []  # array of final entries
        for index in range(1, len(my_data)):
            entry = my_data[index]
            final_entries = EndobjectFactory.fetch_subentries_recursive_check(entry)
            pool_entries.extend(final_entries)
        if self.config.REMOVE_SPACES_IN_ORIGIN_DIFF is True:
            # removes all spaces from rest and comparison values because
            # spaces are often a problem in subtracting the rests
            rest_text = rest_text.replace(" ", "")
            for index in range(0, len(pool_entries)):
                pool_entries[index] = pool_entries[index].replace(" ", "")
        all_final_entries = []
        # add the entries to the complete subtraction and tag them with '1'
        for pentry in pool_entries:
            all_final_entries.append((pentry, 1))
        # if keys shall be subracted also add them also
        if self.config.REMOVE_TAGS_IN_ORIG_DIFF:
            pool_keys = []
            # gets multiple of the same key for later 1 by 1 subtraction
            for index in range(1, len(my_data)):
                pool_keys = EndobjectFactory.fetch_keys_recusive_check(
                    my_data[index], pool_keys, create_multiple=True)
            # also remove spaces in keys
            if self.config.REMOVE_SPACES_IN_ORIGIN_DIFF is True:
                for index in range(0, len(pool_keys)):
                    pool_keys[index] = pool_keys[index].replace(" ", "")
            final_keys = []
            for pkey in pool_keys:
                final_keys.append((pkey, 2))  # keys are tagged with '2'
            all_final_entries.extend(final_keys)
        # order diff data after length
        all_final_entries.sort(key=lambda x: len(x[0]))
        all_final_entries.reverse()
        # subtract
        for entry in all_final_entries:
            text = entry[0]
            text_or_key = entry[1]
            if text_or_key == 2:
                # known uncategorized keys are not subtracted
                if text in self.known_uc.unkeys:
                    continue
            text_stripped = text.strip()  # remove spaces so texts better fit in
            rest_text = rest_text.replace(text_stripped, "", 1)
        rest_text = rest_text.strip()
        return rest_text, original_text
class AkfParsingFunctionsJK(object):
    """Segment parsers (part JK) for AKF data: balance, income, share price
    and dividend tables; results are pushed into the endobject factory."""

    def __init__(self, endobject_factory, output_analyzer, dictionary_handler,
                 ocromore_data=None):
        config_handler = ConfigurationHandler(first_init=False)
        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(
            self.config.PRINT_SEGMENT_PARSER_AKF_FN_THREE,
            self.config.PRINT_EXCEPTION_LEVEL,
            self.config.PRINT_WARNING_LEVEL,
            leading_tag=self.__class__.__name__)
        self.cpr.print("init akf parsing functions three")
        self.ef = endobject_factory
        self.output_analyzer = output_analyzer
        self.ocromore_data = ocromore_data
        self.dictionary_handler = dictionary_handler

    def parse_bilanzen(self, real_start_tag, content_texts, content_lines,
                       feature_lines, segmentation_class):
        """Parse the 'Bilanzen' (balances) segment; in LOG_SIMPLE mode only a
        dehyphenated text blob is stored, otherwise a structured Datatable."""
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag,
                                 segmentation_class, element_counter)
        # logme
        self.output_analyzer.log_segment_information(
            segmentation_class.segment_tag, content_texts, real_start_tag)
        # init
        only_add_if_string = True
        if self.config.LOG_SIMPLE:
            # simple mode: just remove hyphenation artifacts and store text
            geschaeftslage = origpost_red.replace("- ", "")
            #parsing
            self.ef.add_to_my_obj("balances", geschaeftslage,
                                  object_number=element_counter,
                                  only_filled=only_add_if_string)
            return True
        #parsing
        table = Datatable(snippet=segmentation_class.snippet)
        table.analyse_structure(content_lines, feature_lines,
                                template="datatable_balance")
        table.extract_content(content_lines, feature_lines,
                              template="datatable_balance")
        # Write information for income table parsing
        segmentation_class.info_handler["income"] = {}
        segmentation_class.info_handler["income"]["amount"] = table.info.amount
        segmentation_class.info_handler["income"]["col"] = table.info.col
        segmentation_class.info_handler["income"]["separator"] = table.info.separator
        # Parsing the tables based on whitespace and number of numbers of each group
        # This should be the last option to parse (error-prone)
        self.ef.add_to_my_obj("balances", table.content,
                              object_number=element_counter,
                              only_filled=only_add_if_string)

    def parse_gewinn_und_verlust(self, real_start_tag, content_texts,
                                 content_lines, feature_lines,
                                 segmentation_class):
        """Parse the 'Gewinn und Verlust' (income) segment, reusing table
        layout info stored by parse_bilanzen when available."""
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag,
                                 segmentation_class, element_counter)
        # logme
        self.output_analyzer.log_segment_information(
            segmentation_class.segment_tag, content_texts, real_start_tag)
        # init
        only_add_if_string = True
        if self.config.LOG_SIMPLE:
            geschaeftslage = origpost_red.replace("- ", "")
            #parsing
            self.ef.add_to_my_obj("income", geschaeftslage,
                                  object_number=element_counter,
                                  only_filled=only_add_if_string)
            return True
        # parsing
        table = Datatable(snippet=segmentation_class.snippet)
        table.analyse_structure(content_lines, feature_lines,
                                template="datatable_income")
        # reuse column/amount/separator info written by parse_bilanzen
        if segmentation_class.info_handler and "income" in set(
                segmentation_class.info_handler.keys()):
            table.info.col = segmentation_class.info_handler["income"]["col"]
            table.info.amount = segmentation_class.info_handler["income"]["amount"]
            table.info.separator = segmentation_class.info_handler["income"]["separator"]
        table.extract_content(content_lines, feature_lines,
                              template="datatable_income")
        #parsing
        self.ef.add_to_my_obj("income", table.content,
                              object_number=element_counter,
                              only_filled=only_add_if_string)

    def parse_aktienkurse(self, real_start_tag, content_texts, content_lines,
                          feature_lines, segmentation_class):
        """Parse the 'Aktienkurse' (share prices) segment into a Sharetable
        (or a plain text blob in LOG_SIMPLE mode)."""
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag,
                                 segmentation_class, element_counter)
        # logme
        self.output_analyzer.log_segment_information(
            segmentation_class.segment_tag, content_texts, real_start_tag)
        # init
        only_add_if_string = True
        #self.config.LOG_SIMPLE = False
        if self.config.LOG_SIMPLE:
            # self.config.LOG_SIMPLE = False
            skip = origpost_red.replace("- ", "")
            # parsing
            self.ef.add_to_my_obj("shares", skip,
                                  object_number=element_counter,
                                  only_filled=only_add_if_string)
            return True
        # parsing
        table = Sharetable(snippet=segmentation_class.snippet)
        table.analyse_structure(content_lines, feature_lines)
        table.extract_content(content_lines, feature_lines)
        #from timeit import timeit
        #print(timeit(test))
        # parsing
        self.ef.add_to_my_obj("shares", table.content,
                              object_number=element_counter,
                              only_filled=only_add_if_string)

    def parse_dividend(self, real_start_tag, content_texts, content_lines,
                       feature_lines, segmentation_class):
        """Parse the dividend segment into a Dividendtable (or a plain text
        blob in LOG_SIMPLE mode)."""
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag,
                                 segmentation_class, element_counter)
        # logme
        self.output_analyzer.log_segment_information(
            segmentation_class.segment_tag, content_texts, real_start_tag)
        # init
        only_add_if_string = True
        # self.config.LOG_SIMPLE = True
        if self.config.LOG_SIMPLE:
            # self.config.LOG_SIMPLE = False
            skip = origpost_red.replace("- ", "")
            # parsing
            self.ef.add_to_my_obj("dividende", skip,
                                  object_number=element_counter,
                                  only_filled=only_add_if_string)
            return True
        # parsing
        table = Dividendtable(snippet=segmentation_class.snippet)
        table.analyse_structure(content_lines, feature_lines)
        table.extract_content(content_lines, feature_lines)
        # from timeit import timeit
        # print(timeit(test))
        # parsing
        self.ef.add_to_my_obj("dividende", table.content,
                              object_number=element_counter,
                              only_filled=only_add_if_string)
class AkfParsingFunctionsTwo(object):
    """Parsing functions (part two) for segments of 'Aktienführer' records.

    Each parse_* method receives the raw segment (content_texts/lines plus
    feature lines), extracts structured values with regex heuristics and
    stores them on the endobject factory (self.ef) under per-segment keys.
    """

    def __init__(self, endobject_factory, output_analyzer, dictionary_handler):
        config_handler = ConfigurationHandler(first_init=False)
        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_SEGMENT_PARSER_AKF_FN_TWO,
                                    self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL,
                                    leading_tag=self.__class__.__name__)
        self.cpr.print("init akf parsing functions two")
        self.ef = endobject_factory              # result accumulator (add_to_my_obj)
        self.output_analyzer = output_analyzer   # segment logging sink
        self.dictionary_handler = dictionary_handler

    def parse_zahlstellen(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
        """Parse the 'Zahlstellen' (paying agents) segment into bank/city/title
        entries; 'beide'/'sämtl.' markers attach extra info to previous entries."""
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
        # logme
        self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)

        split_post = origpost_red.split(';')

        # entry-type tags used while collecting
        DEFAULT_ENTRY = 1
        ADDITIONAL_INFO_BOTH = 2      # beide - two previous
        ADDITIONAL_INFO_ALL_PREV = 3  # sämtl. - all previous

        final_entries = []
        for index, entry in enumerate(split_post):
            entry_stripped = entry.strip()
            # 'beide' marker: info applies to the two previous entries
            if "beide" in entry_stripped:
                entry_final = regex.sub(r"beide\s?\.?", "##", entry_stripped).strip()
                entry_final_split = entry_final.split('##')
                for index_fs, entry_fs in enumerate(entry_final_split):
                    if entry_fs.strip() == "" :
                        continue
                    if index_fs < len(entry_final_split)-1:
                        final_entries.append((DEFAULT_ENTRY, entry_fs, "", "", ""))
                    else:
                        final_entries.append((ADDITIONAL_INFO_BOTH, entry_fs, "", "", ""))
                continue
            # 'sämtl.' marker: info applies to all previous entries
            if regex.search("sämtl\s?\.?", entry_stripped):
                entry_final = regex.sub(r"sämtl\s?\.?", "##", entry_stripped).strip()
                entry_final_split = entry_final.split('##')
                for index_fs, entry_fs in enumerate(entry_final_split):
                    if entry_fs.strip() == "":
                        continue
                    if index_fs < len(entry_final_split)-1:
                        final_entries.append((DEFAULT_ENTRY, entry_fs, "", "", ""))
                    else:
                        final_entries.append((ADDITIONAL_INFO_ALL_PREV, entry_fs, "", "", ""))
                continue

            # plain entry: "bank, city[, rest...]"
            entry_split = entry_stripped.split(',')
            bank = ""
            city = ""
            title = ""
            rest_info = []
            for fragment_index, fragment in enumerate(entry_split):
                if fragment_index == 0:
                    bank = fragment
                elif fragment_index == 1:
                    city = fragment
                elif fragment_index >= 2:
                    rest_info.append(fragment)
            if bank != "" or city != "" or title != "":
                final_entries.append((DEFAULT_ENTRY, bank, city, title, rest_info))

        # reverse list for better processing
        reverse_fe = final_entries.__reversed__()
        current_additional_info = ""
        current_info_index = None
        current_entry_type = None
        final_list = []
        for item_index, item in enumerate(reverse_fe):
            entry_type, entryorbank, city, title, rest_info = item
            # change current additional info
            if entry_type == ADDITIONAL_INFO_BOTH or entry_type == ADDITIONAL_INFO_ALL_PREV:
                current_info_index = item_index
                current_additional_info = entryorbank
            elif entry_type == DEFAULT_ENTRY:
                # prepend to keep original order despite reversed iteration
                templist = [(entryorbank, city, title, current_additional_info, rest_info)]
                templist.extend(final_list)
                final_list = templist
            # end 'beide'-entry because it's over after 2 iterations
            # NOTE(review): current_entry_type is initialized to None and never
            # reassigned, so this reset branch can never trigger — confirm whether
            # it should track entry_type from the current iteration.
            if current_entry_type == ADDITIONAL_INFO_BOTH and item_index-current_info_index >= 1:
                current_info_index = None
                current_additional_info = ""

        # finally note the entries to output
        only_add_if_value = True
        for entry in final_list:
            bank, city, title, add_info, rest_info = entry
            if add_info.strip() != "":
                rest_info_new = [add_info]
                rest_info_new.extend(rest_info)
            else:
                rest_info_new = rest_info
            #if add_info != "" and add_info != None and city =="":
            #    city += add_info
            self.ef.add_to_my_obj("bank", bank, object_number=element_counter, only_filled=only_add_if_value)
            self.ef.add_to_my_obj("city", city, object_number=element_counter, only_filled=only_add_if_value)
            self.ef.add_to_my_obj("title", title, object_number=element_counter, only_filled=only_add_if_value)
            #self.ef.add_to_my_obj("additional_info", add_info, object_number=element_counter, only_filled=only_add_if_value)
            #self.ef.add_to_my_obj("rest_info", rest_info, object_number=element_counter, only_filled=only_add_if_value)
            self.ef.add_to_my_obj("rest_info", rest_info_new, object_number=element_counter, only_filled=only_add_if_value)
            element_counter += 1
        return True

    def parse_grundkapital(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
        """Parse the 'Grundkapital' (share capital) segment: main amount plus
        currency, with year-keyed and free-form additional info sections."""
        # todo validate other currencies than 'DM'
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
        only_add_if_value = True
        # logme
        self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)

        # Try to normalize ; to : with prefix apital
        content_texts = [content_text.replace("apital;", "apital:") for content_text in content_texts]
        gk = cf.parse_general_and_keys(content_texts,
                                       join_separated_lines=True,
                                       current_key_initial_value='start_value',
                                       abc_sections=True)
        #print(gk)
        # check start value for 'normal' grundkapital content
        # if found parse
        start_value = gk.get('start_value', "")
        if len(gk.keys()) == 1:
            # only one section found: treat it as the start value
            start_value = gk[list(gk.keys())[0]]
        #if start_value =
        if len(start_value) >= 1:
            #print("could be grundkapital")
            my_return_object, found_main_amount, element_counter, only_add_if_value, additional_info = \
                cf.parse_grundkapital_line(start_value[0], False, element_counter, only_add_if_value, [])
            currency = my_return_object.get('currency', "").strip()
            amount = my_return_object.get('amount', "").strip()
            if amount != "" and currency != "":
                self.ef.add_to_my_obj('Grundkapital', my_return_object, object_number=element_counter,
                                      only_filled=only_add_if_value)
            else:
                # first line is not a parseable amount -> keep it as additional info
                gk['additional_info'] = []
                gk['additional_info'].append(start_value[0].replace("↑", ":"))
            if len(start_value) >= 2:
                # get the additional values which are in start_value, but have nothing to do with that
                # NOTE(review): the 'not in' guard is immediately overwritten by the
                # reset on the next line, discarding any info stored above — confirm.
                if 'additional_info' not in gk.keys():
                    gk['additional_info'] = []
                gk['additional_info'] = []
                for index in range(1, len(start_value)):
                    val = start_value[index]
                    gk['additional_info'].append(val.replace("↑", ":"))
        """
        if 'additional_info' in gk.keys():
            gk_ai = cf.parse_general_and_keys(gk['additional_info'],
                                              join_separated_lines=True,
                                              current_key_initial_value='start_value_addinfo',
                                              abc_sections=True)
            print("lemme check")
        """
        for key in gk:
            # NOTE(review): identity comparison with a literal — should be '=='
            # (works on CPython due to interning, but is fragile).
            if key is "start_value":
                continue
            entry = gk[key]
            # individual parsing here
            match_year = regex.search("\d\d\d\d", key)  # key is year
            year = None
            key_rest = ""
            if match_year:
                year = match_year.group()
                key_rest = key.replace(year, "").strip()
            accumulated_text = []
            if key_rest != "":
                accumulated_text.append(key_rest)
            for inner_entry in entry:
                accumulated_text.append(inner_entry)
            final_entry = None
            if year is None:
                final_entry = accumulated_text
            else:
                final_entry = {
                    "year": year,
                    "text": accumulated_text
                }
            if final_entry != None and final_entry != "":
                self.ef.add_to_my_obj(key, final_entry, object_number=element_counter,
                                      only_filled=only_add_if_value)
                element_counter += 1
        # check all year lines and parse the
        return

        # old parsing style (dead code, kept for reference — unreachable after return)
        final_entries = []
        current_ref_index = -1
        found_main_amount = False
        additional_info = []
        only_add_if_value = True
        for text_index, text in enumerate(content_texts):
            text_stripped = text.strip()
            if text_stripped == "":
                continue
            # todo increment element ctr ?
            my_return_object, found_main_amount, element_counter, only_add_if_value, additional_info = \
                cf.parse_grundkapital_line(text_stripped, found_main_amount, element_counter,
                                           only_add_if_value, additional_info)
            for key in my_return_object:
                value = my_return_object[key]
                self.ef.add_to_my_obj(key, value, object_number=element_counter, only_filled=only_add_if_value)
        if len(additional_info) >= 1:
            add_lines_parsed = cf.parse_grundkapital_additional_lines(additional_info, element_counter, True, 0)
            self.ef.add_to_my_obj("additional_info", add_lines_parsed, object_number=element_counter,
                                  only_filled=only_add_if_value)
        return True

    def parse_ordnungsnrdaktien(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
        """Parse 'Ordnungsnr. d. Aktien' (security order numbers): leading number,
        optional parenthesized category, remainder as additional info."""
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
        # logme
        self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)
        only_add_if_value = True
        # example values - each line of content_texts list
        # '589300 (St.-Akt.)'
        # '589300.'
        first_number_match = True
        for entry in content_texts:
            entry_stripped = entry.strip()
            rest = entry_stripped
            if entry_stripped == "":
                continue
            match_number = regex.search(r"^([\d\s]*)", entry_stripped)
            match_parenth = regex.search(r"\(.*\)", entry_stripped)  # take content in parenthesis
            if match_number is not None and match_number.group(0).strip() != "":
                if not first_number_match:
                    element_counter += 1  # switch to next element if number not true
                number = match_number.group(0).strip()
                self.ef.add_to_my_obj("ord_number", number, object_number=element_counter,
                                      only_filled=only_add_if_value)
                rest = rest.replace(number, "", 1)
                first_number_match = False
            if match_parenth is not None:
                parenth = match_parenth.group(0)
                self.ef.add_to_my_obj("category", parenth, object_number=element_counter,
                                      only_filled=only_add_if_value)
                rest = rest.replace(parenth, "", 1)
            rest_stripped = rest.strip()
            if rest_stripped != "":
                self.ef.add_to_my_obj("additional_info", rest_stripped, object_number=element_counter,
                                      only_filled=only_add_if_value)

    def parse_grossaktionaer(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
        """Parse 'Großaktionär' (major shareholders): per ';'-separated line,
        split into shareholder / location / percentage share."""
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
        # logme
        self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)
        lines_split = origpost_red.split(';')
        only_add_if_value = True
        for line in lines_split:
            # testline
            # line = "Société Sidérurgique de Participations et d’ Approvisionnement en Charbons, par abréviation (Sidechar), Paris (ca.60,2 %)."
            # percentage groups like "(ca.60,2 %)" delimit the entries
            findings = regex.finditer(r"\([a-zü0-9\s\,\.]*%\).?", line)
            lof = list(findings)
            #findings = regex.search(r"(?m)a", line)
            if lof:
                findings = []
                for finding in lof:
                    findings.append(finding.regs[0])  # (start, stop) spans
            else:
                findings = [(len(line), len(line))]   # sentinel: whole line, no share
            start = 0
            for idx, finding in enumerate(findings):
                #shareholder,location, share
                item = line[start:finding[0]]
                if ":" in item:
                    # text before ':' is a free-form note
                    self.ef.add_to_my_obj("additional_information", item[:item.index(":")],
                                          object_number=element_counter, only_filled=only_add_if_value)
                    if line.index(":")+2 > finding[0]:
                        continue
                    else:
                        item = item[item.index(":"):]
                item = item.rsplit(",", 1)
                self.ef.add_to_my_obj("shareholder", item[0].strip(), object_number=element_counter,
                                      only_filled=only_add_if_value)
                if len(item) > 1 and item[1] != "":
                    if item[1][-1] == ".":
                        item[1] = item[1][:len(item[1])-1]
                    if "(" in item[1] and ")" in item[1]:
                        # share given inside the location part, e.g. "(maßgeblich)"
                        find = regex.search(r"(\([0-9\s\,]*|maßgeblich|Mehrheit|Majorität)\)", item[1])
                        if find:
                            self.ef.add_to_my_obj("share",
                                                  item[1][find.regs[0][0]:find.regs[0][1]-1].strip(),
                                                  object_number=element_counter, only_filled=only_add_if_value)
                            item[1] = item[1][:find.regs[0][0]-1]
                    self.ef.add_to_my_obj("location", item[1].strip(), object_number=element_counter,
                                          only_filled=only_add_if_value)
                if finding[0] != len(line):
                    # percentage span itself is the share value
                    self.ef.add_to_my_obj("share",
                                          line[finding[0]:finding[1]].replace(", ", ",").replace("(", "").replace(").", "").replace(")", "").strip(),
                                          object_number=element_counter, only_filled=only_add_if_value)
                start = finding[1]
                element_counter += 1
        #print(self.ef.my_object["Großaktionär"])
        """
        # find parenthesis with 2 or more characters inside
        #for item in line.split("%)"):
        match_parenth = regex.findall(r"(\(.{2,}\))", line)
        found_parenth = None
        parenth_is_used = False
        organization = None
        location = None
        # find additional info in each line and subtract it
        if match_parenth:
            found_parenth = match_parenth[-1].strip("., ")  # find the last parenthesis grounp
            # if the parenthesis are at the end of line
            if line.strip()[-1] == ")" and not(len(found_parenth.replace(" ", "")) <= 5 and "%" in found_parenth):  # exclude percentages from parenthesis matches
                line = line.replace(found_parenth, "", 1)
                parenth_is_used = True
        split_line = line.split(',')
        len_split_line = len(split_line)
        if len_split_line == 1:
            organization = line.strip("., ")
        else:
            organization = line.replace(split_line[-1], "", 1).strip("., ")
            location = split_line[-1].strip("., ")  # town
        self.ef.add_to_my_obj("organization", organization, object_number=element_counter,only_filled=only_add_if_value)
        self.ef.add_to_my_obj("location", location, object_number=element_counter,only_filled=only_add_if_value)
        if parenth_is_used:
            self.ef.add_to_my_obj("additional_info", found_parenth, object_number=element_counter,only_filled=only_add_if_value)
        element_counter += 1
        """
        return True

    def parse_geschaeftsjahr(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
        """Parse 'Geschäftsjahr' (fiscal year): 'X bis Y' ranges become
        gesch_jahr_start/stop, everything else is collected under 'year'."""
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
        # logme
        self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)
        only_add_if_value = True
        final_jahr = []
        for text in content_texts:
            text_stripped = text.strip("., ")
            if text_stripped != "":
                if "bis" in text_stripped:
                    split_text = text_stripped.split('bis ')  # # regex.split('\.bis|\sbis\s', text_stripped)
                    if len(split_text) == 1:
                        final_jahr.append(split_text[0].strip())
                        continue
                    gesch_jahr_start = split_text[0].strip("( ")
                    gesch_jahr_stop = split_text[1].strip(" )")
                    self.ef.add_to_my_obj('gesch_jahr_start', gesch_jahr_start, object_number=element_counter,
                                          only_filled=only_add_if_value)
                    self.ef.add_to_my_obj('gesch_jahr_stop', gesch_jahr_stop, object_number=element_counter,
                                          only_filled=only_add_if_value)
                    # NOTE(review): slice starts at 3 although index 2 would be the
                    # first unconsumed chunk — confirm off-by-one intent.
                    if len(split_text) >= 3:
                        for rest in split_text[3:]:
                            if rest.strip() != "":
                                final_jahr.append(rest)
                else:
                    final_jahr.append(text_stripped)
        self.ef.add_to_my_obj('year', final_jahr, object_number=element_counter,
                              only_filled=only_add_if_value)
        return True

    def parse_stimmrechtaktien(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
        """Parse 'Stimmrecht d. Aktien' (voting rights): extracts per-entry dicts
        {kind, amount, vote, value, currency, rank} via several regex variants
        ('je ... Aktie ... Stimme', 'Je nom. ...', 'bzw.' double entries)."""
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
        # logme
        self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)
        # find last parenthesis and filter
        match_parenth = regex.findall(r"(\(.*?\))", origpost_red)
        found_parenth = None
        origpost_used = origpost_red
        # find additional info in each line and subtract it
        if match_parenth:
            found_parenth = match_parenth[-1].strip("., ")  # find the last parenthesis grounp
            origpost_used = origpost_red.replace(found_parenth, "")  # update the orignpost used
        final_lines = []
        only_add_if_value = True
        skip = False        # set when the next line was already consumed by a lookahead
        final_text = ""     # accumulates unparseable lines for 'additional_info'
        for text_index, text in enumerate(content_texts):
            if text == "":
                continue
            # normalize common value notations, e.g. "DM =" -> "DM 1 ="
            text = text.replace("DM =", "DM 1 =").replace("DM=", "DM 1 =").replace("eine DM", "DM 1.-")
            if element_counter == 0 and "je nom" not in text.lower():
                # first entry without 'je nom' -> whole remainder is additional info
                self.ef.add_to_my_obj("additional_info", "".join(content_texts[text_index:]),
                                      object_number=element_counter, only_filled=only_add_if_value)
                break
            if skip:
                skip = False
                continue
            # variant 1: "je ... Aktie(n) ... Stimme(n)"
            parse_aktie = regex.compile(r"(?P<nominal>[Jj]e[de]*?\s?(?P<nomvalue>[\d\s]*?)\s?[Aa]ktie[n]?)[^\d]*(?P<vote>[\d\s]*?)\s*?(?P<voteend>Stimme[n]*)")
            finding = parse_aktie.findall(text.replace("Stamm", ""))
            if finding != []:
                finding = list(finding[0])
                if finding[1] == "":
                    finding[1] = "1"
                stck = {"kind": "Aktie",
                        "amount": finding[1],
                        "vote": finding[2].replace(" ", "").strip(),
                        "value": "",
                        "currency": "",
                        "rank": element_counter}
                self.ef.add_to_my_obj(element_counter, stck, object_number=element_counter,
                                      only_filled=only_add_if_value)
                element_counter += 1
                continue
            #text = 'Je nom. DM 50.- =1 Stimme.'
            # variant 2: "Je nom. <currency> <value> ... <vote> Stimme(n)"
            parse_stimmrecht = regex.compile(r"(?P<nominal>[Jj]e[de]*?\s?(?P<nomvalue>[\d\s]*?)\s?nom\.)\s*?(?P<currency>[^\d]*)\s?(?P<value>[\d\s]*)\s*?(?P<waste>[^\dA-Za-z]*)\s{0,}(?P<kind>[A-Za-z.,\-\s]*)?[^\d\s]*\s{0,}(?P<vote>[\d]*)?\s{0,}(?P<voteend>Stimme[n]*)?")
            finding = parse_stimmrecht.findall(text.replace("DM", " DM").replace("RM", " RM"))
            # Special case "bzw." — one line yields two value/vote pairs
            if finding and "bzw." in text:
                if "Stimm" not in text:
                    skip = True
                    text += content_texts[text_index+1]
                parse_bzw = regex.compile(r"(?P<nominal>[Jj]e[de]*?\s?(?P<nomvalue>[\d\s]*?)\s?nom\.)\s*?(?P<currency>[^\d]*)\s?(?P<value>[\d\s]*)\s*?[^\d]*\s*?(?P<value2>[\d\s]*)[^\dA-Za-z]*(?P<kind>[A-Za-z][A-Za-z.,\-\s]*)?[^\d\s]*\s{0,}(?P<vote>[\d]*)?\s{0,}[^\d]*\s{0,}(?P<vote2>[\d]*)\s{0,}(?P<voteend>Stimme[n]*)?")
                finding = parse_bzw.findall(text)
                finding = finding[0]
                if finding:
                    stck = {"kind": finding[5].strip(),
                            "amount": "1",
                            "vote": finding[6].replace(" ", "").strip(),
                            "value": finding[3].strip(),
                            "currency": finding[2].strip(),
                            "rank": element_counter}
                    self.ef.add_to_my_obj(element_counter, stck, object_number=element_counter,
                                          only_filled=only_add_if_value)
                    element_counter += 1
                    stck = {"kind": finding[5].strip(),
                            "amount": "1",
                            "vote": finding[7].replace(" ", "").strip(),
                            "value": finding[4].strip(),
                            "currency": finding[2].strip(),
                            "rank": element_counter}
                    self.ef.add_to_my_obj(element_counter, stck, object_number=element_counter,
                                          only_filled=only_add_if_value)
                continue
            if not finding or finding[0][0] + finding[0][1] == "":
                # nothing parseable: collect for additional_info
                final_text += text
                continue
            if final_text != "":
                self.ef.add_to_my_obj("additional_info", final_text, object_number=element_counter-1,
                                      only_filled=only_add_if_value)
                final_text = ""
            finding_next = None
            if finding[0][6] + finding[0][7] == "":
                # vote part missing -> try joining with the next line
                if text_index == len(content_texts) - 1:
                    self.ef.add_to_my_obj("additional_info", text, object_number=element_counter,
                                          only_filled=only_add_if_value)
                    continue
                else:
                    finding_next = parse_stimmrecht.findall(text + " " + content_texts[text_index + 1])
            if finding_next:
                skip = True
                finding = finding_next
            finding = list(finding[0])
            if finding[5] == "":
                finding[5] = "nom."
            if finding[1] == "":
                finding[1] = "1"
            stck = {"kind": finding[5].strip(),
                    "amount": finding[1].strip(),
                    "vote": finding[6].replace(" ", "").strip(),
                    "value": finding[3].strip(),
                    "currency": finding[2].strip(),
                    "rank": element_counter}
            self.ef.add_to_my_obj(element_counter, stck, object_number=element_counter,
                                  only_filled=only_add_if_value)
            element_counter += 1
        # match_akt = regex.search(r"\.\s?\-\s?Akt", text)
        # if match_saemtlsakt is not None:
        #    self.ef.add_to_my_obj("additional_info", text, object_number=element_counter, only_filled=only_add_if_value)
        #    element_counter += 1
        #    continue
        if final_text != "":
            self.ef.add_to_my_obj("additional_info", final_text, object_number=element_counter,
                                  only_filled=only_add_if_value)
        return True
        """
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
        # logme
        self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)
        # add extra splitting elements to each 'je' or 'Je'
        origpost_red_se = regex.sub(r"(Je |je )", r"~~~\1", origpost_red)
        split_text = origpost_red_se.split('~~~')
        # origpost_red = regex.sub(r"(\d\.)", r"\1~~~~", origpost_red)
        only_add_if_value = True
        for entry in split_text:
            if entry == "":
                continue
            match_sb = regex.search(r"Stimmrechtsbeschränkung:.*", entry)
            sbe = None
            if match_sb is not None:
                sbe = match_sb.group()
                sbe = sbe.replace("Stimmrechtsbeschränkung:", "", 1)
                entry = entry.replace(sbe, "").replace("Stimmrechtsbeschränkung:", "", 1)
            self.ef.add_to_my_obj("entry", entry, object_number=element_counter ,only_filled=only_add_if_value)
            self.ef.add_to_my_obj("Stimmrechtsbeschränkung", sbe, object_number=element_counter ,only_filled=only_add_if_value)
            element_counter += 1
        """

    def parse_boersennotiz(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
        """Parse 'Börsennotiz' (stock exchange listings): split on 'u.'/'und'/','
        and store each cleaned fragment as a 'location'."""
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
        # logme
        self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)
        found_parenth = None
        origpost_used = origpost_red
        # log all location elements
        only_add_if_value = True
        split_post = regex.split('u\.|und|,', origpost_used)
        for entry in split_post:
            entry_stripped = entry.strip("., ")
            # find additional info in each line and subtract it
            # find last parenthesis and filter
            #match_parenth = regex.findall(r"(\(.*?\))", entry_stripped)
            #combined_ps = []
            #for res in match_parenth:
            #    combined_ps.append(res.strip())
            #origpost_used = origpost_red.replace(found_parenth, "")  # update the orignpost used
            # log additional info in last parenthesis
            #self.ef.add_to_my_obj("additional_info", combined_ps, object_number=element_counter,
            #                      only_filled = only_add_if_value)
            #if entry_stripped is None or entry_stripped == "":
            #if match_parenth:
            #    element_counter += 1
            entry_stripped = entry.replace("im Freiverkehr", "").replace("(amtl.)", "").strip("., ")
            if entry_stripped == None or entry_stripped == "":
                continue
            self.ef.add_to_my_obj("location", entry_stripped, object_number=element_counter,
                                  only_filled=only_add_if_value)
            element_counter += 1
        return True

    def preprocess_stueckelung_texts(self, content_texts):
        """Join 'zu je ...' continuations and '(...)' lines with their preceding
        line so each denomination entry is a single text; drops empty results."""
        final_stueckelung_texts = []
        previous_text_stripped = ""
        for index, current_text in enumerate(content_texts):
            current_text_stripped = current_text.strip()
            if current_text_stripped == "":
                continue
            if current_text_stripped.startswith("zu je") or current_text_stripped.startswith("Zu je"):
                final_stueckelung_texts.append(previous_text_stripped + " " + current_text_stripped)
                previous_text_stripped = ""
            elif "(" == current_text_stripped[0] and ")" == current_text_stripped[-1]:
                final_stueckelung_texts.append(previous_text_stripped + " " + current_text_stripped)
                previous_text_stripped = ""
            else:
                final_stueckelung_texts.append(previous_text_stripped)
                previous_text_stripped = current_text_stripped
                if index == len(content_texts)-1:
                    # last line: flush the pending text
                    final_stueckelung_texts.append(current_text_stripped)
        final_texts_filtered = []
        for text in final_stueckelung_texts:
            text_stripped = text.strip()
            if text_stripped != "":
                final_texts_filtered.append(text_stripped)
        return final_texts_filtered

    def parse_stueckelung(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
        """Parse 'Stückelung' (share denominations): per entry a dict
        {amount, kind, nominal, currency, value, rank}; unparseable text and
        regex leftovers go to 'additional_info'."""
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
        # logme
        self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)
        # find last parenthesis and filter
        match_parenth = regex.findall(r"(\(.*?\))", origpost_red)
        found_parenth = None
        origpost_used = origpost_red
        # find additional info in each line and subtract it
        if match_parenth:
            found_parenth = match_parenth[-1].strip("., ")  # find the last parenthesis grounp
            origpost_used = origpost_red.replace(found_parenth, "")  # update the orignpost used
        final_lines = []
        additional_info_final = []
        only_add_if_value = True
        skip = False
        final_text = ""
        final_add_rest = ""
        content_texts = self.preprocess_stueckelung_texts(content_texts)
        for text_index, text in enumerate(content_texts):
            if text.strip() == "":
                continue
            if skip:
                skip = False
                continue
            # "<amount> <kind> zu je <currency> <value>"
            parse_stck = regex.compile(r"(?P<amount>[\d\s\.]*)\s*(?P<kind>[^\d]*?)[\s]?(?P<nominal>zu je|zuje|zu|je)\s{0,}(?P<currency>[^\d\s]*)\s{0,}(?P<value>[\d\s]*)")
            finding = parse_stck.findall(text.replace(" Stücke ", " Aktien ").replace(" Stück ", " Aktie ").replace("DM", " DM").replace("RM", " RM").replace("hfl", " hfl"))
            rest_finding = ""
            if len(finding) >= 1:
                rest_finding = text
                # get the rest of finding
                subtract_sorted = sorted(finding[0], key=len)
                subtract_sorted.reverse()
                for find_chunk in subtract_sorted:
                    rest_finding = rest_finding.replace(find_chunk, "", 1).strip()
                rest_finding = regex.sub("\s{2,}", " ", rest_finding)  # just replace redundant spaces for better subtraction
            if not finding or finding[0][0]+finding[0][1] == "" or finding[0][0]+finding[0][4] == "":
                # no usable amount/value found — check the known special cases
                match_akt = regex.search(r"\.\s?\-\s?Akt", text)
                match_saemtlsakt, err_saemtlsakt = regu.fuzzy_search(
                    r"([Ss]ämtliche [Ss]tammaktien.*|[Ss]ämtliche [Aa]ktien.*|[Ss]ämtliche Namens\-Stammaktien.*)", text, err_number=1)
                if match_saemtlsakt is not None:  # and match_akt is not None: @jk is this second condition really necessary ?
                    saemtl_res = match_saemtlsakt.group()
                    self.ef.add_to_my_obj("additional_info", saemtl_res, object_number=element_counter,
                                          only_filled=only_add_if_value)
                    reduced_text = text.replace(saemtl_res, "")
                    final_lines.append(reduced_text)
                    rest_finding = rest_finding.replace(reduced_text, "")
                if "Börse" in text or "Besondere" in text:
                    # trailing free-text block: consume the rest of the segment
                    addendum = "".join(content_texts[text_index:])
                    self.ef.add_to_my_obj("additional_info", addendum, object_number=element_counter,
                                          only_filled=only_add_if_value)
                    element_counter += 1
                    rest_finding = rest_finding.replace("".join(content_texts[text_index:]), "")
                    break
                if "(" in text:
                    self.ef.add_to_my_obj("additional_info", text, object_number=element_counter-1,
                                          only_filled=only_add_if_value)
                    rest_finding = rest_finding.replace(text, "")
                else:
                    rest_finding = rest_finding.replace(text, "")
                    final_text += text
                continue
            finding_next = None
            if finding[0][2] == "" or (("zu" in finding[0][2] or "je" in finding[0][2]) and finding[0][3] == ""):
                #test = '2 638 514 Inh. - bzw. Namensaktien zuje FF 75.-'
                if text_index == len(content_texts) - 1:
                    self.ef.add_to_my_obj("additional_info", text, object_number=element_counter,
                                          only_filled=only_add_if_value)
                    continue
                else:
                    finding_next = parse_stck.findall(text + " " + content_texts[text_index + 1])
            if finding[0][3]+finding[0][4] == "":
                # currency/value missing -> try joining with the next line
                if text_index == len(content_texts) - 1:
                    self.ef.add_to_my_obj("additional_info", text, object_number=element_counter,
                                          only_filled=only_add_if_value)
                    continue
                else:
                    finding_next = parse_stck.findall(text + " " + content_texts[text_index + 1])
            if finding_next:
                skip = True
                finding = finding_next
            stck = {"amount": finding[0][0].replace(".", " ").strip(),
                    "kind": finding[0][1].replace(" ", "").strip(),
                    "nominal": "zu je",
                    "currency": finding[0][3],
                    "value": finding[0][4],
                    "rank": element_counter}
            self.ef.add_to_my_obj("entry", stck, object_number=element_counter, only_filled=only_add_if_value)
            if rest_finding != "":
                final_add_rest += rest_finding + " "
            element_counter += 1
        # match_akt = regex.search(r"\.\s?\-\s?Akt", text)
        #if match_saemtlsakt is not None:
        #    self.ef.add_to_my_obj("additional_info", text, object_number=element_counter, only_filled=only_add_if_value)
        #    element_counter += 1
        #    continue
        if final_text != "":
            self.ef.add_to_my_obj("additional_info",
                                  final_text.replace(final_add_rest.strip(".,- "), "", 1).strip(".,- "),
                                  object_number=element_counter, only_filled=only_add_if_value)
            element_counter += 1
        if final_add_rest != "":
            self.ef.add_to_my_obj("additional_info", final_add_rest.strip(".,- "),
                                  object_number=element_counter, only_filled=only_add_if_value)
        return True
class FeatureExtractor():
    """Extracts per-line statistical features (character-class counts, ratios,
    word-box geometry) from OCR line data for later table recognition."""

    def __init__(self):
        config_handler = ConfigurationHandler(first_init=False)
        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_FEATURE_EXTRACTOR,
                                    self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL,
                                    leading_tag=self.__class__.__name__)
        # first words which mark a line as "not a table row"
        self.filter_start_words = ["Fernruf:", "Vorstand:", "Fernschreiber:", "von",
                                   "Gründung:", "Ordnungsnr.", "Ordnungsnr",
                                   "Grundkapital:", "Umstellung"]

    def extract_file_features(self, ocromore_data):
        """Compute features for every line in *ocromore_data* and attach the
        list as ocromore_data['line_features']; returns the (mutated) data."""
        all_line_features = []
        for line in ocromore_data['lines']:
            current_line_features = self.extract_line_features(line)
            all_line_features.append(current_line_features)
        ocromore_data['line_features'] = all_line_features
        return ocromore_data

    def extract_line_features(self, line):
        """Extract a LineFeatures object for one *line*.

        Expects a dict with 'text', 'line_index' and 'words', where each word
        has 'word_index', 'text' and 'hocr_coordinates' (x0, y0, x1, y1).
        Returns False (no features) for a line with empty text.
        """
        whole_text = line['text']
        self.cpr.print("recognizing text:", whole_text)

        # counters over the whole line
        counter_special_chars = 0
        counter_alphanumerical_chars = 0
        counter_numbers = 0
        counter_chars = len(whole_text)
        counter_alphabetical = 0
        counter_words = 0
        counters_alphabetical_ratios = []
        counters_wordlengths = []
        counters_numbers = []
        character_index = 0

        # special per-line conditions
        # NOTE(review): these flags are computed but never attached to the
        # returned LineFeatures object — confirm whether they should be.
        ultimo_is_first_word = False
        first_word_no_table_indicator = False
        starts_with_parenthesis = False
        ends_with_parenthesis = False

        last_xstop = 0
        x_box_sizes = []
        x_gaps = []

        for word_obj in line['words']:
            word_index = word_obj['word_index']
            word_text = word_obj['text']
            hocr_coordinates = word_obj['hocr_coordinates']
            word_xstart = hocr_coordinates[0]
            word_xstop = hocr_coordinates[2]
            word_box_size = word_xstop - word_xstart
            x_box_sizes.append(word_box_size)

            if word_index >= 1:
                # NOTE(review): gap is measured stop-to-stop, so it includes the
                # current word's own width — confirm this is intended
                x_gap = word_xstop - last_xstop
                x_gaps.append(x_gap)

            if word_text is None or word_text == "":
                continue

            if word_index == 0:
                if word_text in self.filter_start_words:
                    first_word_no_table_indicator = True
                if word_text.lower() == "ultimo":
                    ultimo_is_first_word = True
                if word_text[0] == "(":
                    starts_with_parenthesis = True
            # fix: compare against the number of words; the original compared
            # against len(whole_text) (character count), which made this
            # last-word check virtually unreachable
            if word_index == len(line['words']) - 1:
                if word_text[-1] == ")":
                    ends_with_parenthesis = True

            # per-word counters
            counter_alphanumerical_chars_word = 0
            counter_numbers_word = 0
            counter_words += 1
            for char in word_text:
                if Random.is_special_character(char):
                    counter_special_chars += 1
                elif Random.is_alphanumerical_character(char):
                    counter_alphanumerical_chars += 1
                    counter_alphanumerical_chars_word += 1
                    if char.isdigit():
                        counter_numbers += 1
                        counter_numbers_word += 1

            counter_alphabetical_word = counter_alphanumerical_chars_word - counter_numbers_word
            ratio_alphabetical_word = np.round(counter_alphabetical_word / len(word_text), 2)
            counters_alphabetical_ratios.append(ratio_alphabetical_word)
            counters_wordlengths.append(len(word_text))
            counters_numbers.append(counter_numbers_word)
            character_index += len(word_text)
            last_xstop = word_xstop

        # get number of spaces
        len_whole_unspace = len(whole_text.replace(" ", ""))
        counter_spaces = counter_chars - len_whole_unspace
        # set alphabetical counter
        counter_alphabetical = counter_alphanumerical_chars - counter_numbers

        if counter_chars == 0:
            self.cpr.printw("no chars in line:", str(line['line_index']), "no features here")
            return False

        special_chars_ratio = counter_special_chars / counter_chars
        alphanumerical_chars_ratio = counter_alphanumerical_chars / counter_chars
        alphabetical_ratio = counter_alphabetical / counter_chars
        spaces_ratio = counter_spaces / counter_chars
        numbers_ratio = counter_numbers / counter_chars

        maximum_x_gap = None
        mean_x_gap = None
        median_x_gap = None
        if len(x_gaps) >= 1:
            maximum_x_gap = max(x_gaps)
            mean_x_gap = np.mean(x_gaps)
            median_x_gap = np.median(x_gaps)

        many_numbers_in_first_word = False
        many_alphabetical_in_middle_words = False
        many_alphabetical_in_last_word = False

        # check some first, middle and last word conditions
        for counter_index, counter in enumerate(counters_wordlengths):
            if counter_index == 0:
                ctr_numbers = counters_numbers[counter_index]
                numbers_ratio_word = np.round(ctr_numbers / counter, 2)
                if numbers_ratio_word > 0.8:
                    many_numbers_in_first_word = True
            elif counter_index == len(counters_wordlengths) - 1:
                if counter >= 4:
                    alphabetical_ratio_word = counters_alphabetical_ratios[counter_index]
                    if alphabetical_ratio_word >= 0.75:
                        many_alphabetical_in_last_word = True
            else:
                if counter >= 4:
                    alphabetical_ratio_word = counters_alphabetical_ratios[counter_index]
                    if alphabetical_ratio_word >= 0.75:
                        many_alphabetical_in_middle_words = True

        final_line_features = LineFeatures(cpr=self.cpr)
        final_line_features.counter_special_chars = counter_special_chars
        final_line_features.counter_chars = counter_chars
        final_line_features.counter_spaces = counter_spaces
        final_line_features.counter_numbers = counter_numbers
        final_line_features.counter_alphabetical = counter_alphabetical
        final_line_features.counter_alphanumerical_chars = counter_alphanumerical_chars
        final_line_features.counter_words = counter_words
        final_line_features.counters_numbers = counters_numbers
        final_line_features.counters_wordlengths = counters_wordlengths
        final_line_features.counters_alphabetical_ratios = counters_alphabetical_ratios
        final_line_features.numbers_ratio = numbers_ratio
        final_line_features.alphabetical_ratio = alphabetical_ratio
        final_line_features.alphanumerical_chars_ratio = alphanumerical_chars_ratio
        final_line_features.special_chars_ratio = special_chars_ratio
        final_line_features.spaces_ratio = spaces_ratio
        final_line_features.many_alphabetical_in_last_word = many_alphabetical_in_last_word
        final_line_features.many_alphabetical_in_middle_words = many_alphabetical_in_middle_words
        final_line_features.many_numbers_in_first_word = many_numbers_in_first_word
        final_line_features.x_box_sizes = x_box_sizes
        final_line_features.x_gaps = x_gaps
        final_line_features.maximum_x_gap = maximum_x_gap
        final_line_features.mean_x_gap = mean_x_gap
        final_line_features.median_x_gap = median_x_gap
        return final_line_features
class DictionaryHandler(object):
    """Loads and serves the title/function dictionaries used for person parsing.

    The dictionaries are JSON files (``dict_titles.json`` / ``dict_functs.json``)
    located under ``<cwd>/additionals/dictionaries``. Each is turned into a list
    of ``(text, len(text))`` tuples sorted longest-first, so that the longest
    possible title is matched before any shorter prefix of it.
    """

    def __init__(self):
        config_handler = ConfigurationHandler(first_init=False)

        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_DICTIONARY_HANDLER, self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL, leading_tag=self.__class__.__name__)
        self.cpr.print("init dictionary handler")
        self.data_functs = None  # storage for json object
        self.data_titles = None  # storage for json object
        self.texts_functs = None  # list of (text, length) tuples, longest first
        self.texts_titles = None  # list of (text, length) tuples, longest first

        if self.config.USE_DICTIONARIES_FOR_PERSON_PARSING:
            self.load_dictionaries()
            # get the rows as sorted list of texts longest first
            if self.data_functs is not None:
                check_tf = self.sort_rows(self.get_rows(self.data_functs))
                self.texts_functs = check_tf
            if self.data_titles is not None:
                check_tt = self.sort_rows(self.get_rows(self.data_titles))
                self.texts_titles = check_tt

    def diff_name_title(self, text_to_check):
        """Split a person string into (name, title) using the title dictionary.

        Returns the text with the first matching title removed as the name and
        the matched title itself; if no title matches (or no dictionary is
        loaded), the original text and an empty title are returned.
        """
        # guard: the title dictionary may be unloaded (disabled by config or
        # missing file) — in that case nothing can be diffed
        if not self.texts_titles:
            return text_to_check, ""

        len_text_to_check = len(text_to_check)
        name_found = text_to_check
        title_found = ""
        for entry_index, entry in enumerate(self.texts_titles):
            title, tlen = entry
            # accelerate the process, by skipping comparisons which have longer texts
            if tlen > len_text_to_check:
                continue
            # compare the texts; entries are longest-first, so the first hit
            # is the longest matching title
            if title in text_to_check:
                name_found = text_to_check.replace(title, "", 1).strip()
                title_found = title
                break

        return name_found, title_found

    def load_dictionaries(self):
        """Load the title and function JSON dictionaries from disk.

        Missing files are reported via the conditional printer instead of
        raising, leaving the corresponding data attribute as ``None``.
        """
        base_dict_path = self.get_dict_path()
        filepath_titles_dict = os.path.join(base_dict_path, "dict_titles.json")
        filepath_functs_dict = os.path.join(base_dict_path, "dict_functs.json")

        # load titles
        if os.path.exists(filepath_titles_dict):
            with open(filepath_titles_dict) as f:
                self.data_titles = json.load(f)
        else:
            self.cpr.printex("dictionary dict_titles.json missing at specified path", filepath_titles_dict)

        # load functs
        if os.path.exists(filepath_functs_dict):
            with open(filepath_functs_dict) as f:
                self.data_functs = json.load(f)
        else:
            self.cpr.printex("dictionary dict_functs.json missing at specified path", filepath_functs_dict)

    def get_rows(self, dict_data):
        """Extract the first column of each row as ``(text, len(text))`` tuples."""
        rows = dict_data['rows']
        final_rows = []
        for entry in rows:
            text = entry[0]
            final_rows.append((text, len(text)))
        return final_rows

    def sort_rows(self, rows):
        """Sort ``(text, length)`` tuples in place, longest text first; return the list."""
        rows.sort(key=lambda t: len(t[0]), reverse=True)
        return rows

    def path(self):
        """Return the current working directory (base for relative dict paths)."""
        return os.getcwd()

    def get_dict_path(self):
        """Return the directory that is expected to contain the dictionary files."""
        complete = os.path.join(self.path(), "additionals", "dictionaries")
        return complete
class SegmentClassifier(object):
    """
    This is the basic handler for classification
    which get's accessed from root/-outside classes.
    """

    def __init__(self):
        config_handler = ConfigurationHandler(first_init=False)

        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_SEGMENT_CLASSIFIER, self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL, leading_tag=self.__class__.__name__)
        self.cpr.print("init segment classifier")

    def classify_file_segments(self, ocromore_data):
        """Run segment matching over all lines of a file and attach the result.

        Iterates the file's lines in order, offering each segment matcher the
        current line, its features, the previous line, and a "combined" line
        (previous + current joined with separation artifacts removed). After
        matching, segment start/stop indices are reconciled (two strategies,
        selected by MATCH_UNTIL_NEXT_START_THEN_STOP_CONDITION), final matching
        steps run, and the reconciliation is repeated.

        :param ocromore_data: dict with 'lines', 'line_features', 'file_info';
            gets 'segmentation' (an AllSegments holder) added.
        :return: the same ocromore_data dict, enriched with 'segmentation'
        """
        lines = ocromore_data['lines']
        feats = ocromore_data['line_features']
        file_info = ocromore_data['file_info']
        all_file_segments = AllSegments(len(lines), self.cpr, self.config)

        prev_line = None
        prev_text = None
        for current_line_index, current_line in enumerate(lines):
            current_features = feats[current_line_index]
            current_text = current_line['text']
            current_index = current_line['line_index']
            # create a combined lined object with optimized (removed) separation
            combined_line = None
            if prev_line is not None:
                # join hyphenation/separation between previous and current text,
                # then merge the joined parts into one line string
                combined_lines = dh.join_separated_lines([prev_text, current_text])
                combined_line = dh.join_joined_lines(combined_lines)
            else:
                # first line: nothing to combine with yet
                combined_line = current_text
            # pass parameters to matching functions
            all_file_segments.match_my_segments(current_line, current_text, current_index, current_features,
                                                prev_line, combined_line)
            prev_line = current_line
            prev_text = current_text

        if self.config.MATCH_UNTIL_NEXT_START_THEN_STOP_CONDITION:
            self.adapt_non_explicit_indices(all_file_segments)
        else:
            all_file_segments.correct_overlaps_index_field(only_start_tags=True)
            self.adapt_stop_index_in_last_segment(all_file_segments)

        # does the last steps in segment matching
        all_file_segments.finish_segment_matching(lines, feats, file_info)

        # do again after final step
        if self.config.MATCH_UNTIL_NEXT_START_THEN_STOP_CONDITION:
            self.adapt_non_explicit_indices(all_file_segments)
        else:
            all_file_segments.correct_overlaps_index_field(only_start_tags=True)
            self.adapt_stop_index_in_last_segment(all_file_segments)

        ocromore_data['segmentation'] = all_file_segments

        return ocromore_data

    def adapt_stop_index_in_last_segment(self, all_file_segments):
        """
        Sets the stop_index for the last recognized segment, which is a
        special case and is usually not filled beforehand, because there
        is no next start index
        :param all_file_segments: holder object for segment classes and other info
        :return: None
        """

        # search for last segment (the segmented one with the highest start index)
        saved_start_index = -1
        saved_last_segment = None
        for segment in all_file_segments.my_classes:
            # only count segmented segments
            if segment.start_was_segmented is False:
                continue

            if segment.start_line_index >= saved_start_index:
                saved_start_index = segment.start_line_index
                saved_last_segment = segment

        # no segment had a segmented start -> nothing to adapt
        if saved_last_segment is None:
            return

        # adapt the last stop index of last segment: it runs to the end of the file
        saved_last_segment.stop_line_index = all_file_segments.number_of_lines-1
        saved_last_segment.stop_was_segmented = True  # todo think about if this is necessary?

    def adapt_non_explicit_indices(self, all_file_segments):
        """Reconcile segment boundaries when stops are implied by the next start.

        :param all_file_segments: holder object for segment classes and other info
        :return: None
        """

        # update start and explicit stop tags first
        all_file_segments.correct_overlaps_index_field(only_start_tags=True)
        # fill undefined stop regions until next start region
        all_file_segments.fill_start_index_until_next_stop()
class OCRset:
    """
    A storage class for a y_mean value and a set of lines which was assigned
    to each other. If the lineset values where not edited, they are
    initialized with False.

    Holds one line per OCR engine (the set is indexed 0..size-1) and provides
    several voting strategies (n-distance keying, MSA best-of-three) to pick
    or synthesize the best text for this line set.
    """
    N_DISTANCE_SHORTEST_TAG = "n_distance_shortest"

    def __init__(self, lines_size, y_mean, msa_handler):
        # one slot per OCR engine, False marks "not assigned yet"
        self._set_lines = [False for _ in range(0, lines_size)]
        self._size = lines_size
        self._y_mean = y_mean  # mean y coordinate of all lines referenced in this set
        self.shortest_distance_line_index = -1
        self._unspaced = False  # indicates the set_lines was unspaced
        self._refspaced = False  # indicates the set_lines was reference spaced
        self._text_unspacer = TextUnspacer()
        self.shortest_distance_line = None  # holder element for recognized shortest distance line
        self._best_msa_text = ""
        self._text_seg = None
        self._is_origin_database = False
        self._database_handler = None

        config_handler = ConfigurationHandler(first_init=False)
        self._config = config_handler.get_config()
        if 'ExceptionInitializing' in self._config:
            print("Exception initializing config, don't print")
            self._cpr = ConditionalPrint(False, False, False)
        else:
            self._cpr = ConditionalPrint(self._config.PRINT_MSA_HANDLER, self._config.PRINT_EXCEPTION_LEVEL,
                                         self._config.PRINT_WARNING_LEVEL)
        self._msa_handler = msa_handler

    def add_predictor(self, predictor):
        """Attach a predictor to this set and register it with the MSA handler."""
        self.predictor = predictor
        self._msa_handler.add_predictor(predictor)

    def is_database_set(self, enabled, database_handler):
        """Mark whether the lines in this set originate from a database."""
        self._is_origin_database = enabled
        self._database_handler = database_handler

    def edit_line_set_value(self, set_index, new_value):
        self._set_lines[set_index] = new_value

    def get_line_set_value_line(self, set_index):
        return self._set_lines[set_index]

    def get_line_set_value_text(self, set_index):
        """Return the text content of the line stored at set_index."""
        value_line = self.get_line_set_value_line(set_index)
        value_text = self.get_line_content(value_line)
        return value_text

    def get_msa_best_text(self):
        return self._best_msa_text

    def set_msa_best_text(self, value):
        self._best_msa_text = value

    @property
    def size(self):
        return self._size

    @size.setter
    def size(self, value):
        self._size = value

    @property
    def y_mean(self):
        return self._y_mean

    @y_mean.setter
    def y_mean(self, value):
        # fixed: assigning to self.y_mean here recursed into the setter forever
        self._y_mean = value

    def calculate_y_mean(self):
        """
        Goes through set elements and calculates y_mean for y_start and
        y_stop values
        :return:
        """
        acc_counter = 0
        y_start_final = 0
        y_stop_final = 0
        for line in self._set_lines:
            # don't count undefined values for means
            if line is False or line is None:
                continue
            # accumulate y-values
            (x_start, y_start, x_stop, y_stop) = line.coordinates
            y_start_final = y_start_final + y_start
            y_stop_final = y_stop_final + y_stop
            # add number of accumulation count
            acc_counter = acc_counter + 1

        # guard: no defined line in the set -> keep previous y_mean instead of
        # dividing by zero
        if acc_counter == 0:
            return

        y_start_mean = y_start_final / acc_counter
        y_stop_mean = y_stop_final / acc_counter
        y_mean = (y_start_mean + y_stop_mean) / 2
        self._y_mean = round(y_mean)

    def is_full(self):
        """
        Checks if all lines are defined in the lineset
        :return: True or False
        """
        for line in self._set_lines:
            if line is False:
                return False
        return True

    def print_me(self, diff_only=False):
        """Print this set via the conditional printer.

        With diff_only=True only sets that contain at least one undefined
        line are printed.
        """
        lineset_acc = ""
        one_line_is_false = False
        for line in self._set_lines:
            try:
                ocr_text = self.get_line_content(line)
                if ocr_text is False:
                    one_line_is_false = True
                    lineset_acc = lineset_acc + str(ocr_text) + "||"
                else:
                    lineset_acc = lineset_acc + ocr_text + "||"
            except Exception:  # narrowed from bare except: don't swallow KeyboardInterrupt
                self._cpr.print("problem creating printable lineset ")
                lineset_acc = lineset_acc + "||"

        msa_str = str(self._best_msa_text)
        if diff_only is True:
            if one_line_is_false is True:
                self._cpr.print(str(self.y_mean) + "||" + msa_str + "||"
                                + str(self.shortest_distance_line_index) + "||" + lineset_acc)
        else:
            self._cpr.print(str(self.y_mean) + "||" + msa_str + "||"
                            + str(self.shortest_distance_line_index) + "||" + lineset_acc)

    def calculate_n_distance_keying(self):
        """Vote the line with the shortest n-distance to all others and store it."""
        # get the texts
        texts = []
        for line in self._set_lines:
            text = self.get_line_content(line)
            texts.append(text)

        self._n_distance_voter = NDistanceVoter(texts)
        if "ExceptionInitializing" in self._config.keys():
            print("Exception in initializing config using default in c")
            shortest_dist_index = self._n_distance_voter.compare_texts(
                take_longest_on_empty_lines=True,
                vote_without_spaces=False)
        else:
            shortest_dist_index = self._n_distance_voter.compare_texts(
                take_longest_on_empty_lines=self._config.NDIST_VOTE_LONGEST_IF_EMPTY_STRINGS,
                vote_without_spaces=self._config.NDIST_VOTE_WITHOUT_SPACES)

        # save the result
        self.shortest_distance_line_index = shortest_dist_index
        self.shortest_distance_line = self._set_lines[shortest_dist_index]

    def calculate_n_distance_keying_wordwise(self):
        """Word-by-word n-distance voting (work in progress, uses mock data).

        NOTE(review): the per-line word extraction is commented out below and
        replaced by `words_mock`; this method does not yet produce a real
        result — confirm before relying on it.
        """
        if self._is_origin_database is False:
            self._cpr.printex("Wordwise keying only possible with database originated ocr_sets")
            raise Exception

        # get maximum word index todo probably will be refactored
        max_word_indices = []
        for line in self._set_lines:
            if line is False or line is None or line.textstr == '':
                max_word_indices.append(0)
            else:
                max_word_index = int(max(line.data["word_idx"]))
                max_word_indices.append(max_word_index)
        max_word_index = max(max_word_indices)
        self._cpr.print("mwi", max_word_index)

        def get_word_at_calc_wordindex(line, word_index):
            # accumulate the characters whose calculated word index matches
            accumulated_word = ""
            word_indices = line.data["calc_word_idx"]
            for char_index, char in enumerate(line.data["char"]):
                current_word_index = word_indices[char_index]
                if current_word_index == word_index:
                    accumulated_word += char
                if current_word_index > word_index:
                    break
            return accumulated_word

        max_word_index = 2
        words_mock = [["hallo", "h4llo", "hallo"], ["zwei", None, "2wei"]]

        ndist_voter = NDistanceVoter(None)
        # get corresponding words
        for current_word_index in range(0, max_word_index):
            words = []
            """
            for line in self._set_lines:
                if line is False or line is None:
                    words.append(False)
                else:
                    if current_word_index < int(max(line.data["calc_word_idx"])):
                        current_word = get_word_at_calc_wordindex(line, current_word_index)
                        words.append(current_word)
                    else:
                        words.append(False)
            """
            words = words_mock[current_word_index]
            ndist_voter.set_texts(words)
            wordindex_result = ndist_voter.compare_texts(
                take_longest_on_empty_lines=self._config.NDIST_VOTE_LONGEST_IF_EMPTY_STRINGS,
                vote_without_spaces=self._config.NDIST_VOTE_WITHOUT_SPACES)
            ndist_voter.reset()
            self._cpr.print(words[wordindex_result])
            self._cpr.print("--")

        # just assume words is filled here and a 3 word list
        return

    def get_longest_index(self):
        """Return the index (0..2) of the longest of the first three line texts."""
        def if_notdef_set_emptystring(value):
            # undefined slots (True/False/None) count as empty text
            if value is True or value is False or value is None:
                return ""
            return value

        lsval_1 = if_notdef_set_emptystring(self.get_line_content(self.get_line_set_value_line(0)))
        lsval_2 = if_notdef_set_emptystring(self.get_line_content(self.get_line_set_value_line(1)))
        lsval_3 = if_notdef_set_emptystring(self.get_line_content(self.get_line_set_value_line(2)))
        len_pline_1 = len(lsval_1)
        len_pline_2 = len(lsval_2)
        len_pline_3 = len(lsval_3)
        max_index = np.argmax([len_pline_1, len_pline_2, len_pline_3])
        self._cpr.print(max_index)
        return max_index

    def calculate_msa_best(self, take_n_dist_best_index=False, take_longest_as_pivot=False):
        """MSA-vote the best text of the three lines and store it in the set.

        A pivot line may be preselected by n-distance or by longest text;
        sets with fewer than 3 defined lines fall back to a direct pick.
        """
        # do a preselection of best element, if the parameter is set to take
        # best n_dist_index as a pivot
        best_index = 1
        if take_longest_as_pivot is True:
            best_index = self.get_longest_index()
        elif take_n_dist_best_index is True:
            best_index = self.get_shortest_n_distance_index()

        indices = [0, 1, 2]
        indices.remove(best_index)
        index1 = indices[0]
        index2 = indices[1]
        self._cpr.print("msa selection taking best:", best_index, "others:(", index1, "and", index2, ")")

        try:
            line_1 = self.get_line_content(self._set_lines[index1])
            line_2 = self.get_line_content(self._set_lines[best_index])  # should be best
            line_3 = self.get_line_content(self._set_lines[index2])
            self._cpr.print("ocr_set:")
            self._cpr.print("text_A", line_1)
            self._cpr.print("text_B", line_2)
            self._cpr.print("text_C", line_3)
            lines = [line_1, line_2, line_3]
            line_1_ok = not Random.is_false_true_or_none(line_1)
            line_2_ok = not Random.is_false_true_or_none(line_2)
            line_3_ok = not Random.is_false_true_or_none(line_3)
            ok_lines = [line_1_ok, line_2_ok, line_3_ok]
            ok_indices = []
            for ok_index, ok in enumerate(ok_lines):
                if ok is True:
                    ok_indices.append(ok_index)
            ok_len = len(ok_indices)
            if ok_len == 1:
                # only one usable line: take it directly
                result = lines[ok_indices[0]]
            elif ok_len == 0:
                result = None
            elif ok_len == 2:
                # two usable lines: take the first one (no 3-way MSA possible)
                result = lines[ok_indices[0]]
            else:
                result = self._msa_handler.get_best_of_three(line_1, line_2, line_3)
            self._best_msa_text = result
        except Exception as e:
            self._cpr.printex("ocr_set.py Exception in MSA, just taking line prio exception:", e)
            tr = inspect.trace()
            self._cpr.printex("trace is:", tr)
            self._best_msa_text = self.get_line_content(self._set_lines[1])

    def obtain_best_index(self, use_n_dist_pivot, use_longest_pivot, default_best_index=1):
        """Select the pivot index for MSA voting and the two remaining indices.

        :return: (best_index, [other_index_a, other_index_b])
        """
        # do a preselection of best element, if the parameter is set to take
        # best n_dist_index as a pivot
        # fixed: default_best_index was previously ignored (hardcoded 1)
        best_index = default_best_index
        if use_n_dist_pivot is True:
            ldist_best_index = self.get_shortest_n_distance_index()  # this doesn't work in all cases atm
            best_index = ldist_best_index
        if use_longest_pivot is True:
            best_index = self.get_longest_index()

        indices = [0, 1, 2]
        indices.remove(best_index)
        other_indices = indices
        return best_index, other_indices

    def obtain_line_info(self, best_index, other_indices):
        """Fetch texts and validity flags for the pivot line and the other two.

        :return: (texts, lines, lines_ok, number_of_ok_lines)
        """
        line_1 = self._set_lines[other_indices[0]]
        line_2 = self._set_lines[best_index]  # should be best
        line_3 = self._set_lines[other_indices[1]]
        text_1 = self.get_line_content(line_1)
        text_2 = self.get_line_content(line_2)  # should be best
        text_3 = self.get_line_content(line_3)
        self._cpr.print("ocr_set:")
        self._cpr.print("text_A", text_1)
        self._cpr.print("text_B", text_2)
        self._cpr.print("text_C", text_3)
        line_1_ok = not Random.is_false_true_or_none(line_1)
        line_2_ok = not Random.is_false_true_or_none(line_2)
        line_3_ok = not Random.is_false_true_or_none(line_3)
        ok_lines = [line_1_ok, line_2_ok, line_3_ok]
        ok_indices = []
        for ok_index, ok in enumerate(ok_lines):
            if ok is True:
                ok_indices.append(ok_index)
        ok_len = len(ok_indices)
        texts_return = [text_1, text_2, text_3]
        lines_return = [line_1, line_2, line_3]
        lines_return_ok = [line_1_ok, line_2_ok, line_3_ok]
        return texts_return, lines_return, lines_return_ok, ok_len

    def calculate_msa_best_all(self, use_ndist_pivot, use_longest_pivot, use_charconfs,
                               use_wordwise, use_searchspaces, prefered_index=1):
        """Combined MSA voting entry point (line-wise or word-wise) with options."""
        # get the pivot index and the other indices
        best_index, other_indices = self.obtain_best_index(use_ndist_pivot, use_longest_pivot, prefered_index)
        self._cpr.print("msa selection taking best:", best_index,
                        "others:(", other_indices[0], "and", other_indices[1], ")")

        # fetch the lines to process and info which (and how many) lines are ok
        texts, lines, lines_ok, number_lines_ok = self.obtain_line_info(best_index, other_indices)

        # do the msa if there is at least one line ok (confidence vote can be
        # done with one line also :))
        if use_wordwise is True:
            if number_lines_ok != 0:
                result, self._text_seg = self._msa_handler.get_best_of_three_wordwise(
                    lines[0], lines[1], lines[2], use_charconfs, use_searchspaces)
            else:
                result = None
        else:
            if number_lines_ok != 0:
                text_1 = self.get_line_content(lines[0])
                text_2 = self.get_line_content(lines[1])  # should be best
                text_3 = self.get_line_content(lines[2])
                result = self._msa_handler.get_best_of_three(
                    text_1, text_2, text_3,
                    line_1=lines[0], line_2=lines[1], line_3=lines[2],
                    use_charconfs=use_charconfs, use_searchspaces=use_searchspaces)
            else:
                result = None

        self._best_msa_text = result

    def calculate_msa_best_charconf(self, take_n_dist_best_index=False, take_longest_as_pivot=True):
        """MSA voting with character confidences enabled."""
        # do a preselection of best element, if the parameter is set to take
        # best n_dist_index as a pivot
        best_index = 1
        if take_n_dist_best_index is True:
            ldist_best_index = self.get_shortest_n_distance_index()  # this doesn't work in all cases atm
            best_index = ldist_best_index
        if take_longest_as_pivot is True:
            best_index = self.get_longest_index()

        indices = [0, 1, 2]
        indices.remove(best_index)
        index1 = indices[0]
        index2 = indices[1]
        self._cpr.print("msa selection taking best:", best_index, "others:(", index1, "and", index2, ")")

        try:
            line_1 = self._set_lines[index1]
            line_2 = self._set_lines[best_index]
            line_3 = self._set_lines[index2]
            text_1 = self.get_line_content(line_1)
            text_2 = self.get_line_content(line_2)  # should be best
            text_3 = self.get_line_content(line_3)
            self._cpr.print("ocr_set:")
            self._cpr.print("text_A", text_1)
            self._cpr.print("text_B", text_2)
            self._cpr.print("text_C", text_3)
            line_1_ok = not Random.is_false_true_or_none(text_1)
            line_2_ok = not Random.is_false_true_or_none(text_2)
            line_3_ok = not Random.is_false_true_or_none(text_3)
            ok_lines = [line_1_ok, line_2_ok, line_3_ok]
            ok_indices = []
            for ok_index, ok in enumerate(ok_lines):
                if ok is True:
                    ok_indices.append(ok_index)
            ok_len = len(ok_indices)
            if ok_len == 0:
                result = None
            else:
                result = self._msa_handler.get_best_of_three(text_1, text_2, text_3, use_charconfs=True,
                                                             line_1=line_1, line_2=line_2, line_3=line_3)
            self._best_msa_text = result
        except Exception as e:
            self._cpr.printex("ocr_set.py Exception in MSA, just taking line prio exception:", e)
            tr = inspect.trace()
            self._cpr.printex("trace is:", tr)
            if take_n_dist_best_index is True:
                self._best_msa_text = self.get_line_content(self._set_lines[ldist_best_index])
            else:
                self._best_msa_text = self.get_line_content(self._set_lines[best_index])

    def get_shortest_n_distance_text(self):
        """Return the text of the voted shortest-distance line, or None if unvoted."""
        if self.shortest_distance_line_index >= 0:
            line = self.shortest_distance_line
            line_text = self.get_line_content(line)
            return line_text
        else:
            return None

    def set_shortest_n_distance_text(self, value):
        """Overwrite the voted line's text, or do nothing if no vote happened."""
        if self.shortest_distance_line_index >= 0:
            sd_line = self.shortest_distance_line
            sd_line_new_value = self.set_line_content(sd_line, value)
            self.set_shortest_n_distance_line(sd_line_new_value)
        else:
            return None

    def get_shortest_n_distance_line(self):
        if self.shortest_distance_line_index >= 0:
            line = self.shortest_distance_line
            return line
        else:
            return None

    def set_shortest_n_distance_line(self, value):
        self.shortest_distance_line = value

    def get_shortest_n_distance_index(self):
        if self.shortest_distance_line_index >= 0:
            return self.shortest_distance_line_index
        else:
            return None

    def print_shortest_n_distance_line(self):
        line = self.get_shortest_n_distance_text()
        if line is not None and line is not False:
            self._cpr.print(line)

    def print_msa_best_line(self):
        msa_text = self._best_msa_text
        if msa_text is not None and msa_text is not False:
            print(msa_text)
        else:
            self._cpr.print(str(msa_text))

    def get_line_content(self, line):
        """
        Helper method to get line content, because ocropus content has other
        access properties. Method behaves differently when the current set is
        a database set
        :param line: line element to check upon
        :return: string with line content, or False if line isn't defined.
        """
        # hint: the attribute checked is created by hocr_line_normalizer
        if line is False:
            return False

        if self._is_origin_database is False:
            # just the standard behaviour: prefer the normalized text
            if line.ocr_text_normalized is not None:
                return line.ocr_text_normalized
            else:
                return line.ocr_text
        else:
            return line.textstr

    def set_line_content(self, line, value):
        """
        Helper method to set line content, because ocropus content has other
        access properties.
        :param line: line element to set the value to
        :param value: value to set to 'ocr_text_normalized' property
        :return: line or False if line not defined
        """
        # hint: the attribute checked is created by hocr_line_normalizer
        if line is False:
            return False

        line.ocr_text_normalized = value
        return line

    def unspace_lines(self, list_index_to_unspace, unspaced_list_index):
        """Unspace the stored lines via the text unspacer and update state flags."""
        unspaced_lines = self._text_unspacer.unspace_texts(self._set_lines, list_index_to_unspace,
                                                           unspaced_list_index)
        self._unspaced = True
        self._refspaced = False
        self._set_lines = unspaced_lines

    def refspace_lines(self, list_index_to_adapt, list_index_reference):
        """Reference-space the stored lines via the text unspacer and update state flags."""
        refspaced_lines = self._text_unspacer.refspace_texts(self._set_lines, list_index_to_adapt,
                                                             list_index_reference)
        self._unspaced = False
        self._refspaced = True
        self._set_lines = refspaced_lines
class SearchSpaceProcessor(object): def __init__(self, y_size, x_size, wildcard_character, substitution_character): self._y_size = y_size self._x_size = x_size self._middle_index = Random.find_middle(self._x_size, True) self._pre_middle_index = self.get_middle_index() - 1 self._nex_middle_index = self.get_middle_index() + 1 self._wildcard_character = wildcard_character self._substitution_character = substitution_character self.similar_chars = [] self.similar_chars.append(['o', 'ö']) self.similar_chars.append(['<', 'o']) # untested is this really better? self.similar_chars.append(['O', 'Ö']) self.similar_chars.append(['0', 'O', '9']) self.similar_chars.append(['d', 'ö']) #self.similar_chars.append(['1', 'l']) self.similar_chars.append(['l', 'j', '1']) self.similar_chars.append(['I', 'l']) self.similar_chars.append(['u', 'ü']) self.similar_chars.append(['U', 'Ü', 'O']) self.similar_chars.append(['a', 'ä']) self.similar_chars.append(['A', 'Ä']) self.similar_chars.append([':', ';']) self.similar_chars.append(['-', '¬']) self.similar_chars.append(['"', "'"]) self.similar_chars.append(['C', "G", "c"]) # just for testing ... 
self.similar_chars.append(['.', ',']) self.similar_chars.append([',', ';']) self.similar_chars.append(['v', 'V']) self.similar_chars.append(['w', 'W']) self.similar_chars.append(['i', 'l', 't', '1', '.']) # 1 l i also possible self.similar_chars.append(['r', 'n']) self.similar_chars.append(['%', 'm']) self.similar_chars.append(['&', 'é']) self.similar_chars.append(['e', 'é']) config_handler = ConfigurationHandler(first_init=False) self._config = config_handler.get_config() self._cpr = ConditionalPrint(self._config.PRINT_SEARCH_SPACE_PROCESSOR, self._config.PRINT_EXCEPTION_LEVEL, self._config.PRINT_WARNING_LEVEL) def get_middle_index(self): return self._middle_index def get_simchars_for_char( self, char ): # todo similar chars for each char could be preprocessed once at start simchars_return_array = [] for simchars in self.similar_chars: if char in simchars: simchars_return_array.extend(simchars) if len(simchars_return_array) >= 1: return simchars_return_array return [char] def get_pre_middle_index(self): return self._pre_middle_index def get_nex_middle_index(self): return self._nex_middle_index def get_wildcard_char(self): return self._wildcard_character def get_substitution_char(self): return self._substitution_character def get_y_size(self): return self._y_size def validate_column_features(self, search_space, x_index, reference_char=None, count_up_similar_references=False): counter_whitespaces = 0 counter_wildcards = 0 counter_nones = 0 counter_characters = 0 counter_reference_char = 0 counter_same_characters = 0 counter_dict = {} counter_special_characters = 0 most_occuring_char = None otherchar = None otherchar_y_index = None simchars = None if reference_char is not None and count_up_similar_references is True: simchars = self.get_simchars_for_char(reference_char) if len(simchars) != 1: self._cpr.print("evaluate") # gather data for y_index in range(0, self.get_y_size()): row = search_space[y_index] column_item = row[x_index] if column_item == 
self.get_wildcard_char(): counter_wildcards += 1 elif column_item == ' ': counter_whitespaces += 1 elif column_item == None or column_item == False or column_item == True: counter_nones += 1 else: if reference_char is not None: if count_up_similar_references is False and column_item == reference_char: counter_reference_char += 1 if count_up_similar_references is True: matching = [s for s in simchars if column_item in s] boolmatch = len(matching) >= 1 if boolmatch is True: counter_reference_char += 1 counter_characters += 1 otherchar = column_item otherchar_y_index = y_index if column_item != None: if column_item != self._wildcard_character and \ column_item != " ": if not column_item in counter_dict.keys(): counter_dict.update({column_item: 1}) else: counter_dict[column_item] += 1 if Random.is_special_character(column_item): counter_special_characters += 1 # the highest amount of same characters in this column if len(counter_dict.items()) >= 1: most_occuring_char, counter_same_characters = max( counter_dict.items(), key=operator.itemgetter(1)) # extract features features = [] counter_whitespace_and_wildcards = counter_whitespaces + counter_wildcards if counter_nones == self.get_y_size(): features.append(ColumnFeatures.ONLY_NONE.value) if counter_wildcards == self.get_y_size( ) - 1 and counter_characters == 1: features.append((ColumnFeatures.ONE_CHAR_REST_WILDCARDS).value) # additional feature, the only char is a special character if Random.is_special_character(otherchar): features.append( ColumnFeatures.ONE_SPECIALCHAR_REST_WILDCARDS.value) if counter_whitespaces == self.get_y_size( ) - 1 and counter_characters == 1: features.append(ColumnFeatures.ONE_CHAR_REST_WHITESPACE.value) if counter_whitespace_and_wildcards == self.get_y_size( ) - 1 and counter_characters == 1: features.append( ColumnFeatures.ONE_CHAR_REST_WHITESPACE_OR_WILDCARDS.value) # additional feature, the only char is a special character if otherchar != self._wildcard_character and otherchar != " "\ 
and Random.is_special_character(otherchar): #print("feature extraction") #print(search_space[0]) #print(search_space[1]) #print(search_space[2]) #print("x-index",x_index) features.append( ColumnFeatures. ONE_SPECIALCHAR_REST_WHITESPACE_OR_WILDCARDS.value) if counter_reference_char == self.get_y_size() - 1 and ( counter_whitespaces == 1 or counter_wildcards == 1): features.append(ColumnFeatures.MOSTLY_REFERENCE_CHAR.value) if counter_whitespaces == self.get_y_size(): features.append(ColumnFeatures.ONLY_WHITESPACE.value) if counter_reference_char == self.get_y_size(): features.append(ColumnFeatures.ONLY_WILDCARD.value) if counter_whitespace_and_wildcards == self.get_y_size(): features.append(ColumnFeatures.ONLY_WHITESPACE_OR_WILDCARD.value) if counter_reference_char >= 1: features.append(ColumnFeatures.CONTAINS_REFERENCE_CHAR.value) if counter_same_characters == self.get_y_size(): if counter_special_characters == self.get_y_size(): features.append(ColumnFeatures.ONLY_SAME_SPECIAL_CHAR.value) if Random.is_special_character(most_occuring_char) \ and counter_same_characters == self.get_y_size()-1 \ and most_occuring_char != self._wildcard_character \ and counter_whitespace_and_wildcards == 1: features.append(ColumnFeatures.MOSTLY_SAME_SPECIAL_CHAR.value) return features, otherchar, otherchar_y_index def shift_from_mid(self, search_space, line_index, to_left, other_substition_char=None): if other_substition_char is not None: used_substitution_char = other_substition_char else: used_substitution_char = self.get_substitution_char() mid_val = search_space[line_index][self.get_middle_index()] possible_shifts = [ ' ', self.get_wildcard_char(), used_substitution_char, None, False, True, 0 ] shifted = False if to_left is True: if search_space[line_index][ self.get_pre_middle_index()] in possible_shifts: search_space[line_index][self.get_pre_middle_index()] = mid_val search_space[line_index][ self.get_middle_index()] = used_substitution_char shifted = True else: if 
search_space[line_index][ self.get_nex_middle_index()] in possible_shifts: search_space[line_index][self.get_nex_middle_index()] = mid_val search_space[line_index][ self.get_middle_index()] = used_substitution_char shifted = True return search_space, shifted def shift_from_to(self, search_space, y_index, x_from, x_to, other_substition_char=None): if other_substition_char is not None: used_substitution_char = other_substition_char else: used_substitution_char = self.get_substitution_char() possible_shifts = [ ' ', self.get_wildcard_char(), used_substitution_char, None, False, True, 0 ] swap_val = search_space[y_index][x_from] shifted = False if search_space[y_index][x_to] in possible_shifts: search_space[y_index][x_to] = swap_val search_space[y_index][x_from] = used_substitution_char shifted = True return search_space, shifted def set_space_to_value(self, search_space, y_index, x_index, used_subsitution_value=None): if used_subsitution_value is not None: used_substitution_char = used_subsitution_value else: used_substitution_char = self.get_substitution_char() search_space[y_index][x_index] = used_substitution_char shifted = True return search_space, shifted def process_search_space(self, search_space, search_space_confs, use_similar_chars): processed_space = search_space processed_space_confs = search_space_confs change_done = False # self.output_as_scrollbar(search_space) #todo build this in someday mid_column_feats, otherchar_mid, oc_mid_index = self.validate_column_features( search_space, self.get_middle_index()) if self._config.MSA_BEST_SEARCHSPACE_MITIGATE_SPACE_HOPS: if ColumnFeatures.ONLY_WHITESPACE_OR_WILDCARD.value in mid_column_feats: # some char 'hopped' over a whitespace, get the characters back together pre_column_feats, otherchar_pre, oc_pre_index = self.validate_column_features(search_space, \ self.get_pre_middle_index(), reference_char=None) nex_column_feats, otherchar_nex, oc_nex_index = self.validate_column_features(search_space, \ 
self.get_nex_middle_index(), reference_char=None) if ColumnFeatures.ONE_CHAR_REST_WHITESPACE_OR_WILDCARDS.value in pre_column_feats and \ ColumnFeatures.ONE_CHAR_REST_WHITESPACE_OR_WILDCARDS.value in nex_column_feats: if otherchar_nex == otherchar_pre and \ oc_pre_index != oc_nex_index: processed_space, shifted_longtrans = self.shift_from_to( search_space, oc_pre_index, 0, 2) if shifted_longtrans is True: processed_space_confs, shifted_confs_longtrangs = self.shift_from_to( search_space_confs, oc_pre_index, 0, 2, 0) change_done = True if change_done: search_space = processed_space search_space_confs = processed_space_confs if ColumnFeatures.ONE_CHAR_REST_WILDCARDS.value in mid_column_feats \ or ColumnFeatures.ONE_CHAR_REST_WHITESPACE.value in mid_column_feats \ or ColumnFeatures.ONE_CHAR_REST_WHITESPACE_OR_WILDCARDS.value in mid_column_feats: #if ColumnFeatures.ONE_CHAR_REST_WHITESPACE_OR_WILDCARDS.value in mid_column_feats: #if otherchar_mid == "l": # self._cpr.print("beep!") pre_column_feats, otherchar_pre, oc_pre_index = self.validate_column_features(search_space, \ self.get_pre_middle_index(), otherchar_mid, use_similar_chars) nex_column_feats, otherchar_nex, oc_nex_index = self.validate_column_features(search_space, \ self.get_nex_middle_index(), otherchar_mid, use_similar_chars) shifted = False left_right = None if ColumnFeatures.MOSTLY_REFERENCE_CHAR.value in pre_column_feats\ or (ColumnFeatures.CONTAINS_REFERENCE_CHAR.value in pre_column_feats and ColumnFeatures.ONE_CHAR_REST_WHITESPACE_OR_WILDCARDS.value in pre_column_feats): left_right = True processed_space, shifted = self.shift_from_mid( search_space, oc_mid_index, left_right) if ColumnFeatures.MOSTLY_REFERENCE_CHAR.value in nex_column_feats \ or (ColumnFeatures.CONTAINS_REFERENCE_CHAR.value in nex_column_feats and ColumnFeatures.ONE_CHAR_REST_WHITESPACE_OR_WILDCARDS.value in nex_column_feats): left_right = False processed_space, shifted = self.shift_from_mid( search_space, oc_mid_index, left_right) if 
shifted: if self._config.MSA_BEST_SEARCHSPACE_QUOTE_NORMALIZATION \ and (otherchar_mid == "'" or otherchar_mid == '"'): ## this part here merges '' to single ' and corrects the alignment x_middle_index = self.get_middle_index() if left_right is True: delete_index = x_middle_index + 1 shift_index = x_middle_index - 1 else: delete_index = x_middle_index - 1 shift_index = x_middle_index + 1 if otherchar_mid == "'": processed_space, shiftedD1 = self.set_space_to_value( search_space, oc_mid_index, shift_index, '"') processed_space, shiftedD2 = self.set_space_to_value( processed_space, oc_mid_index, delete_index) search_space_confs, shiftedD3 = self.set_space_to_value( search_space_confs, oc_mid_index, delete_index, used_subsitution_value=0) else: # just push confidences because it was confusion with ' and " should be prioritized search_space_confs, shiftedD3 = self.set_space_to_value( search_space_confs, oc_mid_index, shift_index, used_subsitution_value=1000) processed_space_confs, shifted_confs = self.shift_from_mid( search_space_confs, oc_mid_index, left_right, 0) change_done = True elif ColumnFeatures.ONLY_WHITESPACE.value in mid_column_feats or ColumnFeatures.MOSTLY_REFERENCE_CHAR.value in mid_column_feats: # this case checks for 'far-transitions' of similar chars and does them if possible pre_column_feats, otherchar_pre, oc_pre_index = self.validate_column_features(search_space, \ self.get_pre_middle_index(), otherchar_mid, use_similar_chars) nex_column_feats, otherchar_nex, oc_nex_index = self.validate_column_features(search_space, \ self.get_nex_middle_index(), otherchar_mid, use_similar_chars) reference_char = None reference_char_y_index = None check_index = None pre_is_one_char = False nex_is_one_char = False if ColumnFeatures.ONE_CHAR_REST_WILDCARDS.value in pre_column_feats: reference_char = otherchar_pre reference_char_y_index = oc_pre_index pre_is_one_char = True check_index = self.get_nex_middle_index() check_index_from = self.get_pre_middle_index() if 
ColumnFeatures.ONE_CHAR_REST_WILDCARDS.value in nex_column_feats: reference_char = otherchar_nex reference_char_y_index = oc_nex_index nex_is_one_char = True check_index = self.get_pre_middle_index() check_index_from = self.get_nex_middle_index() if (pre_is_one_char is True and nex_is_one_char is False) \ or (pre_is_one_char is False and nex_is_one_char is True): other_column_feats, otherchar_other, oc_other_index = self.validate_column_features(search_space, \ check_index, reference_char, use_similar_chars) #print("search_space", search_space) if ColumnFeatures.MOSTLY_REFERENCE_CHAR.value in other_column_feats: processed_space, shifted_longtrans = self.shift_from_to(search_space, reference_char_y_index, \ check_index_from, check_index) if shifted_longtrans is True: processed_space_confs, shifted_confs_longtrangs = self.shift_from_to(search_space_confs, \ reference_char_y_index, check_index_from, check_index, 0) change_done = True if self._config.MSA_BEST_SEARCHSPACE_DROP_SINGLE_CH_NEAR_SC: #print("processed space") #print(processed_space[0]) #print(processed_space[1]) #print(processed_space[2]) mid_column_feats2, otherchar_mid2, oc_mid_index2 = self.validate_column_features( processed_space, self.get_middle_index()) pre_column_feats2, otherchar_pre2, oc_pre_index2 = self.validate_column_features(processed_space, \ self.get_pre_middle_index(), reference_char=None) nex_column_feats2, otherchar_nex2, oc_nex_index2 = self.validate_column_features(processed_space, \ self.get_nex_middle_index(), reference_char=None) if ColumnFeatures.MOSTLY_SAME_SPECIAL_CHAR.value in mid_column_feats2: if ColumnFeatures.ONE_SPECIALCHAR_REST_WHITESPACE_OR_WILDCARDS.value in pre_column_feats2: mid_char_at_oc_index = search_space[oc_pre_index2][ self.get_middle_index()] if mid_char_at_oc_index == self.get_wildcard_char() or \ mid_char_at_oc_index == " ": processed_space2, shiftedD3 = self.shift_from_to( processed_space, oc_pre_index2, self.get_pre_middle_index(), self.get_middle_index(), 
self.get_wildcard_char()) processed_space_confs2, shiftedD3 = self.shift_from_to( processed_space_confs, oc_pre_index2, self.get_pre_middle_index(), self.get_middle_index(), 0) #processed_space2, shiftedD3= self.set_space_to_value(processed_space,oc_pre_index2,self.get_pre_middle_index(),self.get_wildcard_char()) #processed_space_confs2, shiftedD3= self.set_space_to_value(processed_space_confs,oc_pre_index2,self.get_pre_middle_index(),0) #processed_space2, shiftedD3 =self.shift_from_to(processed_space,oc_pre_index,self.get_pre_middle_index(), self.get_middle_index(),self._wildcard_character) #print("cool") #if shiftedD3: # print("ssp corrected") processed_space = processed_space2 processed_space_confs = processed_space_confs2 if ColumnFeatures.ONE_SPECIALCHAR_REST_WHITESPACE_OR_WILDCARDS.value in nex_column_feats2: mid_char_at_oc_index = search_space[oc_nex_index2][ self.get_middle_index()] if mid_char_at_oc_index == self.get_wildcard_char() or \ mid_char_at_oc_index == " ": processed_space2, shiftedD3 = self.shift_from_to( processed_space, oc_nex_index2, self.get_nex_middle_index(), self.get_middle_index(), self.get_wildcard_char()) processed_space_confs2, shiftedD3 = self.shift_from_to( processed_space_confs, oc_nex_index2, self.get_nex_middle_index(), self.get_middle_index(), 0) #processed_space2, shiftedD3= self.set_space_to_value(processed_space,oc_nex_index2,self.get_nex_middle_index(),self.get_wildcard_char()) #processed_space_confs2, shiftedD3= self.set_space_to_value(processed_space_confs,oc_nex_index2,self.get_nex_middle_index(), 0) #print("cool") #if shiftedD3: #print("ssp corrected") processed_space = processed_space2 processed_space_confs = processed_space_confs2 #elif ColumnFeatures.ONLY_SAME_SPECIAL_CHAR.value in mid_column_feats: #if ColumnFeatures.ONE_SPECIALCHAR_REST_WHITESPACE_OR_WILDCARDS.value in pre_column_feats2: #print("asd") #if ColumnFeatures.ONE_SPECIALCHAR_REST_WHITESPACE_OR_WILDCARDS.value in nex_column_feats2: #print("asd") 
#search_space_confs, shiftedD3 = self.set_space_to_value(processed_space, oc_nex_index, # delete_index, used_subsitution_value=self._wildcard_character) return processed_space, processed_space_confs, change_done def output_as_scrollbar(self, search_space, active=False): if active is False: return sys.stdout.write(f"Scrollingbar {search_space[1]} \r") sys.stdout.flush()
# continue #if int(split[0])<300: # continue #if not "_1956" in file.name: # continue # fetch additional information for current file (if toggled in info) additional_info = add_info_handler.fetch_additional_information_simple( file) # fetch basic data for current file ocromore_data = dh.fetch_ocromore_data(file, additional_info=additional_info) output_analyzer.set_current_data( ocromore_data) # prepare output analyzer cpr.print("Checking file:", ocromore_data['file_info'].path) # extract features from basic data ocromore_data = feature_extractor.extract_file_features(ocromore_data) # line segmentation ocromore_data = segment_classifier.classify_file_segments( ocromore_data) # segment parsing ocromore_data = segment_parser.parse_segments(ocromore_data) # output file synthesis segment_parser.write_result_to_output(True, ocromore_data) # todo # output analysis steps output_analyzer.log_segmentation_simple( ocromore_data) # log the recognized segmentation output_analyzer.log_parsed_output(
class OCRcomparison:
    """Storage class for multiple Ocr_Sets.

    Holds the per-line OCR result sets of one document and provides:
    n-distance keying, MSA-best combination, vocabulary correction,
    postcorrection, and export to plain text or hOCR files.
    """

    def __init__(self, predictor=None, vocabulary_checker=None, first_config_init=False):
        self.ocr_sets = []                 # one Ocr_Set per recognized text line
        self.line_height_information = []  # per-file line-height info used for linebreak generation
        config_handler = ConfigurationHandler(first_init=first_config_init)
        self.config = config_handler.get_config()
        if 'ExceptionInitializing' in self.config:
            # config could not be initialized properly -> disable all printing
            self.cpr = ConditionalPrint(False, False, False)
        else:
            self.cpr = ConditionalPrint(self.config.PRINT_MSA_HANDLER,
                                        self.config.PRINT_EXCEPTION_LEVEL,
                                        self.config.PRINT_WARNING_LEVEL)
        self.predictor = predictor
        self.vocabulary_checker = vocabulary_checker

    def load_predictor(self, predictor):
        """Attach a predictor after construction."""
        self.predictor = predictor

    def add_set(self, set_to_add):
        """Append one Ocr_Set to the comparison."""
        self.ocr_sets.append(set_to_add)

    def add_line_information(self, line_height_information):
        """Append line-height information for one file/page."""
        self.line_height_information.append(line_height_information)

    def set_dataframe_wrapper(self, dataframe_wrapper):
        self._dataframe_wrapper = dataframe_wrapper

    def set_vocabulary_checker(self, vocabulary_checker):
        self.vocabulary_checker = vocabulary_checker

    def sort_set(self):
        """
        Sort the ocr_sets by y_mean values
        :return:
        """
        self.ocr_sets = sorted(self.ocr_sets,
                               key=lambda my_set: my_set.y_mean,
                               reverse=False)

    def unspace_list(self, list_index_to_unspace, unspaced_list_index):
        """
        apply the unspacing algorithm to one of the lists, take another list as comparison
        (which is not spaced)

        :param list_index_to_unspace: index for the set to unspace
        :param unspaced_list_index: index for the non-spaced set
        :return:
        """
        # loop variable renamed from 'set' to avoid shadowing the builtin
        for ocr_set in self.ocr_sets:
            ocr_set.unspace_lines(list_index_to_unspace, unspaced_list_index)

    def refspace_list(self, list_index_to_adapt, list_index_reference):
        """Re-space each set's line at list_index_to_adapt against a reference line."""
        for ocr_set in self.ocr_sets:
            ocr_set.refspace_lines(list_index_to_adapt, list_index_reference)

    def print_sets(self, diff_only=False):
        """Print all sets (optionally only differing lines)."""
        for current_set in self.ocr_sets:
            current_set.print_me(diff_only)

    def do_n_distance_keying(self, wordwise_keying=False):
        """Run n-distance keying on every set, line-wise or word-wise."""
        if wordwise_keying is False:
            # the keying is done on line base - this is the standard mode without database
            for current_set in self.ocr_sets:
                current_set.calculate_n_distance_keying()
        else:
            # the keying is done wordwise - can be done with sets originated by database
            for current_set in self.ocr_sets:
                current_set.calculate_n_distance_keying_wordwise()

    def do_msa_best(self):
        """Calculate the MSA-best combination for every set (no pivot)."""
        for current_set in self.ocr_sets:
            current_set.calculate_msa_best()

    def do_msa_best_with_ndist_pivot(self):
        """Run n-distance keying first, then MSA-best with its result as pivot."""
        self.do_n_distance_keying()
        for current_set in self.ocr_sets:
            current_set.calculate_msa_best(True)

    def do_msa_best_with_ndist_pivot_charconf(self):
        """Like do_msa_best_with_ndist_pivot, but using character confidences."""
        self.do_n_distance_keying()
        for current_set in self.ocr_sets:
            current_set.calculate_msa_best_charconf(True)

    def do_msa_best_new(self, use_ndist_pivot, use_longest_pivot, use_charconfs,
                        use_wordwise, use_searchspaces, do_postcorrection):
        """Parameterized MSA-best entry point.

        :param use_ndist_pivot: run n-distance keying first and use it as pivot
        :param use_longest_pivot: use the longest line as pivot
        :param use_charconfs: use character confidences
        :param use_wordwise: combine word-wise
        :param use_searchspaces: use search-space correction
        :param do_postcorrection: run do_postcorrection(True) afterwards
        """
        if use_ndist_pivot is True:
            self.do_n_distance_keying()

        for current_set in self.ocr_sets:
            current_set.calculate_msa_best_all(use_ndist_pivot, use_longest_pivot,
                                               use_charconfs, use_wordwise,
                                               use_searchspaces)
        if do_postcorrection is True:
            self.do_postcorrection(True)
        print("done")

    def print_n_distance_keying_results(self):
        self.cpr.print("N_DISTANCE_KEYING_RESULTS ")
        for current_set in self.ocr_sets:
            current_set.print_shortest_n_distance_line()

    def print_msa_best_results(self):
        self.cpr.print("MSA_BEST_RESULTS ")
        for current_set in self.ocr_sets:
            current_set.print_msa_best_line()

    def add_linebreaks(self, previous_line, current_line, previous_line_index,
                       sd_line_index, line_heigth_info):
        """Generate "\\n" padding for the vertical gap between two recognized lines.

        Uses the y-distance between the previous line's bottom and the current
        line's top, divided by the typical line distance, to decide how many
        linebreaks were skipped.

        :return: string of newlines to insert, or None if nothing to insert
        """
        MODE = 'TAKE_CURRENT_LINE_DIST'

        if previous_line is None:
            return None

        # BUGFIX: was `MODE is 'TAKE_CURRENT_LINE_DIST'` -- identity comparison
        # against a string literal is implementation-dependent; use equality.
        if MODE == 'TAKE_CURRENT_LINE_DIST':
            MARGIN = 0  # tolerance margin

            current_lh_info = line_heigth_info[sd_line_index]
            (xp_start, yp_start, xp_stop, yp_stop) = previous_line.coordinates
            (xc_start, yc_start, xc_stop, yc_stop) = current_line.coordinates
            y_dist = yc_start - yp_stop
            if y_dist <= 0:
                return None
            line_distance = current_lh_info.get_line_distance()
            y_times = (y_dist + MARGIN) / line_distance
            y_times_absolute = TypeCasts.round_to_int(y_times)
            if y_times_absolute > 0:
                generated_text = Random.append_pad_values("", y_times_absolute, "\n")
                return generated_text
            else:
                return None

        # only reached if MODE is changed to an unhandled value
        self.cpr.print("Undefined case reached shouldn't happen")
        return None

    def save_n_distance_keying_results_to_file(self, filename, mode_add_linebreaks=False):
        """Write the n-distance keying result lines to a text file.

        :param mode_add_linebreaks: insert extra "\\n" for vertical gaps between lines
        """
        # `with` guarantees the file is closed even if a set raises
        with open(filename, 'w+') as file:
            previous_sd_line = None
            previous_sd_line_index = None
            for current_set in self.ocr_sets:
                sd_text = current_set.get_shortest_n_distance_text()

                # add comparison from previous to actual line break here
                if mode_add_linebreaks:
                    sd_line_index = current_set.get_shortest_n_distance_index()
                    sd_line = current_set.get_shortest_n_distance_line()
                    if sd_line is True or sd_line is False:
                        continue
                    additional_breaks = \
                        self.add_linebreaks(previous_sd_line, sd_line,
                                            previous_sd_line_index, sd_line_index,
                                            self.line_height_information)
                    if additional_breaks is not None:
                        file.write(additional_breaks)
                    previous_sd_line = sd_line
                    previous_sd_line_index = sd_line_index

                # do not print lines which are mostly recognized with no content at the moment
                if sd_text is not None and sd_text is not False:
                    file.write(sd_text + "\n")

    def save_dataset_to_file(self, filename, set_index, mode_add_linebreaks=False, other_set=""):
        """Write one dataset ('msa_best', 'ndist_keying' or a set index) to a text file."""
        out_dir = os.path.dirname(filename)  # renamed from 'dir' (builtin shadowing)
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        with open(filename, 'w+', encoding="utf-8") as file:
            previous_dataset_line = None
            previous_dataset_line_index = None
            for current_set in self.ocr_sets:
                if other_set == 'msa_best':
                    dataset_text = current_set.get_msa_best_text()
                elif other_set == 'ndist_keying':
                    dataset_text = current_set.get_shortest_n_distance_text()
                else:
                    dataset_text = current_set.get_line_set_value_text(set_index)

                # add comparison from previous to actual line break here
                if mode_add_linebreaks:
                    dataset_line = current_set.get_line_set_value_line(set_index)
                    if dataset_line is True or dataset_line is False:
                        continue
                    additional_breaks = \
                        self.add_linebreaks(previous_dataset_line, dataset_line,
                                            previous_dataset_line_index, set_index,
                                            self.line_height_information)
                    if additional_breaks is not None:
                        file.write(additional_breaks)
                    previous_dataset_line = dataset_line
                    # NOTE(review): stores the constant set_index, not a per-line
                    # index -- looks suspicious but kept as in the original; confirm.
                    previous_dataset_line_index = set_index

                # do not print lines which are mostly recognized with no content at the moment
                if dataset_text is not None and dataset_text is not False:
                    file.write(dataset_text + "\n")

    def save_dataset_to_hocr(self, filename, set_index, mode_add_linebreaks=False, other_set=""):
        """Write one dataset to an hOCR (XHTML) file including bounding boxes."""
        #TODO: Import const to config or/and rework imagepath generating
        filename = os.path.normpath(filename).replace("\\", "/")
        IMG_PATH = "/media/sf_ShareVB/many_years_firmprofiles/"
        IMG_FILETYPES = [".jpg"]
        imgdir = "None"
        if IMG_PATH != "":
            # derive the source image name/folder from the output filename
            imgname = "_".join(
                filename.split("/")[-1].replace("msa_best", "").split("_")[:-1])
            if imgname[-3:] == "msa":
                imgname = imgname[:-3]
            imgfolder = filename.split("/")[-2]
            imgpath = IMG_PATH + "**/" + imgfolder + "/**/" + imgname + "*"
            imgdirs = list(
                chain.from_iterable(
                    glob.iglob(imgpath + filetype, recursive=True)
                    for filetype in IMG_FILETYPES))
            if imgdirs is not None and len(imgdirs) > 0:
                imgdir = imgdirs[0]

        filename += ".hocr"
        out_dir = os.path.dirname(filename)
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        with open(filename, 'w+', encoding="utf-8") as file:
            wrote_header = False
            for lidx, current_set in enumerate(self.ocr_sets):
                if lidx == 0:
                    # take page coordinates from the first line that has them
                    # NOTE(review): file_cords stays unbound if no line has
                    # line_x1 data -- the header write below would then fail; confirm.
                    for last_set in self.ocr_sets[lidx]._set_lines:
                        if last_set.data["line_x1"] != []:
                            file_cords = last_set.data
                            break
                if other_set == 'msa_best':
                    dataset_text = current_set.get_msa_best_text()
                    dataset_bbox = None
                    name = ["msa", "combined"]
                    # accumulate the union bbox over all lines of the set
                    for lines in current_set._set_lines:
                        ldata = lines.data
                        if dataset_bbox is None and ldata["line_x0"]:
                            dataset_bbox = [
                                min(ldata["line_x0"]),
                                min(ldata["line_y0"]),
                                max(ldata["line_x1"]),
                                max(ldata["line_y1"])
                            ]
                        elif ldata["line_x0"]:
                            if min(ldata["line_x0"]) < dataset_bbox[0]:
                                dataset_bbox[0] = min(ldata["line_x0"])
                            if min(ldata["line_y0"]) < dataset_bbox[1]:
                                dataset_bbox[1] = min(ldata["line_y0"])
                            if max(ldata["line_x1"]) > dataset_bbox[2]:
                                dataset_bbox[2] = max(ldata["line_x1"])
                            if max(ldata["line_y1"]) > dataset_bbox[3]:
                                dataset_bbox[3] = max(ldata["line_y1"])
                else:
                    dataset_text = current_set.get_line_set_value_text(set_index)
                    dataset_bbox = None
                    ldata = current_set._set_lines[set_index].data
                    name = current_set._set_lines[set_index].name
                    if ldata["line_x0"]:
                        dataset_bbox = [
                            min(ldata["line_x0"]),
                            min(ldata["line_y0"]),
                            max(ldata["line_x1"]),
                            max(ldata["line_y1"])
                        ]

                # do not print lines which are mostly recognized with no content at the moment
                if dataset_text is not None and dataset_text is not False and dataset_bbox:
                    if not wrote_header:
                        wrote_header = True
                        hocr_header = f'''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
    <head>
        <title>OCR Results</title>
        <meta http-equiv="content-type" content="text/html; charset=utf-8" />
        <meta name='AKF-OCR' content='{name[0]}-{name[1]}' />
        <meta name='ocr-capabilities' content='ocr_line ocrx_word'/>
    </head>
    <body>
        <div class='ocr_page' title='image {imgdir}; bbox 0 0 {int(file_cords["line_x1"][0])} {int(file_cords["line_y1"][0])}'>\n'''
                        file.write(hocr_header)
                    dtext = self._write_line_infos(dataset_bbox, dataset_text,
                                                   set_index, other_set, lidx,
                                                   current_set)
                    file.write(dtext)
                    # NOTE(review): stops at the second-to-last set -- presumably the
                    # last set is a sentinel/footer placeholder; confirm with callers.
                    if lidx == len(self.ocr_sets) - 2:
                        file.write("\t\t</div>\n\t</body>\n</html>")
                        break

    def _write_line_infos(self, dataset_bbox, dataset_text, set_index, other_set,
                          lidx, current_set):
        """Build the hOCR ocr_line/ocrx_word markup for one line. Returns the markup string."""
        dtext = f''' <span class ='ocr_line' title='bbox {int(dataset_bbox[0])} {int(dataset_bbox[1])} {int(dataset_bbox[2])} {int(dataset_bbox[3])}' ><br/>\n'''
        if other_set == "msa_best":
            if current_set._text_seg is None:
                # no word segmentation available -> emit the whole line as one word
                dtext += f''' <span class ='ocrx_word' title='bbox {int(dataset_bbox[0])} {int(dataset_bbox[1])} {int(dataset_bbox[2])} {int(dataset_bbox[3])}' >{dataset_text}</span >\n'''
            else:
                for number, word in current_set._text_seg.items():
                    if number != -1.0:
                        # pick the first set line that really contains this word
                        # (text not only the wildcard char "¦" and has coordinates)
                        set_index = 2
                        if number in current_set._set_lines[1].word["UID"].keys() and \
                                set(current_set._set_lines[1].word["text"][number]) != set("¦") and \
                                current_set._set_lines[1].data["word_x0"]:
                            set_index = 1
                        elif number in current_set._set_lines[0].word["UID"].keys() and \
                                set(current_set._set_lines[0].word["text"][number]) != set("¦") and \
                                current_set._set_lines[0].data["word_x0"]:
                            set_index = 0
                        dataset_bbox = self._get_wbbox_new(
                            dataset_bbox, number,
                            current_set._set_lines[set_index].data)
                        dtext += f''' <span class ='ocrx_word' title='bbox {int(dataset_bbox[0])} {int(dataset_bbox[1])} {int(dataset_bbox[2])} {int(dataset_bbox[3])}' >{word}</span >\n'''
                set_index = 0
        else:
            for number, word in current_set._set_lines[set_index].word["text"].items():
                dataset_bbox = self._get_wbbox(
                    dataset_bbox, number,
                    current_set._set_lines[set_index].word["UID"],
                    current_set._set_lines[set_index].data)
                dtext += f''' <span class ='ocrx_word' title='bbox {int(dataset_bbox[0])} {int(dataset_bbox[1])} {int(dataset_bbox[2])} {int(dataset_bbox[3])}' >{word}</span >\n'''
        dtext += f''' </span>\n'''
        return dtext

    def _get_wbbox(self, bbox, number, nb_dict, data, avg=True):
        """Update bbox x-coordinates for word `number` using character offsets.

        Walks nb_dict (word-number -> text) accumulating character positions up
        to the middle of the requested word, then indexes word_x0/word_x1 with it.
        Mutates and returns bbox.
        """
        wbbox_pos = 0
        for nbkey in nb_dict:
            if nbkey != number:
                if wbbox_pos == 0:
                    wbbox_pos = -1
                wbbox_pos += len(nb_dict[nbkey])
            else:
                wbbox_pos += len(nb_dict[nbkey]) / 2
                break
        if wbbox_pos != 0:
            if number != 0.0:
                bbox[0] = data["word_x0"][int(wbbox_pos)]
                bbox[2] = data["word_x1"][int(wbbox_pos)]
        return bbox

    def _get_wbbox_new(self, bbox, number, data, avg=True):
        """Update bbox x-coordinates for word `number` from word_match positions.

        Wildcard entries (UID == -1) are removed before matching. Mutates and
        returns bbox; returns it unchanged when the word number is not found.
        """
        uid_arr = np.array(data["UID"])
        wc_arr = np.where(uid_arr == -1)
        nb_arr = np.array(data["word_match"])
        # wc_arr is the tuple returned by np.where; pass the index array itself
        nb_arr = np.delete(nb_arr, wc_arr[0])
        nb_pos = np.where(nb_arr == number)
        # BUGFIX: original fell through with an empty match (the early return was
        # commented out) and raised IndexError on the next line.
        if len(nb_pos[0]) == 0:
            return bbox
        wbbox_pos = nb_pos[0][int(len(nb_pos[0]) / 2)]
        if wbbox_pos != 0:
            bbox[0] = data["word_x0"][int(nb_pos[0][0])]
            bbox[2] = data["word_x1"][int(nb_pos[0][-1])]
        return bbox

    def export_text_lines(self):
        """
        Exports the lines of text of the result as list
        :return: list with lines
        """
        return_list = []
        for current_set in self.ocr_sets:
            sd_line = current_set.get_shortest_n_distance_line()
            # do not list lines which are mostly recognized with no content at the moment
            if sd_line is not None and sd_line is not False:
                return_list.append(sd_line)
        return return_list

    def do_vocabulary_correction(self):
        """Correct each set's MSA-best text word-by-word against the vocabulary."""
        store_last_entry = None  # remembers a dash-trailing word across lines
        for current_set in self.ocr_sets:
            msa_best_text = current_set.get_msa_best_text()
            msa_best_text_corrected = ""
            msa_best_ttokenized = msa_best_text.split()
            len_tokens = len(msa_best_ttokenized)
            for word_index, word in enumerate(msa_best_ttokenized):
                #if "Tee" in word:
                #    print("asd")
                if self.config.KEYING_RESULT_VC_IGNORE_SEPERATE_WRITING_CORRECTION:
                    if store_last_entry is not None:
                        # don't correct first follow up line word to seperation word
                        store_last_entry = None
                        msa_best_text_corrected += " " + word
                        continue
                    if len_tokens - 1 == word_index:
                        tdash = self.vocabulary_checker.word_trails_with_dash(word)
                        if tdash:
                            store_last_entry = word
                            msa_best_text_corrected += " " + word
                            continue

                word_wo_sc, ratio = self.vocabulary_checker.without_special_chars(word)
                # skip words that are mostly special chars or too short to correct
                if ratio == 0 or len(word_wo_sc) <= 2:
                    msa_best_text_corrected += " " + word
                    continue

                word_wb, bstart, btrail, changeb = \
                    self.vocabulary_checker.remove_and_give_borders(word)
                if changeb:
                    # correct only the core word, then re-attach the borders
                    word_correct_vc, suggestions, first_letter_high = \
                        self.vocabulary_checker.correct_text(word_wb)
                    if word_correct_vc is None:
                        word_correct = word
                    else:
                        word_correct = bstart + word_correct_vc + btrail
                else:
                    word_correct, suggestions, first_letter_high = \
                        self.vocabulary_checker.correct_text(word)

                if word_correct is None:
                    msa_best_text_corrected += " " + word
                else:
                    msa_best_text_corrected += " " + word_correct

            msa_best_text_corrected = msa_best_text_corrected.lstrip(" ")
            if self.config.KEYING_RESULT_VC_PRINTDIFF \
                    and msa_best_text_corrected != msa_best_text:
                print("vocab in :", msa_best_text)
                print("vocab out:", msa_best_text_corrected)
            current_set.set_msa_best_text(msa_best_text_corrected)

    def do_postcorrection(self, postcorrect_keying=False, postcorrect_ndist=False,
                          postcorrect_msa=False, postcorrect_other=False,
                          postcorrection_index=0):
        """
        Do postcorrection steps for a specified list of sets or for the resulting
        lines of n_distkeying

        :param postcorrect_keying: if this is true, the lines of n_distkeying are
            postcorrected, otherwise it's specified by pc_index
        :param postcorrection_index: specifies the list of sets which is
            postcorrected if pc_keying is false
        :return:
        """
        if postcorrect_keying is False:
            return

        for current_set in self.ocr_sets:
            if postcorrect_ndist:
                sd_line_text = current_set.get_shortest_n_distance_text()
                if sd_line_text is not None and sd_line_text is not True \
                        and sd_line_text is not False:
                    sd_line_text_corrected = TextCorrector.correct_line_text(sd_line_text)
                    current_set.set_shortest_n_distance_text(sd_line_text_corrected)
            if postcorrect_msa:
                msa_best_text = current_set.get_msa_best_text()
                if msa_best_text is not None and msa_best_text is not True \
                        and msa_best_text is not False:
                    msa_best_text_corrected = TextCorrector.correct_line_text(msa_best_text)
                    current_set.set_msa_best_text(msa_best_text_corrected)