class TableHandler(object):
    """Heuristically decides whether an OCR'd text line belongs to a table.

    The decision combines character-class ratios (digits, letters, special
    characters), per-word statistics and the horizontal gaps between the
    word bounding boxes of the line.
    """

    def __init__(self):
        config_handler = ConfigurationHandler(first_init=False)
        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_TABLE_HANDLER,
                                    self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL)

        # when True, every recognized table line is appended to 'checkfile_tables.txt'
        self.PRINT_TO_CHECKFILE = False

        # a line starting with these words can't be in a table
        self.filter_start_words = ["Fernruf:", "Vorstand:", "Fernschreiber:",
                                   "von", "Gründung:", "Ordnungsnr.", "Ordnungsnr",
                                   "Grundkapital:", "Umstellung"]

    def recognize_a_line(self, line):
        """Classify a single OCR line as table entry or not.

        :param line: line object providing 'textstr', 'word' (dicts keyed per
                     word with 'text'/'UID') and 'data' ('word_x0'/'word_x1');
                     None/True/False inputs are tolerated and rejected
        :return: True if the line looks like a table entry, False otherwise
        """
        # reject missing or malformed input early
        if line is None or line is False or line is True or line.textstr is None:
            return False

        whole_text = line.textstr
        self.cpr.print("recognizing line:", whole_text)

        # counters
        counter_special_chars = 0
        counter_alphanumerical_chars = 0
        counter_numbers = 0
        counter_chars = len(whole_text)
        counter_alphabetical = 0
        counter_words = 0
        counters_alphabetical_ratios = []
        counters_wordlengths = []
        counters_numbers = []
        character_index = 0

        # special conditions
        ultimo_is_first_word = False
        first_word_no_table_indicator = False
        starts_with_parenthesis = False
        ends_with_parenthesis = False

        last_xstop = 0
        x_box_sizes = []
        x_gaps = []

        for key_index, key in enumerate(line.word['text']):
            word = line.word['text'][key]
            uid_info = line.word['UID'][key]
            # box geometry is indexed per character, hence character_index
            word_xstart = line.data['word_x0'][character_index]
            word_xstop = line.data['word_x1'][character_index]
            word_box_size = word_xstop - word_xstart
            x_box_sizes.append(word_box_size)

            if key_index >= 1:
                x_gap = word_xstop - last_xstop
                x_gaps.append(x_gap)

            if word is None or word == "":
                continue

            if key_index == 0:
                if word in self.filter_start_words:
                    first_word_no_table_indicator = True
                if word.lower() == "ultimo":
                    ultimo_is_first_word = True
                if word[0] == "(":
                    starts_with_parenthesis = True

            if key_index == len(line.word['text']) - 1:
                if word[-1] == ")":
                    ends_with_parenthesis = True

            counter_alphanumerical_chars_word = 0
            counter_numbers_word = 0
            counter_words += 1

            for char in word:
                if Random.is_special_character(char):
                    counter_special_chars += 1
                elif Random.is_alphanumerical_character(char):
                    counter_alphanumerical_chars += 1
                    counter_alphanumerical_chars_word += 1
                if char.isdigit():
                    counter_numbers += 1
                    counter_numbers_word += 1

            counter_alphabetical_word = counter_alphanumerical_chars_word - counter_numbers_word
            ratio_alphabetical_word = np.round(counter_alphabetical_word / len(word), 2)
            counters_alphabetical_ratios.append(ratio_alphabetical_word)
            counters_wordlengths.append(len(word))
            counters_numbers.append(counter_numbers_word)
            character_index += len(uid_info)
            last_xstop = word_xstop

        # get number of spaces
        len_whole_unspace = len(whole_text.replace(" ", ""))
        counter_spaces = counter_chars - len_whole_unspace

        # alphabetical chars = alphanumerical chars minus digits
        counter_alphabetical = counter_alphanumerical_chars - counter_numbers

        if counter_chars == 0:
            self.cpr.printw("no chars shouldn't happen, no recognizion")
            return False

        special_chars_ratio = counter_special_chars / counter_chars
        alphanumerical_chars_ratio = counter_alphanumerical_chars / counter_chars
        alphabetical_ratio = counter_alphabetical / counter_chars
        spaces_ratio = counter_spaces / counter_chars
        numbers_ratio = counter_numbers / counter_chars

        maximum_x_gap = None
        mean_x_gap = None
        median_x_gap = None
        if len(x_gaps) >= 1:
            maximum_x_gap = max(x_gaps)
            mean_x_gap = np.mean(x_gaps)
            median_x_gap = np.median(x_gaps)

        many_numbers_in_first_word = False
        many_alphabetical_in_middle_words = False
        many_alphabetical_in_last_word = False

        # check some first, middle and last word conditions
        for counter_index, counter in enumerate(counters_wordlengths):
            if counter_index == 0:
                ctr_numbers = counters_numbers[counter_index]
                numbers_ratio_word = np.round(ctr_numbers / counter, 2)
                if numbers_ratio_word > 0.8:
                    many_numbers_in_first_word = True
            elif counter_index == len(counters_wordlengths) - 1:
                if counter >= 4:
                    alphabetical_ratio_word = counters_alphabetical_ratios[counter_index]
                    if alphabetical_ratio_word >= 0.75:
                        many_alphabetical_in_last_word = True
            else:
                if counter >= 4:
                    alphabetical_ratio_word = counters_alphabetical_ratios[counter_index]
                    if alphabetical_ratio_word >= 0.75:
                        many_alphabetical_in_middle_words = True

        self.cpr.print("alle cntr:", counter_chars)
        self.cpr.print("spec cntr:", counter_special_chars, "ratio", special_chars_ratio)
        self.cpr.print("alnr cntr:", counter_alphanumerical_chars, "ratio", alphanumerical_chars_ratio)
        self.cpr.print("albt cntr:", counter_alphabetical, "ratio", alphabetical_ratio)
        self.cpr.print("spce cntr:", counter_spaces, "ratio", spaces_ratio)
        self.cpr.print("nmbr cntr:", counter_numbers, "ratio", numbers_ratio)
        self.cpr.print("x_box_sizes", x_box_sizes)
        self.cpr.print("x_gaps", x_gaps)
        self.cpr.print("x_gap_max_size", maximum_x_gap)
        self.cpr.print("x_gaps_mean", mean_x_gap)
        self.cpr.print("x_gaps_median", median_x_gap)

        if ((alphabetical_ratio < 0.75 and
             numbers_ratio > 0.2 and
             counter_chars > 5 and
             counter_words >= 2) and not
            (starts_with_parenthesis and ends_with_parenthesis)) or ultimo_is_first_word:

            if first_word_no_table_indicator:
                return False
            # BUGFIX: mean_x_gap is None when the line has fewer than two words
            # (possible here via the ultimo_is_first_word shortcut) — the old
            # comparison 'None <= 115' raised a TypeError
            if mean_x_gap is None or mean_x_gap <= 115:
                return False
            if many_alphabetical_in_last_word:
                return False
            if many_alphabetical_in_middle_words and many_numbers_in_first_word:
                return False

            self.cpr.print("possible entry:", whole_text)
            if self.PRINT_TO_CHECKFILE:
                with open("checkfile_tables.txt", "a") as myfile:
                    myfile.write(whole_text + "||| max x_gap: " + str(maximum_x_gap)
                                 + "||| mean x_gap: " + str(mean_x_gap)
                                 + "||| median x_gap: " + str(median_x_gap) + "\n")
            return True

        return False
class EndobjectFactory(object):
    """
    Creates a result object with the following structure and provides export helpers:

        segment_tag_1: [          ---> this level is created by set_current_main_list
            {
                type: "Sitz"      ---> add this level entries with add_to_my_obj, object_number=0
                city: "Neustadt"
            },
            {
                type: "Sitz"
                city: "Neustadt"
            }
        ],
        segment_tag_2: [ { ... } ... ]
    """

    def __init__(self):
        self.my_object = {}            # complete result object, keyed by segment tag
        self.current_main_list = None  # short link on the currently selected segment list
        self.pp = pprint.PrettyPrinter(indent=5)
        config_handler = ConfigurationHandler(first_init=False)
        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_OUTPUT_ANALYSIS, self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL, leading_tag=self.__class__.__name__)
        if self.config.REMOVE_TAGS_IN_ORIG_DIFF:
            self.known_uc = KnownUncategories()

    def set_current_main_list(self, segment_tag):
        """Select (and create, if necessary) the list all subsequent entries go to."""
        if segment_tag not in self.my_object.keys():
            self.my_object[segment_tag] = []  # create the main list (all subsequent entries are stored here)
        self.current_main_list = self.my_object[segment_tag]  # create a short link on the main list

    def add_to_my_obj(self, key, value, object_number=0, only_filled=False):
        """Store key/value in object number 'object_number' of the current main list.

        :param only_filled: if True, empty values (None, "", [], {}) are dropped
        :return: True if the value was stored, False otherwise
        """
        if only_filled is True and (value is None or value == "" or value == [] or value == {}):
            return False

        # grow the main list until the requested object index exists
        len_list = len(self.current_main_list)
        if len_list < object_number + 1:
            for index in range(len_list, object_number + 1):
                self.current_main_list.append({})

        self.cpr.print("Adding value to List,- ObjectNr.:", object_number, "Key:", key, "Value:", value)
        # add or insert to the main_list
        self.current_main_list[object_number][key] = value
        return True

    def print_me_and_return(self):
        """Pretty-print the complete result object and return it."""
        print("my_object is:")
        self.pp.pprint(self.my_object)
        return self.my_object

    def print_current_main(self):
        """Pretty-print only the currently selected main list."""
        print("current_main:")
        self.pp.pprint(self.current_main_list)

    def export_as_json(self):
        """Return the complete result object as a JSON string."""
        my_obj_json = json.dumps(self.my_object, indent=5, ensure_ascii=False)
        return my_obj_json

    def export_as_json_at_key(self, key, remove_first_object=False):
        """Return the data at 'key' as JSON, or None if the key is unknown.

        :param remove_first_object: drop the first list entry (usually generic info)
        """
        if key not in self.my_object.keys():
            return None
        my_obj = self.my_object[key]
        if remove_first_object and len(my_obj) >= 1:
            my_obj = my_obj[1:]  # remove the first object which usually contains generic info
        my_obj_json = json.dumps(my_obj, indent=5, ensure_ascii=False)
        return my_obj_json

    @staticmethod
    def fetch_subentries_recursive_check(entry):
        """
        Fetches all subentries (values) from an entry and writes them to a list of texts.
        Calls itself recursively until all subentries are found.

        :param entry: entry (list or dict) to fetch the subentries from
        :return: list of subentry texts
        """
        final_texts = []
        for item in entry:
            if isinstance(entry, list):
                value = item
            else:
                # item is a key
                value = entry[item]

            if isinstance(value, str):
                final_texts.append(value)
            elif isinstance(value, int):
                final_texts.append(str(value))
            elif isinstance(value, object):
                # containers (lists/dicts) are descended into
                obj_size = len(value)
                if obj_size > 0:
                    recursive_texts = EndobjectFactory.fetch_subentries_recursive_check(value)
                    final_texts.extend(recursive_texts)
        return final_texts

    @staticmethod
    def fetch_keys_recusive_check(entry, final_keys, create_multiple=True):
        """
        Fetches all keys in an object and its sub-objects, recursively.

        :param entry: object to fetch the sub-keys from
        :param final_keys: list of final keys (initial state)
        :param create_multiple: if the same key occurs multiple times it still gets added
        :return: final_keys with added keys from object
        """
        if isinstance(entry, list):
            for item in entry:
                final_keys = EndobjectFactory.fetch_keys_recusive_check(item, final_keys, create_multiple)
            return final_keys
        elif not isinstance(entry, dict):
            # just return if there are no keys (cause no dictionary)
            return final_keys

        for key in entry:
            value = entry[key]
            if create_multiple or key not in final_keys:
                if isinstance(key, int):
                    continue
                final_keys.append(key)
            final_keys = EndobjectFactory.fetch_keys_recusive_check(value, final_keys)
        return final_keys

    def diff_seg_to_orig_at_key(self, key):
        """Subtract all parsed sub-entry texts at 'key' from the stored original text.

        :return: (rest_text, original_text) or None if the key is unknown
        """
        if key not in self.my_object.keys():
            return None

        my_data = self.my_object[key]

        # check if the orig-post property can exist, warn if not
        if not self.config.ADD_INFO_ENTRY_TO_OUTPUT:
            self.cpr.printw("trying to fetch original data, original data is not added to results")
            self.cpr.printw("toggle ADD_INFO_ENTRY_TO_OUTPUT in config to True")
        if len(my_data) <= 0:
            self.cpr.printw("no data to do returning")
            return

        # copy orig string
        original_text = my_data[0]['origpost']
        rest_text = original_text

        # fetch parsed entries for diff
        all_final_entries = []
        for index in range(1, len(my_data)):
            entry = my_data[index]
            # BUGFIX: the original called 'fetch_subentries_recursive', a name that
            # only existed inside a docstring — this raised NameError at runtime
            final_entries = EndobjectFactory.fetch_subentries_recursive_check(entry)
            all_final_entries.extend(final_entries)

        # subtract longest entries first to avoid partial-overlap artifacts
        all_final_entries.sort(key=len)
        all_final_entries.reverse()

        for text in all_final_entries:
            rest_text = rest_text.replace(text, "")

        rest_text = rest_text.strip()
        return rest_text, original_text

    def diff_parsed_to_orig_at_key(self, key):
        """Subtract parsed values (and optionally keys) at 'key' from the original text.

        :return: (rest_text, original_text) or None if the key is unknown
        """
        if key not in self.my_object.keys():
            return None

        my_data = self.my_object[key]

        # check if the orig-post property can exist, warn if not
        if not self.config.ADD_INFO_ENTRY_TO_OUTPUT:
            self.cpr.printw("trying to fetch original data, original data is not added to results")
            self.cpr.printw("toggle ADD_INFO_ENTRY_TO_OUTPUT in config to True")
        if len(my_data) <= 0:
            self.cpr.printw("no data to do returning")
            return

        # copy orig string
        original_text = my_data[0]['origpost']
        rest_text = original_text

        # fetch parsed entries for diff
        pool_entries = []
        for index in range(1, len(my_data)):
            entry = my_data[index]
            final_entries = EndobjectFactory.fetch_subentries_recursive_check(entry)
            pool_entries.extend(final_entries)

        if self.config.REMOVE_SPACES_IN_ORIGIN_DIFF is True:
            # removes all spaces from rest and comparison values because spaces are often
            # a problem in subtracting the rests
            rest_text = rest_text.replace(" ", "")
            for index in range(0, len(pool_entries)):
                pool_entries[index] = pool_entries[index].replace(" ", "")

        # tag value entries with '1' for the final subtraction
        all_final_entries = [(pentry, 1) for pentry in pool_entries]

        # if keys shall be subtracted, add them tagged with '2'
        if self.config.REMOVE_TAGS_IN_ORIG_DIFF:
            pool_keys = []
            # gets multiple of the same key for later 1 by 1 subtraction
            for index in range(1, len(my_data)):
                pool_keys = EndobjectFactory.fetch_keys_recusive_check(my_data[index], pool_keys,
                                                                       create_multiple=True)
            # also remove spaces in keys
            if self.config.REMOVE_SPACES_IN_ORIGIN_DIFF is True:
                for index in range(0, len(pool_keys)):
                    pool_keys[index] = pool_keys[index].replace(" ", "")

            final_keys = [(pkey, 2) for pkey in pool_keys]
            all_final_entries.extend(final_keys)

        # subtract longest entries first
        all_final_entries.sort(key=lambda x: len(x[0]))
        all_final_entries.reverse()

        # subtract
        for entry in all_final_entries:
            text = entry[0]
            text_or_key = entry[1]
            if text_or_key == 2:
                # skip keys that are known uncategorized tags
                if text in self.known_uc.unkeys:
                    continue
            text_stripped = text.strip()  # remove spaces so texts better fit in
            rest_text = rest_text.replace(text_stripped, "", 1)

        rest_text = rest_text.strip()
        return rest_text, original_text
class SegmentParser(object):
    """
    Parse the classified segments segment by segment; each segment tag is
    dispatched to the parser function it is mapped to in the function map.
    """

    def __init__(self, output_analyzer, dictionary_handler, ocromore_data=None):
        # end-object factory accumulates the parsing results of all segments
        self.ef = EndobjectFactory()
        self.dictionary_handler = dictionary_handler
        # map which maps tags to functions for parsing -> change constructor for other project
        fmap = FunctionMapAKF(self.ef, output_analyzer, dictionary_handler)
        config_handler = ConfigurationHandler(first_init=False)
        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_SEGMENT_PARSER, self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL, leading_tag=self.__class__.__name__)
        self.function_map = fmap.get_function_map()
        self.result_root = self.config.OUTPUT_ROOT_PATH + "/results/"

    def clear_result(self, output_analyzer, dictionary_handler, ocromore_data=None):
        """Reset all accumulated results for parsing a new file."""
        # create a new end object factory, new content
        self.ef = EndobjectFactory()
        # map to the new ef object which has been recreated
        fmap = FunctionMapAKF(self.ef, output_analyzer, dictionary_handler)
        self.function_map = fmap.get_function_map()

    def parse_segments(self, ocromore_data):
        """Parse all recognized segments of one file and attach the results.

        :param ocromore_data: data dict containing 'segmentation' and 'lines'
        :return: the same ocromore_data with 'results' (the EndobjectFactory) added
        """
        self.ocromore_data = ocromore_data
        segmentation = ocromore_data['segmentation']
        segmentation_classes = segmentation.my_classes

        # add all text from original file if activated (i.e. for debugging purposes)
        if self.config.ADD_FULLTEXT_ENTRY:
            all_texts = self.get_all_text(ocromore_data)
            self.ef.set_current_main_list("overall_info")
            self.ef.add_to_my_obj("fulltexts", all_texts)

        # add additional info to result
        if self.config.ADDITIONAL_INFORMATION and self.config.ADD_ADDITIONAL_INFO:
            if not self.config.ADD_FULLTEXT_ENTRY:
                self.ef.set_current_main_list("Information")
            self.ef.add_to_my_obj("additionals", ocromore_data["additional_info"])

        # add a duplicate of the original text from which in the below analysis case the files get subtracted
        if self.config.LOG_SEGMENTED_TO_ORIG_DIFF_PER_FILE:
            if self.config.ADD_FULLTEXT_ENTRY:
                ocromore_data['analysis_to_orig'] = {}
                original_rest, complete_text = self.get_all_text(
                    ocromore_data, join_separated_lines=True)
                ocromore_data['analysis_to_orig'][
                    'original_rest'] = original_rest
                ocromore_data['analysis_to_orig'][
                    'original_length_initial'] = len(complete_text)
            else:
                self.cpr.printw(
                    "activated segment to orig diff, but no saving of origin activate ADD_FULLTEXT_ENTRY "
                    "in config for this functionality")

        # Init toolbox: try to locate a matching image snippet for this file
        snippet = None
        if self.config.USE_SNIPPET:
            # resolve relative image paths against the file's own directory
            if "./" in self.config.IMGPATH:
                ipath = os.path.dirname(
                    ocromore_data["file_info"].path) + self.config.IMGPATH[1:]
            else:
                ipath = os.path.normcase(self.config.IMGPATH)
            results = glob.glob(
                ipath + ocromore_data["file_info"].name.split(".")[0].replace(
                    "_msa_best", "") + "*", recursive=True)
            if results:
                snippet = Snippet()
                snippet.imread(results[0])
            else:
                # no image found -> disable the toolbox for this run
                self.config.USE_TOOLBBOX = False

        # shared state handed to every segmentation class during parsing
        info_handler = {}

        # start parsing for each successfully segmented area
        for segmentation_class in segmentation_classes:
            # if the class segment was recognized ...
            if segmentation_class.is_start_segmented():
                # get the unique identifier for this class
                segment_tag = segmentation_class.get_segment_tag()
                segmentation_class.snippet = snippet
                segmentation_class.info_handler = info_handler
                self.trigger_mapped_function(segment_tag, segmentation_class,
                                             ocromore_data)

        # add and return result
        ocromore_data['results'] = self.ef
        return ocromore_data

    def trigger_mapped_function(self, segment_tag, segmentation_class, ocromore_data):
        """Dispatch one segment to its mapped parser function, if any exists."""
        if segment_tag not in self.function_map.keys():
            return

        #todo: fileinfo -> parsing
        real_start_tag, content_texts, content_lines, feature_lines = self.prepare_parsing_info(
            segmentation_class, ocromore_data)

        # switch the object to save context
        segment_tag = segmentation_class.segment_tag
        self.ef.set_current_main_list(segment_tag)

        # call the mapped function, which fills the end-factory
        self.function_map[segment_tag].__call__(real_start_tag, content_texts,
                                                content_lines, feature_lines,
                                                segmentation_class)

    def prepare_parsing_info(self, segmentation_class, ocromore_data):
        """Collect the text/line/feature slices belonging to one segment."""
        lines = ocromore_data['lines']
        line_features = ocromore_data['line_features']
        real_start_tag, content_texts, content_lines, feature_lines = \
            DataHelper.get_content(lines, line_features, segmentation_class)
        return real_start_tag, content_texts, content_lines, feature_lines

    def get_all_text(self, ocromore_data, join_separated_lines=False):
        """
        Gets all text lines in ocromore_data as array and as joined string
        :param ocromore_data: data from which the text is extracted
        :param join_separated_lines: if True, hyphen-separated lines are re-joined first
        :return: texts list, complete text
        """
        all_texts = []
        complete_text = ""
        for line in ocromore_data['lines']:
            text = line['text']
            all_texts.append(text)
            complete_text += text

        # rebuild the complete text from the joined line list
        if join_separated_lines:
            complete_text = ""
            all_texts = dh.join_separated_lines(all_texts)
            for text in all_texts:
                complete_text += text

        return all_texts, complete_text

    def write_result_to_output(self, as_json, ocromore_data):
        """Write the accumulated result object below the result root (JSON only)."""
        if as_json is True:
            my_json = self.ef.export_as_json()
            my_json_lines = my_json.split("\n")
            dh.write_array_to_root("result_json/", my_json_lines, ocromore_data, self.result_root)
class FeatureExtractor():
    """Extracts per-line statistical features (character-class counts/ratios and
    word-box geometry) used by later segment classification steps."""

    def __init__(self):
        config_handler = ConfigurationHandler(first_init=False)
        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_FEATURE_EXTRACTOR,
                                    self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL,
                                    leading_tag=self.__class__.__name__)
        # a line starting with one of these words can't be in a table
        self.filter_start_words = ["Fernruf:", "Vorstand:", "Fernschreiber:",
                                   "von", "Gründung:", "Ordnungsnr.", "Ordnungsnr",
                                   "Grundkapital:", "Umstellung"]

    def extract_file_features(self, ocromore_data):
        """Extract features for every line and store them under 'line_features'.

        :param ocromore_data: data dict with a 'lines' list
        :return: the same ocromore_data with 'line_features' added
        """
        all_line_features = []
        for line in ocromore_data['lines']:
            current_line_features = self.extract_line_features(line)
            all_line_features.append(current_line_features)
        ocromore_data['line_features'] = all_line_features
        return ocromore_data

    def extract_line_features(self, line):
        """Compute a LineFeatures object for a single line.

        :param line: dict with 'text', 'words' (each word dict carries
                     'word_index', 'text', 'hocr_coordinates') and 'line_index'
        :return: LineFeatures instance, or False for an empty line
        """
        whole_text = line['text']
        self.cpr.print("recognizing text:", whole_text)

        # counters
        counter_special_chars = 0
        counter_alphanumerical_chars = 0
        counter_numbers = 0
        counter_chars = len(whole_text)
        counter_alphabetical = 0
        counter_words = 0
        counters_alphabetical_ratios = []
        counters_wordlengths = []
        counters_numbers = []
        character_index = 0

        # special conditions
        ultimo_is_first_word = False
        first_word_no_table_indicator = False
        starts_with_parenthesis = False
        ends_with_parenthesis = False

        last_xstop = 0
        x_box_sizes = []
        x_gaps = []

        for word_obj in line['words']:
            word_index = word_obj['word_index']
            word_text = word_obj['text']
            hocr_coordinates = word_obj['hocr_coordinates']

            word_xstart = hocr_coordinates[0]
            word_xstop = hocr_coordinates[2]
            word_box_size = word_xstop - word_xstart
            x_box_sizes.append(word_box_size)

            if word_index >= 1:
                x_gap = word_xstop - last_xstop
                x_gaps.append(x_gap)

            if word_text is None or word_text == "":
                continue

            if word_index == 0:
                if word_text in self.filter_start_words:
                    first_word_no_table_indicator = True
                if word_text.lower() == "ultimo":
                    ultimo_is_first_word = True
                if word_text[0] == "(":
                    starts_with_parenthesis = True

            # BUGFIX: the last word must be detected by comparing against the
            # number of words, not the number of characters in the whole line
            # (the old check 'len(whole_text) - 1' practically never matched)
            if word_index == len(line['words']) - 1:
                if word_text[-1] == ")":
                    ends_with_parenthesis = True

            counter_alphanumerical_chars_word = 0
            counter_numbers_word = 0
            counter_words += 1

            for char in word_text:
                if Random.is_special_character(char):
                    counter_special_chars += 1
                elif Random.is_alphanumerical_character(char):
                    counter_alphanumerical_chars += 1
                    counter_alphanumerical_chars_word += 1
                if char.isdigit():
                    counter_numbers += 1
                    counter_numbers_word += 1

            counter_alphabetical_word = counter_alphanumerical_chars_word - counter_numbers_word
            ratio_alphabetical_word = np.round(counter_alphabetical_word / len(word_text), 2)
            counters_alphabetical_ratios.append(ratio_alphabetical_word)
            counters_wordlengths.append(len(word_text))
            counters_numbers.append(counter_numbers_word)
            character_index += len(word_text)
            last_xstop = word_xstop

        # get number of spaces
        len_whole_unspace = len(whole_text.replace(" ", ""))
        counter_spaces = counter_chars - len_whole_unspace

        # alphabetical chars = alphanumerical chars minus digits
        counter_alphabetical = counter_alphanumerical_chars - counter_numbers

        if counter_chars == 0:
            self.cpr.printw("no chars in line:", str(line['line_index']), "no features here")
            return False

        special_chars_ratio = counter_special_chars / counter_chars
        alphanumerical_chars_ratio = counter_alphanumerical_chars / counter_chars
        alphabetical_ratio = counter_alphabetical / counter_chars
        spaces_ratio = counter_spaces / counter_chars
        numbers_ratio = counter_numbers / counter_chars

        maximum_x_gap = None
        mean_x_gap = None
        median_x_gap = None
        if len(x_gaps) >= 1:
            maximum_x_gap = max(x_gaps)
            mean_x_gap = np.mean(x_gaps)
            median_x_gap = np.median(x_gaps)

        many_numbers_in_first_word = False
        many_alphabetical_in_middle_words = False
        many_alphabetical_in_last_word = False

        # check some first, middle and last word conditions
        for counter_index, counter in enumerate(counters_wordlengths):
            if counter_index == 0:
                ctr_numbers = counters_numbers[counter_index]
                numbers_ratio_word = np.round(ctr_numbers / counter, 2)
                if numbers_ratio_word > 0.8:
                    many_numbers_in_first_word = True
            elif counter_index == len(counters_wordlengths) - 1:
                if counter >= 4:
                    alphabetical_ratio_word = counters_alphabetical_ratios[counter_index]
                    if alphabetical_ratio_word >= 0.75:
                        many_alphabetical_in_last_word = True
            else:
                if counter >= 4:
                    alphabetical_ratio_word = counters_alphabetical_ratios[counter_index]
                    if alphabetical_ratio_word >= 0.75:
                        many_alphabetical_in_middle_words = True

        # bundle everything into the result object
        final_line_features = LineFeatures(cpr=self.cpr)
        final_line_features.counter_special_chars = counter_special_chars
        final_line_features.counter_chars = counter_chars
        final_line_features.counter_spaces = counter_spaces
        final_line_features.counter_numbers = counter_numbers
        final_line_features.counter_alphabetical = counter_alphabetical
        final_line_features.counter_alphanumerical_chars = counter_alphanumerical_chars
        final_line_features.counter_words = counter_words
        final_line_features.counters_numbers = counters_numbers
        final_line_features.counters_wordlengths = counters_wordlengths
        final_line_features.counters_alphabetical_ratios = counters_alphabetical_ratios
        final_line_features.numbers_ratio = numbers_ratio
        final_line_features.alphabetical_ratio = alphabetical_ratio
        final_line_features.alphanumerical_chars_ratio = alphanumerical_chars_ratio
        final_line_features.special_chars_ratio = special_chars_ratio
        final_line_features.spaces_ratio = spaces_ratio
        final_line_features.many_alphabetical_in_last_word = many_alphabetical_in_last_word
        final_line_features.many_alphabetical_in_middle_words = many_alphabetical_in_middle_words
        final_line_features.many_numbers_in_first_word = many_numbers_in_first_word
        final_line_features.x_box_sizes = x_box_sizes
        final_line_features.x_gaps = x_gaps
        final_line_features.maximum_x_gap = maximum_x_gap
        final_line_features.mean_x_gap = mean_x_gap
        final_line_features.median_x_gap = median_x_gap
        return final_line_features
class VocabularyChecker():
    """Dictionary-backed vocabulary / spell checker for OCR post-correction.

    Loads a word list, optionally strips special border characters, and uses
    a SymSpell instance (optional submodule) for edit-distance suggestions.
    """

    def __init__(self):
        config_handler = ConfigurationHandler(first_init=False)
        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_VOCABULARY_CHECKER,
                                    self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL)
        self.dict_lines = []
        self.max_edist = None
        self.suggenstion_verbosity = None
        # SymSpell.Verbosity.ALL, cached by initialize_spellchecker (SymSpell is
        # only importable inside that method's scope)
        self.suggestion_verbosity_all = None
        # characters treated as noise when they occur at word borders
        self.special_chars_borders = "!¦1234567890,)(;.:\"-"
        self.pattern_start = re.compile(r"^[" + self.special_chars_borders + "]+")
        self.pattern_trail = re.compile(r"[" + self.special_chars_borders + "]+$")
        self.pattern_trail_dash = re.compile(r"[-]$")
        self.pattern_only_normal_chars = re.compile(r"[a-zA-Z]+")

    def _load_doc(self, filename):
        """Read the dictionary file and return all lines."""
        with open(filename, 'r') as file:
            texts = file.readlines()
        return texts

    def without_special_chars(self, input_text):
        """Return (text without special chars, ratio of normal chars to total).

        If the text consists only of special characters, the original text and
        ratio 0 are returned.
        """
        len_text = len(input_text)
        input_text_wo_sc = self.pattern_only_normal_chars.findall(input_text)
        if len(input_text_wo_sc) >= 1:
            len_text_wo_sc = len(input_text_wo_sc[0])
            ratio = len_text_wo_sc / len_text
            return input_text_wo_sc[0], ratio
        else:
            # there are only special characters
            return input_text, 0

    def get_accumulated_confidence_rate(self, word, word_acc_confs, wildcard_char):
        """Accumulate per-character confidences, ignoring special border chars.

        :param word: word whose characters the confidences belong to
        :param word_acc_confs: per-character confidence values
        :param wildcard_char: unused here, kept for interface compatibility
        :return: (acc_conf, mean_conf, changed, leading_chars, trailing_chars, reduced_word)
        """
        word_reduced, word_starting_borders, word_trailing_borders, change = \
            self.remove_and_give_borders(word)
        wsplit = list(word)
        if change is False:
            # no border chars: average over the whole word
            acc_conf = 0
            for i in range(0, len(wsplit)):
                acc_conf += word_acc_confs[i]
            return acc_conf, acc_conf / len(wsplit), False, \
                word_starting_borders, word_trailing_borders, word
        else:
            # average only over the non-border core of the word
            acc_conf = 0
            len_start = len(word_starting_borders)
            len_trail = len(word_trailing_borders)
            for i in range(len_start, len(wsplit) - len_trail):
                acc_conf += word_acc_confs[i]
            return acc_conf, acc_conf / (len(wsplit) - len_start - len_trail), True, \
                word_starting_borders, word_trailing_borders, word_reduced

    def remove_and_give_borders(self, input_text):
        """Strip special border characters from both ends of a word.

        :return: (reduced_text, leading_border_chars, trailing_border_chars, changed)
        """
        start_sc_text = ""
        stop_sc_text = ""
        if len(input_text) > 2:
            start_special_chars = self.pattern_start.findall(input_text)
            stop_special_chars = self.pattern_trail.findall(input_text)
            if len(start_special_chars) >= 1:
                start_sc_text = start_special_chars[0]
            if len(stop_special_chars) >= 1:
                stop_sc_text = stop_special_chars[0]
            # BUGFIX: findall returns lists and never None, so the original
            # '== None' check could never trigger — words without any border
            # chars were incorrectly reported as changed
            if len(start_special_chars) == 0 and len(stop_special_chars) == 0:
                return input_text, start_sc_text, stop_sc_text, False
            input_text_stripped = input_text.strip(self.special_chars_borders)
            return input_text_stripped, start_sc_text, stop_sc_text, True
        else:
            return input_text, start_sc_text, stop_sc_text, False

    def word_trails_with_dash(self, input_text):
        """Return True if the word ends with a dash (line-break hyphenation)."""
        return len(self.pattern_trail_dash.findall(input_text)) >= 1

    def initialize_lines(self, dict_file_path, remove_special_border_chars):
        """Add the lines from a dictionary file to dict_lines."""
        doc = self._load_doc(dict_file_path)
        lines_doc = self._get_lines(doc, remove_special_border_chars)
        self.dict_lines.extend(lines_doc)

    def _get_lines(self, doc, remove_special_border_chars):
        """Filter and normalize raw dictionary lines.

        Drops separator lines and words shorter than the configured minimum;
        optionally strips border chars and adds lowercased variants.
        """
        lines_doc = []
        for line in doc:
            if "--------------" in line:
                continue
            line = line.replace('\n', "")
            if remove_special_border_chars:
                line = line.strip(self.special_chars_borders)
            linelen = len(line)
            if linelen > 2:
                if linelen < self.config.KEYING_RESULT_VC_MIN_VOCAB_WORD_LENGTH:
                    continue  # filter out lengths which are shorter than minimum
                if self.config.KEYING_RESULT_VC_DOWNCAST_ALL_CASES:
                    line_low = line.lower()
                    if line_low != line:
                        lines_doc.append(line_low)
                lines_doc.append(line)
        return lines_doc

    def initialize_spellchecker(self):
        """Create the SymSpell spellchecker from the loaded dictionary lines."""
        try:
            from pysymspell.symspell import SymSpell
            # BUGFIX: dict_lines is a list and never None; also warn on empty
            if not self.dict_lines:
                self.cpr.printw("can't initialize spellchecker, please first call initialize_lines")
                return

            # set parameters
            self.max_edist = self.config.KEYING_RESULT_VC_EDIT_DISTANCE_LEVEL
            self.suggenstion_verbosity = SymSpell.Verbosity.CLOSEST
            # cache ALL-verbosity for correct_text_at_certain_indices_only,
            # where SymSpell itself is not in scope
            self.suggestion_verbosity_all = SymSpell.Verbosity.ALL

            # initialize symspell as spellchecker and load dictionary
            sym_spell = SymSpell(self.max_edist)
            sym_spell.create_dictionary_by_list(self.dict_lines)
            self.spellchecker = sym_spell
        except Exception:
            print(
                "To use the vocabulary checker you must pull PySymSpell from GitHub in the directory (AWARE: MIT License)"
                "by activate and initalize the submodule (delete the comment symbol: #):\n"
                ".gitmodule at line: 1-3")

    def correct_text_at_certain_indices_only(self, input_text, possible_error_indices):
        """Correct a word, preferring suggestions that change the given indices.

        :param input_text: word to correct
        :param possible_error_indices: character positions suspected to be wrong
        :return: best suggestion string, or None if no suitable suggestion exists
        """
        replacement_char = "‖"
        # BUGFIX: the original referenced SymSpell.Verbosity.ALL here, but
        # SymSpell is only imported inside initialize_spellchecker → NameError;
        # the enum value is now cached on self during initialization
        return_term, suggestions, first_letter_high = self.correct_text(
            input_text, suggestion_verbosity=self.suggestion_verbosity_all)

        if input_text == return_term:
            return return_term

        input_text_array = list(input_text)

        suggestion_number_error_correction_count = []
        num_of_possible_suggestions = 0
        for suggestion in suggestions:
            input_text_array_c = input_text_array[:]  # copy input text array
            sug_array = list(suggestion.term)
            # mark all character pairs that match between input and suggestion
            for char_index_it, char_it in enumerate(input_text_array):
                for char_index_sug, char_sug in enumerate(sug_array):
                    if input_text_array_c[char_index_it] == sug_array[char_index_sug]:
                        input_text_array_c[char_index_it] = replacement_char
                        sug_array[char_index_sug] = replacement_char
                        # NOTE(review): 'continue' only skips to the next
                        # suggestion char — possibly 'break' was intended;
                        # kept as-is to preserve behavior
                        continue

            # count how many of the suspected error positions were matched away
            number_of_possible_errors_corrected = 0
            for index in possible_error_indices:
                char_to_check = input_text_array_c[index]
                char_previous = input_text_array[index]
                if char_to_check == char_previous:
                    number_of_possible_errors_corrected += 1

            if number_of_possible_errors_corrected >= 1:
                num_of_possible_suggestions += 1
            suggestion_number_error_correction_count.append(number_of_possible_errors_corrected)

        if len(suggestion_number_error_correction_count) <= 0:
            return None

        best_suggestion_index = np.argmax(suggestion_number_error_correction_count)
        best_suggestion_ecccount = suggestion_number_error_correction_count[best_suggestion_index]
        if best_suggestion_ecccount > 0:
            best_suggestion_value = suggestions[best_suggestion_index].term
            if first_letter_high:
                best_suggestion_value = best_suggestion_value[0].upper() + best_suggestion_value[1:]
            return best_suggestion_value
        else:
            return None

    def correct_text(self, input_text, suggestion_verbosity=None):
        """Look up spelling suggestions for a word.

        :param suggestion_verbosity: overrides the default verbosity if given
        :return: (best_term_or_None, suggestions, first_letter_was_uppercase)
        """
        first_letter_high = False
        if self.config.KEYING_RESULT_VC_DOWNCAST_ALL_CASES:
            first_letter = input_text[0]
            first_letter_high = first_letter.islower() == False

        suggestion_verbosity_used = self.suggenstion_verbosity
        if suggestion_verbosity is not None:
            suggestion_verbosity_used = suggestion_verbosity

        suggestions = self.spellchecker.lookup(input_text, suggestion_verbosity_used, self.max_edist)
        if len(suggestions) >= 1:
            return_term = suggestions[0]._term
            # restore the original capitalization of the first letter
            if self.config.KEYING_RESULT_VC_DOWNCAST_ALL_CASES and first_letter_high:
                return_term = return_term[0].upper() + return_term[1:]
            return return_term, suggestions, first_letter_high
        else:
            return None, suggestions, first_letter_high