    # NOTE: this method and train() below belong to the learning-based grading
    # model (the enclosing class is not included in this excerpt).
    def correct(self, text_data):
        """Correct text data

        Args:
            text_data (`Text`): Text data
        """
        unigrams = Unigrams(join(
            self.config["root"],
            self.config["dirs"]["models_root"],
            self.config["dirs"]["models"]["inline"],
            self.config["models"]["inline"]["unigrams"],
        ))
        ml_classifier = load(join(
            self.config["dirs"]["models_root"],
            self.config["dirs"]["models"]["learning"],
            self.config["models"]["learning"]["classifier"]))

        if ml_classifier is None:
            return
        self.model["algo"].set_classifier(ml_classifier)

        for paragraph in text_data.text:
            for line in paragraph:
                if line.grade % 5 == 0:
                    # Already-graded lines (grade is a multiple of 5) keep their grade
                    continue

                f = MachineLearningFeatures()
                features = f.extract_features(line, unigrams.ngrams,
                                              text_data.stats)
                line.grade = self.model["algo"].classify(features) * 5
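    # Grade encoding shared by correct() above and train() below (inferred from
    # the code, not documented in this excerpt): definitive line grades are
    # multiples of 5, so int(line.grade / 5) recovers a class label for training
    # and classify(features) * 5 maps a predicted label back to a grade.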
    def train(self, dataset):
        """Train the model with a dataset

        Args:
            dataset (list): List of training files
        """
        # Get the original training set
        training_set = self.model["algo"].training_set

        # Append the new data to it
        for text in dataset:
            self.logger.debug("Processing " + text.filename + "...")

            unigrams = Unigrams(join(
                self.config["root"],
                self.config["dirs"]["models_root"],
                self.config["dirs"]["models"]["inline"],
                self.config["models"]["inline"]["unigrams"],
            ))

            for p in text.text:
                for line in p:
                    if line.grade % 5 != 0:
                        # Unclassified lines are useless for the training
                        continue

                    f = MachineLearningFeatures()
                    features = f.extract_features(line, unigrams.ngrams, text.stats)
                    result = int(line.grade / 5)

                    training_set["features"].append(features)
                    training_set["results"].append(result)

        self.logger.debug("Saving training set...")
        save(training_set,
             join(self.config["dirs"]["models_root"],
                  self.config["dirs"]["models"]["learning"],
                  self.config["models"]["learning"]["training_set"]))

        self.logger.debug("Training model...")
        # NOTE: newer scikit-learn releases expect loss="log_loss" and
        # class_weight="balanced"; the values below match the original code.
        ml_classifier = SGDClassifier(loss="log", class_weight="auto")
        self.model["algo"].set_classifier(ml_classifier)
        self.model["algo"].set_training_set(training_set["features"],
                                            training_set["results"])
        self.model["algo"].train()

        save(self.model["algo"].classifier,
             join(self.config["dirs"]["models_root"],
                  self.config["dirs"]["models"]["learning"],
                  self.config["models"]["learning"]["classifier"]))
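# ---------------------------------------------------------------------------
# The load()/save() helpers called above are imported elsewhere in the project
# and are not part of this excerpt. Their call sites suggest the signatures
# load(path) -> object or None, and save(obj, path). The sketch below is a
# hypothetical, minimal pickle-based stand-in, not the project's actual
# implementation.
# ---------------------------------------------------------------------------
import pickle
from os.path import exists


def load(path):
    """Return the object pickled at `path`, or None when the file is absent."""
    if not exists(path):
        return None
    with open(path, "rb") as fh:
        return pickle.load(fh)


def save(obj, path):
    """Pickle `obj` to `path`."""
    with open(path, "wb") as fh:
        pickle.dump(obj, fh)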
class InlineModel(AbstractModel):
    """Model for inline data structures
    """

    def __init__(self, app_config):
        super(InlineModel, self).__init__(app_config)

        inline_models_dir = join(
            app_config["root"],
            app_config["dirs"]["models_root"],
            app_config["dirs"]["models"]["inline"]
        )
        inline_models_key = app_config["models"]["inline"]

        self.dictionary = Dictionary(join(inline_models_dir, inline_models_key["dictionary"]))
        self.unigrams = Unigrams(join(inline_models_dir, inline_models_key["unigrams"]))
        self.tmp_unigrams_filename = self.unigrams.filename + app_config["exts"]["tmp"]
        self.bigrams = Bigrams(join(inline_models_dir, inline_models_key["bigrams"]))
        self.altcase_map = AltCaseMap(join(inline_models_dir, inline_models_key["altcase"]))
        self.tmp_altcase_filename = self.altcase_map.filename + app_config["exts"]["tmp"]
        self.ocrkey_map = OcrKeyMap(join(inline_models_dir, inline_models_key["ocr_keys"]))
        self.anagram_map = AnagramMap(join(inline_models_dir, inline_models_key["anagrams"]))

    def load(self, text_data):
        """Load text data into the model

        Args:
            text_data (`Text`): Text data
        """
        if self.is_preprocessed(text_data.filename) != 0:
            self.logger.debug(text_data.filename + " already loaded: skipping it.")
            return

        tmp_u = Unigrams(self.tmp_unigrams_filename)
        word_list = tmp_u.append_data(text_data)
        self.bigrams.append_data(word_list)

        tmp_ac = AltCaseMap(self.tmp_altcase_filename)
        tmp_ac.append_data(tmp_u.raw_unigrams)
        tmp_u.generate_low_case(tmp_ac.altcase_map)
        self.ocrkey_map.append_data(tmp_u.raw_unigrams)

        # Updating data
        self.unigrams.raw_unigrams += tmp_u.raw_unigrams
        self.unigrams.ngrams += tmp_u.ngrams
        self.unigrams.prune(0.7)
        self.unigrams.save()

        # Merge the temporary altcase map into the persistent one (wrapping the
        # key/item views in list() keeps the concatenation valid under Python 3)
        combine_struct = {key: set()
                          for key in list(tmp_ac.altcase_map.keys())
                          + list(self.altcase_map.altcase_map.keys())}
        for key, value in list(tmp_ac.altcase_map.items()) + list(self.altcase_map.altcase_map.items()):
            combine_struct[key] = combine_struct[key].union(value)
        self.altcase_map.altcase_map = combine_struct
        self.altcase_map.prune(self.unigrams.ngrams_pruned)
        self.altcase_map.save()

        unlink(self.tmp_unigrams_filename)
        unlink(self.tmp_altcase_filename)

        self.anagram_map.append_data(self.bigrams.ngrams_pruned, self.unigrams.ngrams_pruned)
        self.dictionary.append_data(self.unigrams.ngrams_pruned)

        self.logger.info(text_data.filename + "'s data structures loaded")

    def correct(self, text_data):
        """Correct text data

        Args:
            text_data (`Text`): Text data
        """
        correction_data = self.correction_data()

        for paragraph in text_data.text:
            for line in paragraph:
                for token in line.tokens:
                    token[2] = init_correction_map(token[1], correction_data["dictionary"])

                    # Skip some correction steps if the token is too short,
                    # in the dictionary or already identified as garbage
                    if token[2] is not None and len(token[2]) == 0:
                        anagrams = select_anagrams(token[1], correction_data)
                        ocr_sims = select_ocrsims(token[1], correction_data)

                        token[2] = build_candidates_list(token[1], anagrams, ocr_sims,
                                                         correction_data)
                        token[2] = correct_case(token[1], token[2], correction_data)
                        token[2] = rate_corrections(token[2])

                        if len(token[2]) == 0:  # No correction has been found
                            token[2] = None

            # Applying the bigram boost to the tokens
            bigrams = extract_paragraph_bigrams(paragraph)
            apply_bigram_boost(paragraph, bigrams, correction_data["occurence_map"])

            # Select the appropriate correction: keep the highest-rated
            # candidates, break ties by edit distance, then alphabetically
            for line in paragraph:
                for token in line.tokens:
                    token[2] = select_correction(token[1], token[2])

                    if token[2] is not None and len(token[2]) > 1:
                        tkn_list = [tkn for tkn, sc in token[2].items()
                                    if sc == max(token[2].values())]

                        if len(tkn_list) != 1:
                            tkn_list = select_lower_edit_distance(
                                token[1], {tkn: token[2][tkn] for tkn in tkn_list})

                        if len(tkn_list) != 1:
                            tkn_list = [select_best_alphabetical_word(token[1], tkn_list)]

                        token[2] = {tkn: token[2][tkn] for tkn in tkn_list}

    def correction_data(self):
        """Get the correction data

        Returns:
            dict: Correction data
        """
        return {
            "occurence_map": self.unigrams.ngrams + self.bigrams.ngrams,
            "altcase": self.altcase_map.altcase_map,
            "ocrkeys": self.ocrkey_map.ocrkey_map,
            "anagrams": self.anagram_map.anagram_hashmap,
            "alphabet": self.anagram_map.anagram_alphabet,
            "dictionary": self.dictionary.dictionary
        }
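# ---------------------------------------------------------------------------
# Reading note on the token indexing above, inferred from the call sites and
# not stated in this excerpt: each token is a small indexable record in which
# token[1] holds the raw token string and token[2] holds its correction map.
# The map is {} while the token is still a correction candidate, becomes a
# {candidate: score} dict once candidates are rated, and is None when the
# token is left uncorrected (in the dictionary, too short, garbage, or no
# candidate found). A hypothetical illustration:
# ---------------------------------------------------------------------------
token = [0, "tbe", {}]                    # leading position field assumed
token[2] = {"the": 0.92, "tie": 0.31}     # rated candidates
best = max(token[2], key=token[2].get)    # -> "the"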
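# ---------------------------------------------------------------------------
# A minimal usage sketch for InlineModel. The app_config layout is assumed
# from the keys read in __init__ (root, dirs, models, exts); the paths and
# filenames are placeholders, and Text is the project's text wrapper, assumed
# here to be constructible from a filename.
# ---------------------------------------------------------------------------
app_config = {
    "root": "/var/lib/ocr-app",
    "dirs": {
        "models_root": "models",
        "models": {"inline": "inline", "learning": "learning"},
    },
    "models": {
        "inline": {
            "dictionary": "dictionary.pkl",
            "unigrams": "unigrams.pkl",
            "bigrams": "bigrams.pkl",
            "altcase": "altcase.pkl",
            "ocr_keys": "ocr_keys.pkl",
            "anagrams": "anagrams.pkl",
        },
        "learning": {"classifier": "classifier.pkl",
                     "training_set": "training_set.pkl"},
    },
    "exts": {"tmp": ".tmp"},
}

model = InlineModel(app_config)
text = Text("page_001.txt")   # hypothetical Text construction
model.load(text)              # feed the n-gram/anagram data structures
model.correct(text)           # annotate tokens with rated corrections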