def save(self):
    """Persist the anagram model (hash map and alphabet) to ``self.filename``."""
    save(
        {
            "hashmap": self.anagram_hashmap,
            "alphabet": self.anagram_alphabet,
        },
        self.filename,
    )
def save(self):
    """Persist the alternative-case maps (full and pruned) to ``self.filename``."""
    save(
        {
            "altcase": self.altcase_map,
            "altcase_pruned": self.altcase_pruned_map,
        },
        self.filename,
    )
def save(self):
    """Write the anagram data (hash map + alphabet) out to ``self.filename``."""
    payload = {
        "hashmap": self.anagram_hashmap,
        "alphabet": self.anagram_alphabet,
    }
    save(payload, self.filename)
def save(self):
    """Write the alternative-case maps (full + pruned) out to ``self.filename``."""
    payload = {
        "altcase": self.altcase_map,
        "altcase_pruned": self.altcase_pruned_map,
    }
    save(payload, self.filename)
def save(self):
    """Persist the bigram model (raw and pruned n-grams) to ``self.filename``."""
    save(
        {
            "bigrams": self.ngrams,
            "bigrams_pruned": self.ngrams_pruned,
        },
        self.filename,
    )
def save(self):
    """Persist the unigram model (raw counts, processed, pruned) to ``self.filename``."""
    save(
        {
            "raw_unigrams": self.raw_unigrams,
            "unigrams": self.ngrams,
            "unigrams_pruned": self.ngrams_pruned,
        },
        self.filename,
    )
def save(self):
    """Write the unigram data (raw counts + processed + pruned) out to ``self.filename``."""
    payload = {
        "raw_unigrams": self.raw_unigrams,
        "unigrams": self.ngrams,
        "unigrams_pruned": self.ngrams_pruned,
    }
    save(payload, self.filename)
def train(self, dataset): """Train the model with a dataset Args: dataset (list): List of training files """ # Get the original training set training_set = self.model["algo"].training_set # Append the new data to it for text in dataset: self.logger.debug("Processing " + text.filename + "...") unigrams = Unigrams( join( self.config["root"], self.config["dirs"]["models_root"], self.config["dirs"]["models"]["inline"], self.config["models"]["inline"]["unigrams"], )) for p in text.text: for line in p: if line.grade % 5 != 0: # Unclassified lines are useless for the training continue f = MachineLearningFeatures() features = f.extract_features(line, unigrams.ngrams, text.stats) result = int(line.grade / 5) training_set["features"].append(features) training_set["results"].append(result) self.logger.debug("Saving training set...") save( training_set, join(self.config["dirs"]["models_root"], self.config["dirs"]["models"]["learning"], self.config["models"]["learning"]["training_set"])) self.logger.debug("Training model...") ml_classifier = SGDClassifier(loss="log", class_weight="auto") self.model["algo"].set_classifier(ml_classifier) self.model["algo"].set_training_set(training_set["features"], training_set["results"]) self.model["algo"].train() save( self.model["algo"].classifier, join(self.config["dirs"]["models_root"], self.config["dirs"]["models"]["learning"], self.config["models"]["learning"]["classifier"]))
def is_preprocessed(self, filename):
    """Determine if the given file has already been preprocessed
    (its data added to the models).

    A file not seen before is recorded (checksum appended to the hash
    list, which is then persisted) before returning.

    Args:
        filename (str): Path of the given file

    Returns:
        int: 0 if not preprocessed, 1 otherwise
    """
    text_id = file_checksum(filename)
    if text_id in self.hash_list:
        return 1
    # First sighting: remember the checksum and persist the updated list.
    self.hash_list.append(text_id)
    save(self.hash_list, self.hash_filename)
    return 0
def is_preprocessed(self, filename):
    """Determine if the given file has already been preprocessed
    (its data added to the models).

    Side effect: an unseen file's checksum is appended to the hash list
    and the list is saved to disk.

    Args:
        filename (str): Path of the given file

    Returns:
        int: 0 if not preprocessed, 1 otherwise
    """
    checksum = file_checksum(filename)
    already_seen = checksum in self.hash_list
    if not already_seen:
        self.hash_list.append(checksum)
        save(self.hash_list, self.hash_filename)
    return 1 if already_seen else 0
def train(self, dataset): """Train the model with a dataset Args: dataset (list): List of training files """ # Get the original training set training_set = self.model["algo"].training_set # Append the new data to it for text in dataset: self.logger.debug("Processing "+text.filename+"...") unigrams = Unigrams(join(self.config["root"], self.config["dirs"]["models_root"], self.config["dirs"]["models"]["inline"], self.config["models"]["inline"]["unigrams"],)) for p in text.text: for line in p: if line.grade % 5 != 0: # Unclassified lines are useless for the training continue f = MachineLearningFeatures() features = f.extract_features(line, unigrams.ngrams, text.stats) result = int(line.grade / 5) training_set["features"].append(features) training_set["results"].append(result) self.logger.debug("Saving training set...") save(training_set, join(self.config["dirs"]["models_root"], self.config["dirs"]["models"]["learning"], self.config["models"]["learning"]["training_set"])) self.logger.debug("Training model...") ml_classifier = SGDClassifier(loss="log", class_weight="auto") self.model["algo"].set_classifier(ml_classifier) self.model["algo"].set_training_set(training_set["features"], training_set["results"]) self.model["algo"].train() save(self.model["algo"].classifier, join(self.config["dirs"]["models_root"], self.config["dirs"]["models"]["learning"], self.config["models"]["learning"]["classifier"]))
def save(self):
    """Persist the OCR-key map to ``self.filename``."""
    data = self.ocrkey_map
    save(data, self.filename)
def save(self):
    """Write the bigram data (raw + pruned) out to ``self.filename``."""
    payload = {
        "bigrams": self.ngrams,
        "bigrams_pruned": self.ngrams_pruned,
    }
    save(payload, self.filename)
def save(self):
    """Persist the dictionary to ``self.filename``."""
    data = self.dictionary
    save(data, self.filename)
def save(self):
    """Write the OCR-key map out to ``self.filename``."""
    save(self.ocrkey_map, self.filename)
def save(self):
    """Write the dictionary out to ``self.filename``."""
    save(self.dictionary, self.filename)