Example no. 1
0
    def save(self):
        data = {
            "hashmap": self.anagram_hashmap,
            "alphabet": self.anagram_alphabet
        }

        save(data, self.filename)
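
The save methods in these examples all delegate to a module-level save(data, filename) helper that is not shown in the excerpts. As a rough sketch only, assuming the helper wraps pickle-based serialization (its real implementation may differ):

    import pickle

    def save(data, filename):
        # Hypothetical sketch of the serialization helper used throughout the examples:
        # write any picklable object to the given path.
        with open(filename, "wb") as f:
            pickle.dump(data, f)

    def load(filename):
        # Hypothetical counterpart: read a previously saved object back from disk.
        with open(filename, "rb") as f:
            return pickle.load(f)

Under that assumption, each save method only needs to bundle its maps into a dict and hand it to the helper, which is the pattern the examples below repeat.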
Example no. 2
0
    def save(self):
        data = {
            "altcase": self.altcase_map,
            "altcase_pruned": self.altcase_pruned_map
        }

        save(data, self.filename)
Example no. 3
0
    def save(self):
        data = {
            "hashmap": self.anagram_hashmap,
            "alphabet": self.anagram_alphabet
        }

        save(data, self.filename)
Example no. 4
0
    def save(self):
        data = {
            "altcase": self.altcase_map,
            "altcase_pruned": self.altcase_pruned_map
        }

        save(data, self.filename)
Example no. 5
0
    def save(self):
        data = {
            "bigrams": self.ngrams,
            "bigrams_pruned": self.ngrams_pruned
        }

        save(data, self.filename)
Example no. 6
0
    def save(self):
        data = {
            "raw_unigrams": self.raw_unigrams,
            "unigrams": self.ngrams,
            "unigrams_pruned": self.ngrams_pruned
        }

        save(data, self.filename)
Example no. 7
0
    def save(self):
        data = {
            "raw_unigrams": self.raw_unigrams,
            "unigrams": self.ngrams,
            "unigrams_pruned": self.ngrams_pruned
        }

        save(data, self.filename)
Example no. 8
0
    def train(self, dataset):
        """Train the model with a dataset

        Args:
            dataset (list): List of training files
        """
        # Get the original training set
        training_set = self.model["algo"].training_set

        # Append the new data to it
        for text in dataset:
            self.logger.debug("Processing " + text.filename + "...")
            unigrams = Unigrams(
                join(
                    self.config["root"],
                    self.config["dirs"]["models_root"],
                    self.config["dirs"]["models"]["inline"],
                    self.config["models"]["inline"]["unigrams"],
                ))

            for p in text.text:
                for line in p:
                    if line.grade % 5 != 0:  # Unclassified lines are useless for training
                        continue

                    f = MachineLearningFeatures()
                    features = f.extract_features(line, unigrams.ngrams,
                                                  text.stats)
                    result = int(line.grade / 5)

                    training_set["features"].append(features)
                    training_set["results"].append(result)

        self.logger.debug("Saving training set...")
        save(
            training_set,
            join(self.config["dirs"]["models_root"],
                 self.config["dirs"]["models"]["learning"],
                 self.config["models"]["learning"]["training_set"]))

        self.logger.debug("Training model...")
        ml_classifier = SGDClassifier(loss="log", class_weight="auto")
        self.model["algo"].set_classifier(ml_classifier)
        self.model["algo"].set_training_set(training_set["features"],
                                            training_set["results"])
        self.model["algo"].train()

        save(
            self.model["algo"].classifier,
            join(self.config["dirs"]["models_root"],
                 self.config["dirs"]["models"]["learning"],
                 self.config["models"]["learning"]["classifier"]))
Example no. 9
0
    def is_preprocessed(self, filename):
        """Determine if the given file has already been preprocessed (its data added to the models)

        Args:
            filename (str): Path of the given file

        Returns:
            int: 0 if not yet preprocessed (the file's checksum is recorded as a side effect), 1 otherwise
        """
        text_id = file_checksum(filename)

        if text_id not in self.hash_list:
            self.hash_list.append(text_id)
            save(self.hash_list, self.hash_filename)
            return 0

        return 1
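
file_checksum is not defined in this excerpt; the code only relies on it mapping a file path to a stable identifier that can be stored in hash_list. A sketch under that assumption, hashing the file contents with hashlib (the real helper may use a different algorithm):

    import hashlib

    def file_checksum(filename, chunk_size=65536):
        # Hypothetical helper: hash the file contents in chunks so large files
        # are never read into memory all at once.
        digest = hashlib.sha256()
        with open(filename, "rb") as f:
            for chunk in iter(lambda: f.read(chunk_size), b""):
                digest.update(chunk)
        return digest.hexdigest()
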
Example no. 10
0
    def is_preprocessed(self, filename):
        """Determine if the given file has already been preprocessed (its data added to the models)

        Args:
            filename (str): Path of the given file

        Returns:
            int: 0 if not yet preprocessed (the file's checksum is recorded as a side effect), 1 otherwise
        """
        text_id = file_checksum(filename)

        if text_id not in self.hash_list:
            self.hash_list.append(text_id)
            save(self.hash_list, self.hash_filename)
            return 0

        return 1
Example no. 11
0
    def train(self, dataset):
        """Train the model with a dataset

        Args:
            dataset (list): List of training files
        """
        # Get the original training set
        training_set = self.model["algo"].training_set

        # Append the new data to it
        for text in dataset:
            self.logger.debug("Processing "+text.filename+"...")
            unigrams = Unigrams(join(self.config["root"],
                                     self.config["dirs"]["models_root"],
                                     self.config["dirs"]["models"]["inline"],
                                     self.config["models"]["inline"]["unigrams"],))

            for p in text.text:
                for line in p:
                    if line.grade % 5 != 0:  # Unclassified lines are useless for training
                        continue

                    f = MachineLearningFeatures()
                    features = f.extract_features(line, unigrams.ngrams, text.stats)
                    result = int(line.grade / 5)

                    training_set["features"].append(features)
                    training_set["results"].append(result)

        self.logger.debug("Saving training set...")
        save(training_set, join(self.config["dirs"]["models_root"],
                                self.config["dirs"]["models"]["learning"],
                                self.config["models"]["learning"]["training_set"]))

        self.logger.debug("Training model...")
        ml_classifier = SGDClassifier(loss="log", class_weight="auto")
        self.model["algo"].set_classifier(ml_classifier)
        self.model["algo"].set_training_set(training_set["features"], training_set["results"])
        self.model["algo"].train()

        save(self.model["algo"].classifier, join(self.config["dirs"]["models_root"],
                                                 self.config["dirs"]["models"]["learning"],
                                                 self.config["models"]["learning"]["classifier"]))
Example no. 12
0
    def save(self):
        save(self.ocrkey_map, self.filename)
Example no. 13
0
    def save(self):
        data = {"bigrams": self.ngrams, "bigrams_pruned": self.ngrams_pruned}

        save(data, self.filename)
Example no. 14
0
    def save(self):
        save(self.dictionary, self.filename)
Example no. 15
0
    def save(self):
        save(self.ocrkey_map, self.filename)
Example no. 16
0
    def save(self):
        save(self.dictionary, self.filename)