Example #1
0
    def load(self, text_data):
        """Load text data to the model

        If `is_preprocessed` reports a non-zero value for the file, the
        call only logs a debug message and returns. Otherwise the text is
        parsed into temporary unigram / alt-case structures, the bigram and
        OCR-key structures are fed, the temporary data is folded into the
        persistent unigrams and alt-case map (both pruned and saved), the
        temporary files are deleted and the anagram map and dictionary are
        updated from the pruned n-grams.

        Args:
            text_data (`Text`): Text data
        """
        if self.is_preprocessed(text_data.filename) != 0:
            self.logger.debug(text_data.filename +
                              " already loaded: skipping it.")
            return

        # Per-file structures, backed by temporary files removed further down.
        tmp_u = Unigrams(self.tmp_unigrams_filename)
        word_list = tmp_u.append_data(text_data)

        self.bigrams.append_data(word_list)

        tmp_ac = AltCaseMap(self.tmp_altcase_filename)
        tmp_ac.append_data(tmp_u.raw_unigrams)

        tmp_u.generate_low_case(tmp_ac.altcase_map)

        self.ocrkey_map.append_data(tmp_u.raw_unigrams)

        # Updating data
        self.unigrams.raw_unigrams += tmp_u.raw_unigrams
        self.unigrams.ngrams += tmp_u.ngrams
        # NOTE(review): 0.7 is a hard-coded pruning threshold; its exact
        # meaning is defined by `prune` -- confirm there.
        self.unigrams.prune(0.7)
        self.unigrams.save()

        # Merge the new alt-case entries with the stored ones, taking the
        # union of the values found under each key. The two mappings are
        # walked one after the other rather than joined with `+`:
        # `dict.keys()` / `dict.items()` are views on Python 3 and cannot be
        # concatenated (TypeError).
        combine_struct = {}
        for altcase in (tmp_ac.altcase_map, self.altcase_map.altcase_map):
            for key, value in altcase.items():
                combine_struct.setdefault(key, set()).update(value)

        self.altcase_map.altcase_map = combine_struct
        self.altcase_map.prune(self.unigrams.ngrams_pruned)
        self.altcase_map.save()

        unlink(self.tmp_unigrams_filename)
        unlink(self.tmp_altcase_filename)

        self.anagram_map.append_data(self.bigrams.ngrams_pruned,
                                     self.unigrams.ngrams_pruned)
        self.dictionary.append_data(self.unigrams.ngrams_pruned)

        self.logger.info(text_data.filename + "'s datastructures loaded")
Example #2
0
    def load(self, text_data):
        """Load text data to the model

        Does nothing except log a debug message when `is_preprocessed`
        returns a non-zero value for the file. Otherwise it builds temporary
        unigram and alt-case structures from the text, appends to the
        bigrams and the OCR-key map, merges the temporary data into the
        persistent unigrams and alt-case map (pruning and saving both),
        removes the temporary files and finally updates the anagram map and
        the dictionary from the pruned n-grams.

        Args:
            text_data (`Text`): Text data
        """
        if self.is_preprocessed(text_data.filename) != 0:
            self.logger.debug(text_data.filename+" already loaded: skipping it.")
            return

        # Scratch structures for this file only; their backing files are
        # unlinked once the merge below is done.
        tmp_u = Unigrams(self.tmp_unigrams_filename)
        word_list = tmp_u.append_data(text_data)

        self.bigrams.append_data(word_list)

        tmp_ac = AltCaseMap(self.tmp_altcase_filename)
        tmp_ac.append_data(tmp_u.raw_unigrams)

        tmp_u.generate_low_case(tmp_ac.altcase_map)

        self.ocrkey_map.append_data(tmp_u.raw_unigrams)

        # Updating data
        self.unigrams.raw_unigrams += tmp_u.raw_unigrams
        self.unigrams.ngrams += tmp_u.ngrams
        # NOTE(review): the 0.7 threshold is hard-coded; see `prune` for
        # what it controls.
        self.unigrams.prune(0.7)
        self.unigrams.save()

        # Union the values of both alt-case maps key by key. Concatenating
        # `keys()` / `items()` with `+` only works on Python 2 lists; on
        # Python 3 these are views and `+` raises TypeError, so each mapping
        # is iterated separately instead.
        combine_struct = {}
        for altcase in (tmp_ac.altcase_map, self.altcase_map.altcase_map):
            for key, variants in altcase.items():
                combine_struct[key] = combine_struct.get(key, set()).union(variants)

        self.altcase_map.altcase_map = combine_struct
        self.altcase_map.prune(self.unigrams.ngrams_pruned)
        self.altcase_map.save()

        unlink(self.tmp_unigrams_filename)
        unlink(self.tmp_altcase_filename)

        self.anagram_map.append_data(self.bigrams.ngrams_pruned, self.unigrams.ngrams_pruned)
        self.dictionary.append_data(self.unigrams.ngrams_pruned)

        self.logger.info(text_data.filename+"'s datastructures loaded")