コード例 #1
0
    def __init__(self, trie_file=None, word_list=None, save_unseen=True, save_dir=None):
        """
        Build the word trie and initialise the bookkeeping for unseen words.

        :param trie_file: path of a pickled trie; loaded when given.
        :param word_list: iterable of known words; when given, a fresh ``WordTrie``
            is built from it and replaces any trie loaded from ``trie_file``.
        :param save_unseen: stored as ``self.save_unseen``; ``self.unseen`` starts
            out as an empty list regardless of its value.
        :param save_dir: stored unchanged as ``self.save_dir``.

        >>> known_words = ['maturita','temperueniunt', 'radicibus', 'subministres']
        >>> trie_transformer = TrieTransformer(word_list=known_words)
        >>> corrupt_corpus = [['maturitatemperueniunt', 'est'], ['radicibussubministres', 'amo']]
        >>> trie_transformer.transform(corrupt_corpus)
        [['maturita', 'temperueniunt', 'est'], ['radicibus', 'subministres', 'amo']]

        """
        # Keep the constructor arguments on the instance under their own names;
        # scikit-learn style parameter introspection reads them back that way.
        self.trie_file = trie_file
        self.word_list = word_list

        trie = None
        if trie_file:
            # NOTE: unpickling can execute arbitrary code; only load trie files
            # that come from a trusted source.
            with open(trie_file, 'rb') as reader:
                trie = pickle.load(reader)
        if word_list:
            # An explicit word list takes precedence over a loaded trie.
            trie = WordTrie()
            for word in word_list:
                trie.add(word)
        if trie is None:
            # Neither source was given: fall back to an empty trie so that
            # self.trie always exists.
            trie = WordTrie()
        self.trie = trie

        # Both attributes are always defined. They used to be missing whenever
        # save_unseen was falsy, so any later read raised AttributeError.
        self.save_unseen = save_unseen
        self.unseen = []
        self.save_dir = save_dir
コード例 #2
0
def get_split_words(corpus_reader: CorpusReader,
                    word_trie: WordTrie,
                    max_word_length: int = 15) -> Dict[str, List[str]]:
    """
    Scan every file of a corpus for over-long tokens that the trie splits in two.

    A token is examined only when it is longer than ``max_word_length`` and
    ``word_trie.has_word`` does not accept it as it stands. It is reported when
    ``word_trie.extract_word_pair`` returns exactly two items for it.

    :param corpus_reader: supplies the file ids and the words of each file.
    :param word_trie: trie model used to recognise and to split words.
    :param max_word_length: tokens of this length or shorter are never examined.
    :return: a ``defaultdict`` mapping each file id to a flat list of the pieces
        found in that file; every reported token contributes two consecutive
        entries. Files without a hit get no key.
    """
    found = defaultdict(list)  # type: Dict[str, List[str]]
    fileids = corpus_reader.fileids()
    for fileid in tqdm(fileids, total=len(fileids), unit="files"):
        for token in corpus_reader.words(fileid):
            # Cheap length test first; the trie is consulted only for long tokens.
            if len(token) <= max_word_length:
                continue
            if word_trie.has_word(token):
                continue
            pieces = word_trie.extract_word_pair(token)
            if len(pieces) != 2:
                continue
            # The key is created only on a hit, so files without any split
            # words stay absent from the result.
            found[fileid].extend(pieces)
    return found
コード例 #3
0
class TrieTransformer(BaseEstimator, TransformerMixin):
    """
    Auto splice improperly joined words in a tokenized sentence matrix.

    Each token that the trie does not know is tried as a concatenation of two
    known words; when such a split exists the token is replaced by the pair.
    """

    def __init__(self, trie_file=None, word_list=None, save_unseen=True, save_dir=None):
        """
        Build the word trie and initialise the bookkeeping for unseen words.

        :param trie_file: path of a pickled trie; loaded when given.
        :param word_list: iterable of known words; when given, a fresh ``WordTrie``
            is built from it and replaces any trie loaded from ``trie_file``.
        :param save_unseen: when truthy, tokens that are neither known nor
            splittable are collected in ``self.unseen``.
        :param save_dir: directory into which ``transform`` writes the unseen
            words; nothing is written when it is falsy.

        >>> known_words = ['maturita','temperueniunt', 'radicibus', 'subministres']
        >>> trie_transformer = TrieTransformer(word_list=known_words)
        >>> corrupt_corpus = [['maturitatemperueniunt', 'est'], ['radicibussubministres', 'amo']]
        >>> trie_transformer.transform(corrupt_corpus)
        [['maturita', 'temperueniunt', 'est'], ['radicibus', 'subministres', 'amo']]

        """
        # Keep the constructor arguments on the instance under their own names;
        # scikit-learn style parameter introspection reads them back that way.
        self.trie_file = trie_file
        self.word_list = word_list

        trie = None
        if trie_file:
            # NOTE: unpickling can execute arbitrary code; only load trie files
            # that come from a trusted source.
            with open(trie_file, 'rb') as reader:
                trie = pickle.load(reader)
        if word_list:
            # An explicit word list takes precedence over a loaded trie.
            trie = WordTrie()
            for word in word_list:
                trie.add(word)
        if trie is None:
            # Neither source was given: fall back to an empty trie so that
            # self.trie always exists.
            trie = WordTrie()
        self.trie = trie

        # Both attributes are always defined. They used to be missing whenever
        # save_unseen was falsy, which made extract_word_pair() and transform()
        # raise AttributeError.
        self.save_unseen = save_unseen
        self.unseen = []
        self.save_dir = save_dir

    def fit(self, string_matrix: List[List[str]], y: List[Any] = None):
        """
        Template method; nothing is learned from the data.

        :param string_matrix: ignored.
        :param y: ignored.
        :return: this transformer, unchanged.
        """
        return self

    def extract_word_pair(self, long_word: str) -> List[str]:
        """
        Split a token into two known words when possible.

        :param long_word: the token to examine.
        :return: ``[long_word]`` when the trie knows the token or no split is
            found, otherwise ``[word1, word2]`` for the first split point at
            which both halves are known to the trie.
        """
        if self.trie.has_word(long_word):
            return [long_word]

        # Split points run from 2 to len - 2, so both halves always have at
        # least two characters.
        for idx in range(2, len(long_word) - 1):
            word1 = long_word[:idx]
            word2 = long_word[idx:]
            if self.trie.has_word(word1) and self.trie.has_word(word2):
                return [word1, word2]
        if self.save_unseen:
            self.unseen.append(long_word)
        return [long_word]  # don't swallow unknown words

    def _write_unseen(self):
        """Write the collected unseen words, one per line, into ``self.save_dir``."""
        # The file name carries only the date and the file is opened in write
        # mode, so a later call on the same day replaces the earlier file.
        file_name = 'unseen_words.{}.txt'.format(
            datetime.datetime.now().strftime('%Y.%m.%d'))
        try:
            with open(os.path.join(self.save_dir, file_name),
                      'wt', encoding='utf8') as writer:
                writer.writelines(word + '\n' for word in self.unseen)
        except OSError:
            # Saving is best effort: log the failure, keep the transform result.
            LOG.exception('Failure in trying to save unseen words')

    def transform(self, string_matrix: List[List[str]]) -> List[List[str]]:
        """
        Return a copy of the matrix in which joined words are split.

        :param string_matrix: list of documents, each a list of tokens.
        :return: a new list of documents; each token is either kept or replaced
            by the two words it was split into.
        """
        results = []  # type: List[List[str]]
        if self.save_unseen:
            # Start a fresh record for this call.
            self.unseen = []
        for document in string_matrix:
            sentence = []  # type: List[str]
            for token in document:
                # Split each token exactly once: extract_word_pair() records
                # unseen words as a side effect, so calling it a second time
                # would list every unseen word twice.
                sentence.extend(self.extract_word_pair(token))
            results.append(sentence)
        if self.save_unseen and self.save_dir:
            self._write_unseen()
        return results