Esempio n. 1
0
    def _parse (self) :
        """Yield ``(stems, tokens)`` tuples for the sub-grams of the document.

        ``self.document`` is tokenized with ``self.tokenize`` and the result
        is passed through ``self._stem_tokens``.  That output is then walked
        with ``windowed`` using windows of ``self.max_gram_length`` items
        (``trail=True`` presumably also emits the shorter windows at the end
        of the sequence -- confirm against ``windowed``).

        Every sub-gram produced by ``self._sub_grams`` for a window is
        transposed with ``zip`` into exactly two tuples, which are yielded
        together as ``(stems, tokens)``.
        """
        stemmed_pairs = self._stem_tokens(self.tokenize(self.document))
        window_size = self.max_gram_length

        for window in windowed(stemmed_pairs, size=window_size, trail=True):
            for sub_gram in self._sub_grams(window):
                # Transpose the sub-gram; unpacking into two names requires
                # zip to produce exactly two tuples here.
                stems, tokens = zip(*sub_gram)
                yield (stems, tokens)
Esempio n. 2
0
def usable (known, documents, is_usable=is_readable, gram_size=4, split_index=None) :
    """Yield ``(input_, correct_output)`` tuple pairs built from grams.

    Each document is wrapped in ``Parsed`` and walked with ``windowed`` in
    grams of ``gram_size`` items.  Every gram is cut at ``split_index``: the
    part before the cut becomes the input and the rest becomes the expected
    output.  Both parts are passed through ``_known_or_unkown`` together with
    a lookup built from ``known`` and turned into tuples.

    Args:
        known: Iterable of hashable values; each one is mapped to itself in
            the lookup handed to ``_known_or_unkown``.
        documents: Iterable of documents accepted by ``Parsed``.
        is_usable: Predicate called on each tuple; a pair is yielded only
            when it is truthy for both the input and the output.
        gram_size: Number of items per gram requested from ``windowed``.
        split_index: Position at which a gram is cut; defaults to
            ``gram_size - 1`` when ``None``.

    Yields:
        ``(input_, correct_output)`` for every gram whose two halves both
        satisfy ``is_usable``.
    """
    # Identity mapping: every known value is keyed by itself.
    lookup = {item : item for item in known}
    cut = gram_size - 1 if split_index is None else split_index

    for document in documents:
        for gram in windowed(Parsed(document), gram_size):
            head, tail = gram[:cut], gram[cut:]
            input_ = tuple(_known_or_unkown(lookup, head))
            correct_output = tuple(_known_or_unkown(lookup, tail))

            # Skip the gram as soon as either half is rejected.
            if not is_usable(input_):
                continue
            if not is_usable(correct_output):
                continue

            yield (input_, correct_output)