def _parse(self):
    """Yield ``(stems, tokens)`` pairs for every sub-gram of the document.

    The document is tokenized, its tokens stemmed, and a window of
    ``self.max_gram_length`` stem/token pairs is slid over the result
    (trailing, shorter windows included via ``trail=True``).  Each window
    is expanded into its sub-grams, and every sub-gram is yielded as two
    parallel tuples: one of stems, one of the original tokens.
    """
    pairs = self._stem_tokens(self.tokenize(self.document))
    for window in windowed(pairs, size=self.max_gram_length, trail=True):
        for sub_gram in self._sub_grams(window):
            stems, tokens = zip(*sub_gram)
            yield (stems, tokens)
def usable(known, documents, is_usable=is_readable, gram_size=4, split_index=None):
    """Yield usable ``(input, correct_output)`` pairs mined from *documents*.

    Each document is parsed and scanned with a sliding window of
    ``gram_size`` items.  Every gram is split at ``split_index``
    (default ``gram_size - 1``) into an input prefix and an
    expected-output suffix; both halves are mapped through
    ``_known_or_unkown`` against *known* and yielded only when each
    side satisfies *is_usable*.
    """
    if split_index is None:
        split_index = gram_size - 1
    # Rebuild *known* as an identity mapping so lookups canonicalize
    # values to the instance stored here.
    known = {item: item for item in known}
    for document in documents:
        for gram in windowed(Parsed(document), gram_size):
            prefix = tuple(_known_or_unkown(known, gram[:split_index]))
            suffix = tuple(_known_or_unkown(known, gram[split_index:]))
            if is_usable(prefix) and is_usable(suffix):
                yield (prefix, suffix)