def learn_text(self, text, allow_new_words):
    """
    Count n-grams and add words to the auto-learn models.

    :param text: raw text to learn from
    :param allow_new_words: if False, tokens unknown to the persistent
                            models are dropped before learning

    Does nothing when no auto-learn models are configured.
    """
    if self.auto_learn_models:
        tokens, spans = pypredict.tokenize_text(text)

        # There are too many false positives with trailing
        # single quotes, remove them.
        # Do this here, because we still want "it's", etc. to
        # incrementally provide completions.
        for i, token in enumerate(tokens):
            if token.endswith("'"):
                token = token[:-1]
                if not token:  # shouldn't happen
                    token = "<unk>"
                tokens[i] = token

        # if requested, drop unknown words
        if allow_new_words:
            token_sections = [tokens]
        else:
            token_sections = self._drop_new_words(tokens, spans,
                                                  self.persistent_models)

        models = self._model_cache.get_models(self.auto_learn_models)
        for model in models:
            # Use a distinct loop variable: the original reused 'tokens'
            # here, silently clobbering the token list computed above.
            for section in token_sections:
                model.learn_tokens(section)

        # Lazy %-formatting: repr() is only built if INFO logging is on.
        _logger.info("learn_text: tokens=%r", token_sections)

        # debug: save all learned text for later parameter optimization
        if config.log_learn:
            fn = os.path.join(config.user_dir, "learned_text.txt")
            with open(fn, "a") as f:
                f.write(text + "\n")
def learn_scratch_text(self, text):
    """ Count n-grams and add words to the scratch models. """
    word_tokens, _spans = pypredict.tokenize_text(text)
    for scratch_model in self._model_cache.get_models(self.scratch_models):
        # print("scratch learn", scratch_model, word_tokens)
        scratch_model.learn_tokens(word_tokens, True)
def setUp(self):
    """ Build one error-free model file per n-gram order in a temp dir. """
    self._tmp_dir = tempfile.TemporaryDirectory(prefix="test_onboard_")
    self._dir = self._tmp_dir.name

    tokens, _spans = pypredict.tokenize_text(
        "word1 word2 word3 word4 word5 word6")

    # prepare contents of error-free models
    self._model_contents = []
    self._models = []
    for order in range(1, self.MAX_ORDER + 1):
        fn = os.path.join(self._dir, "order{}.lm".format(order))

        # order 1 gets a unigram model, everything else a dynamic one
        model = (pypredict.UnigramModel() if order == 1
                 else pypredict.DynamicModel(order))
        model.learn_tokens(tokens)
        model.save(fn)

        # keep the saved file's lines around for later comparison
        with open(fn, encoding="UTF-8") as f:
            lines = f.readlines()

        self._models.append(model)
        self._model_contents.append([fn, lines])
def learn_scratch_text(self, text):
    """ Count n-grams and add words to the scratch models. """
    tokens = pypredict.tokenize_text(text)[0]
    models = self._model_cache.get_models(self.scratch_models)
    for m in models:
        # print("scratch learn", m, tokens)
        m.learn_tokens(tokens, True)
def learn_text(self, text, allow_new_words):
    """
    Count n-grams and add words to the auto-learn models.

    :param text: raw text to learn from
    :param allow_new_words: passed through to learn_tokens(); controls
                            whether unknown words may be added

    Does nothing when no auto-learn models are configured.
    """
    if self.auto_learn_models:
        # spans isn't used here, only the token strings
        tokens, _spans = pypredict.tokenize_text(text)

        # Remove trailing single quote, too many false positives.
        # Do this here, because we still want "it's", etc. to
        # incrementally provide completions.
        for i, token in enumerate(tokens):
            if token.endswith("'"):
                token = token[:-1]
                if not token:  # shouldn't happen
                    token = "<unk>"
                tokens[i] = token

        models = self._model_cache.get_models(self.auto_learn_models)
        for model in models:
            model.learn_tokens(tokens, allow_new_words)

        # Lazy %-formatting: the repr of the slice is only built
        # when INFO logging is actually enabled.
        _logger.info("learn_text: tokens=%r", tokens[:10])

        # debug: save all learned text for later parameter optimization
        if config.log_learn:
            fn = os.path.join(config.user_dir, "learned_text.txt")
            with open(fn, "a") as f:
                f.write(text + "\n")
def tokenize_text(self, text):
    """
    Let the service find the words in text.

    Tokenizes locally via pypredict to avoid a D-Bus round-trip.
    The original guarded this with a constant 'if 1:', leaving the
    remote branch permanently unreachable; the dead code is removed
    and preserved here for reference:
        tokens, spans = self._call_method("tokenize_text", ([], []), text)

    :returns: (tokens, spans) tuple as produced by pypredict
    """
    tokens, spans = pypredict.tokenize_text(text)
    return tokens, spans
def tokenize_text(self, text):
    """ Let the service find the words in text. """
    # pypredict already returns the (tokens, spans) pair; pass it through
    return pypredict.tokenize_text(text)