    def test_init(self):
        # Don't specify any ngram orders, which should get trigrams
        # and bigrams stored.
        model = self.model

        self.assertEqual((3, 2, 1), model.orders)

        # And make sure n=5 yields 5-grams and 4-grams
        model = Model(self.analyzer, self.store, n=5)

        self.assertEqual((5, 4, 3, 2, 1), model.orders)
    def test_load_tokens(self):
        # Ensure that model.tokens is properly reloaded from the
        # database when an old Model is loaded
        model = self.model

        model.train(u"this is a test")
        model.train(u"this is another test")

        # We save on train(), so make sure the new tokens log is empty.
        self.assertEqual(0, len(model.tokens.token_log))

        save_token_ids = dict(model.tokens.token_ids)
        save_tokens = dict(model.tokens.tokens)

        model = Model(self.analyzer, self.store)

        self.assertEqual(save_token_ids, model.tokens.token_ids)
        self.assertEqual(save_tokens, model.tokens.tokens)
import itertools
import logging
import math

import park

# These classes and helpers come from elsewhere in this codebase; the
# module paths below are assumptions, not confirmed by the snippets here.
from cobe.analysis import StandardAnalyzer
from cobe.model import Model
from cobe.search import RandomWalkSearcher
from cobe.utils import itime

log = logging.getLogger(__name__)


class Brain(object):
    """A simplified, cobe 2.x style interface.

    This behaves roughly like cobe 2.x with an English stemmer for now;
    more flexibility will come as the API is fleshed out.

    It generates replies with a random walk across the language model
    and scores candidate replies by entropy, with a penalty for
    too-long replies.

    """
    def __init__(self, filename):
        self.analyzer = StandardAnalyzer()

        store = park.SQLiteStore(filename)

        self.model = Model(self.analyzer, store)
        self.searcher = RandomWalkSearcher(self.model)

    def reply(self, text):
        # Create a search query from the input
        query = self.analyzer.query(text, self.model)

        # Track (and don't re-score) replies that have already been
        # seen. These are expected when using a random walk searcher,
        # but they're also useful when debugging searches.
        seen = set()

        join = self.analyzer.join
        entropy = self.model.entropy

        def score(reply):
            joined = join(reply)

            if joined in seen:
                return -1.0, joined

            seen.add(joined)

            n_tokens = len(reply)

            # Penalize longer replies (cobe 2.x compatibility): a sqrt
            # penalty above 16 tokens, linear above 32. Check the larger
            # threshold first so both branches are reachable.
            penalty = 1.0
            if n_tokens > 32:
                penalty = n_tokens
            elif n_tokens > 16:
                penalty = math.sqrt(n_tokens)

            return entropy(joined) / penalty, joined

        # This search is a generator; it doesn't start evaluating
        # until read.
        search = itime(self.searcher.search(query), 0.5)

        # Generate and score the search results.
        results = sorted(itertools.imap(score, search))

        if log.isEnabledFor(logging.DEBUG):
            for score, text in results:
                log.debug("%.4f %s", score, text)

            log.debug("made %d replies (%d unique)", len(results),
                      len(seen))

        # The highest scoring reply sorts last.
        score, reply = results[-1]
        return reply

    def train(self, text):
        return self.model.train(text)

    def train_many(self, text_gen):
        return self.model.train_many(text_gen)
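# A minimal usage sketch of the Brain interface above, not part of the
# original code. The filename and training text here are made up for
# illustration; only train_many() and reply() come from the class
# itself. reply() returns the highest-entropy candidate found within
# the search's time budget.
if __name__ == "__main__":
    brain = Brain("cobe.store")

    brain.train_many([u"this is a test", u"this is another test"])

    print brain.reply(u"test")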
    def setUp(self):
        self.analyzer = WhitespaceAnalyzer()
        self.store = park.SQLiteStore(":memory:")
        self.model = Model(self.analyzer, self.store)
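    # A small sketch of an additional test built on this fixture, not
    # part of the original suite: train() saves immediately, so the
    # in-memory token log should be empty afterward (mirroring the
    # assertion in test_load_tokens above).
    def test_train_flushes_token_log(self):
        model = self.model

        model.train(u"this is a test")

        self.assertEqual(0, len(model.tokens.token_log))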