def __init__(self):
    # raw counts: (w1, w2) word pair -> {trailing word: frequency}
    self.__words = {}
    # compiled form built by compile():
    # (w1, w2) pair -> (total frequency, [(word, frequency), ...])
    self.__words_compiled = {}
    # cardinality counters for phrase start and end word pairs
    self.__heads, self.__tails = KeyCounter(), KeyCounter()
    # compiled list of (word pair, cardinality) tuples
    self.__heads_compiled = []
def test_ingest_correct_length(self): speaker = self.__new_speaker() speaker.ingest(_text) speaker.compile() run_count = 100 min_length = 70 max_length = 130 errors = KeyCounter() for i in xrange(run_count): try: result = speaker.speak(min_length, max_length) if len(result) < min_length: errors.increment("short") elif len(result) > max_length: errors.increment("long") except Exception, e: errors.increment(e)
def find_artifact_counts_newer(cls, datetime, **kw):
    """Returns a dict mapping artifact source_name to the number of
    artifacts newer than the given datetime."""
    tally = KeyCounter()
    for artifact in ArtifactInfo.find_newer(datetime, **kw):
        tally.increment(artifact.source_name)
    return tally.to_hash()
class Markov2Speaker(SelectingSpeaker):
    """
    2nd order Markov chain. mostly congruent to Markov1Speaker.
    """

    def __init__(self):
        # keys are word 2-tuples, values are dicts mapping trailing
        # words to frequency
        self.__words = {}
        # keys are word 2-tuples, values are tuples of
        # (frequency, list of (word, frequency) tuples)
        self.__words_compiled = {}
        # maps phrase start and end words to cardinalities
        self.__heads = KeyCounter()
        self.__tails = KeyCounter()
        # list of word-cardinality tuples
        self.__heads_compiled = []

    def ingest(self, phrase):
        """Tokenizes phrase into sentences and accumulates 2nd-order
        transition counts for each sentence."""
        for sentence in tokenize_sentences(phrase, 50, lowercase=True):
            phrase_words = sentence.split()
            phrase_words.append(Symbols.END)
            phrase_len = len(phrase_words)
            # phrases under 3 are of no use to a 2nd-order chain.
            # BUG FIX: was 'return', which also discarded every remaining
            # sentence of the phrase; 'continue' skips only this one.
            if phrase_len < 3:
                continue
            # grabs first 2 words of phrase
            self.__heads.increment((phrase_words[0], phrase_words[1]))
            for i in range(phrase_len - 2):
                w_pair = (phrase_words[i], phrase_words[i + 1])
                w3 = phrase_words[i + 2]
                # count w3 as a successor of the (w1, w2) pair
                trailing_words = self.__words.setdefault(w_pair, {})
                trailing_words[w3] = trailing_words.get(w3, 0) + 1

    def compile(self):
        """converts word grid into more efficient structure"""
        self.__words_compiled.clear()
        del self.__heads_compiled[0:]
        for w_pair, suffix_counts in self.__words.iteritems():
            trailing_words = []
            w_pair_count = 0
            for w, count in suffix_counts.iteritems():
                trailing_words.append((w, count))
                w_pair_count += count
            self.__words_compiled[w_pair] = (w_pair_count, trailing_words)
        # builds list of (word 2-tuple, frequency) tuples
        self.__word_weights = []
        for w_pair, p in self.__words_compiled.iteritems():
            self.__word_weights.append((w_pair, p[0]))
        for pair, count in self.__heads.iteritems():
            self.__heads_compiled.append((pair, count))
        # logs all head phrases with cardinality > 1
        # logging.debug("heads: %s" % filter(lambda p: p[1] > 1,
        #     sorted(self.__heads.items(), key=lambda p: p[1], reverse=True)))

    def select(self, selected, min_length, max_length):
        """
        params:
            selected - list of selected words.
        return: a tuple of words, or None to end the phrase
        raises: MissingDataException if nothing has been ingested
        """
        if not self.__words:
            raise MissingDataException("no satisfactory content has been ingested")
        elif not self.__words_compiled:
            self.compile()

        # select a trailing word via weighted random
        def select_next(current_pair):
            pair_stats = self.__words_compiled.get(current_pair, None)
            next_word = None
            if pair_stats:
                # ends if we're past the min length and have END as a
                # potential next word.
                # BUG FIX: 'Symbols.END in pair_stats' tested membership in
                # the (count, list) tuple, never matching a trailing word;
                # check the ingested suffix dict instead.
                if (calculate_length(selected) > min_length) and \
                        Symbols.END in self.__words.get(current_pair, {}):
                    next_word = None
                else:
                    # don't end if we're not longer than min_length
                    logging.debug("select_next %s %s" % (current_pair, str(pair_stats)))
                    # stats were found for this pair
                    # next_word = rrandom.select_weighted_with_replacement(pair_stats[1])
                    next_word = random.choice(pair_stats[1])[0]
                    # BUG FIX: was 'is Symbols.END' (identity test); use
                    # equality so an equal-but-distinct object also matches
                    if next_word == Symbols.END:
                        next_word = None
            return (next_word,) if next_word else None

        # select first pair via weighted random
        if not selected:
            # renamed from 'next' to avoid shadowing the builtin
            #next_words = (rrandom.select_weighted_with_replacement(self.__word_weights))
            next_words = (rrandom.select_weighted_with_replacement(self.__heads_compiled))
        else:
            # select using last 2 words as params
            next_words = select_next((selected[-2], selected[-1]))
        return next_words

    def describe(self):
        """Returns a human-readable dump of the transition table,
        trailing words sorted by descending frequency."""
        parts = []
        for w_pair, suffix_count in sorted(self.__words.iteritems()):
            parts.append("%s %s\n" % w_pair)
            for w_tail, count in sorted(suffix_count.iteritems(), key=lambda p: p[1], reverse=True):
                parts.append(" %s: %d\n" % (w_tail, count))
        return "".join(parts)
class Markov2Speaker(SelectingSpeaker):
    """
    2nd order Markov chain. mostly congruent to Markov1Speaker.
    """

    def __init__(self):
        # keys are word 2-tuples, values are dicts mapping trailing
        # words to frequency
        self.__words = {}
        # keys are word 2-tuples, values are tuples of
        # (frequency, list of (word, frequency) tuples)
        self.__words_compiled = {}
        # maps phrase start and end words to cardinalities
        self.__heads = KeyCounter()
        self.__tails = KeyCounter()
        # list of word-cardinality tuples
        self.__heads_compiled = []

    def ingest(self, phrase):
        """Tokenizes phrase into sentences and accumulates 2nd-order
        transition counts for each sentence."""
        for sentence in tokenize_sentences(phrase, 50, lowercase=True):
            phrase_words = sentence.split()
            phrase_words.append(Symbols.END)
            phrase_len = len(phrase_words)
            # phrases under 3 are of no use to a 2nd-order chain.
            # BUG FIX: was 'return', which also discarded every remaining
            # sentence of the phrase; 'continue' skips only this one.
            if phrase_len < 3:
                continue
            # grabs first 2 words of phrase
            self.__heads.increment((phrase_words[0], phrase_words[1]))
            for i in range(phrase_len - 2):
                w_pair = (phrase_words[i], phrase_words[i + 1])
                w3 = phrase_words[i + 2]
                # count w3 as a successor of the (w1, w2) pair
                trailing_words = self.__words.setdefault(w_pair, {})
                trailing_words[w3] = trailing_words.get(w3, 0) + 1

    def compile(self):
        """converts word grid into more efficient structure"""
        self.__words_compiled.clear()
        del self.__heads_compiled[0:]
        for w_pair, suffix_counts in self.__words.iteritems():
            trailing_words = []
            w_pair_count = 0
            for w, count in suffix_counts.iteritems():
                trailing_words.append((w, count))
                w_pair_count += count
            self.__words_compiled[w_pair] = (w_pair_count, trailing_words)
        # builds list of (word 2-tuple, frequency) tuples
        self.__word_weights = []
        for w_pair, p in self.__words_compiled.iteritems():
            self.__word_weights.append((w_pair, p[0]))
        for pair, count in self.__heads.iteritems():
            self.__heads_compiled.append((pair, count))
        # logs all head phrases with cardinality > 1
        # logging.debug("heads: %s" % filter(lambda p: p[1] > 1,
        #     sorted(self.__heads.items(), key=lambda p: p[1], reverse=True)))

    def select(self, selected, min_length, max_length):
        """
        params:
            selected - list of selected words.
        return: a tuple of words, or None to end the phrase
        raises: MissingDataException if nothing has been ingested
        """
        if not self.__words:
            raise MissingDataException(
                "no satisfactory content has been ingested")
        elif not self.__words_compiled:
            self.compile()

        # select a trailing word via weighted random
        def select_next(current_pair):
            pair_stats = self.__words_compiled.get(current_pair, None)
            next_word = None
            if pair_stats:
                # ends if we're past the min length and have END as a
                # potential next word.
                # BUG FIX: 'Symbols.END in pair_stats' tested membership in
                # the (count, list) tuple, never matching a trailing word;
                # check the ingested suffix dict instead.
                if (calculate_length(selected) > min_length) and \
                        Symbols.END in self.__words.get(current_pair, {}):
                    next_word = None
                else:
                    # don't end if we're not longer than min_length
                    logging.debug("select_next %s %s" %
                                  (current_pair, str(pair_stats)))
                    # stats were found for this pair
                    # next_word = rrandom.select_weighted_with_replacement(pair_stats[1])
                    next_word = random.choice(pair_stats[1])[0]
                    # BUG FIX: was 'is Symbols.END' (identity test); use
                    # equality so an equal-but-distinct object also matches
                    if next_word == Symbols.END:
                        next_word = None
            return (next_word, ) if next_word else None

        # select first pair via weighted random
        if not selected:
            # renamed from 'next' to avoid shadowing the builtin
            #next_words = (rrandom.select_weighted_with_replacement(self.__word_weights))
            next_words = (rrandom.select_weighted_with_replacement(
                self.__heads_compiled))
        else:
            # select using last 2 words as params
            next_words = select_next((selected[-2], selected[-1]))
        return next_words

    def describe(self):
        """Returns a human-readable dump of the transition table,
        trailing words sorted by descending frequency."""
        parts = []
        for w_pair, suffix_count in sorted(self.__words.iteritems()):
            parts.append("%s %s\n" % w_pair)
            for w_tail, count in sorted(suffix_count.iteritems(),
                                        key=lambda p: p[1], reverse=True):
                parts.append(" %s: %d\n" % (w_tail, count))
        return "".join(parts)