def restart(self, soul):
    """Re-initialize the non-picklable runtime state of the brain.

    Rebuilds the work lock and the background phrase-worker thread, then
    either loads the corpus quotes into the query engine (quote-engine-only
    mode) or configures the tweet-pool goals and constructs the HMM-backed
    PhraseGenerator voice (generative mode).  Starts the worker thread last.

    NOTE(review): `soul` is assumed to expose quote_engine_only,
    tagged_tweets, normalizer and vocab -- confirm against CorpusSoul.
    """
    self.quote_engine_only = soul.quote_engine_only
    # Fresh lock/thread on every (re)start: these objects are not picklable,
    # so they must be rebuilt after a brain is loaded from disk.
    self.work_lock = threading.Lock()
    self._shutdown = False
    self._thread = threading.Thread(target=self.__phrase_worker)
    if self.quote_engine_only:
        # Quote mode: no generative voice; the pool is the corpus itself.
        self.voice = None
        # Only the seed (easter-egg) texts are present => first run; if the
        # pool is larger, the quotes were already loaded before pickling.
        if len(self.pending_tweets.texts) == len(easter_eggs.xkcd):
            self.pending_goal = len(soul.tagged_tweets)
            self.low_watermark = 0
            # Copy soul.tagged_tweets into pending_tweets
            print "Loading quotes into query engine.."
            for tweet in soul.tagged_tweets:
                self.pending_tweets.add_text(SearchableText(tweet))
                # Progress indicator every 100 quotes.
                if len(self.pending_tweets.texts) % 100 == 0:
                    print "Loaded quote #" + str(len(self.pending_tweets.texts)) + "/" + str(
                        len(soul.tagged_tweets)
                    )
            print "Loaded quotes into query engine."
            self.pending_tweets.update_matrix()
            # Persist immediately so the expensive load is not repeated.
            BrainReader.write(self, "target_user.brain")
    else:
        # Generative mode: size the pending-tweet pool from config.
        # NOTE(review): getfloat() makes pending_goal a float on this
        # branch; worker code compares it with == against len() -- verify.
        if config.getfloat("brain", "tweet_pool_multiplier") > 0:
            self.pending_goal = min(
                len(soul.tagged_tweets) * config.getfloat("brain", "tweet_pool_multiplier"),
                config.getint("brain", "tweet_pool_max"),
            )
        else:
            self.pending_goal = config.getint("brain", "tweet_pool_max")
        self.low_watermark = self.pending_goal - 85  # FIXME: config?
        self.voice = PhraseGenerator(
            soul.tagged_tweets,
            soul.normalizer,
            config.getint("brain", "hmm_context"),
            config.getint("brain", "hmm_offset"),
        )
    if self.pending_tweets.needs_update:
        self.pending_tweets.update_matrix()
    # Must come last: the worker thread uses all of the state set up above.
    self._thread.start()
class TwitterBrain:
    """Maintains a searchable pool of candidate tweets for a target user.

    In quote-engine-only mode the pool is the user's own corpus; otherwise
    a background worker thread keeps the pool filled with HMM-generated
    phrases.  get_tweet() answers queries/replies from the pool.
    Access to shared state is serialized through a private work lock.
    """

    def __init__(self, soul):
        # Need an ordered list of vocab words for SearchableTextCollection.
        # If it vocab changes, we fail.
        import easter_eggs
        # NOTE(review): this import binds `easter_eggs` locally only, yet
        # restart() also references `easter_eggs` -- presumably it is also
        # imported at module level; confirm.
        for t in easter_eggs.xkcd:
            soul.vocab.update(t.word_info.iterkeys())
        self.pending_tweets = SearchableTextCollection(soul.vocab)
        # Seed the searchable pool with the easter-egg texts themselves.
        for t in easter_eggs.xkcd:
            self.pending_tweets.add_text(t)
        self.already_tweeted = []
        self.remove_tweets = []
        # Record the word-set of every corpus tweet so generated phrases
        # that overlap too heavily can be rejected later.
        for t in soul.tagged_tweets:
            words = map(lambda x: x[0], t)
            self.already_tweeted.append(set(words))
        self.conversation_contexts = {}
        self.raw_normalizer = TokenNormalizer()
        self.last_vect = None
        self.restart(soul)  # Must come last!

    def restart(self, soul):
        """Re-initialize the non-picklable runtime state of the brain.

        Rebuilds the work lock and worker thread, loads quotes (quote mode)
        or constructs the PhraseGenerator voice (generative mode), then
        starts the worker thread last.
        """
        self.quote_engine_only = soul.quote_engine_only
        # Fresh lock/thread on every (re)start: not picklable, so they must
        # be rebuilt after a brain is loaded from disk.
        self.work_lock = threading.Lock()
        self._shutdown = False
        self._thread = threading.Thread(target=self.__phrase_worker)
        if self.quote_engine_only:
            # Quote mode: no generative voice; the pool is the corpus.
            self.voice = None
            # Only the seed (easter-egg) texts present => first run.
            if len(self.pending_tweets.texts) == len(easter_eggs.xkcd):
                self.pending_goal = len(soul.tagged_tweets)
                self.low_watermark = 0
                # Copy soul.tagged_tweets into pending_tweets
                print "Loading quotes into query engine.."
                for tweet in soul.tagged_tweets:
                    self.pending_tweets.add_text(SearchableText(tweet))
                    # Progress indicator every 100 quotes.
                    if len(self.pending_tweets.texts) % 100 == 0:
                        print "Loaded quote #" + str(len(self.pending_tweets.texts)) + "/" + str(
                            len(soul.tagged_tweets)
                        )
                print "Loaded quotes into query engine."
                self.pending_tweets.update_matrix()
                # Persist so the expensive load is not repeated.
                BrainReader.write(self, "target_user.brain")
        else:
            # Generative mode: size the pending-tweet pool from config.
            if config.getfloat("brain", "tweet_pool_multiplier") > 0:
                self.pending_goal = min(
                    len(soul.tagged_tweets) * config.getfloat("brain", "tweet_pool_multiplier"),
                    config.getint("brain", "tweet_pool_max"),
                )
            else:
                self.pending_goal = config.getint("brain", "tweet_pool_max")
            self.low_watermark = self.pending_goal - 85  # FIXME: config?
            self.voice = PhraseGenerator(
                soul.tagged_tweets,
                soul.normalizer,
                config.getint("brain", "hmm_context"),
                config.getint("brain", "hmm_offset"),
            )
        if self.pending_tweets.needs_update:
            self.pending_tweets.update_matrix()
        # Must come last: the worker uses all of the state set up above.
        self._thread.start()

    # TODO: We could normalize for tense agreement... might be a bad idea
    # though.
    # http://nodebox.net/code/index.php/Linguistics
    # en.is_verb() with en.verb.tense() and en.verb.conjugate()
    def get_tweet(self, msger=None, query_string=None, followed=False):
        """Pick a tweet from the pool, optionally as a reply.

        msger        -- screen name being replied to (adds "@msger " prefix
                        and per-conversation context), or None.
        query_string -- text to match against the pool, or None for a
                        context-free pick.
        followed     -- True when responding to a followed user's tweet
                        (uses a different minimum-score threshold).
        Returns the tweet text, or None if no candidate scored high enough.
        """
        self.__lock()
        if msger and msger not in self.conversation_contexts:
            self.conversation_contexts[msger] = ConversationContext(msger)
        max_len = config.getint("brain", "tweet_len")
        if query_string and msger:
            # XXX: nltk.pos_tag doesn't do so well if the first word in a question
            # is capitalized. Should we add an option to the normalizer for this?
            query_string = word_detokenize(
                self.conversation_contexts[msger].normalizer.normalize_tokens(word_tokenize(query_string))
            )
            # Swap first/second-person pronouns so the query reads from our
            # point of view.
            query_string = PronounInverter.invert_all(query_string)
            print "Normalized Inverted Query: " + query_string
            query_text = SearchableText(query_string, strip=True)
            curr_vect = self.pending_tweets.score_query(query_text)
            if followed:
                qvect = curr_vect
            else:
                if self.last_vect != None:
                    self.conversation_contexts[msger].prime_memory(self.last_vect)
                qvect = self.conversation_contexts[msger].decay_query(curr_vect, query_text)
                # Reserve room for the "@msger " reply prefix.
                max_len -= len("@" + msger + " ")
            (score, last_vect, ret) = self.pending_tweets.vector_query(
                qvect, exclude=self.remove_tweets, max_len=max_len
            )
            if followed:
                min_score = config.getfloat("query", "min_follow_reply_score")
            else:
                min_score = config.getfloat("query", "min_msg_reply_score")
            if score >= min_score:
                self.last_vect = last_vect
            else:
                print "Minimum score of " + str(min_score) + " not met: " + str(score)
                print str(ret.tagged_tokens)
                print "Not responding with: " + ret.text
                # NOTE(review): returns while still holding work_lock
                # (__unlock() is never called on this path) -- this would
                # stall the worker thread. Confirm and fix.
                return None
            if followed:
                # If this was a followed tweet, we should now record that it made
                # us say something.
                self.conversation_contexts[msger].decay_query(curr_vect, query_text)
            # Remember the last thing we said.
            self.conversation_contexts[msger].remember_query(self.last_vect)
        else:
            # query should be None here
            if query_string:
                query_text = SearchableText(query_string, strip=True)
            else:
                query_text = None
            (score, self.last_vect, ret) = self.pending_tweets.text_query(
                query_text, exclude=self.remove_tweets, max_len=max_len
            )
        # Queue the chosen tweet for removal from the pool and remember its
        # word-set so near-duplicates are rejected in the future.
        self.remove_tweets.append(ret)
        tokens = ret.tokens()
        self.already_tweeted.append(set(tokens))
        self.__unlock()
        print str(ret.tagged_tokens)
        if msger:
            return "@" + msger + " " + ret.text
        else:
            return ret.text

    def __did_already_tweet(self, words, max_score=config.getfloat("brain", "max_shared_word_ratio")):
        """Return True if `words` (a set) overlaps a past or pending tweet
        by more than max_score in either direction.

        NOTE(review): the max_score default is evaluated once, at class
        definition time -- later config changes are not picked up.
        """
        for t in self.already_tweeted:
            # Shared-word count, normalized by each side's size.
            score = float(len(t & words))
            score1 = score / len(t)
            score2 = score / len(words)
            if score1 > max_score or score2 > max_score:
                # print "Too similar to old tweet.. skipping: "+\
                #        str(score1)+"/"+str(score2)
                return True
        # TODO: This maybe should be a function of the SearchableTextCollection
        for text in self.pending_tweets.texts:
            tokens = set(text.tokens())
            score = float(len(tokens & words))
            score1 = score / len(tokens)
            score2 = score / len(words)
            if score1 > max_score or score2 > max_score:
                # print "Too similar to pending tweet.. skipping: "+\
                #        str(score1)+"/"+str(score2)
                return True
        return False

    def __phrase_worker(self):
        """Thread entry point: run the worker loop, logging any crash."""
        try:
            self.__phrase_worker2()
        except:
            # NOTE(review): bare except also catches SystemExit and
            # KeyboardInterrupt; consider `except Exception`.
            print "Worker thread died."
            traceback.print_exc()
        print "Worker thread quit."
        # In a thread, sys.exit() just raises SystemExit to end this thread.
        sys.exit(0)

    # Needed to avoid the race condition on pickling..
    def __lock(self):
        # Spin until work_lock is available, then acquire it.
        # NOTE(review): a threading.Lock instance is always truthy, so this
        # loop only runs if work_lock is None/unset (e.g. mid-restore) --
        # confirm that is the pickling race being guarded against.
        lock = self.work_lock
        while not lock:
            lock = self.work_lock
            time.sleep(1)
        lock.acquire()

    def __unlock(self):
        self.work_lock.release()

    # TODO: Maybe this whole tweet pool model is the wrong way to go.
    # We could try influencing the HMM's probabilities directly using a query,
    # but I'm not sure how to do that and not get utter nonsense.
    def __phrase_worker2(self):
        """Background loop: prune used tweets, refill the pool from the
        voice, refresh the query matrix, and checkpoint the brain to disk.
        Runs until self._shutdown is set."""
        # NOTE(review): first_run is written but never read -- dead state?
        first_run = True
        while not self._shutdown:
            added_tweets = False
            # Drain the removal queue in bulk once it gets large.
            if len(self.remove_tweets) > 90:  # FIXME: config?
                self.__lock()
                while len(self.remove_tweets) > 0:
                    self.pending_tweets.remove_text(self.remove_tweets.pop())
                    added_tweets = True
                self.__unlock()
            if self.quote_engine_only:
                # Quote mode: nothing to generate; just keep the matrix and
                # the on-disk brain in sync after removals.
                if added_tweets:
                    self.__lock()
                    self.pending_tweets.update_matrix()
                    self.__unlock()
                    first_run = False
                    BrainReader.write(self, "target_user.brain")
                time.sleep(2)
                continue
            # Need low watermark. Maybe goal-100?
            if len(self.pending_tweets.texts) <= self.low_watermark:
                while len(self.pending_tweets.texts) < self.pending_goal:
                    self.__lock()
                    (tweet, tokens, tagged_tokens) = self.voice.say_something()
                    # Skip over-length or too-similar candidates.
                    if len(tweet) > config.getint("brain", "tweet_len") or self.__did_already_tweet(set(tokens)):
                        self.__unlock()
                        continue
                    self.pending_tweets.add_text(SearchableText(tweet, tokens, tagged_tokens))
                    print "At tweet count " + str(len(self.pending_tweets.texts)) + "/" + str(self.pending_goal)
                    added_tweets = True
                    self.__unlock()
                    if len(self.pending_tweets.texts) % 100 == 0:  # FIXME: config?
                        break  # Perform other work
                    time.sleep(2)
            if len(self.pending_tweets.texts) == self.pending_goal and added_tweets:
                print "At full tweet count " + str(self.pending_goal)
                self.__lock()
                self.pending_tweets.update_matrix()
                self.__unlock()
                first_run = False
                BrainReader.write(self, "target_user.brain")
            elif added_tweets:
                self.__lock()
                print "At tweet count " + str(len(self.pending_tweets.texts)) + "/" + str(self.pending_goal)
                self.pending_tweets.update_matrix()
                self.__unlock()
                # Periodic checkpoint while still filling the pool.
                if (len(self.pending_tweets.texts) % config.getint("brain", "save_brain_every")) == 0:
                    BrainReader.write(self, "target_user.brain")
            time.sleep(2)
print "Loaded soul file." except KeyError: soul = pickle.load(gzip.GzipFile("target_user.soul", "r")) print "Loaded soul file." except IOError: print "No soul file found. Regenerating." soul = CorpusSoul('target_user') if config.getboolean("soul", "gzip_soul"): pickle.dump(soul, gzip.GzipFile("target_user.soul", "w")) else: pickle.dump(soul, open("target_user.soul", "w")) except Exception,e: traceback.print_exc() soul.normalizer.verify_scores() voice = PhraseGenerator(soul.tagged_tweets, soul.normalizer, config.getint("brain","hmm_context"), config.getint("brain","hmm_offset")) while True: query = raw_input("> ") if not query: query = "41" if query.isdigit(): (str_result, tok_result, result) = voice.say_something() print str(result) print str_result if __name__ == "__main__": main()