# Example 1
    def restart(self, soul):
        """Rebuild unpicklable state from *soul* and start the worker thread.

        Locks and threads do not survive pickling, so this re-creates
        self.work_lock and self._thread; it is called both from initial
        construction and after loading a saved brain.  Starting the thread
        is deliberately the last step, after all state it reads is set up.
        """
        self.quote_engine_only = soul.quote_engine_only
        self.work_lock = threading.Lock()
        self._shutdown = False
        self._thread = threading.Thread(target=self.__phrase_worker)
        if self.quote_engine_only:
            # Quote mode: no HMM phrase generator; the pool is filled with
            # the soul's own tweets verbatim.
            self.voice = None
            # Pool size equal to the easter-egg seed count means the quotes
            # have not been loaded yet (fresh brain, not a reloaded one).
            if len(self.pending_tweets.texts) == len(easter_eggs.xkcd):
                self.pending_goal = len(soul.tagged_tweets)
                self.low_watermark = 0
                # Copy soul.tagged_tweets into pending_tweets
                print "Loading quotes into query engine.."
                for tweet in soul.tagged_tweets:
                    self.pending_tweets.add_text(SearchableText(tweet))
                    # Progress report every 100 quotes.
                    if len(self.pending_tweets.texts) % 100 == 0:
                        print "Loaded quote #" + str(len(self.pending_tweets.texts)) + "/" + str(
                            len(soul.tagged_tweets)
                        )
                print "Loaded quotes into query engine."
                self.pending_tweets.update_matrix()
                # Persist immediately so the load only happens once.
                BrainReader.write(self, "target_user.brain")
        else:
            # Pool goal: corpus size times the configured multiplier, capped
            # at tweet_pool_max; if the multiplier is <= 0, just use the cap.
            if config.getfloat("brain", "tweet_pool_multiplier") > 0:
                self.pending_goal = min(
                    len(soul.tagged_tweets) * config.getfloat("brain", "tweet_pool_multiplier"),
                    config.getint("brain", "tweet_pool_max"),
                )
            else:
                self.pending_goal = config.getint("brain", "tweet_pool_max")

            self.low_watermark = self.pending_goal - 85  # FIXME: config?
            self.voice = PhraseGenerator(
                soul.tagged_tweets,
                soul.normalizer,
                config.getint("brain", "hmm_context"),
                config.getint("brain", "hmm_offset"),
            )
            if self.pending_tweets.needs_update:
                self.pending_tweets.update_matrix()

        self._thread.start()
# Example 2
class TwitterBrain:
    def __init__(self, soul):
        # Need an ordered list of vocab words for SearchableTextCollection.
        # If it vocab changes, we fail.
        import easter_eggs

        for t in easter_eggs.xkcd:
            soul.vocab.update(t.word_info.iterkeys())
        self.pending_tweets = SearchableTextCollection(soul.vocab)
        for t in easter_eggs.xkcd:
            self.pending_tweets.add_text(t)
        self.already_tweeted = []
        self.remove_tweets = []
        for t in soul.tagged_tweets:
            words = map(lambda x: x[0], t)
            self.already_tweeted.append(set(words))
        self.conversation_contexts = {}
        self.raw_normalizer = TokenNormalizer()
        self.last_vect = None
        self.restart(soul)  # Must come last!

    def restart(self, soul):
        self.quote_engine_only = soul.quote_engine_only
        self.work_lock = threading.Lock()
        self._shutdown = False
        self._thread = threading.Thread(target=self.__phrase_worker)
        if self.quote_engine_only:
            self.voice = None
            if len(self.pending_tweets.texts) == len(easter_eggs.xkcd):
                self.pending_goal = len(soul.tagged_tweets)
                self.low_watermark = 0
                # Copy soul.tagged_tweets into pending_tweets
                print "Loading quotes into query engine.."
                for tweet in soul.tagged_tweets:
                    self.pending_tweets.add_text(SearchableText(tweet))
                    if len(self.pending_tweets.texts) % 100 == 0:
                        print "Loaded quote #" + str(len(self.pending_tweets.texts)) + "/" + str(
                            len(soul.tagged_tweets)
                        )
                print "Loaded quotes into query engine."
                self.pending_tweets.update_matrix()
                BrainReader.write(self, "target_user.brain")
        else:
            if config.getfloat("brain", "tweet_pool_multiplier") > 0:
                self.pending_goal = min(
                    len(soul.tagged_tweets) * config.getfloat("brain", "tweet_pool_multiplier"),
                    config.getint("brain", "tweet_pool_max"),
                )
            else:
                self.pending_goal = config.getint("brain", "tweet_pool_max")

            self.low_watermark = self.pending_goal - 85  # FIXME: config?
            self.voice = PhraseGenerator(
                soul.tagged_tweets,
                soul.normalizer,
                config.getint("brain", "hmm_context"),
                config.getint("brain", "hmm_offset"),
            )
            if self.pending_tweets.needs_update:
                self.pending_tweets.update_matrix()

        self._thread.start()

    # TODO: We could normalize for tense agreement... might be a bad idea
    # though.
    # http://nodebox.net/code/index.php/Linguistics
    # en.is_verb() with en.verb.tense() and en.verb.conjugate()
    def get_tweet(self, msger=None, query_string=None, followed=False):
        self.__lock()
        if msger and msger not in self.conversation_contexts:
            self.conversation_contexts[msger] = ConversationContext(msger)
        max_len = config.getint("brain", "tweet_len")
        if query_string and msger:
            # XXX: nltk.pos_tag doesn't do so well if the first word in a question
            # is capitalized. Should we add an option to the normalizer for this?
            query_string = word_detokenize(
                self.conversation_contexts[msger].normalizer.normalize_tokens(word_tokenize(query_string))
            )
            query_string = PronounInverter.invert_all(query_string)

            print "Normalized Inverted Query: " + query_string
            query_text = SearchableText(query_string, strip=True)
            curr_vect = self.pending_tweets.score_query(query_text)

            if followed:
                qvect = curr_vect
            else:
                if self.last_vect != None:
                    self.conversation_contexts[msger].prime_memory(self.last_vect)

                qvect = self.conversation_contexts[msger].decay_query(curr_vect, query_text)

            max_len -= len("@" + msger + " ")
            (score, last_vect, ret) = self.pending_tweets.vector_query(
                qvect, exclude=self.remove_tweets, max_len=max_len
            )
            if followed:
                min_score = config.getfloat("query", "min_follow_reply_score")
            else:
                min_score = config.getfloat("query", "min_msg_reply_score")

            if score >= min_score:
                self.last_vect = last_vect
            else:
                print "Minimum score of " + str(min_score) + " not met: " + str(score)
                print str(ret.tagged_tokens)
                print "Not responding with: " + ret.text
                return None
            if followed:
                # If this was a followed tweet, we should now record that it made
                # us say something.
                self.conversation_contexts[msger].decay_query(curr_vect, query_text)

            # Remember the last thing we said.
            self.conversation_contexts[msger].remember_query(self.last_vect)
        else:
            # query should be None here
            if query_string:
                query_text = SearchableText(query_string, strip=True)
            else:
                query_text = None
            (score, self.last_vect, ret) = self.pending_tweets.text_query(
                query_text, exclude=self.remove_tweets, max_len=max_len
            )
        self.remove_tweets.append(ret)
        tokens = ret.tokens()
        self.already_tweeted.append(set(tokens))
        self.__unlock()
        print str(ret.tagged_tokens)
        if msger:
            return "@" + msger + " " + ret.text
        else:
            return ret.text

    def __did_already_tweet(self, words, max_score=config.getfloat("brain", "max_shared_word_ratio")):
        for t in self.already_tweeted:
            score = float(len(t & words))
            score1 = score / len(t)
            score2 = score / len(words)
            if score1 > max_score or score2 > max_score:
                # print "Too similar to old tweet.. skipping: "+\
                #         str(score1)+"/"+str(score2)
                return True

        # TODO: This maybe should be a function of the SearchableTextCollection
        for text in self.pending_tweets.texts:
            tokens = set(text.tokens())
            score = float(len(tokens & words))
            score1 = score / len(tokens)
            score2 = score / len(words)
            if score1 > max_score or score2 > max_score:
                # print "Too similar to pending tweet.. skipping: "+\
                #         str(score1)+"/"+str(score2)
                return True
        return False

    def __phrase_worker(self):
        try:
            self.__phrase_worker2()
        except:
            print "Worker thread died."
            traceback.print_exc()
        print "Worker thread quit."
        sys.exit(0)

    # Needed to avoid the race condition on pickling..
    def __lock(self):
        lock = self.work_lock
        while not lock:
            lock = self.work_lock
            time.sleep(1)
        lock.acquire()

    def __unlock(self):
        self.work_lock.release()

    # TODO: Maybe this whole tweet pool model is the wrong way to go.
    # We could try influencing the HMM's probabilities directly using a query,
    # but I'm not sure how to do that and not get utter nonsense.
    def __phrase_worker2(self):
        first_run = True
        while not self._shutdown:
            added_tweets = False
            if len(self.remove_tweets) > 90:  # FIXME: config?
                self.__lock()
                while len(self.remove_tweets) > 0:
                    self.pending_tweets.remove_text(self.remove_tweets.pop())
                added_tweets = True
                self.__unlock()

            if self.quote_engine_only:
                if added_tweets:
                    self.__lock()
                    self.pending_tweets.update_matrix()
                    self.__unlock()
                    first_run = False
                    BrainReader.write(self, "target_user.brain")
                time.sleep(2)
                continue

            # Need low watermark. Maybe goal-100?
            if len(self.pending_tweets.texts) <= self.low_watermark:
                while len(self.pending_tweets.texts) < self.pending_goal:
                    self.__lock()
                    (tweet, tokens, tagged_tokens) = self.voice.say_something()

                    if len(tweet) > config.getint("brain", "tweet_len") or self.__did_already_tweet(set(tokens)):
                        self.__unlock()
                        continue

                    self.pending_tweets.add_text(SearchableText(tweet, tokens, tagged_tokens))
                    print "At tweet count " + str(len(self.pending_tweets.texts)) + "/" + str(self.pending_goal)
                    added_tweets = True
                    self.__unlock()

                    if len(self.pending_tweets.texts) % 100 == 0:  # FIXME: config?
                        break  # Perform other work

            time.sleep(2)

            if len(self.pending_tweets.texts) == self.pending_goal and added_tweets:
                print "At full tweet count " + str(self.pending_goal)
                self.__lock()
                self.pending_tweets.update_matrix()
                self.__unlock()
                first_run = False
                BrainReader.write(self, "target_user.brain")
            elif added_tweets:
                self.__lock()
                print "At tweet count " + str(len(self.pending_tweets.texts)) + "/" + str(self.pending_goal)
                self.pending_tweets.update_matrix()
                self.__unlock()
                if (len(self.pending_tweets.texts) % config.getint("brain", "save_brain_every")) == 0:
                    BrainReader.write(self, "target_user.brain")
            time.sleep(2)
# Example 3
    print "Loaded soul file."
  except KeyError:
    soul = pickle.load(gzip.GzipFile("target_user.soul", "r"))
    print "Loaded soul file."
  except IOError:
    print "No soul file found. Regenerating."
    soul = CorpusSoul('target_user')
    if config.getboolean("soul", "gzip_soul"):
      pickle.dump(soul, gzip.GzipFile("target_user.soul", "w"))
    else:
      pickle.dump(soul, open("target_user.soul", "w"))
  except Exception,e:
    traceback.print_exc()

  soul.normalizer.verify_scores()

  voice = PhraseGenerator(soul.tagged_tweets, soul.normalizer,
                          config.getint("brain","hmm_context"),
                          config.getint("brain","hmm_offset"))

  while True:
    query = raw_input("> ")
    if not query: query = "41"
    if query.isdigit():
      (str_result, tok_result, result) = voice.say_something()
      print str(result)
      print str_result

# Standard entry-point guard: run main() only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
  main()