Example #1
0
class Anagramer(object):
    """
    Anagramer hunts for anagrams on twitter.
    """

    def __init__(self):
        self.twitter_handler = TwitterHandler()
        self.stream_handler = StreamHandler()
        self.stats = AnagramStats()
        self.data = None  # wait until we get run call to load data
        # self.time_to_save = self.set_save_time()

    def run(self, source=None):
        """
        starts the program's main run-loop
        """
        self.data = DataHandler(delegate=self)
        if not source:
            while 1:
                try:
                    if not self.data:
                        self.data = DataHandler()
                    if not self.stats:
                        self.stats = AnagramStats()
                    if not self.stream_handler:
                        self.stream_handler = StreamHandler()
                    logging.info('entering run loop')
                    self.start_stream()
                except KeyboardInterrupt:
                    break
                except NeedsSave:
                    print('\nclosing stream for scheduled maintenance')
                    # todo: this is where we'd handle pruning etc
                finally:
                    self.stream_handler.close()
                    self.stream_handler = None
                    self.data.finish()
                    self.data = None
                    self.stats.close()
                    self.stats = None                 
        else:
            # means we're running from local data
            self.run_with_data(source)

    def start_stream(self):
        """
        main run loop
        """
        self.stats.start_time = time.time()
        self.stream_handler.start()
        for tweet in self.stream_handler:
            self.update_console()
            self.process_input(tweet)

    def run_with_data(self, data):
        """
        uses a supplied data source instead of a twitter connection (debug)
        """
        self.stats.start_time = time.time()
        self.stream_handler.start(source=data)
        # for tweet in data:
        #     self.process_input(tweet)
        #     # time.sleep(0.0001)
        #     self.stats.tweets_seen += 1
        #     self.stats.passed_filter += 1
        #     self.update_console()

        logging.debug('hits %g matches %g' % (self.stats.possible_hits, self.stats.hits))
        self.data.finish()

    def process_input(self, hashed_tweet):
        self.stats.new_hash(hashed_tweet['hash'])
        self.data.process_tweet(hashed_tweet)

    def process_hit(self, tweet_one, tweet_two):
        """
        called by datahandler when it has found a match in need of review.
        """
        self.stats.possible_hits += 1
        self.stats.new_hit(tweet_one['hash'])
        if self.compare(tweet_one['text'], tweet_two['text']):
            hit = {
                "id": int(time.time()*1000),
                "status": HIT_STATUS_REVIEW,
                "tweet_one": tweet_one,
                "tweet_two": tweet_two,
            }
            self.data.remove(tweet_one['hash'])
            self.data.add_hit(hit)
            self.stats.hits += 1
        else:
            pass

    def compare(self, tweet_one, tweet_two):
        """
        most basic test, finds if tweets are just identical
        """
        if not self.compare_chars(tweet_one, tweet_two):
            return False
        if not self.compare_words(tweet_one, tweet_two):
            return False
        return True

    def compare_chars(self, tweet_one, tweet_two, cutoff=0.5):
        """
        basic test, looks for similarity on a char by char basis
        """
        stripped_one = utils.stripped_string(tweet_one)
        stripped_two = utils.stripped_string(tweet_two)

        total_chars = len(stripped_two)
        same_chars = 0
        for i in range(total_chars):
            if stripped_one[i] == stripped_two[i]:
                same_chars += 1

        if (float(same_chars) / total_chars) < cutoff:
            return True
        return False

    def compare_words(self, tweet_one, tweet_two, cutoff=0.5):
        """
        looks for tweets containing the same words in different orders
        """
        words_one = utils.stripped_string(tweet_one, spaces=True).split()
        words_two = utils.stripped_string(tweet_two, spaces=True).split()

        word_count = len(words_one)
        if len(words_two) < len(words_one):
            word_count = len(words_two)

        same_words = 0
        # compare words to each other:
        for word in words_one:
            if word in words_two:
                same_words += 1
        # if more then $CUTOFF words are the same, fail test
        if (float(same_words) / word_count) < cutoff:
            return True
        else:
            return False

    def check_save(self):
        """check if it's time to save and save if necessary"""
        if (time.time() > self.time_to_save):
            self.time_to_save = self.set_save_time()
            raise NeedsSave

# displaying data while we run:
    def update_console(self):
        """
        prints various bits of status information to the console.
        """
        # what all do we want to have, here? let's blueprint:
        # tweets seen: $IN_HAS_TEXT passed filter: $PASSED_F% Hits: $HITS
        seen_percent = int(100*(float(
            self.stream_handler.passed_filter)/self.stream_handler.tweets_seen))
        runtime = time.time()-self.stats.start_time

        status = (
            'tweets seen: ' + str(self.stream_handler.tweets_seen) +
            " passed filter: " + str(self.stream_handler.passed_filter) +
            " ({0}%)".format(seen_percent) +
            " hits " + str(self.stats.possible_hits) +
            " agrams: " + str(self.stats.hits) +
            " buffer: " + str(self.stream_handler.bufferlength()) +
            " runtime: " + utils.format_seconds(runtime)
        )
        sys.stdout.write(status + '\r')
        sys.stdout.flush()

    def print_hits(self):
        hits = self.data.get_all_hits()
        for hit in hits:
            print(hit['tweet_one']['text'], hit['tweet_one']['id'])
            print(hit['tweet_two']['text'], hit['tweet_two']['id'])