# Example #1
# 0
    def __init__(self, num_tweets=False, debug=False, time=False):
        """Wire up the Mongo collections, build the tweet cursor, and run.

        num_tweets -- optional cap on how many tweets to fetch; any falsy
                      value means "no explicit cap"
        debug      -- debug mode; forces a 5000-tweet sample
        time       -- timing mode; also forces a 5000-tweet sample

        NOTE(review): when debug or time is set, the 5000 limit overwrites
        any num_tweets value — confirm that precedence is intended.
        """
        self.collection = db.twictionary_models_tweets
        self.tres = db.twictionary_models_tres

        # Strip fields we never read; timeout=False keeps the server-side
        # cursor alive for the (potentially long) full scan.
        query_opts = dict(fields=dict(date=0, tid=0, user=0), timeout=False)
        if num_tweets:
            query_opts['limit'] = num_tweets
        if debug or time:
            # Debug/timing runs operate on a fixed-size sample.
            query_opts['limit'] = 5000

        self.tweets = self.collection.find(**query_opts)
        self.persister = Persister(debug=debug)
        self.debug = debug
        self.time = time
        self.tokenizer = TwitterTokenizer()

        # Processing starts immediately on construction.
        self.run()
    def normalize_mentions(self, s, repl_func='@MENTION'):
        """Replace each @-mention in *s* with *repl_func*.

        *repl_func* is handed straight to re.sub, so despite its name it
        may be a plain replacement string (the default) or a callable.
        """
        normalized = MENTION_REGEX.sub(repl_func, s)
        return normalized

    def normalize_repeated_chars(self, s):
        """Collapse long runs of a repeated character to three repeats
        (e.g. 'sooooo' -> 'sooo'), per REPEATED_CHAR_REGEX."""
        collapsed = REPEATED_CHAR_REGEX.sub(r'\1\1\1', s)
        return collapsed


if __name__ == '__main__':

    # Demo of function-composition notation via reduce:
    # prints "h(g(f(x)))" for the chain f, g, h applied to x.
    print reduce(lambda x, y: '{0}({1})'.format(y, x), ['f', 'g', 'h'], 'x')

    # Constructing the preprocessor kicks off its run() immediately
    # (see TwitterTextPreprocessor.__init__).
    preprocessor = TwitterTextPreprocessor()

    from tokenizer import TwitterTokenizer

    # NOTE(review): `tok` is constructed but never used below — presumably
    # left over from manual experimentation; confirm before removing.
    tok = TwitterTokenizer()

    from pymongo import MongoClient

    # Connect to the local MongoDB instance with default host/port.
    client = MongoClient()
    db = client.twitter_database
    db_labeled_tweets = db.labeled_tweets

    # Dump the id and raw text of every labeled tweet that has a text field.
    for tweet in db_labeled_tweets.find({u'text': {'$exists': True}}):
        text = tweet.get(u'text')
        print tweet.get(u'_id')
        print text
        # print decode_html_entities(text)
        # print normalize_urls(text)
        # print normalize_repeated_chars(text)
        # print normalize_mentions(text)