# Example #1
# 0
    def __init__(self, num_tweets=False, debug=False, time=False):
        """Wire up the Mongo collections, build the tweet cursor, and run.

        num_tweets -- optional cap on how many tweets to fetch; any falsy
                      value means "no explicit cap"
        debug      -- debug mode; forces a 5000-tweet sample
        time       -- timing mode; also forces a 5000-tweet sample

        NOTE(review): when debug or time is set, the 5000 limit overwrites
        any num_tweets value — confirm that precedence is intended.
        """
        self.collection = db.twictionary_models_tweets
        self.tres = db.twictionary_models_tres

        # Strip fields we never read; timeout=False keeps the server-side
        # cursor alive for the (potentially long) full scan.
        query_opts = dict(fields=dict(date=0, tid=0, user=0), timeout=False)
        if num_tweets:
            query_opts['limit'] = num_tweets
        if debug or time:
            # Debug/timing runs operate on a fixed-size sample.
            query_opts['limit'] = 5000

        self.tweets = self.collection.find(**query_opts)
        self.persister = Persister(debug=debug)
        self.debug = debug
        self.time = time
        self.tokenizer = TwitterTokenizer()

        # Processing starts immediately on construction.
        self.run()
    def normalize_mentions(self, s, repl_func='@MENTION'):
        """Replace each @-mention in *s* with *repl_func*.

        *repl_func* is handed straight to re.sub, so despite its name it
        may be a plain replacement string (the default) or a callable.
        """
        normalized = MENTION_REGEX.sub(repl_func, s)
        return normalized

    def normalize_repeated_chars(self, s):
        """Collapse long runs of a repeated character to three repeats
        (e.g. 'sooooo' -> 'sooo'), per REPEATED_CHAR_REGEX."""
        collapsed = REPEATED_CHAR_REGEX.sub(r'\1\1\1', s)
        return collapsed


if __name__ == '__main__':

    # Demo of function-composition notation via reduce:
    # prints "h(g(f(x)))" for the chain f, g, h applied to x.
    print reduce(lambda x, y: '{0}({1})'.format(y, x), ['f', 'g', 'h'], 'x')

    # Constructing the preprocessor kicks off its run() immediately
    # (see TwitterTextPreprocessor.__init__).
    preprocessor = TwitterTextPreprocessor()

    from tokenizer import TwitterTokenizer

    # NOTE(review): `tok` is constructed but never used below — presumably
    # left over from manual experimentation; confirm before removing.
    tok = TwitterTokenizer()

    from pymongo import MongoClient

    # Connect to the local MongoDB instance with default host/port.
    client = MongoClient()
    db = client.twitter_database
    db_labeled_tweets = db.labeled_tweets

    # Dump the id and raw text of every labeled tweet that has a text field.
    for tweet in db_labeled_tweets.find({u'text': {'$exists': True}}):
        text = tweet.get(u'text')
        print tweet.get(u'_id')
        print text
        # print decode_html_entities(text)
        # print normalize_urls(text)
        # print normalize_repeated_chars(text)
        # print normalize_mentions(text)