Ejemplo n.º 1
0
def process_tweet(tweet_in):
    """Annotate a tweet dict in place with entity counts, tags and expanded URLs.

    When the tweet has an ``"entities"`` key, the following keys are added:

    * ``"counts"``   -- number of urls, hashtags and user mentions.
    * ``"hashtags"`` -- sorted, lower-cased hashtag texts.
    * ``"mentions"`` -- sorted, lower-cased mentioned screen names.

    Every URL entity is additionally updated with whatever
    ``Expand_Url.check_cache`` returns for its ``expanded_url`` (falling back
    to ``url`` when ``expanded_url`` is ``None``).  A ``KeyError`` raised
    during expansion is recorded on the entity under ``"expansion_error"``
    instead of propagating.

    Args:
        tweet_in: Tweet as a dict.  It is modified in place.

    Returns:
        The same dict object that was passed in.  Tweets without an
        ``"entities"`` key are returned unchanged.
    """
    tweet = tweet_in
    if "entities" not in tweet:
        return tweet

    entities = tweet["entities"]

    # Insert counts
    tweet["counts"] = {
        "urls": len(entities["urls"]),
        "hashtags": len(entities["hashtags"]),
        "user_mentions": len(entities["user_mentions"]),
    }

    # Hashtags and mentions are collected independently of each other, so a
    # tweet without hashtags still gets its mentions and nothing is duplicated.
    tweet["hashtags"] = sorted(tag["text"].lower() for tag in entities["hashtags"])
    tweet["mentions"] = sorted(
        mention["screen_name"].lower() for mention in entities["user_mentions"]
    )

    # begin url expansion
    # The expander is only constructed once there is actually a URL to expand.
    expander = None
    for url_entity in entities["urls"]:
        ourl = url_entity["expanded_url"]

        # if the expanded_url field is empty, try expanding the 'url' field instead
        if ourl is None:
            ourl = url_entity["url"]
        if not ourl:
            continue

        if expander is None:
            expander = Expand_Url(db_name="url_test")
        try:
            url_entity.update(expander.check_cache(ourl))
        except KeyError:
            # These errors seem to emanate from unicode problems; this should be
            # checked on occasion to ensure it really is a unicode error.
            url_entity["expansion_error"] = "Possible Unicode Error"
    # end url expansion

    return tweet
Ejemplo n.º 2
0
def process_tweet(tweet_in):
    """Parse a JSON-encoded tweet, annotate it and return it re-encoded as JSON.

    For a tweet that has an ``"entities"`` key the following keys are added
    before re-encoding:

    * ``"counts"``   -- number of urls, hashtags and user mentions.
    * ``"hashtags"`` -- sorted, lower-cased hashtag texts.
    * ``"mentions"`` -- sorted, lower-cased mentioned screen names.
    * ``"track_kw"`` -- tracked keywords found among the hashtags, the
      mentions and the punctuation-stripped, lower-cased words of the text.

    Every URL entity is updated with whatever ``Expand_Url.check_cache``
    returns for its ``expanded_url`` (or for ``url`` when ``expanded_url`` is
    ``None``); a ``KeyError`` during expansion is recorded on the entity under
    ``"error"``.

    Args:
        tweet_in: JSON text of a single message.

    Returns:
        The JSON string of the (possibly annotated) tweet.  ``None`` when the
        message carries an ``"info"`` key, in which case only its activity
        count is printed.  If a ``ValueError`` is raised while processing
        (e.g. the input is not valid JSON), the error is printed and the
        string ``"<error>, <tweet_in>"`` is returned.
    """
    track_list = ['boston', 'marathon', 'bomb', 'blast', 'explosion', 'watertown', 'mit', 'mitshooting']
    # Turn it into a set
    track_set = set(track_list)
    punct = re.escape('!"$%&\'()*+,-./:;<=>?@[\\]^`{|}~')
    try:
        tweet = simplejson.loads(tweet_in)

        if "info" in tweet:
            print(" [x] processed %s tweets" % tweet['info']['activity_count'])
            return None

        if "entities" in tweet:
            entities = tweet['entities']

            # Insert counts
            tweet['counts'] = {
                'urls': len(entities['urls']),
                'hashtags': len(entities['hashtags']),
                'user_mentions': len(entities['user_mentions']),
            }

            # Hashtags and mentions are collected independently of each other,
            # so mentions are neither skipped nor duplicated per hashtag.
            tweet['hashtags'] = sorted(tag['text'].lower() for tag in entities['hashtags'])
            tweet['mentions'] = sorted(
                mention['screen_name'].lower() for mention in entities['user_mentions']
            )

            # begin url expansion
            # The expander is only constructed once there is a URL to expand.
            expander = None
            for url_entity in entities['urls']:
                ourl = url_entity['expanded_url']
                # if the expanded_url field is empty, try expanding the 'url' field instead
                if ourl is None:
                    ourl = url_entity['url']

                if expander is None:
                    expander = Expand_Url(db_name=config_info.cache_db)
                try:
                    url_entity.update(expander.check_cache(ourl))
                except KeyError:
                    # These errors seem to emanate from unicode problems; this
                    # should be checked on occasion to ensure it really is one.
                    url_entity['error'] = "Possible Unicode Error"
            # end url expansion

            # Track rule matches
            tweet_text = re.sub('[%s]' % punct, ' ', tweet['text'])
            tweet_text = tweet_text.lower().split()
            tweet['track_kw'] = {}
            tweet['track_kw']['hashtags'] = list(set(tweet['hashtags']).intersection(track_set))
            tweet['track_kw']['mentions'] = list(set(tweet['mentions']).intersection(track_set))
            tweet['track_kw']['text'] = list(set(tweet_text).intersection(track_set))

        result = simplejson.dumps(tweet)
        print(" [x] processed tweet ID %s" % tweet['id'])
        return result

    except ValueError as e:
        print(' [x] %s, %s' % (e, tweet_in))
        return '%s, %s' % (e, tweet_in)
Ejemplo n.º 3
0
from expand_url import Expand_Url

# Sample URLs to run through the expander.
URLs = [
    'http://www.ebay.com',
    'http://somelab.net/foo',
    'http://uw.edu/foo',
    'http://seattle.somelab.net/test.txt',
    'http://somelab.net',
]

# Expander constructed with db_name='url_test'.
test = Expand_Url(db_name='url_test')

# Print whatever check_cache returns for each sample URL, one per line.
for url in URLs:
    print(test.check_cache(url))
Ejemplo n.º 4
0
			tweet['hashtags'].sort()
			tweet['mentions'].sort()

                        # begin url expansion
			for index in range(len(tweet['entities']['urls'])):
				ourl = tweet['entities']['urls'][index]['expanded_url']

                                # if the expanded_url field is empty, try expanding the 'url' field instead
				if ourl is None:
					ourl = tweet['entities']['urls'][index]['url']
                                        print ourl
				if ourl:
                                        print ourl
					try:
						expanded = expander.check_cache(ourl)
						tweet['entities']['urls'][index].update(expanded)

					# Catch any exceptions related to URL or expanding errors
                                        # and make sure we record why
					#except (URLError, APIError, UnicodeWarning, UnicodeError) as e:
					#	tweet['entities']['urls'][index]['expansion_error'] = e.msg;
                                        # this catches errors which seem to emanate from unicode errors
                                        # this should be checked on occasion to ensure it really is a unicode error
					except KeyError as e:
						tweet['entities']['urls'][index]['expansion_error'] = "Possible Unicode Error";
                        # end url expansion

                        # Track rule matches
                        tweet['track_kw'] = {}
                        tweet['track_kw']['hashtags'] = list(set(tweet['hashtags']).intersection(track_set))
Ejemplo n.º 5
0
def process_tweet(tweet_in):
    """Parse a JSON-encoded tweet, annotate it and return it re-encoded as JSON.

    For a tweet that has an ``"entities"`` key the following keys are added
    before re-encoding:

    * ``"counts"``   -- number of urls, hashtags and user mentions.
    * ``"hashtags"`` -- sorted, lower-cased hashtag texts.
    * ``"mentions"`` -- sorted, lower-cased mentioned screen names.
    * ``"track_kw"`` -- tracked keywords found among the hashtags, the
      mentions and the punctuation-stripped, lower-cased words of the text.

    Every URL entity with a non-empty ``expanded_url`` (or ``url`` when
    ``expanded_url`` is ``None``) is updated with whatever
    ``Expand_Url.check_cache`` returns for it; a ``KeyError`` during expansion
    is recorded on the entity under ``"expansion_error"``.

    Args:
        tweet_in: JSON text of a single message.

    Returns:
        The JSON string of the (possibly annotated) tweet.  ``None`` when the
        message carries an ``"info"`` key, in which case only its activity
        count is printed.  If a ``ValueError`` is raised while processing
        (e.g. the input is not valid JSON), the error is printed and the
        string ``"<error>, <tweet_in>"`` is returned.
    """
    track_list = ["boston", "marathon", "bomb", "blast", "explosion", "watertown", "mit", "mitshooting"]
    # Turn it into a set
    track_set = set(track_list)
    punct = re.escape("!\"$%&'()*+,-./:;<=>?@[\\]^`{|}~")
    try:
        tweet = simplejson.loads(tweet_in)

        if "info" in tweet:
            print(" [x] processed %s tweets" % tweet["info"]["activity_count"])
            return None

        if "entities" in tweet:
            entities = tweet["entities"]

            # Insert counts
            tweet["counts"] = {
                "urls": len(entities["urls"]),
                "hashtags": len(entities["hashtags"]),
                "user_mentions": len(entities["user_mentions"]),
            }

            # Hashtags and mentions are collected independently of each other,
            # so mentions are neither skipped nor duplicated per hashtag.
            tweet["hashtags"] = sorted(tag["text"].lower() for tag in entities["hashtags"])
            tweet["mentions"] = sorted(
                mention["screen_name"].lower() for mention in entities["user_mentions"]
            )

            # begin url expansion
            # The expander is only constructed once there is a URL to expand.
            expander = None
            for url_entity in entities["urls"]:
                ourl = url_entity["expanded_url"]

                # if the expanded_url field is empty, try expanding the 'url' field instead
                if ourl is None:
                    ourl = url_entity["url"]
                if not ourl:
                    continue

                if expander is None:
                    expander = Expand_Url(db_name="url_cache")
                try:
                    url_entity.update(expander.check_cache(ourl))
                except KeyError:
                    # These errors seem to emanate from unicode problems; this
                    # should be checked on occasion to ensure it really is one.
                    url_entity["expansion_error"] = "Possible Unicode Error"
            # end url expansion

            # Track rule matches -- computed once per tweet, whether or not
            # the tweet contains any URL.
            tweet_text = re.sub("[%s]" % punct, " ", tweet["text"])
            tweet_text = tweet_text.lower().split()
            tweet["track_kw"] = {}
            tweet["track_kw"]["hashtags"] = list(set(tweet["hashtags"]).intersection(track_set))
            tweet["track_kw"]["mentions"] = list(set(tweet["mentions"]).intersection(track_set))
            tweet["track_kw"]["text"] = list(set(tweet_text).intersection(track_set))

        result = simplejson.dumps(tweet)
        return result

    except ValueError as e:
        print(" [x] %s, %s" % (e, tweet_in))
        return "%s, %s" % (e, tweet_in)