def process_tweet(tweet_in):
    """Annotate a tweet dict in place and return it.

    Adds entity counts, sorted lower-cased hashtag/mention lists, and
    expanded URLs (via the Expand_Url cache).  Tweets without an
    'entities' key are returned unchanged.

    NOTE(review): assumes *tweet_in* is a dict in the Twitter API
    entity format -- confirm against the caller.
    """
    punct = re.escape("!\"$%&'()*+,-./:;<=>?@[\\]^`{|}~")
    expander = Expand_Url(db_name="url_test")
    tweet = tweet_in
    # 'in' instead of deprecated dict.has_key (identical on Python 2,
    # and forward-compatible with Python 3)
    if "entities" in tweet:
        entities = tweet["entities"]
        # Insert counts of each entity type
        tweet["counts"] = {
            "urls": len(entities["urls"]),
            "hashtags": len(entities["hashtags"]),
            "user_mentions": len(entities["user_mentions"]),
        }
        # Insert sorted, lower-cased lists of hashtags and mentions
        # (sorted(...) replaces the append-loop + .sort() of the original)
        tweet["hashtags"] = sorted(h["text"].lower() for h in entities["hashtags"])
        tweet["mentions"] = sorted(m["screen_name"].lower() for m in entities["user_mentions"])
        # begin url expansion
        for url_entity in entities["urls"]:
            ourl = url_entity["expanded_url"]
            # if the expanded_url field is empty, try expanding the
            # 'url' field instead
            if ourl is None:
                ourl = url_entity["url"]
            if ourl:
                try:
                    expanded = expander.check_cache(ourl)
                    url_entity.update(expanded)
                # Catch any exceptions related to URL or expanding errors
                # and make sure we record why
                # except (URLError, APIError, UnicodeWarning, UnicodeError) as e:
                #     url_entity['expansion_error'] = e.msg
                # this catches errors which seem to emanate from unicode errors;
                # this should be checked on occasion to ensure it really is
                # a unicode error
                except KeyError:
                    url_entity["expansion_error"] = "Possible Unicode Error"
        # end url expansion
        # Track rule matches (disabled -- no track_set in this variant)
        # tweet['track_kw'] = {}
        # tweet['track_kw']['hashtags'] = list(set(tweet['hashtags']).intersection(track_set))
        # tweet['track_kw']['mentions'] = list(set(tweet['mentions']).intersection(track_set))
        tweet_text = re.sub("[%s]" % punct, " ", tweet["text"])
        tweet_text = tweet_text.lower().split()
        # tweet['track_kw']['text'] = list(set(tweet_text).intersection(track_set))
    # Convert dates (disabled)
    # tweet['created_ts'] = to_datetime(tweet['created_at'])
    # tweet['user']['created_ts'] = to_datetime(tweet['user']['created_at'])
    # Print tweet as JSON to stdout (disabled)
    # print tweet['text'], tweet['entities']['urls']
    # result = simplejson.dumps(tweet)
    return tweet
def process_tweet(tweet_in): track_list = ['boston','marathon','bomb','blast','explosion','watertown','mit','mitshooting'] # Turn it into a set track_set = set(track_list) punct = re.escape('!"$%&\'()*+,-./:;<=>?@[\\]^`{|}~') expander = Expand_Url(db_name=config_info.cache_db) try: tweet = simplejson.loads(tweet_in) if not tweet.has_key("info"): #print " [x] accepted tweet ID %s" % tweet['id'] if tweet.has_key("entities"): # Insert Counts tweet['counts'] = { 'urls': len(tweet['entities']['urls']), 'hashtags': len(tweet['entities']['hashtags']), 'user_mentions': len(tweet['entities']['user_mentions']) }; tweet['hashtags'] = [] tweet['mentions'] = [] # Insert list of hashtags and mentions for index in range(len(tweet['entities']['hashtags'])): tweet['hashtags'].append(tweet['entities']['hashtags'][index]['text'].lower()) for index in range(len(tweet['entities']['user_mentions'])): tweet['mentions'].append(tweet['entities']['user_mentions'][index]['screen_name'].lower()) tweet['hashtags'].sort() tweet['mentions'].sort() # begin url expansion for index in range(len(tweet['entities']['urls'])): ourl = tweet['entities']['urls'][index]['expanded_url'] if ourl != None: try: expanded = expander.check_cache(ourl) tweet['entities']['urls'][index].update(expanded) # Catch any exceptions related to URL or expanding errors # and make sure we record why #except (URLError, APIError, UnicodeWarning, UnicodeError) as e: # tweet['entities']['urls'][index]['expansion_error'] = e.msg; # this catches errors which seem to emanate from unicode errors # this should be checked on occasion to ensure it really is a unicode error except KeyError as e: tweet['entities']['urls'][index]['error'] = "Possible Unicode Error"; # if the expanded_url field is empty, try expanding the 'url' field instead else: ourl = tweet['entities']['urls'][index]['url'] try: expanded = expander.check_cache(ourl) tweet['entities']['urls'][index].update(expanded) # Catch any exceptions related to URL or expanding 
errors # and make sure we record why #except (URLError, APIError, UnicodeWarning, UnicodeError) as e: # tweet['entities']['urls'][index]['expansion_error'] = e.msg; # this catches errors which seem to emanate from unicode errors # this should be checked on occasion to ensure it really is a unicode error except KeyError as e: tweet['entities']['urls'][index]['error'] = "Possible Unicode Error"; # end url expansion # Track rule matches tweet['track_kw'] = {} tweet['track_kw']['hashtags'] = list(set(tweet['hashtags']).intersection(track_set)) tweet['track_kw']['mentions'] = list(set(tweet['mentions']).intersection(track_set)) tweet_text = re.sub('[%s]' % punct, ' ', tweet['text']) tweet_text = tweet_text.lower().split() tweet['track_kw']['text'] = list(set(tweet_text).intersection(track_set)) # Convert dates # Print tweet as JSON to stdout #print tweet['text'],tweet['entities']['urls'] result = simplejson.dumps(tweet) print " [x] processed tweet ID %s" % tweet['id'] return result else: print " [x] processed %s tweets" % tweet['info']['activity_count'] except ValueError as e: print ' [x] %s, %s' % (e,tweet_in) return '%s, %s' % (e,tweet_in)
from expand_url import Expand_Url URLs = ['http://www.ebay.com', 'http://somelab.net/foo', 'http://uw.edu/foo','http://seattle.somelab.net/test.txt', 'http://somelab.net'] test = Expand_Url(db_name='url_test') for x in URLs: print test.check_cache(x)
import sys
sys.path.append('.')
import simplejson
import re
import time
from datetime import datetime, timedelta
from email.utils import parsedate_tz
#from some_url_expander import URLError
#from some_url_expander import APIError
from expand_url import Expand_Url
from urlparse import urlsplit

# who is expanding urls on our server
expander = Expand_Url(db_name='url_test')

# List of punct to remove from string for track keyword matching
punct = re.escape('!"$%&\'()*+,-./:;<=>?@[\\]^`{|}~')

# List of words we are tracking (Occupy-movement keyword set)
track_list = ["15o","15oct","99percent","acampadamataro","acampvalladolid","acampvalladolid","frankietease","ioccupy","ioccupyoccupyashland","k8_revolution","lakajo97","occopywmpt","occuponsmontrea","occupy","occupyaarhus","occupyabilene","occupyadelaide","occupyafrica","occupyafrica1","occupyakron","occupyalbany","occupyalbanyny1","occupyallentown","occupyamsterdam","occupyanchorage","occupyannarbor","occupyappleton","occupyarcata","occupyarizona","occupyarkansas","occupyashland","occupyashlandky","occupyaspen","occupyastoria","occupyathens","occupyathensga","occupyatl","occupyatlanta","occupyatlanticcity","occupyatlcity","occupyauburn","occupyaugusta","occupyaurora","occupyaustin","occupyb0ulder","occupybaltimore","occupybhgrove","occupybkny","occupyboise","occupyboulder","occupyboulderco","occupybrisbane","occupybrussels","occupybucharest","occupybuffalo","occupycarsoncty","occupycc","occupycha","occupychi","occupychicago","occupychucktown","occupycincinnati","occupycincy","occupyclarksvil","occupycleveland","occupycolumbia","occupycosprings","occupycu","occupycville","occupydallas","occupydc","occupydelaware","occupydenhaag","occupydenmark","occupyearth","occupyeugene","occupyflorida","occupyfm","occupyfortmyers","occupyftcollins","occupygtown","occupyhardford","occupyhartford","occupyhouston","occupyhsv","occupyhumboldt","occupyindy","occupyisu","occupyitaly","occupyjax","occupykeene","occupykelowna","occupykingston","occupyla","occupylansing","occupylasvegas","occupylausd","occupylondon","occupylsx","occupymadison99","occupymartnsbrg","occupymemphis","occupymia","occupymilwaukee","occupymn","occupymontrea","occupynashville","occupynewportor","occupynj","occupyns","occupyobise","occupyokc","occupyomaha","occupyorlando","occupyorlandofl","occupyottawa","occupypei","occupyphoenix","occupyportland","occupyprov","occupyquebec","occupyraleigh","occupyredlands","occupyrichmond","occupyroanokeva","occupyrockford","occupysacto","occupysalem","occupysananto","occupysanjose","occupysantacruz","occupysarasota","occupysarasotaoccupysanjose","occupysaskatoon","occupysb","occupysd","occupyseattle","occupysenhaag","occupyslc","occupysr","occupystaugust","occupystl","occupytampa","occupythemedia","occupytoronto","occupyueg","occupyukiah","occupyvermont","occupyvictoria","occupywallst","occupywallstnyc","occupywallstreet","occupywinnipeg","occupywmpt","occupywv","occupyyakima","occupyyeg","occupyyork","occupy_albanyny","occupy_okc","occupy_ottawa","occypyftcollins","ows","owslosangeles","owsspacecoast","perversmas","quimbanda","storydoula","tokumtorgin","nov5","5nov","bofa","cabincr3w","nov2","2nov","generalstrike","oct29","29oct","nov17","17nov","occupypics","usdor","occupydenver","needsoftheoccupiers","wearethe99","occupyoakland","occupyboston","occupy_boston","oo","53percent","1percent","banktransferday","moveyourmoney","louderthanwords","rebuilddream","acorn","n17","17n","d21","12d","occupyarrests","n30","30n","nov30","strike","occupytheport"]

# Turn it into a set for O(1) membership tests
track_set = set(track_list)

# Parse Twitter created_at datestring and turn it into a datetime
def to_datetime(datestring):
    """Parse an RFC-2822 style date string into a naive UTC datetime.

    Returns None when the string cannot be parsed (previously this
    raised TypeError).  The original implementation also discarded the
    timezone offset parsed by parsedate_tz; Twitter always emits
    '+0000' so results are unchanged for Twitter data, but any other
    offset is now correctly normalized to UTC.
    """
    time_tuple = parsedate_tz(datestring.strip())
    if time_tuple is None:
        return None
    dt = datetime(*time_tuple[:6])
    # parsedate_tz puts the UTC offset (in seconds, or None) in slot 9;
    # subtracting it converts local time to UTC
    offset = time_tuple[9]
    if offset:
        dt -= timedelta(seconds=offset)
    return dt
def process_tweet(tweet_in): track_list = ["boston", "marathon", "bomb", "blast", "explosion", "watertown", "mit", "mitshooting"] # Turn it into a set track_set = set(track_list) punct = re.escape("!\"$%&'()*+,-./:;<=>?@[\\]^`{|}~") expander = Expand_Url(db_name="url_cache") try: tweet = simplejson.loads(tweet_in) if not tweet.has_key("info"): # print " [x] accepted tweet ID %s" % tweet['id'] if tweet.has_key("entities"): # Insert Counts tweet["counts"] = { "urls": len(tweet["entities"]["urls"]), "hashtags": len(tweet["entities"]["hashtags"]), "user_mentions": len(tweet["entities"]["user_mentions"]), } tweet["hashtags"] = [] tweet["mentions"] = [] # Insert list of hashtags and mentions for index in range(len(tweet["entities"]["hashtags"])): tweet["hashtags"].append(tweet["entities"]["hashtags"][index]["text"].lower()) for index in range(len(tweet["entities"]["user_mentions"])): tweet["mentions"].append(tweet["entities"]["user_mentions"][index]["screen_name"].lower()) tweet["hashtags"].sort() tweet["mentions"].sort() # begin url expansion for index in range(len(tweet["entities"]["urls"])): ourl = tweet["entities"]["urls"][index]["expanded_url"] # if the expanded_url field is empty, try expanding the 'url' field instead if ourl is None: ourl = tweet["entities"]["urls"][index]["url"] if ourl: try: expanded = expander.check_cache(ourl) tweet["entities"]["urls"][index].update(expanded) # Catch any exceptions related to URL or expanding errors # and make sure we record why # except (URLError, APIError, UnicodeWarning, UnicodeError) as e: # tweet['entities']['urls'][index]['expansion_error'] = e.msg; # this catches errors which seem to emanate from unicode errors # this should be checked on occasion to ensure it really is a unicode error except KeyError as e: tweet["entities"]["urls"][index]["expansion_error"] = "Possible Unicode Error" # end url expansion # Track rule matches tweet["track_kw"] = {} tweet["track_kw"]["hashtags"] = 
list(set(tweet["hashtags"]).intersection(track_set)) tweet["track_kw"]["mentions"] = list(set(tweet["mentions"]).intersection(track_set)) tweet_text = re.sub("[%s]" % punct, " ", tweet["text"]) tweet_text = tweet_text.lower().split() tweet["track_kw"]["text"] = list(set(tweet_text).intersection(track_set)) # Convert dates # Print tweet as JSON to stdout # print tweet['text'],tweet['entities']['urls'] result = simplejson.dumps(tweet) return result # print " [x] processed tweet ID %s" % tweet['id'] else: print " [x] processed %s tweets" % tweet["info"]["activity_count"] except ValueError as e: print " [x] %s, %s" % (e, tweet_in) return "%s, %s" % (e, tweet_in)