def extract_tweet(t):
    """
    t is a dict representing a Twitter tweet as returned by the API
    returns: a dict for the tweet with filtered attributes

    One key is produced for each name in TWEET_FIELDS. Most names are copied
    straight from ``t``; the names handled explicitly below are derived:

    - user_id: ``t['user']['id']``
    - source: the text between ``>`` and ``</a>`` in ``t['source']``, or the
      raw value when that pattern is not found
    - created_at: ``t['created_at']`` passed through ``convert_timestamp``
    - text: ``t['text']`` with whitespace runs collapsed to single spaces
      and leading/trailing whitespace removed
    - retweeted_status_*: taken from ``t['retweeted_status']`` when present
      and truthy, otherwise None

    Raises KeyError when a requested field is missing from ``t``.
    """
    d = {}
    rt = t.get('retweeted_status')
    for f in TWEET_FIELDS:
        if f == 'user_id':
            d['user_id'] = t['user']['id']
        elif f == 'source':
            # Keep only the anchor's inner text; fall back to the raw value
            # when the source is not wrapped in an <a> tag.
            x = re.search(r'(?<=>).+?(?=<\/a>)', t['source'])
            d['source'] = x.group() if x else t['source']
        elif f == 'created_at':
            d['created_at'] = convert_timestamp(t['created_at'])
        elif f == 'text':
            # Raw string: "\s" in a plain literal is an invalid escape
            # sequence and triggers a warning on recent Python versions.
            d['text'] = re.sub(r'\s+', ' ', t['text']).strip()
        elif f == 'retweeted_status_id':
            d['retweeted_status_id'] = rt['id'] if rt else None
        elif f == 'retweeted_status_user_id':
            d['retweeted_status_user_id'] = rt['user']['id'] if rt else None
        elif f == 'retweeted_status_user_screen_name':
            # note that the screen_name of retweeted user is extracted from
            # the tweet's text
            screen_name = None
            if rt:
                m = re.search(r'(?<=^RT @)\w+(?=:)', t['text'])
                if m:
                    screen_name = m.group()
                else:
                    # The text does not start with "RT @name:"; calling
                    # .group() on the failed match would raise
                    # AttributeError. Fall back to the embedded user record
                    # (assumes it carries 'screen_name' as in the Twitter
                    # API; yields None when it does not).
                    screen_name = (rt.get('user') or {}).get('screen_name')
            d['retweeted_status_user_screen_name'] = screen_name
        else:
            d[f] = t[f]
    return d
def extract_twitter_profile(profile):
    """
    profile is a dict representing a Twitter user profile as returned by
    the API
    returns: a dict holding one entry per name in TWITTER_PROFILE_FIELDS;
    'created_at' is passed through convert_timestamp, every other value is
    copied from profile unchanged
    """
    return {
        name: (
            convert_timestamp(profile[name])
            if name == 'created_at'
            else profile[name]
        )
        for name in TWITTER_PROFILE_FIELDS
    }