def fetch(self, service_id, service_author_id, service_event_id, callback):
    asm = self.get_author_service_map(service_author_id)

    # TODO - there needs to be a global caching mechanism for this (i.e. memcached, etc.).
    # Because of the distributed nature of this, two successive updates for the
    # same author don't share any state and can't leverage the refresh token.
    # The refresh token should have a configured TTL.

    # we need to exchange the refresh token for an access token
    query_args = urllib.urlencode([('client_id', self.oauth_config['key']),
                                   ('client_secret', self.oauth_config['secret']),
                                   ('refresh_token', asm.access_token),
                                   ('grant_type', 'refresh_token')])
    raw_obj = json_serializer.load(urllib2.urlopen(self.oauth_config['oauth_exchange_url'], query_args))
    access_token = raw_obj['access_token']

    args = {'access_token': access_token}

    # fetch latest version of event
    url = '%s%s%s?%s' % (self.oauth_config['endpoint'],
                         self.ACTIVITY_INFO,
                         service_event_id,
                         urllib.urlencode(args))
    raw_obj = json_serializer.load(urllib2.urlopen(url))

    interpreter = GoogleplusStatusEventInterpreter(raw_obj, asm, self.oauth_config)
    callback(create_googleplus_event(asm.author_id,
                                     CURRENT_STATE,
                                     service_author_id,
                                     interpreter.get_id(),
                                     raw_obj))
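# A hedged sketch of the shared token cache the TODO above asks for, using
# python-memcached. The cache key scheme, server address, and TTL value are
# assumptions for illustration only and are not part of the existing code.
import json
import urllib
import urllib2

import memcache

_MEMCACHE = memcache.Client(['127.0.0.1:11211'])  # assumed cache location
_TOKEN_TTL = 3300  # assumed: a little shorter than the typical 3600s token lifetime


def get_cached_access_token(oauth_config, asm):
    """Return a cached access token for this author, exchanging the refresh
    token against the OAuth endpoint only on a cache miss."""
    cache_key = 'gplus_access_token:{0}'.format(asm.service_author_id)
    access_token = _MEMCACHE.get(cache_key)
    if access_token:
        return access_token

    # cache miss: exchange the refresh token the same way the fetch above does
    query_args = urllib.urlencode([('client_id', oauth_config['key']),
                                   ('client_secret', oauth_config['secret']),
                                   ('refresh_token', asm.access_token),
                                   ('grant_type', 'refresh_token')])
    raw_obj = json.load(urllib2.urlopen(oauth_config['oauth_exchange_url'], query_args))
    access_token = raw_obj['access_token']
    _MEMCACHE.set(cache_key, access_token, time=_TOKEN_TTL)
    return access_token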
def fetch(self, service_id, service_author_id, service_event_id, callback):
    asm = self.get_author_service_map(service_author_id)

    if asm.access_token:
        access_token = asm.access_token
    else:
        access_token = self.oauth_config['user1_access_token']

    args = {'access_token': access_token}

    # fetch latest version of event
    url = '{0}{1}{2}?{3}'.format(self.oauth_config['endpoint'],
                                 self.MEDIA_INFO,
                                 service_event_id,
                                 urllib.urlencode(args))
    raw_obj = json_serializer.load(urllib2.urlopen(url))
    post = raw_obj['data']

    interpreter = InstagramEventInterpreter(post, asm, self.oauth_config)

    # TODO - unclear if/why the link meta data should be included -- included here because
    # relationships are not being properly maintained
    callback(create_instagram_event(
        asm.author_id,
        CURRENT_STATE,
        service_author_id,
        interpreter.get_id(),
        post,
        [create_event_link(data_access.service.name_to_id('instagram'),
                           '_{0}@{1}'.format(self.service_name, asm.author_id))]))
def fetch(self, service_id, service_author_id, service_event_id, callback):
    asm = self.get_author_service_map(service_author_id)

    # TODO - temporary until we figure out a better solution for
    # not over-driving Twitter with un-authenticated events
    if not asm.access_token:
        return

    if asm.access_token:
        consumer = oauth.Consumer(self.oauth_config['key'], self.oauth_config['secret'])
        token = oauth.Token(asm.access_token, asm.access_token_secret)
        client = oauth.Client(consumer, token)

    args = {'id': service_event_id, 'include_entities': '1', 'trim_user': '******'}

    # if not authenticated provide the user_id query arg
    if not asm.access_token:
        args['user_id'] = asm.service_author_id

    url = TWEET_STATUS % (self.oauth_config['endpoint'], urllib.urlencode(args))

    # TODO - remove the try/except once we figure out a better solution for not
    # exceeding Twitter's rate limits
    try:
        json_obj = json_serializer.load_string(make_request(client, url)) if asm.access_token \
            else json_serializer.load(urllib2.urlopen(url))
    except urllib2.URLError, e:
        logging.error('ERROR REQUEST URL: {0}'.format(url))
        logging.error('ERROR REASON: {0}, {1}'.format(e.code, e.read()))
        raise
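# make_request() is called above but not defined in this snippet. Below is a
# minimal sketch of one plausible implementation on top of python-oauth2 (the
# `oauth` module used above); the signature and the non-200 handling are
# assumptions, not the project's actual helper.
def make_request(client, url, method='GET', body=''):
    """Issue an OAuth-signed request and return the raw response body."""
    resp, content = client.request(url, method=method, body=body)
    if resp.status != 200:
        raise Exception('request to {0} failed with status {1}: {2}'.format(url, resp.status, content))
    return content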
def app_main(self, config, options, queues):
    TIM_DATA = 'TIM_DATA'
    data_directory = os.environ.get(TIM_DATA, None)
    if data_directory is None:
        logging.error('Environment variable %s not defined', TIM_DATA)
        sys.exit()

    file = options.message_file.format(tim_data=data_directory)
    if not os.path.exists(file):
        logging.warning('File "%s" does not exist', file)
        sys.exit()

    # read the message file
    try:
        messages = json_serializer.load(open(file, 'r'))
    except Exception:
        logging.error('Failed to read json file: %s', file)
        raise

    # create amqp connection
    client = message_queue.create_message_client(options.url)

    # create all of the required queues
    message_queue.create_queues_from_config(client, config['amqp'])

    # iterate and send all the interesting messages
    for message in messages:
        queue = message['header']['type']
        if queue in queues:
            message_queue.send_messages(client, config['amqp']['exchange']['name'], [message])
            sys.stdout.write('.')

    sys.stdout.write('\n')
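# A hedged illustration of the message file app_main() expects: a JSON array of
# message envelopes whose header 'type' names the destination queue. The concrete
# type values and the 'payload' key are invented here for illustration; only the
# header/type structure is taken from the code above.
import json

example_messages = [
    {'header': {'type': 'author_fetch'}, 'payload': {'service_author_id': '12345'}},
    {'header': {'type': 'event_update'}, 'payload': {'event_id': '67890'}},
]

with open('/tmp/example_messages.json', 'w') as f:
    json.dump(example_messages, f, indent=2)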
def fetch(self, service_id, service_author_id, service_event_id, callback):
    asm = self.get_author_service_map(service_author_id)

    args = {'access_token': asm.access_token}

    # fetch latest version of event
    path = '%s%s?%s' % (self.oauth_config['endpoint'], service_event_id, urllib.urlencode(args))
    json_obj = json_serializer.load(urllib2.urlopen(path))

    interpreter = FacebookEventInterpreter(json_obj, asm, self.oauth_config)
    callback(create_facebook_event(asm.author_id,
                                   CURRENT_STATE,
                                   service_author_id,
                                   interpreter.get_id(),
                                   json_obj))
def get_author_profile(self, service_author_id, asm):
    asm = self.fetch_begin(service_author_id, asm)

    # we need to exchange the refresh token for an access token
    query_args = urllib.urlencode([('client_id', self.oauth_config['key']),
                                   ('client_secret', self.oauth_config['secret']),
                                   ('refresh_token', asm.access_token),
                                   ('grant_type', 'refresh_token')])
    try:
        raw_obj = json_serializer.load(urllib2.urlopen(self.oauth_config['oauth_exchange_url'], query_args))
    except urllib2.URLError, e:
        logging.error('ERROR REASON: {0}, {1}'.format(e.code, e.read()))
        raise
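# The snippet above ends after the token exchange. Below is a hedged sketch of a
# likely continuation: use the exchanged access token to request the Google+
# profile and map it into the same profile_json shape the other services build.
# It reuses the surrounding class's helpers (urllib, urllib2, json_serializer);
# the self.USER_INFO constant and the response field names ('displayName',
# 'image', 'aboutMe', 'url') are assumptions, not taken from the repository.
def _build_googleplus_profile(self, access_token):
    args = {'access_token': access_token}
    url = '{0}{1}?{2}'.format(self.oauth_config['endpoint'], self.USER_INFO, urllib.urlencode(args))

    # request the user's profile
    json_obj = json_serializer.load(urllib2.urlopen(url))

    profile_json = {}
    if 'displayName' in json_obj:
        profile_json['name'] = json_obj['displayName']
    if 'image' in json_obj and 'url' in json_obj['image']:
        profile_json['picture_url'] = json_obj['image']['url']
    if 'aboutMe' in json_obj:
        profile_json['headline'] = json_obj['aboutMe']
    if 'url' in json_obj:
        profile_json['public_profile_url'] = json_obj['url']

    return profile_json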
def get_author_profile(self, service_author_id, asm):
    asm = self.fetch_begin(service_author_id, asm)

    if asm.access_token:
        access_token = asm.access_token
        user_info = self.USER_INFO.format('self')
    else:
        access_token = self.oauth_config['user1_access_token']
        user_info = self.USER_INFO.format(asm.service_author_id)

    args = {'oauth_token': access_token, 'v': 20120130}
    url = '{0}{1}?{2}'.format(self.oauth_config['endpoint'], user_info, urllib.urlencode(args))

    # request the user's profile
    json_obj = json_serializer.load(urllib2.urlopen(url))

    # check for error
    if json_obj['meta']['code'] != 200:
        raise Exception('Foursquare error response: {0}'.format(json_obj['meta']['code']))

    json_obj = json_obj['response']['user']

    profile_json = {}
    firstName = profile_json['first_name'] = json_obj.get('firstName', '')
    lastName = profile_json['last_name'] = json_obj.get('lastName', '')

    # if we have a non-empty string add it to the json
    name = '{0} {1}'.format(firstName, lastName).strip()
    if len(name) > 0:
        profile_json['name'] = name

    if 'photo' in json_obj:
        profile_json['picture_url'] = json_obj['photo']
    if 'bio' in json_obj:
        profile_json['headline'] = json_obj['bio']

    return profile_json
def get_author_profile(self, service_author_id, asm):
    asm = self.fetch_begin(service_author_id, asm)

    if asm.access_token:
        access_token = asm.access_token
        user_id = 'me'
    else:
        access_token = self.oauth_config['user1_access_token']
        user_id = asm.service_author_id

    args = {'access_token': access_token}
    url = '{0}{1}?{2}'.format(self.oauth_config['endpoint'], user_id, urllib.urlencode(args))

    # request the user's profile
    json_obj = json_serializer.load(urllib2.urlopen(url))

    profile_json = {}
    if 'first_name' in json_obj:
        profile_json['first_name'] = json_obj['first_name']
    if 'last_name' in json_obj:
        profile_json['last_name'] = json_obj['last_name']
    if 'name' in json_obj:
        profile_json['name'] = json_obj['name']
    if 'link' in json_obj:
        profile_json['public_profile_url'] = json_obj['link']

    url = '{0}{1}/picture?{2}'.format(self.oauth_config['endpoint'], user_id, urllib.urlencode(args))

    # request the user's profile picture; the picture endpoint redirects, so the
    # final URL returned by geturl() is the image location
    picture_url = urllib2.urlopen(url).geturl()
    profile_json['picture_url'] = picture_url

    return profile_json
def fetch(self, service_id, service_author_id, service_event_id, callback):
    asm = self.get_author_service_map(service_author_id)

    args = {'oauth_token': asm.access_token, 'v': 20120130}

    url = '%s%s%s?%s' % (self.oauth_config['endpoint'],
                         CHECKIN_RESOURCE,
                         service_event_id,
                         urllib.urlencode(args))
    event_json = json_serializer.load(urllib2.urlopen(url))

    # check for error
    if event_json['meta']['code'] != 200:
        raise Exception('Foursquare error response: %s' % event_json['meta']['code'])

    '''
    TODO: there should be a generalized mechanism for pruning unwanted properties from the json.

    With Foursquare we're going to eliminate the user property (we know all about the user); it
    doesn't appear in the checkin definition returned by "users/self/checkins", apparently by
    design, as Foursquare designates this as optional and the user context is clearly defined
    by the call.

    The following two properties don't appear in the "users/self/checkins" resource, so each
    new Foursquare event will immediately update. If the user executes another checkin within
    60 minutes it's possible the collector will get the event again because of the
    MOST_RECENT_OVERLAP window, causing this event to "flap". It's minor but noteworthy.

        del checkin_obj['score']
        del checkin_obj['venue']['specials']
    '''
    checkin_obj = event_json['response']['checkin']
    prune_dictionary(checkin_obj, self.PRUNE_ITEMS)

    interpreter = FoursquareEventInterpreter(checkin_obj, asm, self.oauth_config)
    callback(create_foursquare_event(asm.author_id,
                                     CURRENT_STATE,
                                     service_author_id,
                                     interpreter.get_id(),
                                     checkin_obj))
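# prune_dictionary() and self.PRUNE_ITEMS are used above but not shown in this
# snippet. A minimal sketch of one plausible shape, assuming PRUNE_ITEMS is an
# iterable of key paths (tuples) such as [('user',), ('venue', 'specials')];
# the real helper may differ.
def prune_dictionary(obj, prune_items):
    """Remove the listed key paths from a nested dict, ignoring missing keys."""
    for path in prune_items:
        node = obj
        for key in path[:-1]:
            if not isinstance(node, dict):
                node = {}
                break
            node = node.get(key, {})
        if isinstance(node, dict):
            node.pop(path[-1], None)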
def get_author_profile(self, service_author_id, asm):
    asm = self.fetch_begin(service_author_id, asm)

    args = {'user_id': asm.service_author_id, 'include_entities': True}

    # Create our OAuth consumer instance
    if asm.access_token:
        consumer = oauth.Consumer(self.oauth_config['key'], self.oauth_config['secret'])
        token = oauth.Token(key=asm.access_token, secret=asm.access_token_secret)
        client = oauth.Client(consumer, token)

    url = '%s%s?%s' % (self.oauth_config['endpoint'], USER_INFO, urllib.urlencode(args))

    # request the user's profile
    json_obj = json_serializer.load_string(make_request(client, url)) if asm.access_token \
        else json_serializer.load(urllib2.urlopen(url))

    profile_json = {}
    if 'name' in json_obj:
        profile_json['name'] = json_obj['name']
    if 'location' in json_obj:
        profile_json['location'] = json_obj['location']
    if 'profile_image_url' in json_obj:
        profile_json['picture_url'] = json_obj['profile_image_url']
    if 'description' in json_obj:
        profile_json['headline'] = json_obj['description']
    profile_json['public_profile_url'] = 'https://twitter.com/#!/%s' % json_obj['screen_name']

    return profile_json
def fetch(self, service_author_id, callback):
    super(FacebookEventCollector, self).fetch(service_author_id, callback)

    state = self.fetch_begin(service_author_id)

    self.fetch_log_info(state)

    asm = state['asm']

    args = {'access_token': asm.access_token}

    # get only events since the last update, or from the past year, depending on
    # whether this is the first collection or not
    if asm.most_recent_event_timestamp:
        since = calendar.timegm((asm.most_recent_event_timestamp - self.MOST_RECENT_OVERLAP).utctimetuple())
    else:
        since = calendar.timegm((datetime.utcnow() - self.NEW_LOOKBACK_WINDOW).utctimetuple())
    args['since'] = since

    # fetch all new posts
    posts_url = unicode('{0}{1}?{2}').format(self.oauth_config['endpoint'],
                                             self.FEED_COLLECTION,
                                             urllib.urlencode(args))

    total_accepted = 0
    while posts_url and total_accepted < self.MAX_EVENTS:
        logging.debug('requesting: "%s"', posts_url)

        posts_obj = json_serializer.load(urllib2.urlopen(posts_url))

        # process the item
        # TODO loop termination on various constraints is not exact

        # for element in the feed
        for post in posts_obj['data']:
            # currently only interested in 'status' posts from the user
            if post['from']['id'] == service_author_id:
                post_type = post.get('type', None)

                # if this is a status update and there is an action or the
                # user is tagged in the story keep it
                # TODO: check for user in story_tags is experimental
                if post_type == 'status':
                    tagged = False
                    if 'story_tags' in post:
                        for story_tag in post['story_tags'].itervalues():
                            for entity in story_tag:
                                if int(entity['id']) == int(service_author_id):
                                    tagged = True
                                    break
                            if tagged:
                                break
                    if not post.get('actions') and not tagged:
                        continue

                # skip photo and checkin posts. they will get picked-up by their respective
                # processing below
                if post_type == 'photo' or post_type == 'checkin':
                    continue

                interpreter = FacebookEventInterpreter(post, asm, self.oauth_config)
                if self.screen_event(interpreter, state):
                    total_accepted = total_accepted + 1
                    callback(create_facebook_event(asm.author_id,
                                                   CURRENT_STATE,
                                                   service_author_id,
                                                   interpreter.get_id(),
                                                   post))

        # setup for the next page (if any). Check that we're not looping ?? do we even need to check ??
        next_url = posts_obj['paging']['next'] if 'paging' in posts_obj and 'next' in posts_obj['paging'] else None
        posts_url = next_url if next_url and next_url != posts_url else None
    # while posts

    # collect photos for all time if this is the first update; otherwise
    # only collect photos since the last update. Setting since to None
    # and removing the 'since' property from the query args will collect
    # all photos
    if not asm.most_recent_event_timestamp:
        since = None
        del args['since']

    albums_url = unicode('{0}{1}?{2}').format(self.oauth_config['endpoint'],
                                              self.ALBUMS_COLLECTION,
                                              urllib.urlencode({'access_token': asm.access_token}))
    while albums_url:
        albums_obj = json_serializer.load(urllib2.urlopen(albums_url))

        for album in albums_obj.get('data', []):
            # skip photos posted to friend's walls
            if album['type'] == 'friends_walls':
                continue

            created_time = calendar.timegm(datetime.strptime(album['created_time'],
                                                             "%Y-%m-%dT%H:%M:%S+0000").utctimetuple())
            updated_time = calendar.timegm(datetime.strptime(album['updated_time'],
                                                             "%Y-%m-%dT%H:%M:%S+0000").utctimetuple())

            if since is None or created_time >= since or updated_time >= since:
                # set the type to 'album' so it will match what you get when it's directly
                # queried; also makes it easier for the event processor to identify it
                album['type'] = 'album'

                interpreter = FacebookEventInterpreter(album, asm, self.oauth_config)

                # send event message
                callback(create_facebook_event(asm.author_id,
                                               CURRENT_STATE,
                                               service_author_id,
                                               interpreter.get_id(),
                                               album))
            # if

            album_id = album['id']

            # check for any new photos in the album
            photos_url = unicode('{0}{1}{2}?{3}').format(self.oauth_config['endpoint'],
                                                         album_id,
                                                         self.PHOTOS_COLLECTION,
                                                         urllib.urlencode(args))
            while photos_url:
                photos_obj = json_serializer.load(urllib2.urlopen(photos_url))

                for photo in photos_obj.get('data', []):
                    photo['type'] = 'photo'

                    interpreter = FacebookEventInterpreter(photo, asm, self.oauth_config)

                    # send event message
                    callback(create_facebook_event(
                        asm.author_id,
                        CURRENT_STATE,
                        service_author_id,
                        interpreter.get_id(),
                        photo,
                        [create_event_link(data_access.service.name_to_id('facebook'), album_id)]))

                # setup for the next page (if any). Check that we're not looping ?? do we even need to check ??
                next_url = photos_obj['paging']['next'] if 'paging' in photos_obj and 'next' in photos_obj['paging'] else None
                photos_url = next_url if next_url and next_url != photos_url else None
            # while photos

        # setup for the next page (if any). Check that we're not looping ?? do we even need to check ??
        next_url = albums_obj['paging']['next'] if 'paging' in albums_obj and 'next' in albums_obj['paging'] else None
        albums_url = next_url if next_url and next_url != albums_url else None
    # while albums

    # fetch all new checkins
    checkins_url = unicode('{0}{1}?{2}').format(self.oauth_config['endpoint'],
                                                self.CHECKIN_COLLECTION,
                                                urllib.urlencode(args))
    total_accepted = 0
    while checkins_url and total_accepted < self.MAX_EVENTS:
        checkins_obj = json_serializer.load(urllib2.urlopen(checkins_url))

        # process the item
        # TODO loop termination on various constraints is not exact

        # for element in the feed
        for checkin_obj in checkins_obj['data']:
            # filter checkins not directly from this user
            if checkin_obj['from']['id'] == service_author_id:
                # set the type to checkin. When querying for checkins the
                # type property is missing
                checkin_obj['type'] = 'checkin'

                interpreter = FacebookEventInterpreter(checkin_obj, asm, self.oauth_config)
                if self.screen_event(interpreter, state):
                    total_accepted = total_accepted + 1
                    callback(create_facebook_event(asm.author_id,
                                                   CURRENT_STATE,
                                                   service_author_id,
                                                   interpreter.get_id(),
                                                   checkin_obj))

        # setup for the next page (if any). Check that we're not looping ?? do we even need to check ??
        next_url = checkins_obj['paging']['next'] if 'paging' in checkins_obj and 'next' in checkins_obj['paging'] else None
        checkins_url = next_url if next_url and next_url != checkins_url else None
    # while checkins

    # terminate the fetch
    self.fetch_end(state)
def fetch(self, service_author_id, callback):
    super(GoogleplusEventCollector, self).fetch(service_author_id, callback)

    state = self.fetch_begin(service_author_id)

    self.fetch_log_info(state)

    asm = state['asm']

    # TODO - there needs to be a global caching mechanism for this (i.e. memcached, etc.).
    # Because of the distributed nature of this, two successive updates for the
    # same author don't share any state and can't leverage the refresh token.
    # The refresh token should have a configured TTL.

    # we need to exchange the refresh token for an access token
    query_args = urllib.urlencode([('client_id', self.oauth_config['key']),
                                   ('client_secret', self.oauth_config['secret']),
                                   ('refresh_token', asm.access_token),
                                   ('grant_type', 'refresh_token')])
    raw_obj = json_serializer.load(urllib2.urlopen(self.oauth_config['oauth_exchange_url'], query_args))
    access_token = raw_obj['access_token']

    args = {'access_token': access_token, 'maxResults': self.PAGE_SIZE}

    # setup the url for fetching a page of posts
    url = '%s%s?%s' % (self.oauth_config['endpoint'], self.USER_ACTIVITY, urllib.urlencode(args))

    total_accepted = 0
    while url and total_accepted < self.MAX_EVENTS:
        raw_obj = json_serializer.load(urllib2.urlopen(url))

        # for element in the feed
        for post in raw_obj.get('items', []):
            if post['kind'] == 'plus#activity':
                interpreter = GoogleplusStatusEventInterpreter(post, asm, self.oauth_config)
                if self.screen_event(interpreter, state):
                    total_accepted = total_accepted + 1
                    callback(create_googleplus_event(asm.author_id,
                                                     CURRENT_STATE,
                                                     service_author_id,
                                                     interpreter.get_id(),
                                                     post))
            # if
        # for

        # setup for the next page (if any); the activities feed returns the
        # continuation token as 'nextPageToken'
        next_page = raw_obj.get('nextPageToken')
        if next_page:
            args['pageToken'] = next_page
            url = '%s%s?%s' % (self.oauth_config['endpoint'], self.USER_ACTIVITY, urllib.urlencode(args))
        else:
            url = None

    # terminate the fetch
    self.fetch_end(state)
def fetch(self, service_author_id, callback):
    super(InstagramEventCollector, self).fetch(service_author_id, callback)

    state = self.fetch_begin(service_author_id)

    self.fetch_log_info(state)

    asm = state['asm']

    if asm.access_token:
        access_token = asm.access_token
        user_media = self.USER_MEDIA.format('self')
    else:
        access_token = self.oauth_config['user1_access_token']
        user_media = self.USER_MEDIA.format(asm.service_author_id)

    args = {'access_token': access_token, 'count': self.PAGE_SIZE}

    # get only events since the last update, or from the past year, depending on
    # whether this is the first collection or not
    if asm.most_recent_event_timestamp:
        min_timestamp = calendar.timegm((asm.most_recent_event_timestamp - self.MOST_RECENT_OVERLAP).utctimetuple())
    else:
        min_timestamp = calendar.timegm((datetime.utcnow() - self.NEW_LOOKBACK_WINDOW).utctimetuple())
    args['min_timestamp'] = min_timestamp

    # setup the url for fetching a page of posts
    url = '{0}{1}?{2}'.format(self.oauth_config['endpoint'], user_media, urllib.urlencode(args))

    total_accepted = 0
    while url and total_accepted < self.MAX_EVENTS:
        raw_obj = json_serializer.load(urllib2.urlopen(url))

        # for element in the feed
        for post in raw_obj.get('data', []):
            interpreter = InstagramEventInterpreter(post, asm, self.oauth_config)
            if self.screen_event(interpreter, state):
                total_accepted = total_accepted + 1
                callback(create_instagram_event(
                    asm.author_id,
                    CURRENT_STATE,
                    service_author_id,
                    interpreter.get_id(),
                    post,
                    [create_event_link(data_access.service.name_to_id('instagram'),
                                       '_{0}@{1}'.format(self.service_name, asm.author_id))]))
            # if
        # for

        # setup for the next page (if any)
        url = (raw_obj['pagination']['next_url']
               if 'pagination' in raw_obj and 'next_url' in raw_obj['pagination']
               else None)

    # terminate the fetch
    self.fetch_end(state)
def fetch(self, service_author_id, callback):
    super(TwitterEventCollector, self).fetch(service_author_id, callback)

    state = self.fetch_begin(service_author_id)

    self.fetch_log_info(state)

    asm = state['asm']

    args = {'include_rts': 1, 'include_entities': 1, 'trim_user': 1, 'count': 200}

    # use authenticated access if we can
    if asm.access_token:
        consumer = oauth.Consumer(self.oauth_config['key'], self.oauth_config['secret'])
        token = oauth.Token(asm.access_token, asm.access_token_secret)
        client = oauth.Client(consumer, token)
    else:
        args['user_id'] = asm.service_author_id

    if asm.most_recent_event_id:
        args['since_id'] = asm.most_recent_event_id

    # API endpoint for getting the user timeline
    url = '%s%s?%s' % (self.oauth_config['endpoint'], USER_TIMELINE, urllib.urlencode(args))

    min_age = datetime.utcnow() - self.NEW_LOOKBACK_WINDOW
    last_id = None
    while True:
        try:
            raw_json = json_serializer.load_string(make_request(client, url)) if asm.access_token \
                else json_serializer.load(urllib2.urlopen(url))
        except urllib2.URLError, e:
            logging.error('ERROR REQUEST URL: {0}'.format(url))
            logging.error('ERROR REASON: {0}, {1}'.format(e.code, e.read()))
            raise

        # check if nothing returned and terminate loop if so
        if len(raw_json) == 0:
            break

        for post in raw_json:
            # process the item
            #print json.dumps(post, sort_keys=True, indent=2)
            interpreter = TwitterEventInterpreter(post, asm, self.oauth_config)
            last_id = interpreter.get_id()

            # terminate fetching any more events if we've gone beyond the lookback window
            if interpreter.get_create_time() < min_age:
                url = None
                break

            if self.screen_event(interpreter, state):
                callback(create_twitter_event(asm.author_id,
                                              CURRENT_STATE,
                                              service_author_id,
                                              interpreter.get_id(),
                                              post))

        if not url:
            break

        # setup for the next page (if any)
        args['max_id'] = long(last_id) - 1
        url = '%s%s?%s' % (self.oauth_config['endpoint'], USER_TIMELINE, urllib.urlencode(args))

    # terminate the fetch
    self.fetch_end(state)
def fetch(self, service_author_id, callback):
    super(FoursquareEventCollector, self).fetch(service_author_id, callback)

    state = self.fetch_begin(service_author_id)

    self.fetch_log_info(state)

    asm = state['asm']

    args = {'oauth_token': asm.access_token, 'v': 20120130}
    args['limit'] = LIMIT
    args['offset'] = 0

    # get only events since the last update, or from the past year, depending on
    # whether this is the first collection or not
    if asm.most_recent_event_timestamp:
        after_time = calendar.timegm((asm.most_recent_event_timestamp - self.MOST_RECENT_OVERLAP).utctimetuple())
    else:
        after_time = calendar.timegm((datetime.utcnow() - self.NEW_LOOKBACK_WINDOW).utctimetuple())
    args['afterTimestamp'] = after_time

    url = '%s%s?%s' % (self.oauth_config['endpoint'], USER_CHECKINS, urllib.urlencode(args))

    total_accepted = 0
    while url and total_accepted < self.MAX_EVENTS:
        raw_json = json_serializer.load(urllib2.urlopen(url))

        # check for error
        if raw_json['meta']['code'] != 200:
            raise Exception('Foursquare error response: %s' % raw_json['meta']['code'])

        # terminate if the response has no more events/checkins
        if len(raw_json['response']['checkins']['items']) == 0:
            break

        # for each element in the feed
        for post in raw_json['response']['checkins']['items']:
            prune_dictionary(post, self.PRUNE_ITEMS)

            interpreter = FoursquareEventInterpreter(post, asm, self.oauth_config)
            if self.screen_event(interpreter, state):
                total_accepted = total_accepted + 1
                callback(create_foursquare_event(asm.author_id,
                                                 CURRENT_STATE,
                                                 service_author_id,
                                                 interpreter.get_id(),
                                                 post))
        # for

        if not url:
            break

        # setup next request
        args['offset'] = args['offset'] + LIMIT
        url = '%s%s?%s' % (self.oauth_config['endpoint'], USER_CHECKINS, urllib.urlencode(args))

    # terminate the fetch
    self.fetch_end(state)