def stream(queries, queue, settings):
    while True:
        for query in queries:
            time.sleep(1)
            url = "https://graph.facebook.com/search?q=%s&type=post&limit=25&access_token=%s"
            url = url % (query, settings['FACEBOOK_API_KEY'])
            data = fetch_json('facebook', url)
            if data:
                items = data['data']
                for item in items:
                    # Only posts with a message body are worth emitting.
                    if 'message' in item:
                        post = {
                            "service": 'facebook',
                            "query": query,
                            "user": {
                                "name": item['from'].get('name'),
                                "id": item['from']['id'],
                            },
                            "links": [],
                            "id": item['id'],
                            "text": item['message'],
                            "date": str(datetime.datetime.strptime(item['created_time'], settings['TIME_FORMAT'])),
                        }
                        # Pull any http/https/ftp links out of the message body.
                        url_regex = re.compile(r'(?:http|https|ftp):\/\/[\w\-_]+(?:\.[\w\-_]+)+(?:[\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?')
                        for url in url_regex.findall(item['message']):
                            post['links'].append({'href': url})
                        post['user']['avatar'] = "http://graph.facebook.com/%s/picture" % item['from']['id']
                        if 'to' in item:
                            post['to_users'] = item['to']['data']
                        if 'likes' in item:
                            post['likes'] = item['likes']['count']
                        queue.put(post)
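# -- Illustrative sketch (not part of kral): the fetch_json helper used above is
# defined elsewhere in the project; one plausible shape for this older variant,
# which takes a service name plus a URL and returns decoded JSON or None, is:
import json
import urllib2

def fetch_json(service, url):
    # The service name is only useful for logging; callers just check whether
    # the return value is truthy before touching the payload.
    try:
        return json.loads(urllib2.urlopen(url, timeout=10).read())
    except (urllib2.URLError, ValueError):
        # Network error, or a body that is not valid JSON.
        return None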
def stream(queries, queue, settings, kral_start_time):

    def get_access_token():
        url_args = {
            'client_id': settings.get('Facebook', 'app_id'),
            'client_secret': settings.get('Facebook', 'app_secret'),
            'grant_type': 'client_credentials',
        }
        url = 'https://graph.facebook.com/oauth/access_token?%s' % urllib.urlencode(url_args)
        access_token = urllib2.urlopen(url).read().split('=')[1]
        return access_token

    access_token = get_access_token()

    # Keep a store of queries and their previous_url "since" values to ensure
    # we are getting new items in our feed.
    sinces = {}

    user_agent = settings.get('DEFAULT', 'user_agent', '')

    while True:
        for query in queries:
            # https://developers.facebook.com/docs/reference/api/batch/
            # Do batch requests for Facebook, currently limited to 20.
            url_args = {
                'access_token': access_token,
                'batch': [
                    {
                        'method': 'GET',
                        'name': 'get-user-ids',
                        'relative_url': 'search?q=%s&type=post&limit=20' % urllib.quote(query),
                        'omit_response_on_success': 0,
                    },
                    {
                        'method': 'GET',
                        'relative_url': '/feed/?ids={result=get-user-ids:$.data.*.from.id}',
                    },
                ],
            }

            # If we have stored a "since" for this query and it's newer than when we
            # started, use that timestamp instead; else use the time we started.
            if query in sinces and sinces[query] > kral_start_time:
                since = sinces[query]
            else:
                since = kral_start_time

            # Set the since to retrieve only new posts.
            url_args['batch'][0]['relative_url'] = "%s&since=%s" % (url_args['batch'][0]['relative_url'], since)

            url = 'https://graph.facebook.com'
            request = urllib2.Request(url)
            # The batch parameter must itself be JSON-encoded before the request
            # body is url-encoded, otherwise the Graph API cannot parse it.
            request.add_data(urllib.urlencode({
                'access_token': url_args['access_token'],
                'batch': json.dumps(url_args['batch']),
            }))

            if user_agent:
                request.add_header('User-agent', user_agent)

            response = fetch_json(request)

            if not response:
                sleep(5)
                break

            posts, profiles = response

            if posts and profiles:
                decoded_posts = json.loads(posts['body'])
                decoded_profiles = json.loads(profiles['body'])

                if not decoded_posts['data']:
                    # Don't process anything if we have no data.
                    sleep(2)
                    continue

                # Get the since value from the previous url.
                if 'paging' in decoded_posts and 'previous' in decoded_posts['paging']:
                    parsed_paging_data = urlparse.parse_qs(decoded_posts['paging']['previous'])
                    previous_since = int(parsed_paging_data['since'][0])
                    sinces[query] = previous_since

                items = decoded_posts['data']

                for item in items:
                    created_time = int(time.mktime(time.strptime(item['created_time'], '%Y-%m-%dT%H:%M:%S+0000')))

                    # Only process posts that are in fact new.
                    if created_time >= since and 'message' in item:
                        post = {
                            "service": 'facebook',
                            "query": query,
                            "user": {
                                "name": item['from'].get('name'),
                                "id": item['from']['id'],
                                "subscribers": '0',
                            },
                            "links": [],
                            "id": item['id'],
                            "text": item['message'],
                            "date": created_time,
                        }

                        # Pull any http/https/ftp links out of the message body.
                        url_regex = re.compile(r'(?:http|https|ftp):\/\/[\w\-_]+(?:\.[\w\-_]+)+(?:[\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?')
                        for url in url_regex.findall(item['message']):
                            post['links'].append({'href': url})

                        post['user']['avatar'] = "http://graph.facebook.com/%s/picture" % item['from']['id']

                        if 'to' in item:
                            post['to_users'] = item['to']['data']
                        if 'likes' in item:
                            post['likes'] = item['likes']['count']

                        subscribers_estimate = 0

                        if item['from']['id'] in decoded_profiles:
                            activity = 0
                            for profile_item in decoded_profiles[item['from']['id']]['data']:
                                activity += profile_item['comments']['count']
                                if 'likes' in profile_item:
                                    activity += profile_item['likes']['count']
                            subscribers_estimate = activity * 10

                        if subscribers_estimate < 130:
                            post['user']['subscribers'] = 130
                        else:
                            post['user']['subscribers'] = subscribers_estimate

                        # More research needs to be done into making a more accurate
                        # multiplier: what is the rough percentage of someone's total
                        # friends who actually participate on their wall on a regular
                        # basis? We can only make our best consistent guess, as Facebook
                        # does not tell us how many friends someone has; we can only
                        # estimate from activity.
                        queue.put(post)

            sleep(2)  # time between requests; effectively 2 seconds * number of queries
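# -- Illustrative sketch (not part of kral): the stream() workers above are
# presumably driven from a queue consumer in the parent process; a minimal
# driver, assuming a settings object and a hypothetical query list, might be:
import time
from multiprocessing import Process, Queue

def run(queries, settings):
    queue = Queue()
    start = int(time.time())
    worker = Process(target=stream, args=(queries, queue, settings, start))
    worker.daemon = True  # let the worker die with the parent process
    worker.start()
    while True:
        post = queue.get()  # blocks until a service puts a post
        print post['service'], post['date'], post['text'][:80]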
def stream(queries, queue, settings, kral_start_time):

    api_url = "http://www.reddit.com/search.json?"

    prev_items = defaultdict(list)

    user_agent = settings.get('DEFAULT', 'user_agent', '')

    while True:
        for query in queries:
            p = {
                'q': query,
                'sort': settings.get('Reddit', 'orderby', 'relevance'),
            }
            url = api_url + urllib.urlencode(p)

            request = urllib2.Request(url)
            if user_agent:
                request.add_header('User-agent', user_agent)

            response = fetch_json(request)

            if not response:
                sleep(5)
                break

            if 'data' in response and 'children' in response['data']:
                # The API returns 25 items per page.
                for item in response['data']['children']:
                    item_id = item['data']['id']
                    # If we've seen this item in the last 50 items, skip it.
                    if item_id not in prev_items[query]:
                        post = {
                            'service': 'reddit',
                            'query': query,
                            'user': {
                                'name': item['data']['author'],
                            },
                            'id': item_id,
                            'date': item['data']['created_utc'],
                            'text': item['data']['title'],
                            'source': item['data']['url'],
                            'likes': item['data'].get('likes', 0),
                            'dislikes': item['data'].get('downs', 0),
                            'comments': item['data'].get('num_comments', 0),
                            'favorites': item['data'].get('saved', 0),
                        }
                        queue.put(post)
                        prev_items[query].append(item_id)

                # Keep the dupe buffer to the 50 most recent ids.
                # TODO: look into using deque with maxlen.
                prev_items[query] = prev_items[query][-50:]

        sleep(30)
def stream(queries, queue, kral_start_time):

    api_url = "http://www.reddit.com/search.json?"

    prev_items = defaultdict(list)

    user_agent = config.USER_AGENT

    while True:
        for query in queries:
            p = {
                'q': query,
                'sort': config.REDDIT['orderby'] or 'relevance',
            }
            url = api_url + urllib.urlencode(p)

            request = urllib2.Request(url)
            if user_agent:
                request.add_header('User-agent', user_agent)

            response = fetch_json(request)

            if not response:
                sleep(5)
                break

            if 'data' in response and 'children' in response['data']:
                # The API returns 25 items per page.
                for item in response['data']['children']:
                    item_id = item['data']['id']
                    # If we've seen this item in the last 50 items, skip it.
                    if item_id not in prev_items[query]:
                        post = {
                            'service': 'reddit',
                            'query': query,
                            'user': {
                                'name': item['data']['author'],
                            },
                            'id': item_id,
                            'date': item['data']['created_utc'],
                            'text': item['data']['title'],
                            'source': item['data']['url'],
                            'likes': item['data'].get('likes', 0),
                            'dislikes': item['data'].get('downs', 0),
                            'comments': item['data'].get('num_comments', 0),
                            'favorites': item['data'].get('saved', 0),
                        }
                        queue.put(post)
                        prev_items[query].append(item_id)

                # Keep the dupe buffer to the 50 most recent ids.
                # TODO: look into using deque with maxlen.
                prev_items[query] = prev_items[query][-50:]

        sleep(30)
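# -- Illustrative sketch (not part of kral): the TODO above suggests a deque;
# with maxlen set, the oldest id falls off automatically, so the manual
# 50-item slice becomes unnecessary.
from collections import defaultdict, deque

prev_items = defaultdict(lambda: deque(maxlen=50))

def remember(query, item_id):
    # Returns True the first time an id is seen for this query.
    if item_id in prev_items[query]:
        return False
    prev_items[query].append(item_id)
    return True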
def stream(queries, queue, kral_start_time):

    print('-' * 50)
    print str(config)

    mode = config.YOUTUBE['mode'] or 'most_popular'

    api_url = "http://gdata.youtube.com/feeds/api/standardfeeds/%s?" % mode

    prev_ids = defaultdict(list)

    while True:
        for query in queries:
            p = {
                'q': query,
                'orderby': config.YOUTUBE['orderby'] or 'published',
                'max-results': config.YOUTUBE['maxresults'] or 25,
                'v': 2,
                'alt': 'jsonc',
                'format': 5,
            }

            # 'time' is only supported in these standard video feeds.
            if mode in ['top_rated', 'top_favorites', 'most_viewed',
                        'most_popular', 'most_discussed', 'most_responded']:
                p['time'] = config.YOUTUBE['time'] or 'today'

            url = api_url + urllib.urlencode(p)

            request = urllib2.Request(url)
            request.add_header('User-agent', config.USER_AGENT)

            response = fetch_json(request)

            if not response:
                sleep(5)
                break

            if 'data' in response and 'items' in response['data']:
                entries = response['data']['items']
                for entry in entries:
                    # Each entry carries: uploaded, category, updated, rating,
                    # description, title, tags, thumbnail, content, player,
                    # accessControl, uploader, ratingCount, duration, aspectRatio,
                    # likeCount, favoriteCount, id, viewCount.
                    entry_id = entry['id']
                    uploader = entry['uploader']
                    profile_url = "http://youtube.com/" + uploader

                    # If we've already seen this id, skip it.
                    if entry_id not in prev_ids[query]:
                        post = {
                            "service": "youtube",
                            "id": entry_id,
                            "query": query,
                            "date": entry['uploaded'],
                            "user": {
                                "name": uploader,
                                "profile": profile_url,
                            },
                            "source": entry['player']['default'],
                            "text": entry['title'],
                            "description": entry.get('description', ''),
                            "category": entry['category'],
                            "keywords": entry.get('tags', ''),
                            "duration": entry['duration'],
                            'favorites': entry.get('favoriteCount', 0),
                            'views': entry.get('viewCount', 0),
                            'likes': entry.get('likeCount', 0),
                        }

                        # ratingCount is the total number of voters who rated the
                        # video using either rating system, so the number who
                        # disliked it is ratingCount minus likeCount.
                        post['dislikes'] = int(entry.get('ratingCount', 0)) - int(post['likes'])

                        # Add the entry id to the previous ids for this query.
                        prev_ids[query].insert(0, entry_id)

                        queue.put(post)

                # Use a 50-item buffer for dupes.
                # TODO: look into deque with maxlen.
                prev_ids[query] = prev_ids[query][:50]

        sleep(15)
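# -- Illustrative sketch (not part of kral): worked example of the dislike
# derivation above. The v2 GData feed reports only total raters and likes,
# so dislikes are inferred by subtraction; the values here are made up.
entry = {'ratingCount': 500, 'likeCount': 420}
dislikes = int(entry.get('ratingCount', 0)) - int(entry.get('likeCount', 0))
assert dislikes == 80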
def stream(queries, queue, kral_start_time):

    def get_access_token():
        url_args = {
            'client_id': config.FACEBOOK['app_id'],
            'client_secret': config.FACEBOOK['app_secret'],
            'grant_type': 'client_credentials',
        }
        url = 'https://graph.facebook.com/oauth/access_token?%s' % urllib.urlencode(url_args)
        access_token = urllib2.urlopen(url).read().split('=')[1]
        return access_token

    access_token = get_access_token()

    # Keep a store of queries and their previous_url "since" values to ensure
    # we are getting new items in our feed.
    sinces = {}

    user_agent = config.USER_AGENT
    print user_agent

    while True:
        print "Starting facebook"
        print queries

        for query in queries:
            # https://developers.facebook.com/docs/reference/api/batch/
            # Do batch requests for Facebook, currently limited to 20.
            url_args = {
                'access_token': access_token,
                'batch': [
                    {
                        'method': 'GET',
                        'name': 'get-user-ids',
                        'relative_url': 'search?q=%s&type=post&limit=20' % urllib.quote(query),
                        'omit_response_on_success': 0,
                    },
                    {
                        'method': 'GET',
                        'relative_url': '/feed/?ids={result=get-user-ids:$.data.*.from.id}',
                    },
                ],
            }

            # If we have stored a "since" for this query and it's newer than when we
            # started, use that timestamp instead; else use the time we started.
            if query in sinces and sinces[query] > kral_start_time:
                since = sinces[query]
            else:
                since = kral_start_time

            # Set the since to retrieve only new posts.
            url_args['batch'][0]['relative_url'] = "%s&since=%s" % (url_args['batch'][0]['relative_url'], since)

            url = 'https://graph.facebook.com'
            request = urllib2.Request(url)
            # The batch parameter must itself be JSON-encoded before the request
            # body is url-encoded, otherwise the Graph API cannot parse it.
            request.add_data(urllib.urlencode({
                'access_token': url_args['access_token'],
                'batch': json.dumps(url_args['batch']),
            }))

            if user_agent:
                request.add_header('User-agent', user_agent)

            response = fetch_json(request)

            if not response:
                sleep(5)
                break

            posts, profiles = response
            print 'Posts', posts
            print 'Profiles', profiles

            if posts and profiles:
                decoded_posts = json.loads(posts['body'])
                decoded_profiles = json.loads(profiles['body'])

                if not decoded_posts['data']:
                    # Don't process anything if we have no data.
                    sleep(2)
                    continue

                # Get the since value from the previous url.
                if 'paging' in decoded_posts and 'previous' in decoded_posts['paging']:
                    parsed_paging_data = urlparse.parse_qs(decoded_posts['paging']['previous'])
                    previous_since = int(parsed_paging_data['since'][0])
                    sinces[query] = previous_since

                items = decoded_posts['data']

                for item in items:
                    created_time = int(time.mktime(time.strptime(item['created_time'], '%Y-%m-%dT%H:%M:%S+0000')))

                    # Only process posts that are in fact new.
                    if created_time >= since and 'message' in item:
                        post = {
                            "service": 'facebook',
                            "query": query,
                            "user": {
                                "name": item['from'].get('name'),
                                "id": item['from']['id'],
                                "subscribers": '0',
                            },
                            "links": [],
                            "id": item['id'],
                            "text": item['message'],
                            "date": created_time,
                        }

                        # Pull any http/https/ftp links out of the message body.
                        url_regex = re.compile(r'(?:http|https|ftp):\/\/[\w\-_]+(?:\.[\w\-_]+)+(?:[\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?')
                        for url in url_regex.findall(item['message']):
                            post['links'].append({'href': url})

                        post['user']['avatar'] = "http://graph.facebook.com/%s/picture" % item['from']['id']

                        if 'to' in item:
                            post['to_users'] = item['to']['data']
                        if 'likes' in item:
                            post['likes'] = item['likes']['count']

                        subscribers_estimate = 0

                        if item['from']['id'] in decoded_profiles:
                            activity = 0
                            for profile_item in decoded_profiles[item['from']['id']]['data']:
                                activity += profile_item['comments']['count']
                                if 'likes' in profile_item:
                                    activity += profile_item['likes']['count']
                            subscribers_estimate = activity * 10

                        if subscribers_estimate < 130:
                            post['user']['subscribers'] = 130
                        else:
                            post['user']['subscribers'] = subscribers_estimate

                        # More research needs to be done into making a more accurate
                        # multiplier: what is the rough percentage of someone's total
                        # friends who actually participate on their wall on a regular
                        # basis? We can only make our best consistent guess, as Facebook
                        # does not tell us how many friends someone has; we can only
                        # estimate from activity.
                        queue.put(post)

            sleep(2)  # time between requests; effectively 2 seconds * number of queries
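# -- Illustrative sketch (not part of kral): worked example of the subscriber
# heuristic above, with made-up numbers. A profile whose recent feed shows
# 8 comments and 5 likes has activity 13, so the estimate is 13 * 10 = 130,
# which is exactly the floor value applied above.
activity = 8 + 5
subscribers = activity * 10 if activity * 10 >= 130 else 130
assert subscribers == 130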