Example 1
def stream(queries, queue, settings):
    while True:
        for query in queries:
            time.sleep(1)
            url = "https://graph.facebook.com/search?q=%s&type=post&limit=25&access_token=%s"
            url = url % (urllib.quote(query), settings['FACEBOOK_API_KEY'])
            data = fetch_json('facebook',url)
            if data:
                items = data['data']
                for item in items:
                    if 'message' in item:
                        post = {
                            "service" : 'facebook',
                            "query": query,
                            "user" : {
                                "name": item['from'].get('name'),
                                "id": item['from']['id'],
                            },
                            "links" : [],
                            "id" : item['id'],
                            "text" : item['message'],
                            "date": str(datetime.datetime.strptime(item['created_time'], settings['TIME_FORMAT'])),
                        }
                        url_regex = re.compile('(?:http|https|ftp):\/\/[\w\-_]+(?:\.[\w\-_]+)+(?:[\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?')
                        for url in url_regex.findall(item['message']):
                            post['links'].append({ 'href' : url })
                        post['user']['avatar'] = "http://graph.facebook.com/%s/picture" % item['from']['id']
                        if 'to' in item:
                            post['to_users'] = item['to']['data']
                        if 'likes' in item:
                            post['likes'] = item['likes']['count']
                        queue.put(post)
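All of these stream() examples lean on helpers from the kral project that are not shown in the snippets (fetch_json in particular, plus the usual time, datetime, re, urllib and json imports). As a rough sketch of how such a producer might be driven, assuming a plain thread and a standard-library Queue (the settings keys below are placeholders inferred from Example 1, not the project's real configuration):

# Hypothetical driver for the stream() examples above. fetch_json and the
# exact settings layout belong to kral and are not shown here; the values
# below are placeholders.
import threading
import Queue  # named "queue" on Python 3

posts = Queue.Queue()
settings = {
    'FACEBOOK_API_KEY': 'your-access-token',     # placeholder
    'TIME_FORMAT': '%Y-%m-%dT%H:%M:%S+0000',     # assumed Graph API timestamp format
}

producer = threading.Thread(target=stream, args=(['python'], posts, settings))
producer.daemon = True
producer.start()

# drain posts as the producer emits them
while True:
    post = posts.get()
    print post['user']['name'], post['text'][:60]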
Example 2
def stream(queries, queue, settings, kral_start_time):
   
    def get_access_token():

        url_args = {
            'client_id' : settings.get('Facebook','app_id'),
            'client_secret' : settings.get('Facebook','app_secret'),
            'grant_type' : 'client_credentials'
        }
        url = 'https://graph.facebook.com/oauth/access_token?%s' % urllib.urlencode(url_args)

        access_token = urllib2.urlopen(url).read().split('=')[1]

        return access_token

    access_token = get_access_token()

    #keep a store of each query's previous "since" value so we only
    #pick up new items in the feed on subsequent requests
    sinces = {}
   
    user_agent = settings.get('DEFAULT', 'user_agent', '')
    
    while True:
        
        for query in queries:
            
            #https://developers.facebook.com/docs/reference/api/batch/
            #do batch requests for facebook, currently limited to 20
            url_args = {
                'access_token' : access_token,
                'batch': [
                    {
                        'method': "GET",
                        'name' : "get-user-ids",
                        "relative_url": "search?q=%s&type=post&limit=20" % urllib.quote(query),
                        "omit_response_on_success": 0,
                    },
                    {
                        'method':'GET',
                        'relative_url':'/feed/?ids={result=get-user-ids:$.data.*.from.id}',
                    }
                ]
            }

            #if we have stored a "since" for this query and it's newer than when we started,
            #use that timestamp instead; otherwise use the time we started
            if query in sinces and sinces[query] > kral_start_time:
                since = sinces[query]
            else:
                since = kral_start_time
    
            #set the since to retrieve new posts
            url_args['batch'][0]['relative_url'] = "%s&since=%s" % (url_args['batch'][0]['relative_url'], since)

            url = 'https://graph.facebook.com'
            request = urllib2.Request(url)
            request.add_data(urllib.urlencode(url_args))
            
            if user_agent:
                request.add_header('User-agent', user_agent)

            response = fetch_json(request)
           
            if not response:
                sleep(5)
                break

            posts, profiles = response

            if posts and profiles:
                
                decoded_posts = json.loads(posts['body'])
                decoded_profiles = json.loads(profiles['body'])

                if not decoded_posts['data']:
                    sleep(2) #don't process anything if we have no data 
                    continue

                #get the since value from the previous url
                if 'paging' in decoded_posts and 'previous' in decoded_posts['paging']:
                    parsed_paging_data = urlparse.parse_qs(decoded_posts['paging']['previous'])
                    previous_since = int(parsed_paging_data['since'][0])
                    sinces[query] = previous_since

                items = decoded_posts['data']

                for item in items:
                   
                    created_time = int(time.mktime(time.strptime(item['created_time'],'%Y-%m-%dT%H:%M:%S+0000')))
                   
                    #only process if these posts are in fact new
                    if created_time >= since:

                        if 'message' in item:
                            
                            post = {
                                "service" : 'facebook',
                                "query": query,
                                "user" : {
                                    "name": item['from'].get('name'),
                                    "id": item['from']['id'],
                                    "subscribers" : '0'
                                },
                                "links" : [],
                                "id" : item['id'],
                                "text" : item['message'],
                                "date": created_time, 
                            }
                            
                            url_regex = re.compile('(?:http|https|ftp):\/\/[\w\-_]+(?:\.[\w\-_]+)+(?:[\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?')
                            
                            for url in url_regex.findall(item['message']):
                                post['links'].append({ 'href' : url })
                            post['user']['avatar'] = "http://graph.facebook.com/%s/picture" % item['from']['id']
                            
                            if 'to' in item:
                                post['to_users'] = item['to']['data']
                            
                            if 'likes' in item:
                                post['likes'] = item['likes']['count']
                            
                            subscribers_estimate = 0
                            
                            if item['from']['id'] in decoded_profiles:
                                activity = 0
                                
                                for profile_item in decoded_profiles[item['from']['id']]['data']:
                                    activity += profile_item['comments']['count']
                                    if 'likes' in profile_item:
                                        activity += profile_item['likes']['count']
                                
                                subscribers_estimate = activity * 10
                            
                            if subscribers_estimate < 130:
                                post['user']['subscribers'] = 130
                            else:
                                post['user']['subscribers'] = subscribers_estimate
                                # More research needs to be done into making a more accurate multiplier:
                                # what is the rough percentage of total friends someone has vs. how many
                                # actually participate on their wall on a regular basis?
                                # We can only make our best consistent guess, as Facebook does not tell us
                                # how many friends someone has. We can only estimate by activity.
                            queue.put(post)

                
            sleep(2) # time between requests; in practice the delay per cycle is this * num_queries
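One caveat with the batch call above: the Graph API documents the batch parameter as a JSON-encoded array, while urllib.urlencode(url_args) serializes the Python list via str(), producing single-quoted repr output. If that ever stops being accepted, a minimal adjustment, assuming the same url_args structure as in this example, would be to JSON-encode the batch before building the form body:

# Sketch only (not from kral): JSON-encode the batch list before urlencoding,
# keeping the rest of the request the same as in the example above.
import json
import urllib

access_token = 'app-access-token'   # placeholder
query = 'python'                    # placeholder

batch = [
    {'method': 'GET', 'name': 'get-user-ids',
     'relative_url': 'search?q=%s&type=post&limit=20' % urllib.quote(query),
     'omit_response_on_success': 0},
    {'method': 'GET',
     'relative_url': '/feed/?ids={result=get-user-ids:$.data.*.from.id}'},
]
body = urllib.urlencode({'access_token': access_token, 'batch': json.dumps(batch)})
# request.add_data(body) would then POST the batch exactly as before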
Example 3
File: reddit.py Project: d1on/kral
def stream(queries, queue, settings, kral_start_time):

    api_url = "http://www.reddit.com/search.json?"

    prev_items = defaultdict(list)

    user_agent = settings.get('DEFAULT', 'user_agent', '')

    while True:

        for query in queries:
           
            p = {
                'q' : query,
                'sort' : settings.get('Reddit', 'orderby', 'relevance'), 
            }
            
            url = api_url + urllib.urlencode(p)
        
            request = urllib2.Request(url)

            if user_agent:
                request.add_header('User-agent', user_agent)

            response = fetch_json(request)
            
            if not response:
                sleep(5)
                break

            if 'data' in response and 'children' in response['data']:

                #api returns back 25 items
                for item in response['data']['children']:
                    
                    item_id =  item['data']['id']
                    
                    #if we've seen this item in the last 50 items skip it
                    if item_id not in prev_items[query]:   
                    
                        post = {
                            'service' : 'reddit',
                            'query' : query,
                            'user' : {
                                'name' : item['data']['author'],
                            },
                            'id' : item_id,
                            'date' : item['data']['created_utc'],
                            'text' : item['data']['title'],
                            'source' : item['data']['url'],
                            'likes': item['data'].get('likes', 0),
                            'dislikes': item['data'].get('downs', 0),
                            'comments': item['data'].get('num_comments', 0),
                            'favorites': item['data'].get('saved', 0),
                        }
                         
                        queue.put(post)
                        
                        prev_items[query].append(item_id)
            
            #keep the dupe buffer limited to the 50 most recent items
            #TODO: look into using deque with maxlen
            prev_items[query] = prev_items[query][-50:]
        
        sleep(30)
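The TODO above points at collections.deque: a deque with maxlen=50 keeps the most recent ids per query and evicts the oldest automatically, which is exactly what the dupe buffer is for. A minimal sketch of that bookkeeping (the seen_before helper is hypothetical, not part of reddit.py):

# Sketch of the deque-based dupe buffer suggested by the TODO above;
# seen_before is a hypothetical helper, not part of kral.
from collections import defaultdict, deque

prev_items = defaultdict(lambda: deque(maxlen=50))

def seen_before(query, item_id):
    """Return True if item_id is in the last 50 ids for query, else record it."""
    if item_id in prev_items[query]:
        return True
    prev_items[query].append(item_id)  # oldest id drops off once 50 are stored
    return False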
Example 4
def stream(queries, queue, kral_start_time):

    api_url = "http://www.reddit.com/search.json?"

    prev_items = defaultdict(list)

    user_agent = config.USER_AGENT

    while True:

        for query in queries:

            p = {
                'q': query,
                'sort': config.REDDIT['orderby'] or 'relevance',
            }

            url = api_url + urllib.urlencode(p)

            request = urllib2.Request(url)

            if user_agent:
                request.add_header('User-agent', user_agent)

            response = fetch_json(request)

            if not response:
                sleep(5)
                break

            if 'data' in response and 'children' in response['data']:

                #api returns back 25 items
                for item in response['data']['children']:

                    item_id = item['data']['id']

                    #if we've seen this item in the last 50 items skip it
                    if item_id not in prev_items[query]:

                        post = {
                            'service': 'reddit',
                            'query': query,
                            'user': {
                                'name': item['data']['author'],
                            },
                            'id': item_id,
                            'date': item['data']['created_utc'],
                            'text': item['data']['title'],
                            'source': item['data']['url'],
                            'likes': item['data'].get('likes', 0),
                            'dislikes': item['data'].get('downs', 0),
                            'comments': item['data'].get('num_comments', 0),
                            'favorites': item['data'].get('saved', 0),
                        }

                        queue.put(post)

                        prev_items[query].append(item_id)

            #keep the dupe buffer limited to the 50 most recent items
            #TODO: look into using deque with maxlen
            prev_items[query] = prev_items[query][-50:]

        sleep(30)
Example 5
def stream(queries, queue, kral_start_time):

    print('-' * 50)
    print str(config)

    mode = config.YOUTUBE['mode'] or 'most_popular'

    api_url = "http://gdata.youtube.com/feeds/api/standardfeeds/%s?" % mode

    prev_ids = defaultdict(list)

    while True:
        for query in queries:

            p = {
                'q': query,
                'orderby': config.YOUTUBE['orderby'] or 'published',
                'max-results': config.YOUTUBE['maxresults'] or 25,
                'v': 2,
                'alt': 'jsonc',
                'format': 5,
            }

            #time is only supported in these standard video feeds
            if mode in [
                    'top_rated',
                    'top_favorites',
                    'most_viewed',
                    'most_popular',
                    'most_discussed',
                    'most_responded',
            ]:
                p['time'] = config.YOUTUBE['time'] or 'today'

            url = api_url + urllib.urlencode(p)

            request = urllib2.Request(url)

            request.add_header('User-agent', config.USER_AGENT)

            response = fetch_json(request)

            if not response:
                sleep(5)
                break

            if 'data' in response and 'items' in response['data']:

                entries = response['data']['items']

                for entry in entries:
                    #['uploaded',
                    #'category',
                    #'updated',
                    #'rating',
                    #'description',
                    #'title',
                    #'tags',
                    #'thumbnail',
                    #'content',
                    #'player',
                    #'accessControl',
                    #'uploader',
                    #'ratingCount',
                    #'duration',
                    #'aspectRatio',
                    #'likeCount',
                    #'favoriteCount',
                    #'id',
                    #'viewCount']

                    entry_id = entry['id']

                    uploader = entry['uploader']

                    profile_url = "http://youtube.com/" + uploader

                    if entry_id not in prev_ids[query]:  #if we've already seen this id, skip it

                        post = {
                            "service": "youtube",
                            "id": entry_id,
                            "query": query,
                            "date": entry['uploaded'],
                            "user": {
                                "name": uploader,
                                "profile": profile_url,
                            },
                            "source": entry['player']['default'],
                            "text": entry['title'],
                            "description": entry.get('description', ''),
                            "category": entry['category'],
                            "keywords": entry.get('tags', ''),
                            "duration": entry['duration'],
                            'favorites': entry.get('favoriteCount', 0),
                            'views': entry.get('viewCount', 0),
                            'likes': entry.get('likeCount', 0),
                        }
                        #ratingCount – The total number of voters who have rated the video using either rating system.
                        #The number of voters who disliked the video can be calculated by subtracting the likeCount from the ratingCount.
                        post['dislikes'] = int(entry.get('ratingCount', 0)) - int(post['likes'])

                        prev_ids[query].insert(0, entry_id)  #add the entry id to the previous ids for this query

                        queue.put(post)

            #use 50 item buffer for dupes
            #TODO: look into deque
            prev_ids[query] = prev_ids[query][:50]

            sleep(15)
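As the comments above explain, the jsonc feed only exposes likeCount and ratingCount, so dislikes fall out by subtraction; with a ratingCount of 250 and a likeCount of 230, for instance, the post records 20 dislikes. A tiny check with made-up numbers:

# Illustrative only: the dislikes derivation used above, with made-up values.
entry = {'ratingCount': 250, 'likeCount': 230}

likes = int(entry.get('likeCount', 0))
dislikes = int(entry.get('ratingCount', 0)) - likes

assert likes == 230 and dislikes == 20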
Example 6
def stream(queries, queue, kral_start_time):

    print('-'*50)
    print str(config)

    mode = config.YOUTUBE['mode'] or 'most_popular'

    api_url = "http://gdata.youtube.com/feeds/api/standardfeeds/%s?" % mode 
    
    prev_ids = defaultdict(list)

    while True:
        for query in queries:

            p = {
                'q': query,
                'orderby': config.YOUTUBE['orderby'] or 'published',
                'max-results': config.YOUTUBE['maxresults'] or 25,
                'v': 2, 
                'alt': 'jsonc',
                'format': 5,
            }    

            #time is only supported in these standard video feeds
            if mode in ['top_rated', 'top_favorites', 'most_viewed', 
                    'most_popular', 'most_discussed', 'most_responded',]:
                p['time'] = config.YOUTUBE['time'] or 'today'

            url  =  api_url + urllib.urlencode(p)
            
            request = urllib2.Request(url)
    
            request.add_header('User-agent', config.USER_AGENT)

            response = fetch_json(request)
            
            if not response:
                sleep(5)
                break

            if 'data' in response and 'items' in response['data']:
                
                entries = response['data']['items']
                
                for entry in entries:
                    #['uploaded',
                    #'category', 
                    #'updated',
                    #'rating',
                    #'description',
                    #'title',
                    #'tags',
                    #'thumbnail',
                    #'content', 
                    #'player',
                    #'accessControl',
                    #'uploader',
                    #'ratingCount',
                    #'duration',
                    #'aspectRatio', 
                    #'likeCount',
                    #'favoriteCount',
                    #'id', 
                    #'viewCount']
                    
                    entry_id =  entry['id']
                    
                    uploader = entry['uploader']

                    profile_url = "http://youtube.com/" + uploader

                    if entry_id not in prev_ids[query]: #if we've already seen this id skip it

                        post = {
                            "service"     : "youtube",
                            "id"          : entry_id, 
                            "query"       : query,
                            "date"        : entry['uploaded'],
                            "user"        : {
                                                "name"    : uploader,
                                                "profile" : profile_url,
                                            },
                            "source"      : entry['player']['default'],
                            "text"        : entry['title'],
                            "description" : entry.get('description', ''),
                            "category"    : entry['category'],
                            "keywords"    : entry.get('tags', ''),
                            "duration"    : entry['duration'], 
                            'favorites'   : entry.get('favoriteCount', 0),
                            'views'       : entry.get('viewCount', 0),
                            'likes'       : entry.get('likeCount', 0),
                        }
                        #ratingCount – The total number of voters who have rated the video using either rating system.
                        #The number of voters who disliked the video can be calculated by subtracting the likeCount from the ratingCount.
                        post['dislikes'] = int(entry.get('ratingCount', 0)) - int(post['likes'])
                        
                        prev_ids[query].insert(0, entry_id) #add the entry ids to previous ids for query

                        queue.put(post)
                
            #use 50 item buffer for dupes
            #TODO: look into deque
            prev_ids[query] = prev_ids[query][:50] 
            
            sleep(15)
Example 7
def stream(queries, queue, kral_start_time):
    def get_access_token():

        url_args = {
            'client_id': config.FACEBOOK['app_id'],
            'client_secret': config.FACEBOOK['app_secret'],
            'grant_type': 'client_credentials'
        }
        url = 'https://graph.facebook.com/oauth/access_token?%s' % urllib.urlencode(url_args)

        access_token = urllib2.urlopen(url).read().split('=')[1]

        return access_token

    access_token = get_access_token()

    #keep a store of each query's previous "since" value so we only
    #pick up new items in the feed on subsequent requests
    sinces = {}

    user_agent = config.USER_AGENT
    print user_agent

    while True:
        print "Starting facebook"
        print queries
        for query in queries:

            #https://developers.facebook.com/docs/reference/api/batch/
            #do batch requests for facebook, currently limited to 20
            url_args = {
                'access_token': access_token,
                'batch': [
                    {
                        'method': "GET",
                        'name': "get-user-ids",
                        "relative_url": "search?q=%s&type=post&limit=20" % urllib.quote(query),
                        "omit_response_on_success": 0,
                    },
                    {
                        'method': 'GET',
                        'relative_url': '/feed/?ids={result=get-user-ids:$.data.*.from.id}',
                    }
                ]
            }

            #if we have stored a "since" for this query and it's newer than when we started,
            #use that timestamp instead; otherwise use the time we started
            if query in sinces and sinces[query] > kral_start_time:
                since = sinces[query]
            else:
                since = kral_start_time

            #set the since to retrieve new posts
            url_args['batch'][0]['relative_url'] = "%s&since=%s" % (url_args['batch'][0]['relative_url'], since)

            url = 'https://graph.facebook.com'
            request = urllib2.Request(url)
            request.add_data(urllib.urlencode(url_args))

            if user_agent:
                request.add_header('User-agent', user_agent)

            response = fetch_json(request)

            if not response:
                sleep(5)
                break

            posts, profiles = response
            print 'Posts', posts
            print 'Profiles', profiles
            if posts and profiles:

                decoded_posts = json.loads(posts['body'])
                decoded_profiles = json.loads(profiles['body'])

                if not decoded_posts['data']:
                    sleep(2)  #don't process anything if we have no data
                    continue

                #get the since value from the previous url
                if 'paging' in decoded_posts and 'previous' in decoded_posts['paging']:
                    parsed_paging_data = urlparse.parse_qs(decoded_posts['paging']['previous'])
                    previous_since = int(parsed_paging_data['since'][0])
                    sinces[query] = previous_since

                items = decoded_posts['data']

                for item in items:

                    created_time = int(time.mktime(time.strptime(item['created_time'], '%Y-%m-%dT%H:%M:%S+0000')))

                    #only process if these posts are in fact new
                    if created_time >= since:

                        if 'message' in item:

                            post = {
                                "service": 'facebook',
                                "query": query,
                                "user": {
                                    "name": item['from'].get('name'),
                                    "id": item['from']['id'],
                                    "subscribers": '0'
                                },
                                "links": [],
                                "id": item['id'],
                                "text": item['message'],
                                "date": created_time,
                            }

                            url_regex = re.compile('(?:http|https|ftp):\/\/[\w\-_]+(?:\.[\w\-_]+)+(?:[\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?')

                            for url in url_regex.findall(item['message']):
                                post['links'].append({'href': url})
                            post['user']['avatar'] = "http://graph.facebook.com/%s/picture" % item['from']['id']

                            if 'to' in item:
                                post['to_users'] = item['to']['data']

                            if 'likes' in item:
                                post['likes'] = item['likes']['count']

                            subscribers_estimate = 0

                            if item['from']['id'] in decoded_profiles:
                                activity = 0

                                for profile_item in decoded_profiles[item['from']['id']]['data']:
                                    activity += profile_item['comments']['count']
                                    if 'likes' in profile_item:
                                        activity += profile_item['likes']['count']

                                subscribers_estimate = activity * 10

                            if subscribers_estimate < 130:
                                post['user']['subscribers'] = 130
                            else:
                                post['user']['subscribers'] = subscribers_estimate
                                # More research needs to be done into making a more accurate multiplier:
                                # what is the rough percentage of total friends someone has vs. how many
                                # actually participate on their wall on a regular basis?
                                # We can only make our best consistent guess, as Facebook does not tell us
                                # how many friends someone has. We can only estimate by activity.
                            queue.put(post)

            sleep(2)  # time between requests; in practice the delay per cycle is this * num_queries
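The subscribers value in the Facebook examples is a heuristic rather than real data: comment and like counts across the author's fetched feed are summed, multiplied by 10, and floored at 130. Factored out on its own it would look roughly like this (estimate_subscribers is a hypothetical helper name; the multiplier and floor are the constants used above):

# Sketch of the subscriber-estimate heuristic from the Facebook examples;
# estimate_subscribers is a hypothetical helper, not part of kral.
def estimate_subscribers(profile_feed, multiplier=10, floor=130):
    activity = 0
    for profile_item in profile_feed.get('data', []):
        activity += profile_item.get('comments', {}).get('count', 0)
        if 'likes' in profile_item:
            activity += profile_item['likes']['count']
    return max(activity * multiplier, floor)

# e.g. a feed whose posts drew 12 comments and 30 likes in total
# yields an estimate of max(42 * 10, 130) == 420 subscribers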