def facebook_post(item, query, **kwargs):
    """Normalize a raw Facebook Graph API post and enqueue it.

    Parameters
    ----------
    item : dict
        One post as returned by the Graph API (``from``, ``message``,
        ``created_time``, optionally ``to`` / ``likes`` / ``application``).
    query : str
        Queue name handed through to ``push_data``.
    **kwargs
        Forwarded to the task logger factory.

    Posts without a ``message`` field are silently skipped.
    """
    logger = facebook_post.get_logger(**kwargs)
    time_format = "%Y-%m-%dT%H:%M:%S+0000"
    # 'message' is absent on some item types; skip those early.
    # (was item.has_key('message') -- has_key was removed in Python 3)
    if 'message' not in item:
        return
    post_info = {
        "service": 'facebook',
        "user": {
            "name": item['from']['name'],
            "id": item['from']['id'],
            "avatar": "http://graph.facebook.com/%s/picture" % item['from']['id'],
        },
        "links": [],
        "id": item['id'],
        "text": item['message'],
        "date": str(datetime.datetime.strptime(item['created_time'], time_format)),
    }
    # Raw string avoids invalid-escape warnings; '/' needs no escaping in re,
    # and the redundant in-class escapes (\. \+ \@) are dropped -- the
    # character classes match exactly the same sets as before.
    url_regex = re.compile(
        r'(?:http|https|ftp)://[\w\-_]+(?:\.[\w\-_]+)+'
        r'(?:[\w\-.,@?^=%&:/~+#]*[\w\-@?^=%&/~+#])?'
    )
    for url in url_regex.findall(item['message']):
        post_info['links'].append({'href': url})
    if item.get('to'):
        post_info['to_users'] = item['to']['data']
    if item.get('likes'):
        post_info['likes'] = item['likes']['count']
    if item.get('application'):
        post_info['application'] = item['application']['name']
    push_data(post_info, queue=query)
    return
def run(self, item, query, **kwargs):
    """Normalize one identi.ca search result and enqueue it.

    Parameters
    ----------
    item : dict
        A single status from the identi.ca search API.
    query : str
        Queue name handed through to ``push_data``.
    **kwargs
        Forwarded to the task logger factory.
    """
    logger = self.get_logger(**kwargs)
    # identi.ca timestamps look like "Mon, 01 Jan 2011 00:00:00 +0000".
    time_format = "%a, %d %b %Y %H:%M:%S +0000"
    date = str(datetime.datetime.strptime(item['created_at'], time_format))
    post_info = {
        "service": "identi.ca",
        "user": {
            "name": item['from_user'],
            "id": item['from_user_id'],
        },
        "to_user": {
            "name": item['to_user'],
            "id": item['to_user_id'],
        },
        "text": item['text'],
        "date": date,
        "pictures": {
            # Single avatar thumbnail, keyed "0" to match the other services.
            "0": {
                "thumbnail": item['profile_image_url'],
            },
        },
        "source": item['source'],
        "id": item['id'],
    }
    # Removed stray debug print(post_info['id']) -- the logger below already
    # records that the post was handled.
    push_data(post_info, queue=query)
    logger.info("Saved Identica Post")
def buzz_post(item, query, **kwargs):
    """Normalize a Google Buzz activity and enqueue it.

    Parameters
    ----------
    item : dict
        One activity dict from the Buzz API.
    query : str
        Queue name handed through to ``push_data``.
    **kwargs
        Forwarded to the task logger factory.
    """
    logger = buzz_post.get_logger(**kwargs)
    time_format = "%Y-%m-%dT%H:%M:%S.%fZ"
    # FIXME should consider all pictures, not just the first attachment
    try:
        attachment_links = item["object"]["attachments"][0]["links"]
        thumbnail = attachment_links["preview"][0]["href"]
        picture = attachment_links["enclosure"][0]["href"]
    except (KeyError, IndexError, TypeError):
        # No attachment, or an unexpected shape: fall back to empty strings.
        # Narrowed from the original bare `except:`, which also hid real bugs
        # (NameError, KeyboardInterrupt, ...).
        picture = ""
        thumbnail = ""
    post_info = {
        "service": "buzz",
        "user": {
            "name": item["actor"]["name"],
            # The original reuses the display name as the id here.
            "id": item["actor"]["name"],
            "avatar": item["actor"]["thumbnailUrl"],
            "source": item["actor"]["profileUrl"],
        },
        "pictures": {
            # hard-coding for only one picture. See above FIXME
            "0": {"picture": picture, "thumbnail": thumbnail}
        },
        # Keep only the trailing segment of the colon-separated Buzz id.
        "id": item["id"].split(":")[3],
        "date": str(datetime.datetime.strptime(item["published"], time_format)),
        "source": item["object"]["links"]["alternate"][0]["href"],
        "text": item["object"]["content"],
    }
    push_data(post_info, queue=query)
    logger.info("Saved Post/User")
def twitter_stream_tweet(data, queries, **kwargs):
    """Parse one tweet from the Twitter streaming API and route it.

    Parameters
    ----------
    data : str
        Raw JSON payload for a single tweet.
    queries : list of str
        Tracked terms; the tweet is pushed to every query's queue whose
        underscore-stripped, lower-cased term occurs in the tweet text.
    **kwargs
        Forwarded to the task logger factory.
    """
    logger = twitter_stream_tweet.get_logger(**kwargs)
    content = json.loads(data)
    time_format = "%a %b %d %H:%M:%S +0000 %Y"
    user = content['user']
    post_info = {
        'service': 'twitter',
        'user': {
            'id': user['id_str'],
            'utc': user['utc_offset'],
            'name': user['screen_name'],
            'description': user['description'],
            'location': user['location'],
            'avatar': user['profile_image_url'],
            'subscribers': user['followers_count'],
            'subscriptions': user['friends_count'],
            'website': user['url'],
            'language': user['lang'],
        },
        'links': [{'href': u.get('url')} for u in content['entities']['urls']],
        'id': content['id'],
        'application': content['source'],
        'date': str(datetime.datetime.strptime(content['created_at'], time_format)),
        'text': content['text'],
        'geo': content['coordinates'],
    }
    # Hoisted out of the loop: lower-case the tweet text once, not per query.
    text_lower = content['text'].lower()
    for query in queries:
        # Queue names drop underscores (e.g. term "foo_bar" -> queue "foobar").
        ns_query = query.lower().replace('_', '')
        if ns_query in text_lower:
            push_data(post_info, queue=ns_query)
def youtube_video(item, query, **kwargs):
    """Normalize a YouTube GData feed entry and enqueue it.

    Parameters
    ----------
    item : dict
        One entry from the GData JSON feed.
    query : str
        Queue name handed through to ``push_data``.
    **kwargs
        Forwarded to the task logger factory.

    Entries without a ``title`` field are silently skipped.
    """
    logger = youtube_video.get_logger(**kwargs)
    # (was item.has_key('title') -- has_key was removed in Python 3)
    if 'title' not in item:
        return
    media = item['media$group']
    video_id = media['yt$videoid']['$t']
    post_info = {
        "service": 'youtube',
        "id": video_id,
        "date": media['yt$uploaded']['$t'],
        "user": item['author'][0]["name"]['$t'],
        "source": item['link'][1]['href'],
        "text": item["title"]['$t'],
        # 'media$keywords' may be present without a '$t' payload.
        "keywords": media['media$keywords'].get('$t', ''),
        "description": media['media$description']['$t'],
        "thumbnail": "http://i.ytimg.com/vi/%s/hqdefault.jpg" % video_id,
        "duration": media['yt$duration']['seconds'],
    }
    push_data(post_info, queue=query)
    logger.info("Saved Post/User")
def twitter_feed_tweet(item, query, **kwargs):
    """Normalize one tweet from the Twitter search API and enqueue it.

    Parameters
    ----------
    item : dict
        A single search result.
    query : str
        Queue name handed through to ``push_data``.
    **kwargs
        Unused here; kept for task-signature compatibility.

    Tweets without a ``text`` field are silently skipped.
    """
    # Drop-in replacement for the rfc822 module (removed in Python 3);
    # email.utils provides the same parsedate_tz/mktime_tz functions.
    from email.utils import mktime_tz, parsedate_tz

    # (was item.has_key('text') -- has_key was removed in Python 3)
    if 'text' not in item:
        return
    created = datetime.datetime.fromtimestamp(
        mktime_tz(parsedate_tz(item['created_at']))
    )
    post_info = {
        "service": 'twitter',
        "user": {
            "name": item['from_user'],
            "id": item['from_user_id_str'],
            'avatar': item['profile_image_url'],
        },
        "links": [],
        "id": item['id_str'],
        "text": item['text'],
        "source": item['source'],
        "date": str(created),
    }
    # Raw string avoids invalid-escape warnings; '/' needs no escaping in re,
    # and redundant in-class escapes are dropped -- same match behavior.
    url_regex = re.compile(
        r'(?:http|https|ftp)://[\w\-_]+(?:\.[\w\-_]+)+'
        r'(?:[\w\-.,@?^=%&:/~+#]*[\w\-@?^=%&/~+#])?'
    )
    for url in url_regex.findall(item['text']):
        post_info['links'].append({'href': url})
    push_data(post_info, queue=query)
def flickr_photo(photo_info, query, **kwargs):
    """Normalize one Flickr photo record and enqueue it.

    Parameters
    ----------
    photo_info : dict
        A photo dict from the Flickr API; must carry the extras
        farm/server/id/secret/owner/ownername/dateupload/title.
        Mutated in place: a 'thumbnail' key is added.
    query : str
        Queue name handed through to ``push_data``.
    **kwargs
        Forwarded to the task logger factory.
    """
    logger = flickr_photo.get_logger(**kwargs)
    # NOTE(review): user_info enrichment (avatar, photo counts, profile and
    # photos URLs, a flickr.com permalink) was commented out upstream; only
    # fields already present on photo_info are used here.
    photo_info['thumbnail'] = (
        "http://farm{farm}.static.flickr.com/{server}/{id}_{secret}_m.jpg"
        .format(**photo_info)
    )
    post_info = {
        "service": 'flickr',
        "id": photo_info['id'],
        "date": photo_info['dateupload'],
        "user": {
            "id": photo_info['owner'],
            "name": photo_info['ownername'],
        },
        "text": photo_info["title"],
        "thumbnail": photo_info['thumbnail'],
    }
    push_data(post_info, queue=query)
    # Log after the push so the message reflects completed work -- the other
    # handlers in this file log last; the original logged before pushing.
    logger.info("Saved Post/User")
for link in links: if link["href"].decode("utf8") == url.decode("utf8"): link["count"] += 1 if link["count"] > 1: if title: link["title"] = title else: url_title.delay(url) new_link = True post_info = link if new_link == False: post_info = {"service": "links", "href": url, "count": 1, "title": title} links.append(post_info) links = sorted(links, key=lambda link: link["count"], reverse=True) cache.set(cache_name, pickle.dumps(links), 31556926) push_data(post_info, queue=query) else: url_expand.delay(current_url, query, n) @task def url_title(url, **kwargs): cache_name = base64.b64encode(url)[:250] httprequest = urllib2.Request(url) try: data = urllib2.urlopen(httprequest) except urllib2.HTTPError: data = None if data: for line in data: if "<title>" in line: