def rebuild_tweets():
    """Normalize the 'urls' and 'hashtags' fields of every stored tweet.

    Each field was persisted as a Python-literal string (repr of a
    list/dict); re-parse it with ast.literal_eval and re-store it as JSON.
    Unparseable values are logged and blanked out.  The cleaned rows are
    written to 'twitter_tweets_cleaned'.
    """
    tweets = list(datastore.load('twitter_tweets'))
    for tweet in tweets:
        # Both fields get identical treatment; fold the duplicated logic.
        for field in ('urls', 'hashtags'):
            value = tweet[field]
            if value is not None and value != '':
                try:
                    tweet[field] = json.dumps(ast.literal_eval(value))
                except Exception:
                    # Best-effort: log the bad literal and blank the field.
                    traceback.print_exc()
                    tweet[field] = ''
    # The dicts were updated in place; no separate copy list is needed.
    datastore.store(tweets, 'twitter_tweets_cleaned', '')
def get_followings():
    """Collect follower edges for users 500..750 of 'khiem.twitter_users'.

    For each user in the slice, fetch that user's follower ids and keep
    only edges whose follower is also a known user.  Edges are stored as
    {"user_id": follower, "friend_id": followed} in
    'twitter_followings_500_750'.
    """
    users = list(datastore.load('khiem.twitter_users'))
    # BUG FIX: the original `range(0, len(users) - 1)` skipped the last
    # user's id.  A set also makes the membership test below O(1).
    user_ids = set(u.id for u in users)
    followings = []
    for i in range(500, 751):
        user = users[i]
        try:
            # NOTE(review): `user["id"]` here vs `user.id` below mirrors the
            # original mixed access style -- the user objects apparently
            # support both; confirm against the datastore model.
            follower_ids = api.GetFollowerIDs(userid=user["id"])["ids"]
            print(str(i) + ": Found " + str(len(follower_ids)) +
                  " followers of " + user.name)
            for follower_id in follower_ids:
                if follower_id in user_ids:
                    followings.append({"user_id": follower_id,
                                       "friend_id": user.id})
        except Exception as e:
            # Best-effort: one failed lookup must not abort the run.
            print(e)
            continue
    datastore.store(followings, 'twitter_followings_500_750',
                    'Twitter followings')
    print("Successfully persist followings: " + str(len(followings)) + " items")
def get_followings():
    """Collect follower edges for users 500..750 of 'khiem.twitter_users'.

    Duplicate of the formatted variant above; kept in sync.  For each user
    in the slice, fetch follower ids and keep only edges whose follower is
    also a known user; store them in 'twitter_followings_500_750'.
    """
    users = list(datastore.load('khiem.twitter_users'))
    # BUG FIX: `range(0, len(users) - 1)` skipped the last user's id.
    # A set makes the membership test below O(1) instead of O(n).
    user_ids = set(u.id for u in users)
    followings = []
    for i in range(500, 751):
        user = users[i]
        try:
            follower_ids = api.GetFollowerIDs(userid=user["id"])["ids"]
            print(str(i) + ": Found " + str(len(follower_ids)) +
                  " followers of " + user.name)
            for follower_id in follower_ids:
                if follower_id in user_ids:
                    followings.append({"user_id": follower_id,
                                       "friend_id": user.id})
        except Exception as e:
            # Best-effort: one failed lookup must not abort the run.
            print(e)
            continue
    datastore.store(followings, 'twitter_followings_500_750',
                    'Twitter followings')
    print("Successfully persist followings: " + str(len(followings)) + " items")
def getTweets(start, stop):
    """Fetch top tweets for users[start:stop] of 'twitter_users_cleaned'.

    Each tweet is flattened to a fixed set of fields: missing
    retweet_count/favorited/truncated default to 0, other missing fields to
    ''; the nested 'user' entity is expanded into user_id /
    user_screen_name / user_name.  Results go to
    'twitter_tweets_cleaned_<start>_<stop-1>'.  At most 300 users per call.
    """
    tweet_entities = ["id", "truncated", "hashtags", "urls", "user_mentions",
                      "text", "source", "retweet_count", "created_at",
                      "user", "favorited"]
    users = list(datastore.load('twitter_users_cleaned'))
    total_users = len(users)
    if stop - start > 300:
        print("Max (stop-start) is 300")
        return
    users = users[start:stop]
    data = []
    index = start
    try:
        for u in users:
            print(str(index) + ": Getting tweets for user " + u['name'])
            try:
                tweets = getTopTweets(userid=int(u['id']))
                if tweets is not None:
                    print("Fetched " + str(len(tweets)) +
                          " tweets of user " + u['name'])
                    for t in tweets:
                        st = t.AsDict()
                        obj = {}
                        for prop in tweet_entities:
                            if prop not in st:
                                # Absent numeric/boolean entities default to
                                # 0, everything else to the empty string.
                                if prop in ("retweet_count", "favorited",
                                            "truncated"):
                                    obj[prop] = 0
                                else:
                                    obj[prop] = ''
                            elif prop == "user":
                                # Flatten the nested user entity.
                                user_obj = dict(st["user"])
                                obj['user_id'] = user_obj['id']
                                obj['user_screen_name'] = user_obj['screen_name']
                                obj['user_name'] = user_obj['name']
                            else:
                                obj[prop] = st[prop]
                        data.append(obj)
                else:
                    print("Empty tweets for user " + u['name'])
            except Exception:
                # One failing user must not abort the whole segment.
                print('Exception occurs when fetch tweets for user ' + u['name'])
                traceback.print_exc()
            index = index + 1
    except Exception:
        # Fatal error: log it and bail out WITHOUT storing a partial segment.
        traceback.print_exc()
        print(str(datetime.now()))
        return
    datastore.store(data,
                    'twitter_tweets_cleaned_' + str(start) + '_' + str(stop - 1),
                    '')
    print(str(datetime.now()) + ' Result at segment ' + str(start) + ' -> ' +
          str(stop - 1) + '/' + str(total_users) + ': Total ' +
          str(len(data)) + ' tweets')
def grabContent(start, stop):
    """Grab article content for every url in tweets [start, stop).

    Loads tweets from 'twitter_tweets', parses each tweet's 'urls' field
    (a Python-literal mapping url -> display_url), fetches page content via
    contentdetector.upgradeLink, and stores the collected articles in
    'twitter_articles_<start>_<stop-1>'.
    """
    if stop < 0 or start < 0 or stop < start:
        print("Invalid arguments")
        return
    articles = []
    tweets = list(datastore.load('twitter_tweets', start, stop - start))
    print("Loaded " + str(len(tweets)) + " tweets from `twitter_tweets` table, "
          + str(start) + " -> " + str(stop - 1))
    url_counter = 0
    # BUG FIX: the original incremented `index` at the bottom of the loop,
    # so the `continue` on a parse failure skipped it and the logged index
    # drifted.  enumerate keeps it in lockstep with the tweet position.
    for index, tweet in enumerate(tweets, start):
        tweet_id = tweet['id']
        print(str(index) + ": Processing tweet " + str(tweet_id))
        urls = tweet['urls']
        if urls is None or urls.strip() == '':
            continue
        try:
            urls = ast.literal_eval(urls)
        except Exception:
            # Unparseable urls field: skip this tweet.
            continue
        print("Found " + str(len(urls)) + " urls")
        for url in urls:
            url_counter += 1
            display_url = urls[url]
            print("Grabbing content from " + url + " ...")
            content = None
            try:
                content = contentdetector.upgradeLink(url)
            except Exception:
                print("Exception occurs when trying to grab content from " + url)
                traceback.print_exc()
            if content is not None and content.strip() != '':
                articles.append({"tweet_id": tweet_id,
                                 "user_id": tweet['user_id'],
                                 "user_screen_name": tweet['user_screen_name'],
                                 "text": tweet['text'],
                                 "url": url,
                                 "display_url": display_url,
                                 "content": content})
    print("Processed tweets " + str(start) + " -> " + str(stop - 1))
    print("Total urls processed: " + str(url_counter))
    print("Total articles grabbed: " + str(len(articles)))
    datastore.store(articles,
                    'twitter_articles_' + str(start) + '_' + str(stop - 1), '')
def rebuild_users_level1():
    """Re-fetch full profile data for the level-1 users (minus the top 3).

    Loads 'khiem.twitter_users_level1', drops the first three entries,
    bulk-looks the remainder up via the Twitter API, and stores the fresh
    profiles in 'twitter_users_level1_rebuild'.
    """
    users = list(datastore.load('khiem.twitter_users_level1'))
    users = users[3:]
    # Idiom: comprehensions instead of manual append loops.
    user_ids = [u["id"] for u in users]
    users = api.UsersLookup(user_id=user_ids)
    user_data = [u.AsDict() for u in users]
    print("Successfully retrieve " + str(len(users)) + " users info")
    datastore.store(user_data, 'twitter_users_level1_rebuild', '')
def getTweets(start, stop):
    """Fetch top tweets for users[start:stop] of 'twitter_users_cleaned'.

    Duplicate of the formatted variant above; kept in sync.  Each tweet is
    flattened to a fixed field set (missing retweet_count/favorited/
    truncated default to 0, other missing fields to ''; the 'user' entity
    expands to user_id / user_screen_name / user_name) and stored in
    'twitter_tweets_cleaned_<start>_<stop-1>'.  At most 300 users per call.
    """
    tweet_entities = ["id", "truncated", "hashtags", "urls", "user_mentions",
                      "text", "source", "retweet_count", "created_at",
                      "user", "favorited"]
    users = list(datastore.load('twitter_users_cleaned'))
    total_users = len(users)
    if stop - start > 300:
        print("Max (stop-start) is 300")
        return
    users = users[start:stop]
    data = []
    index = start
    try:
        for u in users:
            print(str(index) + ": Getting tweets for user " + u['name'])
            try:
                tweets = getTopTweets(userid=int(u['id']))
                if tweets is not None:
                    print("Fetched " + str(len(tweets)) +
                          " tweets of user " + u['name'])
                    for t in tweets:
                        st = t.AsDict()
                        obj = {}
                        for prop in tweet_entities:
                            if prop not in st:
                                # Absent numeric/boolean entities default to
                                # 0, everything else to the empty string.
                                if prop in ("retweet_count", "favorited",
                                            "truncated"):
                                    obj[prop] = 0
                                else:
                                    obj[prop] = ''
                            elif prop == "user":
                                # Flatten the nested user entity.
                                user_obj = dict(st["user"])
                                obj['user_id'] = user_obj['id']
                                obj['user_screen_name'] = user_obj['screen_name']
                                obj['user_name'] = user_obj['name']
                            else:
                                obj[prop] = st[prop]
                        data.append(obj)
                else:
                    print("Empty tweets for user " + u['name'])
            except Exception:
                # One failing user must not abort the whole segment.
                print('Exception occurs when fetch tweets for user ' + u['name'])
                traceback.print_exc()
            index = index + 1
    except Exception:
        # Fatal error: log it and bail out WITHOUT storing a partial segment.
        traceback.print_exc()
        print(str(datetime.now()))
        return
    datastore.store(data,
                    'twitter_tweets_cleaned_' + str(start) + '_' + str(stop - 1),
                    '')
    print(str(datetime.now()) + ' Result at segment ' + str(start) + ' -> ' +
          str(stop - 1) + '/' + str(total_users) + ': Total ' +
          str(len(data)) + ' tweets')
def build_graph_level3(start, stop):
    """Expand the follow graph one level from users_level2[start:stop].

    For each level-2 user in the slice, collect top friends (edges
    user -> friend) and top followers (edges follower -> user).  Newly
    discovered accounts are added to the level-3 user list exactly once.
    Results are stored per-segment so the crawl can be resumed.
    """
    if stop - start > 15:
        print("The [start, stop) range must not greater than 15, or (stop-start) <= 15")
        return
    users_l2 = list(datastore.load('khiem.twitter_users_level2'))[start:stop]
    seen_ids = set()   # set instead of list: O(1) dedup membership test
    followings = []
    users_l3 = []
    index = start
    try:
        for user in users_l2:
            print(str(index) + ": Find top friends for user " + user['name'])
            try:
                for friend in findTopFriends(user["id"]):
                    followings.append({"user_id": user["id"],
                                       "friend_id": friend.id})
                    if friend.id not in seen_ids:
                        seen_ids.add(friend.id)
                        users_l3.append(friend.AsDict())
            except Exception:
                print("Exception occurs when find top friends of user " + user['name'])
                traceback.print_exc()
            try:
                print(str(index) + ": Find top followers for user " + user['name'])
                for follower in findTopFollowers(user['id']):
                    followings.append({"user_id": follower.id,
                                       "friend_id": user['id']})
                    if follower.id not in seen_ids:
                        seen_ids.add(follower.id)
                        users_l3.append(follower.AsDict())
            except Exception:
                print("Exception occurs when find top followers of user " + user['name'])
                traceback.print_exc()
            index += 1
    except Exception:
        # Fatal error: log it and bail out WITHOUT storing a partial segment.
        traceback.print_exc()
        print(str(datetime.now()))
        return
    datastore.store(users_l3,
                    "twitter_users_level3_" + str(start) + "_" + str(stop - 1),
                    "")
    datastore.store(followings,
                    "twitter_followings_level3_" + str(start) + "_" + str(stop - 1),
                    "")
    print(str(datetime.now()) + ": Result at level 3, segment " + str(start) +
          "->" + str(stop - 1) + ": " + str(len(users_l3)) + " users, " +
          str(len(followings)) + " followings")
def main():
    """Fetch all public calendar feeds concurrently, merge them with the
    blog posts, and persist everything into the local datastore.
    """
    gcal_template = (
        "http://www.google.com/calendar/feeds/%s/public/basic?"
        "showdeleted=true&updated-min=2011-08-01T01:00:00-08:00&max-results=1000")
    threads = [FetchThread(gcal_template % cal.calendar_id)
               for cal in config.calendar_ids]
    # Idiom fix: explicit loops for side effects instead of map()
    # (map-for-side-effects also silently does nothing on Python 3,
    # where map is lazy).
    for thread in threads:
        thread.start()          # do work
    for thread in threads:
        thread.join()           # wait for work to end
    events = []
    for thread in threads:
        events.extend(gcal.parse_feed(thread.document))
    posts = blogger.GetPosts()
    store = datastore.load("database.db")
    store.update(events, posts)
    store.save()
    store.close()
def grabContent(start, stop):
    """Grab article content for every url in tweets [start, stop).

    Duplicate of the formatted variant above; kept in sync.  Parses each
    tweet's 'urls' field (a Python-literal mapping url -> display_url),
    fetches page content via contentdetector.upgradeLink, and stores the
    articles in 'twitter_articles_<start>_<stop-1>'.
    """
    if stop < 0 or start < 0 or stop < start:
        print("Invalid arguments")
        return
    articles = []
    tweets = list(datastore.load('twitter_tweets', start, stop - start))
    print("Loaded " + str(len(tweets)) + " tweets from `twitter_tweets` table, "
          + str(start) + " -> " + str(stop - 1))
    url_counter = 0
    # BUG FIX: the original incremented `index` at the bottom of the loop,
    # so the `continue` on a parse failure skipped it and the logged index
    # drifted.  enumerate keeps it in lockstep with the tweet position.
    for index, tweet in enumerate(tweets, start):
        tweet_id = tweet['id']
        print(str(index) + ": Processing tweet " + str(tweet_id))
        urls = tweet['urls']
        if urls is None or urls.strip() == '':
            continue
        try:
            urls = ast.literal_eval(urls)
        except Exception:
            # Unparseable urls field: skip this tweet.
            continue
        print("Found " + str(len(urls)) + " urls")
        for url in urls:
            url_counter += 1
            display_url = urls[url]
            print("Grabbing content from " + url + " ...")
            content = None
            try:
                content = contentdetector.upgradeLink(url)
            except Exception:
                print("Exception occurs when trying to grab content from " + url)
                traceback.print_exc()
            if content is not None and content.strip() != '':
                articles.append({"tweet_id": tweet_id,
                                 "user_id": tweet['user_id'],
                                 "user_screen_name": tweet['user_screen_name'],
                                 "text": tweet['text'],
                                 "url": url,
                                 "display_url": display_url,
                                 "content": content})
    print("Processed tweets " + str(start) + " -> " + str(stop - 1))
    print("Total urls processed: " + str(url_counter))
    print("Total articles grabbed: " + str(len(articles)))
    datastore.store(articles,
                    'twitter_articles_' + str(start) + '_' + str(stop - 1), '')
def _get_tweets():
    """Fetch up to 200 recent tweets each for users 300..500 and store them.

    Users come from 'khiem.twitter_users'; a failure for a single user is
    logged and skipped.  All tweets go to 'twitter_tweets_300_500'.
    """
    tweets = []
    users = list(datastore.load('khiem.twitter_users'))
    for i in range(300, 501):
        try:
            print(str(i) + ': Getting tweets from user ' + users[i]['name'])
            statuses = api.GetUserTimeline(screen_name=users[i]['screen_name'],
                                           count=200)
            for st in statuses:
                tweets.append(st.AsDict())
            print('Processed ' + str(len(statuses)) + ' tweets for user ' +
                  users[i]['name'])
        except Exception as e:
            # Best-effort: log and move on to the next user.
            print(e)
            print('Exception occurs when process user ' + users[i]['name'] +
                  '. Skip.')
            continue
    # NOTE(review): `i` here is the last processed index (500), not a count
    # of users -- the message is misleading but preserved.
    print('Storing tweets for ' + str(i) + ' users')
    # store all tweets
    datastore.store(tweets, 'twitter_tweets_300_500', 'Tweets')
def build_graph_level2():
    """Build level 2 of the follow graph from the level-1 users.

    Skips the first three level-1 entries (the top 3 users from level 0),
    finds each remaining user's top friends, records user -> friend edges,
    and stores the deduplicated friend profiles plus the edge list.
    """
    # get users from level1, except the top 3 users from level0
    users_level1 = list(datastore.load('khiem.twitter_users_level1'))[3:]
    users_l2 = []
    followings = []
    seen_ids = set()   # set instead of list: O(1) dedup membership test
    # for each user find top friends
    for user in users_level1:
        print("Find top friends of user " + user["name"])
        for friend in findTopFriends(user["id"]):
            followings.append({"user_id": user["id"], "friend_id": friend.id})
            if friend.id not in seen_ids:
                seen_ids.add(friend.id)
                users_l2.append(friend.AsDict())
    datastore.store(users_l2, "twitter_users_level2", "")
    datastore.store(followings, "twitter_followings_level2", "")
def _get_tweets():
    """Fetch up to 200 recent tweets each for users 300..500 and store them.

    Duplicate of the formatted variant above; kept in sync.  A failure for
    a single user is logged and skipped.  All tweets go to
    'twitter_tweets_300_500'.
    """
    tweets = []
    users = list(datastore.load('khiem.twitter_users'))
    for i in range(300, 501):
        try:
            print(str(i) + ': Getting tweets from user ' + users[i]['name'])
            statuses = api.GetUserTimeline(screen_name=users[i]['screen_name'],
                                           count=200)
            for st in statuses:
                tweets.append(st.AsDict())
            print('Processed ' + str(len(statuses)) + ' tweets for user ' +
                  users[i]['name'])
        except Exception as e:
            # Best-effort: log and move on to the next user.
            print(e)
            print('Exception occurs when process user ' + users[i]['name'] +
                  '. Skip.')
            continue
    # NOTE(review): `i` here is the last processed index (500), not a count
    # of users -- the message is misleading but preserved.
    print('Storing tweets for ' + str(i) + ' users')
    # store all tweets
    datastore.store(tweets, 'twitter_tweets_300_500', 'Tweets')
def main(pretend=False, now=None):
    """Regenerate the static site from fresh calendar and blog data.

    Args:
        pretend: when True, show a diff of the generated files instead of
            committing them.
        now: reference time for the upcoming-events / what's-new queries;
            defaults to the current time.  BUG FIX: the original default
            `now=datetime.datetime.now()` was evaluated once at import
            time, so every later call reused a stale timestamp.
    """
    if now is None:
        now = datetime.datetime.now()
    fm = filemanager.FileManager()
    # Move the templates aside so generated output does not clobber them.
    for tpl in (UPCOMING_EVENTS_TEMPLATE, WHATS_NEW_TEMPLATE,
                POST_TEMPLATE, EVENT_TEMPLATE):
        fm.moveouttheway(tpl)
    new_cal_entries = gcal.get_calendar_entries(GCAL_URL)
    new_blog_entries = tumblr.get_blog_entries(TUMBLR_URL)
    store = datastore.load(DATASTORE_FILE)
    store.update(new_cal_entries, new_blog_entries)
    store.save()
    store.close()
    # NOTE(review): the store is still queried below after close();
    # presumably close() only releases the backing file -- confirm against
    # the datastore implementation.
    upcoming_events_tpl = template.parse(fm.read(UPCOMING_EVENTS_TEMPLATE))
    upcoming_events = store.get_upcoming_events(now)
    fm.save(UPCOMING_EVENTS_PAGE, upcoming_events_tpl.render(upcoming_events))
    whats_new_tpl = template.parse(fm.read(WHATS_NEW_TEMPLATE))
    whats_new = store.get_whats_new(now)
    fm.save(WHATS_NEW_PAGE, whats_new_tpl.render(whats_new))
    post_tpl = template.parse(fm.read(POST_TEMPLATE))
    for post in store.get_blog_posts():
        fm.save(POST_FILE % post.title, post_tpl.render(post))
    event_tpl = template.parse(fm.read(EVENT_TEMPLATE))
    for event in store.get_events():
        fm.save(EVENT_FILE % event.title, event_tpl.render(event))
    if pretend:
        fm.show_diff()
    else:
        fm.commit()
def build_graph_level3(start, stop):
    """Expand the follow graph one level from users_level2[start:stop].

    Duplicate of the formatted variant above; kept in sync.  Collects top
    friends (user -> friend edges) and top followers (follower -> user
    edges) per level-2 user, deduplicating discovered accounts, and stores
    the segment's users and edges.
    """
    if stop - start > 15:
        print("The [start, stop) range must not greater than 15, or (stop-start) <= 15")
        return
    users_l2 = list(datastore.load('khiem.twitter_users_level2'))[start:stop]
    seen_ids = set()   # set instead of list: O(1) dedup membership test
    followings = []
    users_l3 = []
    index = start
    try:
        for user in users_l2:
            print(str(index) + ": Find top friends for user " + user['name'])
            try:
                for friend in findTopFriends(user["id"]):
                    followings.append({"user_id": user["id"],
                                       "friend_id": friend.id})
                    if friend.id not in seen_ids:
                        seen_ids.add(friend.id)
                        users_l3.append(friend.AsDict())
            except Exception:
                print("Exception occurs when find top friends of user " + user['name'])
                traceback.print_exc()
            try:
                print(str(index) + ": Find top followers for user " + user['name'])
                for follower in findTopFollowers(user['id']):
                    followings.append({"user_id": follower.id,
                                       "friend_id": user['id']})
                    if follower.id not in seen_ids:
                        seen_ids.add(follower.id)
                        users_l3.append(follower.AsDict())
            except Exception:
                print("Exception occurs when find top followers of user " + user['name'])
                traceback.print_exc()
            index += 1
    except Exception:
        # Fatal error: log it and bail out WITHOUT storing a partial segment.
        traceback.print_exc()
        print(str(datetime.now()))
        return
    datastore.store(users_l3,
                    "twitter_users_level3_" + str(start) + "_" + str(stop - 1),
                    "")
    datastore.store(followings,
                    "twitter_followings_level3_" + str(start) + "_" + str(stop - 1),
                    "")
    print(str(datetime.now()) + ": Result at level 3, segment " + str(start) +
          "->" + str(stop - 1) + ": " + str(len(users_l3)) + " users, " +
          str(len(followings)) + " followings")