# Imports inferred from usage below; oauth2 is the python-oauth2 package
# (imported here as oauth), while twitter, find_start and get_author_details
# are helpers defined elsewhere in this project.
import json
import os
import sys
from datetime import datetime

import oauth2 as oauth


def collect_followers(version, app, dump_path):
    CONSUMER_KEY = app['c_key']
    CONSUMER_SECRET = app['c_sec']
    ACCESS_KEY = app['a_key']
    ACCESS_SECRET = app['a_sec']
    consumer = oauth.Consumer(key=CONSUMER_KEY, secret=CONSUMER_SECRET)
    access_token = oauth.Token(key=ACCESS_KEY, secret=ACCESS_SECRET)
    client = oauth.Client(consumer, access_token)

    # dump_path layout: [0] raw tweets, [1] followers dump, [2] author dump,
    # [4] run log ([3] is not used by this function).
    raw_tweet = dump_path[0]
    followers_dump = dump_path[1]
    author_dump = dump_path[2]
    log_details = dump_path[4]

    fraw = open(raw_tweet, "r")
    flog = open(log_details, "a")
    fdump = open(followers_dump, "a")
    fauth = open(author_dump, "a")

    # Skip the lines that were already processed in a previous run,
    # so we resume from the right place.
    start_from = find_start(followers_dump, raw_tweet)
    for i in range(start_from):
        fraw.readline()

    starttime = datetime.now()
    endtime = ""
    endtweet = ""
    print "starting from line number:", start_from

    count = 1
    limit = 0
    ret = 0
    line = fraw.readline()
    while line:
        count += 1
        tweet = json.loads(line)
        uid = twitter.get_uid(tweet, "yahoo")
        tid = twitter.get_tweetid(tweet, "yahoo")
        endtweet = tid
        author_details = get_author_details(tweet, "yahoo", author_dump)

        # Fetch the author's followers from Twitter by user_id.
        entry = twitter.get_followers(uid, 0, version, client)

        # The rate-limit header was renamed between API v1 and v1.1.
        if version == 1:
            limit = int(entry['response']['x-ratelimit-remaining'])
        else:
            limit = int(entry['response']['x-rate-limit-remaining'])
        sys.stdout.write("\rrate limit remaining: %d, request number: %d"
                         % (limit, count))
        sys.stdout.flush()

        if limit < 3:
            endtime = datetime.now()
            ret = 1
            print "limit reached\n"
            break

        # Dump the follower ids and the author details.
        entry["tweet_id"] = tid
        fdump.write(json.dumps(entry) + "\n")
        fauth.write(json.dumps(author_details) + "\n")

        line = fsample.readline() if False else fraw.readline()  # next line

    fraw.close()
    fdump.close()
    fauth.close()

    if limit >= 3:
        # The whole file was processed without hitting the rate limit.
        ret = 2
        endtime = datetime.now()
    flog.write(raw_tweet + "\t" + str(starttime) + "\t" + str(endtime) + "\t"
               + str(endtweet) + "\t" + str(ret) + "\t" + str(count) + "\n")
    flog.close()
    return [ret, limit]
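# A minimal driver sketch for collect_followers. The dict keys and the
# dump_path indices match what the function reads above; every concrete
# path and credential value below is a hypothetical placeholder.
app = {
    'c_key': 'CONSUMER_KEY',
    'c_sec': 'CONSUMER_SECRET',
    'a_key': 'ACCESS_KEY',
    'a_sec': 'ACCESS_SECRET',
}
dump_path = [
    'data/raw_tweets.txt',   # [0] raw tweets, one JSON object per line
    'data/followers.txt',    # [1] followers dump (appended to)
    'data/authors.txt',      # [2] author details dump (appended to)
    None,                    # [3] unused by collect_followers
    'data/collect.log',      # [4] tab-separated run log
]
ret, limit = collect_followers(2, app, dump_path)
if ret == 1:
    print "stopped early, rate limit nearly exhausted (%d calls left)" % limit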
def get_authors(paths):
    dir_list = os.listdir(paths['sampled_tweets'])
    SIZE = 100  # take at most SIZE authors from each sample file
    count = 0
    for d in dir_list:
        # The source directory of sampled tweets.
        old_dir = paths["sampled_tweets"] + "/" + d
        file_list = os.listdir(old_dir)
        print "Reading from dir", old_dir

        # New directory for dumping the authors.
        new_dir = paths["graph"] + "/" + d
        os.mkdir(new_dir)
        print "Created new dir", new_dir

        # New authors dump file.
        fauth = open(new_dir + "/authors.txt", "w")
        for sample in file_list:
            print "  file", sample
            fsample = open(old_dir + "/" + sample, "r")
            line = fsample.readline()
            print "TAKING AUTHORS FROM", sample
            c = 0
            while line:
                tweet = json.loads(line)
                tid = twitter.get_tweetid(tweet, "yahoo")
                uid = twitter.get_uid(tweet, "yahoo")
                fauth.write(json.dumps(get_author_details(tweet, "yahoo")) + "\n")
                count += 1
                c += 1
                if c == SIZE:
                    break
                # IF THE AUTHOR IS ALREADY IN THE DATABASE THEN IGNORE
                # if users_db.has_key(uid):
                #     line = fsample.readline()
                #     continue
                # else:
                #     # Insert into the db (both file and dict)
                #     f_db.write(str(uid) + "\n")
                #     users_db[uid] = 1
                #     # ...and also into authors.txt
                #     fauth.write(json.dumps(get_author_details(tweet, "yahoo")) + "\n")
                line = fsample.readline()
            fsample.close()
        fauth.close()
    # GLOBAL DATABASE OF ALL THE UNIQUE USERS
    # users_db = {}
    # print "loading users_db"
    # users_db = load_users_db(paths['users_db'])
    # count = 0
    return count
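# A minimal driver sketch for get_authors. The two dict keys are the ones
# the function reads above; the directory names are hypothetical.
paths = {
    'sampled_tweets': 'data/sampled_tweets',  # one subdir per batch of sample files
    'graph': 'data/graph',                    # mirrored subdirs, each gets an authors.txt
}
n = get_authors(paths)
print "dumped %d author records" % n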