# Example 1
def collect_followers(version, app, dump_path):

	CONSUMER_KEY = app['c_key']
	CONSUMER_SECRET = app['c_sec']
	ACCESS_KEY = app['a_key']
	ACCESS_SECRET = app['a_sec']
	consumer = oauth.Consumer(key=CONSUMER_KEY, secret=CONSUMER_SECRET)
	access_token = oauth.Token(key=ACCESS_KEY, secret=ACCESS_SECRET)
	client = oauth.Client(consumer, access_token)

	raw_tweet = dump_path[0]
	followers_dump = dump_path[1]
	author_dump = dump_path[2]
	log_details = dump_path[4]
	
	fraw = open(raw_tweet,"r")
	flog = open(log_details,"a")
	fdump = open(followers_dump,"a")
	fauth = open(author_dump,"a")
	# ----------------------------------
	# Skipping the lines to start from right place.
	start_from = find_start(followers_dump, raw_tweet)
	line=""
	for i in range(start_from):
		line = fraw.readline()
	# ----------------------------------
	
	starttime = datetime.now() 
	endtime =""
	endtweet=""

	print "starting from the line number :",start_from
	count =1;limit =0;ret =0
	
	line = fraw.readline()
	while line:
		count+=1 
		tweet = json.loads(line)
		uid = twitter.get_uid(tweet,"yahoo")
		tid = twitter.get_tweetid(tweet,"yahoo")
		endtweet = tid
		author_details = get_author_details(tweet,"yahoo", author_dump)
		
		
		#GETTING FOLLOWERS FROM TWITTER by user_id
		entry = twitter.get_followers(uid,0,version,client)
		
		
		
		if(version==1):
			limit = int(entry['response']['x-ratelimit-remaining'])
		else:
			limit = int(entry['response']['x-rate-limit-remaining'])

		sys.stdout.write("\rlimit x-ratelimit-remaining: %d The request number : %d" %(limit,count))
		sys.stdout.flush()

		if(limit<3):
			endtime = datetime.now()
			ret =1
			print "limit reached\n"
			break

		# Dumping followers ids and author_details
		entry["tweet_id"] = tid
		fdump.write(json.dumps(entry)+"\n")	
		fauth.write(json.dumps(author_details)+"\n")
		
		#new line 
		line = fraw.readline()

	
	fraw.close()
	fdump.close()
	fauth.close()

	if(limit >=3):
		ret =2
		endtime = datetime.now()
	flog.write(raw_tweet+"\t"+str(starttime)+"\t"+str(endtime)+"\t"+str(endtweet)+"\t"+str(ret)+"\t"+str(count)+"\n")
	flog.close()
	return [ret, limit]
def get_authors(paths):
	
	dir_list = os.listdir(paths['sampled_tweets'])
	SIZE = 100
	count =0
	# print dir_list
	for d in dir_list:
		
		# The source of tweets
		old_dir = paths["sampled_tweets"]+"/"+d
		file_list = os.listdir(old_dir)
		print "Reading from dir ",old_dir
		
		#New dir for dumping authors
		new_dir = paths["graph"]+"/"+d
		os.mkdir(new_dir)
		print "Created new dir ",new_dir

		# New Authors dump file
		fauth   = open(new_dir+"/authors.txt","w")

		for sample in file_list:
			print "    file ",sample
			fsample = open(old_dir+"/"+sample,"r")
			line = fsample.readline()
			print "TAKING AUTHORS FROM", sample	
			c = 0
			while line:
				tweet = json.loads(line)
				tid = twitter.get_tweetid(tweet,"yahoo")
				uid = twitter.get_uid(tweet,"yahoo")
				fauth.write(json.dumps(get_author_details(tweet,"yahoo"))+"\n")
				count+=1
				c+=1
				if(c==SIZE):
					break


				# # IF THE AUTHORS ALREADY IN DATABASE THEN IGNORE
				# if(users_db.has_key(uid)):
				# 	line = fsample.readline()
				# 	continue
				# else:
				# 	# Insert in db(both file and Dict) 
				# 	f_db.write(str(uid)+"\n")
				# 	users_db[uid] = 1
				# 	# and also in the authors.txt 
				# 	fauth.write(json.dumps(get_author_details(tweet,"yahoo"))+"\n")
					
				
				line = fsample.readline()
		
		
			fsample.close()
		fauth.close()
			

	
	# return
	# # GLOBAL DATABSAE OF ALL THE UNIQUE USERS
	# users_db = {}
	# print "loading users_db"
	# users_db = load_users_db(paths['users_db'])
	# count =0
	
	return count