class TwitterFollowerCrawler: rateLimit = None mySQLDataStore = None userCrawler = None def __init__(self): self.rateLimit = RateLimit() self.mySQLDataStore = MySQLDataStore() self.userCrawler = TwitterUserCrawler() def remove_duplication(self, followerIDList): res = [] for id in followerIDList: if not self.mySQLDataStore.check_user_by_id(id): res.append(id) print "*******************removed %d duplicate users" % ( len(followerIDList) - len(res)) return res def handle_one_followee(self, screenName): #get id from users table for the screenName id = self.mySQLDataStore.get_one_id(screenName) #get current offset in tmp_offset table curOffset = self.mySQLDataStore.select_cur_offset(id) #get max offset in follower_id table maxOffset = self.mySQLDataStore.select_max_offset(id) if maxOffset <= 0: print "User %s has not started yet!" % (screenName) return #if curOffset < maxOffset: pull data from curOffset print "before while" while curOffset < maxOffset: print "In while" curOffset += 1 strFollowers = self.mySQLDataStore.select_follower_piece( id, curOffset) if not strFollowers: print "Piece %d %d is missing!" % (id, curOffset) return listFollowers = json.loads(strFollowers) # listFollowers = self.remove_duplication(listFollowers) print("++++++++++++++", screenName, curOffset, maxOffset, len(listFollowers)) self.userCrawler.get_user_info(listFollowers, parameter='user_id') self.mySQLDataStore.update_cur_offset(id, curOffset) def handle_all_followee(self, screenNameArr): for screenName in screenNameArr: self.handle_one_followee(screenName)
class TwitterUserCrawler:
    """Fetches Twitter user profiles via the users/lookup REST endpoint."""

    # accepted values for get_user_info's `parameter` argument
    parameters = {'user_id': 'user_id', 'screen_name': 'screen_name'}
    # lookup endpoint; formatted with (parameter name, id/name list)
    urlUserLookup = "https://api.twitter.com/1/users/lookup.json?%s=%s"

    def __init__(self):
        self.dataStore = MySQLDataStore()
        self.rateLimit = RateLimit()
        self.urlHandler = URLHandler()

    def get_user_info(self, screenNameArr, parameter='screen_name'):
        """Fetch and store profiles for the given users, 100 per API call.

        screenNameArr -- list of user ids or screen names
        parameter     -- 'user_id' or 'screen_name'; in user_id mode, ids
                         already present in MySQL are skipped as duplicates
        """
        print ("get_user_info: ", parameter)
        curList = []
        cnt = 0  # users skipped because they are already stored
        for name in screenNameArr:
            # BUG FIX: the original only appended in user_id mode, so a
            # screen_name call fetched nothing; dedup applies to ids only.
            if parameter != 'user_id' or not self.dataStore.check_user_by_id(name):
                curList.append(name)
            else:
                cnt += 1
            if len(curList) >= 100:
                self._process_batch(curList, parameter)
                curList = []
        # BUG FIX: the original silently dropped the trailing partial batch
        # (users beyond the last full group of 100 were never fetched).
        if curList:
            self._process_batch(curList, parameter)
        print ("removed", cnt, "users")

    def _process_batch(self, batch, parameter):
        """Fetch one batch (<= 100 users); on failure log the batch to disk."""
        res = self.get_100_user_info(batch, parameter)
        if res:
            self.store_users(res)
        else:
            # BUG FIX: the original logged screenNameArr[0:100] (cur/next
            # were never advanced) instead of the batch that failed.
            with open("log/%f" % (time.time()), "w") as f:
                f.write(str(batch))
                f.write("\n")

    def store_users(self, dictData):
        """Persist each fetched profile (keyed by screen name) to MySQL."""
        for screenName in dictData.keys():
            user = dictData[screenName]
            self.dataStore.store_user(user['id'], screenName,
                                      user['followerNum'], user['followeeNum'],
                                      user['statusNum'], user['favorNum'],
                                      user['verified'], user['createdAt'],
                                      user['location'])

    def dump_resp(self, url):
        """GET `url` and return the parsed JSON body, or None when the
        handler yields no data.

        NOTE(review): retries indefinitely while the body is not valid
        JSON — presumably to ride out truncated API responses; there is
        no retry cap, matching the original behavior.
        """
        while True:
            rawData = self.urlHandler.open_url(url)
            if not rawData:
                return
            try:
                return json.loads(rawData.read())
            except ValueError as e:
                print ("ValueError: ", e.message)