class TwitterFollowerCrawler:
    """Fetch profile data for followers whose ids are already stored.

    For each followee the data store holds numbered "pieces" (JSON-encoded
    lists of follower ids) and an offset recording the last piece that was
    processed.  This class walks the unprocessed pieces and hands each one
    to a TwitterUserCrawler.

    print is always called with parentheses so the source parses under both
    Python 2 and Python 3.
    """

    # Collaborators; the real instances are created in __init__.
    # rateLimit is not used by any method of this class.
    rateLimit = None
    mySQLDataStore = None
    userCrawler = None

    def __init__(self):
        """Create the rate limiter, the data store and the user crawler."""
        self.rateLimit = RateLimit()
        self.mySQLDataStore = MySQLDataStore()
        self.userCrawler = TwitterUserCrawler()

    def remove_duplication(self, followerIDList):
        """Return the ids from followerIDList that are not stored yet.

        Args:
            followerIDList: Sequence of follower ids.

        Returns:
            A new list, in input order, of the ids for which the data
            store's check_user_by_id() is falsy.
        """
        res = [followerID for followerID in followerIDList
               if not self.mySQLDataStore.check_user_by_id(followerID)]
        print("*******************removed %d duplicate users" % (
            len(followerIDList) - len(res)))
        return res

    def handle_one_followee(self, screenName):
        """Process every not-yet-handled follower piece of one followee.

        Args:
            screenName: Screen name of the followee; it is resolved to an
                id through the data store.

        Returns:
            None.  Returns early when the followee has no stored pieces or
            when an expected piece is missing.
        """
        # get id from users table for the screenName
        userID = self.mySQLDataStore.get_one_id(screenName)
        # get current offset in tmp_offset table
        curOffset = self.mySQLDataStore.select_cur_offset(userID)
        # get max offset in follower_id table
        maxOffset = self.mySQLDataStore.select_max_offset(userID)
        # The explicit None check keeps the Python 2 outcome (None compares
        # below 0 there) instead of raising TypeError on Python 3.
        if maxOffset is None or maxOffset <= 0:
            print("User %s has not started yet!" % (screenName))
            return
        # if curOffset < maxOffset: pull data from curOffset
        print("before while")
        while curOffset < maxOffset:
            print("In while")
            curOffset += 1
            strFollowers = self.mySQLDataStore.select_follower_piece(
                userID, curOffset)
            if not strFollowers:
                print("Piece %d %d is missing!" % (userID, curOffset))
                return
            listFollowers = json.loads(strFollowers)
            print("++++++++++++++", screenName, curOffset, maxOffset,
                  len(listFollowers))
            self.userCrawler.get_user_info(listFollowers, parameter='user_id')
            # Progress is saved only after the piece was handed to the user
            # crawler, so a run that stops earlier resumes at this piece.
            self.mySQLDataStore.update_cur_offset(userID, curOffset)

    def handle_all_followee(self, screenNameArr):
        """Run handle_one_followee() for each screen name, in order."""
        for screenName in screenNameArr:
            self.handle_one_followee(screenName)
# Example #2
# 0
class TwitterUserCrawler:
    """Look up Twitter user profiles in batches and persist them.

    get_user_info() splits its input into batches, resolves each batch
    through get_100_user_info() (not defined in this part of the class)
    and writes the results to the data store via store_users().
    """

    parameters = {'user_id': 'user_id', 'screen_name': 'screen_name'}
    # Template taking the lookup parameter name and its value.
    # NOTE(review): this is the v1 REST endpoint, which Twitter appears to
    # have retired -- confirm before relying on it.
    urlUserLookup = "https://api.twitter.com/1/users/lookup.json?%s=%s"
    # Largest number of users handed to get_100_user_info() in one call.
    lookupBatchSize = 100
    # Collaborators; the real instances are created in __init__.
    dataStore = None
    rateLimit = None
    urlHandler = None

    def __init__(self):
        """Create the data store, the rate limiter and the URL handler."""
        self.dataStore = MySQLDataStore()
        self.rateLimit = RateLimit()
        self.urlHandler = URLHandler()

    def get_user_info(self, screenNameArr, parameter='screen_name'):
        """Fetch and store profile data for the given users.

        Args:
            screenNameArr: Iterable of screen names, or of user ids when
                parameter is 'user_id'.
            parameter: Kind of identifier in screenNameArr; forwarded
                unchanged to get_100_user_info().  In 'user_id' mode, ids
                the data store already knows are skipped.

        Returns:
            None.  Every batch, including the final partial one, is looked
            up; a batch whose lookup yields nothing is written to a
            timestamped file under log/.
        """
        print("get_user_info: ", parameter)
        batch = []
        skipped = 0
        for name in screenNameArr:
            # The duplicate check exists only for ids, so it must not
            # filter out screen names.
            if (parameter == 'user_id'
                    and self.dataStore.check_user_by_id(name)):
                skipped += 1
                continue
            batch.append(name)
            if len(batch) >= self.lookupBatchSize:
                self._lookup_and_store(batch, parameter)
                batch = []
        # Flush the remainder that did not fill a whole batch.
        if batch:
            self._lookup_and_store(batch, parameter)
        print("removed", skipped, "users")

    def _lookup_and_store(self, batch, parameter):
        """Resolve one batch and store it, or log the batch on failure."""
        res = self.get_100_user_info(batch, parameter)
        if res:
            self.store_users(res)
            return
        # Record exactly the identifiers that failed so they can be retried.
        with open("log/%f" % (time.time()), "w") as logFile:
            logFile.write(str(batch))
            logFile.write("\n")

    def store_users(self, dictData):
        """Persist every user record in dictData.

        Args:
            dictData: Mapping of screen name to a dict with the keys 'id',
                'location', 'followerNum', 'followeeNum', 'statusNum',
                'favorNum', 'createdAt' and 'verified'.

        Raises:
            KeyError: If a record lacks one of those keys.
        """
        for screenName in dictData:
            info = dictData[screenName]
            self.dataStore.store_user(
                info['id'], screenName, info['followerNum'],
                info['followeeNum'], info['statusNum'], info['favorNum'],
                info['verified'], info['createdAt'], info['location'])

    def dump_resp(self, url, maxRetries=None):
        """Open url and return its body decoded from JSON.

        Args:
            url: Address passed to the URL handler's open_url().
            maxRetries: How many times to retry after a ValueError (for
                example a body that is not valid JSON).  None, the default,
                retries without limit.

        Returns:
            The decoded JSON value, or None when open_url() returns a falsy
            result or the retry limit is exhausted.
        """
        failures = 0
        while True:
            try:
                rawData = self.urlHandler.open_url(url)
                if not rawData:
                    return None
                return json.loads(rawData.read())
            except ValueError as e:
                # str(e) replaces e.message, which Python 3 does not have.
                print("ValueError: ", str(e))
                failures += 1
                if maxRetries is not None and failures > maxRetries:
                    return None