Example 1
class CoorCrawler:

    INFTY = 99999

    filename = "prev_loc.txt"
    googleGeo = None
    dataStore = None
    loc = None

    def __init__(self):
        f = open(self.filename, 'r')
        self.loc = ''.join(f.readlines())
        f.close()
        self.googleGeo = GoogleGeo()
        self.dataStore = MySQLDataStore()

    def get_address(self):
        #cnt = self.dataStore.select_user_count()
        #print ("count: ", cnt)
        while True:
            f = open(self.filename, 'w')
            f.write(self.loc)
            f.close()
            self.loc = self.dataStore.select_user_location_offset(self.loc)
            if not self.loc:
                print ("in not loc")
                print ("done with current locations")
                break
            tmp = self.dataStore.select_addr_location(self.loc)
            if tmp:
                print ("in address", tmp)
                continue
            res = self.googleGeo.get_coordination(self.loc)
            if res:
                (lati, lng, formatted, types) = res
            else:
                # sentinel values recorded when geocoding fails
                lati = -self.INFTY
                lng = -self.INFTY
                formatted = None
                types = "None"

            print (self.loc, lati, lng, formatted, types)
            self.dataStore.insert_address(self.loc, lati, lng, formatted, types)
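
The `GoogleGeo` class is not shown in these examples. A minimal sketch of what its `get_coordination` method might look like, assuming it wraps the Google Geocoding web service and returns the `(lat, lng, formatted_address, types)` tuple that `CoorCrawler` expects; the endpoint usage and field mapping here are illustrative, not from the original project:

# Hedged sketch (not from the original source).
import json
import urllib
import urllib2

class GoogleGeo:

    # newer versions of the Geocoding API also require a key parameter
    url = "https://maps.googleapis.com/maps/api/geocode/json?address=%s"

    def get_coordination(self, location):
        try:
            res = urllib2.urlopen(self.url % urllib.quote_plus(location))
            data = json.loads(res.read())
        except urllib2.URLError:
            return None
        if data.get('status') != 'OK' or not data.get('results'):
            return None
        top = data['results'][0]
        lat = top['geometry']['location']['lat']
        lng = top['geometry']['location']['lng']
        formatted = top['formatted_address']
        types = ','.join(top.get('types', []))
        return (lat, lng, formatted, types)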
Example 2
class TwitterFollowerCrawler:
    rateLimit = None
    mySQLDataStore = None
    userCrawler = None

    def __init__(self):
        self.rateLimit = RateLimit()
        self.mySQLDataStore = MySQLDataStore()
        self.userCrawler = TwitterUserCrawler()

    def remove_duplication(self, followerIDList):
        res = []
        for id in followerIDList:
            if not self.mySQLDataStore.check_user_by_id(id):
                res.append(id)
        print "*******************removed %d duplicate users" % (
            len(followerIDList) - len(res))
        return res

    def handle_one_followee(self, screenName):
        #get id from users table for the screenName
        id = self.mySQLDataStore.get_one_id(screenName)
        #get current offset in tmp_offset table
        curOffset = self.mySQLDataStore.select_cur_offset(id)
        #get max offset in follower_id table
        maxOffset = self.mySQLDataStore.select_max_offset(id)
        if maxOffset <= 0:
            print "User %s has not started yet!" % (screenName)
            return
        #if curOffset < maxOffset: pull data from curOffset
        print "before while"
        while curOffset < maxOffset:
            print "In while"
            curOffset += 1
            strFollowers = self.mySQLDataStore.select_follower_piece(
                id, curOffset)
            if not strFollowers:
                print "Piece %d %d is missing!" % (id, curOffset)
                return
            listFollowers = json.loads(strFollowers)
            #            listFollowers = self.remove_duplication(listFollowers)
            print("++++++++++++++", screenName, curOffset, maxOffset,
                  len(listFollowers))
            self.userCrawler.get_user_info(listFollowers, parameter='user_id')
            self.mySQLDataStore.update_cur_offset(id, curOffset)

    def handle_all_followee(self, screenNameArr):
        for screenName in screenNameArr:
            self.handle_one_followee(screenName)
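
A minimal driver for the class above might look like the following; the screen names and the entry-point arrangement are illustrative, not from the original project:

# Illustrative usage only -- the screen names are placeholders.
if __name__ == '__main__':
    crawler = TwitterFollowerCrawler()
    crawler.handle_all_followee(['some_followee', 'another_followee'])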
Example 3
def __init__(self):
    self.rateLimit = RateLimit()
    self.mySQLDataStore = MySQLDataStore()
    self.userCrawler = TwitterUserCrawler()
Example 4
def __init__(self, logName):
    self.logFile = open(logName, "w")
    self.db = MySQLDataStore()
    self.urlHandler = URLHandler()
Example 5
class Crawler:
    
    logFile = None 
    db = None

    urlGetFollowerID = "https://api.twitter.com/1/followers/ids.json?cursor=%d&screen_name=%s"  
    urlCheckLimit = "https://api.twitter.com/1/account/rate_limit_status.json"
    # for 1 user: id, screen name, name 
    urlSingleUserInfo = "https://api.twitter.com/1/users/show.json?screen_name=%s&include_entities=true" 
    # up to 100 users: returns a list, data[0]['name'] include_entities = true?
    urlUserInfo = "https://api.twitter.com/1/users/lookup.json?include_entities=true&screen_name=%s"

    urlHandler = None

    def __init__(self, logName):
        self.logFile = open(logName,"w")
        self.db = MySQLDataStore()
        self.urlHandler = URLHandler()
    
    """
    def open_url_followerID(self,url,screenName):
        count = 1
        while (count):
            if (count == 10):
                self.logFile.write("URL exceptions occur in %s: %s\n"%(screenName,url))
                return None                 
            try: 
                res = urllib2.urlopen(url)
                return res
            except urllib2.HTTPError, e:
                self.logFile.write(str(e.strerror, e.message))
                count = count + 1
                time.sleep(5)
            except urllib2.URLError, e:
                self.logFile.write(e.reason)
                #self.logFile.write(e.strerror)
                count = count + 1
                time.sleep(5)
    """        
 
    def check_limit(self):
        url = self.urlCheckLimit
        res = self.urlHandler.open_url(url)
        data = json.loads(res.read())
        limit = data['remaining_hits']
        wakeup = data['reset_time_in_seconds']
        return (limit,wakeup)

    """
    def get_user_info(self,follower_sname_list):
        #construct sname-list seperated by ,
        url = self.urlUserInfo
        #check rate limit
        res = self.open_url(url)
        return json.loads(res.read())

    
    def get_follower_location(self,follower_sname_list):
        locations = []
        data = self.get_user_info(follower_sname_list)
        for i in range(len(follower_sname_list)):
            locations.append(data[i]['location'])
        return locations         
    """

    def create_file(self,screenName,i):
        if not os.path.isdir("./"+screenName+"/"):
            os.mkdir("./"+screenName+"/")
        outputFile = open("./%s/followerID%d.txt"%(screenName,i),"w")
        return outputFile

    def get_screen_name(self,in_filename,out_filename):
        inputFile = open(in_filename,"r")
        outputFile = open(out_filename,"w")
        for line in inputFile:
            name = re.split(r'[()]',line)
            outputFile.write(name[1]+'\n')
        inputFile.close()
        outputFile.close()

    def get_follower_id(self, screenName, userID, offset, cursor):
        screenName = screenName.split('\n')[0] #works for sample.txt

        while cursor != 0:
            offset += 1
            (limit, wakeup) = self.check_limit()
            while limit == 0:
                # wait until the rate limit window resets, guarding against
                # a reset time that is already in the past
                interval = max(0, wakeup - time.time())
                time.sleep(interval)
                time.sleep(30)
                (limit, wakeup) = self.check_limit()

            (pCursor, nCursor, ids) = self.get_one_page_id(screenName, cursor)
            print (screenName, userID, offset, pCursor, nCursor)

            if ids == 0 and pCursor == 0 and nCursor == 0:
                return
            self.db.store_follower_piece(userID, offset, pCursor, nCursor, ids)
            cursor = nCursor


    def get_one_page_id(self, screenName, cursor):
        print ("Screen Name", screenName, "cursor", cursor)
        url = self.urlGetFollowerID % (cursor, screenName)
        print url
        res = self.urlHandler.open_url(url)
        if res is None:
            print "Fatal Error: follower id page returned None!!!"
            self.logFile.write("Fatal Error in requesting %s: %s\n" % (screenName, url))
            return (0, 0, 0)
        strData = res.read()
        data = json.loads(strData)
        if 'errors' in data.keys():
            print "Fatal Error: follower id page returned an error!!!"
            self.logFile.write("Fatal Error in requesting %s: %s\n" % (screenName, url))
            return (0, 0, 0)
        ids = data['ids']
        # the cursor is int64, I have used big int in the follower_id table -- Shen Li
        nCursor = data['next_cursor']
        pCursor = data['previous_cursor']
        return (pCursor, nCursor, ids)

    def get_all_follower_id(self,filename):
        inputFile = open(filename,"r")
        for line in inputFile:
            screenName = line.split('\n')[0]
            userID = self.db.get_one_id(screenName)
            if not userID:
                continue
            (offset, cursor) = self.db.get_next_cursor(userID)
            self.get_follower_id(screenName, userID, offset, cursor) 
        inputFile.close()
                    

    def clean_up(self):
        self.logFile.close()
        self.db.close()
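
`URLHandler` is not included in these examples. Based on the retry logic in the commented-out `open_url_followerID` above, a sketch of what its `open_url` method might do; the retry count and sleep interval are assumptions:

# Hedged sketch of a URLHandler compatible with the open_url calls above.
import time
import urllib2

class URLHandler:

    def open_url(self, url, retries=10):
        # retry transient HTTP/URL errors a bounded number of times,
        # returning None when the request keeps failing
        for attempt in range(retries):
            try:
                return urllib2.urlopen(url)
            except urllib2.HTTPError as e:
                print ("HTTPError %d on %s" % (e.code, url))
            except urllib2.URLError as e:
                print ("URLError %s on %s" % (e.reason, url))
            time.sleep(5)
        return None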
Example 6
def __init__(self):
    self.dataStore = MySQLDataStore()
    self.rateLimit = RateLimit()
    self.urlHandler = URLHandler()
Example 7
class TwitterUserCrawler:

    parameters = {'user_id':'user_id', 'screen_name':'screen_name'}
    urlUserLookup = "https://api.twitter.com/1/users/lookup.json?%s=%s"    
    dataStore = None
    rateLimit = None
    urlHandler = None

    def __init__(self):
        self.dataStore = MySQLDataStore()
        self.rateLimit = RateLimit()
        self.urlHandler = URLHandler()


    def get_user_info(self, screenNameArr, parameter='screen_name'):
        print ("get_user_info: ", parameter)
        curList = []
        cnt = 0
        for name in screenNameArr:
            # when crawling by user_id, skip users that are already stored
            if parameter == 'user_id' and self.dataStore.check_user_by_id(name):
                cnt += 1
                continue
            curList.append(name)
            if len(curList) >= 100:
                res = self.get_100_user_info(curList, parameter)
                if res:
                    self.store_users(res)
                else:
                    # log the batch that failed so it can be retried later
                    f = open("log/%f" % (time.time()), "w")
                    f.write(str(curList))
                    f.write("\n")
                    f.close()
                curList = []
        # handle the final batch of fewer than 100 users
        if curList:
            res = self.get_100_user_info(curList, parameter)
            if res:
                self.store_users(res)
        print ("removed", cnt, "users")

        """
        while next < len(screenNameArr):
            res = self.get_100_user_info(screenNameArr[cur:next], parameter)
            if res:
                self.store_users(res)
            else:
                f = open("log/%f"%(time.time()), "w")
                f.write(str(screenNameArr[cur:next]))
                f.write("\n")
                f.close()
            cur = next
            next += 100

        if cur < len(screenNameArr):
            res = self.get_100_user_info(screenNameArr[cur:len(screenNameArr)], parameter)
            if res:
                self.store_users(res)
	    """
        

    def store_users(self, dictData):
        for screenName in dictData.keys():
            id = dictData[screenName]['id']
            loc = dictData[screenName]['location']
            followerNum = dictData[screenName]['followerNum']
            followeeNum = dictData[screenName]['followeeNum']
            statusNum = dictData[screenName]['statusNum']
            favorNum = dictData[screenName]['favorNum']
            createdAt = dictData[screenName]['createdAt']
            verified = dictData[screenName]['verified']
            #self.dataStore.store_user(id, screenName, folNum, loc)
            self.dataStore.store_user(id, screenName, followerNum, followeeNum, statusNum, favorNum, verified, createdAt, loc)

    def dump_resp(self, url):
        retry = True
        while retry:
            try:
                retry = False
                rawData = self.urlHandler.open_url(url)
                if not rawData:
                    return
                data = json.loads(rawData.read())
                return data
            except ValueError, e:
                print ("ValueError: ",  e.message)
                retry = True
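
`get_100_user_info` is called above but not shown. Given `urlUserLookup`, `dump_resp`, and the fields that `store_users` reads, a hedged sketch of how it might turn a Twitter `users/lookup` response into that dictionary; the dict keys follow `store_users`, the Twitter field names are standard API v1 user-object fields, and the method body itself is an assumption:

    # Hedged sketch (not from the original source): build the dict that
    # store_users expects from a users/lookup response.
    def get_100_user_info(self, nameList, parameter='screen_name'):
        url = self.urlUserLookup % (self.parameters[parameter],
                                    ','.join(str(n) for n in nameList))
        data = self.dump_resp(url)
        if not data:
            return None
        res = {}
        for user in data:
            res[user['screen_name']] = {
                'id': user['id'],
                'location': user['location'],
                'followerNum': user['followers_count'],
                'followeeNum': user['friends_count'],
                'statusNum': user['statuses_count'],
                'favorNum': user['favourites_count'],
                'createdAt': user['created_at'],
                'verified': user['verified'],
            }
        return res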
Example 8
def __init__(self):
    f = open(self.filename, 'r')
    self.loc = ''.join(f.readlines())
    f.close()
    self.googleGeo = GoogleGeo()
    self.dataStore = MySQLDataStore()