Example #1
0
 def _init(self):
     """Actually initialize the mysql connection.

     Connects with the host, username, password and database stored on the
     instance (utf8 charset) and creates a cursor on the new connection.

     Returns:
         True if the connection and the cursor were created.
         False if mdb raised an error; the error is logged.
     """
     try:
         self.mdbConnection = mdb.connect(
             self.host, self.username, self.password, self.database,
             charset='utf8')
         self.cursor = self.mdbConnection.cursor()
     except mdb.Error as e:
         log.error("Can not establish connection to mysql: " + str(e))
         return False
     return True
Example #2
0
def recursiveProfileTest(
    username, password, testInterval, totalCount, startList):
    """Drive recursiveTestGenerator and report every profile it yields.

    For each yielded (id, info, errorCode) whose errorCode is falsy, the
    profile url is logged, the page html is saved through
    util.saveTestPage, the local path is logged and the info is printed.

    The loop stops on the first exception. That includes the StopIteration
    raised when the generator is exhausted, so a normal end is reported
    through the same error log line.
    """
    profiles = recursiveTestGenerator(
        username, password, testInterval, totalCount, startList)
    while True:
        try:
            userId, userInfo, errorCode = next(profiles)
            if errorCode:
                # Nothing to report for a failed profile; fetch the next one.
                continue
            log.info('Profile url: ' + RenrenAgent.getProfileUrl(userId))
            localPath = util.saveTestPage(userInfo.html, userId)
            log.info('Profile local path: file://' + localPath)
            printInfo(userInfo)
        except Exception as e:
            log.error('Error happen or end: ' + str(e))
            break
Example #3
0
 def getGlobalInfo(self):
     """Get the global information.

     Reads the `info` column of the GlobalNameInfo row with id 1 and parses
     it with GlobalNameInfo.FromString. The AnalysedDataBase lock is held
     for the duration of the call and is always released on exit.

     Returns:
         The parsed global info, or None if the row is missing or if the
         query or the parsing raised (the connection is rolled back then).
     """
     AnalysedDataBase._acquireLock()
     try:
         # Pinging happens after the lock is taken, so it has to sit inside
         # the try/finally or a failing ping would leave the lock held.
         self.pingServer()
         try:
             command = """SELECT info
                 FROM GlobalNameInfo
                 WHERE id = 1;
             """
             self.cursor.execute(command)
             rows = self.cursor.fetchall()
             if not rows:
                 # Stop here instead of indexing an empty result, which
                 # would surface as an unrelated IndexError warning.
                 log.error('Read gloabal information fail!')
                 return None
             return GlobalNameInfo.FromString(rows[0][0])
         except Exception as e:
             log.warning("Get global info failed! " + str(e))
             self.mdbConnection.rollback()
             return None
     finally:
         AnalysedDataBase._releaseLock()
Example #4
0
    def init(self, host, username, password, database):
        """Initialize the mysql connection.

        Stores the new connection in self.mdbConnection and a cursor on it
        in self.cursor.

        Args:
            @host {string} the name of the host, e.g. 'localhost'.
            @username {string} the user name of the database account.
            @password {string} the password.
            @database {string} the name of the database.

        Returns:
            True if the action success.
            False if the action failed (the mdb error is logged).
        """
        try:
            self.mdbConnection = mdb.connect(
                host, username, password, database)
            self.cursor = self.mdbConnection.cursor()
        except mdb.Error as e:
            log.error("Can not establish connection to mysql: " + str(e))
            return False
        return True
Example #5
0
    def _getInfo(self, key, tableName):
        """Get the RawInfo mapped to the key in the given table.

        The AnalysedDataBase lock is held for the duration of the call and
        is always released on exit.

        Args:
            @key the s_key value to look up. It is handed to the cursor as
                a query parameter.
            @tableName {string} the table to read. It is interpolated into
                the SQL text directly, so it must be a trusted identifier.

        Returns: {RawNameItemInfo} the raw info parsed from the first
            matching row, or None if no row matches.
        """
        AnalysedDataBase._acquireLock()
        try:
            # Ping inside the try so the lock is released even if it raises.
            self.pingServer()
            # '%%s' leaves a literal '%s' placeholder for the driver to fill.
            command = "SELECT info FROM %s WHERE s_key = %%s;" % tableName
            self.cursor.execute(command, [key])
            rows = self.cursor.fetchall()
            if not rows:
                return None
            if len(rows) > 1:
                log.error("Mutiple result for key: " + key)
            return RawNameItemInfo.FromString(rows[0][0])
        finally:
            AnalysedDataBase._releaseLock()
Example #6
0
def getProfileTest(agent, id, filePath=''):
    """Run one get-profile test and print the parsed info.

    Args:
        @agent the agent used to parse or fetch the profile; it must provide
            parseProfileHtml, getProfile and getProfileUrl.
        @id {string} the user id. It is used for the online request and as
            the name under which the fetched page is saved.
        @filePath {string} path of a local html file. When given, the file
            is parsed instead of requesting the profile online.

    Nothing is printed when parsing or fetching reports an error, or when
    the online profile carries no html.
    """
    if filePath:
        log.info('================= Get Profile test (Local Html) ======' +\
            '=======================')
        log.info('Local Profile path: file://' + filePath)
        # Close the file deterministically instead of leaking the handle.
        with open(filePath) as htmlFile:
            html = htmlFile.read()
        info, errorCode = agent.parseProfileHtml(html)
        if errorCode:
            log.error('Error happen in parse local html, path: ' + filePath)
            return
    else:
        log.info('================= Get Profile test (Online Html) =====' +\
            '=======================')
        log.info('Profile url: ' + agent.getProfileUrl(id))
        info, errorCode = agent.getProfile(id)
        if errorCode:
            log.error('Error happen in get profile, id: ' + id)
            return
        if not info.html:
            log.warning('No html')
            return
        path = util.saveTestPage(info.html, id)
        log.info('Online Profile path: file://'+path)
    printInfo(info)
Example #7
0
def recursiveTestGenerator(
    username, password, testInterval, totalCount, startList):
    """A recursive test generator.

    Starting from a list of user ids, fetches the profile of each id, then
    of the ids found in its friend list and recent-visited list, and so on
    in first-in-first-out order. Every successfully fetched profile is
    yielded as (id, UserInfo, ErrorCode).

    A failed login is only logged; the crawl is attempted anyway. A failed
    profile request is logged and skipped without counting or sleeping.

    Args:
        @username {string} the user name of the agent.
        @password {string} the password of the agent.
        @testInterval {float} the time slept after each yielded profile.
        @totalCount {integer} total number of profile to get.
        @startList {List} a list of user id to start test.
    """
    agent = RenrenAgent(username, password)
    loginInfo, loginError = agent.login()
    if loginError:
        log.error('Login error(username, password): ' +\
                username + ', ' + password)
    else:
        log.info(loginInfo['name'])
        log.info(loginInfo['href'])

    count = 1
    # Each pending entry is (user id, id of the profile that referred to it).
    pending = [(startId, None) for startId in startList]
    while pending:
        # Take the oldest pending entry.
        userId, referId = pending.pop(0)
        log.info('processing(' + str(count) + '): ' + userId)
        info, errorCode = agent.getProfile(userId)

        if errorCode:
            if referId:
                log.warning('Error happen when getProfile. Refer id: ' +\
                            str(referId) + '. Refer page url: ' +\
                            agent.getProfileUrl(str(referId)))
            else:
                log.warning('Error happen when getProfile, no refer id.')
            continue

        yield (userId, info, errorCode)

        # Only queue more ids while the queue is shorter than the number of
        # profiles still wanted.
        if len(pending) < totalCount - count:
            for neighbours in (info.friendList, info.recentVisitedList):
                if neighbours:
                    pending.extend(
                        (neighbourId, userId) for neighbourId in neighbours)

        count += 1
        if count > totalCount:
            return
        time.sleep(testInterval)
Example #8
0
    def run(self):
        log.info('>>>>>>  Thread %s start.  <<<<<<' % self.threadId)
        crawler = Crawler(self.dataBase)
        dataBase = self.dataBase
        agent = None
        account = None
        startNode = None
        startNodeRowId = None
        try:
            while True:
                # Prepare for agent, account and startnode.
                if not startNode:
                    startNode, startNodeRowId = dataBase.getStartNode()
                    log.info('Thread %s, startnode: %s, %s' %\
                        (self.threadId, startNode, startNodeRowId))
                    if not startNode or not startNodeRowId:
                        # No avaliable start node, exit crawling.
                        log.error(
                            'No start node for thread %s, exit crawling.' %\
                            (self.threadId, ))
                        break
                if not agent or not account:
                    agent,account = self.getAgentWithAccount()
                    if not agent or not account:
                        # No avaliable account, exit crawling.
                        log.warning(
                            'No avaliable agent for thread %s, exit crawling.' %\
                            (self.threadId, ))
                        break

                # One crawling process.
                crawler.setAgent(agent)
                try:
                    crawler.crawl(startNode)
                except CrawlerException, e:
                    log.info('Thread %s gets exception: %s' %\
                        (self.threadId, str(e)))
                    if e.errorCode == CrawlerErrorCode.DETECT_STOP_SIGNAL:
                        log.info("Thread " + str(self.threadId) +\
                            " stop crawling because of stop signal.")
                        break
                    if e.errorCode ==\
                        CrawlerErrorCode.GET_EXPANDING_NODE_FAILED or\
                        e.errorCode == CrawlerErrorCode.EXPAND_EXPANDED_NODE or\
                        e.errorCode == CrawlerErrorCode.NO_NODE_TO_EXPAND:
                        # Start node's bad.
                        log.warning('Thread %s, bad start node: %s, %s' %\
                            (self.threadId, startNode, startNodeRowId))
                        dataBase.deleteFromStartList(startNode)
                        startNode = startNodeRowId = None
                    if e.errorCode == CrawlerErrorCode.REQUEST_FAILED:
                        # Still start node's bad.
                        # TODO: Implement invalid usernode test support in
                        # database to change it.
                        log.warning('Thread %s, bad start node: %s, %s' %\
                            (self.threadId, startNode, startNodeRowId))
                        dataBase.deleteFromStartList(startNode)
                        startNode = startNodeRowId = None
                    if e.errorCode == CrawlerErrorCode.REACH_REQUEST_LIMIT:
                        # Use a new accout
                        account.finishUsing()
                        account = agent = None
                finally:
                    # The start node change every time crawler.epand() called.
                    # So the start node can not be reused when exception happen.
                    # We need to release it and use a new one.
                    if startNodeRowId:
                        dataBase.releaseStartNode(startNodeRowId)
                        startNode = startNodeRowId = None
Example #9
0
                            (self.threadId, startNode, startNodeRowId))
                        dataBase.deleteFromStartList(startNode)
                        startNode = startNodeRowId = None
                    if e.errorCode == CrawlerErrorCode.REACH_REQUEST_LIMIT:
                        # Use a new accout
                        account.finishUsing()
                        account = agent = None
                finally:
                    # The start node change every time crawler.epand() called.
                    # So the start node can not be reused when exception happen.
                    # We need to release it and use a new one.
                    if startNodeRowId:
                        dataBase.releaseStartNode(startNodeRowId)
                        startNode = startNodeRowId = None
        except Exception, e:
            log.error('Thread %s gets exception, exit crawling: %s' %\
                (self.threadId, str(e)))
        finally:
            # Release resource.
            if account:
                account.finishUsing()
            if startNodeRowId:
                dataBase.releaseStartNode(startNodeRowId)
        log.info('>>>>>>  Thread %s end.  <<<<<<' % self.threadId)

class MainCrawlThread(threading.Thread):

    dataBase = None
    renrenAccountPool = None

    THREAD_NUMBER = flag.getFlag('thread_number')
    ROUND_NUMBER = flag.getFlag('round_number')