Example #1
    def onStatusUpdate(self, cmd):
        cmd.updateTime = time.time()
        # append the command's status to the rendernode's history
        if isFinalStatus(cmd.status):
            # only if we don't already have a command for this task
            if hasattr(cmd.renderNode, 'tasksHistory') and cmd.task.id not in cmd.renderNode.tasksHistory:
                cmd.renderNode.tasksHistory.append(cmd.task.id)
                cmd.renderNode.history.append(cmd.status)
        if cmd.status is CMD_DONE:
            # TOFIX: handle CANCEL status and update end time when cancelling a
            # job so that it can be properly cleaned
            cmd.endTime = cmd.updateTime
            cmd.computeAvgTimeByFrame()
            cmd.attempt += 1

        # autoretry
        elif cmd.status is CMD_ERROR:
            cmd.attempt += 1

            if cmd.attempt < cmd.task.maxAttempt:
                LOGGER.debug(
                    "Mark command %d for auto retry in %ds  (%d/%d)" %
                    (cmd.id,
                     singletonconfig.get('CORE', 'DELAY_BEFORE_AUTORETRY'),
                     cmd.attempt, cmd.task.maxAttempt))
                t = Timer(
                    singletonconfig.get('CORE', 'DELAY_BEFORE_AUTORETRY'),
                    self.autoretry, [cmd])
                t.start()

        elif cmd.status is CMD_ASSIGNED:
            cmd.startTime = cmd.updateTime
        elif cmd.status < CMD_ASSIGNED:
            cmd.startTime = None
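The autoretry branch above does not block the dispatcher: it schedules a delayed re-run of the command through threading.Timer. Below is a minimal, self-contained sketch of that pattern; the delay, the attempt limit and the command dict are illustrative stand-ins for the project's singletonconfig values and Command objects.

import threading

DELAY_BEFORE_AUTORETRY = 5  # seconds; stand-in for singletonconfig.get('CORE', 'DELAY_BEFORE_AUTORETRY')
MAX_ATTEMPT = 3             # stand-in for cmd.task.maxAttempt

def autoretry(cmd):
    # In the dispatcher this would reset the command so it gets assigned again.
    print("retrying command %d (attempt %d/%d)" % (cmd["id"], cmd["attempt"], MAX_ATTEMPT))

def on_error(cmd):
    cmd["attempt"] += 1
    if cmd["attempt"] < MAX_ATTEMPT:
        # Fire-and-forget timer: autoretry(cmd) runs on a separate thread after the delay.
        t = threading.Timer(DELAY_BEFORE_AUTORETRY, autoretry, [cmd])
        t.start()

on_error({"id": 42, "attempt": 0})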
Example #2
    def onStatusUpdate(self, cmd):
        cmd.updateTime = time.time()
        # append the command's status to the rendernode's history
        if isFinalStatus(cmd.status):
            # only if we don't already have a command for this task
            if hasattr(cmd.renderNode, 'tasksHistory') and cmd.task.id not in cmd.renderNode.tasksHistory:
                cmd.renderNode.tasksHistory.append(cmd.task.id)
                cmd.renderNode.history.append(cmd.status)
        if cmd.status is CMD_DONE:
            # TOFIX: handle CANCEL status and update end time when cancelling a
            # job so that it can be properly cleaned
            cmd.endTime = cmd.updateTime
            cmd.computeAvgTimeByFrame()
            cmd.attempt += 1

        # autoretry
        elif cmd.status is CMD_ERROR:
            cmd.attempt += 1

            LOGGER.debug("Mark command %d for auto retry in %ds  (%d/%d)" % (cmd.id, singletonconfig.get('CORE', 'DELAY_BEFORE_AUTORETRY'), cmd.attempt, cmd.task.maxAttempt))
            if cmd.attempt < cmd.task.maxAttempt:
                t = Timer(singletonconfig.get('CORE', 'DELAY_BEFORE_AUTORETRY'), self.autoretry, [cmd])
                t.start()

        elif cmd.status is CMD_ASSIGNED:
            cmd.startTime = cmd.updateTime
        elif cmd.status < CMD_ASSIGNED:
            cmd.startTime = None
Example #3
def setup_logging(options):
    if not os.path.exists(settings.LOGDIR):
        os.makedirs(settings.LOGDIR, 0755)

    mainLog = os.path.join(settings.LOGDIR, "dispatcher.log")
    assignLog = os.path.join(settings.LOGDIR, "assign.log")

    fileHandler = logging.handlers.RotatingFileHandler(
        mainLog,
        maxBytes=singletonconfig.get('CORE', 'LOG_SIZE'),
        backupCount=singletonconfig.get('CORE', 'LOG_BACKUPS'),
        encoding="UTF-8")

    assignHandler = logging.handlers.RotatingFileHandler(
        assignLog,
        maxBytes=singletonconfig.get('CORE', 'LOG_SIZE'),
        backupCount=singletonconfig.get('CORE', 'LOG_BACKUPS'),
        encoding="UTF-8")

    fileHandler.setFormatter(
        logging.Formatter("%(asctime)s %(name)10s %(levelname)s %(message)s"))
    assignHandler.setFormatter(
        logging.Formatter("%(asctime)s %(name)10s %(levelname)s %(message)s"))

    logLevel = logging.DEBUG if options.DEBUG else singletonconfig.get(
        'CORE', 'LOG_LEVEL')

    # Must be set, otherwise it will receive the statsLog data, but not higher than DEBUG otherwise we might lose some info if reconfigured with a higher level
    fileHandler.setLevel(logging.DEBUG)

    # Create main logger
    logging.getLogger().addHandler(fileHandler)
    logging.getLogger().setLevel(logLevel)

    # Create a specific logger for assignment information (force level to INFO)
    logging.getLogger('assign').addHandler(assignHandler)
    logging.getLogger('assign').setLevel(logging.DEBUG)
    logging.getLogger('assign').propagate = False  # cut event to avoid getting this to the root log

    if options.CONSOLE and not options.DAEMONIZE:
        consoleHandler = logging.StreamHandler()
        consoleHandler.setFormatter(
            logging.Formatter(
                "%(asctime)s %(name)10s %(levelname)6s %(message)s",
                '%Y-%m-%d %H:%M:%S'))
        consoleHandler.setLevel(logLevel)
        logging.getLogger().addHandler(consoleHandler)

    logging.getLogger('main.dispatcher').setLevel(logLevel)
    logging.getLogger('main.webservice').setLevel(logging.ERROR)
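setup_logging gives the 'assign' logger its own rotating file and sets propagate to False so assignment records never reach the root handlers. A minimal sketch of that isolation pattern, with a throwaway file name and sizes in place of the settings/singletonconfig values:

import logging
import logging.handlers

handler = logging.handlers.RotatingFileHandler(
    "assign.log", maxBytes=1024 * 1024, backupCount=3, encoding="UTF-8")
handler.setFormatter(logging.Formatter("%(asctime)s %(name)10s %(levelname)s %(message)s"))

assignLogger = logging.getLogger("assign")
assignLogger.addHandler(handler)
assignLogger.setLevel(logging.DEBUG)
assignLogger.propagate = False  # records stop here instead of reaching the root logger

assignLogger.info("written to assign.log only")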
Example #4
def main():
    options = process_args()
    setup_logging(options)

    logging.getLogger('daemon').info( "" )
    logging.getLogger('daemon').info( "-----------------------------------------------" )
    logging.getLogger('daemon').info( "Starting PULI server on port:%d.", settings.PORT)
    logging.getLogger('daemon').info( "-----------------------------------------------" )
    logging.getLogger('daemon').info( " version = %s" % settings.VERSION )
    logging.getLogger('daemon').info( " command = %s" % " ".join(sys.argv) )
    logging.getLogger('daemon').info( "  daemon = %r" % options.DAEMONIZE )
    logging.getLogger('daemon').info( " console = %r" % options.CONSOLE )
    logging.getLogger('daemon').info( "    port = %s" % settings.PORT )

    if options.DAEMONIZE:
        daemonize(settings.RUN_AS)

    dispatcherApplication = make_dispatcher()

    periodic = tornado.ioloop.PeriodicCallback( dispatcherApplication.loop, singletonconfig.get('CORE','MASTER_UPDATE_INTERVAL') )
    periodic.start()
    try:
        tornado.ioloop.IOLoop.instance().start()
    except (KeyboardInterrupt, SystemExit):
        logging.getLogger('dispatcher').info("Exit event caught: closing dispatcher...")
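Both main() variants drive the dispatcher with tornado's PeriodicCallback, whose interval (in milliseconds) comes from CORE.MASTER_UPDATE_INTERVAL. A minimal sketch of that scheduling idiom, assuming a tornado version that still exposes IOLoop.instance(); the 5000 ms interval and the loop body are placeholders:

import tornado.ioloop

def loop():
    # One dispatcher cycle: update the tree, compute and send assignments, etc.
    print("dispatcher cycle")

periodic = tornado.ioloop.PeriodicCallback(loop, 5000)  # callback every 5000 ms
periodic.start()
try:
    tornado.ioloop.IOLoop.instance().start()
except (KeyboardInterrupt, SystemExit):
    periodic.stop()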
Example #5
def main():
    options = process_args()
    setup_logging(options)

    logging.getLogger('main').warning("")
    logging.getLogger('main').warning(
        "-----------------------------------------------")
    logging.getLogger('main').warning("Starting PULI server on port:%d.",
                                      settings.PORT)
    logging.getLogger('main').warning(
        "-----------------------------------------------")
    logging.getLogger('main').warning(" version = %s" % settings.VERSION)
    logging.getLogger('main').warning(" command = %s" % " ".join(sys.argv))
    logging.getLogger('main').warning("  daemon = %r" % options.DAEMONIZE)
    logging.getLogger('main').warning(" console = %r" % options.CONSOLE)
    logging.getLogger('main').warning("    port = %s" % settings.PORT)
    logging.getLogger('main').warning("--")

    if options.DAEMONIZE:
        logging.getLogger('main').warning(
            "make current process a daemon and redirecting stdout/stderr to logfile"
        )
        daemonize(settings.RUN_AS)

        try:
            # Redirect stdout and stderr to log file (using the first handler set in logging)
            sys.stdout = logging.getLogger().handlers[0].stream
            sys.stderr = logging.getLogger().handlers[0].stream
        except Exception:
            logging.getLogger('main').error(
                "Unexpected error occured when redirecting stdout/stderr to logfile"
            )

    logging.getLogger('main').warning("creating dispatcher main application")
    server = make_dispatcher()

    # Define a periodic callback to process DB/COMPLETION/ASSIGNMENT updates
    periodic = tornado.ioloop.PeriodicCallback(
        server.loop, singletonconfig.get('CORE', 'MASTER_UPDATE_INTERVAL'))
    periodic.start()
    try:
        logging.getLogger('main').warning("starting tornado main loop")
        tornado.ioloop.IOLoop.instance().start()
    except (KeyboardInterrupt, SystemExit):
        server.application.shutdown()

    # If restart flag is set (via /restart webservice)
    if server.application.restartService:
        logging.getLogger('main').warning("Restarting service...")

        try:
            # Restart server using a specific command
            subprocess.check_call(settings.RESTART_COMMAND.split())
        except subprocess.CalledProcessError, e:
            logging.getLogger('main').warning(
                "Impossible to restart systemd unit (error: %s)" % e)
        except AttributeError, e:
            logging.getLogger('main').warning(
                "Dispatcher settings do not define: RESTART_COMMAND")
Example #6
def setup_logging(options):
    if not os.path.exists(settings.LOGDIR):
        os.makedirs(settings.LOGDIR, 0755)

    mainLog = os.path.join(settings.LOGDIR, "dispatcher.log")
    assignLog = os.path.join(settings.LOGDIR, "assign.log")

    fileHandler = logging.handlers.RotatingFileHandler(
        mainLog,
        maxBytes=singletonconfig.get('CORE', 'LOG_SIZE'),
        backupCount=singletonconfig.get('CORE', 'LOG_BACKUPS'),
        encoding="UTF-8")

    assignHandler = logging.handlers.RotatingFileHandler(
        assignLog,
        maxBytes=singletonconfig.get('CORE', 'LOG_SIZE'),
        backupCount=singletonconfig.get('CORE', 'LOG_BACKUPS'),
        encoding="UTF-8")

    fileHandler.setFormatter(logging.Formatter("%(asctime)s %(name)10s %(levelname)s %(message)s"))
    assignHandler.setFormatter(logging.Formatter("%(asctime)s %(name)10s %(levelname)s %(message)s"))

    logLevel = logging.DEBUG if options.DEBUG else singletonconfig.get('CORE', 'LOG_LEVEL')

    # Must be set, otherwise it will receive the statsLog data, but not higher than DEBUG otherwise we might lose some info if reconfigured with a higher level
    fileHandler.setLevel(logging.DEBUG)

    # Create main logger
    logging.getLogger().addHandler(fileHandler)
    logging.getLogger().setLevel(logLevel)

    # Create a specific logger for assignment information (force level to INFO)
    logging.getLogger('assign').addHandler(assignHandler)
    logging.getLogger('assign').setLevel(logging.DEBUG)
    logging.getLogger('assign').propagate = False  # cut event to avoid getting this to the root log

    if options.CONSOLE and not options.DAEMONIZE:
        consoleHandler = logging.StreamHandler()
        consoleHandler.setFormatter(logging.Formatter("%(asctime)s %(name)10s %(levelname)6s %(message)s", '%Y-%m-%d %H:%M:%S'))
        consoleHandler.setLevel(logLevel)
        logging.getLogger().addHandler(consoleHandler)

    logging.getLogger('main.dispatcher').setLevel(logLevel)
    logging.getLogger('main.webservice').setLevel(logging.ERROR)
Example #7
    def post(self):

        if singletonconfig.get('CORE','GET_STATS'):
            singletonstats.theStats.cycleCounts['add_graphs'] += 1

        try:
            nodes = self.dispatcher.handleNewGraphRequestApply(self.getBodyAsJSON())
        except Exception, e:
            logger.exception("Graph submission failed")
            raise Http500("Failed. %s" % str(e))
Example #8
    def post(self):

        if singletonconfig.get('CORE', 'GET_STATS'):
            singletonstats.theStats.cycleCounts['add_graphs'] += 1

        try:
            nodes = self.dispatcher.handleNewGraphRequestApply(self.getBodyAsJSON())
        except Exception, e:
            logger.exception("Graph submission failed")
            raise Http500("Failed. %s" % str(e))
Example #9
    def __init__(self, id, name, coresNumber, speed, ip, port, ramSize, caracteristics=None, performance=0.0, puliversion="undefined", createDate=None):
        '''Constructs a new Rendernode.

        :parameters:
        - `name`: the name of the rendernode
        - `coresNumber`: the number of processors
        - `speed`: the speed of the processor
        '''
        self.id = int(id) if id else None
        self.name = str(name)

        self.coresNumber = int(coresNumber)
        self.ramSize = int(ramSize)
        self.licenseManager = None
        self.freeCoresNumber = int(coresNumber)
        self.usedCoresNumber = {}
        self.freeRam = int(ramSize)  # ramSize-usedRam i.e. the amount of RAM used if several commands running concurrently
        self.systemFreeRam = int(ramSize)  # the RAM available on the system (updated each ping)
        self.systemSwapPercentage = 0
        self.usedRam = {}

        self.speed = speed
        self.commands = {}
        self.status = RN_UNKNOWN
        self.responseId = None
        self.host = str(ip)
        self.port = int(port)
        self.pools = []
        self.idInformed = False
        self.isRegistered = False
        self.lastAliveTime = 0
        self.httpConnection = None
        self.caracteristics = caracteristics if caracteristics else {}
        self.currentpoolshare = None
        self.performance = float(performance)
        self.history = deque(maxlen=singletonconfig.get('CORE', 'RN_NB_ERRORS_TOLERANCE'))
        self.tasksHistory = deque(maxlen=15)
        self.excluded = False

        # Init new data
        self.puliversion = puliversion
        if createDate is None:
            self.createDate = 0
        else:
            self.createDate = createDate

        self.registerDate = time.time()

        # Flag linked to the worker flag "isPaused". Handles the case when a worker is set paused but a command is still running (finishing)
        # the RN on the dispatcher must be flag not to be assigned (i.e. in isAvailable property)
        # self.canBeAssigned = True

        if not "softs" in self.caracteristics:
            self.caracteristics["softs"] = []
Example #10
def setup_logging(options):
    if not os.path.exists(settings.LOGDIR):
        os.makedirs(settings.LOGDIR, 0755)

    logFile = os.path.join(settings.LOGDIR, "dispatcher.log")

    fileHandler = logging.handlers.RotatingFileHandler(logFile, 
                    maxBytes=singletonconfig.get('CORE','LOG_SIZE'), 
                    backupCount=singletonconfig.get('CORE','LOG_BACKUPS'), 
                    encoding="UTF-8")

    fileHandler.setFormatter(logging.Formatter("%(asctime)s %(name)s %(levelname)s %(message)s"))
    logger = logging.getLogger()

    # logLevel = logging.DEBUG if options.DEBUG else logging.WARNING
    logLevel = logging.DEBUG if options.DEBUG else singletonconfig.get('CORE','LOG_LEVEL')
    logger.setLevel(logLevel)
    fileHandler.setLevel(logging.DEBUG)  # Must be set, otherwise it will receive the statsLog data, but not higher than DEBUG otherwise we might lose some info if reconfigured with a higher level

    logger.addHandler(fileHandler)

    if options.CONSOLE and not options.DAEMONIZE:
        consoleHandler = logging.StreamHandler()
        consoleHandler.setFormatter(logging.Formatter("%(asctime)s %(name)10s %(levelname)6s %(message)s", '%Y-%m-%d %H:%M:%S'))
        consoleHandler.setLevel(logLevel)
        logger.addHandler(consoleHandler)

    #
    # Create a specific handler at DEBUG level for logging stats infos
    #
    # if singletonconfig.get('CORE','GET_STATS'):
    #     statsFile = os.path.join(settings.LOGDIR, "stats.log")
    #     statsLogger = logging.getLogger('stats')
    #     statsLogger.setLevel( logging.DEBUG )
    #     statsLogger.addHandler( logging.handlers.RotatingFileHandler( statsFile, 
    #                             maxBytes=singletonconfig.get('CORE','LOG_SIZE'), 
    #                             backupCount=singletonconfig.get('CORE','LOG_BACKUPS')) )


    logging.getLogger('dispatcher').setLevel(logLevel)
    logging.getLogger('webservice').setLevel(logging.ERROR)
Example #11
    def post(self):
        '''
        Reload the main config file (using singletonconfig) and reload
        all the main loggers.
        '''
        try:
            singletonconfig.reload()
            logLevel = singletonconfig.get('CORE', 'LOG_LEVEL')
            logging.getLogger().setLevel(logLevel)
            logging.getLogger('main').setLevel(logLevel)
            logging.getLogger("worker").setLevel(logLevel)
        except Exception, e:
            raise Http500("Error during server reconfig: %r" % e)
Example #12
    def post(self):
        '''
        Reload the main config file (using singletonconfig) and reload
        all the main loggers.
        '''
        try:
            singletonconfig.reload()
            logLevel = singletonconfig.get('CORE', 'LOG_LEVEL')
            logging.getLogger().setLevel(logLevel)
            logging.getLogger('main').setLevel(logLevel)
            logging.getLogger("worker").setLevel(logLevel)
        except Exception, e:
            raise Http500("Error during server reconfig: %r" % e)
Example #13
def main():
    options = process_args()
    setup_logging(options)

    logging.getLogger('main').warning("")
    logging.getLogger('main').warning("-----------------------------------------------")
    logging.getLogger('main').warning("Starting PULI server on port:%d.", settings.PORT)
    logging.getLogger('main').warning("-----------------------------------------------")
    logging.getLogger('main').warning(" version = %s" % settings.VERSION)
    logging.getLogger('main').warning(" command = %s" % " ".join(sys.argv))
    logging.getLogger('main').warning("  daemon = %r" % options.DAEMONIZE)
    logging.getLogger('main').warning(" console = %r" % options.CONSOLE)
    logging.getLogger('main').warning("    port = %s" % settings.PORT)
    logging.getLogger('main').warning("--")

    if options.DAEMONIZE:
        logging.getLogger('main').warning("make current process a daemon and redirecting stdout/stderr to logfile")
        daemonize(settings.RUN_AS)

        try:
            # Redirect stdout and stderr to log file (using the first handler set in logging)
            sys.stdout = logging.getLogger().handlers[0].stream
            sys.stderr = logging.getLogger().handlers[0].stream
        except Exception:
            logging.getLogger('main').error("Unexpected error occured when redirecting stdout/stderr to logfile")

    logging.getLogger('main').warning("creating dispatcher main application")
    server = make_dispatcher()

    # Define a periodic callback to process DB/COMPLETION/ASSIGNMENT updates
    periodic = tornado.ioloop.PeriodicCallback(server.loop, singletonconfig.get('CORE', 'MASTER_UPDATE_INTERVAL'))
    periodic.start()
    try:
        logging.getLogger('main').warning("starting tornado main loop")
        tornado.ioloop.IOLoop.instance().start()
    except (KeyboardInterrupt, SystemExit):
        server.application.shutdown()

    # If restart flag is set (via /restart webservice)
    if server.application.restartService:
        logging.getLogger('main').warning("Restarting service...")

        try:
            # Restart server using a specific command
            subprocess.check_call(settings.RESTART_COMMAND.split())
        except subprocess.CalledProcessError, e:
            logging.getLogger('main').warning("Impossible to restart systemd unit (error: %s)" % e)
        except AttributeError, e:
            logging.getLogger('main').warning("Dispatcher settings do not define: RESTART_COMMAND")
Example #14
    def prepare( self ):
        """
        For each request, update stats if needed
        """
        if singletonconfig.get('CORE','GET_STATS'):
            singletonstats.theStats.cycleCounts['incoming_requests'] += 1

            if self.request.method == 'GET':
                    singletonstats.theStats.cycleCounts['incoming_get'] += 1
            elif self.request.method == 'POST':
                    singletonstats.theStats.cycleCounts['incoming_post'] += 1
            elif self.request.method == 'PUT':
                    singletonstats.theStats.cycleCounts['incoming_put'] += 1
            elif self.request.method == 'DELETE':
                    singletonstats.theStats.cycleCounts['incoming_delete'] += 1
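prepare() increments a global counter plus one per-HTTP-method counter for every incoming request. The same bookkeeping can be written with a dictionary lookup instead of the if/elif chain; a small sketch with made-up counter names mirroring the ones above:

cycleCounts = {
    "incoming_requests": 0,
    "incoming_get": 0,
    "incoming_post": 0,
    "incoming_put": 0,
    "incoming_delete": 0,
}

def count_request(method):
    cycleCounts["incoming_requests"] += 1
    key = "incoming_" + method.lower()
    if key in cycleCounts:  # ignore methods we do not track
        cycleCounts[key] += 1

count_request("GET")
count_request("POST")
print(cycleCounts["incoming_requests"])  # 2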
Example #15
    def onStatusUpdate(self, cmd):
        cmd.updateTime = time.time()
        # append the command's status to the rendernode's history
        if isFinalStatus(cmd.status):
            # only if we don't already have a command for this task
            if hasattr(cmd.renderNode, 'tasksHistory') and cmd.task.id not in cmd.renderNode.tasksHistory:
                cmd.renderNode.tasksHistory.append(cmd.task.id)
                cmd.renderNode.history.append(cmd.status)
        if cmd.status is CMD_DONE:
            # TOFIX: handle CANCEL status and update end time when cancelling a
            # job so that it can be properly cleaned
            cmd.endTime = cmd.updateTime
            cmd.computeAvgTimeByFrame()
        # autoretry
        elif cmd.status is CMD_ERROR:
            if cmd.retryCount == singletonconfig.get('CORE','MAX_RETRY_CMD_COUNT'):
                cmd.retryRnList.append(cmd.renderNode.name)
            elif cmd.retryCount < singletonconfig.get('CORE','MAX_RETRY_CMD_COUNT'):
                t = Timer(singletonconfig.get('CORE','DELAY_BEFORE_AUTORETRY'), self.autoretry, [cmd])
                t.start()
        elif cmd.status is CMD_ASSIGNED:
            cmd.startTime = cmd.updateTime
        elif cmd.status < CMD_ASSIGNED:
            cmd.startTime = None
Example #16
    def prepare(self):
        """
        For each request, update stats if needed
        """
        self.startTime = time.time()
        if singletonconfig.get('CORE', 'GET_STATS'):
            singletonstats.theStats.cycleCounts['incoming_requests'] += 1

            if self.request.method == 'GET':
                    singletonstats.theStats.cycleCounts['incoming_get'] += 1
            elif self.request.method == 'POST':
                    singletonstats.theStats.cycleCounts['incoming_post'] += 1
            elif self.request.method == 'PUT':
                    singletonstats.theStats.cycleCounts['incoming_put'] += 1
            elif self.request.method == 'DELETE':
                    singletonstats.theStats.cycleCounts['incoming_delete'] += 1
Example #17
    def post(self):
        try:
            singletonconfig.reload()

            # FIXME: we are forced to change the log level of every logger in the project...
            # It should be possible to set them all in one go

            logLevel = singletonconfig.get('CORE','LOG_LEVEL')
            logging.getLogger().setLevel( logLevel )
            logging.getLogger("cmdwatcher").setLevel( logLevel )
            logging.getLogger("command").setLevel( logLevel )
            logging.getLogger("dispatcher").setLevel( logLevel )
            logging.getLogger("framework").setLevel( logLevel )
            logging.getLogger("model").setLevel( logLevel )
            logging.getLogger("poolshares").setLevel( logLevel )
            logging.getLogger("process").setLevel( logLevel )
            logging.getLogger("worker").setLevel( logLevel )
            logging.getLogger("workerws").setLevel( logLevel )

            # All the loggers of the application
            #
            # root
            # cmdwatcher
            # command
            # dispatcher
            # dispatcher.dispatchtree
            # dispatcher.webservice
            # dispatcher.webservice.editController
            # dispatcher.webservice.NodeController
            # dispatcher.webservice.PoolController
            # dispatcher.webservice.queryController
            # dispatcher.webservice.TaskController
            # framework
            # framework.application
            # framework.webservice
            # model
            # model.task
            # poolshares
            # process
            # userview
            # worker
            # worker.CmdThreader
            # workerws


        except Exception, e:
            raise Http500("Error during server reconfig: %r"%e)
Example #18
    def aggregate( self ):
        """
        | Called each cycle to store data in a buffer array
        | Once every BUFFER_SIZE cycles, the data is dumped in stats.log for later use
        """

        cycleData = [ self.cycleDate, copy( self.cycleTimers ), copy(self.cycleCounts), copy( self.assignmentTimers ) ]
        self.accumulationBuffer.append( cycleData )

        # Clean data for next cycle (only counts need to be cleaned, timers are overwritten)
        self._resetCounts()

        # Dump to file
        if singletonconfig.get('CORE','STATS_BUFFER_SIZE') <= len(self.accumulationBuffer):
            self._flush()

        return True
Example #19
    def put(self, computerName, commandId):
        '''Update command `commandId` running on rendernode `renderNodeId`.

        Returns "200 OK" on success, or "404 Bad Request" if the provided json data is not valid.
        '''

        if singletonconfig.get('CORE','GET_STATS'):
            singletonstats.theStats.cycleCounts['update_commands'] += 1

        computerName = computerName.lower()
        # try:
        #     updateDict = self.sanitizeUpdateDict(self.getBodyAsJSON())
        # except TypeError, e:
        #     return Http400(repr(e.args))
        updateDict = self.getBodyAsJSON()
        updateDict['renderNodeName'] = computerName

        try:
            self.framework.application.updateCommandApply(updateDict)
        except KeyError, e:
            return Http404(str(e))
Example #20
    def request(self, method, url, body=None, headers={}):
        """
        """
        
        # from octopus.dispatcher import settings

        LOGGER.debug("Send request to RN: http://%s:%s%s %s (%s)"%(self.host, self.port , url, method, headers))
        
        err=None
        conn = self.getHTTPConnection()

        # try to process the request at most RENDERNODE_REQUEST_MAX_RETRY_COUNT times.
        for i in xrange( singletonconfig.get('COMMUNICATION','RENDERNODE_REQUEST_MAX_RETRY_COUNT') ):
            try:
                conn.request(method, url, body, headers)
                response = conn.getresponse()
                if response.length:
                    data = response.read(response.length)
                else:
                    data = None
                # request succeeded
                conn.close()
                return (response, data)
            except http.socket.error, e:
                err = e
                LOGGER.debug("socket error %r" % e)
                try:
                    conn.close()
                except:
                    pass
                if e.errno in (errno.ECONNREFUSED, errno.ENETUNREACH):
                    raise self.RequestFailed(cause=e)
            except http.HTTPException, e:
                err = e
                LOGGER.debug("HTTPException %r" % e)
                try:
                    conn.close()
                except:
                    pass
                LOGGER.exception("rendernode.request failed")
Example #21
    def request(self, method, url, body=None, headers={}):
        """
        """

        # from octopus.dispatcher import settings
        # LOGGER.debug("Send request to RN: http://%s:%s%s %s (%s)" % (self.host, self.port, url, method, headers))

        err = None
        conn = self.getHTTPConnection()

        # try to process the request at most RENDERNODE_REQUEST_MAX_RETRY_COUNT times.
        for i in xrange(singletonconfig.get('COMMUNICATION', 'RENDERNODE_REQUEST_MAX_RETRY_COUNT')):
            try:
                conn.request(method, url, body, headers)
                response = conn.getresponse()
                if response.length:
                    data = response.read(response.length)
                else:
                    data = None
                # request succeeded
                conn.close()
                return (response, data)
            except http.socket.error, e:
                err = e
                # LOGGER.debug("socket error %r" % e)
                try:
                    conn.close()
                except:
                    pass
                if e.errno in (errno.ECONNREFUSED, errno.ENETUNREACH):
                    raise self.RequestFailed(cause=e)
            except http.HTTPException, e:
                err = e
                # LOGGER.debug("HTTPException %r" % e)
                try:
                    conn.close()
                except:
                    pass
                LOGGER.exception("rendernode.request failed")
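request() retries the HTTP exchange up to RENDERNODE_REQUEST_MAX_RETRY_COUNT times, closing the connection after each failure and aborting immediately when the node is plainly unreachable. A minimal sketch of that bounded-retry shape; the retry count is a placeholder and do_request stands in for the actual conn.request/getresponse pair:

import errno
import socket

MAX_RETRY = 3  # stand-in for RENDERNODE_REQUEST_MAX_RETRY_COUNT

def request_with_retry(do_request):
    last_error = None
    for _ in range(MAX_RETRY):
        try:
            return do_request()
        except socket.error as e:
            last_error = e
            # A refused or unreachable host will not get better by retrying.
            if e.errno in (errno.ECONNREFUSED, errno.ENETUNREACH):
                raise
    raise RuntimeError("request failed after %d attempts: %r" % (MAX_RETRY, last_error))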
Example #22
    def aggregate(self):
        """
        | Called each cycle to store data in a buffer array
        | Once every BUFFER_SIZE cycles, the data is dumped in stats.log for later use
        """

        cycleData = [
            self.cycleDate,
            copy(self.cycleTimers),
            copy(self.cycleCounts),
            copy(self.assignmentTimers)
        ]
        self.accumulationBuffer.append(cycleData)

        # Clean data for next cycle (only counts need to be cleaned, timers are overwritten)
        self._resetCounts()

        # Dump to file
        if singletonconfig.get('CORE', 'STATS_BUFFER_SIZE') <= len(
                self.accumulationBuffer):
            self._flush()

        return True
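aggregate() copies the cycle's timers and counters into an in-memory buffer and only writes to disk once STATS_BUFFER_SIZE cycles have accumulated, which keeps per-cycle I/O low. A minimal sketch of that buffer-and-flush idea with an illustrative buffer size and payload:

from copy import copy

class StatsBuffer(object):
    def __init__(self, buffer_size):
        self.buffer_size = buffer_size
        self.buffer = []

    def aggregate(self, cycleCounts):
        # Copy the dict so later mutations of cycleCounts do not alter buffered data.
        self.buffer.append(copy(cycleCounts))
        if self.buffer_size <= len(self.buffer):
            self._flush()

    def _flush(self):
        print("flushing %d cycles to stats.log" % len(self.buffer))
        self.buffer = []

stats = StatsBuffer(buffer_size=2)
stats.aggregate({"incoming_requests": 4})
stats.aggregate({"incoming_requests": 7})  # second call triggers the flush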
Example #23
    def put(self, computerName, commandId):
        '''Update command `commandId` running on rendernode `renderNodeId`.

        Returns "200 OK" on success, or "404 Bad Request" if the provided json data is not valid.
        '''

        if singletonconfig.get('CORE', 'GET_STATS'):
            singletonstats.theStats.cycleCounts['update_commands'] += 1

        computerName = computerName.lower()
        # try:
        #     updateDict = self.sanitizeUpdateDict(self.getBodyAsJSON())
        # except TypeError, e:
        #     return Http400(repr(e.args))
        updateDict = self.getBodyAsJSON()
        updateDict['renderNodeName'] = computerName

        try:
            self.framework.application.updateCommandApply(updateDict)
        except (KeyError, IndexError) as e:
            raise Http404(str(e))
        except Exception, e:
            raise Http500("Exception during command update")
Example #24
    def computeAssignments(self):
        '''Computes and returns a list of (rendernode, command) assignments.'''

        LOGGER = logging.getLogger('main')

        from .model.node import NoRenderNodeAvailable, NoLicenseAvailableForTask
        # if no rendernodes available, return
        if not any(rn.isAvailable()
                   for rn in self.dispatchTree.renderNodes.values()):
            return []

        assignments = []

        # first create a set of entrypoints that are not done nor cancelled nor blocked nor paused and that have at least one command ready
        # FIXME: hack to avoid getting the 'graphs' poolShare node in entryPoints, need to avoid it more nicely...
        entryPoints = set([
            poolShare.node
            for poolShare in self.dispatchTree.poolShares.values()
            if poolShare.node.status not in
            [NODE_BLOCKED, NODE_DONE, NODE_CANCELED, NODE_PAUSED]
            and poolShare.node.readyCommandCount > 0
            and poolShare.node.name != 'graphs'
        ])

        # don't proceed to the calculation if there are no RNs available in the requested pools
        rnsBool = False
        for pool, nodesiterator in groupby(
                entryPoints, lambda x: x.poolShares.values()[0].pool):
            rnsAvailables = set([
                rn for rn in pool.renderNodes
                if rn.status not in [RN_UNKNOWN, RN_PAUSED, RN_WORKING]
            ])
            if len(rnsAvailables):
                rnsBool = True

        if not rnsBool:
            return []

        # Log time updating max rn
        prevTimer = time.time()

        # sort by pool for the groupby
        entryPoints = sorted(entryPoints,
                             key=lambda node: node.poolShares.values()[0].pool)

        # update the value of the maxrn for the poolshares (parallel dispatching)
        for pool, nodesiterator in groupby(
                entryPoints, lambda x: x.poolShares.values()[0].pool):

            # we are treating every active node of the pool
            nodesList = [node for node in nodesiterator]

            # the new maxRN value is calculated based on the number of active jobs of the pool, and the number of online rendernodes of the pool
            rnsNotOffline = set([
                rn for rn in pool.renderNodes
                if rn.status not in [RN_UNKNOWN, RN_PAUSED]
            ])
            rnsSize = len(rnsNotOffline)
            # LOGGER.debug("@   - nb rns awake:%r" % (rnsSize) )

            # if we have a user-defined maxRN for some nodes, remove them from the list and subtract their maxRN from the pool's size
            l = nodesList[:]  # duplicate the list to be safe when removing elements
            for node in l:
                # LOGGER.debug("@   - checking userDefMaxRN: %s -> %r maxRN=%d" % (node.name, node.poolShares.values()[0].userDefinedMaxRN, node.poolShares.values()[0].maxRN ) )
                if node.poolShares.values(
                )[0].userDefinedMaxRN and node.poolShares.values(
                )[0].maxRN not in [-1, 0]:
                    # LOGGER.debug("@     removing: %s -> maxRN=%d" % (node.name, node.poolShares.values()[0].maxRN ) )
                    nodesList.remove(node)
                    rnsSize -= node.poolShares.values()[0].maxRN

            # LOGGER.debug("@   - nb rns awake after maxRN:%d" % (rnsSize) )

            if len(nodesList) == 0:
                continue

            # Prepare updatedMaxRN with dispatch key proportions
            dkList = []  # list of dks (integer only)
            dkPositiveList = []  # Normalized list of dks (the min dk becomes 1, higher dks get proportional values)
            nbJobs = len(nodesList)  # number of jobs in the current pool
            nbRNAssigned = 0  # number of render nodes assigned for this pool

            for node in nodesList:
                dkList.append(node.dispatchKey)

            dkMin = min(dkList)
            dkPositiveList = map(lambda x: x - dkMin + 1, dkList)
            dkSum = sum(dkPositiveList)

            # sort by id (fifo)
            nodesList = sorted(nodesList, key=lambda x: x.id)

            # then sort by dispatchKey (priority)
            nodesList = sorted(nodesList,
                               key=lambda x: x.dispatchKey,
                               reverse=True)

            for dk, nodeIterator in groupby(nodesList,
                                            lambda x: x.dispatchKey):

                nodes = [node for node in nodeIterator]
                dkPos = dkPositiveList[dkList.index(dk)]

                if dkSum > 0:
                    updatedmaxRN = int(round(rnsSize * (dkPos / float(dkSum))))
                else:
                    updatedmaxRN = int(round(rnsSize / float(nbJobs)))

                for node in nodes:
                    node.poolShares.values()[0].maxRN = updatedmaxRN
                    nbRNAssigned += updatedmaxRN

            # Add remaining RNs to most important jobs
            unassignedRN = rnsSize - nbRNAssigned
            while unassignedRN > 0:
                for node in nodesList:
                    if unassignedRN > 0:
                        node.poolShares.values()[0].maxRN += 1
                        unassignedRN -= 1
                    else:
                        break

        if singletonconfig.get('CORE', 'GET_STATS'):
            singletonstats.theStats.assignmentTimers[
                'update_max_rn'] = time.time() - prevTimer
        LOGGER.info("%8.2f ms --> .... updating max RN values",
                    (time.time() - prevTimer) * 1000)

        # now, we are treating every node
        # sort by id (fifo)
        entryPoints = sorted(entryPoints, key=lambda node: node.id)
        # then sort by dispatchKey (priority)
        entryPoints = sorted(entryPoints,
                             key=lambda node: node.dispatchKey,
                             reverse=True)

        # Put nodes with a userDefinedMaxRN first
        userDefEntryPoints = ifilter(
            lambda node: node.poolShares.values()[0].userDefinedMaxRN,
            entryPoints)
        standardEntryPoints = ifilter(
            lambda node: not node.poolShares.values()[0].userDefinedMaxRN,
            entryPoints)
        scoredEntryPoints = chain(userDefEntryPoints, standardEntryPoints)

        # Log time dispatching RNs
        prevTimer = time.time()

        # Iterate over each entryPoint to get an assignment
        for entryPoint in scoredEntryPoints:
            if any([
                    poolShare.hasRenderNodesAvailable()
                    for poolShare in entryPoint.poolShares.values()
            ]):
                try:

                    for (rn, com) in entryPoint.dispatchIterator(
                            lambda: self.queue.qsize() > 0):
                        assignments.append((rn, com))
                        # increment the allocatedRN for the poolshare
                        poolShare.allocatedRN += 1
                        # save the active poolshare of the rendernode
                        rn.currentpoolshare = poolShare

                except NoRenderNodeAvailable:
                    pass
                except NoLicenseAvailableForTask:
                    LOGGER.info(
                        "Missing license for node \"%s\" (other commands can start anyway)."
                        % entryPoint.name)
                    pass

        assignmentDict = collections.defaultdict(list)
        for (rn, com) in assignments:
            assignmentDict[rn].append(com)

        if singletonconfig.get('CORE', 'GET_STATS'):
            singletonstats.theStats.assignmentTimers[
                'dispatch_command'] = time.time() - prevTimer
        LOGGER.info("%8.2f ms --> .... dispatching commands",
                    (time.time() - prevTimer) * 1000)

        #
        # Check replacements
        #
        # - do a pass over the jobs that did not get their share of the pie
        #     - identify the killable jobs in their pool
        #     - for each resource, if there is a match: drop the running job AND disable its killable attribute

        #
        # Backfill
        #
        # TODO: do another pass over the jobs that have a "killable" attribute and at least one additional pool

        return assignmentDict.items()
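The maxRN update normalises each job's dispatchKey so that the smallest key in the pool becomes 1, gives each job a share of the online render nodes proportional to its normalised key, and then hands any rounding leftovers to the highest-priority jobs first. A standalone sketch of just that arithmetic, with made-up numbers (jobs are assumed to be ordered by decreasing priority):

def allocate_max_rn(nbOnlineRenderNodes, dispatchKeys):
    # Shift keys so the minimum becomes 1, then split nodes proportionally.
    dkMin = min(dispatchKeys)
    positive = [dk - dkMin + 1 for dk in dispatchKeys]
    dkSum = float(sum(positive))
    shares = [int(round(nbOnlineRenderNodes * (p / dkSum))) for p in positive]
    # Hand out remaining nodes (rounding leftovers) starting with the first, highest-priority job.
    unassigned = nbOnlineRenderNodes - sum(shares)
    i = 0
    while unassigned > 0:
        shares[i % len(shares)] += 1
        unassigned -= 1
        i += 1
    return shares

print(allocate_max_rn(10, [3, 2, 1]))  # [5, 3, 2]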
Example #25
    def mainLoop(self):
        '''
        | Dispatcher main loop iteration.
        | Periodically called via tornado's internal callback mechanism; the frequency is defined by config: CORE.MASTER_UPDATE_INTERVAL
        | During this process, the dispatcher will:
        |   - update completion and status for all jobs in dispatchTree
        |   - update status of renderNodes
        |   - validate inter tasks dependencies
        |   - update the DB with recorded changes in the model
        |   - compute new assignments and send them to the proper rendernodes
        |   - release all finished jobs/rns
        '''
        log = logging.getLogger('main')
        loopStartTime = time.time()
        prevTimer = loopStartTime

        if singletonconfig.get('CORE', 'GET_STATS'):
            singletonstats.theStats.cycleDate = loopStartTime

        log.info("-----------------------------------------------------")
        log.info(" Start dispatcher process cycle (old version).")

        try:
            self.threadPool.poll()
        except NoResultsPending:
            pass
        else:
            log.info("finished some network requests")
            pass

        self.cycle += 1

        # Update of allocation is done when parsing the tree for completion and status update (done partially for invalidated node only i.e. when needed)
        self.dispatchTree.updateCompletionAndStatus()
        if singletonconfig.get('CORE', 'GET_STATS'):
            singletonstats.theStats.cycleTimers['update_tree'] = time.time(
            ) - prevTimer
        log.info("%8.2f ms --> update completion status" %
                 ((time.time() - prevTimer) * 1000))
        prevTimer = time.time()

        # Update render nodes
        self.updateRenderNodes()
        if singletonconfig.get('CORE', 'GET_STATS'):
            singletonstats.theStats.cycleTimers['update_rn'] = time.time(
            ) - prevTimer
        log.info("%8.2f ms --> update render node" %
                 ((time.time() - prevTimer) * 1000))
        prevTimer = time.time()

        # Validate dependencies
        self.dispatchTree.validateDependencies()
        if singletonconfig.get('CORE', 'GET_STATS'):
            singletonstats.theStats.cycleTimers[
                'update_dependencies'] = time.time() - prevTimer
        log.info("%8.2f ms --> validate dependencies" %
                 ((time.time() - prevTimer) * 1000))
        prevTimer = time.time()

        # update db
        self.updateDB()
        if singletonconfig.get('CORE', 'GET_STATS'):
            singletonstats.theStats.cycleTimers['update_db'] = time.time(
            ) - prevTimer
        log.info("%8.2f ms --> update DB" % ((time.time() - prevTimer) * 1000))
        prevTimer = time.time()

        # compute and send command assignments to rendernodes
        assignments = self.computeAssignments()
        if singletonconfig.get('CORE', 'GET_STATS'):
            singletonstats.theStats.cycleTimers[
                'compute_assignment'] = time.time() - prevTimer
        log.info("%8.2f ms --> compute assignments." %
                 ((time.time() - prevTimer) * 1000))
        prevTimer = time.time()

        self.sendAssignments(assignments)
        if singletonconfig.get('CORE', 'GET_STATS'):
            singletonstats.theStats.cycleTimers['send_assignment'] = time.time(
            ) - prevTimer
            singletonstats.theStats.cycleCounts['num_assignments'] = len(
                assignments)
        log.info("%8.2f ms --> send %r assignments." %
                 ((time.time() - prevTimer) * 1000, len(assignments)))
        prevTimer = time.time()

        # call the release finishing status on all rendernodes
        for renderNode in self.dispatchTree.renderNodes.values():
            renderNode.releaseFinishingStatus()
        if singletonconfig.get('CORE', 'GET_STATS'):
            singletonstats.theStats.cycleTimers[
                'release_finishing'] = time.time() - prevTimer
        log.info("%8.2f ms --> releaseFinishingStatus" %
                 ((time.time() - prevTimer) * 1000))
        prevTimer = time.time()

        loopDuration = (time.time() - loopStartTime) * 1000
        log.info("%8.2f ms --> cycle ended. " % loopDuration)

        #
        # Send stat data to disk
        #
        if singletonconfig.get('CORE', 'GET_STATS'):
            singletonstats.theStats.cycleTimers['time_elapsed'] = time.time(
            ) - loopStartTime
            singletonstats.theStats.aggregate()
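Each phase of mainLoop() is timed by snapshotting time.time() before and after it and logging the elapsed milliseconds; the same pattern repeats for the tree update, render-node update, DB update, assignment and release steps. A tiny helper expressing that per-phase timing (the phase body here is a dummy workload):

import time

def timed(label, fn):
    start = time.time()
    result = fn()
    print("%8.2f ms --> %s" % ((time.time() - start) * 1000, label))
    return result

timed("update completion status", lambda: sum(range(100000)))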
Example #26
    #
    _request = "http://%s:%s/stats" % (options.hostname, options.port)
    _logPath = os.path.join(options.outputFile)

    # fileHandler = logging.handlers.RotatingFileHandler( _logPath,
    #                                                     maxBytes=20000000,
    #                                                     backupCount=1,
    #                                                     encoding="UTF-8")

    fileHandler = logging.FileHandler(_logPath, encoding="UTF-8")

    fileHandler.setFormatter(logging.Formatter('%(message)s'))

    statsLogger = logging.getLogger('stats')
    statsLogger.addHandler(fileHandler)
    statsLogger.setLevel(singletonconfig.get('CORE', 'LOG_LEVEL'))

    http_client = HTTPClient()
    try:
        response = http_client.fetch(_request)

        if response.error:
            print "Error:   %s" % response.error
            print "         %s" % response.body
        else:
            if response.body == "":
                print "Error: No stats retrieved"
            else:
                tmp = json.loads(response.body)
                del tmp["jobs"]
                del tmp["commands"]
Example #27
    def computeAssignments(self):
        '''Computes and returns a list of (rendernode, command) assignments.'''

        LOGGER = logging.getLogger('main')

        from .model.node import NoRenderNodeAvailable, NoLicenseAvailableForTask
        # if no rendernodes available, return
        if not any(rn.isAvailable() for rn in self.dispatchTree.renderNodes.values()):
            return []

        # first create a set of entrypoints that are not done nor cancelled nor blocked nor paused and that have at least one command ready
        # FIXME: hack to avoid getting the 'graphs' poolShare node in entryPoints, need to avoid it more nicely...
        entryPoints = set([poolShare.node for poolShare in self.dispatchTree.poolShares.values()
                                if poolShare.node.status not in (NODE_BLOCKED, NODE_DONE, NODE_CANCELED, NODE_PAUSED) and poolShare.node.readyCommandCount > 0 and poolShare.node.name != 'graphs'])

        # don't proceed to the calculation if no render nodes available in the requested pools
        isRenderNodesAvailable = False
        for pool, jobsIterator in groupby(entryPoints, lambda x: x.mainPoolShare().pool):
            renderNodesAvailable = set([rn for rn in pool.renderNodes if rn.status not in [RN_UNKNOWN, RN_PAUSED, RN_WORKING]])
            if len(renderNodesAvailable):
                isRenderNodesAvailable = True
                break
        if not isRenderNodesAvailable:
            return []

        # Log time updating max rn
        prevTimer = time.time()

        # sort by pool for the groupby
        entryPoints = sorted(entryPoints, key=lambda node: node.mainPoolShare().pool)

        # update the value of the maxrn for the poolshares (parallel dispatching)
        for pool, jobsIterator in groupby(entryPoints, lambda x: x.mainPoolShare().pool):

            # we are treating every active job of the pool
            jobsList = [job for job in jobsIterator]

            # the new maxRN value is calculated based on the number of active jobs of the pool, and the number of online rendernodes of the pool
            onlineRenderNodes = set([rn for rn in pool.renderNodes if rn.status not in [RN_UNKNOWN, RN_PAUSED]])
            nbOnlineRenderNodes = len(onlineRenderNodes)
            # LOGGER.debug("@   - nb rns awake:%r" % (nbOnlineRenderNodes) )

            # if we have a user-defined maxRN for some nodes, remove them from the list and subtract their maxRN from the pool's size
            l = jobsList[:]  # duplicate the list to be safe when removing elements
            for job in l:
                # LOGGER.debug("@   - checking userDefMaxRN: %s -> %r maxRN=%d" % (job.name, job.mainPoolShare().userDefinedMaxRN, job.mainPoolShare().maxRN ) )
                if job.mainPoolShare().userDefinedMaxRN and job.mainPoolShare().maxRN not in [-1, 0]:
                    # LOGGER.debug("@     removing: %s -> maxRN=%d" % (job.name, job.mainPoolShare().maxRN ) )
                    jobsList.remove(job)
                    nbOnlineRenderNodes -= job.mainPoolShare().maxRN

            # LOGGER.debug("@   - nb rns awake after maxRN:%d" % (nbOnlineRenderNodes) )
            if len(jobsList) == 0:
                continue

            # Prepare updatedMaxRN with dispatch key proportions
            # list of dks (integer only)
            dkList = [job.dispatchKey for job in jobsList]
            nbJobs = len(jobsList)     # number of jobs in the current pool
            nbRNAssigned = 0            # number of render nodes assigned for this pool

            dkMin = min(dkList)
            # dkPositiveList: Shift all dks values in order that each min value of dk becomes 1
            dkPositiveList = map(lambda x: x-dkMin+1, dkList)  # dk values start at 1
            dkSum = sum(dkPositiveList)

            # sort by id (fifo)
            jobsList = sorted(jobsList, key=lambda x: x.id)

            # then sort by dispatchKey (priority)
            jobsList = sorted(jobsList, key=lambda x: x.dispatchKey, reverse=True)

            for dk, jobIterator in groupby(jobsList, lambda x: x.dispatchKey):

                jobs = [job for job in jobIterator]
                # dkPositive: Shift all dks values in order that each min value of dk becomes 1
                dkPositive = dk - dkMin + 1

                # Proportion of render nodes for
                updatedmaxRN = int(round(nbOnlineRenderNodes * (dkPositive / float(dkSum))))

                for job in jobs:
                    job.mainPoolShare().maxRN = updatedmaxRN
                    nbRNAssigned += updatedmaxRN

            # PRA: Here is the main choice!
            # Add remaining RNs to most important jobs (to fix rounding errors)
            unassignedRN = nbOnlineRenderNodes - nbRNAssigned
            while unassignedRN > 0:
                for job in jobsList:
                    if unassignedRN <= 0:
                        break
                    job.mainPoolShare().maxRN += 1
                    unassignedRN -= 1

        if singletonconfig.get('CORE','GET_STATS'):
            singletonstats.theStats.assignmentTimers['update_max_rn'] = time.time() - prevTimer
        LOGGER.info( "%8.2f ms --> .... updating max RN values", (time.time() - prevTimer)*1000 )

        # now, we are treating every node
        # sort by id (fifo)
        entryPoints = sorted(entryPoints, key=lambda node: node.id)
        # then sort by dispatchKey (priority)
        entryPoints = sorted(entryPoints, key=lambda node: node.dispatchKey, reverse=True)

        # Put nodes with a userDefinedMaxRN first
        userDefEntryPoints = ifilter(lambda node: node.mainPoolShare().userDefinedMaxRN, entryPoints)
        standardEntryPoints = ifilter(lambda node: not node.mainPoolShare().userDefinedMaxRN, entryPoints)
        scoredEntryPoints = chain(userDefEntryPoints, standardEntryPoints)

        # Log time dispatching RNs
        prevTimer = time.time()

        # Iterate over each entryPoint to get an assignment
        assignments = []  # list of (renderNode, Command)
        for entryPoint in scoredEntryPoints:
            # If we have dedicated render nodes for this poolShare
            if not any([poolShare.hasRenderNodesAvailable() for poolShare in entryPoint.poolShares.values()]):
                continue

            for (rn, com) in entryPoint.dispatchIterator(lambda: self.queue.qsize() > 0):
                assignments.append((rn, com))
                # increment the allocatedRN for the poolshare
                entryPoint.mainPoolShare().allocatedRN += 1
                # save the active poolshare of the rendernode
                rn.currentpoolshare = entryPoint.mainPoolShare()

        assignmentDict = collections.defaultdict(list)
        for (rn, com) in assignments:
            assignmentDict[rn].append(com)

        if singletonconfig.get('CORE','GET_STATS'):
            singletonstats.theStats.assignmentTimers['dispatch_command'] = time.time() - prevTimer
        LOGGER.info( "%8.2f ms --> .... dispatching commands", (time.time() - prevTimer)*1000  )

        #
        # Check replacements
        #
        # - do a pass over the jobs that did not get their share of the pie
        #     - identify the killable jobs in their pool
        #     - for each resource, if there is a match: drop the running job AND disable its killable attribute


        #
        # Backfill
        #
        # TODO: do another pass over the jobs that have a "killable" attribute and at least one additional pool

        return assignmentDict.items()
Example #28
    def computeAssignments(self):
        '''Computes and returns a list of (rendernode, command) assignments.'''

        LOGGER = logging.getLogger('main')

        from .model.node import NoRenderNodeAvailable, NoLicenseAvailableForTask
        # if no rendernodes available, return
        if not any(rn.isAvailable()
                   for rn in self.dispatchTree.renderNodes.values()):
            return []

        # first create a set of entrypoints that are not done nor cancelled nor blocked nor paused and that have at least one command ready
        # FIXME: hack to avoid getting the 'graphs' poolShare node in entryPoints, need to avoid it more nicely...
        entryPoints = set([
            poolShare.node
            for poolShare in self.dispatchTree.poolShares.values()
            if poolShare.node.status not in (
                NODE_BLOCKED, NODE_DONE, NODE_CANCELED,
                NODE_PAUSED) and poolShare.node.readyCommandCount > 0
            and poolShare.node.name != 'graphs'
        ])

        # don't proceed to the calculation if no render nodes available in the requested pools
        isRenderNodesAvailable = False
        for pool, jobsIterator in groupby(entryPoints,
                                          lambda x: x.mainPoolShare().pool):
            renderNodesAvailable = set([
                rn for rn in pool.renderNodes
                if rn.status not in [RN_UNKNOWN, RN_PAUSED, RN_WORKING]
            ])
            if len(renderNodesAvailable):
                isRenderNodesAvailable = True
                break
        if not isRenderNodesAvailable:
            return []

        # Log time updating max rn
        prevTimer = time.time()

        # sort by pool for the groupby
        entryPoints = sorted(entryPoints,
                             key=lambda node: node.mainPoolShare().pool)

        # update the value of the maxrn for the poolshares (parallel dispatching)
        for pool, jobsIterator in groupby(entryPoints,
                                          lambda x: x.mainPoolShare().pool):

            # we are treating every active job of the pool
            jobsList = [job for job in jobsIterator]

            # the new maxRN value is calculated based on the number of active jobs of the pool, and the number of online rendernodes of the pool
            onlineRenderNodes = set([
                rn for rn in pool.renderNodes
                if rn.status not in [RN_UNKNOWN, RN_PAUSED]
            ])
            nbOnlineRenderNodes = len(onlineRenderNodes)
            # LOGGER.debug("@   - nb rns awake:%r" % (nbOnlineRenderNodes) )

            # if we have a user-defined maxRN for some nodes, remove them from the list and subtract their maxRN from the pool's size
            l = jobsList[:]  # duplicate the list to be safe when removing elements
            for job in l:
                # LOGGER.debug("@   - checking userDefMaxRN: %s -> %r maxRN=%d" % (job.name, job.mainPoolShare().userDefinedMaxRN, job.mainPoolShare().maxRN ) )
                if job.mainPoolShare().userDefinedMaxRN and job.mainPoolShare(
                ).maxRN not in [-1, 0]:
                    # LOGGER.debug("@     removing: %s -> maxRN=%d" % (job.name, job.mainPoolShare().maxRN ) )
                    jobsList.remove(job)
                    nbOnlineRenderNodes -= job.mainPoolShare().maxRN

            # LOGGER.debug("@   - nb rns awake after maxRN:%d" % (nbOnlineRenderNodes) )
            if len(jobsList) == 0:
                continue

            # Prepare updatedMaxRN with dispatch key proportions
            # list of dks (integer only)
            dkList = [job.dispatchKey for job in jobsList]
            nbJobs = len(jobsList)  # number of jobs in the current pool
            nbRNAssigned = 0  # number of render nodes assigned for this pool

            dkMin = min(dkList)
            # dkPositiveList: Shift all dks values in order that each min value of dk becomes 1
            dkPositiveList = map(lambda x: x - dkMin + 1,
                                 dkList)  # dk values start at 1
            dkSum = sum(dkPositiveList)

            # sort by id (fifo)
            jobsList = sorted(jobsList, key=lambda x: x.id)

            # then sort by dispatchKey (priority)
            jobsList = sorted(jobsList,
                              key=lambda x: x.dispatchKey,
                              reverse=True)

            for dk, jobIterator in groupby(jobsList, lambda x: x.dispatchKey):

                jobs = [job for job in jobIterator]
                # dkPositive: Shift all dks values in order that each min value of dk becomes 1
                dkPositive = dk - dkMin + 1

                # Proportion of render nodes for
                updatedmaxRN = int(
                    round(nbOnlineRenderNodes * (dkPositive / float(dkSum))))

                for job in jobs:
                    job.mainPoolShare().maxRN = updatedmaxRN
                    nbRNAssigned += updatedmaxRN

            # PRA: Here is the main choice!
            # Add remaining RNs to most important jobs (to fix rounding errors)
            unassignedRN = nbOnlineRenderNodes - nbRNAssigned
            while unassignedRN > 0:
                for job in jobsList:
                    if unassignedRN <= 0:
                        break
                    job.mainPoolShare().maxRN += 1
                    unassignedRN -= 1

        if singletonconfig.get('CORE', 'GET_STATS'):
            singletonstats.theStats.assignmentTimers[
                'update_max_rn'] = time.time() - prevTimer
        LOGGER.info("%8.2f ms --> .... updating max RN values",
                    (time.time() - prevTimer) * 1000)

        # now we are treating every node
        # sort by id (fifo)
        entryPoints = sorted(entryPoints, key=lambda node: node.id)
        # then sort by dispatchKey (priority)
        entryPoints = sorted(entryPoints,
                             key=lambda node: node.dispatchKey,
                             reverse=True)

        # Put nodes with a userDefinedMaxRN first
        userDefEntryPoints = ifilter(
            lambda node: node.mainPoolShare().userDefinedMaxRN, entryPoints)
        standardEntryPoints = ifilter(
            lambda node: not node.mainPoolShare().userDefinedMaxRN,
            entryPoints)
        scoredEntryPoints = chain(userDefEntryPoints, standardEntryPoints)

        # Log time dispatching RNs
        prevTimer = time.time()

        # Iterate over each entryPoint to get an assignment
        assignments = []  # list of (renderNode, Command)
        for entryPoint in scoredEntryPoints:
            # If we have dedicated render nodes for this poolShare
            if not any([
                    poolShare.hasRenderNodesAvailable()
                    for poolShare in entryPoint.poolShares.values()
            ]):
                continue

            for (rn, com) in entryPoint.dispatchIterator(
                    lambda: self.queue.qsize() > 0):
                assignments.append((rn, com))
                # increment the allocatedRN for the poolshare
                entryPoint.mainPoolShare().allocatedRN += 1
                # save the active poolshare of the rendernode
                rn.currentpoolshare = entryPoint.mainPoolShare()

        assignmentDict = collections.defaultdict(list)
        for (rn, com) in assignments:
            assignmentDict[rn].append(com)

        if singletonconfig.get('CORE', 'GET_STATS'):
            singletonstats.theStats.assignmentTimers[
                'dispatch_command'] = time.time() - prevTimer
        LOGGER.info("%8.2f ms --> .... dispatching commands",
                    (time.time() - prevTimer) * 1000)

        #
        # Check replacements
        #
        # - do a pass over the jobs that did not get their share of the pie
        #     - identify the killable jobs in their pool
        #     - for each resource, if it matches: drop the running job AND disable its killable attribute

        #
        # Backfill
        #
        # TODO: do another pass over the jobs that have a "killable" attribute and at least one additional pool

        return assignmentDict.items()
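
The maxRN update above splits a pool's online render nodes between jobs in proportion to their dispatch keys (shifted so that the smallest key counts as 1), then hands out the rounding leftovers to the highest-priority jobs. A small standalone sketch of that arithmetic, with hypothetical job ids and keys, shows the intended result:

# Standalone sketch (hypothetical values) of the proportional maxRN split used above.
dispatchKeys = {1: 0, 2: 0, 3: 10}   # jobId -> dispatchKey
nbOnlineRenderNodes = 10

dkMin = min(dispatchKeys.values())
shifted = dict((jobId, dk - dkMin + 1) for jobId, dk in dispatchKeys.items())
dkSum = sum(shifted.values())

maxRN = dict((jobId, int(round(nbOnlineRenderNodes * (dk / float(dkSum)))))
             for jobId, dk in shifted.items())

# distribute the rounding leftovers to the jobs with the highest keys first
unassigned = nbOnlineRenderNodes - sum(maxRN.values())
for jobId in sorted(shifted, key=shifted.get, reverse=True):
    if unassigned <= 0:
        break
    maxRN[jobId] += 1
    unassigned -= 1

print maxRN   # {1: 1, 2: 1, 3: 8} for the values above
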
Ejemplo n.º 29
0
    def post(self, computerName):
        """
        A worker sends a request to get registered on the server.
        """
        if singletonconfig.get('CORE', 'GET_STATS'):
            singletonstats.theStats.cycleCounts['add_rns'] += 1

        computerName = computerName.lower()
        if computerName.startswith(('1', '2')):
            return Http403(
                message="Cannot register a RenderNode without a name",
                content="Cannot register a RenderNode without a name")

        dct = self.getBodyAsJSON()

        if computerName in self.getDispatchTree().renderNodes:
            # When the registering worker is already listed in RN list
            logger.warning("RenderNode already registered: %s" % computerName)
            existingRN = self.getDispatchTree().renderNodes[computerName]

            if 'commands' not in dct:
                # No commands on the registering RN, reset any commands that might still be assigned to this RN
                existingRN.reset()
            else:
                logger.warning(
                    "Reset commands that are assigned to this RN: %r" %
                    dct.get('commands', '-'))
                for cmdId in dct['commands']:
                    existingRN.commands[cmdId] = self.getDispatchTree().commands[cmdId]

            if 'status' in dct:
                existingRN.status = int(dct['status'])

            return HttpResponse(304, "RenderNode already registered.")

        else:
            # Add a new worker (and set infos given in request body)
            for key in ('name', 'port', 'status', 'cores', 'speed', 'ram',
                        'pools', 'caracteristics'):
                if not key in dct:
                    return Http400("Missing key %r" % key,
                                   content="Missing key %r" % key)
            port = int(dct['port'])
            status = int(dct['status'])
            if status not in (RN_UNKNOWN, RN_PAUSED, RN_IDLE, RN_BOOTING):
                # FIXME: CONFLICT is not a good value maybe
                return HttpConflict(
                    "Unallowed status for RenderNode registration")
            cores = int(dct['cores'])
            speed = float(dct['speed'])
            ram = int(dct['ram'])
            pools = dct['pools']
            caracteristics = dct['caracteristics']
            name, port = computerName.split(":", 1)

            puliversion = dct.get('puliversion', "unknown")
            createDate = dct.get('createDate', time.time())

            renderNode = RenderNode(None,
                                    computerName,
                                    cores,
                                    speed,
                                    name,
                                    port,
                                    ram,
                                    caracteristics,
                                    puliversion=puliversion,
                                    createDate=createDate)

            renderNode.status = status
            poolList = []
            # check the existence of the pools
            for poolName in pools:
                try:
                    pool = self.getDispatchTree().pools[poolName]
                    poolList.append(pool)
                except KeyError:
                    return HttpConflict("Pool %s is not a registered pool",
                                        poolName)
            # add the rendernode to the pools
            for pool in poolList:
                pool.addRenderNode(renderNode)
            # add the rendernode to the list of rendernodes
            renderNode.pools = poolList
            self.getDispatchTree().renderNodes[renderNode.name] = renderNode
            self.writeCallback(json.dumps(renderNode.to_json()))
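
The handler above validates a fixed set of keys in the JSON body before creating the RenderNode. A hypothetical minimal registration payload matching those checks (all values are illustrative) could be:

# Hypothetical registration body for the handler above; every key checked in the
# handler must be present, 'puliversion' and 'createDate' are optional.
registrationBody = {
    "name": "vfxnode01",
    "port": 8000,
    "status": RN_IDLE,      # must be one of RN_UNKNOWN, RN_PAUSED, RN_IDLE, RN_BOOTING
    "cores": 16,
    "speed": 2.6,
    "ram": 32768,
    "pools": ["default"],
    "caracteristics": {"os": "linux", "softs": []},
}
body = json.dumps(registrationBody)
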
Ejemplo n.º 30
0
import os
import logging
import logging.handlers

from octopus.core import singletonconfig
from octopus.dispatcher import settings

#
# Load specific logger for collecting stats.
#

# Custom level to avoid flooding the main loggers
# We use a logger and handler with a low level to ensure they always receive messages,
# even if the log level is changed via the config file and reloaded.
statsLog = logging.getLogger('server_stats')
statsLog.setLevel(1)
statsLog.propagate = False
try:
    hd = logging.handlers.RotatingFileHandler(
        os.path.join(settings.LOGDIR, "stats.log"),
        maxBytes=singletonconfig.get('CORE', 'STATS_SIZE'),
        backupCount=0)
    hd.setFormatter(logging.Formatter('%(message)s'))
    hd.setLevel(1)

    statsLog.addHandler(hd)
except IOError as err:
    print "Warning invalid path specified for log."


class DispatcherStats():
    """
    | Class holding custom infos on the dispatcher.
    | This data can be periodically flushed in a specific log file for later use
    """
        print "Command arguments: %s" % args
        print "Query: %s"+_request

    # fileHandler = logging.handlers.RotatingFileHandler( _logPath, 
    #                                                     maxBytes=20000000,
    #                                                     backupCount=1, 
    #                                                     encoding="UTF-8")

    fileHandler = logging.FileHandler( _logPath, encoding="UTF-8")


    fileHandler.setFormatter( logging.Formatter('%(message)s') )
    
    statsLogger = logging.getLogger('stats')
    statsLogger.addHandler( fileHandler )
    statsLogger.setLevel( singletonconfig.get('CORE','LOG_LEVEL') )


    http_client = HTTPClient()
    try:
        response = http_client.fetch( _request )

        if response.error:
            print "Error:   %s" % response.error
            print "         %s" % response.body
        else:
            if response.body == "":
                print "Error: No stats retrieved"
            else:
                data = json.loads(response.body)
Ejemplo n.º 32
0
    def getHTTPConnection(self):
        timeout = singletonconfig.get('COMMUNICATION', 'RENDERNODE_REQUEST_TIMEOUT', 5)
        return http.HTTPConnection(self.host, self.port, timeout=timeout)
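
The http module used here is presumably Python 2's httplib (the same alias appears in the RenderNode.request example further down). A trivial, hypothetical use of the returned connection would be:

# Hedged usage sketch; the URL is illustrative and error handling is omitted.
conn = renderNode.getHTTPConnection()
try:
    conn.request("GET", "/")
    response = conn.getresponse()
    print response.status, response.read()
finally:
    conn.close()
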
Ejemplo n.º 33
0
    def computeAssignments(self):
        '''Computes and returns a list of (rendernode, command) assignments.'''

        from .model.node import NoRenderNodeAvailable, NoLicenseAvailableForTask
        # if no rendernodes available, return
        if not any(rn.isAvailable() for rn in self.dispatchTree.renderNodes.values()):
            return []

        assignments = []

        # first create a set of entrypoints that are not done nor cancelled nor blocked nor paused and that have at least one command ready
        # FIXME: hack to avoid getting the 'graphs' poolShare node in entryPoints, need to avoid it more nicely...
        entryPoints = set([poolShare.node for poolShare in self.dispatchTree.poolShares.values() if poolShare.node.status not in [NODE_BLOCKED, NODE_DONE, NODE_CANCELED, NODE_PAUSED] and poolShare.node.readyCommandCount > 0 and poolShare.node.name != 'graphs'])

        # don't proceed to the calculation if no rns are available in the requested pools
        rnsBool = False
        for pool, nodesiterator in groupby(entryPoints, lambda x: x.poolShares.values()[0].pool):
            rnsAvailables = set([rn for rn in pool.renderNodes if rn.status not in [RN_UNKNOWN, RN_PAUSED, RN_WORKING]])
            if len(rnsAvailables):
                rnsBool = True

        if not rnsBool:
            return []


        # Log time updating max rn
        prevTimer = time.time()

        # sort by pool for the groupby
        entryPoints = sorted(entryPoints, key=lambda node: node.poolShares.values()[0].pool)

        # update the value of the maxrn for the poolshares (parallel dispatching)
        for pool, nodesiterator in groupby(entryPoints, lambda x: x.poolShares.values()[0].pool):

            # we are treating every active node of the pool
            nodesList = [node for node in nodesiterator]

            # the new maxRN value is calculated based on the number of active jobs of the pool, and the number of online rendernodes of the pool
            rnsNotOffline = set([rn for rn in pool.renderNodes if rn.status not in [RN_UNKNOWN, RN_PAUSED]])
            rnsSize = len(rnsNotOffline)
            # LOGGER.debug("@   - nb rns awake:%r" % (rnsSize) )

            # if we have a user-defined maxRN for some nodes, remove them from the list and subtract their maxRN from the pool's size
            l = nodesList[:]  # duplicate the list to be safe when removing elements
            for node in l:
                # LOGGER.debug("@   - checking userDefMaxRN: %s -> %r maxRN=%d" % (node.name, node.poolShares.values()[0].userDefinedMaxRN, node.poolShares.values()[0].maxRN ) )
                if node.poolShares.values()[0].userDefinedMaxRN and node.poolShares.values()[0].maxRN not in [-1, 0]:
                    # LOGGER.debug("@     removing: %s -> maxRN=%d" % (node.name, node.poolShares.values()[0].maxRN ) )
                    nodesList.remove(node)
                    rnsSize -= node.poolShares.values()[0].maxRN

            # LOGGER.debug("@   - nb rns awake after maxRN:%d" % (rnsSize) )

            if len(nodesList) == 0:
                continue

            # Prepare updatedMaxRN with dispatch key proportions
            dkList = []                 # list of dks (integer only)
            dkPositiveList = []         # Normalized list of dks (the minimum dk becomes 1, higher elements of dkList get a proportional value)
            nbJobs = len(nodesList)     # number of jobs in the current pool
            nbRNAssigned = 0            # number of render nodes assigned for this pool

            for node in nodesList:
                dkList.append(node.dispatchKey)

            dkMin = min(dkList)
            dkPositiveList = map(lambda x: x-dkMin+1, dkList)
            dkSum = sum(dkPositiveList)

            # sort by id (fifo)
            nodesList = sorted(nodesList, key=lambda x: x.id)

            # then sort by dispatchKey (priority)
            nodesList = sorted(nodesList, key=lambda x: x.dispatchKey, reverse=True)
            
            for dk, nodeIterator in groupby(nodesList, lambda x: x.dispatchKey):

                nodes = [node for node in nodeIterator]
                dkPos = dkPositiveList[ dkList.index(dk) ]

                if dkSum > 0:                  
                    updatedmaxRN = int( round( rnsSize * (dkPos / float(dkSum) )))
                else:
                    updatedmaxRN = int(round( rnsSize / float(nbJobs) ))

                for node in nodes:
                    node.poolShares.values()[0].maxRN = updatedmaxRN
                    nbRNAssigned += updatedmaxRN

            # Add remaining RNs to most important jobs
            unassignedRN = rnsSize - nbRNAssigned
            while unassignedRN > 0:
                for node in nodesList:
                    if unassignedRN > 0:
                        node.poolShares.values()[0].maxRN += 1
                        unassignedRN -= 1
                    else:
                        break

        if singletonconfig.get('CORE','GET_STATS'):
            singletonstats.theStats.assignmentTimers['update_max_rn'] = time.time() - prevTimer
        LOGGER.info( "%8.2f ms --> .... updating max RN values", (time.time() - prevTimer)*1000 )

        # now we are treating every node
        # sort by id (fifo)
        entryPoints = sorted(entryPoints, key=lambda node: node.id)
        # then sort by dispatchKey (priority)
        entryPoints = sorted(entryPoints, key=lambda node: node.dispatchKey, reverse=True)

        # Put nodes with a userDefinedMaxRN first
        userDefEntryPoints = ifilter( lambda node: node.poolShares.values()[0].userDefinedMaxRN, entryPoints )
        standardEntryPoints = ifilter( lambda node: not node.poolShares.values()[0].userDefinedMaxRN, entryPoints )
        scoredEntryPoints = chain( userDefEntryPoints, standardEntryPoints)


        # Log time dispatching RNs
        prevTimer = time.time()

        # 
        # HACK: update license info for katana with rlmutils
        # This helps to have the real number of used licenses before finishing the assignment
        # This is done because katana rlm management sometimes reserves 2 tokens (cf BUGLIST v1.4)
        try:
            import subprocess
            strRlmKatanaUsed=''
            strRlmKatanaUsed = subprocess.Popen(["/s/apps/lin/farm/tools/rlm_katana_used.sh"], stdout=subprocess.PIPE).communicate()[0]

            katanaUsed = int(strRlmKatanaUsed)
            LOGGER.debug("HACK update katana license: used = %d (+buffer in config:%d)" % (katanaUsed,singletonconfig.get('HACK','KATANA_BUFFER')))

            # Sets used license number
            try:
                self.licenseManager.licenses["katana"].used = katanaUsed + singletonconfig.get('HACK','KATANA_BUFFER')
            except KeyError:
                LOGGER.warning("License katana not found... Impossible to set 'used' value: %d" % katanaUsed)
        except Exception, e:
            LOGGER.warning("Error getting number of katana license used via rlmutil (e: %r, rlmoutput=%r)" % (e,strRlmKatanaUsed))
Ejemplo n.º 34
0
class RenderNode(models.Model):
    '''This class represents the state of a RenderNode.'''

    # Sys infos
    name = models.StringField()
    speed = models.FloatField()
    coresNumber = models.IntegerField()
    ramSize = models.IntegerField()

    # Dynamic sys infos
    freeCoresNumber = models.IntegerField()
    usedCoresNumber = models.DictField(as_item_list=True)
    freeRam = models.IntegerField()
    systemFreeRam = models.IntegerField()
    systemSwapPercentage = models.FloatField()
    usedRam = models.DictField(as_item_list=True)

    # Worker state
    puliversion = models.StringField()
    commands = models.ModelDictField()
    status = models.IntegerField()
    host = models.StringField()
    port = models.IntegerField()
    pools = models.ModelListField(indexField='name')
    caracteristics = models.DictField()
    isRegistered = models.BooleanField()
    performance = models.FloatField()
    excluded = models.BooleanField()

    # Timers
    createDate = models.FloatField()
    registerDate = models.FloatField()
    lastAliveTime = models.FloatField()

    def __init__(self, id, name, coresNumber, speed, ip, port, ramSize, caracteristics=None, performance=0.0, puliversion="undefined", createDate=None):
        '''Constructs a new Rendernode.

        :parameters:
        - `name`: the name of the rendernode
        - `coresNumber`: the number of processors
        - `speed`: the speed of the processor
        '''
        self.id = int(id) if id else None
        self.name = str(name)

        self.coresNumber = int(coresNumber)
        self.ramSize = int(ramSize)
        self.licenseManager = None
        self.freeCoresNumber = int(coresNumber)
        self.usedCoresNumber = {}
        self.freeRam = int(ramSize)  # ramSize-usedRam i.e. the amount of RAM used if several commands running concurrently
        self.systemFreeRam = int(ramSize)  # the RAM available on the system (updated each ping)
        self.systemSwapPercentage = 0
        self.usedRam = {}

        self.speed = speed
        self.commands = {}
        self.status = RN_UNKNOWN
        self.responseId = None
        self.host = str(ip)
        self.port = int(port)
        self.pools = []
        self.idInformed = False
        self.isRegistered = False
        self.lastAliveTime = 0
        self.httpConnection = None
        self.caracteristics = caracteristics if caracteristics else {}
        self.currentpoolshare = None
        self.performance = float(performance)
        self.history = deque(maxlen=singletonconfig.get('CORE', 'RN_NB_ERRORS_TOLERANCE'))
        self.tasksHistory = deque(maxlen=15)
        self.excluded = False

        # Init new data
        self.puliversion = puliversion
        if createDate is None:
            self.createDate = 0
        else:
            self.createDate = createDate

        self.registerDate = time.time()

        # Flag linked to the worker flag "isPaused". Handles the case when a worker is set paused but a command is still running (finishing)
        # the RN on the dispatcher must be flagged not to be assigned (i.e. in the isAvailable property)
        # self.canBeAssigned = True

        if not "softs" in self.caracteristics:
            self.caracteristics["softs"] = []

    ## Returns True if this render node is available for command assignment.
    #
    def isAvailable(self):
        # Need to avoid nodes that have flag isPaused set (i.e. nodes paused by user but still running a command)
        return (self.isRegistered and self.status == RN_IDLE and not self.commands and not self.excluded)

    def reset(self, paused=False):
        # if paused, set the status to RN_PAUSED, else set it to Finishing, it will be set to IDLE in the next iteration of the dispatcher main loop
        if paused:
            self.status = RN_PAUSED
        else:
            self.status = RN_FINISHING
        # reset the commands left on this RN, if any
        for cmd in self.commands.values():
            cmd.status = CMD_READY
            cmd.completion = 0.
            cmd.renderNode = None
            self.clearAssignment(cmd)
        self.commands = {}
        # reset the associated poolshare, if any
        if self.currentpoolshare:
            self.currentpoolshare.allocatedRN -= 1
            self.currentpoolshare = None
        # reset the values for cores and ram
        self.freeCoresNumber = int(self.coresNumber)
        self.usedCoresNumber = {}
        self.freeRam = int(self.ramSize)
        self.usedRam = {}

    ## Returns a human readable representation of this RenderNode.
    #
    def __repr__(self):
        return u'RenderNode(id=%s, name=%s, host=%s, port=%s)' % (repr(self.id), repr(self.name), repr(self.host), repr(self.port))

    ## Clears all of this rendernode's fields related to the specified assignment.
    #
    def clearAssignment(self, command):
        '''Removes command from the list of commands assigned to this rendernode.'''
        # in case of failed assignment, decrement the allocatedRN value
        if self.currentpoolshare:
            self.currentpoolshare.allocatedRN -= 1
            self.currentpoolshare = None
        try:
            del self.commands[command.id]
        except KeyError:
            pass
            #LOGGER.debug('attempt to clear assignment of not assigned command %d on worker %s', command.id, self.name)
        else:
            self.releaseRessources(command)
            self.releaseLicense(command)

    ## Add a command assignment
    #
    def addAssignment(self, command):
        if not command.id in self.commands:
            self.commands[command.id] = command
            self.reserveRessources(command)
            # FIXME the assignment of the cmd should be done here and not in the dispatchIterator func
            command.assign(self)
            self.updateStatus()

    ## Reserve license
    #
    def reserveLicense(self, command, licenseManager):
        self.licenseManager = licenseManager
        lic = command.task.lic
        if not lic:
            return True
        return licenseManager.reserveLicenseForRenderNode(lic, self)

    ## Release licence
    #
    def releaseLicense(self, command):
        lic = command.task.lic
        if lic and self.licenseManager:
            self.licenseManager.releaseLicenseForRenderNode(lic, self)

    ## Reserve ressource
    #
    def reserveRessources(self, command):
        res = min(self.freeCoresNumber, command.task.maxNbCores) or self.freeCoresNumber
        self.usedCoresNumber[command.id] = res
        self.freeCoresNumber -= res

        res = min(self.freeRam, command.task.ramUse) or self.freeRam

        self.usedRam[command.id] = res
        self.freeRam -= res

    ## Release ressource
    #
    def releaseRessources(self, command):
        #res = self.usedCoresNumber[command.id]
        self.freeCoresNumber = self.coresNumber
        if command.id in self.usedCoresNumber:
            del self.usedCoresNumber[command.id]

        #res = self.usedRam[command.id]
        self.freeRam = self.ramSize
        if command.id in self.usedRam:
            del self.usedRam[command.id]

    ## Unassign a finished command
    #
    def unassign(self, command):
        if not isFinalStatus(command.status):
            raise ValueError("cannot unassign unfinished command %s" % repr(command))
        self.clearAssignment(command)
        self.updateStatus()

    def remove(self):
        self.fireDestructionEvent(self)

    def updateStatus(self):
        """
        Update the rendernode status according to its state: whether it has commands, their status, elapsed time, etc.
        The status is not changed if the commands bring no new information.
        """
        # self.status is not RN_PAUSED and time elapsed is enough
        if time.time() > (self.lastAliveTime + singletonconfig.conf["COMMUNICATION"]["RN_TIMEOUT"]):

            # set the status of a render node to RN_UNKNOWN after TIMEOUT seconds have elapsed since last update
            # timeout the commands running on this node
            if RN_UNKNOWN != self.status:
                LOGGER.warning("rendernode %s is not responding", self.name)
                self.status = RN_UNKNOWN
                if self.commands:
                    for cmd in self.commands.values():
                        cmd.status = CMD_TIMEOUT
                        self.clearAssignment(cmd)
            return
        # This is necessary in case of a cancel command or a mylawn -k
        if not self.commands:
            # if self.status is RN_WORKING:
            #     # cancel the command that is running on this RN because it's no longer registered in the model
            #     LOGGER.warning("rendernode %s is reported as working but has no registered command" % self.name)
            if self.status not in (RN_IDLE, RN_PAUSED, RN_BOOTING):
                #LOGGER.warning("rendernode %s was %d and is now IDLE." % (self.name, self.status))
                self.status = RN_IDLE
                if self.currentpoolshare:
                    self.currentpoolshare.allocatedRN -= 1
                    self.currentpoolshare = None
            return
        commandStatus = [command.status for command in self.commands.values()]
        if CMD_RUNNING in commandStatus:
            self.status = RN_WORKING
        elif CMD_ASSIGNED in commandStatus:
            self.status = RN_ASSIGNED
        elif CMD_ERROR in commandStatus:
            self.status = RN_FINISHING
        elif CMD_FINISHING in commandStatus:
            self.status = RN_FINISHING
        elif CMD_DONE in commandStatus:
            self.status = RN_FINISHING  # do not set the status to IDLE immediately, to ensure that the assignment order will be respected

        elif CMD_TIMEOUT in commandStatus:
            self.status = RN_FINISHING

        elif CMD_CANCELED in commandStatus:
            for cmd in self.commands.values():
                # this should not happen, but if it does, ensure the command is no longer registered to the RN
                if cmd.status is CMD_CANCELED:
                    self.clearAssignment(cmd)
        elif self.status not in (RN_IDLE, RN_BOOTING, RN_UNKNOWN, RN_PAUSED):
            LOGGER.error("Unable to compute new status for rendernode %r (status %r, commands %r)", self, self.status, self.commands)

    ## releases the finishing status of the rendernodes
    #
    def releaseFinishingStatus(self):
        if self.status is RN_FINISHING:
            # remove the commands that are in a final status
            for cmd in self.commands.values():
                if isFinalStatus(cmd.status):
                    self.unassign(cmd)
                    if CMD_DONE == cmd.status:
                        cmd.completion = 1.0
                    cmd.finish()
            self.status = RN_IDLE

    ##
    #
    # @warning The returned HTTPConnection is not safe to use from multiple threads
    #
    def getHTTPConnection(self):
        timeout = singletonconfig.get('COMMUNICATION', 'RENDERNODE_REQUEST_TIMEOUT', 5)
        return http.HTTPConnection(self.host, self.port, timeout=timeout)

    ## An exception class to report a render node http request failure.
    #
    class RequestFailed(Exception):
        pass

    ## Sends a HTTP request to the render node and returns a (HTTPResponse, data) tuple on success.
    #
    # This method tries to send the request at most RENDERNODE_REQUEST_MAX_RETRY_COUNT times,
    # waiting RENDERNODE_REQUEST_DELAY_AFTER_REQUEST_FAILURE seconds between each try. It
    # then raises a RenderNode.RequestFailed exception.
    #
    # @param method the HTTP method for this request
    # @param url the requested URL
    # @param headers a dictionary with string-keys and string-values (empty by default)
    # @param body the string body for this request (None by default)
    # @raise RenderNode.RequestFailed if the request fails.
    # @note it is a good idea to specify a Content-Length header when giving a non-empty body.
    # @see  the RENDERNODE_REQUEST_MAX_RETRY_COUNT and
    #       RENDERNODE_REQUEST_DELAY_AFTER_REQUEST_FAILURE params affect the execution of this method.
    #
    def request(self, method, url, body=None, headers={}):
        """
        """

        # from octopus.dispatcher import settings
        # LOGGER.debug("Send request to RN: http://%s:%s%s %s (%s)" % (self.host, self.port, url, method, headers))

        err = None
        conn = self.getHTTPConnection()

        # try to process the request at most RENDERNODE_REQUEST_MAX_RETRY_COUNT times.
        for i in xrange(singletonconfig.get('COMMUNICATION', 'RENDERNODE_REQUEST_MAX_RETRY_COUNT')):
            try:
                conn.request(method, url, body, headers)
                response = conn.getresponse()
                if response.length:
                    data = response.read(response.length)
                else:
                    data = None
                # request succeeded
                conn.close()
                return (response, data)
            except http.socket.error, e:
                err = e
                # LOGGER.debug("socket error %r" % e)
                try:
                    conn.close()
                except:
                    pass
                if e.errno in (errno.ECONNREFUSED, errno.ENETUNREACH):
                    raise self.RequestFailed(e)
            except http.HTTPException, e:
                err = e
                # LOGGER.debug("HTTPException %r" % e)
                try:
                    conn.close()
                except:
                    pass
                LOGGER.exception("rendernode.request failed")

            LOGGER.warning("request failed (%d/%d), reason: %s" % (i + 1, singletonconfig.get('COMMUNICATION', 'RENDERNODE_REQUEST_MAX_RETRY_COUNT'), err))
            # request failed so let's sleep for a while
            time.sleep(singletonconfig.get('COMMUNICATION', 'RENDERNODE_REQUEST_DELAY_AFTER_REQUEST_FAILURE'))
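
As the docstring describes, callers of RenderNode.request only need to catch RenderNode.RequestFailed; retries, delays and connection cleanup are handled inside the method. A hypothetical call (the URL is illustrative) would look like:

# Hedged usage sketch of RenderNode.request.
try:
    response, data = renderNode.request("GET", "/status", headers={})
except RenderNode.RequestFailed:
    LOGGER.warning("rendernode %s unreachable, will retry on the next cycle" % renderNode.name)
else:
    LOGGER.debug("rendernode %s answered %d: %r" % (renderNode.name, response.status, data))
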
Ejemplo n.º 35
0
    def mainLoop(self):
        '''
        | Dispatcher main loop iteration.
        | Periodically called with tornado's internal callback mechanism; the frequency is defined by config: CORE.MASTER_UPDATE_INTERVAL
        | During this process, the dispatcher will:
        |   - update completion and status for all jobs in dispatchTree
        |   - update status of renderNodes
        |   - validate inter tasks dependencies
        |   - update the DB with recorded changes in the model
        |   - compute new assignments and send them to the proper rendernodes
        |   - release all finished jobs/rns
        '''
        log = logging.getLogger('main')
        loopStartTime = time.time()
        prevTimer = loopStartTime

        if singletonconfig.get('CORE', 'GET_STATS'):
            singletonstats.theStats.cycleDate = loopStartTime

        log.info("-----------------------------------------------------")
        log.info(" Start dispatcher process cycle (old version).")

        try:
            self.threadPool.poll()
        except NoResultsPending:
            pass
        else:
            log.info("finished some network requests")
            pass

        self.cycle += 1

        # Update of allocation is done when parsing the tree for completion and status update (done partially for invalidated node only i.e. when needed)
        self.dispatchTree.updateCompletionAndStatus()
        if singletonconfig.get('CORE', 'GET_STATS'):
            singletonstats.theStats.cycleTimers['update_tree'] = time.time() - prevTimer
        log.info("%8.2f ms --> update completion status" % ((time.time() - prevTimer) * 1000))
        prevTimer = time.time()

        # Update render nodes
        self.updateRenderNodes()
        if singletonconfig.get('CORE', 'GET_STATS'):
            singletonstats.theStats.cycleTimers['update_rn'] = time.time() - prevTimer
        log.info("%8.2f ms --> update render node" % ((time.time() - prevTimer) * 1000))
        prevTimer = time.time()

        # Validate dependencies
        self.dispatchTree.validateDependencies()
        if singletonconfig.get('CORE', 'GET_STATS'):
            singletonstats.theStats.cycleTimers['update_dependencies'] = time.time() - prevTimer
        log.info("%8.2f ms --> validate dependencies" % ((time.time() - prevTimer) * 1000))
        prevTimer = time.time()

        # update db
        self.updateDB()
        if singletonconfig.get('CORE', 'GET_STATS'):
            singletonstats.theStats.cycleTimers['update_db'] = time.time() - prevTimer
        log.info("%8.2f ms --> update DB" % ((time.time() - prevTimer) * 1000))
        prevTimer = time.time()

        # compute and send command assignments to rendernodes
        assignments = self.computeAssignments()
        if singletonconfig.get('CORE', 'GET_STATS'):
            singletonstats.theStats.cycleTimers['compute_assignment'] = time.time() - prevTimer
        log.info("%8.2f ms --> compute assignments." % ((time.time() - prevTimer) * 1000))
        prevTimer = time.time()

        self.sendAssignments(assignments)
        if singletonconfig.get('CORE', 'GET_STATS'):
            singletonstats.theStats.cycleTimers['send_assignment'] = time.time() - prevTimer
            singletonstats.theStats.cycleCounts['num_assignments'] = len(assignments)
        log.info("%8.2f ms --> send %r assignments." % ((time.time() - prevTimer) * 1000, len(assignments)))
        prevTimer = time.time()

        # call the release finishing status on all rendernodes
        for renderNode in self.dispatchTree.renderNodes.values():
            renderNode.releaseFinishingStatus()
        if singletonconfig.get('CORE', 'GET_STATS'):
            singletonstats.theStats.cycleTimers['release_finishing'] = time.time() - prevTimer
        log.info("%8.2f ms --> releaseFinishingStatus" % ((time.time() - prevTimer) * 1000))
        prevTimer = time.time()

        loopDuration = (time.time() - loopStartTime)*1000
        log.info("%8.2f ms --> cycle ended. " % loopDuration)

        #
        # Send stat data to disk
        #
        if singletonconfig.get('CORE', 'GET_STATS'):
            singletonstats.theStats.cycleTimers['time_elapsed'] = time.time() - loopStartTime
            singletonstats.theStats.aggregate()
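
The docstring above says this loop is driven by tornado's callback mechanism at CORE.MASTER_UPDATE_INTERVAL. A minimal sketch of that wiring, assuming the interval is expressed in milliseconds and that 'dispatcher' is the object owning mainLoop, could be:

# Hedged sketch: schedule mainLoop periodically with tornado's IOLoop.
from tornado import ioloop

interval = singletonconfig.get('CORE', 'MASTER_UPDATE_INTERVAL')   # assumed to be in ms
periodic = ioloop.PeriodicCallback(dispatcher.mainLoop, interval)
periodic.start()
ioloop.IOLoop.instance().start()
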
Ejemplo n.º 36
0
    if options.verbose:
        print "Command options: %s" % options
        print "Command arguments: %s" % args
        print "Query: %s" + _request

    # fileHandler = logging.handlers.RotatingFileHandler( _logPath,
    #                                                     maxBytes=20000000,
    #                                                     backupCount=1,
    #                                                     encoding="UTF-8")

    fileHandler = logging.FileHandler(_logPath, encoding="UTF-8")
    fileHandler.setFormatter(logging.Formatter("%(message)s"))

    statsLogger = logging.getLogger("stats")
    statsLogger.addHandler(fileHandler)
    statsLogger.setLevel(singletonconfig.get("CORE", "LOG_LEVEL"))

    http_client = HTTPClient()
    try:
        response = http_client.fetch(_request)

        if response.error:
            print "Error:   %s" % response.error
            print "         %s" % response.body
        else:
            if response.body == "":
                print "Error: No stats retrieved"
            else:
                data = json.loads(response.body)

                aggregatedData = formatData(data)
Ejemplo n.º 37
0
try:
    import simplejson as json
except ImportError:
    import json

from octopus.core import singletonconfig
from octopus.dispatcher import settings

# 
# Load specific logger for collecting stats. 
#

# Custom level to avoid flooding the main loggers
# We use a logger and handler with a low level to ensure they always receive messages,
# even if the log level is changed via the config file and reloaded.
hd = logging.handlers.RotatingFileHandler( os.path.join(settings.LOGDIR, "stats.log"), maxBytes=singletonconfig.get('CORE','STATS_SIZE'), backupCount=0 )
hd.setFormatter( logging.Formatter('%(message)s') )
hd.setLevel( 1 )

statsLog = logging.getLogger('server_stats')
statsLog.addHandler( hd )
statsLog.setLevel( 1 )


class DispatcherStats():
    """
    | Class holding custom infos on the dispatcher.
    | This data can be periodically flushed in a specific log file for later use
    """

    cycleDate = 0.0
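
Several examples above fill theStats.cycleTimers, theStats.cycleCounts and theStats.assignmentTimers and finally call aggregate(). A hedged sketch of what such a flush step could look like, writing one JSON line per cycle through the low-level 'server_stats' logger defined above (the exact record layout is an assumption, not the project's own implementation):

def flushStats(stats):
    # Hedged sketch, not the original aggregate(): dump the collected counters
    # as a single JSON line via the dedicated stats logger.
    record = {
        "date": stats.cycleDate,
        "cycleTimers": stats.cycleTimers,
        "cycleCounts": stats.cycleCounts,
        "assignmentTimers": stats.assignmentTimers,
    }
    statsLog.log(1, json.dumps(record))
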
Ejemplo n.º 38
0
    def computeAssignments(self):
        '''Computes and returns a list of (rendernode, command) assignments.'''

        log = logging.getLogger('main')

        from .model.node import NoRenderNodeAvailable, NoLicenseAvailableForTask
        # if no rendernodes available, return
        if not any(rn.isAvailable() for rn in self.dispatchTree.renderNodes.values()):
            return []

        assignments = []

        # first create a set of entrypoints that are not done nor cancelled nor blocked nor paused and that have at least one command ready
        # FIXME: hack to avoid getting the 'graphs' poolShare node in entryPoints, need to avoid it more nicely...
        entryPoints = set([poolShare.node for poolShare in self.dispatchTree.poolShares.values() if poolShare.node.status not in [NODE_BLOCKED, NODE_DONE, NODE_CANCELED, NODE_PAUSED] and poolShare.node.readyCommandCount > 0 and poolShare.node.name != 'graphs'])

        # don't proceed to the calculation if no rns are available in the requested pools
        rnsBool = False
        for pool, nodesiterator in groupby(entryPoints, lambda x: x.poolShares.values()[0].pool):
            rnsAvailables = set([rn for rn in pool.renderNodes if rn.status not in [RN_UNKNOWN, RN_PAUSED, RN_WORKING]])
            if len(rnsAvailables):
                rnsBool = True

        if not rnsBool:
            return []


        # Log time updating max rn
        prevTimer = time.time()

        # sort by pool for the groupby
        entryPoints = sorted(entryPoints, key=lambda node: node.poolShares.values()[0].pool)

        # update the value of the maxrn for the poolshares (parallel dispatching)
        for pool, nodesiterator in groupby(entryPoints, lambda x: x.poolShares.values()[0].pool):

            # we are treating every active node of the pool
            nodesList = [node for node in nodesiterator]

            # the new maxRN value is calculated based on the number of active jobs of the pool, and the number of online rendernodes of the pool
            rnsNotOffline = set([rn for rn in pool.renderNodes if rn.status not in [RN_UNKNOWN, RN_PAUSED]])
            rnsSize = len(rnsNotOffline)
            # log.debug("@   - nb rns awake:%r" % (rnsSize) )

            # if we have a user-defined maxRN for some nodes, remove them from the list and subtract their maxRN from the pool's size
            l = nodesList[:]  # duplicate the list to be safe when removing elements
            for node in l:
                # log.debug("@   - checking userDefMaxRN: %s -> %r maxRN=%d" % (node.name, node.poolShares.values()[0].userDefinedMaxRN, node.poolShares.values()[0].maxRN ) )
                if node.poolShares.values()[0].userDefinedMaxRN and node.poolShares.values()[0].maxRN not in [-1, 0]:
                    # log.debug("@     removing: %s -> maxRN=%d" % (node.name, node.poolShares.values()[0].maxRN ) )
                    nodesList.remove(node)
                    rnsSize -= node.poolShares.values()[0].maxRN

            # log.debug("@   - nb rns awake after maxRN:%d" % (rnsSize) )

            if len(nodesList) == 0:
                continue

            # Prepare updatedMaxRN with dispatch key proportions
            dkList = []                 # list of dks (integer only)
            dkPositiveList = []         # Normalized list of dks (the minimum dk becomes 1, higher elements of dkList get a proportional value)
            nbJobs = len(nodesList)     # number of jobs in the current pool
            nbRNAssigned = 0            # number of render nodes assigned for this pool

            for node in nodesList:
                dkList.append(node.dispatchKey)

            dkMin = min(dkList)
            dkPositiveList = map(lambda x: x-dkMin+1, dkList)
            dkSum = sum(dkPositiveList)

            # sort by id (fifo)
            nodesList = sorted(nodesList, key=lambda x: x.id)

            # then sort by dispatchKey (priority)
            nodesList = sorted(nodesList, key=lambda x: x.dispatchKey, reverse=True)

            for dk, nodeIterator in groupby(nodesList, lambda x: x.dispatchKey):

                nodes = [node for node in nodeIterator]
                dkPos = dkPositiveList[ dkList.index(dk) ]

                if dkSum > 0:
                    updatedmaxRN = int( round( rnsSize * (dkPos / float(dkSum) )))
                else:
                    updatedmaxRN = int(round( rnsSize / float(nbJobs) ))

                for node in nodes:
                    node.poolShares.values()[0].maxRN = updatedmaxRN
                    nbRNAssigned += updatedmaxRN

            # Add remaining RNs to most important jobs
            unassignedRN = rnsSize - nbRNAssigned
            while unassignedRN > 0:
                for node in nodesList:
                    if unassignedRN > 0:
                        node.poolShares.values()[0].maxRN += 1
                        unassignedRN -= 1
                    else:
                        break

        if singletonconfig.get('CORE','GET_STATS'):
            singletonstats.theStats.assignmentTimers['update_max_rn'] = time.time() - prevTimer
        log.info( "%8.2f ms --> .... updating max RN values", (time.time() - prevTimer)*1000 )

        # now we are treating every node
        # sort by id (fifo)
        entryPoints = sorted(entryPoints, key=lambda node: node.id)
        # then sort by dispatchKey (priority)
        entryPoints = sorted(entryPoints, key=lambda node: node.dispatchKey, reverse=True)

        # Put nodes with a userDefinedMaxRN first
        userDefEntryPoints = ifilter( lambda node: node.poolShares.values()[0].userDefinedMaxRN, entryPoints )
        standardEntryPoints = ifilter( lambda node: not node.poolShares.values()[0].userDefinedMaxRN, entryPoints )
        scoredEntryPoints = chain( userDefEntryPoints, standardEntryPoints)

        # Log time dispatching RNs
        prevTimer = time.time()

        # Iterate over each entryPoint to get an assignment
        for entryPoint in scoredEntryPoints:
            if any([poolShare.hasRenderNodesAvailable() for poolShare in entryPoint.poolShares.values()]):
                try:

                    for (rn, com) in entryPoint.dispatchIterator(lambda: self.queue.qsize() > 0):
                        assignments.append((rn, com))
                        # increment the allocatedRN for the poolshare
                        poolShare.allocatedRN += 1
                        # save the active poolshare of the rendernode
                        rn.currentpoolshare = poolShare

                except NoRenderNodeAvailable:
                    pass
                except NoLicenseAvailableForTask:
                    log.info("Missing license for node \"%s\" (other commands can start anyway)." % entryPoint.name)
                    pass

        assignmentDict = collections.defaultdict(list)
        for (rn, com) in assignments:
            assignmentDict[rn].append(com)

        if singletonconfig.get('CORE','GET_STATS'):
            singletonstats.theStats.assignmentTimers['dispatch_command'] = time.time() - prevTimer
        log.info( "%8.2f ms --> .... dispatching commands", (time.time() - prevTimer)*1000  )

        #
        # Check replacements
        #
        # - do a pass over the jobs that did not get their share of the pie
        #     - identify the killable jobs in their pool
        #     - for each resource, if it matches: drop the running job AND disable its killable attribute


        #
        # Backfill
        #
        # TODO: do another pass over the jobs that have a "killable" attribute and at least one additional pool

        return assignmentDict.items()
Ejemplo n.º 39
0
    def post(self, computerName):
        """
        A worker sends a request to get registered on the server.
        """
        if singletonconfig.get('CORE', 'GET_STATS'):
            singletonstats.theStats.cycleCounts['add_rns'] += 1

        computerName = computerName.lower()
        if computerName.startswith(('1', '2')):
            return Http403(message="Cannot register a RenderNode without a name", content="Cannot register a RenderNode without a name")

        dct = self.getBodyAsJSON()

        if computerName in self.getDispatchTree().renderNodes:
            # When the registering worker is already listed in RN list
            logger.warning("RenderNode already registered: %s" % computerName)
            existingRN = self.getDispatchTree().renderNodes[computerName]

            if 'commands' not in dct:
                # No commands on the registering RN, reset any commands that might still be assigned to this RN
                existingRN.reset()
            else:
                logger.warning("Reset commands that are assigned to this RN: %r" % dct.get('commands', '-'))
                for cmdId in dct['commands']:
                    existingRN.commands[cmdId] = self.getDispatchTree().commands[cmdId]

            if 'status' in dct:
                existingRN.status = int(dct['status'])

            return HttpResponse(304, "RenderNode already registered.")

        else:
            # Add a new worker (and set infos given in request body)
            for key in ('name', 'port', 'status', 'cores', 'speed', 'ram', 'pools', 'caracteristics'):
                if not key in dct:
                    return Http400("Missing key %r" % key, content="Missing key %r" % key)
            port = int(dct['port'])
            status = int(dct['status'])
            if status not in (RN_UNKNOWN, RN_PAUSED, RN_IDLE, RN_BOOTING):
                # FIXME: CONFLICT is not a good value maybe
                return HttpConflict("Unallowed status for RenderNode registration")
            cores = int(dct['cores'])
            speed = float(dct['speed'])
            ram = int(dct['ram'])
            pools = dct['pools']
            caracteristics = dct['caracteristics']
            name, port = computerName.split(":", 1)

            puliversion = dct.get('puliversion', "unknown")
            createDate = dct.get('createDate', time.time())

            renderNode = RenderNode(None, computerName, cores, speed, name, port, ram, caracteristics, puliversion=puliversion, createDate=createDate)

            renderNode.status = status
            poolList = []
            # check the existence of the pools
            for poolName in pools:
                try:
                    pool = self.getDispatchTree().pools[poolName]
                    poolList.append(pool)
                except KeyError:
                    return HttpConflict("Pool %s is not a registered pool", poolName)
            # add the rendernode to the pools
            for pool in poolList:
                pool.addRenderNode(renderNode)
            # add the rendernode to the list of rendernodes
            renderNode.pools = poolList
            self.getDispatchTree().renderNodes[renderNode.name] = renderNode
            self.writeCallback(json.dumps(renderNode.to_json()))
Ejemplo n.º 40
0
    def canRun(self, command):
        # check if this rendernode has made too many errors in its last commands
        cpt = 0
        for i in self.history:
            if i == CMD_ERROR:
                cpt += 1
        if cpt == singletonconfig.get('CORE', 'RN_NB_ERRORS_TOLERANCE'):
            LOGGER.warning("RenderNode %s had only errors in its commands history, excluding..." % self.name)
            self.excluded = True
            return False
        if self.excluded:
            return False
        for (requirement, value) in command.task.requirements.items():
            if requirement.lower() == "softs":  # todo
                for soft in value:
                    if not soft in self.caracteristics['softs']:
                        return False
            else:
                if not requirement in self.caracteristics:
                    return False
                else:
                    caracteristic = self.caracteristics[requirement]
                    if type(caracteristic) != type(value) and not isinstance(value, list):
                        return False
                    if isinstance(value, list) and len(value) == 2:
                        a, b = value
                        if type(a) != type(b) or type(a) != type(caracteristic):
                            return False
                        try:
                            if not (a < caracteristic < b):
                                return False
                        except ValueError:
                            return False
                    else:
                        if isinstance(caracteristic, bool) and caracteristic != value:
                            return False
                        if isinstance(caracteristic, basestring) and caracteristic != value:
                            return False
                        if isinstance(caracteristic, int) and caracteristic < value:
                            return False

        if command.task.minNbCores:
            if self.freeCoresNumber < command.task.minNbCores:
                return False
        else:
            if self.freeCoresNumber != self.coresNumber:
                return False

        #
        # RAM requirement: we check task requirement with the amount of free RAM reported at last ping (systemFreeRam)
        #
        if command.task.ramUse != 0:
            if self.systemFreeRam < command.task.ramUse:
                LOGGER.info("Not enough ram on %s for command %d. %d needed, %d avail." % (self.name, command.id, int(command.task.ramUse), self.systemFreeRam))
                return False

        #
        # timer requirements: a timer is on the task and is the same for all commands
        #
        if command.task.timer is not None:
            # LOGGER.debug("Current command %r has a timer : %s" % (command.id, datetime.datetime.fromtimestamp(command.task.timer) ) )
            if time.time() < command.task.timer:
                LOGGER.info("Prevented execution of command %d because of timer present (%s)" % (command.id, datetime.datetime.fromtimestamp(command.task.timer)))
                return False

        return True
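
Putting those checks together, a hypothetical task requirement dict and the way canRun interprets each entry (names and values are purely illustrative) could look like this:

# Hypothetical requirements illustrating the matching rules implemented above.
requirements = {
    "softs": ["maya", "nuke"],    # every entry must appear in caracteristics['softs']
    "ram": [16000, 128000],       # two-element list: the rendernode's value must satisfy 16000 < value < 128000
    "gpu": True,                  # bool: must be strictly equal
    "os": "linux",                # string: must be strictly equal
    "cores": 8,                   # int: the rendernode's value must be >= the required value
}
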