Ejemplo n.º 1
0
    def __request(self, methodname, params):
        """Issue an XML-RPC request, guarded by a SIGALRM timeout, with retries.

        Each attempt arms ``signal.alarm(self.__timeOut)`` and delegates to
        ``self._ServerProxy__request``. At most 30 attempts are made, sleeping
        a fixed, randomly chosen 5-10 seconds between failed attempts.

        Parameters:
          methodname -- name of the remote method, passed through unchanged.
          params     -- parameters for the remote call, passed through unchanged.

        Returns the response of the first successful attempt, or None when
        every attempt failed.

        Raises:
          Exception("hodXRClientTimeout") -- an attempt failed and retries are
                                             disabled (self.__retryRequests).
          HodInterruptException           -- an attempt failed and hodInterrupt
                                             is set.
        """
        response = None
        # One wait time is picked per call and reused for every retry.
        retryWaitTime = 5 + random.randint(0, 5)
        for _attempt in range(0, 30):
            signal.alarm(self.__timeOut)
            try:
                try:
                    response = self._ServerProxy__request(methodname, params)
                finally:
                    # Disarm the alarm whether the request succeeded or failed.
                    # Otherwise it stays pending while we sleep below (or after
                    # we raise) and SIGALRM is delivered outside this try block.
                    signal.alarm(0)
                break
            except Exception:
                if not self.__retryRequests:
                    raise Exception("hodXRClientTimeout")
                if hodInterrupt.isSet():
                    raise HodInterruptException()
                time.sleep(retryWaitTime)

        return response
Ejemplo n.º 2
0
  def _op_allocate(self, args):
    """Run the "allocate" operation for a cluster directory.

    Parameters:
      args -- sequence of three items; args[1] is the cluster directory and
              args[2] is the node count as a string. Nothing in the visible
              code runs when len(args) != 3.

    The outcome is reported through self.__opCode rather than a return value:
      3  -- cluster directory could not be created / is not a directory, or
            the node count is below 3 or not an integer
      1  -- the user state file failed the read/write check, or recording the
            new cluster's state raised
      12 -- a previously allocated cluster exists at this directory and was
            not detected as deallocated
      4  -- account verification returned a non-zero status
      otherwise the value from self.__cluster.check_cluster() or
      self.__cluster.allocate().

    Raises:
      HodInterruptException -- hodInterrupt was set after the service registry
                               was set up, or allocate() raised it.
      Exception             -- wraps any error raised while recording state
                               after allocation (cleanup and deallocate are
                               attempted first).
    """
    operation = "allocate"
    argLength = len(args)
    # NOTE(review): these locals shadow the min/max builtins within this method.
    min = 0
    max = 0
    errorFlag = False
    errorMsgs = []

    if argLength == 3:
      nodes = args[2]
      clusterDir = self.__norm_cluster_dir(args[1])

      # Create the cluster directory if missing; collect errors instead of
      # bailing out immediately so that all problems are reported together.
      if not os.path.exists(clusterDir):
        try:
          os.makedirs(clusterDir)
        except OSError, err:
          errorFlag = True
          errorMsgs.append("Could not create cluster directory. %s" \
                            % (str(err)))
      elif not os.path.isdir(clusterDir):
        errorFlag = True
        errorMsgs.append( \
                    "Invalid cluster directory (--hod.clusterdir or -d) : " + \
                         clusterDir + " : Not a directory")
        
      # NOTE(review): int(nodes) raises ValueError for any non-integer string,
      # including the "min-max" form parsed further down, so that range branch
      # and the ValueError handler below look unreachable from here - confirm.
      if int(nodes) < 3 :
        errorFlag = True
        errorMsgs.append("Invalid nodecount (--hod.nodecount or -n) : " + \
                         "Must be >= 3. Given nodes: %s" % nodes)
      if errorFlag:
        for msg in errorMsgs:
          self.__log.critical(msg)
        self.__opCode = 3
        return

      # The user state file must be both readable and writable.
      if not self.__userState.checkStateFile(CLUSTER_DATA_FILE, \
                                              (os.R_OK, os.W_OK)):
        self.__log.critical(INVALID_STATE_FILE_MSGS[2] % \
                         self.__userState.get_state_file())
        self.__opCode = 1
        return

      clusterList = self.__userState.read(CLUSTER_DATA_FILE)
      if clusterDir in clusterList.keys():
        self.__setup_cluster_state(clusterDir)
        clusterInfo = self.__clusterState.read()
        # Check if the job is not running. Only then can we safely
        # allocate another cluster. Otherwise the user would need
        # to deallocate and free up resources himself.
        if clusterInfo.has_key('jobid') and \
            self.__cluster.is_cluster_deallocated(clusterInfo['jobid']):
          self.__log.warn("Found a dead cluster at cluster directory '%s'. Deallocating it to allocate a new one." % (clusterDir))
          self.__remove_cluster(clusterDir)
          self.__clusterState.clear()
        else:
          self.__log.critical("Found a previously allocated cluster at cluster directory '%s'. HOD cannot determine if this cluster can be automatically deallocated. Deallocate the cluster if it is unused." % (clusterDir))
          self.__opCode = 12
          return
 
      self.__setup_cluster_logger(clusterDir)

      # Verify the account before attempting any allocation.
      (status, message) = self.__cluster.is_valid_account()
      # NOTE(review): "is not 0" is an identity test, not a value comparison;
      # "!= 0" is presumably what is meant - confirm before changing.
      if status is not 0:
        if message:
          for line in message:
            self.__log.critical("verify-account output: %s" % line)
        self.__log.critical("Cluster cannot be allocated because account verification failed. " \
                              + "verify-account returned exit code: %s." % status)
        self.__opCode = 4
        return
      else:
        self.__log.debug("verify-account returned zero exit code.")
        if message:
          self.__log.debug("verify-account output: %s" % message)

      # Node count is either a "min-max" range or a single integer.
      # NOTE(review): in the visible code the range form only assigns min/max;
      # allocation itself happens solely in the single-integer branch below.
      if re.match('\d+-\d+', nodes):
        (min, max) = nodes.split("-")
        min = int(min)
        max = int(max)
      else:
        try:
          nodes = int(nodes)
          min = nodes
          max = nodes
        except ValueError:
          print self.__hodhelp.help(operation)
          self.__log.critical(
          "%s operation requires a pos_int value for n(nodecount)." % 
          operation)
          self.__opCode = 3
        else:
          self.__setup_cluster_state(clusterDir)
          clusterInfo = self.__clusterState.read()
          self.__opCode = self.__cluster.check_cluster(clusterInfo)
          # Allocation proceeds only for check_cluster codes 0 and 15.
          if self.__opCode == 0 or self.__opCode == 15:
            self.__setup_service_registry()   
            if hodInterrupt.isSet(): 
              self.__cleanup()
              raise HodInterruptException()
            self.__log.debug("Service Registry started.")

            self.__adjustMasterFailureCountConfig(nodes)
            
            try:
              allocateStatus = self.__cluster.allocate(clusterDir, min, max)    
            except HodInterruptException, h:
              self.__cleanup()
              raise h
            # Allocation has gone through.
            # Don't care about interrupts any more

            try:
              # Persist cluster and user state only for a successful allocation;
              # the allocate status becomes the op code either way.
              if allocateStatus == 0:
                self.__set_cluster_state_info(os.environ, 
                                              self.__cluster.hdfsInfo, 
                                              self.__cluster.mapredInfo, 
                                              self.__cluster.ringmasterXRS,
                                              self.__cluster.jobId,
                                              min, max)
                self.__setup_cluster_state(clusterDir)
                self.__clusterState.write(self.__cluster.jobId, 
                                          self.__clusterStateInfo)
                #  Do we need to check for interrupts here ??
  
                self.__set_user_state_info( 
                  { clusterDir : self.__cluster.jobId, } )
              self.__opCode = allocateStatus
            except Exception, e:
              # Some unknown problem.
              self.__cleanup()
              self.__cluster.deallocate(clusterDir, self.__clusterStateInfo)
              self.__opCode = 1
              raise Exception(e)
          elif self.__opCode == 12:
            self.__log.critical("Cluster %s already allocated." % clusterDir)
Ejemplo n.º 3
0
      errorMsgs.append("Invalid nodecount (--hod.nodecount or -n) : " + \
                       "Must be >= 3. Given nodes: %s" % nodes)

    if errorFlag:
      for msg in errorMsgs:
        self.__log.critical(msg)
      self.handle_script_exit_code(scriptRet, clusterDir)
      sys.exit(3)

    try:
      self._op_allocate(('allocate', clusterDir, str(nodes)))
      if self.__opCode == 0:
        if self.__cfg['hod'].has_key('script-wait-time'):
          time.sleep(self.__cfg['hod']['script-wait-time'])
          self.__log.debug('Slept for %d time. Now going to run the script' % self.__cfg['hod']['script-wait-time'])
        if hodInterrupt.isSet():
          self.__log.debug('Hod interrupted - not executing script')
        else:
          scriptRunner = hadoopScript(clusterDir, 
                                  self.__cfg['hod']['original-dir'])
          self.__opCode = scriptRunner.run(script)
          scriptRet = self.__opCode
          self.__log.info("Exit code from running the script: %d" % self.__opCode)
      else:
        self.__log.critical("Error %d in allocating the cluster. Cannot run the script." % self.__opCode)

      if hodInterrupt.isSet():
        # Got interrupt while executing script. Unsetting it for deallocating
        hodInterrupt.setFlag(False)
      if self._is_cluster_allocated(clusterDir):
        self._op_deallocate(('deallocate', clusterDir))
Ejemplo n.º 4
0
    def _op_allocate(self, args):
        """Run the "allocate" operation for a cluster directory.

        Parameters:
          args -- sequence of three items; args[1] is the cluster directory
                  and args[2] is the node count as a string. Nothing in the
                  visible code runs when len(args) != 3.

        The outcome is reported through self.__opCode rather than a return
        value:
          3  -- cluster directory could not be created / is not a directory,
                or the node count is below 3 or not an integer
          1  -- the user state file failed the read/write check, or recording
                the new cluster's state raised
          12 -- a previously allocated cluster exists at this directory and
                was not detected as deallocated
          otherwise the value from self.__cluster.check_cluster() or
          self.__cluster.allocate().

        Raises:
          HodInterruptException -- hodInterrupt was set after the service
                                   registry was set up, or allocate() raised it.
          Exception             -- wraps any error raised while recording
                                   state after allocation (cleanup and
                                   deallocate are attempted first).
        """
        operation = "allocate"
        argLength = len(args)
        # NOTE(review): these locals shadow the min/max builtins within this
        # method.
        min = 0
        max = 0
        errorFlag = False
        errorMsgs = []

        if argLength == 3:
            nodes = args[2]
            clusterDir = self.__norm_cluster_dir(args[1])

            # Create the cluster directory if missing; collect errors instead
            # of bailing out immediately so all problems are reported together.
            if not os.path.exists(clusterDir):
                try:
                    os.makedirs(clusterDir)
                except OSError, err:
                    errorFlag = True
                    errorMsgs.append("Could not create cluster directory. %s" \
                                      % (str(err)))
            elif not os.path.isdir(clusterDir):
                errorFlag = True
                errorMsgs.append( \
                            "Invalid cluster directory (--hod.clusterdir or -d) : " + \
                                 clusterDir + " : Not a directory")

            # NOTE(review): int(nodes) raises ValueError for any non-integer
            # string, including the "min-max" form parsed further down, so
            # that range branch and the ValueError handler below look
            # unreachable from here - confirm.
            if int(nodes) < 3:
                errorFlag = True
                errorMsgs.append("Invalid nodecount (--hod.nodecount or -n) : " + \
                                 "Must be >= 3. Given nodes: %s" % nodes)
            if errorFlag:
                for msg in errorMsgs:
                    self.__log.critical(msg)
                self.__opCode = 3
                return

            # The user state file must be both readable and writable.
            if not self.__userState.checkStateFile(CLUSTER_DATA_FILE, \
                                                    (os.R_OK, os.W_OK)):
                self.__log.critical(INVALID_STATE_FILE_MSGS[2] % \
                                 self.__userState.get_state_file())
                self.__opCode = 1
                return

            clusterList = self.__userState.read(CLUSTER_DATA_FILE)
            if clusterDir in clusterList.keys():
                self.__setup_cluster_state(clusterDir)
                clusterInfo = self.__clusterState.read()
                # Check if the job is not running. Only then can we safely
                # allocate another cluster. Otherwise the user would need
                # to deallocate and free up resources himself.
                if clusterInfo.has_key('jobid') and \
                    self.__cluster.is_cluster_deallocated(clusterInfo['jobid']):
                    self.__log.warn(
                        "Found a dead cluster at cluster directory '%s'. Deallocating it to allocate a new one."
                        % (clusterDir))
                    self.__remove_cluster(clusterDir)
                    self.__clusterState.clear()
                else:
                    self.__log.critical(
                        "Found a previously allocated cluster at cluster directory '%s'. HOD cannot determine if this cluster can be automatically deallocated. Deallocate the cluster if it is unused."
                        % (clusterDir))
                    self.__opCode = 12
                    return

            self.__setup_cluster_logger(clusterDir)
            # Node count is either a "min-max" range or a single integer.
            # NOTE(review): in the visible code the range form only assigns
            # min/max; allocation itself happens solely in the single-integer
            # branch below.
            if re.match('\d+-\d+', nodes):
                (min, max) = nodes.split("-")
                min = int(min)
                max = int(max)
            else:
                try:
                    nodes = int(nodes)
                    min = nodes
                    max = nodes
                except ValueError:
                    print self.__hodhelp.help(operation)
                    self.__log.critical(
                        "%s operation requires a pos_int value for n(nodecount)."
                        % operation)
                    self.__opCode = 3
                else:
                    self.__setup_cluster_state(clusterDir)
                    clusterInfo = self.__clusterState.read()
                    self.__opCode = self.__cluster.check_cluster(clusterInfo)
                    # Allocation proceeds only for check_cluster codes 0 and 15.
                    if self.__opCode == 0 or self.__opCode == 15:
                        self.__setup_service_registry()
                        if hodInterrupt.isSet():
                            self.__cleanup()
                            raise HodInterruptException()
                        self.__log.debug("Service Registry started.")

                        self.__adjustMasterFailureCountConfig(nodes)

                        try:
                            allocateStatus = self.__cluster.allocate(
                                clusterDir, min, max)
                        except HodInterruptException, h:
                            self.__cleanup()
                            raise h
                        # Allocation has gone through.
                        # Don't care about interrupts any more

                        try:
                            # Persist cluster and user state only for a
                            # successful allocation; the allocate status
                            # becomes the op code either way.
                            if allocateStatus == 0:
                                self.__set_cluster_state_info(
                                    os.environ, self.__cluster.hdfsInfo,
                                    self.__cluster.mapredInfo,
                                    self.__cluster.ringmasterXRS,
                                    self.__cluster.jobId, min, max)
                                self.__setup_cluster_state(clusterDir)
                                self.__clusterState.write(
                                    self.__cluster.jobId,
                                    self.__clusterStateInfo)
                                #  Do we need to check for interrupts here ??

                                self.__set_user_state_info({
                                    clusterDir:
                                    self.__cluster.jobId,
                                })
                            self.__opCode = allocateStatus
                        except Exception, e:
                            # Some unknown problem.
                            self.__cleanup()
                            self.__cluster.deallocate(clusterDir,
                                                      self.__clusterStateInfo)
                            self.__opCode = 1
                            raise Exception(e)
                    elif self.__opCode == 12:
                        self.__log.critical("Cluster %s already allocated." %
                                            clusterDir)
Ejemplo n.º 5
0
        if errorFlag:
            for msg in errorMsgs:
                self.__log.critical(msg)
            self.handle_script_exit_code(scriptRet, clusterDir)
            sys.exit(3)

        try:
            self._op_allocate(('allocate', clusterDir, str(nodes)))
            if self.__opCode == 0:
                if self.__cfg['hod'].has_key('script-wait-time'):
                    time.sleep(self.__cfg['hod']['script-wait-time'])
                    self.__log.debug(
                        'Slept for %d time. Now going to run the script' %
                        self.__cfg['hod']['script-wait-time'])
                if hodInterrupt.isSet():
                    self.__log.debug('Hod interrupted - not executing script')
                else:
                    scriptRunner = hadoopScript(
                        clusterDir, self.__cfg['hod']['original-dir'])
                    self.__opCode = scriptRunner.run(script)
                    scriptRet = self.__opCode
                    self.__log.info("Exit code from running the script: %d" %
                                    self.__opCode)
            else:
                self.__log.critical(
                    "Error %d in allocating the cluster. Cannot run the script."
                    % self.__opCode)

            if hodInterrupt.isSet():
                # Got interrupt while executing script. Unsetting it for deallocating