Example #1
 def validate_apps(self, local_dir_name="small_rw_jobs"):  # pylint: disable=unused-argument
     '''
     Validate small apps passed
     :param local_dir_name:
     :return:
     '''
     local_dir = os.path.join(Config.getEnv('ARTIFACTS_DIR'),
                              self.local_dir_name)
     appIds = []
     for root, _dirs, filenames in os.walk(local_dir):
         for f in filenames:
             logfile = open(os.path.join(root, f), 'r')
             stdout = logfile.read()
             appId = YARN.getApplicationIDFromStdout(stdout,
                                                     logoutput=False)
             appIds.append(appId)
     # Sleep for 30 seconds before checking App status
     time.sleep(30)
     status, d = YARN.checkAppsSucceeded(appIds,
                                         logPrefix=None,
                                         useWS=True,
                                         localDir=None)
     for app, status in d.items():
         if status != "SUCCEEDED":
             appInfo = YARN.getApplicationInfo(app)
             logger.info(appInfo)
             if appInfo:
                 assert appInfo[
                     'state'] == 'ACCEPTED', "app is neither in ACCEPTED nor SUCCEEDED state"
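
A framework-free sketch of the same log-scanning step: walk a directory of captured job output and pull out YARN application IDs with a regular expression. This illustrates the pattern, not the beaver YARN.getApplicationIDFromStdout API; the directory path in the usage comment is a placeholder.

import os
import re

APP_ID_RE = re.compile(r"application_\d+_\d+")

def collect_app_ids(local_dir):
    """Return every YARN application ID found in the log files under local_dir."""
    app_ids = []
    for root, _dirs, filenames in os.walk(local_dir):
        for name in filenames:
            with open(os.path.join(root, name), "r") as logfile:
                app_ids.extend(APP_ID_RE.findall(logfile.read()))
    return app_ids

# Hypothetical usage: app_ids = collect_app_ids("/tmp/artifacts/small_rw_jobs")
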
Example #2
 def _postStopAction(cls, service):
     if service == 'hiveserver2':
         logger.info("Hard kill Tez sessions")
         yarn_user = YARN.getYarnUser()
         apps = YARN.getApplicationIDList(state='NEW,NEW_SAVING,SUBMITTED,ACCEPTED,RUNNING')
         if len(apps) > 0:
             for app in apps:
                 YARN.killApplicationAs(app, user=yarn_user)
                 time.sleep(5)
         logger.info("Hard kill the HS2 application if still running")
         admin_user = Machine.getAdminUser()
         hosts = cls.getServiceHosts(service)
         port = cls.getHiveserver2ThriftPort()
         for host in hosts:
             pid = Machine.getPIDByPort(port, host=host, user=admin_user)
             if pid:
                 logger.info("Found process for '%s' with PID %d" % (service, pid))
                 Machine.killProcessRemote(pid, host=host, user=admin_user)
                 time.sleep(2)
         logger.info("Hard Kill proc_llap daemon due to BUG-62657")
         allnodes = util.getAllNodes() if Machine.isHumboldt() else Hadoop.getAllNodes()
         for node in allnodes:
             proc_llap_pids = Machine.getProcessListRemote(
                 node, format="%U %p %P %a", filter="proc_llap", logoutput=True
             )
             if len(proc_llap_pids) != 0:
                 proc_llap_pid = Machine.getPidFromString(proc_llap_pids[0], yarn_user)
                 if proc_llap_pid:
                     logger.info("Found proc_llap process with PID %d on %s" % (proc_llap_pid, node))
                     Machine.killProcessRemote(proc_llap_pid, host=node, user=admin_user)
                     time.sleep(2)
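
For reference, the same hard-kill of YARN applications can be driven directly through the yarn CLI. This is a hedged sketch rather than the beaver YARN/Machine API; it assumes the yarn client is on PATH and that the caller is allowed to kill the listed applications.

import subprocess
import time

def kill_yarn_apps(states="NEW,NEW_SAVING,SUBMITTED,ACCEPTED,RUNNING", pause=5):
    """List applications in the given states and kill them one by one via the yarn CLI."""
    out = subprocess.check_output(
        ["yarn", "application", "-list", "-appStates", states])
    app_ids = [line.split()[0] for line in out.decode().splitlines()
               if line.startswith("application_")]
    for app_id in app_ids:
        subprocess.call(["yarn", "application", "-kill", app_id])
        time.sleep(pause)
    return app_ids
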
Example #3
def verifyOozieAppsAndJobsSucceeded(workflowIds,
                                    logPrefix,
                                    localDir,
                                    testMap,
                                    action_name='wc',
                                    checkJob=True):
    '''
  Verifies that all apps and jobs submitted/created via Oozie have passed all the validations.
  :param workflowIds: List of workflow ids to verify.
  :param logPrefix: log prefix for YARN app logs.
  :param localDir: Path to local log dir.
  :return: Bool status indicating if validation succeeded.
  '''
    appIds = []
    jobIds = []
    dLog = {}
    appStatus = True
    jobStatus = True
    wprStatus = True

    # check the job and app status for each workflow we launched.
    if Hadoop.isHadoop2():
        # get all the app and job ids
        for workflowId in workflowIds:
            if action_name != 'None':
                stdout = Oozie.getJobInfo('%s@%s' % (workflowId, action_name),
                                          verbose=True,
                                          retry=True)
            else:
                stdout = Oozie.getJobInfo('%s' % (workflowId),
                                          verbose=True,
                                          retry=True)
            ids = Oozie.getJobAndAppIds(stdout)
            for id in ids:
                appIds.append(id['application'])
                jobIds.append(id['job'])
        # get the app and job status for all the jobs we found
        appStatus, appLog = YARN.checkAppsSucceeded(appIds,
                                                    logPrefix=logPrefix,
                                                    localDir=localDir)
        dLog.update(appLog)
        if checkJob:
            jobStatus, jobLog = YARN.checkJobsSucceeded(jobIds)
            dLog.update(jobLog)
        for key, value in dLog.items():
            logger.info("%s -> %s" % (key, value))

        wprStatus, d = verifyWorkPreservingRMRestart(jobIds, testMap)
        for k, v in d.items():
            logger.info("%s -> %s" % (k, v))

    logger.info("appStatus: %s jobStatus: %s wprStatus: %s" %
                (appStatus, jobStatus, wprStatus))
    return appStatus and jobStatus and wprStatus
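
The Oozie job-info parsing above ultimately reduces to extracting application_*/job_* IDs from console text. A standalone sketch of that extraction; the regexes reflect the standard Hadoop ID format and are not the Oozie.getJobAndAppIds implementation itself.

import re

def extract_job_and_app_ids(stdout):
    """Return (application ids, job ids) found in Oozie job-info console output."""
    app_ids = re.findall(r"application_\d+_\d+", stdout)
    job_ids = re.findall(r"job_\d+_\d+", stdout)
    return app_ids, job_ids
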
Example #4
def resetYarn(skip_check=False):
    # updates for Hadoop 2
    if YARN.isHAEnabled():
        logger.info("Resetting YARN...")
        # only do this on nano as these services are unreliable on nano
        tasktrackers = MAPRED.getTasktrackers()
        if skip_check or (Hadoop.isHadoop2() and Machine.isLinux()
                          and Machine.isNano()):
            YARN.restartHARMNodes()
            # add sleep to give RM enough time to register all the nodes
            # and be ready
            MAPRED.waitForNMToRegister(len(tasktrackers))

        logger.info("Resetting YARN Completed.")
Example #5
def verifyAppsAndJobsSucceeded(appLogSearchPrefix,
                               stdout,
                               localDir,
                               testMap=None,
                               user=None):
    '''
  :param appLogSearchPrefix: The prefix using which the app logs are going to be searched.
  :param stdout: stdout from the app.
  :param localDir: Path to current dir.
  :param testMap: map containing the service(s) names and the kwargs of the services being restarted in the test.
  :return: success status and a dict with the relevant info.
  '''
    d = {}
    status = True

    if appLogSearchPrefix is None or localDir is None:
        status = False

    # Check if all the Jobs and the apps succeeded.
    if Hadoop.isHadoop2():
        dLog = jobLog = wprdLog = {}
        appStatus = jobStatus = True
        appIds, jobIds = YARN.getAppAndJobIdsFromConsole(stdout)
        appStatus, dLog = YARN.checkAppsSucceeded(appIds,
                                                  logPrefix=appLogSearchPrefix,
                                                  localDir=localDir)
        jobStatus, jobLog = YARN.checkJobsSucceeded(jobIds, user)
        d.update(dLog)
        d.update(jobLog)

        jobIds = cleanseJobIds(jobIds)

        # Performing WPR validations.
        wprStatus, wprdLog = verifyWorkPreservingRMRestart(jobIds, testMap)
        d.update(wprdLog)

        # Check if all the validations succeeded.
        if appStatus is False or jobStatus is False or wprStatus is False:
            d[appLogSearchPrefix] = "appStatus: %s jobStatus: %s wprStatus: %s" % (
                appStatus, jobStatus, wprStatus)
            status = False

    # Prepend the method names to all the keys in the dict.
    tempd = {}
    for k, v in d.items():
        tempd["%s: %s" % ("verifyAppsAndJobsSucceeded", k)] = v

    return status, tempd
Example #6
    def run_background_job(cls, runSmokeTestSetup=True, config=None):
        '''
        Runs a long-running TestOrderedWordCount Tez job in the background
        :param runSmokeTestSetup: Runs smoke test setup if set to true
        :param config: expected configuration location
        :return: Total number of long running jobs started
        '''
        logger.info("*** Start background job for Tez ***")
        from beaver.component.rollingupgrade.ruUpgrade import UpgradePerNode
        UpgradePerNode.reportProgress(
            "###  Starting background job for Tez  ####")

        #Sleep for 180 seconds between DAGs
        sleepInterval = 180
        cmdLineArgs = ""
        for i in range(0, 4, 1):
            cmdLineArgs += cls._hdfsInputList[i] + " " + cls._hdfsOutputList[
                i] + " "
        logger.info(cmdLineArgs)
        Tez.runTezExampleJar(
            "testorderedwordcount \"-DUSE_TEZ_SESSION=true\" \"-Dtez.queue.name=%s\" \"-DINTER_JOB_SLEEP_INTERVAL=%d\" \"-DRETAIN_STAGING_DIR=true\" %s "
            % (cls._queue, sleepInterval, cmdLineArgs),
            runInBackground=True)
        interval = 300
        while (cls._background_job_appId == ''
               or cls._background_job_appId is None) and interval > 0:
            logger.info("Trying to get appID..")
            time.sleep(10)
            interval = interval - 10
            cls._background_job_appId = YARN.getAppIDFromAppName(
                "OrderedWordCountSession", state="RUNNING")
        logger.info("*******************appID=%s" % cls._background_job_appId)

        logger.info("*** End background job for Tez ***")
        return 1
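
The appId lookup at the end of this example is a poll-with-timeout loop. A generic version of that pattern, independent of YARN.getAppIDFromAppName; the callable, timeout, and interval are illustrative assumptions.

import time

def poll_until(fetch, timeout=300, interval=10):
    """Call fetch() every `interval` seconds until it returns a truthy value or `timeout` seconds pass."""
    deadline = time.time() + timeout
    result = fetch()
    while not result and time.time() < deadline:
        time.sleep(interval)
        result = fetch()
    return result

# Hypothetical usage:
# app_id = poll_until(lambda: YARN.getAppIDFromAppName("OrderedWordCountSession", state="RUNNING"))
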
Example #7
def verifyNodesTab(basePage, commonPage, appsVisible):
    '''
    Verify Nodes Tab
    :param appsVisible: dict{appID:True/False based on if app is visible or not}
    :return:
    '''
    commonPage.clickClusterOverview()
    nodesPage = NodesPage(basePage)

    for node in YARN.getNodeManagerHosts(True):
        commonPage.clickNodes()
        assert nodesPage.isNodesPage()
        nodesPage.clickNodeAddress(node)
        assert nodesPage.isNodeInfoPage()

        #List of Applications tab
        nodesPage.clickNodeAppPage()
        assert nodesPage.isNodeAppPage()
        for appID, visibility in appsVisible.iteritems():
            if visibility == True:
                assert nodesPage.isNodeAppIDLink(appID)
            else:
                assert not nodesPage.isNodeAppIDLink(appID)

        #List of Containers tab
        nodesPage.clickNodeContainerPage()
        assert nodesPage.isNodeContainerPage()
        for appID, visibility in appsVisible.iteritems():
            if visibility == True:
                assert nodesPage.isNodeContainerIDLink(appID)
            else:
                assert not nodesPage.isNodeContainerIDLink(appID)
Example #8
    def validate_HDFS_stream_job(cls,
                                 appId,
                                 mode,
                                 patterns,
                                 expected_count,
                                 clientfile=None):
        '''
          Count the occurrences of each word in the YARN logs.
            -> check clientfile for yarn-client mode
            -> check yarn logs for yarn-cluster mode

          appId : application Id
          mode : mode of execution
          patterns : list of words to check in the log
          expected_count : the expected number of occurrences for each word in patterns
          clientfile : jobclient output for the app
          '''
        if mode == "yarn-client":
            file_to_read = clientfile
        else:
            file_to_read = Spark.createTmpClientFile(appId + ".log")
            YARN.getLogsApplicationID(appId,
                                      appOwner=None,
                                      nodeAddress=None,
                                      containerId=None,
                                      logoutput=False,
                                      grepFilter=None,
                                      pipeToFileOutput=file_to_read,
                                      config=None)

        count = 0
        word_count = {}
        # initialize word_count dictionary
        for p in patterns:
            word_count[p] = 0
        with open(file_to_read) as f:
            for line in f:
                words = line.split()
                for word in words:
                    if word in word_count.keys():
                        word_count[word] = word_count[word] + 1

        logger.info(word_count)
        for key, value in word_count.iteritems():
            assert value >= expected_count, "%s wordcount is %s. expected_count is %s" % (
                key, value, expected_count)
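
A self-contained version of the word-count check, using collections.Counter instead of a hand-rolled dictionary; the file path and patterns in the usage comment are placeholders.

from collections import Counter

def count_pattern_occurrences(path, patterns):
    """Count how often each pattern appears as a whitespace-separated token in the file."""
    wanted = set(patterns)
    counts = Counter()
    with open(path) as f:
        for line in f:
            counts.update(word for word in line.split() if word in wanted)
    return dict((p, counts[p]) for p in patterns)

# Hypothetical usage:
# counts = count_pattern_occurrences("app.log", ["(hello,1)", "(world,1)"])
# assert all(v >= 40 for v in counts.values())
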
Example #9
def verifyWorkPreservingRMRestart(jobIds, testMap=None):
    '''
  Validates the functionality of the Work Preserving RM Restart feature in the context of HA.
  :param jobIds: List of job ids on which the validation needs to be performed.
  :param testMap: map containing the service(s) names and the kwargs of the services being restarted in the test.
  :return: success status and a dict with the relevant info.
  '''
    d = {}
    status = True

    if jobIds is None or len(jobIds) <= 0:
        status = False

    jobIds = cleanseJobIds(jobIds)

    # If AM is being restarted, skip WPR validations.
    validateWpr = False
    if testMap:
        if testMap.has_key("services"):
            # The service names are comma-separated.
            serviceList = testMap["services"].split(",")
            for service in serviceList:
                if testMap[service].has_key("kwargs"):
                    if testMap[service]["kwargs"][
                            "service"] == "applicationmaster":
                        validateWpr = False
                        d["ApplicationMaster"] = "ApplicationMaster is being restarted. Skipping WPR validations."
                        status = True
                        break
                    # check if RM is being restarted in the test and only validate WPR if it is.
                    if testMap[service]["kwargs"][
                            "service"] == "resourcemanager":
                        validateWpr = True

    if validateWpr and YARN.isWorkPreservingRMRestartEnabled():
        status, d = YARN.verifyJobSuccessWithWorkPreservingRMRestart(jobIds)
    else:
        d['WPR-Check-Off'] = "RM was not restarted in the test so no need to check WPR!"

    # Prepend the method names to all the keys in the dict.
    tempd = {}
    for k, v in d.items():
        tempd["%s: %s" % ("verifyWorkPreservingRMRestart", k)] = v

    return status, tempd
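
The testMap consulted above is a plain nested dict keyed by service name. A hedged illustration of the shape this code expects; the keys are inferred from the lookups in the function, not from separate documentation.

# Shape assumed by verifyWorkPreservingRMRestart:
test_map = {
    "services": "resourcemanager",                     # comma-separated list of restarted services
    "resourcemanager": {"kwargs": {"service": "resourcemanager"}},
}
# With this map WPR validation runs; if "service" were "applicationmaster",
# the check would be skipped and reported in the returned dict instead.
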
Example #10
    def checkIfCurrentURLIsProxyURL(self):

        url = self.getCurrentURL()
        if not YARN.isKnoxProxySet() and self.PROXY_URL not in url:
            return True
        if self.PROXY_URL in url:
            return True

        return False
Example #11
 def YARN_isHAEnabled(cls, logoutput=True):
     try:
         from beaver.component.hadoop import YARN
         return YARN.isHAEnabled()
     except Exception:
         if logoutput:
             logger.error(
                 "Exception occured during YARN_isHAEnabled() call")
             logger.error(traceback.format_exc())
         return False
Example #12
 def YARN_getRMHANodes(cls, logoutput=True):
     try:
         from beaver.component.hadoop import YARN
         return YARN.getRMHANodes()
     except Exception:
         if logoutput:
             logger.error(
                 "Exception occured during YARN_getRMHANodes() call")
             logger.error(traceback.format_exc())
         return None
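
Examples #11 and #12 follow the same defensive pattern: import lazily, call, and swallow any failure with logging. A generic sketch of that wrapper; the helper name and fallback values are illustrative, not part of the original class.

import logging
import traceback

logger = logging.getLogger(__name__)

def safe_call(func, fallback=None, logoutput=True):
    """Return func(), or `fallback` if it raises; optionally log the traceback."""
    try:
        return func()
    except Exception:
        if logoutput:
            logger.error("Exception occurred during %s() call",
                         getattr(func, "__name__", "callable"))
            logger.error(traceback.format_exc())
        return fallback

# Hypothetical usage:
# ha_enabled = safe_call(lambda: YARN.isHAEnabled(), fallback=False)
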
Example #13
 def collect_application_log_locally(cls, appId, user):
     '''
     Collects the application log and saves it in the local dir as <appId>.log
     :param appId: Application Id
     :param user: Application Id owner
     '''
     try:
         from beaver.component.hadoop import YARN
         filename = os.path.join(cls.LOCAL_TMP_APP_STORAGE, appId + ".log")
         if not Machine.pathExists(None, None, filename, None):
             logger.info("Storing syslog of %s in %s", appId, filename)
             YARN.getLogsApplicationID(appId, user, None, None, False, None,
                                       filename)
         else:
             logger.info("%s already present at %s", appId, filename)
     except Exception:
         logger.error(
             "Exception occured during collect_application_log_locally() call"
         )
         logger.error(traceback.format_exc())
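
Under the hood this collects the aggregated logs that the `yarn logs` CLI exposes. A hedged, framework-free sketch; it assumes the yarn client is installed and that the caller may read the application's logs, and the destination directory is a placeholder.

import os
import subprocess

def save_application_log(app_id, owner, dest_dir="/tmp/app-logs"):
    """Fetch aggregated YARN logs for app_id (owned by `owner`) into <dest_dir>/<app_id>.log."""
    if not os.path.isdir(dest_dir):
        os.makedirs(dest_dir)
    dest = os.path.join(dest_dir, app_id + ".log")
    if not os.path.exists(dest):
        with open(dest, "w") as out:
            subprocess.call(["yarn", "logs", "-applicationId", app_id,
                             "-appOwner", owner], stdout=out)
    return dest
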
Example #14
 def verifyLongRunningJob(cls):
     '''
     Validate the long-running background job after all component upgrades have finished
     '''
     from beaver.component.spark_ha import Spark_Ha
     from tools import stacktracer
     stacktracer.trace_stop()
     if cls.hdfs_thread.is_alive():
         cls.hdfs_thread.join()
     YARN.killApplication(cls.appId_hdfs_cluster)
     time.sleep(15)
     patterns = [
         "(Spark,1)", "(,1)", "(world,1)", "(Word,1)", "(hello,1)",
         "(count,1)", "(HDFS,1)", "(application,1)", "(Testing,1)"
     ]
     Spark_Ha.validate_HDFS_stream_job(cls.appId_hdfs_cluster,
                                       "yarn-cluster",
                                       patterns=patterns,
                                       expected_count=40,
                                       clientfile=cls.local_hdfs_cluster)
Example #15
 def verifyLongRunningJob(cls):
     '''
     Verify long running background job after it finishes
     :return:
     '''
     ruAssert(
         "Tez",
         YARN.getAppFinalStateFromID(
             cls._background_job_appId) == 'SUCCEEDED')
     for output in cls._hdfsOutputList:
         ruAssert("Tez", HDFS.fileExists(output + '/part*'))
     logger.info("**** Verified long running job for Tez ****")
Example #16
    def run_smoke_test(cls, config=None):
        '''
        Run smoke test for spark
        '''
        logger.info("config = %s", config)
        from beaver.component.spark import Spark
        from beaver.component.rollingupgrade.ruUpgrade import UpgradePerNode
        UpgradePerNode.reportProgress(
            "[INFO][Spark][Smoke] Smoke test for Spark started ")
        exit_code, _ = Spark.submitSparkApplication(
            "org.apache.spark.examples.SparkPi", "yarn-cluster", "3")
        if exit_code != 0:
            UpgradePerNode.reportProgress(
                "[FAILED][Spark][Smoke] SparkPi Smoke Test Failed in Yarn-cluster mode"
            )
            return

        exit_code, stdout2 = Spark.submitSparkApplication(
            "org.apache.spark.examples.SparkPi", "yarn-client", "3")
        if exit_code != 0:
            UpgradePerNode.reportProgress(
                "[FAILED][Spark][Smoke] SparkPi Smoke Test Failed in Yarn-client mode"
            )
            return

        if Machine.isWindows():
            appName_pi = "SparkPi"
        else:
            appName_pi = "Spark Pi"
        HADOOP_QA = Config.get('hadoop', 'HADOOPQA_USER')
        appId = YARN.getApplicationIDFromStdout(stdout2).strip()
        logger.info(
            "Validate http://<host>:<port>/ws/v1/timeline/spark_event_v01/<appId>"
        )
        Spark.getSparkATSAppUrl(appId)
        time.sleep(30)
        # Spark-ats check. We will enable it once Ambari enables Spark-ATS by default
        #cls.validate_ApplicationEntry(appId, appName_pi, HADOOP_QA, mode="yarn-client", url=url)
        Spark.hitSparkURL()
        time.sleep(50)
        result_HS_completeApp = Spark.validateSparkHSCompletedApps(
            appId, appName_pi, HADOOP_QA)
        if not result_HS_completeApp:
            UpgradePerNode.reportProgress(
                "[FAILED][Spark][Smoke] SparkPi Spark HS complete App Validation failed"
            )
            return
        result_HS_Jobs = Spark.validateSparkHSJobs(appId, "1/1", "3/3")
        if not result_HS_Jobs:
            UpgradePerNode.reportProgress(
                "[FAILED][Spark][Smoke] SparkPi Spark HS Job page validation failed"
            )
            return
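
Outside the beaver wrappers, the SparkPi smoke submission is a plain spark-submit invocation. A hedged sketch; the examples jar path in the usage comment is a placeholder, while the class name is the one used above.

import subprocess

def run_sparkpi_smoke(examples_jar, deploy_mode="cluster", slices="3"):
    """Submit the SparkPi example on YARN and return True if spark-submit exits 0."""
    cmd = ["spark-submit",
           "--class", "org.apache.spark.examples.SparkPi",
           "--master", "yarn",
           "--deploy-mode", deploy_mode,
           examples_jar, slices]
    return subprocess.call(cmd) == 0

# Hypothetical usage:
# ok = run_sparkpi_smoke("/usr/hdp/current/spark-client/lib/spark-examples.jar")
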
Example #17
    def start_LongRunning_Federation_HDFS_stream_job(
            cls,
            inputDir,
            outputDir,
            num_executor,
            mode="yarn-client",
            inBackground=True,
            clientfile=None,
            pythonFile="federation_hdfs_wordcount.py",
            srcDir=None,
            keytab=None,
            principal=None):
        """
          Starts Spark-HDFS Streaming application using python file
          :param inputDir:
          :param outputDir:
          :param num_executor:
          :param mode:
          :param inBackground:
          :param clientfile:
          :param pythonFile: Python file which needs to be run as the Spark streaming application
          :param srcDir: Path of the Python file
          :return: (application ID, Local client log)
          """
        if clientfile == None:
            Local_clientlog = Spark.createTmpClientFile(pythonFile + "_" +
                                                        mode)
        else:
            Local_clientlog = Spark.createTmpClientFile(clientfile)

        if pythonFile == "federation_hdfs_wordcount.py":
            srcDir = os.path.join(Config.getEnv("WORKSPACE"), "tests", "spark",
                                  "examples", "streaming")

        arg = " %s %s 2>&1 | tee %s" % (inputDir, outputDir, Local_clientlog)

        Spark.submitSparkPyApplication(pythonFile,
                                       mode,
                                       arg,
                                       num_executor=num_executor,
                                       inBackground=inBackground,
                                       srcDir=srcDir,
                                       timeout=120,
                                       clientfile=clientfile,
                                       conf=None,
                                       keytab=keytab,
                                       principal=principal)

        f = open(Local_clientlog, "r")
        stdout = f.read()
        f.close()
        appId = YARN.getApplicationIDFromStdout(stdout)
        return appId, Local_clientlog
Example #18
    def getComponnetsToTest(cls, compFile, depFile):
        '''
        Get the components that are being tested according to depFile
        '''
        # read in the config file
        conf = RuSetup.readJson(compFile)
        isStandalone = conf[RuSetup.CONF_STANDALONE]
        RuSetup._skipQueue = set(conf[RuSetup.CONF_SKIP_QUEUE])
        RuSetup._defaultQueue = conf[RuSetup.CONF_DEFAULT_QUEUE]
        returnSet = None
        if isStandalone:
            # get the components to test
            returnSet = set(conf[RuSetup.CONF_COMPONENTS_TEST])
        else:
            returnSet = set(RuSetup.getComponentsAffected(compFile, depFile))

        # skip tests according to cluster settings
        if not HDFS.isHAEnabled():
            logger.info("Skip HDFS since HA is not enabled")
            returnSet.discard("hdfs")

        # as discussed in Ru standup for 11/13, enabling storm-slider for non HA cluster and storm standalone for HA cluster
        if YARN.isHAEnabled():
            returnSet.discard("storm-slider")
        else:
            returnSet.discard("storm")

        if Hadoop.isEncrypted():
            returnSet.discard("knox")
            returnSet.discard("falcon")

        if Hadoop.isTez():
            logger.info("Add tez since Hadoop.isTez()")
            returnSet.add("tez")
        else:
            logger.info(
                "Make sure tez is not in the list since Hadoop.isTez() is false"
            )
            returnSet.discard("tez")
        # Note: component.xa is always available, even if xa is not installed
        # So this line should work even if the cluster does not have xa installed
        from beaver.component.xa import Xa
        if Xa.isArgusInstalled():
            logger.info("Add argus since argus is there")
            returnSet.add("argus")
        else:
            logger.info(
                "Make sure argus is not in the list since it's not available")
            returnSet.discard("argus")

        return list(returnSet)
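
The component selection above is ordinary set arithmetic over a JSON config plus cluster feature flags. A condensed, standalone sketch of the same filtering; the JSON key name is an assumption standing in for RuSetup.CONF_COMPONENTS_TEST, and the feature flags are passed in instead of being queried from the cluster.

import json

def components_to_test(comp_file, hdfs_ha, yarn_ha, encrypted, is_tez):
    """Load the component config and apply the same skip/add rules as getComponnetsToTest."""
    with open(comp_file) as f:
        conf = json.load(f)
    selected = set(conf["components_to_test"])   # assumed key name
    if not hdfs_ha:
        selected.discard("hdfs")
    # storm-slider on non-HA clusters, standalone storm on HA clusters
    selected.discard("storm-slider" if yarn_ha else "storm")
    if encrypted:
        selected -= {"knox", "falcon"}
    if is_tez:
        selected.add("tez")
    else:
        selected.discard("tez")
    return sorted(selected)
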
Example #19
    def createClusterEntities(cls, colo, desc, name):
        try:
            from beaver.component.falcon import Falcon
        except ImportError:
            ## Import fails when Falcon is not installed on this machine. Nothing to do
            return

        from beaver.component.hadoop import Hadoop, HDFS, YARN
        write_endpoint = Hadoop.getFSDefaultValue()
        webhdfs_scheme = 'webhdfs'
        if HDFS.isHttpsEnabled():
            webhdfs_scheme = 'swebhdfs'
        read_endpoint = '%s://%s:%s' % (
            webhdfs_scheme, write_endpoint.split('/')[2].split(':')[0],
            HDFS.getNNWebPort())
        execute_endpoint = YARN.getResourceManager()
        falconNode = Falcon.get_falcon_server()

        from beaver.component.oozie import Oozie
        oozieUrl = Oozie.getOozieUrl()
        entityText = "<?xml version=\"1.0\"?>" \
                     "<cluster colo=\"" + colo + "\" description=\"" + desc + "\" name=\"" + name + "\" " \
                     "xmlns=\"uri:falcon:cluster:0.1\"> " \
                        "<interfaces> " \
                            "<interface type=\"readonly\" endpoint=\""+read_endpoint+"\" version=\"0.20.2\"/> " \
                            "<interface type=\"write\" endpoint=\""+write_endpoint+"\" version=\"0.20.2\"/> " \
                            "<interface type=\"execute\" endpoint=\"" + execute_endpoint + "\" version=\"0.20.2\"/> " \
                            "<interface type=\"workflow\" endpoint=\"" + oozieUrl + "\" version=\"3.1\"/>" \
                            "<interface type=\"messaging\" endpoint=\"" \
                                "tcp://" + falconNode + ":61616?daemon=true\" version=\"5.1.6\"/>" \
                        "</interfaces>" \
                        "<locations>" \
                            "<location name=\"staging\" path=\"/apps/falcon/" + name + "/staging\" />" \
                            "<location name=\"temp\" path=\"/tmp\" />" \
                            "<location name=\"working\" path=\"/apps/falcon/" + name + "/working\" />" \
                        "</locations>" \
                        "<ACL owner=\"" + cls._job_user + "\" group=\"users\" permission=\"0755\"/>"
        if Hadoop.isSecure():
            realm = HDFS.getConfigValue(
                'dfs.namenode.kerberos.principal').split('@')[1]
            entityText += "<properties> <property name=\"dfs.namenode.kerberos.principal\" value=\"nn/_HOST@" + realm + "\"/> </properties>"
        entityText += "</cluster>"
        textFile = open(os.path.join(cls._local_workspace, name + ".xml"), "w")
        textFile.write("%s" % entityText)
        textFile.close()

        return
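
The cluster entity above is assembled by string concatenation. A hedged alternative sketch builds the same kind of document with xml.etree.ElementTree, which avoids quoting mistakes; element and attribute names follow the Falcon cluster entity shown above, the endpoint values are placeholders, and the messaging interface, ACL, and properties elements are omitted for brevity.

import xml.etree.ElementTree as ET

def build_cluster_xml(colo, desc, name, read_ep, write_ep, execute_ep, workflow_ep):
    """Return a serialized Falcon cluster entity."""
    cluster = ET.Element("cluster", colo=colo, description=desc, name=name,
                         xmlns="uri:falcon:cluster:0.1")
    interfaces = ET.SubElement(cluster, "interfaces")
    for itype, endpoint, version in [("readonly", read_ep, "0.20.2"),
                                     ("write", write_ep, "0.20.2"),
                                     ("execute", execute_ep, "0.20.2"),
                                     ("workflow", workflow_ep, "3.1")]:
        ET.SubElement(interfaces, "interface", type=itype,
                      endpoint=endpoint, version=version)
    locations = ET.SubElement(cluster, "locations")
    ET.SubElement(locations, "location", name="staging",
                  path="/apps/falcon/%s/staging" % name)
    ET.SubElement(locations, "location", name="temp", path="/tmp")
    ET.SubElement(locations, "location", name="working",
                  path="/apps/falcon/%s/working" % name)
    return ET.tostring(cluster)
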
Example #20
    def start_LongRunning_HDFS_stream_job(cls,
                                          inputDir,
                                          num_executor,
                                          mode="yarn-client",
                                          inBackground=True,
                                          clientfile=None):
        '''
          Start Spark-HDFS Streaming application
          '''
        className = "org.apache.spark.examples.streaming.HdfsWordCount"
        if mode == "yarn-client" and not HDFS.isASV():
            jars = Spark.getLzoJar()
        else:
            jars = None
        if clientfile == None:
            Local_clientlog = Spark.createTmpClientFile(className + "_" + mode)
        else:
            Local_clientlog = Spark.createTmpClientFile(clientfile)
        arg = " %s 2>&1 | tee %s" % (inputDir, Local_clientlog)
        if Hadoop.isSecure():
            keytab = Machine.getHeadlessUserKeytab(
                Config.get('hadoop', 'HADOOPQA_USER'))
            principal = Machine.get_user_principal(
                Config.get('hadoop', 'HADOOPQA_USER'))
        else:
            keytab = None
            principal = None

        Spark.submitSparkApplication(className,
                                     mode,
                                     arg,
                                     jars=jars,
                                     num_executor=num_executor,
                                     inBackground=inBackground,
                                     timeout=120,
                                     keytab=keytab,
                                     principal=principal)
        f = open(Local_clientlog, "r")
        stdout = f.read()
        f.close()
        appId = YARN.getApplicationIDFromStdout(stdout)
        return appId, Local_clientlog
Example #21
def getSucceededFinishedApps(prefix, startTime, endTime):
    d = {}
    appFound = 0
    apps = YARN.getApplicationsInfo(startedTimeBegin=startTime,
                                    finishedTimeEnd=endTime)
    if not apps:
        return appFound, d

    # traverse through all the apps and find the ones which we are looking for
    for app in apps:
        if prefix in app['name']:
            d[str(app['id'])] = str(app['finalStatus'])

    # check succeeded apps after the dict object is created in order
    # to not count duplicate entries
    # https://hortonworks.jira.com/browse/BUG-14532
    for key, value in d.items():
        if 'SUCCEEDED' == value:
            appFound += 1

    return appFound, d
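
A standalone version of the prefix-and-status filter, operating on the list-of-dicts shape that YARN.getApplicationsInfo returns above; only the 'name', 'id', and 'finalStatus' keys are assumed.

def count_succeeded_apps(apps, prefix):
    """Return (number of SUCCEEDED apps whose name contains prefix, {app id: final status})."""
    matched = dict((str(app['id']), str(app['finalStatus']))
                   for app in apps or [] if prefix in app['name'])
    succeeded = sum(1 for status in matched.values() if status == 'SUCCEEDED')
    return succeeded, matched
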
Example #22
    def launchMultipleSleepJobs(cls,
                                numJobs,
                                mapSleepTime=1000,
                                reduceSleepTime=1000,
                                config=None):
        '''
        Function to Launch multiple sleep jobs
        :param numJobs: number of sleep jobs want to run
        :param mapSleepTime: Map sleep time
        :param reduceSleepTime: Reduce sleep time
        :param config: expected Configuration location
        :return: jobIDs
        '''
        jobIds = []
        # Create jobs
        i = 0
        for i in range(0, numJobs):
            jobclientFile = os.path.join(Config.getEnv('ARTIFACTS_DIR'),
                                         "JobClient_output.log")
            HadoopJobHelper.runSleepJob(numOfMaps=1,
                                        numOfReduce=1,
                                        mapSleepTime=mapSleepTime,
                                        reduceSleepTime=reduceSleepTime,
                                        extraJobArg=cls._jobArgs,
                                        runInBackground=False,
                                        config=config,
                                        directoutput=True,
                                        outputFile=jobclientFile)
            f = open(jobclientFile)
            text = f.read()
            f.close()
            currJobId = YARN.getAppAndJobIdsFromConsole(text)[1][0]
            jobIds.append(currJobId)
        # Join jobs

        for job in jobIds:
            ruAssert("YARN", MAPRED.isJobSucceed(job))
        return jobIds
Example #23
    def hdp_upgrade(cls,
                    components,
                    currVersion,
                    latestVersion,
                    doTeardown=True,
                    finalize=True):
        '''
        Upgrade HDP Stack With Per Node Method.

        Steps
        1) Prepare and save component states.
        2) Setup prerequisites for background jobs.
        3) Start long-running background jobs for all components.
        4) Upgrade core components from bottom to top.
          For each service:
          4a) Upgrade service.
          4b) Run smoke tests for all components.
          4c) Check number of all background jobs.
        5) After all components are upgraded, run another set of tests.
        6) Repeat same process for non-core components.
        7) Upgrade clients of components which were upgraded earlier.
        8) Upgrade client-only components.
        9) After all components are upgraded, run smoke tests.
        10) Stop long running jobs.
        11) Look for failed and killed jobs.
        12) Verify outputs of successful jobs.
        13) Finalize all states.

        :param components: list of Components to upgrade
        :param currVersion: Current Version
        :param latestVersion: Version to be upgraded to
        :param doTeardown: Only Cleanup when required
        '''
        cls.reportProgress(
            "###  Starting upgrade from %s to %s for components=%s ####" %
            (currVersion, latestVersion, components))
        DN = HDFS.getDatanodes()

        # Find core components (HDFS, YARN, HBase) if exist.
        core_components = cls.find_existing_core_components(components)

        #Prepare and save state before upgrade
        Rollingupgrade.ru_prepare_save_state_for_upgrade(components)

        # Run setup for background Jobs for all components
        Rollingupgrade.background_job_setup(components, config=None)

        # Starts Long running background Jobs for all components
        numBackgroundJobs = Rollingupgrade.run_longRunning_Application(
            components, config=None)
        logger.info(
            "Total number of long running background jobs before starting upgrade is %s"
            % numBackgroundJobs)
        cls.reportProgress("###  Just started %s background jobs  ###" %
                           numBackgroundJobs)

        # upgrade the components in hierarchy
        cls.reportProgress("###  Starting upgrade of core %s masters  ###" %
                           core_components)
        #### IF XA is enabled, upgrade XA services ####
        from beaver.component.xa import Xa
        cls.reportProgress(
            "******************************* checking for argus to be installed *******************************"
        )
        if "argus" in components and Xa.isArgusInstalled():
            logger.info(
                '**************************************************** XA is Enabled in the cluster, setting up and upgrading the same ****************************************************'
            )
            Rollingupgrade.upgrade_master_and_smoketest(
                ['argus'], latestVersion, config=None, currVersion=currVersion)

        ##### TODO - upgrade ZOOKEEPER ########
        if "zookeeper" in components:
            Rollingupgrade.upgrade_master_and_smoketest(["zookeeper"],
                                                        latestVersion,
                                                        config=None)
        # Upgrade Master services - Namenode, Secondarynamenode, Resourcemanager, Application Timelineserver, JobHistoryserver and HbaseMaster with new version
        #### TODO - Application Timelineserver HbaseMaster ####
        AfterUpgradeBackGroundJobs = Rollingupgrade.upgrade_master_and_smoketest(
            core_components, latestVersion, config=None)
        cls.reportProgress("###  Finished upgrade of core %s masters  ###" %
                           core_components)
        numBackgroundJobs = numBackgroundJobs + AfterUpgradeBackGroundJobs
        logger.info(
            "Total number of long running background jobs after upgrading master services is %s"
            % numBackgroundJobs)

        # upgrade slave service - Datanodes, Nodemanagers and Regionservers with new version
        cls.reportProgress("###  Starting upgrade of core %s slaves  ###" %
                           core_components)
        i = 0
        #### TODO - upgrade Regionserver  ####
        for node in DN:
            i += 1
            logger.info("**** Upgrading slave number " + str(i) + ": " + node +
                        " ****")
            if i % 4 == 0:
                runSmoke = True
            else:
                runSmoke = False
            Rollingupgrade.upgrade_slave_and_smoketest(core_components,
                                                       latestVersion, node,
                                                       None, runSmoke)
            # check whether the background jobs are still running
            runningJobs = YARN.getNumOfRunningJobs()
            logger.info("Long-running job ended too early; running jobs =" +
                        str(runningJobs))
            #assert runningJobs == numBackgroundJobs, 'Long-running job ended too early; running jobs = ' + str(runningJobs)

        cls.reportProgress("###  Finished upgrade of %d core %s slaves  ###" %
                           (i, core_components))
        #### Run all component Smoke tests ####
        Rollingupgrade.run_smokeTests(components, config=None)

        #  Run Tests to verify components accessibility
        Rollingupgrade.testAfterAllMasterSlavesUpgraded(components)

        #### Starting upgrade non core components ####
        cls.reportProgress(
            "###  Starting upgrade of non-core cluster components  ###")
        if "hive" in components:
            Rollingupgrade.upgrade_master_and_smoketest(["hive"],
                                                        latestVersion,
                                                        config=None)

        #### TODO- upgrade pig to N+1 version ####

        #### TODO - Run pig smoke test ####
        #     ## Example : ##
        if "pig" in components:
            Rollingupgrade.upgrade_master_and_smoketest(["pig"],
                                                        latestVersion,
                                                        config=None)
        # ##    Rollingupgrade.upgrade_slave_and_smoketest(["pig"], latestVersion, node)

        # #### TODO - upgrade oozie server to N+1 version ####

        # #### - Run oozie smoke test ####
        if "oozie" in components:
            Rollingupgrade.upgrade_master_and_smoketest(["oozie"],
                                                        latestVersion,
                                                        config=None)

        #### upgrade falcon to N+1 version and run its smoke tests ####

        if "falcon" in components:
            Rollingupgrade.upgrade_master_and_smoketest(["falcon"],
                                                        latestVersion,
                                                        config=None)

        #### TODO - upgrade phoenix to N+1 version ####

        #### TODO - Run phoenix smoke test ####
        if "phoenix" in components:
            ruPhoenix.run_smoke_test(ruPhoenix._smokeTestNum)

        #### TODO - upgrade sqoop to N+1 version ####
        #### TODO - Run sqoop smoke test ####

        cls.reportProgress(
            "###  Finished upgrade of non-core cluster components  ###")

        ## For storm-slider we want to verify the topologies and kill the storm-slider app.
        if "storm-slider" in components:
            from beaver.component.rollingupgrade.ruStorm import ruStorm
            ruStorm.verify_and_stop_slider_app()

        #### TODO- upgrade clients for Argus, Zk, Hdfs, Yarn, MR, Tez, Hive, Pig, Hbase, Falcon, oozie, sqoop , phoenix, mahout ####
        cls.reportProgress(
            "###  Starting upgrade of clients %s inside the cluster ###" %
            components)
        Rollingupgrade.upgrade_client_insideCluster_and_smoketest(
            components, latestVersion, config=None)

        if "storm-slider" in components:
            from beaver.component.rollingupgrade.ruStorm import ruStorm
            ruStorm.start_slider_app_resubmit_topologies()
            time.sleep(120)  # Allow time for storm-slider topologies to run.

        cls.reportProgress("###  Starting upgrade of slider apps ###")
        ### TODO- upgrade slider client and non rolling upgrade of slider-apps ####
        ### TODO- Stop storm-slider app, hbase-slider app, accumulo-slider app
        ### TODO- Upgrade storm-slider client
        ### TODO- resubmit storm-slider app, hbase-slider app, accumulo-slider app
        cls.reportProgress("###  Finished upgrade of slider apps ###")

        #### Knox upgrade
        if "knox" in components:
            Rollingupgrade.upgrade_master_and_smoketest(["knox"],
                                                        latestVersion,
                                                        config=None)

        #### upgrade Flume to N+1 version ####
        if "flume" in components:
            Rollingupgrade.upgrade_master_and_smoketest(["flume"],
                                                        latestVersion,
                                                        config=None)

        #### TODO - upgrade Kafka to N+1 version ####

        #### TODO - Run Kafka smoke test ####

        ## Example : ##
        ## if "kafka" in components:
        ##    Rollingupgrade.upgrade_master_and_smoketest(["kafka"], latestVersion, config=None)
        ##    Rollingupgrade.upgrade_slave_and_smoketest(["kafka"], latestVersion, node)

        #### TODO - upgrade Storm to N+1 version ####

        #### TODO - Run storm smoke test ####

        ## Example : ##
        ## if "storm" in components:
        ##    Rollingupgrade.upgrade_master_and_smoketest(["storm"], latestVersion, config=None)
        ##    Rollingupgrade.upgrade_slave_and_smoketest(["storm"], latestVersion, node)

        #### TODO - upgrade Hue to N+1 version ####

        #### TODO - Run Hue smoke test ####

        ## Example : ##
        ## if "hue" in components:
        ##    Rollingupgrade.upgrade_master_and_smoketest(["hue"], latestVersion, config=None)
        ##    Rollingupgrade.upgrade_slave_and_smoketest(["hue"], latestVersion, node)
        cls.reportProgress(
            "###  Finished upgrade of non-core components outside the cluster  ###"
        )

        #### TODO - Run all component Smoke tests ####
        Rollingupgrade.run_smokeTests(components, config=None)

        ### Need to stop HDFS Falcon,Yarn long runningJobs ####
        # create flagFile to kill HDFS background job
        TEST_USER = Config.get('hadoop', 'HADOOPQA_USER')
        createCmd = "dfs -touchz " + cls._HDFS_FLAG_FILE
        exit_code, output = HDFS.runas(TEST_USER, createCmd)

        if "falcon" in components:
            from beaver.component.rollingupgrade.ruFalcon import ruFalcon
            ruFalcon.stopFalconLongRunningJob()
        if "yarn" in components:
            ruYARN.stopYarnLongRunningJob()
        if "slider" in components:
            ruSlider.stopSliderLongRunningJob()
        if "storm-slider" in components:
            from beaver.component.rollingupgrade.ruStorm import ruStorm
            ruStorm.teardown_storm_slider_app()

        ## TODO - wait for long running jobs to finish
        isZero = YARN.waitForZeroRunningApps()
        if isZero:
            cls.reportProgress("#### None apps are running. ####")
        else:
            cls.reportProgress(
                "#### Check Failed. some apps are running. ####")
        #assert isZero, "all long running jobs are not finished"

        ### List down Failed/Killed applications ####
        Failed_Killed_apps = YARN.getFailedKilledAppList()
        cls.reportProgress(
            "### Listing Killed/Failed applications while performing upgrade ####"
        )
        for app in Failed_Killed_apps:
            queue = YARN.getQueueForApp(app)
            logger.info(" %s running on %s queue Failed/Killed." %
                        (app, queue))
            cls.reportProgress(
                "#### %s running on %s queue Failed/Killed. ####" %
                (app, queue))

        ## TODO - Validate long running jobs
        Rollingupgrade.verifyLongRunningJob(components)

        ## KILL APPLICATIONS ####
        YARN.killAllApplications(useYarnUser=True)

        ## TODO - call Finalize
        if finalize:
            Rollingupgrade.ru_finalize_state(components)

        ## TODO - call Teardown for long running jobs
        if doTeardown:
            Rollingupgrade.background_job_teardown(components, None)
        cls.reportProgress(
            "###  Completed upgrade from %s to %s for components=%s ####" %
            (currVersion, latestVersion, components))
Example #24
    def hdp_downgrade(cls,
                      components,
                      currVersion,
                      latestVersion,
                      doTeardown=True):
        '''
        Downgrade HDP Stack With Per Node Method
        :param components: Components to be downgraded
        :param currVersion: Current version (Version V1)
        :param latestVersion: Version to be downgraded to (Version V0)
        '''
        UpgradePerNode.reportProgress(
            "###  Starting downgrade from %s to %s for components=%s ####" %
            (currVersion, latestVersion, components))
        DN = HDFS.getDatanodes()
        core_components = UpgradePerNode.find_existing_core_components(
            components)

        # Run setup for background Jobs for all components
        Rollingupgrade.background_job_setup(components, config=None)

        # Starts Long running background Jobs for all components
        numBackgroundJobs = Rollingupgrade.run_longRunning_Application(
            components, config=None)
        logger.info(
            "Total number of long running background jobs before starting upgrade is %s"
            % numBackgroundJobs)
        UpgradePerNode.reportProgress(
            "###  Just started %s background jobs  ###" % numBackgroundJobs)

        #### TODO - downgrade Hue and run Hue smoke test ####
        UpgradePerNode.reportProgress(
            "###  Starting downgrade of non-core components outside the cluster  ###"
        )
        ## Example : ##
        ## if "hue" in components:
        ##    Rollingupgrade.downgrade_master_and_smoketest(["hue"], latestVersion, config=None)
        ##    Rollingupgrade.downgrade_slave_and_smoketest(["hue"], latestVersion, node)

        #### TODO - downgrade storm and run smoke test ####

        ## Example : ##
        ## if "storm" in components:
        ##    Rollingupgrade.downgrade_master_and_smoketest(["storm"], latestVersion, config=None)
        ##    Rollingupgrade.downgrade_slave_and_smoketest(["storm"], latestVersion, node)

        #### TODO - downgrade Kafka and run smoke test ####

        ## Example : ##
        ## if "kafka" in components:
        ##    Rollingupgrade.downgrade_master_and_smoketest(["kafka"], latestVersion, config=None)
        ##    Rollingupgrade.downgrade_slave_and_smoketest(["kafka"], latestVersion, node)

        #### downgrade Flume ####
        if "flume" in components:
            Rollingupgrade.downgrade_master_and_smoketest(["flume"],
                                                          latestVersion,
                                                          config=None)

        #### downgrade Knox and run smoke test ####
        if "knox" in components:
            Rollingupgrade.downgrade_master_and_smoketest(["knox"],
                                                          latestVersion,
                                                          config=None)
        UpgradePerNode.reportProgress(
            "###  Finished downgrade of non-core components outside the cluster  ###"
        )

        UpgradePerNode.reportProgress(
            "###  Starting downgrade of slider apps ###")
        ### TODO- downgrade slider client and non rolling upgrade of slider-apps ####
        ### TODO- Stop storm-slider app, hbase-slider app, accumulo-slider app
        ### TODO- downgrade storm-slider client
        ### TODO- resubmit storm-slider app, hbase-slider app, accumulo-slider app
        UpgradePerNode.reportProgress(
            "###  Finished downgrade of slider apps ###")

        # Downgrade Non core components
        UpgradePerNode.reportProgress(
            "###  Starting downgrade clients %s inside the cluster ###" %
            components)
        ### TODO - Downgrade CLIENTS ####
        Rollingupgrade.downgrade_client_insideCluster_and_smoketest(
            components, latestVersion, config=None)
        UpgradePerNode.reportProgress(
            "###  Finished downgrade of clients %s inside the cluster ###" %
            components)

        #### TODO - Downgrade phoenix and Run phoenix smoke test ####
        UpgradePerNode.reportProgress(
            "###  started downgrade of non-core cluster components  ###")

        ## Example : ##
        ## if "phoenix" in components:
        ##    Rollingupgrade.downgrade_master_and_smoketest(["phoenix"], latestVersion, config=None)
        ##    Rollingupgrade.downgrade_slave_and_smoketest(["phoenix"], latestVersion, node)

        #### downgrade falcon and run smoke test ####

        if "falcon" in components:
            Rollingupgrade.downgrade_master_and_smoketest(["falcon"],
                                                          latestVersion,
                                                          config=None)

        # #### - downgrade oozie and run smoke test ####
        if "oozie" in components:
            Rollingupgrade.downgrade_master_and_smoketest(["oozie"],
                                                          latestVersion,
                                                          config=None)

        #### Downgrade Pig and run pig smoke test ####
        if "pig" in components:
            Rollingupgrade.downgrade_master_and_smoketest(["pig"],
                                                          latestVersion,
                                                          config=None)

        if "hive" in components:
            Rollingupgrade.downgrade_master_and_smoketest(["hive"],
                                                          latestVersion,
                                                          config=None)
        UpgradePerNode.reportProgress(
            "###  Finished downgrade of non-core cluster components  ###")

        # Downgrade Slave services of core-components (Hdfs, Yarn, hbase)
        UpgradePerNode.reportProgress(
            "###  Starting downgrade of core %s slaves  ###" % core_components)
        i = 0
        #### TODO - Downgrade Datanode, Nodemanager, Regionserver  ####
        for node in DN:
            i += 1
            logger.info("**** Downgrading slave number " + str(i) + ": " +
                        node + " ****")
            Rollingupgrade.downgrade_slave_and_smoketest(
                core_components, latestVersion, node, None)
            # check whether the background jobs are still running
            runningJobs = YARN.getNumOfRunningJobs()
            logger.info("Long-running job ended too early; running jobs =" +
                        str(runningJobs))
            #assert runningJobs == numBackgroundJobs, 'Long-running job ended too early; running jobs = ' + str(runningJobs)
        UpgradePerNode.reportProgress(
            "###  Finished downgrade of %d core %s slaves  ###" %
            (i, core_components))

        # run smoke tests after downgrading
        Rollingupgrade.run_smokeTests(components, config=None)

        #### TODO - Downgrade Namenode, Resourcemanager, Hbase master ####
        UpgradePerNode.reportProgress(
            "###  Starting downgrade of core %s masters  ###" %
            core_components)
        Rollingupgrade.downgrade_master_and_smoketest(core_components,
                                                      latestVersion,
                                                      config=None)

        #### TODO - Run Validation after All Master and slave services are down ####
        Rollingupgrade.testAfterAllMasterSlavesUpgraded(components)

        ### TODO - Downgrade Zookeeper ####
        #Rollingupgrade.downgrade_master_and_smoketest(["zookeeeper"], latestVersion, config=None)
        UpgradePerNode.reportProgress(
            "###  Finished downgrade of core %s masters  ###" %
            core_components)

        #### IF XA is enabled, downgrade XA services ####
        from beaver.component.xa import Xa
        if "argus" in components and Xa.isArgusInstalled():
            logger.info(
                'XA is Enabled in the cluster, setting up and downgrading the same'
            )
            Rollingupgrade.downgrade_master_and_smoketest(['argus'],
                                                          latestVersion,
                                                          config=None,
                                                          currVersion=None)

        #### TODO - Run all component Smoke tests ####
        Rollingupgrade.run_smokeTests(components, config=None)

        # TODO - this is common code with upgrade - move it to a function - but the slider part is different in downgrade; shouldn't be ---
        ### Need to stop HDFS Falcon,Yarn long runningJobs ####
        # create flagFile to kill HDFS background job
        ### Need to stop HDFS Falcon,Yarn long runningJobs ####
        TEST_USER = Config.get('hadoop', 'HADOOPQA_USER')
        createCmd = "dfs -touchz " + UpgradePerNode._HDFS_FLAG_FILE
        exit_code, output = HDFS.runas(TEST_USER, createCmd)

        ruYARN.stopYarnLongRunningJob()
        if "falcon" in components:
            from beaver.component.rollingupgrade.ruFalcon import ruFalcon
            ruFalcon.stopFalconLongRunningJob()
        if "storm-slider" in components:
            from beaver.component.rollingupgrade.ruStorm import ruStorm
            ruStorm.teardown_storm_slider_app()

        ## TODO - wait for long running jobs to finish
        isZero = YARN.waitForZeroRunningApps()
        ## Temporarily commented out to tune the test
        #assert isZero, "all long running jobs are not finished"

        ## TODO - Validate long running jobs
        Rollingupgrade.verifyLongRunningJob(components)

        ## TODO - call Teardown for long running jobs
        Rollingupgrade.background_job_teardown(components, None)

        ## Finalize State
        Rollingupgrade.ru_finalize_state(components)
        UpgradePerNode.reportProgress(
            "###  Completed downgrade from %s to %s for components=%s ####" %
            (currVersion, latestVersion, components))
Example #25
    def validate_ApplicationEntry(cls,
                                  appId,
                                  appName,
                                  appUser,
                                  mode="yarn-client",
                                  url=None):
        '''
        Validate the application entry fetched via getCorrectApplicationJsonData
        :param appId: Application Id
        :param appName: Application name
        :param appUser: Application user
        :return:
        '''
        from beaver.component.spark import Spark
        if not url:
            entities = Spark.getCorrectApplicationJsonData(appId)
        else:
            entities = Spark.getCorrectApplicationJsonData(
                appId, url, gatherAppSpecificJson=False)

        logger.info("***** entities *****")
        logger.info(entities)
        logger.info("********************")

        if mode == "yarn-cluster":
            ruAssert(
                "Spark", entities["entity"] == YARN.createAttemptIdFromAppId(
                    appId, "1"), "[Smoke] attemptid entity not found in ATS")
        else:
            ruAssert("Spark", entities["entity"] == appId,
                     "[Smoke] appid entity not found in ATS")
        ruAssert("Spark", entities["domain"] == "DEFAULT",
                 "[Smoke] domain is not default")
        ruAssert("Spark", entities["entitytype"] == "spark_event_v01",
                 "[Smoke] entitytype is not spark_event_v01")
        ruAssert(
            "Spark", entities["primaryfilters"]["endApp"] == [
                'SparkListenerApplicationEnd'
            ], "[Smoke] endapp event missing from ats")
        ruAssert(
            "Spark", entities["primaryfilters"]["startApp"] == [
                'SparkListenerApplicationStart'
            ], "[Smoke] startapp event missing from ats")
        if not Machine.isLinux() and appName == "Spark Pi":
            ruAssert("Spark", entities["otherinfo"]["appName"] == "SparkPi",
                     "[Smoke] otherinfo -> appname is missing from ats")
        else:
            ruAssert("Spark", entities["otherinfo"]["appName"] == appName,
                     "[Smoke] otherinfo -> appname is missing from ats")

        ruAssert("Spark", entities["otherinfo"]["appUser"] == appUser,
                 "[Smoke] otherinfo -> appuser is missing from ats")
        ruAssert(
            "Spark",
            Spark.matchparamater(entities["otherinfo"]["startTime"],
                                 "[0-9]{13}"),
            "[Smoke] otherinfo -> starttime is missing from ats")
        ruAssert(
            "Spark",
            Spark.matchparamater(entities["otherinfo"]["endTime"],
                                 "[0-9]{13}"),
            "[Smoke] otherinfo -> endtime is missing from ats")
Example #26
    def run_rest_apis_test(self):
        '''
        Run checks to make sure the REST interfaces for the RM, NM, JHS and TimelineServer are up
        :return:
        '''
        from beaver.component.rollingupgrade.ruUpgrade import UpgradePerNode
        UpgradePerNode.reportProgress(
            "[INFO][YARN][RestTest] Testing REST interfaces for RM, NM, JHS and TimelineServer "
        )
        logger.info(
            "**** Testing REST interfaces for RM, NM, JHS and TimelineServer ****"
        )
        hostUrlMap = {}
        hostUrlExpectedStatusCode = {}
        rmAddress = YARN.getResourceManagerWebappAddress()
        rmPort = None
        nmPort = None
        jhsAddress = None
        atsAddress = YARN.get_ats_web_app_address()
        scheme = "http"

        if YARN.isHttpsEnabled():
            scheme = "https"
            if rmAddress.startswith("https://"):
                rmAddress = rmAddress[len("https://"):]
            nmPort = YARN.getNodeManagerWebappHttpsPort()
            jhsAddress = MAPRED.getHistoryServerWebappHttpsAddress()
            if jhsAddress.startswith("https://"):
                jhsAddress = jhsAddress[len("https://"):]
            if atsAddress.startswith("https://"):
                atsAddress = atsAddress[len("https://"):]
        else:
            if rmAddress.startswith("http://"):
                rmAddress = rmAddress[len("http://"):]
            nmPort = YARN.getNodeManagerWebappPort()
            jhsAddress = MAPRED.getHistoryServerWebappAddress()
            if jhsAddress.startswith("http://"):
                jhsAddress = jhsAddress[len("http://"):]
            if atsAddress.startswith("http://"):
                atsAddress = atsAddress[len("http://"):]

        rmPort = rmAddress.split(":")[1]
        hostUrlMap[rmAddress] = ["/ws/v1/cluster/info"]
        hostUrlExpectedStatusCode[rmAddress] = 200

        for nm in MAPRED.getTasktrackers():
            host = "%s:%s" % (nm, nmPort)
            hostUrlMap[host] = ["/ws/v1/node/info"]
            hostUrlExpectedStatusCode[host] = 200
        hostUrlMap[jhsAddress] = ["/ws/v1/history/info"]
        hostUrlExpectedStatusCode[jhsAddress] = 200
        hostUrlMap[atsAddress] = ["/ws/v1/timeline"]
        hostUrlExpectedStatusCode[atsAddress] = 200

        for host in hostUrlMap.keys():
            urls = hostUrlMap[host]
            for url in urls:
                fetch_url = scheme + "://" + host + url
                (return_code, data, headers) = util.query_yarn_web_service(
                    fetch_url,
                    Config.get('hadoop', 'HADOOPQA_USER'),
                    also_check_modified_config_for_spnego=False)
                if int(return_code) == hostUrlExpectedStatusCode[host]:
                    UpgradePerNode.reportProgress(
                        "[PASSED][YARN][RestTest] Got %s status code from url %s. Passed "
                        % (return_code, fetch_url))
                else:
                    UpgradePerNode.reportProgress(
                        "[FAILED][YARN][RestTest] Got %s status code from url %s. Failed "
                        % (return_code, fetch_url))
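The REST checks above go through beaver's util.query_yarn_web_service helper. As a rough standalone illustration of the same pattern, the sketch below probes a web-service URL with only the Python 3 standard library; the example URL is an assumption, not taken from a real cluster, and it does not handle SPNEGO or custom TLS the way the helper does:

    import urllib.error
    import urllib.request

    def check_endpoint(url, expected_status=200, timeout=10):
        # Return True if the endpoint answers with the expected HTTP status code.
        try:
            with urllib.request.urlopen(url, timeout=timeout) as resp:
                return resp.status == expected_status
        except urllib.error.HTTPError as err:
            return err.code == expected_status
        except urllib.error.URLError:
            return False

    # e.g. check_endpoint("http://rm-host:8088/ws/v1/cluster/info")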
Example #27
    def validate_lr_job(self):
        ################################################# Finish long running jobs
        ### Need to stop the HDFS, Falcon, and YARN long running jobs ####
        # create flagFile to kill HDFS background job
        from beaver.component.hadoop import HDFS, YARN
        from beaver.component.rollingupgrade.ruUpgrade import UpgradePerNode
        from beaver.component.rollingupgrade.ruCommon import Rollingupgrade

        TEST_USER = Config.get('hadoop', 'HADOOPQA_USER')
        createCmd = "dfs -touchz " + UpgradePerNode._HDFS_FLAG_FILE
        exit_code, output = HDFS.runas(TEST_USER, createCmd)
        logger.info(
            "=== Created HDFS flag file to signal background jobs to stop ==="
        )
        if self.DO_DOWNGRADE:
            self.removeComponentFromTest("falcon")
        if "falcon" in self.COMPONENTS_TO_TEST:
            from beaver.component.rollingupgrade.ruFalcon import ruFalcon
            ruFalcon.stopFalconLongRunningJob()
        if "yarn" in self.COMPONENTS_TO_TEST:
            from beaver.component.rollingupgrade.ruYarn import ruYARN
            ruYARN.stopYarnLongRunningJob()
        # if "hive" in self.COMPONENTS_TO_TEST:
        #     from beaver.component.rollingupgrade.ruHive import ruHive
        #     ruHive.stopHiveLongRunningJob()
        if "slider" in self.COMPONENTS_TO_TEST:
            from beaver.component.rollingupgrade.ruSlider import ruSlider
            ruSlider.stopSliderLongRunningJob()
        if "storm-slider" in self.COMPONENTS_TO_TEST:
            from beaver.component.rollingupgrade.ruStorm import ruStorm
            ruStorm.teardown_storm_slider_app()

        logger.info(
            "=== Stopped component long running jobs; waiting for running apps to drain ==="
        )
        ## TODO - wait for long running jobs to finish

        isZero = YARN.waitForZeroRunningApps()
        logger.info(
            "=== Finished waiting for running applications ==="
        )
        if isZero:
            UpgradePerNode.reportProgress("#### No apps are running. ####")
            UpgradeLogger.reportProgress("#### No apps are running. ####",
                                         True)
        else:
            UpgradePerNode.reportProgress(
                "#### Check failed: some apps are still running. ####")
            UpgradeLogger.reportProgress(
                "#### Check failed: some apps are still running. ####", False)
        #assert isZero, "not all long running jobs finished"

        ### List down Failed/Killed applications ####
        Failed_Killed_apps = YARN.getFailedKilledAppList()
        UpgradePerNode.reportProgress(
            "### Listing Killed/Failed applications while performing upgrade ####"
        )
        UpgradeLogger.reportProgress(
            "### Listing Killed/Failed applications while performing upgrade ####",
            False)

        for app in Failed_Killed_apps:
            queue = YARN.getQueueForApp(app)
            logger.info("%s on queue %s failed or was killed." % (app, queue))
            UpgradePerNode.reportProgress(
                "#### %s on queue %s failed or was killed. ####" %
                (app, queue))
            UpgradeLogger.reportProgress(
                "#### %s on queue %s failed or was killed. ####" %
                (app, queue), False)

        ## TODO - Validate long running jobs
        Rollingupgrade.verifyLongRunningJob(self.COMPONENTS_TO_TEST)

        ## KILL APPLICATIONS ####
        YARN.killAllApplications(useYarnUser=True)

        #logger.info("Running smoke tests after upgrade")
        #Rollingupgrade.run_smokeTests(COMPONENTS_TO_TEST)

        ## TODO - call Teardown for long running jobs
        Rollingupgrade.background_job_teardown(self.COMPONENTS_TO_TEST, None)
        UpgradePerNode.reportProgress("### Completed upgrade ####")
        UpgradeLogger.reportProgress("### Completed upgrade ####", True)
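The drain step above relies on YARN.waitForZeroRunningApps(). A generic version of that polling loop could look like the sketch below; the timeout and interval values are illustrative, and it assumes YARN.getApplicationIDList accepts a state filter as used in the other snippets:

    import time

    def wait_for_zero_running_apps(timeout=600, interval=15):
        # Poll the ResourceManager until no applications are RUNNING or the timeout expires.
        deadline = time.time() + timeout
        while time.time() < deadline:
            if not YARN.getApplicationIDList(state='RUNNING'):
                return True
            time.sleep(interval)
        return False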
Example #28
    def getContainerFromDropDown(self, appID, returnLocatorName=False):
        # Select the application's AM container (attempt 1) in the container drop-down.
        select = Select(self.getElement("FetchContainer", returnLocatorName))
        amContainer = YARN.getAMcontainerId(appID, 1)
        select.select_by_value(amContainer)
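For reference, the underlying Selenium pattern is simply Select plus select_by_value. A standalone sketch is shown below; the driver choice, URL, and locator are hypothetical, and only the container id follows YARN's AM container naming convention:

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import Select

    driver = webdriver.Firefox()                  # any WebDriver works
    driver.get("http://rm-host:8088/ui2")         # hypothetical RM UI URL
    dropdown = Select(driver.find_element(By.ID, "container-dropdown"))  # hypothetical locator
    dropdown.select_by_value("container_1700000000000_0001_01_000001")  # AM container id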
Example #29
    def getBaseUrl(self):
        from beaver.component.hadoop import YARN
        return YARN.getRMUrl()
Example #30
    def getBaseUrl(self):
        if self.proxy == 'true' and YARN.isKnoxProxySet():
            return self.PROXY_URL
        else:
            return YARN.getRMUrl() + "/ui2/#/cluster-overview"
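A brief usage sketch for the proxy-aware variant above, assuming the enclosing page object (ClusterOverviewPage is a hypothetical name) is constructed with the proxy flag and PROXY_URL referenced in the snippet:

    # Hypothetical call site: Knox proxy URL when enabled,
    # otherwise the RM /ui2 cluster-overview page.
    page = ClusterOverviewPage(proxy='true')
    base_url = page.getBaseUrl()
    logger.info("Navigating to %s" % base_url)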