Example #1
    def validate_wordcount_written_to_HDFS(cls,
                                           hdfs_dir,
                                           patterns,
                                           expected_count,
                                           appId=None):
        """
          Validate the wordcount results written into HDFS directories by a streaming job.
          Use wildcards in 'hdfs_dir' to read sub-directories recursively.

          :param hdfs_dir: HDFS directory from which contents will be read
          :param patterns: list of words to check
          :param expected_count: the expected number of occurrences of each word in 'patterns'
          :param appId: application ID (optional)
          :return:
          """
        count = 0
        word_count = {}
        # initialize the word_count dictionary
        for p in patterns:
            word_count[p] = 0

        exit_code, cat_content = HDFS.cat(hdfs_dir, logoutput=True)
        assert exit_code == 0, "Could not read from %s, Error: %s, appId: %s" % (
            hdfs_dir, cat_content, appId)
        for line in cat_content:
            words = line.split()
            for word in words:
                if word in word_count:
                    word_count[word] += 1

        logger.info(word_count)
        for key, value in word_count.items():
            assert value >= expected_count, "%s wordcount is %s. expected_count is %s, appId: %s" % \
                                            (key, value, expected_count, appId)
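A minimal usage sketch for the helper above, assuming it is called from the same test class that submitted the streaming job; the HDFS path, word list, expected count and application ID are purely illustrative:

    # Hypothetical call site: each listed word must appear at least 3 times in
    # the files the streaming job wrote under the wildcarded output directory.
    cls.validate_wordcount_written_to_HDFS(
        "/user/hadoopqa/streaming/wordcount/*",
        ["hello", "world"],
        3,
        appId="application_1234567890123_0001")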
Example #2
    def checkClasspathVersion(cls, Version_Num, config=None):
        Local_Test_dir = os.path.join(Config.getEnv("WORKSPACE"), "tests",
                                      "rolling_upgrade", "yarn")
        Multi_Version_App_Dir = os.path.join(Local_Test_dir, "data")
        Mapper = "data/versionVerifyMapper.py"
        Reducer = "data/versionVerifyReducer.py"
        Verify_File_Name = "test.txt"
        Verify_Test_File = os.path.join(Multi_Version_App_Dir,
                                        Verify_File_Name)
        # Set up env
        mapred_app_path = MAPRED.getConfigValue(
            "mapreduce.application.framework.path", None)
        mapred_classpath = MAPRED.getConfigValue(
            "mapreduce.application.classpath", None)
        env = {
            "mapreduce.application.framework.path": mapred_app_path,
            "mapreduce.application.classpath": mapred_classpath
        }
        verifyInput = cls._hdfs_input + "/verify"
        HDFS.createDirectory(verifyInput, None, "777", False)
        # Copy template files for the verifier streaming job
        with open(Verify_Test_File, 'w') as templateFile:
            templateFile.write(Version_Num)
        HDFS.copyFromLocal(Verify_Test_File,
                           verifyInput,
                           user=Config.get('hadoop', 'HADOOPQA_USER'))
        # Submit the special streaming job
        shortStreamingId = HadoopJobHelper.runStreamJob(
            Mapper,
            Reducer,
            verifyInput,
            cls._hdfs_output_verify,
            files=Multi_Version_App_Dir,
            config=config,
            extraJobArg=cls._jobArgs,
            env=env,
            proposedJobName=cls._shortStreamingName)
        MAPRED.waitForJobDoneOrTimeout(shortStreamingId, timeoutInSec=180)
        # Make sure task succeeded
        #assert YARN.getAppFinalStateFromID(appId) == 'SUCCEEDED'

        # Check result content
        retVal, checkContent = HDFS.cat(cls._hdfs_output_verify +
                                        '/part-00000')
        logger.info("CHECK CLASSPATH VERSION OUTPUT")
        logger.info(retVal)
        logger.info(checkContent)
        ruAssert("YARN", retVal == 0)
        ruAssert("YARN", 'True' in checkContent,
                 "[VersionVerify] Stream job returns false: " + checkContent)
        #assert retVal == 0
        #assert 'True' in checkContent, "Stream job returns false: " + checkContent
        #assert 'False' not in checkContent, "Stream job returns false: " + checkContent
        HDFS.deleteDirectory(cls._hdfs_output_verify,
                             user=Config.get('hadoop', 'HADOOPQA_USER'))
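Example #4 below shows how this check is driven; a condensed sketch of that call pattern, using the same release lookup (the config argument may be None):

    # Resolve the current hadoop-client release and verify that the classpath
    # seen by the verifier streaming job matches it (as done in Example #4).
    from beaver.component.rollingupgrade.ruCommon import hdpRelease
    Version_Num = hdpRelease.getCurrentRelease("hadoop-client")
    cls.checkClasspathVersion(Version_Num, config=None)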
Example #3
    def verify_hdfs_topology(cls, topologyName, targetDir, lines, type,
                             useStandaloneCmd):
        """
            Verifies the hdfs topologies produced expected output
        """
        # The Slider app is killed before long-running job verification, so topology activation checks are disabled.
        if useStandaloneCmd:
            ruAssert(
                "Storm",
                Storm.getTopologyStatus(
                    topologyName,
                    logoutput=True,
                    useStandaloneCmd=useStandaloneCmd) == 'ACTIVE')

        exit_code, stdout = HDFS.lsr(targetDir, False, True)
        hdfsListOutput = stdout.splitlines()

        # Pick the second-to-last line of the listing: the first file might not have
        # enough content, and the last file can run into transient HDFS issues.
        if len(hdfsListOutput) >= 2:
            fileLine = hdfsListOutput[-2]
            sampleoutfile = fileLine.split(" ")[-1].strip()

            # Hacky solution, as the test code for trident and core topologies writes under the same directory.
            # if fileLine.endswith(".txt") and type == "cat":
            #     sampleoutfile = fileLine.split(" ")[-1].strip()
            # if fileLine.endswith(".seq") and type == "text":
            #     sampleoutfile = fileLine.split(" ")[-1].strip()

            logger.info("Taking sampleoutput file : %s" % (sampleoutfile))

            if type == "text":
                exit_code, stdout = HDFS.text(sampleoutfile, None)
            else:
                exit_code, stdout = HDFS.cat(sampleoutfile, None)
            for line in lines:
                ruAssert(
                    "Storm",
                    stdout.find(line) >= 0,
                    "[StormHDFSVerify] expected line : %s in %s" %
                    (line, sampleoutfile))
        else:
            ruAssert("Storm", False,
                     "hdfsListOutput must have at least 2 lines")
Example #4
    def verifyLongRunningJob(cls, config=None):
        '''
        Verify the long-running background job after it finishes
        :return:
        '''
        from beaver.component.rollingupgrade.ruUpgrade import UpgradePerNode
        retVal, checkContent = HDFS.cat(cls._hdfs_output + '/part-00000')
        #assert retVal == 0
        if retVal == 0:
            UpgradePerNode.reportProgress(
                "[PASSED][YARN][BGJobCheck] verifyLongRunning Job for Yarn, retVal = 0. Successful check "
            )
        else:
            UpgradePerNode.reportProgress(
                "[FAILED][YARN][BGJobCheck] verifyLongRunning Job for Yarn, retVal != 0. Failed check "
            )
        #assert 'true' in checkContent, "Stream job returns false: " + checkContent
        if 'true' in checkContent:
            UpgradePerNode.reportProgress(
                "[PASSED][YARN][BGJobCheck] verifyLongRunning Job for Yarn, true in checkContent. Successful check "
            )
        else:
            UpgradePerNode.reportProgress(
                "[FAILED][YARN][BGJobCheck] verifyLongRunning Job for Yarn, true not in checkContent. Failed check  "
            )

        # Verify that the application's attempt count doesn't increase and there are no failed tasks.
        appID = cls._background_job_appId
        jobID = cls._background_job_jobId
        # temporarily skipping check
        #assert YARN.getNumAttemptsForApp(appID) == 1
        #YARN.verifyMRTasksCount(jobID, appID, 0, skipAssert=True)

        from beaver.component.rollingupgrade.ruCommon import hdpRelease
        Version_Num = hdpRelease.getCurrentRelease("hadoop-client")
        cls.checkClasspathVersion(Version_Num, config)
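A sketch of where this verification would typically run, assuming the class attributes referenced above (_hdfs_output, _background_job_appId, _background_job_jobId) were populated when the background job was launched; the call site is hypothetical:

    # After the rolling upgrade completes and the long-running background job
    # has finished, verify its output and the classpath version.
    cls.verifyLongRunningJob(config=None)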