Example #1
 def run(self, messages=None, host="localhost", port="9999"):  # pylint: disable=unused-argument
     if not messages:
         messages = ["hello world"]
     # Iterate over the messages passed in (falling back to the default above).
     for message in messages:
         Spark.startNetcatServerinBackground(message=message)
         time.sleep(4)
         Spark.stopNetcatServer(message=message)
Example #2
    def start_LongRunning_Federation_HDFS_stream_job(
            cls,
            inputDir,
            outputDir,
            num_executor,
            mode="yarn-client",
            inBackground=True,
            clientfile=None,
            pythonFile="federation_hdfs_wordcount.py",
            srcDir=None,
            keytab=None,
            principal=None):
        """
          Starts Spark-HDFS Streaming application using python file
          :param inputDir:
          :param outputDir:
          :param num_executor:
          :param mode:
          :param inBackground:
          :param clientfile:
          :param pythonFile: Python file which need to be run as spark streaming application
          :param srcDir: Path of the Python file
          :return: (application ID, Local client log)
          """
        if clientfile is None:
            Local_clientlog = Spark.createTmpClientFile(pythonFile + "_" + mode)
        else:
            Local_clientlog = Spark.createTmpClientFile(clientfile)

        if pythonFile == "federation_hdfs_wordcount.py":
            srcDir = os.path.join(Config.getEnv("WORKSPACE"), "tests", "spark",
                                  "examples", "streaming")

        arg = " %s %s 2>&1 | tee %s" % (inputDir, outputDir, Local_clientlog)

        Spark.submitSparkPyApplication(pythonFile,
                                       mode,
                                       arg,
                                       num_executor=num_executor,
                                       inBackground=inBackground,
                                       srcDir=srcDir,
                                       timeout=120,
                                       clientfile=clientfile,
                                       conf=None,
                                       keytab=keytab,
                                       principal=principal)

        with open(Local_clientlog, "r") as f:
            stdout = f.read()
        appId = YARN.getApplicationIDFromStdout(stdout)
        return appId, Local_clientlog
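A minimal call-site sketch for the helper above. The class name SparkStreamingJobs and the HDFS paths are hypothetical; the keyword arguments follow the signature shown in this example.

# Hypothetical usage (class name and HDFS paths are illustrative only):
app_id, client_log = SparkStreamingJobs.start_LongRunning_Federation_HDFS_stream_job(
    inputDir="/user/hrt_qa/stream_input",    # hypothetical HDFS input dir
    outputDir="/user/hrt_qa/stream_output",  # hypothetical HDFS output dir
    num_executor=2,
    mode="yarn-client",
    inBackground=True)
logger.info("Streaming app %s logs locally to %s", app_id, client_log)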
Example #3
    def run_smoke_test(cls, config=None):
        '''
        Run smoke test for Spark
        '''
        logger.info("config = %s", config)
        from beaver.component.spark import Spark
        from beaver.component.rollingupgrade.ruUpgrade import UpgradePerNode
        UpgradePerNode.reportProgress(
            "[INFO][Spark][Smoke] Smoke test for Spark started ")
        exit_code, _ = Spark.submitSparkApplication(
            "org.apache.spark.examples.SparkPi", "yarn-cluster", "3")
        if exit_code != 0:
            UpgradePerNode.reportProgress(
                "[FAILED][Spark][Smoke] SparkPi Smoke Test Failed in Yarn-cluster mode"
            )
            return

        exit_code, stdout2 = Spark.submitSparkApplication(
            "org.apache.spark.examples.SparkPi", "yarn-client", "3")
        if exit_code != 0:
            UpgradePerNode.reportProgress(
                "[FAILED][Spark][Smoke] SparkPi Smoke Test Failed in Yarn-client mode"
            )
            return

        if Machine.isWindows():
            appName_pi = "SparkPi"
        else:
            appName_pi = "Spark Pi"
        HADOOP_QA = Config.get('hadoop', 'HADOOPQA_USER')
        appId = YARN.getApplicationIDFromStdout(stdout2).strip()
        logger.info(
            "Validate http://<host>:<port>/ws/v1/timeline/spark_event_v01/<appId>"
        )
        Spark.getSparkATSAppUrl(appId)
        time.sleep(30)
        # Spark-ats check. We will enable it once Ambari enables Spark-ATS by default
        #cls.validate_ApplicationEntry(appId, appName_pi, HADOOP_QA, mode="yarn-client", url=url)
        Spark.hitSparkURL()
        time.sleep(50)
        result_HS_completeApp = Spark.validateSparkHSCompletedApps(
            appId, appName_pi, HADOOP_QA)
        if not result_HS_completeApp:
            UpgradePerNode.reportProgress(
                "[FAILED][Spark][Smoke] SparkPi Spark HS complete App Validation failed"
            )
            return
        result_HS_Jobs = Spark.validateSparkHSJobs(appId, "1/1", "3/3")
        if not result_HS_Jobs:
            UpgradePerNode.reportProgress(
                "[FAILED][Spark][Smoke] SparkPi Spark HS Job page validation failed"
            )
            return
Example #4
 def Spark_getSparkLogDir(cls, logoutput=True):
     try:
         from beaver.component.spark import Spark
         return Spark.getSparkLogDir()
     except Exception as e:
         if logoutput:
             logger.error(
                 "Exception occured during Spark_getSparkLogDir() call: %s",
                 str(e))
         return None
Example #5
    def start_LongRunning_HDFS_stream_job(cls,
                                          inputDir,
                                          num_executor,
                                          mode="yarn-client",
                                          inBackground=True,
                                          clientfile=None):
        '''
        Start the Spark-HDFS streaming application.
        '''
        className = "org.apache.spark.examples.streaming.HdfsWordCount"
        if mode == "yarn-client" and not HDFS.isASV():
            jars = Spark.getLzoJar()
        else:
            jars = None
        if clientfile is None:
            Local_clientlog = Spark.createTmpClientFile(className + "_" + mode)
        else:
            Local_clientlog = Spark.createTmpClientFile(clientfile)
        arg = " %s 2>&1 | tee %s" % (inputDir, Local_clientlog)
        if Hadoop.isSecure():
            keytab = Machine.getHeadlessUserKeytab(
                Config.get('hadoop', 'HADOOPQA_USER'))
            principal = Machine.get_user_principal(
                Config.get('hadoop', 'HADOOPQA_USER'))
        else:
            keytab = None
            principal = None

        Spark.submitSparkApplication(className,
                                     mode,
                                     arg,
                                     jars=jars,
                                     num_executor=num_executor,
                                     inBackground=inBackground,
                                     timeout=120,
                                     keytab=keytab,
                                     principal=principal)
        with open(Local_clientlog, "r") as f:
            stdout = f.read()
        appId = YARN.getApplicationIDFromStdout(stdout)
        return appId, Local_clientlog
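A hedged sketch combining this helper with validate_HDFS_stream_job from Example #8 below; the class name, input directory, patterns, and sleep interval are illustrative.

# Hypothetical driver: start the HdfsWordCount stream, let a few batches run,
# then check that the expected words show up in the local client log.
app_id, client_log = SparkStreamingJobs.start_LongRunning_HDFS_stream_job(
    inputDir="/user/hrt_qa/hdfs_stream_input",  # hypothetical input dir
    num_executor=2,
    mode="yarn-client")
time.sleep(60)  # allow the streaming job to process a few batches
SparkStreamingJobs.validate_HDFS_stream_job(
    app_id, "yarn-client",
    patterns=["hello", "world"], expected_count=1,
    clientfile=client_log)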
Example #6
    def getdnjars(cls):
        # Workaround BUG-58287 org.datanucleus.api.jdo.JDOPersistenceManagerFactory
        spark_lib_dir = os.path.join(Spark.getSparkHome(), "lib")
        dn_jars = util.findMatchingFiles(spark_lib_dir, "datanucleus*.jar")
        # join the datanucleus jars into a single comma-separated list
        jars = ",".join(dn_jars)
        return jars
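The comma-separated string returned above is shaped for a jars-style argument; a sketch of how it might be passed to submitSparkApplication (the owning class and application class name are hypothetical):

# Hypothetical call site: put the datanucleus jars on the classpath so the JDO
# classes referenced in BUG-58287 can be resolved at runtime.
dn_jars = SparkHiveJobs.getdnjars()  # hypothetical owning class
Spark.submitSparkApplication(
    "org.apache.spark.examples.sql.hive.HiveFromSpark",  # illustrative class name
    "yarn-client",
    "",
    jars=dn_jars)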
Example #7
    def __init__(self, is_proxy=False):
        self.is_proxy = is_proxy
        self.ambari_connector = ZeppelinAmbariAPIUtil()
        self.driver = SparkUIClientSession.__instantiate_webdriver()
        assert self.driver, "Could not initialize selenium webdriver"

        if self.is_proxy:
            self.shs_proxy_url = self.get_shs_proxy_url()
            assert self.shs_proxy_url, "Failed to find SHS knox proxy URL"

        self.shs_direct_url = Spark.getSparkHistoryServerUrl()
        assert self.shs_direct_url, "Failed to find SHS direct URL"

        self.ambari_url = self.get_ambari_url()
        assert self.ambari_url, "Failed to find Ambari web URL"
Example #8
    def validate_HDFS_stream_job(cls,
                                 appId,
                                 mode,
                                 patterns,
                                 expected_count,
                                 clientfile=None):
        '''
        Count the occurrences of each word in the YARN logs:
            -> check clientfile for yarn-client mode
            -> check YARN logs for yarn-cluster mode

        appId : application ID
        mode : mode of execution
        patterns : list of words to check for in the log
        expected_count : the expected number of occurrences of each word in patterns
        clientfile : job client output for the app
        '''
        if mode == "yarn-client":
            file_to_read = clientfile
        else:
            file_to_read = Spark.createTmpClientFile(appId + ".log")
            YARN.getLogsApplicationID(appId,
                                      appOwner=None,
                                      nodeAddress=None,
                                      containerId=None,
                                      logoutput=False,
                                      grepFilter=None,
                                      pipeToFileOutput=file_to_read,
                                      config=None)

        count = 0
        word_count = {}
        # initialize the word_count dictionary
        for p in patterns:
            word_count[p] = 0
        with open(file_to_read) as f:
            for line in f:
                words = line.split()
                for word in words:
                    if word in word_count:
                        word_count[word] += 1

        logger.info(word_count)
        for key, value in word_count.items():
            assert value >= expected_count, "%s wordcount is %s. expected_count is %s" % (
                key, value, expected_count)
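For yarn-cluster mode the same validator can be called without a client log, since (as shown above) it pulls the aggregated YARN application logs itself; the class name and patterns are illustrative.

# Hypothetical yarn-cluster check: no local client log is passed, so the helper
# fetches the YARN logs for the application ID and scans those instead.
SparkStreamingJobs.validate_HDFS_stream_job(
    app_id, "yarn-cluster",
    patterns=["hello", "world"], expected_count=1)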
Example #9
    def validate_ApplicationEntry(cls,
                                  appId,
                                  appName,
                                  appUser,
                                  mode="yarn-client",
                                  url=None):
        '''
        Validate the application entry in ATS.
        :param appId: application ID
        :param appName: application name
        :param appUser: application user
        :param mode: mode of execution (yarn-client or yarn-cluster)
        :param url: optional ATS URL; when set, only the generic application JSON is gathered
        :return:
        '''
        from beaver.component.spark import Spark
        if not url:
            entities = Spark.getCorrectApplicationJsonData(appId)
        else:
            entities = Spark.getCorrectApplicationJsonData(
                appId, url, gatherAppSpecificJson=False)

        logger.info("***** entities *****")
        logger.info(entities)
        logger.info("********************")

        if mode == "yarn-cluster":
            ruAssert(
                "Spark", entities["entity"] == YARN.createAttemptIdFromAppId(
                    appId, "1"), "[Smoke] attemptid entity not found in ATS")
        else:
            ruAssert("Spark", entities["entity"] == appId,
                     "[Smoke] appid entity not found in ATS")
        ruAssert("Spark", entities["domain"] == "DEFAULT",
                 "[Smoke] domain is not default")
        ruAssert("Spark", entities["entitytype"] == "spark_event_v01",
                 "[Smoke] entitytype is not spark_event_v01")
        ruAssert(
            "Spark", entities["primaryfilters"]["endApp"] == [
                'SparkListenerApplicationEnd'
            ], "[Smoke] endapp event missing from ats")
        ruAssert(
            "Spark", entities["primaryfilters"]["startApp"] == [
                'SparkListenerApplicationStart'
            ], "[Smoke] startapp event missing from ats")
        if not Machine.isLinux() and appName == "Spark Pi":
            ruAssert("Spark", entities["otherinfo"]["appName"] == "SparkPi",
                     "[Smoke] otherinfo -> appname is missing from ats")
        else:
            ruAssert("Spark", entities["otherinfo"]["appName"] == appName,
                     "[Smoke] otherinfo -> appname is missing from ats")

        ruAssert("Spark", entities["otherinfo"]["appUser"] == appUser,
                 "[Smoke] otherinfo -> appuser is missing from ats")
        ruAssert(
            "Spark",
            Spark.matchparamater(entities["otherinfo"]["startTime"],
                                 "[0-9]{13}"),
            "[Smoke] otherinfo -> starttime is missing from ats")
        ruAssert(
            "Spark",
            Spark.matchparamater(entities["otherinfo"]["endTime"],
                                 "[0-9]{13}"),
            "[Smoke] otherinfo -> endtime is missing from ats")