def start(self):
    jvm = self.gateway.jvm
    self.job.addMrGeoProperties()
    dpf_properties = jvm.DataProviderFactory.getConfigurationFromProviders()
    for prop in dpf_properties:
        self.job.setSetting(prop, dpf_properties[prop])

    if self.job.isDebug():
        master = "local"
    elif self.job.isSpark():
        # TODO: get the master for spark
        master = ""
    elif self.job.isYarn():
        master = "yarn-client"
    else:
        cpus = (multiprocessing.cpu_count() / 4) * 3
        if cpus < 2:
            master = "local"
        else:
            master = "local[" + str(cpus) + "]"

    set_field(self.job, "jars",
              jvm.StringUtils.concatUnique(
                  jvm.DependencyLoader.getAndCopyDependencies("org.mrgeo.mapalgebra.MapAlgebra", None),
                  jvm.DependencyLoader.getAndCopyDependencies(jvm.MapOpFactory.getMapOpClassNames(), None)))

    conf = jvm.MrGeoDriver.prepareJob(self.job)

    # need to override the yarn mode to "yarn-client" for python
    if self.job.isYarn():
        conf.set("spark.master", "yarn-client")

        mem = jvm.SparkUtils.humantokb(conf.get("spark.executor.memory"))
        workers = int(conf.get("spark.executor.instances")) + 1  # one for the driver

        conf.set("spark.executor.memory", jvm.SparkUtils.kbtohuman(long(mem / workers), "m"))

    # for a in conf.getAll():
    #     print(a._1(), a._2())

    # jsc = jvm.JavaSparkContext(master, appName, sparkHome, jars)
    jsc = jvm.JavaSparkContext(conf)
    self.sparkContext = jsc.sc()
    self.sparkPyContext = SparkContext(master=master, appName=self.job.name(), jsc=jsc, gateway=self.gateway)
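# Illustrative sketch (not part of the original source): the fallback branch above keeps
# roughly three quarters of the machine's cores for a local Spark master. With Python 2
# integer division, 8 cores -> (8 / 4) * 3 = 6 -> "local[6]", while fewer than 4 cores
# fall back to plain "local".
import multiprocessing


def local_master_string(cpu_count):
    cpus = (cpu_count // 4) * 3  # // mirrors the Python 2 integer division above
    return "local" if cpus < 2 else "local[" + str(cpus) + "]"

# print(local_master_string(multiprocessing.cpu_count()))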
def _create_job(self):
    if not self._job:
        jvm = self._get_jvm()
        java_import(jvm, "org.mrgeo.job.*")

        appname = "PyMrGeo"

        self._job = jvm.JobArguments()
        java_gateway.set_field(self._job, "name", appname)

        # Yarn is the default
        self.useyarn()
def start(self, context=None):
    if not context:
        jvm = self._get_jvm()
        job = self._get_job()
        job.addMrGeoProperties()
        dpf_properties = jvm.DataProviderFactory.getConfigurationFromProviders()
        for prop in dpf_properties:
            job.setSetting(prop, dpf_properties[prop])

        if job.isYarn():
            job.loadYarnSettings()

        java_gateway.set_field(job, "jars",
                               jvm.StringUtils.concatUnique(
                                   jvm.DependencyLoader.getAndCopyDependencies("org.mrgeo.mapalgebra.MapAlgebra", None),
                                   jvm.DependencyLoader.getAndCopyDependencies(jvm.MapOpFactory.getMapOpClassNames(), None)))

        conf = jvm.MrGeoDriver.prepareJob(job)

        if job.isYarn():
            # need to override the yarn mode to "yarn-client" for python
            conf.set("spark.master", "yarn-client")

            if not conf.getBoolean("spark.dynamicAllocation.enabled", False):
                conf.set("spark.executor.instances", str(job.executors()))
                conf.set("spark.executor.cores", str(job.cores()))

            # in yarn-cluster, this is the total memory in the cluster, but here in yarn-client, it is
            # the memory per executor. Go figure!
            mem = job.executorMemKb()

            overhead = conf.getInt("spark.yarn.executor.memoryOverhead", 384)
            if (mem * 0.1) > overhead:
                overhead = mem * 0.1
            if overhead < 384:
                overhead = 384

            mem -= (overhead * 2)  # overhead is 1x for driver and 1x for application master (am)
            conf.set("spark.executor.memory", jvm.SparkUtils.kbtohuman(long(mem), "m"))

        jsc = jvm.JavaSparkContext(conf)
        jsc.setCheckpointDir(jvm.HadoopFileUtils.createJobTmp(jsc.hadoopConfiguration()).toString())
        self.sparkContext = jsc.sc()
    else:
        self.sparkContext = context
def _create_job(self):
    jvm = self.gateway.jvm
    java_import(jvm, "org.mrgeo.data.DataProviderFactory")
    java_import(jvm, "org.mrgeo.job.*")
    java_import(jvm, "org.mrgeo.utils.DependencyLoader")
    java_import(jvm, "org.mrgeo.utils.StringUtils")

    appname = "PyMrGeo"

    self.job = jvm.JobArguments()
    set_field(self.job, "name", appname)

    # Yarn is the default
    self.useyarn()
def start(self):
    jvm = self.gateway.jvm
    self.job.addMrGeoProperties()
    dpf_properties = jvm.DataProviderFactory.getConfigurationFromProviders()
    for prop in dpf_properties:
        self.job.setSetting(prop, dpf_properties[prop])

    if self.job.isDebug():
        master = "local"
    elif self.job.isSpark():
        # TODO: get the master for spark
        master = ""
    elif self.job.isYarn():
        master = "yarn-client"
    else:
        cpus = (multiprocessing.cpu_count() / 4) * 3
        if cpus < 2:
            master = "local"
        else:
            master = "local[" + str(cpus) + "]"

    set_field(self.job, "jars",
              jvm.StringUtils.concatUnique(
                  jvm.DependencyLoader.getAndCopyDependencies("org.mrgeo.mapalgebra.MapAlgebra", None),
                  jvm.DependencyLoader.getAndCopyDependencies(jvm.MapOpFactory.getMapOpClassNames(), None)))

    conf = jvm.MrGeoDriver.prepareJob(self.job)

    # need to override the yarn mode to "yarn-client" for python
    if self.job.isYarn():
        conf.set("spark.master", "yarn-client")

        if not conf.getBoolean("spark.dynamicAllocation.enabled", False):
            mem = jvm.SparkUtils.humantokb(conf.get("spark.executor.memory"))
            workers = int(conf.get("spark.executor.instances")) + 1  # one for the driver

            conf.set("spark.executor.memory", jvm.SparkUtils.kbtohuman(long(mem / workers), "m"))

    # for a in conf.getAll():
    #     print(a._1(), a._2())

    # jsc = jvm.JavaSparkContext(master, appName, sparkHome, jars)
    jsc = jvm.JavaSparkContext(conf)
    self.sparkContext = jsc.sc()
    self.sparkPyContext = SparkContext(master=master, appName=self.job.name(), jsc=jsc, gateway=self.gateway)
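# Illustrative sketch (not part of the original source): the yarn-client branch above
# re-divides the configured executor memory so the driver gets a share as well. For
# example, with spark.executor.memory=8g and spark.executor.instances=3, each of the
# four processes (3 executors + 1 driver) ends up with 8 GB / 4 = 2 GB.
def split_executor_memory_kb(executor_mem_kb, executor_instances):
    workers = executor_instances + 1  # one extra share for the driver
    return executor_mem_kb // workers

# split_executor_memory_kb(8 * 1024 * 1024, 3) -> 2097152 KB (2 GB per process)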
def start(self):
    if self._started:
        # print("MrGeo is already started")
        return

    jvm = self._get_jvm()
    job = self._get_job()
    job.addMrGeoProperties()
    dpf_properties = jvm.DataProviderFactory.getConfigurationFromProviders()
    for prop in dpf_properties:
        job.setSetting(prop, dpf_properties[prop])

    jvm.DependencyLoader.setPrintMissingDependencies(False)
    jvm.DependencyLoader.resetMissingDependencyList()

    java_gateway.set_field(job, "jars",
                           jvm.StringUtils.concatUnique(
                               jvm.DependencyLoader.getAndCopyDependencies("org.mrgeo.mapalgebra.MapAlgebra", None),
                               jvm.DependencyLoader.getAndCopyDependencies(jvm.MapOpFactory.getMapOpClassNames(), None)))

    conf = jvm.MrGeoDriver.prepareJob(job)

    jvm.DependencyLoader.printMissingDependencies()

    if self._localGateway:
        if job.isYarn():
            job.loadYarnSettings()

            # need to override the yarn mode to "yarn-client" for python
            conf.set("spark.master", "yarn-client")

            if not conf.getBoolean("spark.dynamicAllocation.enabled", False):
                conf.set("spark.executor.instances", str(job.executors()))
                conf.set("spark.executor.cores", str(job.cores()))

            # in yarn-cluster, this is the total memory in the cluster, but here in yarn-client, it is
            # the memory per executor. Go figure!
            mem = job.executorMemKb()

            overhead = conf.getInt("spark.yarn.executor.memoryOverhead", 384)
            if (mem * 0.1) > overhead:
                overhead = mem * 0.1
            if overhead < 384:
                overhead = 384

            mem -= (overhead * 2)  # overhead is 1x for driver and 1x for application master (am)
            conf.set("spark.executor.memory", jvm.SparkUtils.kbtohuman(long(mem), "m"))

        jsc = jvm.JavaSparkContext(conf)
        jsc.setCheckpointDir(jvm.HadoopFileUtils.createJobTmp(jsc.hadoopConfiguration()).toString())
        self.sparkContext = jsc.sc()

    self._started = True
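# Illustrative sketch (not part of the original source): the yarn-client branch above
# reserves extra headroom before setting spark.executor.memory. The reserve is the larger
# of the configured spark.yarn.executor.memoryOverhead and 10% of the executor memory,
# never less than 384, and it is subtracted twice: once for the driver and once for the
# application master. For example, with mem = 8388608 (8 GB in KB) and the default
# overhead of 384, 10% of mem (838860.8) wins, so the final value is
# 8388608 - 2 * 838860.8 = 6710886.4, i.e. roughly 6.4 GB.
def memory_after_overhead(mem, configured_overhead=384):
    overhead = configured_overhead
    if mem * 0.1 > overhead:
        overhead = mem * 0.1
    if overhead < 384:
        overhead = 384
    return mem - overhead * 2  # 1x for the driver, 1x for the application master (am)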