def runProduction():
    nCores = cluster.getNCores()
    nSlaves = cluster.getNSlaves()
    maxSlaves = int(nCores / 2)
    memory = cluster.getMemory()
    cluster.runOnMaster(
        'hdfs dfs -rm -r -f /brisbane1-T4000-BP__BP_R150cm_T2000.avro')
    command = ' '.join([
        'spark-submit',
        # '--jars /mnt/data/package/softs/jars/AvroToPythonConverters-1.3.0.jar',
        '--jars /mnt/data/package/softs/jars/spark-avro_2.10-2.0.1-SNAPSHOT.jar',
        'irt/processor/scripts/prodLauncher.py',
        'runSparkAvro brisbane1-T4000-BP',
        '--mode cluster',
        '--block 1000',
        '--band P',
        '--res 1.5',
        '--modeStore memory',
        '--tile 2000',
        '--nbThreads 2',
        '--nbSlaves {0}'.format(nSlaves),
        '--maxSlaves {0}'.format(maxSlaves),
        '--modeProduction L2area'])
    cluster.runOnMasterX(command)
def syncEvalToMaster():
    listCommand = []
    (listnodes, nodes) = cluster.instanceListAll()
    for namenode in listnodes['slaves']:
        listCommand.append(
            'rsync -lva {0}:/mnt/data/tmp/DEEPEVAL /mnt/data/tmp/'.format(namenode))
    command = ';'.join(listCommand)
    cluster.runOnMaster(command)
def addjars(jars):
    # Accept either a single jar path (string) or a list of jar paths:
    # a plain string exposes split(), so it gets wrapped in a list,
    # while a list is left untouched.
    try:
        jars.split(':')
        jars = [jars]
    except AttributeError:
        pass

    for j in jars:
        # Append an export block for this jar to setup_spark.sh so that
        # SPARKJARS accumulates a comma-separated list of jars.
        cluster.runOnMaster("echo '' >> setup_spark.sh")
        addJar = []
        addJar.append('if [[ -z "$SPARKJARS" ]]; then ')
        addJar.append(' export SPARKJARS={0}'.format(j))
        addJar.append('else ')
        addJar.append(' export SPARKJARS={0},$SPARKJARS'.format(j))
        addJar.append('fi')
        c = ["echo '" + a + "' >> setup_spark.sh" for a in addJar]
        print c
        cluster.runOnAllNodesAsync(';'.join(c))

    # Register the jars on the driver and executor classpaths.
    jars = ':'.join(jars)
    command = []
    command.append(
        'echo "" >> {0}'.format(softdir + '/conf/spark-defaults.conf'))
    command.append('echo "spark.driver.extraClassPath {0}" >> {1}'.format(
        jars, softdir + '/conf/spark-defaults.conf'))
    command.append('echo "spark.executor.extraClassPath {0}" >> {1}'.format(
        jars, softdir + '/conf/spark-defaults.conf'))
    command = ';'.join(command)
    cluster.runOnAllNodesAsync(command)
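# Usage sketch for addjars() (the jar paths below are only illustrative):
# it accepts either a single jar path or a list of jar paths, e.g.
#
#     addjars('/mnt/data/package/softs/jars/spark-avro_2.10-2.0.1-SNAPSHOT.jar')
#     addjars(['/path/to/a.jar', '/path/to/b.jar'])
#
# Each jar ends up exported through SPARKJARS in setup_spark.sh and appended
# to the driver/executor extraClassPath entries in spark-defaults.conf.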
def cleanHdfs():
    cluster.runOnMaster('hdfs dfs -rm -r -f /brisbane1-*')
def setupConfigurationFiles():
    """Deploy spark configuration files"""
    (listnodes, nodes) = cluster.instanceListAll()

    # We create here a fake python link.
    # This python is used as the main spark driver.
    # If we need to change the spark python driver,
    # we just have to overwrite this link.
    cluster.runOnAllNodesAsync('ln -fs `which python` {0}'.format(pathpython))

    # -------------------------------------------
    # handling of slaves
    # -------------------------------------------
    # The slave file contains information
    # about which hosts have to be used
    outSlave = utils.getLocalTempFile('slaves')
    with open(outSlave, 'w') as streamOut:
        for namenode in listnodes['slaves']:
            streamOut.write(namenode + '\n')

    # -------------------------------------------
    # handling of spark configuration
    # -------------------------------------------
    if cluster.getNSlaves() > 0:
        sparkMaster = 'spark://{0}:7077'.format(cluster.getMasterName())
    else:
        sparkMaster = 'local[{0}]'.format(cluster.nCores - 1)

    inConf = utils.getTemplateFile('spark-spark-defaults.conf')
    outConf = utils.getLocalTempFile('spark-defaults.conf')
    maxSlaves = int(cluster.nCores - 1)
    utils.stringReplaceInFile(
        inConf, outConf, {
            'XX-DRIVER-MEM-XX': mastermemory,
            'XX-EXECUTOR-MEM-XX': executormemory,
            'XX-SPARKMASTER-XX': sparkMaster,
            'XX-LOCAL-DIR-XX': localtempdir,
            'XX-CORES-XX': '{0}'.format(maxSlaves)
        })

    inEnv = utils.getTemplateFile('spark-spark-env.sh')
    outEnv = utils.getLocalTempFile('spark-env.sh')
    utils.stringReplaceInFile(
        inEnv, outEnv, {
            'XX-PYSPARK_PYTHON-XX': '"{0}"'.format(pathpython),
            'XX-SPARKMASTER-XX': sparkMaster,
            'XX-PYSPARK_DRIVER_PYTHON-XX': '"{0}"'.format(pathpython),
            'XX-PYTHONPATH-XX': '"{0}"'.format(pythonpath),
            'XX-LOCAL-DIR-XX': localtempdir,
            'XX-MASTER-IP-XX': '"{0}"'.format(cluster.ipGetMaster())
        })

    cluster.rsyncOnAllNodesLocalhostToLocalAsync(
        outSlave, softdir + '/conf/slaves')
    cluster.rsyncOnAllNodesLocalhostToLocalAsync(
        outConf, softdir + '/conf/spark-defaults.conf')
    cluster.rsyncOnAllNodesLocalhostToLocalAsync(
        outEnv, softdir + '/conf/spark-env.sh')
    cluster.runOnAllNodesAsync('mkdir -p /tmp/spark-events')

    # we remove info level display from spark...
    cluster.runOnMaster(
        'sed -i "s/log4j.rootCategory=INFO/log4j.rootCategory=WARN/g" {0}/conf/log4j.properties.template'.format(softdir))
    cluster.runOnMaster(
        'cp {0}/conf/log4j.properties.template {0}/conf/log4j.properties'.format(softdir))

    # --------------------------------
    # handling of connector
    # --------------------------------
    # We install here the hadoop connector for google cloud storage.
    # This connector permits writing data to google cloud storage directly
    # from spark.
    if useCloudConnector and cluster.getNSlaves() > 0:
        cluster.runOnAllNodesAsync('gsutil cp {0} .'.format(bucketconnector))
        # cluster.runOnAllNodesAsync('wget https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-latest-hadoop2.jar')
        cluster.runOnAllNodesAsync(
            'cp gcs-connector-latest-hadoop2.jar {0}/lib/'.format(softdir))
        cluster.runOnAllNodesAsync(
            'cp {0}/etc/hadoop/core-site.xml {1}/conf/'.format(hadoop.softdir, softdir))

    # ------------------------------------------
    # Deployment of spark overloading scripts
    # ------------------------------------------
    #
    # One problem with spark in standalone mode is that
    # we have to use client mode.
    # With client mode, we cannot use the spark default conf
    # for setting additional jars at launch.
    #
    # We therefore use two scripts, one for spark-submit and one for pyspark,
    # for overloading the calls.
    #
    # These scripts test for the existence of jar variables
    # and make the call accordingly.
    inPyspark = utils.getTemplateFile('pyspark-jars')
    cluster.rsyncOnAllNodesLocalhostToLocalAsync(inPyspark, 'pyspark-jars')
    inSubmit = utils.getTemplateFile('spark-submit-jars')
    cluster.rsyncOnAllNodesLocalhostToLocalAsync(inSubmit, 'spark-submit-jars')
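# setupConfigurationFiles() relies on utils.stringReplaceInFile to
# instantiate the spark-defaults.conf / spark-env.sh templates. That helper
# lives in utils and is not shown here; a minimal sketch of the kind of
# placeholder substitution it performs (names and behaviour assumed) is:
#
#     def stringReplaceInFile(inFile, outFile, replacements):
#         with open(inFile) as fin:
#             text = fin.read()
#         for key, value in replacements.items():
#             text = text.replace(key, value)
#         with open(outFile, 'w') as fout:
#             fout.write(text)
#
# i.e. every 'XX-...-XX' marker in the template is replaced by the
# corresponding cluster-specific value before the file is rsynced out.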
def stopAll():
    cluster.runOnMaster('{0}/sbin/stop-all.sh'.format(softdir))
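# A possible end-to-end sequence using the helpers above (assuming the
# cluster has been provisioned and the Spark daemons are already running);
# the jar path is only illustrative:
#
#     setupConfigurationFiles()
#     addjars('/mnt/data/package/softs/jars/spark-avro_2.10-2.0.1-SNAPSHOT.jar')
#     cleanHdfs()
#     runProduction()
#     syncEvalToMaster()
#     stopAll()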