def runBigPerou():
    """Launch the perou production run on the cluster master through spark-submit."""
    nCores = cluster.getNCores()
    nSlaves = cluster.getNSlaves()
    memory = cluster.getMemory()
    maxSlaves = int(nCores / 2)
    # cluster.runOnMasterX(
    #     'python2.7 irt/processor/scripts/prodLauncher.py write perou 2000 P')
    # cluster.runOnMasterX('hdfs dfs -put $PYIRT_DIR_WORK/perou-T2000-BP* /')
    command = ' '.join([
        'spark-submit',
        '--jars /mnt/data/package/softs/jars/AvroToPythonConverters-1.3.0.jar',
        'irt/processor/scripts/prodLauncher.py',
        'runSparkAvro perou-T2000-BP',
        '--mode cluster',
        '--block 1000',
        '--band P',
        '--res 1',
        # '--modeStore memory',
        '--tile 2000',
        '--nbThreads 2',
        '--nbSlaves {0}'.format(nSlaves),
        '--maxSlaves {0}'.format(maxSlaves),
        '--modeProduction L2area'])
    cluster.runOnMasterX(command)
def runProduction():
    """Launch the brisbane1 production run: clear any previous output on HDFS,
    then submit the job on the master."""
    nCores = cluster.getNCores()
    nSlaves = cluster.getNSlaves()
    maxSlaves = int(nCores / 2)
    memory = cluster.getMemory()
    cluster.runOnMaster(
        'hdfs dfs -rm -r -f /brisbane1-T4000-BP__BP_R150cm_T2000.avro')
    command = ' '.join([
        'spark-submit',
        # '--jars /mnt/data/package/softs/jars/AvroToPythonConverters-1.3.0.jar',
        '--jars /mnt/data/package/softs/jars/spark-avro_2.10-2.0.1-SNAPSHOT.jar',
        'irt/processor/scripts/prodLauncher.py',
        'runSparkAvro brisbane1-T4000-BP',
        '--mode cluster',
        '--block 1000',
        '--band P',
        '--res 1.5',
        '--modeStore memory',
        '--tile 2000',
        '--nbThreads 2',
        '--nbSlaves {0}'.format(nSlaves),
        '--maxSlaves {0}'.format(maxSlaves),
        '--modeProduction L2area'])
    cluster.runOnMasterX(command)
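# For reference, the joined command produced by runProduction() is submitted as a
# single line on the master; the nbSlaves/maxSlaves values depend on the cluster
# (shown here with nSlaves=8 and nCores=16 purely as an example):
#
#   spark-submit --jars /mnt/data/package/softs/jars/spark-avro_2.10-2.0.1-SNAPSHOT.jar
#       irt/processor/scripts/prodLauncher.py runSparkAvro brisbane1-T4000-BP
#       --mode cluster --block 1000 --band P --res 1.5 --modeStore memory
#       --tile 2000 --nbThreads 2 --nbSlaves 8 --maxSlaves 8 --modeProduction L2area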
def setupFullWithFormat():
    """Full Spark setup when the cluster has slaves: deploy software and
    configuration files, update the bashrc, then format the filesystem
    and start DFS."""
    if cluster.getNSlaves() > 0:
        setupSoftFiles()
        setupConfigurationFiles()
        setupBashrc()
        formatFileSystem()
        startDfs()
def setupBashrc():
    """Deploy the Spark environment script on every node, source it from
    setup.sh, and register the GCS connector jar when needed."""
    inStartup = utils.getTemplateFile('spark-setup_spark.sh')
    outStartup = utils.getLocalTempFile('setup_spark.sh')
    utils.stringReplaceInFile(
        inStartup,
        outStartup,
        {
            'XXSPARKLOCALPATHXX': "'{0}'".format(softdir)
        })
    cluster.rsyncOnAllNodesLocalhostToLocalAsync(
        outStartup, '~/setup_spark.sh')
    command = 'echo "source setup_spark.sh" >> setup.sh'
    cluster.runOnAllNodesAsync(command)
    if useCloudConnector and cluster.getNSlaves() > 0:
        addjars('{0}/lib/gcs-connector-latest-hadoop2.jar'.format(softdir))
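# -------------------------------------------------------------------------
# Illustration only: the templating contract assumed above for
# utils.stringReplaceInFile is plain token substitution (every key of the
# mapping replaced verbatim in the template). This local sketch is not the
# actual utils implementation, just a minimal equivalent for reference.
# -------------------------------------------------------------------------
def _sketchStringReplaceInFile(inPath, outPath, replacements):
    """Copy inPath to outPath, replacing each token of `replacements`."""
    with open(inPath) as src:
        content = src.read()
    for token, value in replacements.items():
        content = content.replace(token, value)
    with open(outPath, 'w') as dst:
        dst.write(content)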
def setupConfigurationFiles():
    """Deploy spark configuration files"""
    (listnodes, nodes) = cluster.instanceListAll()

    # We create here a fake python link.
    # This python is used as the main spark driver.
    # If we need to change the spark python driver,
    # we just have to overwrite this link.
    cluster.runOnAllNodesAsync('ln -fs `which python` {0}'.format(pathpython))

    # -------------------------------------------
    # handling of slaves
    # -------------------------------------------
    # The slave file lists the hosts that have to be used.
    outSlave = utils.getLocalTempFile('slaves')
    with open(outSlave, 'w') as streamOut:
        for namenode in listnodes['slaves']:
            streamOut.write(namenode + '\n')

    # -------------------------------------------
    # handling of spark configuration
    # -------------------------------------------
    if cluster.getNSlaves() > 0:
        sparkMaster = 'spark://{0}:7077'.format(cluster.getMasterName())
    else:
        sparkMaster = 'local[{0}]'.format(cluster.nCores - 1)

    inConf = utils.getTemplateFile('spark-spark-defaults.conf')
    outConf = utils.getLocalTempFile('spark-defaults.conf')
    maxSlaves = int(cluster.nCores - 1)
    utils.stringReplaceInFile(
        inConf,
        outConf,
        {
            'XX-DRIVER-MEM-XX': mastermemory,
            'XX-EXECUTOR-MEM-XX': executormemory,
            'XX-SPARKMASTER-XX': sparkMaster,
            'XX-LOCAL-DIR-XX': localtempdir,
            'XX-CORES-XX': '{0}'.format(maxSlaves)
        })

    inEnv = utils.getTemplateFile('spark-spark-env.sh')
    outEnv = utils.getLocalTempFile('spark-env.sh')
    utils.stringReplaceInFile(
        inEnv,
        outEnv,
        {
            'XX-PYSPARK_PYTHON-XX': '"{0}"'.format(pathpython),
            'XX-SPARKMASTER-XX': sparkMaster,
            'XX-PYSPARK_DRIVER_PYTHON-XX': '"{0}"'.format(pathpython),
            'XX-PYTHONPATH-XX': '"{0}"'.format(pythonpath),
            'XX-LOCAL-DIR-XX': localtempdir,
            'XX-MASTER-IP-XX': '"{0}"'.format(cluster.ipGetMaster())
        })

    cluster.rsyncOnAllNodesLocalhostToLocalAsync(
        outSlave, softdir + '/conf/slaves')
    cluster.rsyncOnAllNodesLocalhostToLocalAsync(
        outConf, softdir + '/conf/spark-defaults.conf')
    cluster.rsyncOnAllNodesLocalhostToLocalAsync(
        outEnv, softdir + '/conf/spark-env.sh')
    cluster.runOnAllNodesAsync('mkdir -p /tmp/spark-events')

    # We remove info level display from spark.
    cluster.runOnMaster(
        'sed -i "s/log4j.rootCategory=INFO/log4j.rootCategory=WARN/g" {0}/conf/log4j.properties.template'.format(softdir))
    cluster.runOnMaster(
        'cp {0}/conf/log4j.properties.template {0}/conf/log4j.properties'.format(softdir))

    # --------------------------------
    # handling of connector
    # --------------------------------
    # We install here the hadoop connector for Google Cloud Storage.
    # This connector allows Spark to write data directly to Google
    # Cloud Storage.
    if useCloudConnector and cluster.getNSlaves() > 0:
        cluster.runOnAllNodesAsync('gsutil cp {0} .'.format(bucketconnector))
        # cluster.runOnAllNodesAsync('wget https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-latest-hadoop2.jar')
        cluster.runOnAllNodesAsync(
            'cp gcs-connector-latest-hadoop2.jar {0}/lib/'.format(softdir))
        cluster.runOnAllNodesAsync(
            'cp {0}/etc/hadoop/core-site.xml {1}/conf/'.format(
                hadoop.softdir, softdir))

    # ------------------------------------------
    # Deployment of spark overloading scripts
    # ------------------------------------------
    #
    # One problem with spark in standalone mode is that
    # we have to use the client mode.
    # With the client mode, we cannot use the spark default conf
    # for setting additional jars at launch.
    #
    # We therefore use two scripts, one for spark-submit and one for
    # pyspark, to overload the calls.
    #
    # These scripts test for the existence of jar variables
    # and make the call accordingly.
    inPyspark = utils.getTemplateFile('pyspark-jars')
    cluster.rsyncOnAllNodesLocalhostToLocalAsync(inPyspark, 'pyspark-jars')
    inSubmit = utils.getTemplateFile('spark-submit-jars')
    cluster.rsyncOnAllNodesLocalhostToLocalAsync(inSubmit, 'spark-submit-jars')
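# -------------------------------------------------------------------------
# Illustration only: a minimal Python sketch of the decision the
# pyspark-jars / spark-submit-jars wrappers are described as making above.
# The real wrappers are the shell templates deployed by
# setupConfigurationFiles(); the SPARK_SUBMIT_JARS variable name used here
# is an assumption for the example, not the actual one.
# -------------------------------------------------------------------------
import os  # placed here to keep the sketch self-contained


def _sketchSubmitCommandWithJars(arguments):
    """Return a spark-submit argument list, adding --jars only when a jar
    list has been registered in the (assumed) environment variable."""
    jars = os.environ.get('SPARK_SUBMIT_JARS', '')
    if jars:
        return ['spark-submit', '--jars', jars] + list(arguments)
    return ['spark-submit'] + list(arguments)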