def setupBashrc():
    """Deploy the HDFS environment script on all nodes and source it from setup.sh."""
    inStartup = utils.getTemplateFile('hdfs-setup_hdfs.sh')
    outStartup = utils.getLocalTempFile('setup_hdfs.sh')
    utils.stringReplaceInFile(
        inStartup,
        outStartup,
        {
            'XXHADOOPLOCALPATHXX': "'{0}'".format(softdir)
        })
    cluster.rsyncOnAllNodesLocalhostToLocalAsync(outStartup, '~/setup_hdfs.sh')
    command = 'echo "source setup_hdfs.sh" >> setup.sh'
    cluster.runOnAllNodesAsync(command)
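# The setup functions in this module rely on utils.stringReplaceInFile to fill
# template placeholders. A minimal sketch of what such a helper might look like
# (an illustration only, assuming it takes an input path, an output path and a
# placeholder-to-value dict; this is not the actual utils implementation):
def stringReplaceInFileSketch(pathIn, pathOut, replacements):
    """Copy pathIn to pathOut, substituting each placeholder by its value."""
    with open(pathIn) as streamIn:
        content = streamIn.read()
    for placeholder, value in replacements.items():
        content = content.replace(placeholder, value)
    with open(pathOut, 'w') as streamOut:
        streamOut.write(content)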
def setupProfile():
    """Deploy the package profile script on all nodes and source it from setup_profile.sh."""
    inStartup = utils.getTemplateFile('package-setup_profile_package.sh')
    outStartup = utils.getLocalTempFile('setup_profile_package.sh')
    utils.stringReplaceInFile(
        inStartup,
        outStartup,
        {
            # 'XX-PYTHON-XX': "'{0}'".format(pythonbin),
            'XX-SOURCESCRIPT-XX': sourcescript
        })
    cluster.rsyncOnAllNodesLocalhostToLocalAsync(
        outStartup, '~/setup_profile_package.sh')
    command = 'echo "source $HOME/setup_profile_package.sh" >> setup_profile.sh'
    cluster.runOnAllNodesAsync(command)
def setupBashrc():
    """Deploy the Spark environment script on all nodes and source it from setup.sh."""
    inStartup = utils.getTemplateFile('spark-setup_spark.sh')
    outStartup = utils.getLocalTempFile('setup_spark.sh')
    utils.stringReplaceInFile(
        inStartup,
        outStartup,
        {
            'XXSPARKLOCALPATHXX': "'{0}'".format(softdir)
        })
    cluster.rsyncOnAllNodesLocalhostToLocalAsync(
        outStartup, '~/setup_spark.sh')
    command = 'echo "source setup_spark.sh" >> setup.sh'
    cluster.runOnAllNodesAsync(command)
    if useCloudConnector and cluster.getNSlaves() > 0:
        addjars('{0}/lib/gcs-connector-latest-hadoop2.jar'.format(softdir))
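# addjars is defined elsewhere in this module. Purely as a hypothetical sketch
# (an assumption, not the actual implementation, and SPARKJARS is an invented
# variable name), it could export the extra jar paths through an environment
# variable that the pyspark-jars/spark-submit-jars wrappers read at launch time:
def addjarsSketch(jarpath):
    # Append the jar to a colon-separated variable sourced from setup.sh on every node.
    command = "echo 'export SPARKJARS=$SPARKJARS:{0}' >> setup.sh".format(jarpath)
    cluster.runOnAllNodesAsync(command)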
def setupConfigurationFiles(): """Deploy hadoop""" (listnodes, nodes) = cluster.instanceListAll() mastername = listnodes['master'][0] inCoreSite = utils.getTemplateFile('hdfs-core-site.xml') outCoreSite = utils.getLocalTempFile('core-site.xml') inHdfsSite = utils.getTemplateFile('hdfs-hdfs-site.xml') outHdfsSite = utils.getLocalTempFile('hdfs-site.xml') outSlave = utils.getLocalTempFile('slaves') print '[ Configuring Hadoop ]' utils.stringReplaceInFile( inCoreSite, outCoreSite, { 'PUT-MASTER-IP': mastername, 'XX-PROJECTID-XX': utils.getProjectProperties()['Project'] }) pathnamenode = os.path.join(datadir, 'namenode') pathdatanode = os.path.join(datadir, 'datanode') utils.stringReplaceInFile( inHdfsSite, outHdfsSite, { 'XXREPLICATIONXX': '3', 'XXNAMENODEXX': pathnamenode, 'XXDATANODEXX': pathdatanode, }) with open(outSlave, 'w') as streamOut: for namenode in listnodes['slaves']: streamOut.write(namenode + '\n') cluster.rsyncOnAllNodesLocalhostToLocalAsync( outCoreSite, softdir + '/etc/hadoop/core-site.xml') cluster.rsyncOnAllNodesLocalhostToLocalAsync( outHdfsSite, softdir + '/etc/hadoop/hdfs-site.xml') cluster.rsyncOnAllNodesLocalhostToLocalAsync( outSlave, softdir + '/etc/hadoop/slaves') if useCloudConnector: cluster.runOnAllNodesAsync('gsutil cp {0} .'.format(bucketconnector)) # cluster.runOnAllNodesAsync('wget https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-latest-hadoop2.jar') cluster.runOnAllNodesAsync( 'cp gcs-connector-latest-hadoop2.jar {0}/share/hadoop/common/'.format(softdir)) listCommand = [] listCommand.append("mkdir -p {0}".format(pathnamenode)) listCommand.append("mkdir -p {0}".format(pathdatanode)) listCommand.append("chmod -R a+rwx {0}".format(pathnamenode)) listCommand.append("chmod -R a+rwx {0}".format(pathdatanode)) command = ';'.join(listCommand) cluster.runOnAllNodesAsync(command)
def setupConfigurationFiles(): """Deploy spark configuration files""" (listnodes, nodes) = cluster.instanceListAll() # We create here a fake python link. # This python is used as the main spark driver # If we need to change the spark python driver, # we just have to overwrite this link. cluster.runOnAllNodesAsync('ln -fs `which python` {0}'.format(pathpython)) # ------------------------------------------- # handling of slaves # ------------------------------------------- # The slave file contains information # about which hosts have to be used outSlave = utils.getLocalTempFile('slaves') with open(outSlave, 'w') as streamOut: for namenode in listnodes['slaves']: streamOut.write(namenode + '\n') # ------------------------------------------- # handling of spark configuration # ------------------------------------------- if cluster.getNSlaves() > 0: sparkMaster = 'spark://{0}:7077'.format(cluster.getMasterName()) else: sparkMaster = 'local[{0}]'.format((cluster.nCores - 1)) inConf = utils.getTemplateFile('spark-spark-defaults.conf') outConf = utils.getLocalTempFile('spark-defaults.conf') maxSlaves = int(cluster.nCores - 1) utils.stringReplaceInFile( inConf, outConf, { 'XX-DRIVER-MEM-XX': mastermemory, 'XX-EXECUTOR-MEM-XX': executormemory, 'XX-SPARKMASTER-XX': sparkMaster, 'XX-LOCAL-DIR-XX': localtempdir, 'XX-CORES-XX': '{0}'.format(maxSlaves) }) inEnv = utils.getTemplateFile('spark-spark-env.sh') outEnv = utils.getLocalTempFile('spark-env.sh') utils.stringReplaceInFile( inEnv, outEnv, { 'XX-PYSPARK_PYTHON-XX': '"{0}"'.format(pathpython), 'XX-SPARKMASTER-XX': sparkMaster, 'XX-PYSPARK_DRIVER_PYTHON-XX': '"{0}"'.format(pathpython), 'XX-PYTHONPATH-XX': '"{0}"'.format(pythonpath), 'XX-LOCAL-DIR-XX': localtempdir, 'XX-MASTER-IP-XX': '"{0}"'.format(cluster.ipGetMaster()) }) cluster.rsyncOnAllNodesLocalhostToLocalAsync( outSlave, softdir + '/conf/slaves') cluster.rsyncOnAllNodesLocalhostToLocalAsync( outConf, softdir + '/conf/spark-defaults.conf') cluster.rsyncOnAllNodesLocalhostToLocalAsync( outEnv, softdir + '/conf/spark-env.sh') cluster.runOnAllNodesAsync('mkdir -p /tmp/spark-events') # we remove info level display from spark... cluster.runOnMaster( 'sed -i "s/log4j.rootCategory=INFO/log4j.rootCategory=WARN/g" {0}/conf/log4j.properties.template'.format(softdir)) cluster.runOnMaster( 'cp {0}/conf/log4j.properties.template {0}/conf/log4j.properties'.format(softdir)) # -------------------------------- # handling of connector # -------------------------------- # We install here the hadoop connector for google cloud storage # This connector permits writing data on google cs directly # from spark if useCloudConnector and cluster.getNSlaves() > 0: cluster.runOnAllNodesAsync('gsutil cp {0} .'.format(bucketconnector)) # cluster.runOnAllNodesAsync('wget https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-latest-hadoop2.jar') cluster.runOnAllNodesAsync( 'cp gcs-connector-latest-hadoop2.jar {0}/lib/'.format(softdir)) cluster.runOnAllNodesAsync( 'cp {0}/etc/hadoop/core-site.xml {1}/conf/'.format(hadoop.softdir, softdir)) # ------------------------------------------ # Deployment of spark overloading scripts # ------------------------------------------ # # One problem with spark in standalone mode, is that # we have to use the client mode. # with the client mode, we cannot use the spark default conf # for setting additional jars at launch. # # We therefore use two scripts, one for spark-submit, one for pyspark # for overloading the calls. 
# # These scripts tests for the existence of jar variables # and make the call accordingly inPyspark = utils.getTemplateFile('pyspark-jars') cluster.rsyncOnAllNodesLocalhostToLocalAsync(inPyspark, 'pyspark-jars') inSubmit = utils.getTemplateFile('spark-submit-jars') cluster.rsyncOnAllNodesLocalhostToLocalAsync(inSubmit, 'spark-submit-jars')
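# A hedged usage sketch of how the Spark deployment above might be started in
# standalone mode. startSparkSketch is hypothetical (not part of this module);
# start-master.sh and start-slaves.sh are the standard Spark standalone scripts
# shipped under softdir/sbin, and they read the conf/slaves file deployed above:
def startSparkSketch():
    setupBashrc()
    setupConfigurationFiles()
    if cluster.getNSlaves() > 0:
        cluster.runOnMaster('{0}/sbin/start-master.sh'.format(softdir))
        cluster.runOnMaster('{0}/sbin/start-slaves.sh'.format(softdir))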
def getCacheListFile():
    """Return file for storing list of instances"""
    cacheFile = utils.getLocalTempFile('listInstances.pck')
    return cacheFile
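# The .pck extension suggests the instance list is cached with pickle. A minimal
# sketch of how such a cache might be written and read back (an assumption about
# the surrounding code, not shown in this module):
import pickle

def writeInstanceCacheSketch(listInstances):
    with open(getCacheListFile(), 'wb') as streamOut:
        pickle.dump(listInstances, streamOut)

def readInstanceCacheSketch():
    with open(getCacheListFile(), 'rb') as streamIn:
        return pickle.load(streamIn)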
def getStartupFile():
    """Return path to local temporary startup file."""
    return utils.getLocalTempFile('startup.sh')