def stopDfs():
    """Stop the HDFS daemons by running ``stop-dfs.sh`` on the master node."""
    listnodes, nodes = cluster.instanceListAll()
    masterIp = nodes[listnodes['master'][0]]['externalIp']
    utils.ssh(masterIp, "stop-dfs.sh")
def getStatus():
    """Print an HDFS health report (``hdfs dfsadmin -report``) from the master."""
    listnodes, nodes = cluster.instanceListAll()
    masterIp = nodes[listnodes['master'][0]]['externalIp']
    utils.ssh(masterIp, "hdfs dfsadmin -report")
def formatFileSystem():
    """Format the HDFS namenode on the master (destructive: erases metadata)."""
    listnodes, nodes = cluster.instanceListAll()
    masterIp = nodes[listnodes['master'][0]]['externalIp']
    utils.ssh(masterIp, "hdfs namenode -format")
def syncEvalToMaster():
    """Pull ``/mnt/data/tmp/DEEPEVAL`` from every slave onto the master via rsync.

    All per-slave rsync commands are joined with ';' and run as a single
    shell command on the master.
    """
    listnodes, _nodes = cluster.instanceListAll()
    command = ';'.join(
        'rsync -lva {0}:/mnt/data/tmp/DEEPEVAL /mnt/data/tmp/'.format(slave)
        for slave in listnodes['slaves'])
    cluster.runOnMaster(command)
def launchUi():
    """Open a local firefox window on the master's Spark UIs (ports 8080 and 4040)."""
    listnodes, nodes = cluster.instanceListAll()
    masterIp = nodes[listnodes['master'][0]]['externalIp']
    utils.exec_command('firefox --new-window')
    utils.exec_command(
        'firefox --new-tab ' + masterIp + ':8080 --new-tab ' + masterIp + ':4040')
def fullCleanup():
    """Wipe and re-create the HDFS data directories on every node, then
    reformat the namenode and restart DFS.

    Destructive: all data stored in HDFS is lost.
    """
    (listnodes, nodes) = cluster.instanceListAll()
    # BUG FIX: stopDfs(), formatFileSystem() and startDfs() take no
    # parameters (they look up the cluster themselves); the old code passed
    # (listnodes, nodes) to each, which raised TypeError on every call.
    # NOTE(review): startDfs is defined elsewhere — assumed zero-arg like
    # stopDfs; confirm.
    stopDfs()
    pathnamenode = os.path.join(datadir, 'namenode')
    pathdatanode = os.path.join(datadir, 'datanode')
    for namenode in listnodes['all']:
        nodeip = nodes[namenode]['externalIp']
        # Remove, recreate and open up permissions on both HDFS directories.
        for path in (pathnamenode, pathdatanode):
            utils.ssh(nodeip, "rm -rf {0}".format(path))
            utils.ssh(nodeip, "mkdir -p {0}".format(path))
            utils.ssh(nodeip, "chmod -R a+rwx {0}".format(path))
    formatFileSystem()
    startDfs()
def setupConfigurationFiles():
    """Deploy hadoop

    Renders core-site.xml / hdfs-site.xml / slaves from local templates,
    pushes them to every node's hadoop config directory, optionally installs
    the GCS connector jar, and creates the namenode/datanode directories.
    """
    (listnodes, nodes) = cluster.instanceListAll()
    mastername = listnodes['master'][0]
    # Template inputs and local temp outputs for the rendered config files.
    inCoreSite = utils.getTemplateFile('hdfs-core-site.xml')
    outCoreSite = utils.getLocalTempFile('core-site.xml')
    inHdfsSite = utils.getTemplateFile('hdfs-hdfs-site.xml')
    outHdfsSite = utils.getLocalTempFile('hdfs-site.xml')
    outSlave = utils.getLocalTempFile('slaves')
    print '[ Configuring Hadoop ]'
    # core-site.xml: point HDFS at the master host and set the GCP project id.
    utils.stringReplaceInFile(
        inCoreSite, outCoreSite, {
            'PUT-MASTER-IP': mastername,
            'XX-PROJECTID-XX': utils.getProjectProperties()['Project']
        })
    pathnamenode = os.path.join(datadir, 'namenode')
    pathdatanode = os.path.join(datadir, 'datanode')
    # hdfs-site.xml: replication factor 3 plus on-disk storage paths.
    utils.stringReplaceInFile(
        inHdfsSite, outHdfsSite, {
            'XXREPLICATIONXX': '3',
            'XXNAMENODEXX': pathnamenode,
            'XXDATANODEXX': pathdatanode,
        })
    # slaves file: one hostname per line.
    with open(outSlave, 'w') as streamOut:
        for namenode in listnodes['slaves']:
            streamOut.write(namenode + '\n')
    # Push the three rendered files into hadoop's etc/ dir on every node.
    cluster.rsyncOnAllNodesLocalhostToLocalAsync(
        outCoreSite, softdir + '/etc/hadoop/core-site.xml')
    cluster.rsyncOnAllNodesLocalhostToLocalAsync(
        outHdfsSite, softdir + '/etc/hadoop/hdfs-site.xml')
    cluster.rsyncOnAllNodesLocalhostToLocalAsync(
        outSlave, softdir + '/etc/hadoop/slaves')
    if useCloudConnector:
        # Install the Google Cloud Storage connector jar from the bucket.
        cluster.runOnAllNodesAsync('gsutil cp {0} .'.format(bucketconnector))
        # cluster.runOnAllNodesAsync('wget https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-latest-hadoop2.jar')
        cluster.runOnAllNodesAsync(
            'cp gcs-connector-latest-hadoop2.jar {0}/share/hadoop/common/'.format(softdir))
    # Create the HDFS storage directories with open permissions on all nodes,
    # batched into a single remote shell command.
    listCommand = []
    listCommand.append("mkdir -p {0}".format(pathnamenode))
    listCommand.append("mkdir -p {0}".format(pathdatanode))
    listCommand.append("chmod -R a+rwx {0}".format(pathnamenode))
    listCommand.append("chmod -R a+rwx {0}".format(pathdatanode))
    command = ';'.join(listCommand)
    cluster.runOnAllNodesAsync(command)
def setupConfigurationFiles():
    """Deploy spark configuration files

    Renders slaves / spark-defaults.conf / spark-env.sh from templates,
    pushes them to every node, quiets log4j, optionally installs the GCS
    connector, and deploys the pyspark/spark-submit wrapper scripts.
    """
    (listnodes, nodes) = cluster.instanceListAll()
    # We create here a fake python link.
    # This python is used as the main spark driver
    # If we need to change the spark python driver,
    # we just have to overwrite this link.
    cluster.runOnAllNodesAsync('ln -fs `which python` {0}'.format(pathpython))
    # -------------------------------------------
    # handling of slaves
    # -------------------------------------------
    # The slave file contains information
    # about which hosts have to be used
    outSlave = utils.getLocalTempFile('slaves')
    with open(outSlave, 'w') as streamOut:
        for namenode in listnodes['slaves']:
            streamOut.write(namenode + '\n')
    # -------------------------------------------
    # handling of spark configuration
    # -------------------------------------------
    # With slaves we run a standalone master; without, a local[N] master
    # using all but one core.
    if cluster.getNSlaves() > 0:
        sparkMaster = 'spark://{0}:7077'.format(cluster.getMasterName())
    else:
        sparkMaster = 'local[{0}]'.format((cluster.nCores - 1))
    inConf = utils.getTemplateFile('spark-spark-defaults.conf')
    outConf = utils.getLocalTempFile('spark-defaults.conf')
    maxSlaves = int(cluster.nCores - 1)
    utils.stringReplaceInFile(
        inConf, outConf, {
            'XX-DRIVER-MEM-XX': mastermemory,
            'XX-EXECUTOR-MEM-XX': executormemory,
            'XX-SPARKMASTER-XX': sparkMaster,
            'XX-LOCAL-DIR-XX': localtempdir,
            'XX-CORES-XX': '{0}'.format(maxSlaves)
        })
    inEnv = utils.getTemplateFile('spark-spark-env.sh')
    outEnv = utils.getLocalTempFile('spark-env.sh')
    utils.stringReplaceInFile(
        inEnv, outEnv, {
            'XX-PYSPARK_PYTHON-XX': '"{0}"'.format(pathpython),
            'XX-SPARKMASTER-XX': sparkMaster,
            'XX-PYSPARK_DRIVER_PYTHON-XX': '"{0}"'.format(pathpython),
            'XX-PYTHONPATH-XX': '"{0}"'.format(pythonpath),
            'XX-LOCAL-DIR-XX': localtempdir,
            'XX-MASTER-IP-XX': '"{0}"'.format(cluster.ipGetMaster())
        })
    # Push the three rendered files into spark's conf/ dir on every node.
    cluster.rsyncOnAllNodesLocalhostToLocalAsync(
        outSlave, softdir + '/conf/slaves')
    cluster.rsyncOnAllNodesLocalhostToLocalAsync(
        outConf, softdir + '/conf/spark-defaults.conf')
    cluster.rsyncOnAllNodesLocalhostToLocalAsync(
        outEnv, softdir + '/conf/spark-env.sh')
    # Event-log directory expected by the spark history server config.
    cluster.runOnAllNodesAsync('mkdir -p /tmp/spark-events')
    # we remove info level display from spark...
    cluster.runOnMaster(
        'sed -i "s/log4j.rootCategory=INFO/log4j.rootCategory=WARN/g" {0}/conf/log4j.properties.template'.format(softdir))
    cluster.runOnMaster(
        'cp {0}/conf/log4j.properties.template {0}/conf/log4j.properties'.format(softdir))
    # --------------------------------
    # handling of connector
    # --------------------------------
    # We install here the hadoop connector for google cloud storage
    # This connector permits writing data on google cs directly
    # from spark
    if useCloudConnector and cluster.getNSlaves() > 0:
        cluster.runOnAllNodesAsync('gsutil cp {0} .'.format(bucketconnector))
        # cluster.runOnAllNodesAsync('wget https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-latest-hadoop2.jar')
        cluster.runOnAllNodesAsync(
            'cp gcs-connector-latest-hadoop2.jar {0}/lib/'.format(softdir))
        # Reuse hadoop's core-site.xml so spark sees the same GCS settings.
        cluster.runOnAllNodesAsync(
            'cp {0}/etc/hadoop/core-site.xml {1}/conf/'.format(hadoop.softdir, softdir))
    # ------------------------------------------
    # Deployment of spark overloading scripts
    # ------------------------------------------
    #
    # One problem with spark in standalone mode, is that
    # we have to use the client mode.
    # with the client mode, we cannot use the spark default conf
    # for setting additional jars at launch.
    #
    # We therefore use two scripts, one for spark-submit, one for pyspark
    # for overloading the calls.
    #
    # These scripts tests for the existence of jar variables
    # and make the call accordingly
    inPyspark = utils.getTemplateFile('pyspark-jars')
    cluster.rsyncOnAllNodesLocalhostToLocalAsync(inPyspark, 'pyspark-jars')
    inSubmit = utils.getTemplateFile('spark-submit-jars')
    cluster.rsyncOnAllNodesLocalhostToLocalAsync(inSubmit, 'spark-submit-jars')