Example #1
def stopDfs():
    """Stop the HDFS daemons from the master node."""

    (listnodes, nodes) = cluster.instanceListAll()

    mastername = listnodes['master'][0]
    master = nodes[mastername]
    masterIp = master['externalIp']

    utils.ssh(masterIp, "stop-dfs.sh")
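
Example #6 below calls a startDfs helper that is not among the listed examples. By symmetry with stopDfs above, a minimal sketch of it (assuming the same cluster and utils helpers) could look like this:

def startDfs():
    """Hypothetical counterpart to stopDfs: start the HDFS daemons from the master node."""

    (listnodes, nodes) = cluster.instanceListAll()

    mastername = listnodes['master'][0]
    master = nodes[mastername]
    masterIp = master['externalIp']

    # start-dfs.sh is the standard Hadoop script that starts the namenode and the datanodes
    utils.ssh(masterIp, "start-dfs.sh")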
Example #2
def getStatus():
    """Print an HDFS usage report from the master node."""

    (listnodes, nodes) = cluster.instanceListAll()

    mastername = listnodes['master'][0]
    master = nodes[mastername]
    masterIp = master['externalIp']

    utils.ssh(masterIp, "hdfs dfsadmin -report")
Example #3
def formatFileSystem():
    """Format the HDFS namenode from the master node."""

    (listnodes, nodes) = cluster.instanceListAll()

    mastername = listnodes['master'][0]
    master = nodes[mastername]
    masterIp = master['externalIp']

    utils.ssh(masterIp, "hdfs namenode -format")
Example #4
def syncEvalToMaster():
    """Pull the DEEPEVAL results from every slave onto the master via rsync."""
    listCommand = []

    (listnodes, nodes) = cluster.instanceListAll()


    for slavename in listnodes['slaves']:
        listCommand.append('rsync -lva {0}:/mnt/data/tmp/DEEPEVAL /mnt/data/tmp/'.format(slavename))

    command = ';'.join(listCommand)
    cluster.runOnMaster(command)
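
For two hypothetical slaves named slave-1 and slave-2, the joined command executed on the master would be:

rsync -lva slave-1:/mnt/data/tmp/DEEPEVAL /mnt/data/tmp/;rsync -lva slave-2:/mnt/data/tmp/DEEPEVAL /mnt/data/tmp/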
Example #5
def launchUi():
    """Open the Spark master and application web UIs in Firefox."""

    (listnodes, nodes) = cluster.instanceListAll()

    mastername = listnodes['master'][0]
    master = nodes[mastername]
    masterIp = master['externalIp']

    utils.exec_command('firefox --new-window')
    command = 'firefox --new-tab ' + \
        masterIp + ':8080 --new-tab ' + masterIp + ':4040'
    utils.exec_command(command)
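
With a hypothetical masterIp of 10.0.0.5, the second call becomes firefox --new-tab 10.0.0.5:8080 --new-tab 10.0.0.5:4040, opening the Spark master web UI (port 8080) and the application UI (port 4040) in new tabs.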
Example #6
def fullCleanup():
    """Stop HDFS, wipe and recreate the data directories on every node, then reformat and restart HDFS."""

    (listnodes, nodes) = cluster.instanceListAll()

    stopDfs()

    pathnamenode = os.path.join(datadir, 'namenode')
    pathdatanode = os.path.join(datadir, 'datanode')

    for nodename in listnodes['all']:

        node = nodes[nodename]
        nodeip = node['externalIp']

        utils.ssh(nodeip, "rm -rf {0}".format(pathnamenode))
        utils.ssh(nodeip, "rm -rf {0}".format(pathdatanode))
        utils.ssh(nodeip, "mkdir -p {0}".format(pathnamenode))
        utils.ssh(nodeip, "mkdir -p {0}".format(pathdatanode))
        utils.ssh(nodeip, "chmod -R a+rwx {0}".format(pathnamenode))
        utils.ssh(nodeip, "chmod -R a+rwx {0}".format(pathdatanode))

    formatFileSystem()
    startDfs()
Example #7
def setupConfigurationFiles():
    """Deploy hadoop"""

    (listnodes, nodes) = cluster.instanceListAll()
    mastername = listnodes['master'][0]

    inCoreSite = utils.getTemplateFile('hdfs-core-site.xml')
    outCoreSite = utils.getLocalTempFile('core-site.xml')

    inHdfsSite = utils.getTemplateFile('hdfs-hdfs-site.xml')
    outHdfsSite = utils.getLocalTempFile('hdfs-site.xml')

    outSlave = utils.getLocalTempFile('slaves')

    print('[ Configuring Hadoop ]')

    utils.stringReplaceInFile(
        inCoreSite,
        outCoreSite,
        {
            'PUT-MASTER-IP': mastername,
            'XX-PROJECTID-XX': utils.getProjectProperties()['Project']
        })

    pathnamenode = os.path.join(datadir, 'namenode')
    pathdatanode = os.path.join(datadir, 'datanode')

    utils.stringReplaceInFile(
        inHdfsSite,
        outHdfsSite,
        {
            'XXREPLICATIONXX': '3',
            'XXNAMENODEXX': pathnamenode,
            'XXDATANODEXX': pathdatanode,
        })

    with open(outSlave, 'w') as streamOut:
        for slavename in listnodes['slaves']:
            streamOut.write(slavename + '\n')

    cluster.rsyncOnAllNodesLocalhostToLocalAsync(
        outCoreSite, softdir + '/etc/hadoop/core-site.xml')
    cluster.rsyncOnAllNodesLocalhostToLocalAsync(
        outHdfsSite, softdir + '/etc/hadoop/hdfs-site.xml')
    cluster.rsyncOnAllNodesLocalhostToLocalAsync(
        outSlave, softdir + '/etc/hadoop/slaves')

    if useCloudConnector:
        cluster.runOnAllNodesAsync('gsutil cp {0} .'.format(bucketconnector))
        # cluster.runOnAllNodesAsync('wget https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-latest-hadoop2.jar')
        cluster.runOnAllNodesAsync(
            'cp gcs-connector-latest-hadoop2.jar {0}/share/hadoop/common/'.format(softdir))

    listCommand = []
    listCommand.append("mkdir -p {0}".format(pathnamenode))
    listCommand.append("mkdir -p {0}".format(pathdatanode))
    listCommand.append("chmod -R a+rwx {0}".format(pathnamenode))
    listCommand.append("chmod -R a+rwx {0}".format(pathdatanode))

    command = ';'.join(listCommand)

    cluster.runOnAllNodesAsync(command)
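
Examples #7 and #8 both rely on utils.stringReplaceInFile, which is not listed here. A minimal sketch, assuming it performs plain token substitution on a template file, could be:

def stringReplaceInFile(inPath, outPath, replacements):
    """Hypothetical sketch: copy inPath to outPath, substituting each placeholder token."""
    with open(inPath) as streamIn:
        content = streamIn.read()
    for token, value in replacements.items():
        content = content.replace(token, value)
    with open(outPath, 'w') as streamOut:
        streamOut.write(content)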
Example #8
def setupConfigurationFiles():
    """Deploy spark configuration files"""

    (listnodes, nodes) = cluster.instanceListAll()

    # We create here a symbolic link to the python binary.
    # This python is used as the main spark driver.
    # If we need to change the spark python driver,
    # we just have to overwrite this link.

    cluster.runOnAllNodesAsync('ln -fs `which python` {0}'.format(pathpython))

    # -------------------------------------------
    # handling of slaves
    # -------------------------------------------
    # The slaves file lists the hosts to be used as workers.
    outSlave = utils.getLocalTempFile('slaves')

    with open(outSlave, 'w') as streamOut:
        for slavename in listnodes['slaves']:
            streamOut.write(slavename + '\n')

    # -------------------------------------------
    # handling of spark configuration
    # -------------------------------------------

    if cluster.getNSlaves() > 0:
        sparkMaster = 'spark://{0}:7077'.format(cluster.getMasterName())
    else:
        sparkMaster = 'local[{0}]'.format(cluster.nCores - 1)

    inConf = utils.getTemplateFile('spark-spark-defaults.conf')
    outConf = utils.getLocalTempFile('spark-defaults.conf')

    maxSlaves = int(cluster.nCores - 1)

    utils.stringReplaceInFile(
        inConf,
        outConf,
        {
            'XX-DRIVER-MEM-XX': mastermemory,
            'XX-EXECUTOR-MEM-XX': executormemory,
            'XX-SPARKMASTER-XX': sparkMaster,
            'XX-LOCAL-DIR-XX': localtempdir,
            'XX-CORES-XX': '{0}'.format(maxSlaves)
        })

    inEnv = utils.getTemplateFile('spark-spark-env.sh')
    outEnv = utils.getLocalTempFile('spark-env.sh')

    utils.stringReplaceInFile(
        inEnv,
        outEnv,
        {
            'XX-PYSPARK_PYTHON-XX': '"{0}"'.format(pathpython),
            'XX-SPARKMASTER-XX': sparkMaster,
            'XX-PYSPARK_DRIVER_PYTHON-XX': '"{0}"'.format(pathpython),
            'XX-PYTHONPATH-XX': '"{0}"'.format(pythonpath),
            'XX-LOCAL-DIR-XX': localtempdir,
            'XX-MASTER-IP-XX': '"{0}"'.format(cluster.ipGetMaster())
        })

    cluster.rsyncOnAllNodesLocalhostToLocalAsync(
        outSlave, softdir + '/conf/slaves')
    cluster.rsyncOnAllNodesLocalhostToLocalAsync(
        outConf, softdir + '/conf/spark-defaults.conf')
    cluster.rsyncOnAllNodesLocalhostToLocalAsync(
        outEnv, softdir + '/conf/spark-env.sh')
    cluster.runOnAllNodesAsync('mkdir -p /tmp/spark-events')

    # Lower spark's log4j root level from INFO to WARN to reduce console noise.
    cluster.runOnMaster(
        'sed -i "s/log4j.rootCategory=INFO/log4j.rootCategory=WARN/g" {0}/conf/log4j.properties.template'.format(softdir))
    cluster.runOnMaster(
        'cp {0}/conf/log4j.properties.template {0}/conf/log4j.properties'.format(softdir))

    # --------------------------------
    # handling of connector
    # --------------------------------
    # We install here the hadoop connector for google cloud storage.
    # This connector lets spark read and write data on google cloud storage
    # directly.

    if useCloudConnector and cluster.getNSlaves() > 0:
        cluster.runOnAllNodesAsync('gsutil cp {0} .'.format(bucketconnector))

        # cluster.runOnAllNodesAsync('wget https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-latest-hadoop2.jar')
        cluster.runOnAllNodesAsync(
            'cp gcs-connector-latest-hadoop2.jar {0}/lib/'.format(softdir))
        cluster.runOnAllNodesAsync(
            'cp {0}/etc/hadoop/core-site.xml {1}/conf/'.format(hadoop.softdir, softdir))

    # ------------------------------------------
    # Deployment of spark overloading scripts
    # ------------------------------------------
    #
    # One problem with spark in standalone mode is that
    # we have to use client mode.
    # With client mode, we cannot use the spark default conf
    # to set additional jars at launch.
    #
    # We therefore use two scripts, one for spark-submit and one for pyspark,
    # to overload the calls.
    #
    # These scripts test for the existence of jar variables
    # and make the call accordingly.

    inPyspark = utils.getTemplateFile('pyspark-jars')
    cluster.rsyncOnAllNodesLocalhostToLocalAsync(inPyspark, 'pyspark-jars')

    inSubmit = utils.getTemplateFile('spark-submit-jars')
    cluster.rsyncOnAllNodesLocalhostToLocalAsync(inSubmit, 'spark-submit-jars')
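
For concreteness, the master URL logic at the top of Example #8 resolves as follows (the node name and core count here are hypothetical):

nSlaves = 3                    # hypothetical value of cluster.getNSlaves()
masterName = 'spark-master'    # hypothetical value of cluster.getMasterName()
nCores = 8                     # hypothetical value of cluster.nCores

if nSlaves > 0:
    # standalone cluster: point executors at the master's standalone scheduler
    sparkMaster = 'spark://{0}:7077'.format(masterName)   # -> 'spark://spark-master:7077'
else:
    # no slaves: run locally, keeping one core free
    sparkMaster = 'local[{0}]'.format(nCores - 1)          # -> 'local[7]'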