Ejemplo n.º 1
0
def runProduction():
    """Launch the brisbane1-T4000-BP avro production job on the cluster.

    Cleans the previous HDFS output, then fires a spark-submit with the
    production parameters via the master node.
    """

    coreCount = cluster.getNCores()
    slaveCount = cluster.getNSlaves()
    slaveCap = int(coreCount / 2)
    memory = cluster.getMemory()

    # Start from a clean slate: drop any leftover output of a previous run.
    cluster.runOnMaster('hdfs dfs -rm -r -f /brisbane1-T4000-BP__BP_R150cm_T2000.avro')

    parts = [
        'spark-submit',
        # '--jars /mnt/data/package/softs/jars/AvroToPythonConverters-1.3.0.jar',
        '--jars /mnt/data/package/softs/jars/spark-avro_2.10-2.0.1-SNAPSHOT.jar',
        'irt/processor/scripts/prodLauncher.py',
        'runSparkAvro brisbane1-T4000-BP',
        '--mode cluster',
        '--block 1000',
        '--band P',
        '--res 1.5',
        '--modeStore memory',
        '--tile 2000',
        '--nbThreads 2',
        '--nbSlaves {0}'.format(slaveCount),
        '--maxSlaves {0}'.format(slaveCap),
        '--modeProduction L2area',
    ]

    cluster.runOnMasterX(' '.join(parts))
Ejemplo n.º 2
0
def syncEvalToMaster():
    """Pull the DEEPEVAL results from every slave back to the master.

    Builds one rsync per slave and runs them as a single chained shell
    command on the master node.
    """
    (listnodes, nodes) = cluster.instanceListAll()

    rsyncCommands = [
        'rsync -lva {0}:/mnt/data/tmp/DEEPEVAL /mnt/data/tmp/'.format(namenode)
        for namenode in listnodes['slaves']
    ]

    cluster.runOnMaster(';'.join(rsyncCommands))
Ejemplo n.º 3
0
def addjars(jars):
    """Register extra jars with spark on every node.

    Appends an export block for SPARKJARS to setup_spark.sh on all nodes
    (one block per jar), and records the jars in the driver/executor
    extraClassPath entries of spark-defaults.conf.

    :param jars: a single jar path (string) or a list of jar paths.
    """

    # Normalize the input: a bare string is wrapped into a one-element
    # list; a list passes through unchanged.  Only AttributeError is
    # caught — the original bare `except:` silently hid every error.
    try:
        jars.split(':')
        jars = [jars]
    except AttributeError:  # already a list of jar paths
        pass

    for j in jars:
        # Blank separator line before each appended export block.
        # (The original called .format(j) on a placeholder-free string.)
        cluster.runOnMaster("echo '' >> setup_spark.sh")

        addJar = []
        addJar.append('if [[ -z "$SPARKJARS" ]]; then ')
        addJar.append('     export SPARKJARS={0}'.format(j))
        addJar.append('else ')
        addJar.append('     export SPARKJARS={0},$SPARKJARS'.format(j))
        addJar.append('fi')

        c = ["echo '" + a + "' >> setup_spark.sh" for a in addJar]

        print(c)

        cluster.runOnAllNodesAsync(';'.join(c))

    # Classpath entries use ':' as the jar separator.
    jars = ':'.join(jars)

    confFile = softdir + '/conf/spark-defaults.conf'

    command = []
    command.append('echo "" >> {0}'.format(confFile))
    command.append(
        'echo "spark.driver.extraClassPath {0}" >> {1}'.format(jars, confFile))
    command.append(
        'echo "spark.executor.extraClassPath {0}" >> {1}'.format(jars, confFile))

    cluster.runOnAllNodesAsync(';'.join(command))
Ejemplo n.º 4
0
def cleanHdfs():
    """Recursively delete every brisbane1 dataset from HDFS (best-effort)."""
    command = 'hdfs dfs -rm -r -f /brisbane1-*'
    cluster.runOnMaster(command)
Ejemplo n.º 5
0
def setupConfigurationFiles():
    """Deploy spark configuration files.

    Generates the slaves file, spark-defaults.conf and spark-env.sh from
    local templates, rsyncs them to every node, lowers spark's log level
    to WARN, optionally installs the google cloud storage connector, and
    deploys the pyspark / spark-submit wrapper scripts.

    Relies on module-level globals defined outside this view: cluster,
    utils, softdir, pathpython, pythonpath, mastermemory,
    executormemory, localtempdir, useCloudConnector, bucketconnector,
    hadoop.
    """

    (listnodes, nodes) = cluster.instanceListAll()

    # We create here a fake python link.
    # This python is used as the main spark driver
    # If we need to change the spark python driver,
    # we just have to overwrite this link.

    cluster.runOnAllNodesAsync('ln -fs `which python` {0}'.format(pathpython))

    # -------------------------------------------
    # handling of slaves
    # -------------------------------------------
    # The slave file contains information
    # about which hosts have to be used
    outSlave = utils.getLocalTempFile('slaves')

    # One hostname per line, spark's standard slaves-file format.
    with open(outSlave, 'w') as streamOut:
        for namenode in listnodes['slaves']:
            streamOut.write(namenode + '\n')

    # -------------------------------------------
    # handling of spark configuration
    # -------------------------------------------

    # Without slaves, fall back to spark local mode, leaving one core
    # free for the driver.
    if cluster.getNSlaves() > 0:
        sparkMaster = 'spark://{0}:7077'.format(cluster.getMasterName())
    else:
        sparkMaster = 'local[{0}]'.format((cluster.nCores - 1))

    inConf = utils.getTemplateFile('spark-spark-defaults.conf')
    outConf = utils.getLocalTempFile('spark-defaults.conf')

    # NOTE(review): despite the name, this is a core count (nCores - 1),
    # not a slave count — it feeds the XX-CORES-XX template slot.
    maxSlaves = int(cluster.nCores - 1)

    utils.stringReplaceInFile(
        inConf,
        outConf,
        {
            'XX-DRIVER-MEM-XX': mastermemory,
            'XX-EXECUTOR-MEM-XX': executormemory,
            'XX-SPARKMASTER-XX': sparkMaster,
            'XX-LOCAL-DIR-XX': localtempdir,
            'XX-CORES-XX': '{0}'.format(maxSlaves)
        })

    inEnv = utils.getTemplateFile('spark-spark-env.sh')
    outEnv = utils.getLocalTempFile('spark-env.sh')

    utils.stringReplaceInFile(
        inEnv,
        outEnv,
        {
            'XX-PYSPARK_PYTHON-XX': '"{0}"'.format(pathpython),
            'XX-SPARKMASTER-XX': sparkMaster,
            'XX-PYSPARK_DRIVER_PYTHON-XX': '"{0}"'.format(pathpython),
            'XX-PYTHONPATH-XX': '"{0}"'.format(pythonpath),
            'XX-LOCAL-DIR-XX': localtempdir,
            'XX-MASTER-IP-XX': '"{0}"'.format(cluster.ipGetMaster())
        })

    # Push the generated files into every node's spark conf directory.
    cluster.rsyncOnAllNodesLocalhostToLocalAsync(
        outSlave, softdir + '/conf/slaves')
    cluster.rsyncOnAllNodesLocalhostToLocalAsync(
        outConf, softdir + '/conf/spark-defaults.conf')
    cluster.rsyncOnAllNodesLocalhostToLocalAsync(
        outEnv, softdir + '/conf/spark-env.sh')
    # Event-log directory; presumably for the spark history server —
    # TODO confirm against the templates.
    cluster.runOnAllNodesAsync('mkdir -p /tmp/spark-events')

    # we remove info level display from spark...
    cluster.runOnMaster(
        'sed -i "s/log4j.rootCategory=INFO/log4j.rootCategory=WARN/g" {0}/conf/log4j.properties.template'.format(softdir))
    cluster.runOnMaster(
        'cp {0}/conf/log4j.properties.template {0}/conf/log4j.properties'.format(softdir))

    # --------------------------------
    # handling of connector
    # --------------------------------
    # We install here the hadoop connector for google cloud storage
    # This connector permits writing data on google cs directly
    # from spark

    if useCloudConnector and cluster.getNSlaves() > 0:
        cluster.runOnAllNodesAsync('gsutil cp {0} .'.format(bucketconnector))

        # cluster.runOnAllNodesAsync('wget https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-latest-hadoop2.jar')
        cluster.runOnAllNodesAsync(
            'cp gcs-connector-latest-hadoop2.jar {0}/lib/'.format(softdir))
        cluster.runOnAllNodesAsync(
            'cp {0}/etc/hadoop/core-site.xml {1}/conf/'.format(hadoop.softdir, softdir))

    # ------------------------------------------
    # Deployment of spark overloading scripts
    # ------------------------------------------
    #
    # One problem with spark in standalone mode, is that
    # we have to use the client mode.
    # with the client mode, we cannot use the spark default conf
    # for setting additional jars at launch.
    #
    # We therefore use two scripts, one for spark-submit, one for pyspark
    # for overloading the calls.
    #
    # These scripts tests for the existence of jar variables
    # and make the call accordingly

    inPyspark = utils.getTemplateFile('pyspark-jars')
    cluster.rsyncOnAllNodesLocalhostToLocalAsync(inPyspark, 'pyspark-jars')

    inSubmit = utils.getTemplateFile('spark-submit-jars')
    cluster.rsyncOnAllNodesLocalhostToLocalAsync(inSubmit, 'spark-submit-jars')
Ejemplo n.º 6
0
def stopAll():
    """Shut down the whole spark cluster using the stock stop-all script."""
    stopScript = '{0}/sbin/stop-all.sh'.format(softdir)
    cluster.runOnMaster(stopScript)