Code example #1 (score: 0)
File: hadoop.py — Project: totor31/gce-setup-anduse
def setupBashrc():
    """Render the HDFS environment script and hook it into setup.sh on all nodes."""

    template = utils.getTemplateFile('hdfs-setup_hdfs.sh')
    rendered = utils.getLocalTempFile('setup_hdfs.sh')

    # Inject the local Hadoop install path (single-quoted for the shell).
    substitutions = {'XXHADOOPLOCALPATHXX': "'{0}'".format(softdir)}
    utils.stringReplaceInFile(template, rendered, substitutions)

    # Push the rendered script to every node, then source it from setup.sh.
    cluster.rsyncOnAllNodesLocalhostToLocalAsync(rendered, '~/setup_hdfs.sh')
    cluster.runOnAllNodesAsync('echo "source setup_hdfs.sh" >> setup.sh')
Code example #2 (score: 0)
File: package.py — Project: totor31/gce-setup-anduse
def setupProfile():
    """Render the package profile script and hook it into setup_profile.sh on all nodes."""

    template = utils.getTemplateFile('package-setup_profile_package.sh')
    rendered = utils.getLocalTempFile('setup_profile_package.sh')

    substitutions = {
        # 'XX-PYTHON-XX': "'{0}'".format(pythonbin),
        'XX-SOURCESCRIPT-XX': sourcescript,
    }
    utils.stringReplaceInFile(template, rendered, substitutions)

    # Push the rendered script to every node, then source it from setup_profile.sh.
    cluster.rsyncOnAllNodesLocalhostToLocalAsync(
        rendered, '~/setup_profile_package.sh')
    cluster.runOnAllNodesAsync(
        'echo "source $HOME/setup_profile_package.sh" >> setup_profile.sh')
Code example #3 (score: 0)
File: spark.py — Project: totor31/gce-setup-anduse
def setupBashrc():
    """Render the Spark environment script, deploy it, and register the GCS jar if needed."""

    template = utils.getTemplateFile('spark-setup_spark.sh')
    rendered = utils.getLocalTempFile('setup_spark.sh')

    # Inject the local Spark install path (single-quoted for the shell).
    utils.stringReplaceInFile(
        template,
        rendered,
        {'XXSPARKLOCALPATHXX': "'{0}'".format(softdir)})

    # Push the rendered script to every node, then source it from setup.sh.
    cluster.rsyncOnAllNodesLocalhostToLocalAsync(rendered, '~/setup_spark.sh')
    cluster.runOnAllNodesAsync('echo "source setup_spark.sh" >> setup.sh')

    # On a real cluster with the cloud connector enabled, make the
    # GCS connector jar visible to Spark jobs.
    if useCloudConnector and cluster.getNSlaves() > 0:
        addjars('{0}/lib/gcs-connector-latest-hadoop2.jar'.format(softdir))
Code example #4 (score: 0)
File: hadoop.py — Project: totor31/gce-setup-anduse
def setupConfigurationFiles():
    """Deploy hadoop"""

    (listnodes, nodes) = cluster.instanceListAll()
    mastername = listnodes['master'][0]

    inCoreSite = utils.getTemplateFile('hdfs-core-site.xml')
    outCoreSite = utils.getLocalTempFile('core-site.xml')

    inHdfsSite = utils.getTemplateFile('hdfs-hdfs-site.xml')
    outHdfsSite = utils.getLocalTempFile('hdfs-site.xml')

    outSlave = utils.getLocalTempFile('slaves')

    print '[ Configuring Hadoop ]'

    utils.stringReplaceInFile(
        inCoreSite,
        outCoreSite,
        {
            'PUT-MASTER-IP': mastername,
            'XX-PROJECTID-XX': utils.getProjectProperties()['Project']
        })

    pathnamenode = os.path.join(datadir, 'namenode')
    pathdatanode = os.path.join(datadir, 'datanode')

    utils.stringReplaceInFile(
        inHdfsSite,
        outHdfsSite,
        {
            'XXREPLICATIONXX': '3',
            'XXNAMENODEXX': pathnamenode,
            'XXDATANODEXX': pathdatanode,
        })

    with open(outSlave, 'w') as streamOut:
        for namenode in listnodes['slaves']:
            streamOut.write(namenode + '\n')

    cluster.rsyncOnAllNodesLocalhostToLocalAsync(
        outCoreSite, softdir + '/etc/hadoop/core-site.xml')
    cluster.rsyncOnAllNodesLocalhostToLocalAsync(
        outHdfsSite, softdir + '/etc/hadoop/hdfs-site.xml')
    cluster.rsyncOnAllNodesLocalhostToLocalAsync(
        outSlave, softdir + '/etc/hadoop/slaves')

    if useCloudConnector:
        cluster.runOnAllNodesAsync('gsutil cp {0} .'.format(bucketconnector))
        # cluster.runOnAllNodesAsync('wget https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-latest-hadoop2.jar')
        cluster.runOnAllNodesAsync(
            'cp gcs-connector-latest-hadoop2.jar {0}/share/hadoop/common/'.format(softdir))

    listCommand = []
    listCommand.append("mkdir -p {0}".format(pathnamenode))
    listCommand.append("mkdir -p {0}".format(pathdatanode))
    listCommand.append("chmod -R a+rwx {0}".format(pathnamenode))
    listCommand.append("chmod -R a+rwx {0}".format(pathdatanode))

    command = ';'.join(listCommand)

    cluster.runOnAllNodesAsync(command)
Code example #5 (score: 0)
File: spark.py — Project: totor31/gce-setup-anduse
def setupConfigurationFiles():
    """Deploy spark configuration files

    Renders the slaves, spark-defaults.conf and spark-env.sh templates,
    pushes them to every node, lowers the log4j verbosity on the master,
    optionally installs the GCS connector, and deploys jar-aware wrapper
    scripts for pyspark and spark-submit.
    """

    (listnodes, nodes) = cluster.instanceListAll()

    # We create here a fake python link.
    # This python is used as the main spark driver
    # If we need to change the spark python driver,
    # we just have to overwrite this link.

    cluster.runOnAllNodesAsync('ln -fs `which python` {0}'.format(pathpython))

    # -------------------------------------------
    # handling of slaves
    # -------------------------------------------
    # The slave file contains information
    # about which hosts have to be used
    outSlave = utils.getLocalTempFile('slaves')

    # One worker hostname per line.
    with open(outSlave, 'w') as streamOut:
        for namenode in listnodes['slaves']:
            streamOut.write(namenode + '\n')

    # -------------------------------------------
    # handling of spark configuration
    # -------------------------------------------

    # With workers present we run against the standalone master on port
    # 7077; otherwise fall back to single-machine local mode, leaving
    # one core free.
    if cluster.getNSlaves() > 0:
        sparkMaster = 'spark://{0}:7077'.format(cluster.getMasterName())
    else:
        sparkMaster = 'local[{0}]'.format((cluster.nCores - 1))

    inConf = utils.getTemplateFile('spark-spark-defaults.conf')
    outConf = utils.getLocalTempFile('spark-defaults.conf')

    # NOTE(review): despite the name, this is a core count (nCores - 1),
    # used to fill XX-CORES-XX below — confirm intended semantics.
    maxSlaves = int(cluster.nCores - 1)

    utils.stringReplaceInFile(
        inConf,
        outConf,
        {
            'XX-DRIVER-MEM-XX': mastermemory,
            'XX-EXECUTOR-MEM-XX': executormemory,
            'XX-SPARKMASTER-XX': sparkMaster,
            'XX-LOCAL-DIR-XX': localtempdir,
            'XX-CORES-XX': '{0}'.format(maxSlaves)
        })

    inEnv = utils.getTemplateFile('spark-spark-env.sh')
    outEnv = utils.getLocalTempFile('spark-env.sh')

    # spark-env.sh: point both the worker python and the driver python
    # at the link created above, and export the master IP.
    utils.stringReplaceInFile(
        inEnv,
        outEnv,
        {
            'XX-PYSPARK_PYTHON-XX': '"{0}"'.format(pathpython),
            'XX-SPARKMASTER-XX': sparkMaster,
            'XX-PYSPARK_DRIVER_PYTHON-XX': '"{0}"'.format(pathpython),
            'XX-PYTHONPATH-XX': '"{0}"'.format(pythonpath),
            'XX-LOCAL-DIR-XX': localtempdir,
            'XX-MASTER-IP-XX': '"{0}"'.format(cluster.ipGetMaster())
        })

    # Deploy the three rendered files into Spark's conf directory and
    # make sure the event-log directory exists everywhere.
    cluster.rsyncOnAllNodesLocalhostToLocalAsync(
        outSlave, softdir + '/conf/slaves')
    cluster.rsyncOnAllNodesLocalhostToLocalAsync(
        outConf, softdir + '/conf/spark-defaults.conf')
    cluster.rsyncOnAllNodesLocalhostToLocalAsync(
        outEnv, softdir + '/conf/spark-env.sh')
    cluster.runOnAllNodesAsync('mkdir -p /tmp/spark-events')

    # we remove info level display from spark...
    cluster.runOnMaster(
        'sed -i "s/log4j.rootCategory=INFO/log4j.rootCategory=WARN/g" {0}/conf/log4j.properties.template'.format(softdir))
    cluster.runOnMaster(
        'cp {0}/conf/log4j.properties.template {0}/conf/log4j.properties'.format(softdir))

    # --------------------------------
    # handling of connector
    # --------------------------------
    # We install here the hadoop connector for google cloud storage
    # This connector permits writing data on google cs directly
    # from spark

    if useCloudConnector and cluster.getNSlaves() > 0:
        cluster.runOnAllNodesAsync('gsutil cp {0} .'.format(bucketconnector))

        # cluster.runOnAllNodesAsync('wget https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-latest-hadoop2.jar')
        cluster.runOnAllNodesAsync(
            'cp gcs-connector-latest-hadoop2.jar {0}/lib/'.format(softdir))
        # Reuse Hadoop's core-site.xml so Spark picks up the same
        # GCS/project settings.
        cluster.runOnAllNodesAsync(
            'cp {0}/etc/hadoop/core-site.xml {1}/conf/'.format(hadoop.softdir, softdir))

    # ------------------------------------------
    # Deployment of spark overloading scripts
    # ------------------------------------------
    #
    # One problem with spark in standalone mode, is that
    # we have to use the client mode.
    # with the client mode, we cannot use the spark default conf
    # for setting additional jars at launch.
    #
    # We therefore use two scripts, one for spark-submit, one for pyspark
    # for overloading the calls.
    #
    # These scripts tests for the existence of jar variables
    # and make the call accordingly

    inPyspark = utils.getTemplateFile('pyspark-jars')
    cluster.rsyncOnAllNodesLocalhostToLocalAsync(inPyspark, 'pyspark-jars')

    inSubmit = utils.getTemplateFile('spark-submit-jars')
    cluster.rsyncOnAllNodesLocalhostToLocalAsync(inSubmit, 'spark-submit-jars')
Code example #6 (score: 0)
File: cluster.py — Project: totor31/gce-setup-anduse
def getCacheListFile():
    """Return the path of the local cache file holding the instance list."""
    return utils.getLocalTempFile('listInstances.pck')
Code example #7 (score: 0)
File: cluster.py — Project: totor31/gce-setup-anduse
def getStartupFile():
    """Return the path to the local temporary startup script."""
    startupPath = utils.getLocalTempFile('startup.sh')
    return startupPath