import os
import time

# ssh, ssh_read, ssh_write, deploy_files, and setup_spark_cluster are helpers
# from spark_ec2.py (shipped with Spark under $SPARK_HOME/ec2) and are assumed
# to be importable here.


def setup_cluster(conn, master_nodes, slave_nodes, opts, deploy_ssh_key):
    """Modified version of the setup_cluster function (borrowed from
    spark_ec2.py) in order to manually set the folder with the deploy code"""
    master = master_nodes[0].public_dns_name
    if deploy_ssh_key:
        print "Generating cluster's SSH key on master..."
        key_setup = """
          [ -f ~/.ssh/id_rsa ] ||
            (ssh-keygen -q -t rsa -N '' -f ~/.ssh/id_rsa &&
             cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys)
        """
        ssh(master, opts, key_setup)
        dot_ssh_tar = ssh_read(master, opts, ["tar", "c", ".ssh"])
        print "Transferring cluster's SSH key to slaves..."
        for slave in slave_nodes:
            print slave.public_dns_name
            ssh_write(slave.public_dns_name, opts, ["tar", "x"], dot_ssh_tar)

    modules = ["spark", "shark", "ephemeral-hdfs", "persistent-hdfs",
               "mapreduce", "spark-standalone", "tachyon"]

    if opts.hadoop_major_version == "1":
        modules = filter(lambda x: x != "mapreduce", modules)

    if opts.ganglia:
        modules.append("ganglia")

    ssh(master, opts, "rm -rf spark-ec2 && git clone https://github.com/mesos/spark-ec2.git -b v3")

    print "Deploying files to master..."
    deploy_folder = os.path.join(os.environ["SPARK_HOME"], "ec2", "deploy.generic")
    deploy_files(conn, deploy_folder, opts, master_nodes, slave_nodes, modules)

    print "Running setup on master..."
    setup_spark_cluster(master, opts)
    print "Done!"


def load_data(master, opts):
    """ Load an example data set into a Spark EC2 cluster"""
    print "Transferring example data to the cluster..."
    ssh(master, opts, "/root/ephemeral-hdfs/bin/start-all.sh")
    time.sleep(10)
    (s3_access_key, s3_secret_key) = get_s3_keys()
    ssh(master, opts, "/root/ephemeral-hdfs/bin/hadoop distcp "
        "s3n://" + s3_access_key + ":" + s3_secret_key +
        "@thunder.datasets/test/iris.txt hdfs:///data")
    print "Done!"
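

# NOTE: load_data above relies on a get_s3_keys helper that is not shown in
# this file. A minimal sketch, assuming the credentials come from the standard
# AWS environment variables (the real helper may read them from elsewhere):
def get_s3_keys():
    """Hypothetical sketch: read AWS credentials from the environment."""
    if "AWS_ACCESS_KEY_ID" in os.environ and "AWS_SECRET_ACCESS_KEY" in os.environ:
        return os.environ["AWS_ACCESS_KEY_ID"], os.environ["AWS_SECRET_ACCESS_KEY"]
    raise RuntimeError("please set AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY")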


def setup_cluster(conn, master_nodes, slave_nodes, opts, deploy_ssh_key):
    """Modified version of the setup_cluster function (borrowed from
    spark_ec2.py) in order to manually set the folder with the deploy code"""
    master = master_nodes[0].public_dns_name
    if deploy_ssh_key:
        print "Generating cluster's SSH key on master..."
        key_setup = """
          [ -f ~/.ssh/id_rsa ] ||
            (ssh-keygen -q -t rsa -N '' -f ~/.ssh/id_rsa &&
             cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys)
        """
        ssh(master, opts, key_setup)
        dot_ssh_tar = ssh_read(master, opts, ['tar', 'c', '.ssh'])
        print "Transferring cluster's SSH key to slaves..."
        for slave in slave_nodes:
            print slave.public_dns_name
            ssh_write(slave.public_dns_name, opts, ['tar', 'x'], dot_ssh_tar)

    modules = ['spark', 'shark', 'ephemeral-hdfs', 'persistent-hdfs',
               'mapreduce', 'spark-standalone', 'tachyon']

    if opts.hadoop_major_version == "1":
        modules = filter(lambda x: x != "mapreduce", modules)

    if opts.ganglia:
        modules.append('ganglia')

    ssh(master, opts, "rm -rf spark-ec2 && git clone https://github.com/mesos/spark-ec2.git -b v2")

    print "Deploying files to master..."
    deploy_folder = os.path.join(os.environ['SPARK_HOME'], "ec2", "deploy.generic")
    deploy_files(conn, deploy_folder, opts, master_nodes, slave_nodes, modules)

    print "Running setup on master..."
    setup_spark_cluster(master, opts)
    print "Done!"


def install_thunder(master, opts):
    """ Install Thunder and dependencies on a Spark EC2 cluster"""
    print "Installing Thunder on the cluster..."
    ssh(master, opts, "git clone https://github.com/freeman-lab/thunder.git")
    ssh(master, opts, "chmod u+x thunder/helper/ec2/setup.sh")
    ssh(master, opts, "thunder/helper/ec2/setup.sh")
    print "Done!"


def load_data(master, opts):
    """ Load an example data set into a Spark EC2 cluster

    TODO: replace with URL once we've hosted public data
    """
    print "Transferring example data to the cluster..."
    ssh(master, opts, "/root/ephemeral-hdfs/bin/stop-all.sh")
    ssh(master, opts, "/root/ephemeral-hdfs/bin/start-all.sh")
    time.sleep(10)
    ssh(master, opts, "/root/ephemeral-hdfs/bin/hadoop distcp "
        "s3n://thunder.datasets/test/iris.txt hdfs:///data")
    print "\n\n"
    print "-------------------------------"
    print "Example data successfully loaded!"
    print "-------------------------------"
    print "\n"


def install_thunder(master, opts):
    """ Install Thunder and dependencies on a Spark EC2 cluster"""
    print "Installing Thunder on the cluster..."
    ssh(master, opts, "rm -rf thunder && git clone https://github.com/freeman-lab/thunder.git")
    ssh(master, opts, "chmod u+x thunder/python/bin/build")
    ssh(master, opts, "thunder/python/bin/build")
    ssh(master, opts, "source ~/.bash_profile && pip install mpld3")
    ssh(master, opts, "echo 'export SPARK_HOME=/root/spark' >> /root/.bash_profile")
    ssh(master, opts, "echo 'export PYTHONPATH=/root/thunder/python' >> /root/.bash_profile")
    ssh(master, opts, "echo 'export IPYTHON=1' >> /root/.bash_profile")
    ssh(master, opts, "echo 'export PATH=/root/thunder/python/bin:$PATH' >> /root/.bash_profile")
    print "\n\n"
    print "-------------------------------"
    print "Thunder successfully installed!"
    print "-------------------------------"
    print "\n"


def install_thunder(master, opts):
    """ Install Thunder and dependencies on a Spark EC2 cluster"""
    print "Installing Thunder on the cluster..."

    # download and build thunder
    ssh(master, opts, "rm -rf thunder && git clone https://github.com/freeman-lab/thunder.git")
    ssh(master, opts, "chmod u+x thunder/python/bin/build")
    ssh(master, opts, "thunder/python/bin/build")

    # copy local data examples to all workers
    ssh(master, opts, "yum install -y pssh")
    ssh(master, opts, "pssh -h /root/spark-ec2/slaves mkdir -p /root/thunder/python/thunder/utils/data/")
    ssh(master, opts, "~/spark-ec2/copy-dir /root/thunder/python/thunder/utils/data/")

    # install pip
    ssh(master, opts, "wget http://pypi.python.org/packages/source/p/pip/pip-1.1.tar.gz"
        "#md5=62a9f08dd5dc69d76734568a6c040508")
    ssh(master, opts, "tar -xvf pip*.gz")
    ssh(master, opts, "cd pip* && sudo python setup.py install")

    # install libraries
    ssh(master, opts, "source ~/.bash_profile && pip install mpld3 && pip install seaborn "
        "&& pip install jinja2 && pip install -U scikit-learn")

    # install ipython 1.1
    ssh(master, opts, "pip uninstall -y ipython")
    ssh(master, opts, "git clone https://github.com/ipython/ipython.git")
    ssh(master, opts, "cd ipython && git checkout tags/rel-1.1.0")
    ssh(master, opts, "cd ipython && sudo python setup.py install")

    # set environment variables
    ssh(master, opts, "echo 'export SPARK_HOME=/root/spark' >> /root/.bash_profile")
    ssh(master, opts, "echo 'export PYTHONPATH=/root/thunder/python' >> /root/.bash_profile")
    ssh(master, opts, "echo 'export IPYTHON=1' >> /root/.bash_profile")
    ssh(master, opts, "echo 'export PATH=/root/thunder/python/bin:$PATH' >> /root/.bash_profile")

    # customize spark configuration parameters
    ssh(master, opts, "echo 'spark.akka.frameSize=10000' >> /root/spark/conf/spark-defaults.conf")
    ssh(master, opts, "echo 'spark.kryoserializer.buffer.max.mb=1024' >> /root/spark/conf/spark-defaults.conf")
    ssh(master, opts, "echo 'export SPARK_DRIVER_MEMORY=20g' >> /root/spark/conf/spark-env.sh")

    # add AWS credentials to core-site.xml
    configstring = "<property><name>fs.s3n.awsAccessKeyId</name><value>ACCESS</value></property><property>" \
                   "<name>fs.s3n.awsSecretAccessKey</name><value>SECRET</value></property>"
    access, secret = get_s3_keys()
    filled = configstring.replace('ACCESS', access).replace('SECRET', secret)
    ssh(master, opts, "sed -i'f' 's,.*</configuration>.*,"+filled+"&,' /root/ephemeral-hdfs/conf/core-site.xml")

    # add AWS credentials to ~/.boto
    credentialstring = "[Credentials]\naws_access_key_id = ACCESS\naws_secret_access_key = SECRET\n"
    credentialsfilled = credentialstring.replace('ACCESS', access).replace('SECRET', secret)
    ssh(master, opts, "printf '"+credentialsfilled+"' > /root/.boto")
    ssh(master, opts, "pscp.pssh -h /root/spark-ec2/slaves /root/.boto /root/.boto")

    # configure requester pays
    ssh(master, opts, "touch /root/spark/conf/jets3t.properties")
    ssh(master, opts, "echo 'httpclient.requester-pays-buckets-enabled = true' >> /root/spark/conf/jets3t.properties")
    ssh(master, opts, "~/spark-ec2/copy-dir /root/spark/conf")

    print "\n\n"
    print "-------------------------------"
    print "Thunder successfully installed!"
    print "-------------------------------"
    print "\n"
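

# The sed command above splices the filled-in property block in front of the
# closing </configuration> tag of core-site.xml ('&' in sed stands for the
# whole matched line). A small self-contained illustration of that
# substitution, using obviously fake placeholder keys rather than real
# credentials:
def _demo_credential_injection():
    configstring = ("<property><name>fs.s3n.awsAccessKeyId</name><value>ACCESS</value></property><property>"
                    "<name>fs.s3n.awsSecretAccessKey</name><value>SECRET</value></property>")
    filled = configstring.replace('ACCESS', 'AKIAFAKEKEYID').replace('SECRET', 'fakeSecretKey')
    core_site = "<configuration>\n  <!-- existing properties -->\n</configuration>"
    # mirror sed 's,.*</configuration>.*,FILLED&,': prepend the block to the closing tag
    print core_site.replace("</configuration>", filled + "</configuration>")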


def install_thunder(master, opts):
    """ Install Thunder and dependencies on a Spark EC2 cluster"""
    print "Installing Thunder on the cluster..."
    ssh(master, opts, "rm -rf thunder && git clone https://github.com/freeman-lab/thunder.git")
    ssh(master, opts, "chmod u+x thunder/python/bin/build")
    ssh(master, opts, "thunder/python/bin/build")
    ssh(master, opts, "source ~/.bash_profile && pip install mpld3 && pip install seaborn")
    ssh(master, opts, "rm /root/pyspark_notebook_example.ipynb")
    ssh(master, opts, "echo 'export SPARK_HOME=/root/spark' >> /root/.bash_profile")
    ssh(master, opts, "echo 'export PYTHONPATH=/root/thunder/python' >> /root/.bash_profile")
    ssh(master, opts, "echo 'export IPYTHON=1' >> /root/.bash_profile")
    ssh(master, opts, "echo 'export PATH=/root/thunder/python/bin:$PATH' >> /root/.bash_profile")
    configstring = "<property><name>fs.s3n.awsAccessKeyId</name><value>ACCESS</value></property><property>" \
                   "<name>fs.s3n.awsSecretAccessKey</name><value>SECRET</value></property>"
    access, secret = get_s3_keys()
    filled = configstring.replace('ACCESS', access).replace('SECRET', secret)
    ssh(master, opts, "sed -i'f' 's,.*</configuration>.*,"+filled+"&,' /root/ephemeral-hdfs/conf/core-site.xml")
    print "\n\n"
    print "-------------------------------"
    print "Thunder successfully installed!"
    print "-------------------------------"
    print "\n"


def install_thunder(master, opts):
    """ Install Thunder and dependencies on a Spark EC2 cluster"""
    print "Installing Thunder on the cluster..."
    ssh(master, opts, "rm -rf thunder && git clone https://github.com/freeman-lab/thunder.git")
    ssh(master, opts, "chmod u+x thunder/python/bin/build")
    ssh(master, opts, "thunder/python/bin/build")
    ssh(master, opts, "source ~/.bash_profile && pip install mpld3")
    ssh(master, opts, "rm /root/pyspark_notebook_example.ipynb")
    ssh(master, opts, "echo 'export SPARK_HOME=/root/spark' >> /root/.bash_profile")
    ssh(master, opts, "echo 'export PYTHONPATH=/root/thunder/python' >> /root/.bash_profile")
    ssh(master, opts, "echo 'export IPYTHON=1' >> /root/.bash_profile")
    ssh(master, opts, "echo 'export PATH=/root/thunder/python/bin:$PATH' >> /root/.bash_profile")
    configstring = "<property><name>fs.s3n.awsAccessKeyId</name><value>ACCESS</value></property><property>" \
                   "<name>fs.s3n.awsSecretAccessKey</name><value>SECRET</value></property>"
    access, secret = get_s3_keys()
    filled = configstring.replace('ACCESS', access).replace('SECRET', secret)
    ssh(master, opts, "sed -i'f' 's,.*</configuration>.*,"+filled+"&,' /root/ephemeral-hdfs/conf/core-site.xml")
    print "\n\n"
    print "-------------------------------"
    print "Thunder successfully installed!"
    print "-------------------------------"
    print "\n"
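

# A hypothetical sketch of how these helpers might be wired together once a
# cluster is up. The boto connection, opts, and node lists would normally come
# from spark_ec2.py's option parsing and launch_cluster; the names below are
# assumptions, not part of this file:
#
#     conn = boto.ec2.connect_to_region(opts.region)
#     (master_nodes, slave_nodes) = launch_cluster(conn, opts, cluster_name)
#     setup_cluster(conn, master_nodes, slave_nodes, opts, deploy_ssh_key=True)
#     master = master_nodes[0].public_dns_name
#     install_thunder(master, opts)
#     load_data(master, opts)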