def setup(self):
    template = []
    cur_dir = os.path.dirname(__file__)
    yaml = os.path.join(cur_dir, 'cassandra.yaml')
    basedata = ''
    with open(yaml, 'r') as f:
        basedata = f.read()

    def write_yaml(vm):
        # Render the per-node cassandra.yaml (base config + node-specific
        # settings + seed/data directories) and upload it to the node.
        pernodedata = CassandraBase % (self.name, vm.intf_ip('eth0'),
                                       vm.intf_ip('eth0'), self.snitch)
        vm_parts = vm.data_directories()
        data_dirs = ("/data\n - ").join(vm_parts)
        commit_dirs = vm_parts[0]
        if len(vm_parts) > 1:
            data_dirs = ("/data\n - ").join(vm_parts[1:])
        conndata = CassandraTemplate % (','.join(self.seed_ips), data_dirs,
                                        commit_dirs)
        config = "\n".join([basedata, pernodedata, conndata])
        vm.script(
            'sudo cat <<EOT > {0}/conf/cassandra.yaml\n{1}\nEOT'.format(
                CASSANDRA_PATH, config))

    parallel(write_yaml, self.nodes)
def restart_hdfs(self):
    self.master.script('sudo service hadoop-hdfs-namenode restart')
    parallel(lambda vm: vm.script('sudo service hadoop-hdfs-datanode restart'),
             self.workers)
    self.master.script('sudo -u hdfs hdfs dfs -mkdir -p /tmp/hadoop-yarn')
    self.master.script('sudo -u hdfs hdfs dfs -chmod -R 1777 /tmp')
    self.master.script('sudo -u hdfs hdfs dfs -chmod -R 1777 /tmp/hadoop-yarn')
def stop(self):
    def stop_entity(entity):
        if hasattr(entity, 'stop'):
            entity.stop()
        return True

    parallel(stop_entity, self._entities)
def tpch(vms, env):
    tpch_scale = int(env.param('tpch:scale'))
    hive = setup_hive(vms, env)
    hive.master.script(
        tpch_cmd('./tpch-setup.sh {0} >/dev/null 2>&1'.format(tpch_scale)))

    directory = 'tpch-' + hive.master.type + '-' + str(len(vms)) + "-results"
    makedirectory(directory)

    def execute_query(query):
        tpch_run_query(hive.master, query, tpch_scale)

    for iteration in range(1, int(env.param('tpch:runs'))):
        # Drop file caches to be more accurate for amount of reads and writes
        parallel(
            lambda vm: vm.script("sync; echo 3 > /proc/sys/vm/drop_caches"),
            vms)
        argos_start(vms, directory, iteration)
        start = time.time()
        parallel(execute_query, TPCH_QUERIES)
        end = time.time()
        argos_finish(vms, directory, iteration)

        file_name = str(time.time()) + '-' + hive.master.type
        with open(
                os.path.join(directory, str(iteration),
                             hive.master.type + '.time'), 'w+') as f:
            f.write('0,%s' % str(end - start))
def setup(self):
    if not self.cloudera.install('Hadoop'):
        return False
    if not self.cloudera.install('Spark'):
        return False

    def install_hive(vm):
        for package_name in ClouderaHive.HivePackages:
            vm.package_manager.install(package_name)

    parallel(install_hive, self.nodes)

    def setup_mysql():
        vm = self.master
        sqlFile = '/usr/lib/hive/setup-mysql-cloudbench.sql'
        vm.install('mysql')
        vm.script(write_template('hive-mysql', sqlFile))
        vm.script('cat {0} | mysql -u root'.format(sqlFile))

    def setup_hive(vm):
        vm.script(write_template('hive-site',
                                 '/usr/lib/hive/conf/hive-site.xml',
                                 master=self.master.name))

    # Install mysql on the master
    setup_mysql()
    parallel(setup_hive, self.nodes)
    return True
def run_on_testers(self, func):
    # Queue.Queue is thread-safe, so results can be collected safely even
    # when func runs concurrently across the test VMs.
    result = Queue.Queue()
    parallel(lambda vm: result.put(func(vm)), self._test_vms)

    out = []
    while not result.empty():
        out.append(result.get())
    return out
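# A minimal, self-contained sketch (not part of this repo) of the fan-out/
# collect pattern used above: run a function over a list of items on worker
# threads and gather the results through a thread-safe Queue.Queue. The names
# "_parallel_collect" and "items" are illustrative assumptions, not
# cloudbench APIs. (Python 2 module name is Queue; on Python 3 it is queue.)
import Queue
import threading


def _parallel_collect(func, items):
    results = Queue.Queue()

    def worker(item):
        # Queue.put is safe to call from multiple threads at once.
        results.put(func(item))

    threads = [threading.Thread(target=worker, args=(i,)) for i in items]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

    out = []
    while not results.empty():
        out.append(results.get())
    return out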
def monitor_start(vms):
    # Start IO monitor
    # parallel(lambda vm: vm.monitor(), vms)

    # Start Argos
    parallel(lambda vm: vm.script('rm -rf ~/argos/proc'), vms)
    parallel(
        lambda vm: vm.script('cd argos; sudo nohup src/argos >argos.out 2>&1 &'),
        vms)
    time.sleep(2)
def format_hdfs(self):
    remove_hdfs_dir = self.hadoop_user_cmd('"rm -rf {0}/hdfs"'.format(
        self.hdfs_path(self.master)))
    parallel(lambda vm: vm.script(remove_hdfs_dir), self.all_nodes())

    remove_hdfs_dir = self.hadoop_user_cmd('"rm -rf {0}/tmp"'.format(
        self.hdfs_path(self.master)))
    parallel(lambda vm: vm.script(remove_hdfs_dir), self.all_nodes())

    self.master.script(
        self.hadoop_user_cmd('"hdfs namenode -format -force"'))
def tpch(vms, env):
    hive = setup_hive(env, vms)
    hive.master.script(tpch_cmd('./tpch-setup.sh {0}'.format(TPCH_SCALE)))

    def execute_query(num):
        tpch_run_query(hive.master, num, TPCH_SCALE)

    start = time.time()
    parallel(execute_query, TPCH_QUERIES)
    end = time.time()

    print "Total time: %.2f" % (end - start)
def setup_spark(vms, env):
    setup_disks(vms, env)
    setup_base(vms, env)

    ce = Cloudera(vms)
    ce.install('Hadoop')
    ce.install('Spark')

    # Make sure the Spark work directory and /user/spark in HDFS are writable
    parallel(
        lambda vm: vm.script('chown -R ubuntu:ubuntu /var/lib/spark/work'),
        vms)
    parallel(
        lambda vm: vm.script('sudo -u hdfs hdfs dfs -chmod 777 /user/spark'),
        vms)
    return ce['Spark']
def setup_disks(env, vms):
    def setup_vm_disks(vm):
        root = vm.root_disk()
        disks = vm.disks()
        disk_id = 2
        for disk in disks:
            if root.startswith(disk):
                continue
            vm.mount(disk, '/data/%d' % disk_id, force_format=True)
            disk_id += 1

    parallel(setup_vm_disks, vms)
def terasort_with_argos_run(vms, env):
    parallel(lambda vm: vm.install('hadoop'), vms)
    parallel(lambda vm: vm.install('ntp'), vms)
    parallel(lambda vm: vm.install('argos'), vms)
    parallel(lambda vm: vm.install('jq'), vms)

    cluster = HadoopCluster(vms[0], vms[1:],
                            env.param('terasort:use_local_disk') != 'False')
    cluster.setup()
    cluster.reset()

    output = cluster.execute(
        '"/usr/bin/time -f \'%e\' -o terasort.out hadoop jar /usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.1.jar teragen -Dmapred.map.tasks={1} {2} {0}"'.format(
            TERASORT_INPUT, env.param('terasort:mappers'),
            env.param('terasort:rows')))
    teragen_time = cluster.master.script(
        'sudo su - hduser -c "tail -n1 terasort.out"').strip()

    argos_start(vms)
    cluster.execute(
        '"/usr/bin/time -f \'%e\' -o terasort.out hadoop jar /usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.1.jar terasort -Dmapred.reduce.tasks={2} {0} {1} >output.log 2>&1"'.format(
            TERASORT_INPUT, TERASORT_OUTPUT, env.param('terasort:reducers')))
    argos_finish(vms)

    collect_terasort_stats(vms)

    terasort_time = cluster.master.script(
        'sudo su - hduser -c "tail -n1 terasort.out"').strip()
    terasort_out = cluster.master.script(
        'sudo su - hduser -c "cat output.log"').strip()

    file_name = str(time.time()) + '-' + cluster.master.type
    with open(file_name + ".time", 'w+') as f:
        f.write(str(teragen_time) + "," + str(terasort_time))
    with open(file_name + ".out", 'w+') as f:
        f.write(terasort_out)
def create_security_group(self, ep):
    """ Create a security group (an "endpoint" in Microsoft Azure terms) """
    ret = True

    # TODO: Can parallelize here
    def create_endpoint(vm):
        cmd = ['azure', 'vm', 'endpoint', 'create']
        cmd += [self.unique(vm), ep.public_port, ep.private_port]
        # Endpoint name should be at most 15 characters
        cmd += ['--name', self.unique(ep.name)[-15:]]
        cmd += ['--protocol', ep.protocol]
        self.execute(cmd)

    parallel(create_endpoint, ep.virtual_machines())
    return ret
def __init__(self, master_, slaves_, local_disk_=True):
    self.master = master_
    self.slaves_ = slaves_
    self.local_disk_ = local_disk_
    super(HadoopCluster, self).__init__(self.all_nodes(), HADOOP_USER)

    def setup_hdfs_permissions(vm):
        path = self.hdfs_path(vm)
        if 'home' not in path:
            vm.mount('/dev/xvdb', path, 'ext4', True)
        vm.script('chown -R %s:%s %s' % (HADOOP_USER, HADOOP_GROUP, path))
        vm.script("chmod -R 755 %s" % path)

    parallel(setup_hdfs_permissions, self.all_nodes())
def tpch(vms, env):
    hive = setup_hive(vms, env)
    parallel(lambda vm: vm.install('tpch'), vms)
    hive.master.script(tpch_cmd('./tpch-setup.sh {0}'.format(TPCH_SCALE)))

    argos_start(vms)
    start = time.time()
    for query in TPCH_QUERIES:
        tpch_run_query(hive.master, query, TPCH_SCALE)
    end = time.time()
    argos_finish(vms)

    file_name = str(time.time()) + '-' + hive.master.type
    with open(file_name + '.time', 'w+') as f:
        f.write(str(end - start))
def setup_disks(vms, env):
    def setup_vm_disks(vm):
        vm.script('rm -rf /data/1/')
        root = vm.root_disk()
        disks = vm.disks()
        disk_id = 2
        if len(disks) == 0 or vm.type == 'i2.8xlarge':
            disks = vm.local_disks_except_root()
        for disk in disks:
            if root.startswith(disk):
                continue
            vm.mount(disk, '/data/%d' % disk_id, force_format=True)
            disk_id += 1

    parallel(setup_vm_disks, vms)
def setup_directories(self):
    def create_yarn_dfs_folders(vm):
        if len(vm.data_directories()) == 0:
            vm.script("mkdir -p /data/1/")
        for dd in vm.data_directories():
            vm.script("rm -r {base}/yarn".format(base=dd))
            vm.script("rm -r {base}/dfs".format(base=dd))
            vm.script("mkdir -p {base}/yarn/logs".format(base=dd))
            vm.script("mkdir -p {base}/yarn/local".format(base=dd))
            vm.script("chown -R yarn:yarn {base}/yarn".format(base=dd))
            vm.script("mkdir -p {base}/dfs/nn".format(base=dd))
            vm.script("mkdir -p {base}/dfs/dn".format(base=dd))
            vm.script("chown -R hdfs:hdfs {base}/dfs".format(base=dd))

    parallel(create_yarn_dfs_folders, self.nodes)
def traverse_dag(self, check, execute, direction='dependencies'):
    """ Traverse the DAG from the leaves upward to the root, making sure all
    the dependencies of a node have executed the "execute" function before
    the node itself is executed """

    def satisfied(ent):
        """ Returns True if the requirements of an entity are satisfied """
        for dep in getattr(ent, direction):
            deps = getattr(ent, dep)()
            if not deps:
                continue
            if not isinstance(deps, list):
                deps = [deps]
            # If any of the dependencies are not satisfied, return False
            if any(map(lambda x: not check(x), deps)):
                return False
        return True

    # Collect all entities
    everything = set()
    for ent in self.entities().values():
        everything = everything.union(set(ent.values()))

    while everything:
        to_remove = set()
        to_execute = set()
        lock = RLock()

        def satisfy(x):
            if satisfied(x):
                if not check(x):
                    with lock:
                        to_execute.add(x)
                else:
                    with lock:
                        to_remove.add(x)

        parallel(satisfy, everything)
        parallel(lambda x: execute(x), to_execute)
        to_remove = to_remove.union(
            set(filter(lambda x: check(x), to_execute)))
        everything = everything - to_remove
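# A minimal, self-contained sketch (not part of this repo) of the layered
# traversal that traverse_dag performs: on each pass, run every node whose
# dependencies have already completed, then repeat until nothing remains.
# The toy graph and the helper name "layered_execute" are illustrative
# assumptions only.
def layered_execute(dependencies, execute):
    """dependencies maps each node to the set of nodes it depends on."""
    done = set()
    remaining = set(dependencies)
    while remaining:
        # Nodes whose dependencies have all completed are ready to run.
        ready = {n for n in remaining if dependencies[n] <= done}
        if not ready:
            raise ValueError('cycle detected in dependency graph')
        for node in ready:
            execute(node)
        done |= ready
        remaining -= ready


# Example: 'app' depends on 'db' and 'cache', both of which depend on 'vm'.
order = []
layered_execute(
    {'vm': set(), 'db': {'vm'}, 'cache': {'vm'}, 'app': {'db', 'cache'}},
    order.append)
# order is now, e.g., ['vm', 'db', 'cache', 'app'] (db/cache order may vary)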
def run_ycsb(vms, env, cluster, workload, record_count, operation_count):
    lock = RLock()
    insert_start = [0]
    insert_count = [record_count / len(vms)]
    op_count = [operation_count / len(vms)]

    def run_workload(vm):
        start = 0
        with lock:
            start = insert_start[0]
            insert_start[0] += insert_count[0]

        cmd = "./bin/ycsb run cassandra2-cql -P workloads/workload{3} -p hosts='{0}' -p recordcount={1} -p operationcount={2} -p insertstart={4} -p insertcount={5} -s -threads 1000 >~/run.log 2>&1"
        cmd = cmd.format(','.join(cluster.node_ip_list()), record_count,
                         op_count[0], workload, start, insert_count[0])
        vm.script("cd {0} && {1}".format(YCSB_PATH, cmd))

    parallel(run_workload, vms)
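# A small standalone illustration (assumed numbers, not from the benchmark
# config) of how the insert range above is partitioned: with Python 2 integer
# division, 4 client VMs and recordcount=1000000 give each VM
# insertcount=250000 and insertstart values 0, 250000, 500000 and 750000, so
# the clients load disjoint key ranges in parallel.
def split_insert_ranges(record_count, num_vms):
    per_vm = record_count / num_vms  # integer division under Python 2
    return [(i * per_vm, per_vm) for i in range(num_vms)]


# split_insert_ranges(1000000, 4)
# -> [(0, 250000), (250000, 250000), (500000, 250000), (750000, 250000)]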
def setup_hive(vms, env):
    parallel(lambda vm: vm.install('hadoop'), vms)
    parallel(lambda vm: vm.install('hive'), vms)
    parallel(lambda vm: vm.install('mahout'), vms)
    parallel(lambda vm: vm.install('bigbench'), vms)
    parallel(lambda vm: vm.install('argos'), vms)
    vms[0].install('bigbench')

    hadoop = HadoopCluster(vms[0], vms[1:],
                           env.param('terasort:use_local_disk'))
    hadoop.setup()
    hadoop.reset()

    hive = HiveCluster(hadoop)
    hive.setup()
    return hive
def terasort(vms, env):
    hadoop = setup_hadoop(env, vms)
    print "Master is: %s" % hadoop.master.name

    directory = 'terasort-' + hadoop.master.type + '-' + str(
        len(vms)) + "-results"
    makedirectory(directory)
    iteration = str(1)

    extra_teragen_params = "-Ddfs.blocksize=512M -Dmapreduce.task.io.sort.mb=256"

    hadoop.master.execute("sudo service hadoop-hdfs-namenode restart")
    hadoop.master.execute("sudo service hadoop-hdfs-datanode restart")
    hadoop.master.execute("sudo service hadoop-yarn-resourcemanager restart")

    # teragen: roughly 3.2 map tasks per core across the cluster (4 * cores * 0.8)
    mapper_count = int(4 * int(sum(map(lambda vm: vm.cpus(), vms))) * 0.8)
    hadoop.execute(
        'sudo -u hdfs hadoop jar /usr/lib/hadoop-0.20-mapreduce/hadoop-examples-2.6.0-mr1-cdh5*.jar teragen {2} -D mapred.map.tasks={0} {1} /terasort-input'
        .format(mapper_count, env.param('terasort:rows'),
                extra_teragen_params))

    # Drop file caches to be more accurate for amount of reads and writes
    parallel(lambda vm: vm.script("sync; echo 3 > /proc/sys/vm/drop_caches"),
             vms)

    # terasort: use roughly 80% of the cluster's cores for reduce tasks
    reducer_count = int(sum(map(lambda vm: vm.cpus(), vms)) * 0.8)
    extra_terasort_params = "-Ddfs.blocksize=512M -Dmapreduce.task.io.sort.factor=100 -Dmapreduce.task.io.sort.mb=384 -Dio.file.buffer.size=131072"

    monitor_start(vms)
    hadoop.execute(
        '/usr/bin/time -f \'%e\' -o terasort.out sudo -u hdfs hadoop jar /usr/lib/hadoop-0.20-mapreduce/hadoop-examples-2.6.0-mr1-cdh5*.jar terasort {1} -D mapred.reduce.tasks={0} /terasort-input /terasort-output >output.log 2>&1'
        .format(str(reducer_count), extra_terasort_params))
    monitor_finish(vms, directory, iteration)

    terasort_time = hadoop.master.script('tail -n1 terasort.out').strip()
    terasort_out = hadoop.master.script('cat output.log').strip()

    file_name = hadoop.master.type
    with open(os.path.join(directory, str(iteration), file_name + ".time"),
              'w+') as f:
        f.write("0," + str(terasort_time))
    with open(os.path.join(directory, str(iteration), file_name + ".out"),
              'w+') as f:
        f.write(terasort_out)
def setup_spark_perf(env, vms):
    path = Config.path('tools', 'spark-perf.tar.gz')
    parallel(lambda vm: vm.send(path, '/home/ubuntu'), vms)
    parallel(lambda vm: vm.script('rm -rf /home/ubuntu/spark-perf'), vms)
    parallel(lambda vm: vm.script('tar -xzf spark-perf.tar.gz'), vms)

    num_cores = len(vms) * vms[0].cpus()

    def replace_line(vm):
        vm.script(
            "cd spark-perf; sed -i '/OptionSet(\"num-partitions\", \[128\], can_scale=True),/c\ OptionSet(\"num-partitions\", [%d], can_scale=False),' config/config.py"
            % num_cores)

    parallel(replace_line, vms)
def setup_core_site(self):
    config = """
    <property>
      <name>hadoop.tmp.dir</name>
      <value>file://{0}/tmp</value>
      <description>Temporary Directory.</description>
    </property>

    <property>
      <name>fs.defaultFS</name>
      <value>hdfs://{1}:54310</value>
      <description>Use HDFS as file storage engine</description>
    </property>
    """
    config = CoreSiteTemplate.format(
        config.format(self.hdfs_path(self.master), self.master.name))
    command = modify_hadoop_config(config, '/etc/hadoop/core-site.xml')

    # Upload the file in parallel
    parallel(lambda node: node.script(command), self.all_nodes())
def start_entities(self, entities):
    """ Start the entities for a job """
    dead_entities = set()
    lock = RLock()

    def entity_up(entity):
        if not isinstance(entity, Preemptable):
            return True
        entity.start()
        entity.wait(180)
        if entity.stale:
            with lock:
                dead_entities.add(entity)

    parallel(entity_up, entities)

    if len(dead_entities) > 0:
        self.add_dead_entities(dead_entities)
        return False
    return True
def setup(self):
    hadoop = self.cloudera.install('Hadoop')
    if not hadoop:
        return False

    def install_spark(vm):
        for package_name in ClouderaSpark.SparkPackages:
            vm.package_manager.install(package_name)

    parallel(install_spark, self.nodes)

    hadoop.execute('sudo -u hdfs hdfs dfs -mkdir -p /user/spark')
    hadoop.execute('sudo -u hdfs hdfs dfs -mkdir -p /user/spark/share/lib')
    hadoop.execute('sudo -u hdfs hdfs dfs -mkdir -p /user/spark/applicationHistory')
    hadoop.execute('sudo -u hdfs hdfs dfs -chown -R spark:spark /user/spark')
    hadoop.execute('sudo -u hdfs hdfs dfs -chmod 1777 /user/spark/applicationHistory')

    per_node_cpu = self.master.cpus()
    cluster_cpu = per_node_cpu * len(self.nodes)
    # Node memory in MB (assuming memory() reports bytes), minus 1 GB headroom
    total_memory = int(self.master.memory() / (1024 * 1024)) - 1024
    # executor_memory = int(total_memory/(per_node_cpu*1024))
    # executor_count = cluster_cpu
    # executor_cores = 1

    # One single-core executor per core in the cluster; each executor gets
    # half of its per-core share of the node's memory.
    executor_count = cluster_cpu
    executor_cores = 1
    executor_memory = int(
        math.ceil(total_memory * executor_cores / per_node_cpu) * 0.5)
    # int(math.ceil((total_memory - 5.0*1024/len(self.nodes) - 1024)*0.90))

    self.master.script(write_template('spark-defaults.conf',
                                      '/etc/spark/conf/spark-defaults.conf',
                                      master=self.master.name,
                                      instances=executor_count,
                                      cores=executor_cores,
                                      memory=(str(executor_memory) + 'm')))
    self.master.script('sudo service spark-history-server restart')
    return True
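# A worked sizing example under assumed hardware (16 cores and 64 GB of RAM
# per node, 4 nodes). None of these numbers come from the repo; they only
# illustrate the executor arithmetic in the setup() above.
import math

per_node_cpu = 16
node_count = 4
cluster_cpu = per_node_cpu * node_count      # 64 single-core executors
total_memory = 64 * 1024 - 1024              # 64512 MB after 1 GB headroom
executor_cores = 1
executor_memory = int(
    math.ceil(total_memory * executor_cores / per_node_cpu) * 0.5)
# 64512 / 16 = 4032 MB per core, halved -> executor_memory == 2016 (MB)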
def setup_hdfs_site(self):
    dirs = ["{0}/hdfs/datanode", "{0}/hdfs/namenode"]

    def create_hdfs_dirs(vm):
        for d in map(lambda x: x.format(self.hdfs_path(vm)), dirs):
            vm.script('sudo su - {0} -c "mkdir -p {1}"'.format(
                HADOOP_USER, d))

    parallel(create_hdfs_dirs, self.all_nodes())

    config = """
    <property>
      <name>dfs.replication</name>
      <value>1</value>
      <description>Default block replication.
        The actual number of replications can be specified when the file is
        created. The default is used if replication is not specified at
        create time.
      </description>
    </property>

    <property>
      <name>dfs.namenode.name.dir</name>
      <value>{0}/hdfs/namenode</value>
      <description>Determines where on the local filesystem the DFS name node
        should store the name table (fsimage). If this is a comma-delimited
        list of directories then the name table is replicated in all of the
        directories, for redundancy.
      </description>
    </property>

    <property>
      <name>dfs.datanode.data.dir</name>
      <value>{0}/hdfs/datanode</value>
      <description>Determines where on the local filesystem a DFS data node
        should store its blocks. If this is a comma-delimited list of
        directories, then data will be stored in all named directories,
        typically on different devices. Directories that do not exist are
        ignored.
      </description>
    </property>
    """
    config = HdfsSiteTemplate.format(
        config.format(self.hdfs_path(self.master)))
    command = modify_hadoop_config(config, '/etc/hadoop/hdfs-site.xml')
    parallel(lambda vm: vm.script(command), self.all_nodes())