base_num_map_tasks = num_machines * cores_per_machine
base_num_reduce_tasks = num_machines * cores_per_machine
num_tasks_multiplier_values = [1, 2, 4, 8, 16, 32]
longs_per_value = 6
num_shuffles = 6
sortByKey = False
cacheRdd = False

slaves = [slave_line.strip("\n")
  for slave_line in open("/root/spark/conf/slaves").readlines()]
print "Running experiment assuming slaves %s" % slaves

for items_per_partition in items_per_partition_values:
  for num_tasks_multiplier in num_tasks_multiplier_values:
    num_reduce_tasks = num_tasks_multiplier * base_num_reduce_tasks
    num_map_tasks = num_tasks_multiplier * base_num_map_tasks

    print "*************Running experiment with {} shuffle values".format(
      items_per_partition)
    parameters = [num_map_tasks, num_reduce_tasks, items_per_partition,
      longs_per_value, num_shuffles, sortByKey, cacheRdd]
    stringified_parameters = ["{}".format(p) for p in parameters]
    command = "/root/spark/bin/run-example monotasks.ShuffleJob {}".format(
      " ".join(stringified_parameters))
    print command
    subprocess.check_call(command, shell=True)

    utils.copy_and_zip_all_logs(stringified_parameters, slaves)

end = time.time()
completion = (end - start)
print "Shuffle script took " + str(completion)
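# The helper utils.copy_and_zip_all_logs is defined elsewhere in this repository and
# its implementation is not shown here. The sketch below is a hypothetical, minimal
# version of such a helper, assuming the Spark logs live in /root/spark/logs on each
# slave and that passwordless scp is configured (both are assumptions, not details
# confirmed by these scripts).
import subprocess

def copy_and_zip_all_logs_sketch(stringified_parameters, slaves):
  """Hypothetical: gather each slave's Spark logs and archive them locally."""
  log_dir = "experiment_logs_{}".format("_".join(stringified_parameters))
  subprocess.check_call("mkdir -p {}".format(log_dir), shell=True)
  for slave in slaves:
    # Copy the slave's log directory into a per-slave subdirectory.
    copy_command = "scp -r root@{}:/root/spark/logs {}/{}".format(slave, log_dir, slave)
    subprocess.check_call(copy_command, shell=True)
  # Compress everything into a single archive named after the experiment parameters.
  subprocess.check_call("tar czf {0}.tar.gz {0}".format(log_dir), shell=True)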
cols_per_block = 4096
# This is the number of times the shuffle stage will happen. To use less memory,
# reduce this number, and instead increase num_repeats (there's one RDD stored in
# memory for each shuffle block).
num_col_blocks = 5
# The number of times to repeat the whole computation.
num_repeats = 1

for total_rows in total_rows_values:
  for num_row_blocks in num_row_blocks_values:
    rows_per_block = int(total_rows / num_row_blocks)
    parameters = [
      rows_per_block, num_row_blocks, cols_per_block, num_col_blocks, num_repeats]
    stringified_parameters = [str(p) for p in parameters]
    master_url = "spark://{}:7077".format(MASTER_HOSTNAME)
    command = (
      "/root/spark/bin/spark-submit --class " +
      "edu.berkeley.cs.amplab.mlmatrix.BlockCoordinateDescent --driver-memory 20G " +
      "--driver-class-path /root/ml-matrix/target/scala-2.10/mlmatrix-assembly-0.1.jar " +
      "/root/ml-matrix/target/scala-2.10/mlmatrix-assembly-0.1.jar " +
      "{} {}".format(master_url, " ".join(stringified_parameters)))
    print "Running job with command: " + command
    subprocess.check_call(command, shell=True)

    # Copy the logs back.
    utils.copy_and_zip_all_logs(stringified_parameters, workers)
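# A rough back-of-the-envelope sizing check (not part of the original script): the
# matrix handled by BlockCoordinateDescent has total_rows rows and
# cols_per_block * num_col_blocks columns, so the memory note above can be
# sanity-checked before launching a job. The function name and the sample row count
# below are illustrative assumptions.
def estimate_matrix_size_gb(total_rows, cols_per_block, num_col_blocks):
  # Assume 8-byte doubles for every matrix entry.
  total_cols = cols_per_block * num_col_blocks
  return total_rows * total_cols * 8 / float(1024 ** 3)

print "Estimated dense matrix size: %.1f GB" % estimate_matrix_size_gb(
  100000, 4096, 5)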
for num_threads_per_disk in num_threads_per_disk_values:
  # Change the number of threads per disk by resetting the Spark config.
  change_num_threads_command = (
    "sed -i \"s/spark\.monotasks\.threadsPerDisk .*/" +
    "spark.monotasks.threadsPerDisk {}/\" {}".format(
      num_threads_per_disk, spark_defaults_filepath))
  print "Changing the number of threads per disk using command: {}".format(
    change_num_threads_command)
  subprocess.check_call(change_num_threads_command, shell=True)

  # For consistency, clear the buffer cache before each experiment.
  print "Clearing the OS buffer cache using command: {}".format(
    clear_slave_cache_command)
  subprocess.check_call(clear_slave_cache_command, shell=True)

  subprocess.check_call(start_all_command, shell=True)

  parameters = [num_partitions, items_per_partition, longs_per_item, num_iterations]
  stringified_parameters = [str(p) for p in parameters]
  experiment_command = (
    "{} monotasks.disk.DiskThroughputExperiment {}".format(
      run_example_command, " ".join(stringified_parameters)))
  print "Running experiment using command: {}".format(experiment_command)
  subprocess.check_call(experiment_command, shell=True)

  # Stop Spark in order to finalize the logs.
  subprocess.check_call(stop_all_command, shell=True)

  utils.copy_and_zip_all_logs(
    stringified_parameters + [str(num_threads_per_disk)], workers)
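# Both this script and the one that follows rewrite a single key in a Spark config
# file with sed before each run. A small hypothetical helper capturing that pattern
# is sketched below (the function name and example call are illustrative, not part
# of the original scripts; note that dots in the key are treated as regex wildcards
# by sed, which is harmless here).
import subprocess

def set_spark_config_value(config_filepath, key, value):
  """Replace 'key <anything>' with 'key <value>' in the given config file."""
  sed_command = "sed -i \"s|{0} .*|{0} {1}|\" {2}".format(key, value, config_filepath)
  print "Updating config using command: {}".format(sed_command)
  subprocess.check_call(sed_command, shell=True)

# Example usage (values are assumptions):
#   set_spark_config_value(spark_defaults_filepath, "spark.monotasks.threadsPerDisk", 4)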
relative_path="spark/conf/spark-defaults.conf") run_example_command = utils.get_full_path( relative_path="spark/bin/run-example") run_on_slaves_command = utils.get_full_path( relative_path="ephemeral-hdfs/sbin/slaves.sh") for max_concurrent_tasks in max_concurrent_tasks_values: # Change the maximum number of concurrent tasks by resetting the Spark config. change_max_concurrent_tasks_command = ( "sed -i \"s/maxConcurrentTasks .*/" + "maxConcurrentTasks {}/\" {}".format(max_concurrent_tasks, spark_defaults_filepath)) print "Changing the maximum number of concurrent tasks using command: {}".format( change_max_concurrent_tasks_command) subprocess.check_call(change_max_concurrent_tasks_command, shell=True) total_num_items = target_total_data_gb / (4.9 + values_per_key * 1.92) * ( 64 * 4000000) items_per_task = int(total_num_items / num_tasks) parameters = [ num_tasks, num_tasks, items_per_task, values_per_key, num_shuffles ] stringified_parameters = [str(p) for p in parameters] experiment_command = ("{} monotasks.MemorySortJob {}".format( run_example_command, " ".join(stringified_parameters))) print "Running experiment using command: %s" % experiment_command subprocess.check_call(experiment_command, shell=True) utils.copy_and_zip_all_logs( stringified_parameters + [str(max_concurrent_tasks)], workers)
"sed -i s/SPARK_WORKER_CORES=.*/SPARK_WORKER_CORES=" + "{}/ spark/conf/spark-env.sh".format(num_concurrent_tasks)) print "Changing the number of Spark cores using command ", change_cores_command subprocess.check_call(change_cores_command, shell=True) copy_config_command = "/root/spark-ec2/copy-dir --delete /root/spark/conf/" print "Copying the new configuration to the cluster with command ", copy_config_command subprocess.check_call(copy_config_command, shell=True) # Need to stop and re-start Spark, so that the new number of cores per worker takes effect. subprocess.check_call("/root/spark/sbin/stop-all.sh") subprocess.check_call("/root/spark/sbin/start-all.sh") parameters = [ num_disk_tasks, items_per_partition, values_per_item, target_seconds, available_cores, num_compute_tasks ] stringified_parameters = ["{}".format(p) for p in parameters] command = ("/root/spark/bin/run-example DiskAndComputeJobs " + " ".join(stringified_parameters)) print command subprocess.check_call(command, shell=True) utils.copy_and_zip_all_logs(parameters, slaves) # Clear the buffer cache, to sidestep issue with machines dying. subprocess.check_call( "/root/ephemeral-hdfs/bin/slaves.sh /root/spark-ec2/clear-cache.sh", shell=True)