import os
import random
import shlex
import socket
import subprocess
import sys

import redis
# mkdir/chown/sudo are command objects using plumbum's bracket-invocation
# syntax (e.g. sudo[mkdir["-p", path]]).
from plumbum.cmd import chown, mkdir, sudo

# Project-local modules: job-config helpers and human-readable size parsing.
import unitconversion
import utils


def disk_benchmark(input_directory, output_directory, benchmark_size_per_disk,
                   **kwargs):
    if output_directory is None:
        output_directory = utils.sibling_directory(
            input_directory, "disk_speeds")

    (input_url, output_url) = utils.generate_urls(
        input_directory, output_directory, None)

    config = utils.mapreduce_job(
        input_dir=input_url,
        output_dir=output_url,
        map_function="DiskBenchmarkMapFunction",
        reduce_function="DiskBenchmarkReduceFunction")
    utils.force_single_partition(config)

    # Convert the human-readable per-disk size into bytes.
    data_size_bytes = int(
        unitconversion.parse_and_convert(benchmark_size_per_disk, "B"))

    config_params = {"DISK_BENCHMARK_DATA_SIZE": data_size_bytes}

    if "params" not in config:
        config["params"] = {}

    for key, value in config_params.items():
        config["params"][key] = value

    return config
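
# A minimal usage sketch, not part of the original module. The input path and
# size string below are hypothetical; "500MB" assumes the format accepted by
# unitconversion.parse_and_convert. When output_directory is None, the helper
# derives a sibling "disk_speeds" directory itself.
def _example_disk_benchmark():
    import json
    config = disk_benchmark(
        input_directory="/mnt/disks/benchmark_inputs",  # hypothetical path
        output_directory=None,
        benchmark_size_per_disk="500MB")
    # Dump the resulting job config; default=str guards any non-JSON values.
    print json.dumps(config, indent=2, default=str)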
def generate_graysort_inputs(redis_host, redis_port, redis_db,
                             total_data_size, method, debug, pareto_a,
                             pareto_b, max_key_len, max_val_len, min_key_len,
                             min_val_len, large_tuples, hdfs_namenode,
                             hdfs_replication, gensort_command, username,
                             no_sudo, transfer_size, skew,
                             graysort_compatibility_mode, num_files_per_disk,
                             multiple, replica_number, parallelism,
                             intermediate_disks):
    disk_list_key = "node_io_disks"
    if intermediate_disks:
        # Write to intermediate disks instead.
        disk_list_key = "node_local_disks"

    # Connect to Redis.
    redis_client = redis.StrictRedis(
        host=redis_host, port=redis_port, db=redis_db)

    # Get list of hosts from redis.
    hosts = list(redis_client.smembers("nodes"))
    hosts.sort()

    # Generate a mapping of hosts -> io disks. Each disk is listed once per
    # file so num_files_per_disk partitions land on it.
    io_disk_map = {}
    first_partition_map = {}
    num_partitions = 0
    for host in hosts:
        disks = list(redis_client.smembers("%s:%s" % (disk_list_key, host)))
        disks.sort()

        disk_list = []
        for disk in disks:
            for f in xrange(num_files_per_disk):
                disk_list.append(disk)

        io_disk_map[host] = disk_list
        first_partition_map[host] = num_partitions
        num_partitions += len(disk_list)

    local_fqdn = socket.getfqdn()

    if replica_number > 1:
        # Sort so every node computes the same host ordering; a bare keys()
        # listing is arbitrary and could differ across hosts.
        hosts = sorted(io_disk_map.keys())
        host_index = hosts.index(local_fqdn)
        # Act as if we are a different host for the purposes of generating
        # replica files.
        host_index = (host_index + (replica_number - 1)) % len(io_disk_map)
        local_fqdn = hosts[host_index]

    # Get list of input disks for this local machine.
    input_disks = io_disk_map[local_fqdn]
    first_partition = first_partition_map[local_fqdn]

    job_name = "Graysort"
    if not graysort_compatibility_mode:
        job_name = "Graysort-MapReduceHeaders"
    if replica_number > 1:
        job_name += "_replica"

    input_directory = os.path.join(username, "inputs", job_name)

    # Compute input file names.
    input_file_relative_paths = map(
        lambda x: os.path.join(
            input_directory, "%08d.partition" % (first_partition + x)),
        xrange(len(input_disks)))

    input_files = [
        os.path.abspath(os.path.join(disk, relative_path))
        for disk, relative_path in zip(
            input_disks, input_file_relative_paths)]

    # Find out if data already exists.
    existing_files = filter(os.path.exists, input_files)
    if len(existing_files) == len(input_files):
        print >>sys.stderr, "Data already exists"
        sys.exit(0)

    # Data needs to be generated. Delete any existing files.
    for input_file in existing_files:
        os.remove(input_file)

    # Convert human-readable data size to records or bytes.
    unit = "R"
    if method == "pareto":
        # Pareto uses bytes since records are variably sized.
        unit = "B"
    data_size = unitconversion.parse_and_convert(total_data_size, unit)

    # Compute record assignments to disks in the cluster.
    assignments = generate_data_assignment(io_disk_map, data_size, multiple)
    local_assignments = assignments[local_fqdn]

    # If large tuples were specified, assign them round-robin across disks.
    if large_tuples is not None:
        large_tuple_assignments = {}
        # Convert the comma-delimited list into a list of triples.
        large_tuples = large_tuples.split(",")
        large_tuples = zip(
            large_tuples[0::3], large_tuples[1::3], large_tuples[2::3])

        # Round-robin the triples across disks.
        for index, large_tuple in enumerate(large_tuples):
            disk_index = index % len(input_disks)
            if disk_index not in large_tuple_assignments:
                large_tuple_assignments[disk_index] = list(large_tuple)
            else:
                large_tuple_assignments[disk_index].extend(list(large_tuple))

    # Finally we're ready to create input files on each disk.
    gensort_commands = []
    for disk_index, input_file in enumerate(input_files):
        # Unless we're using HDFS, we need to manually create directories.
        if method != "gensort_hdfs":
            # Create input directory.
            directory = os.path.abspath(os.path.join(input_file, os.pardir))
            if no_sudo:
                cmd = mkdir["-p", directory]
            else:
                cmd = sudo[mkdir["-p", directory]]

            if debug:
                print cmd
            else:
                cmd()

            # Change ownership of input directory from root to USER.
            if not no_sudo:
                cmd = sudo[chown["-R", username, os.path.dirname(directory)]]
                if debug:
                    print cmd
                else:
                    cmd()

        # Prepare gensort command.
        (disk_data_offset, disk_data_size) = local_assignments[disk_index]
        command_options = {}
        command_args = []

        if method == "pareto":
            # Set all pareto distribution options in an options string.
            command_options["-pareto_a"] = "%f" % (pareto_a)
            command_options["-pareto_b"] = "%f" % (pareto_b)
            command_options["-maxKeyLen"] = "%d" % (max_key_len)
            command_options["-maxValLen"] = "%d" % (max_val_len)
            command_options["-minKeyLen"] = "%d" % (min_key_len)
            command_options["-minValLen"] = "%d" % (min_val_len)

            # If large tuples were specified, add them to the options string.
            if (large_tuples is not None and
                    disk_index in large_tuple_assignments and
                    len(large_tuple_assignments[disk_index]) > 0):
                command_options["-largeTuples"] = ','.join(
                    map(str, large_tuple_assignments[disk_index]))

            command_args = [input_file, "pareto", str(int(disk_data_size))]
        else:
            # The other methods use some form of gensort, but the filename
            # depends on whether we're writing to HDFS or not.
            if method == "gensort_hdfs":
                # Get local IP address from redis so we can pass it to HDFS.
                ip_address = redis_client.hget("ipv4_address", local_fqdn)
                destination_filename = "http://%s/webhdfs/v1/%s/%d/%s" % (
                    hdfs_namenode, ip_address, disk_index,
                    input_file_relative_paths[disk_index])
                command_args.append("-r%d" % (hdfs_replication))
            else:
                destination_filename = input_file

            if method == "gensort_2013":
                if transfer_size is not None:
                    destination_filename += ",trans=%s" % transfer_size

                if skew:
                    command_args.append("-s")

            if not graysort_compatibility_mode:
                command_args.append("-m")

            # Not including offset or replication as options because of
            # gensort's ridiculously bad command parsing.
            command_args.extend([
                "-b%d" % (disk_data_offset),
                str(disk_data_size),
                destination_filename])

        options_str = ' '.join(
            '%s %s' % (k, v) for k, v in command_options.items())
        args_str = ' '.join(command_args)
        command = "%s %s %s" % (gensort_command, options_str, args_str)

        # Finally start the gensort process for this disk. In debug mode we
        # only print the command line instead of queueing it.
        if debug:
            print command
        else:
            command = shlex.split(command)
            gensort_commands.append(command)

    # Generate files in random order to attempt to more evenly utilize disks.
    random.shuffle(gensort_commands)

    status = 0
    while len(gensort_commands) > 0:
        # Launch a batch of at most `parallelism` processes, then wait for
        # the whole batch before launching the next one.
        running_commands = []
        gensort_processes = []
        for command in gensort_commands:
            print "Running '%s'" % (command)
            gensort_processes.append(subprocess.Popen(
                command, stderr=subprocess.PIPE, stdout=subprocess.PIPE))
            running_commands.append(command)
            if parallelism > 0 and len(running_commands) >= parallelism:
                # Don't launch more parallel gensort processes.
                break

        for command in running_commands:
            gensort_commands.remove(command)

        # Wait for all gensort processes to finish.
        for process in gensort_processes:
            (stdout_data, stderr_data) = process.communicate()
            print "gensort instance completed with return code %d" % (
                process.returncode)
            if process.returncode != 0:
                print stderr_data
                status = 1

    return status