def reload_read_request_queues(job_description_file, job_ids, redis_host,
                               redis_port, redis_db, skip_phase_zero,
                               skip_phase_one, phase_zero_sample_size):
    """Regenerate read requests for some jobs and load them into the
    coordinator DB.

    The job description JSON file supplies the "input_directory" whose
    input file paths are gathered. Read requests are generated only for
    the phases (zero and/or one) whose skip flag is falsy.
    """
    # A truthy skip flag removes the corresponding phase number
    phases = [
        phase_number
        for phase_number, skipped in ((0, skip_phase_zero),
                                      (1, skip_phase_one))
        if not skipped]

    with open(job_description_file, 'r') as description_fp:
        description = json.load(description_fp)

    db = redis_utils.CoordinatorDB(redis_host, redis_port, redis_db)

    inputs = input_file_utils.gather_input_file_paths(
        db, description["input_directory"])

    requests = input_file_utils.generate_read_requests(
        inputs, phase_zero_sample_size, job_ids, phases)

    input_file_utils.load_read_requests(db, requests)
def reload_read_request_queues(
        job_description_file, job_ids, redis_host, redis_port, redis_db,
        skip_phase_zero, skip_phase_one, phase_zero_sample_size):
    """Rebuild the read request queues for the given jobs.

    Reads the job description (JSON) to find the input directory, gathers
    the input file paths through the coordinator DB, generates read
    requests for each phase that is not skipped, and loads those requests
    back into the coordinator DB.
    """
    with open(job_description_file, 'r') as job_file:
        job_spec = json.load(job_file)

    coordinator = redis_utils.CoordinatorDB(redis_host, redis_port, redis_db)
    file_paths = input_file_utils.gather_input_file_paths(
        coordinator, job_spec["input_directory"])

    # The position of each flag is the phase number it controls
    active_phases = []
    for phase_number, skip in enumerate((skip_phase_zero, skip_phase_one)):
        if not skip:
            active_phases.append(phase_number)

    input_file_utils.load_read_requests(
        coordinator,
        input_file_utils.generate_read_requests(
            file_paths, phase_zero_sample_size, job_ids, active_phases))
def test_multi_job_scan_share(self):
    """Check the request stream generated for two jobs with overlapping
    inputs.

    For every (host, worker) the test expects, in order: for each job, a
    phase zero prefix read per expected file followed by a halt; then one
    full-length read per expected file tagged with the jobs sharing it,
    followed by a halt tagged with all job IDs.
    """
    job_ids = [1, 2]
    phase_zero_prefix_size = 4242

    job_one_inputs = {
        "host_A": {
            0: [("file_A_1", 1000), ("file_A_2", 3000)],
            1: [("file_A_3", 1000)],
            2: [("file_A_4", 500), ("file_A_5", 6000)],
            3: [("file_A_6", 1000), ("file_A_7", 2000),
                ("file_A_8", 1000)],
        },
        "host_B": {
            0: [("file_A_2", 3000)],
            1: [("file_A_3", 1000)],
            2: [("file_A_4", 500)],
            3: [("file_A_6", 1000)],
        },
    }
    job_two_inputs = {
        "host_A": {
            0: [("file_A_1", 1000), ("file_A_2", 3000)],
            1: [("file_A_3", 1000)],
            2: [("file_A_4", 500), ("file_A_5", 6000)],
            3: [("file_A_6", 1000), ("file_A_7", 2000)],
        },
        "host_B": {
            0: [("file_A_2", 3000)],
            1: [("file_A_3", 1000)],
            2: [("file_A_4", 500)],
            3: [("file_A_7", 2000)],
        },
    }
    worker_inputs = [job_one_inputs, job_two_inputs]

    read_requests = generate_read_requests(
        worker_inputs, phase_zero_prefix_size, job_ids)

    # Merged view of both jobs' inputs: (path, length, job IDs reading it)
    expected_assignments = {
        "host_A": {
            0: [("file_A_1", 1000, [1, 2]), ("file_A_2", 3000, [1, 2])],
            1: [("file_A_3", 1000, [1, 2])],
            2: [("file_A_4", 500, [1, 2]), ("file_A_5", 6000, [1, 2])],
            3: [("file_A_6", 1000, [1, 2]), ("file_A_7", 2000, [1, 2]),
                ("file_A_8", 1000, [1])],
        },
        "host_B": {
            0: [("file_A_2", 3000, [1, 2])],
            1: [("file_A_3", 1000, [1, 2])],
            2: [("file_A_4", 500, [1, 2])],
            3: [("file_A_6", 1000, [1]), ("file_A_7", 2000, [2])],
        },
    }

    for host, worker in utils.flattened_keys(expected_assignments):
        expected = expected_assignments[host][worker]
        actual = read_requests[host][worker]

        # Three runs of (one request per file + one halt): phase zero for
        # each of the two jobs, then the shared phase one scan
        self.assertEqual((len(expected) + 1) * 3, len(actual))

        position = 0

        # Phase zero: every job gets its own prefix read of every file.
        # NOTE(review): this includes files that only one job lists
        # (e.g. file_A_8) -- confirm that is the intended behavior.
        for job_id in job_ids:
            for path, _length, _sharing_jobs in expected:
                request = actual[position]
                self.assertEqual(path, request["path"])
                self.assertEqual(phase_zero_prefix_size, request["length"])
                self.assertEqual([job_id], request["job_ids"])
                self.assertEqual(0, request["offset"])
                self.assertEqual(0, request["type"])
                position += 1

            # The job's prefix reads end with a halt request for that job
            halt = actual[position]
            self.assertEqual(1, halt["type"])
            self.assertEqual([job_id], halt["job_ids"])
            position += 1

        # Phase one: a single full-length read per file, shared by jobs
        for path, length, sharing_jobs in expected:
            request = actual[position]
            self.assertEqual(path, request["path"])
            self.assertEqual(length, request["length"])
            self.assertEqual(sharing_jobs, request["job_ids"])
            self.assertEqual(0, request["offset"])
            self.assertEqual(0, request["type"])
            position += 1

        # ... terminated by a halt request addressed to all jobs
        final_halt = actual[position]
        self.assertEqual(1, final_halt["type"])
        self.assertEqual(job_ids, final_halt["job_ids"])
def test_single_job(self):
    """Check the request stream generated for a single job.

    For every (host, worker) the test expects a phase zero prefix read
    per input file, a halt, a full-length phase one read per input file,
    and a final halt -- all tagged with the job's ID list.
    """
    worker_inputs = {
        "host_A": {
            0: [("file_A_1", 1000), ("file_A_2", 3000)],
            1: [("file_A_3", 1000)],
            2: [("file_A_4", 500), ("file_A_5", 6000)],
            3: [("file_A_6", 1000), ("file_A_7", 2000)],
        },
        "host_B": {
            0: [("file_A_2", 3000)],
            1: [("file_A_3", 1000)],
            2: [("file_A_4", 500)],
            3: [("file_A_6", 1000), ("file_A_7", 2000)],
        },
    }

    phase_zero_prefix_size = 4242
    job_ids = [1]

    read_requests = generate_read_requests(
        [worker_inputs], phase_zero_prefix_size, job_ids)

    for host, worker in utils.flattened_keys(worker_inputs):
        files = worker_inputs[host][worker]
        requests = read_requests[host][worker]

        # Two phases, each with one request per file plus a halt
        self.assertEqual(2 * (len(files) + 1), len(requests))

        # Expected read lengths per phase: the fixed prefix size in phase
        # zero, then each file's full length in phase one
        lengths_by_phase = (
            [phase_zero_prefix_size for _ in files],
            [file_length for _name, file_length in files],
        )

        position = 0

        for expected_lengths in lengths_by_phase:
            for (filename, _), expected_length in zip(
                    files, expected_lengths):
                request = requests[position]
                self.assertEqual(job_ids, request["job_ids"])
                self.assertEqual(filename, request["path"])
                self.assertEqual(0, request["offset"])
                self.assertEqual(0, request["type"])
                self.assertEqual(expected_length, request["length"])
                position += 1

            # Each phase's reads must be followed by a halt request
            halt = requests[position]
            self.assertEqual(1, halt["type"])
            self.assertEqual(job_ids, halt["job_ids"])
            position += 1
def test_multi_job_scan_share(self):
    """Verify read requests when two jobs share some of their input files.

    Expected order per (host, worker): phase zero prefix reads plus a halt
    for job 1, the same for job 2, then full-length phase one reads
    carrying the list of jobs that share each file, then a halt carrying
    every job ID.
    """
    job_ids = [1, 2]
    phase_zero_prefix_size = 4242

    worker_inputs = [
        # Inputs of the first job
        {
            "host_A": {
                0: [("file_A_1", 1000), ("file_A_2", 3000)],
                1: [("file_A_3", 1000)],
                2: [("file_A_4", 500), ("file_A_5", 6000)],
                3: [("file_A_6", 1000), ("file_A_7", 2000),
                    ("file_A_8", 1000)]
            },
            "host_B": {
                0: [("file_A_2", 3000)],
                1: [("file_A_3", 1000)],
                2: [("file_A_4", 500)],
                3: [("file_A_6", 1000)]
            }
        },
        # Inputs of the second job
        {
            "host_A": {
                0: [("file_A_1", 1000), ("file_A_2", 3000)],
                1: [("file_A_3", 1000)],
                2: [("file_A_4", 500), ("file_A_5", 6000)],
                3: [("file_A_6", 1000), ("file_A_7", 2000)]
            },
            "host_B": {
                0: [("file_A_2", 3000)],
                1: [("file_A_3", 1000)],
                2: [("file_A_4", 500)],
                3: [("file_A_7", 2000)]
            }
        },
    ]

    read_requests = generate_read_requests(
        worker_inputs, phase_zero_prefix_size, job_ids)

    # Expected per-worker files after merging: (path, length, job IDs)
    expected_assignments = {
        "host_A": {
            0: [("file_A_1", 1000, [1, 2]), ("file_A_2", 3000, [1, 2])],
            1: [("file_A_3", 1000, [1, 2])],
            2: [("file_A_4", 500, [1, 2]), ("file_A_5", 6000, [1, 2])],
            3: [("file_A_6", 1000, [1, 2]), ("file_A_7", 2000, [1, 2]),
                ("file_A_8", 1000, [1])]
        },
        "host_B": {
            0: [("file_A_2", 3000, [1, 2])],
            1: [("file_A_3", 1000, [1, 2])],
            2: [("file_A_4", 500, [1, 2])],
            3: [("file_A_6", 1000, [1]), ("file_A_7", 2000, [2])]
        }
    }

    def check_read(request, path, length, jobs):
        # A read request (type 0) always starts at offset 0 in this test
        self.assertEqual(path, request["path"])
        self.assertEqual(length, request["length"])
        self.assertEqual(jobs, request["job_ids"])
        self.assertEqual(0, request["offset"])
        self.assertEqual(0, request["type"])

    def check_halt(request, jobs):
        # A halt request (type 1) closes each run of read requests
        self.assertEqual(1, request["type"])
        self.assertEqual(jobs, request["job_ids"])

    for host, worker in utils.flattened_keys(expected_assignments):
        files = expected_assignments[host][worker]
        queue = read_requests[host][worker]

        # One run of reads + halt per job in phase zero, one in phase one
        self.assertEqual((len(files) + 1) * 3, len(queue))

        index = 0

        # Phase zero prefix reads, issued separately for each job
        for job_id in job_ids:
            for entry in files:
                check_read(
                    queue[index], entry[0], phase_zero_prefix_size,
                    [job_id])
                index += 1

            check_halt(queue[index], [job_id])
            index += 1

        # Phase one full reads, tagged with the jobs sharing each file
        for entry in files:
            check_read(queue[index], entry[0], entry[1], entry[2])
            index += 1

        check_halt(queue[index], job_ids)
def test_single_job(self):
    """Verify the read requests generated when only one job is running.

    Per (host, worker), the queue must hold prefix-sized phase zero reads
    for every file, a halt, full-length phase one reads for every file,
    and a closing halt; every request carries the job ID list.
    """
    phase_zero_prefix_size = 4242
    job_ids = [1]

    worker_inputs = {
        "host_A": {
            0: [("file_A_1", 1000), ("file_A_2", 3000)],
            1: [("file_A_3", 1000)],
            2: [("file_A_4", 500), ("file_A_5", 6000)],
            3: [("file_A_6", 1000), ("file_A_7", 2000)]
        },
        "host_B": {
            0: [("file_A_2", 3000)],
            1: [("file_A_3", 1000)],
            2: [("file_A_4", 500)],
            3: [("file_A_6", 1000), ("file_A_7", 2000)]
        }
    }

    read_requests = generate_read_requests(
        [worker_inputs], phase_zero_prefix_size, job_ids)

    def check_read(request, path, length):
        # Read requests are type 0 and start at the beginning of the file
        self.assertEqual(job_ids, request["job_ids"])
        self.assertEqual(path, request["path"])
        self.assertEqual(0, request["offset"])
        self.assertEqual(0, request["type"])
        self.assertEqual(length, request["length"])

    def check_halt(request):
        # Halt requests are type 1
        self.assertEqual(1, request["type"])
        self.assertEqual(job_ids, request["job_ids"])

    for host, worker in utils.flattened_keys(worker_inputs):
        input_files = worker_inputs[host][worker]
        queue = read_requests[host][worker]

        # (reads + halt) for phase zero, then again for phase one
        self.assertEqual(2 * (len(input_files) + 1), len(queue))

        index = 0

        # Phase zero reads only a fixed-size prefix of each file
        for path, _full_length in input_files:
            check_read(queue[index], path, phase_zero_prefix_size)
            index += 1

        check_halt(queue[index])
        index += 1

        # Phase one reads each file in full
        for path, full_length in input_files:
            check_read(queue[index], path, full_length)
            index += 1

        check_halt(queue[index])
def run_batch(self, batch_jobs, batch_inputs):
    """Set up a new batch for the given jobs and hand it to the node
    coordinators.

    Creates the batch's log directory (copying the stage/structure
    descriptions and the config file into it), decides which phases will
    run, creates barriers and read requests for those phases, records the
    start of the batch in the coordinator DB, and finally enqueues the
    batch on the node coordinator batch queues.

    Arguments:
    batch_jobs -- IDs of the jobs in the batch; must be non-empty, since
        the first job's parameters are consulted for overrides
    batch_inputs -- passed to generate_read_requests as job_inputs
    """
    batch_id = self.coordinator_db.next_batch_id

    log.info("Running batch %d with the following job(s): %s" %
             (batch_id, ', '.join(map(str, batch_jobs))))

    # Create log directory for the current batch
    batch_logs = create_batch_directory(self.log_directory, batch_id)

    # Copy description files to the log directory
    description_dir = os.path.join(
        os.path.dirname(__file__), os.pardir, os.pardir, os.pardir,
        "tritonsort", "mapreduce", "description")

    for description_file in ("stages.json", "structure.json"):
        shutil.copy(
            os.path.join(description_dir, description_file), batch_logs)

    # Copy config file to log directory
    shutil.copy(self.config_file, batch_logs)

    self.ready_for_next_batch = False

    # Pull out relevant phase zero parameters
    phase_zero_sample_rate = 1  # Sample 100% by default
    if "SAMPLE_RATE" in self.config:
        phase_zero_sample_rate = float(self.config["SAMPLE_RATE"])

    phase_zero_sample_points_per_file = 1  # Sample prefixes by default
    if "SAMPLES_PER_FILE" in self.config:
        phase_zero_sample_points_per_file = int(
            self.config["SAMPLES_PER_FILE"])

    fixed_key_length = None
    if "MAP_INPUT_FIXED_KEY_LENGTH" in self.config:
        fixed_key_length = int(self.config["MAP_INPUT_FIXED_KEY_LENGTH"])

    fixed_value_length = None
    if "MAP_INPUT_FIXED_VALUE_LENGTH" in self.config:
        fixed_value_length = int(
            self.config["MAP_INPUT_FIXED_VALUE_LENGTH"])

    # If the application config file (yaml) or the job spec file (json)
    # skips a phase, we should not load read requests for that phase. The
    # job spec file should override the application config file.
    skip_keys = ("SKIP_PHASE_ZERO", "SKIP_PHASE_ONE", "SKIP_PHASE_TWO",
                 "SKIP_PHASE_THREE")
    skip = dict.fromkeys(skip_keys, 0)

    for skip_key in skip_keys:
        if skip_key in self.config and self.config[skip_key]:
            skip[skip_key] = 1

    # The run_job.py script verifies that all jobs in the batch have the
    # same value of these skip parameters in the job specs, so we can just
    # check the first one.
    # NOTE(review): skip values from the job params are stored as-is and
    # later tested for truthiness -- confirm job_params() does not return
    # them as strings such as "0".
    job_params = self.coordinator_db.job_params(batch_jobs[0])
    for key, value in job_params.items():
        if key in skip:
            skip[key] = value
        elif key == "MAP_INPUT_FIXED_KEY_LENGTH":
            fixed_key_length = int(value)
        elif key == "MAP_INPUT_FIXED_VALUE_LENGTH":
            fixed_value_length = int(value)

    # A fixed tuple length is only known when both halves are fixed
    fixed_tuple_length = None
    if fixed_key_length is not None and fixed_value_length is not None:
        fixed_tuple_length = fixed_key_length + fixed_value_length

    use_replication = False
    if "OUTPUT_REPLICATION_LEVEL" in self.config and \
            int(self.config["OUTPUT_REPLICATION_LEVEL"]) > 1:
        use_replication = True

    phases = []
    if not skip["SKIP_PHASE_ZERO"]:
        phases.append(0)
    if not skip["SKIP_PHASE_ONE"]:
        phases.append(1)
    if not skip["SKIP_PHASE_TWO"] and use_replication:
        # If we're using replication, phase two will have network transfer,
        # use barriers to guarantee sockets are connected.
        phases.append(2)
    if not skip["SKIP_PHASE_THREE"] and use_replication:
        # If we're using replication, phase three will have network
        # transfer, use barriers to guarantee sockets are connected.
        phases.append(3)

    # Setup barriers
    self.coordinator_db.create_barriers(phases, batch_id, batch_jobs)

    # Generate read requests for the jobs in the batch
    read_requests = generate_read_requests(
        job_inputs=batch_inputs,
        phase_zero_sample_rate=phase_zero_sample_rate,
        phase_zero_sample_points_per_file=(
            phase_zero_sample_points_per_file),
        tuple_start_offset=fixed_tuple_length,
        job_ids=batch_jobs,
        phases=phases)

    # Load read requests into read request queue for each worker
    load_read_requests(self.coordinator_db, read_requests)

    start_time = time.time()

    # Mark phase zero as starting now.
    self.coordinator_db.begin_phase(batch_id, "phase_zero")
    self.batch_phase_info[batch_id] = ("phase_zero", 0, start_time)

    log.info("Running phase_zero...")
    print_keyboard_commands()

    for job_id in batch_jobs:
        self.coordinator_db.update_job_status(
            job_id,
            {
                "start_time": str(start_time),
                "batch_id": batch_id,
                "date": time.asctime(),
            })

    self.coordinator_db.add_jobs_to_batch(batch_id, batch_jobs)
    self.coordinator_db.mark_batch_incomplete(batch_id)

    # Setting current_batch will cause all node coordinators to start work
    # on that batch
    self.coordinator_db.add_batch_to_node_coordinator_batch_queues(batch_id)
def run_batch(self, batch_jobs, batch_inputs):
    """Prepare and start a batch made up of the given jobs.

    Steps, in order: create the batch log directory and copy the
    description and config files into it; read sampling, fixed-length and
    skip settings from self.config, letting the first job's parameters
    override them; create barriers and read requests for the phases that
    will run; record the batch start in the coordinator DB; and enqueue
    the batch on the node coordinator batch queues.

    Arguments:
    batch_jobs -- IDs of the jobs in this batch (the first one's job
        parameters are read, so the sequence must not be empty)
    batch_inputs -- handed to generate_read_requests as job_inputs
    """
    batch_id = self.coordinator_db.next_batch_id

    log.info("Running batch %d with the following job(s): %s" %
             (batch_id, ', '.join(map(str, batch_jobs))))

    # Create log directory for the current batch
    batch_logs = create_batch_directory(self.log_directory, batch_id)

    # Copy description files to the log directory
    description_dir = os.path.join(
        os.path.dirname(__file__), os.pardir, os.pardir, os.pardir,
        "tritonsort", "mapreduce", "description")

    shutil.copy(os.path.join(description_dir, "stages.json"), batch_logs)
    shutil.copy(os.path.join(description_dir, "structure.json"), batch_logs)

    # Copy config file to log directory
    shutil.copy(self.config_file, batch_logs)

    self.ready_for_next_batch = False

    # Pull out relevant phase zero parameters
    phase_zero_sample_rate = 1  # Sample 100% by default
    if "SAMPLE_RATE" in self.config:
        phase_zero_sample_rate = float(self.config["SAMPLE_RATE"])

    phase_zero_sample_points_per_file = 1  # Sample prefixes by default
    if "SAMPLES_PER_FILE" in self.config:
        phase_zero_sample_points_per_file = int(
            self.config["SAMPLES_PER_FILE"])

    fixed_key_length = None
    if "MAP_INPUT_FIXED_KEY_LENGTH" in self.config:
        fixed_key_length = int(self.config["MAP_INPUT_FIXED_KEY_LENGTH"])

    fixed_value_length = None
    if "MAP_INPUT_FIXED_VALUE_LENGTH" in self.config:
        fixed_value_length = int(
            self.config["MAP_INPUT_FIXED_VALUE_LENGTH"])

    # If the application config file (yaml) or the job spec file (json)
    # skips a phase, we should not load read requests for that phase. The
    # job spec file should override the application config file.
    skip_phase_zero = 0
    skip_phase_one = 0
    skip_phase_two = 0
    skip_phase_three = 0

    if "SKIP_PHASE_ZERO" in self.config and self.config["SKIP_PHASE_ZERO"]:
        skip_phase_zero = 1

    if "SKIP_PHASE_ONE" in self.config and self.config["SKIP_PHASE_ONE"]:
        skip_phase_one = 1

    if "SKIP_PHASE_TWO" in self.config and self.config["SKIP_PHASE_TWO"]:
        skip_phase_two = 1

    if "SKIP_PHASE_THREE" in self.config and \
            self.config["SKIP_PHASE_THREE"]:
        skip_phase_three = 1

    # The run_job.py script verifies that all jobs in the batch have the
    # same value of these skip parameters in the job specs, so we can just
    # check the first one.
    # NOTE(review): the skip overrides are taken verbatim and only tested
    # for truthiness below -- confirm job_params() yields real
    # booleans/ints rather than strings.
    first_job_params = self.coordinator_db.job_params(batch_jobs[0])
    for key, value in first_job_params.items():
        # A key matches at most one branch, so an elif chain suffices
        if key == "SKIP_PHASE_ZERO":
            skip_phase_zero = value
        elif key == "SKIP_PHASE_ONE":
            skip_phase_one = value
        elif key == "SKIP_PHASE_TWO":
            skip_phase_two = value
        elif key == "SKIP_PHASE_THREE":
            skip_phase_three = value
        elif key == "MAP_INPUT_FIXED_KEY_LENGTH":
            fixed_key_length = int(value)
        elif key == "MAP_INPUT_FIXED_VALUE_LENGTH":
            fixed_value_length = int(value)

    # The tuple length is fixed only if both key and value lengths are
    fixed_tuple_length = None
    if fixed_key_length is not None and fixed_value_length is not None:
        fixed_tuple_length = fixed_key_length + fixed_value_length

    use_replication = (
        "OUTPUT_REPLICATION_LEVEL" in self.config and
        int(self.config["OUTPUT_REPLICATION_LEVEL"]) > 1)

    phases = []
    if not skip_phase_zero:
        phases.append(0)
    if not skip_phase_one:
        phases.append(1)
    if not skip_phase_two and use_replication:
        # If we're using replication, phase two will have network transfer,
        # use barriers to guarantee sockets are connected.
        phases.append(2)
    if not skip_phase_three and use_replication:
        # If we're using replication, phase three will have network
        # transfer, use barriers to guarantee sockets are connected.
        phases.append(3)

    # Setup barriers
    self.coordinator_db.create_barriers(phases, batch_id, batch_jobs)

    # Generate read requests for the jobs in the batch
    read_requests = generate_read_requests(
        job_inputs=batch_inputs,
        phase_zero_sample_rate=phase_zero_sample_rate,
        phase_zero_sample_points_per_file=(
            phase_zero_sample_points_per_file),
        tuple_start_offset=fixed_tuple_length,
        job_ids=batch_jobs,
        phases=phases)

    # Load read requests into read request queue for each worker
    load_read_requests(self.coordinator_db, read_requests)

    start_time = time.time()

    # Mark phase zero as starting now.
    self.coordinator_db.begin_phase(batch_id, "phase_zero")
    self.batch_phase_info[batch_id] = ("phase_zero", 0, start_time)

    log.info("Running phase_zero...")
    print_keyboard_commands()

    for job_id in batch_jobs:
        job_status = {
            "start_time": str(start_time),
            "batch_id": batch_id,
            "date": time.asctime()
        }
        self.coordinator_db.update_job_status(job_id, job_status)

    self.coordinator_db.add_jobs_to_batch(batch_id, batch_jobs)
    self.coordinator_db.mark_batch_incomplete(batch_id)

    # Setting current_batch will cause all node coordinators to start work
    # on that batch
    self.coordinator_db.add_batch_to_node_coordinator_batch_queues(
        batch_id)