def test_iterate(self):
    """flattened_keys walks a nested dict and yields leaf key paths in order.

    Top-level leaves yield 1-tuples; leaves one level down yield 2-tuples.
    """
    nested = {1: {2: 3, 4: 5}, 6: {7: 8}, 9: 10}
    self.assertEqual(
        [(1, 2), (1, 4), (6, 7), (9,)],
        list(flattened_keys(nested)))
def test_sort_order(self):
    """A custom sort_function controls the order of yielded key tuples."""
    nested = {1: {2: 3, 4: 5}, 6: {7: 8}, 9: 10}
    # Rank key tuples by their leading key's position in this list.
    priority = [6, 9, 7, 2, 4, 1]

    def rank(key_tuple):
        return priority.index(key_tuple[0])

    self.assertEqual(
        [(6, 7), (9,), (1, 2), (1, 4)],
        list(flattened_keys(nested, sort_function=rank)))
def test_sort_order(self):
    """Keys come back in the order imposed by the provided sort_function."""
    nested = {1: {2: 3, 4: 5}, 6: {7: 8}, 9: 10}
    # Desired ordering of leading keys, most significant first.
    desired_order = [6, 9, 7, 2, 4, 1]

    def order_key(key_tuple):
        return desired_order.index(key_tuple[0])

    expected = [(6, 7), (9,), (1, 2), (1, 4)]
    actual = [keys for keys in
              flattened_keys(nested, sort_function=order_key)]
    self.assertEqual(expected, actual)
def test_multi_job_scan_share(self):
    """Two jobs with overlapping inputs get merged (scan-shared) reads.

    Each worker should see: per job, one prefix read per file followed by a
    halt; then one scan-shared full read per file followed by a final halt.
    """
    job_ids = [1, 2]
    prefix_size = 4242
    per_job_inputs = [
        {
            "host_A": {
                0: [("file_A_1", 1000), ("file_A_2", 3000)],
                1: [("file_A_3", 1000)],
                2: [("file_A_4", 500), ("file_A_5", 6000)],
                3: [("file_A_6", 1000), ("file_A_7", 2000),
                    ("file_A_8", 1000)],
            },
            "host_B": {
                0: [("file_A_2", 3000)],
                1: [("file_A_3", 1000)],
                2: [("file_A_4", 500)],
                3: [("file_A_6", 1000)],
            },
        },
        {
            "host_A": {
                0: [("file_A_1", 1000), ("file_A_2", 3000)],
                1: [("file_A_3", 1000)],
                2: [("file_A_4", 500), ("file_A_5", 6000)],
                3: [("file_A_6", 1000), ("file_A_7", 2000)],
            },
            "host_B": {
                0: [("file_A_2", 3000)],
                1: [("file_A_3", 1000)],
                2: [("file_A_4", 500)],
                3: [("file_A_7", 2000)],
            },
        },
    ]

    read_requests = generate_read_requests(
        per_job_inputs, prefix_size, job_ids)

    # Expected file assignments after the scan-sharing merge; each entry is
    # (path, length, ids of jobs that read the file).
    expected = {
        "host_A": {
            0: [("file_A_1", 1000, [1, 2]), ("file_A_2", 3000, [1, 2])],
            1: [("file_A_3", 1000, [1, 2])],
            2: [("file_A_4", 500, [1, 2]), ("file_A_5", 6000, [1, 2])],
            3: [("file_A_6", 1000, [1, 2]), ("file_A_7", 2000, [1, 2]),
                ("file_A_8", 1000, [1])],
        },
        "host_B": {
            0: [("file_A_2", 3000, [1, 2])],
            1: [("file_A_3", 1000, [1, 2])],
            2: [("file_A_4", 500, [1, 2])],
            3: [("file_A_6", 1000, [1]), ("file_A_7", 2000, [2])],
        },
    }

    for host, worker in utils.flattened_keys(expected):
        assignments = expected[host][worker]
        requests = read_requests[host][worker]
        # Per job: one prefix read per file plus a halt; then one merged
        # full read per file plus a final halt -> 3 * (files + 1).
        self.assertEqual((len(assignments) + 1) * 3, len(requests))

        cursor = 0
        # Phase zero: prefix reads, repeated once per job.
        for job_id in job_ids:
            for path, _length, _jobs in assignments:
                request = requests[cursor]
                self.assertEqual(path, request["path"])
                self.assertEqual(prefix_size, request["length"])
                self.assertEqual([job_id], request["job_ids"])
                self.assertEqual(0, request["offset"])
                self.assertEqual(0, request["type"])
                cursor += 1
            # Each job's prefix reads are followed by a halt request.
            self.assertEqual(1, requests[cursor]["type"])
            self.assertEqual([job_id], requests[cursor]["job_ids"])
            cursor += 1
        # Phase one: a single scan-shared full read per file.
        for path, length, jobs in assignments:
            request = requests[cursor]
            self.assertEqual(path, request["path"])
            self.assertEqual(length, request["length"])
            self.assertEqual(jobs, request["job_ids"])
            self.assertEqual(0, request["offset"])
            self.assertEqual(0, request["type"])
            cursor += 1
        # The full reads are followed by a halt covering all jobs.
        self.assertEqual(1, requests[cursor]["type"])
        self.assertEqual(job_ids, requests[cursor]["job_ids"])
def test_single_job(self):
    """A lone job yields prefix reads + halt, then full reads + halt."""
    inputs = {
        "host_A": {
            0: [("file_A_1", 1000), ("file_A_2", 3000)],
            1: [("file_A_3", 1000)],
            2: [("file_A_4", 500), ("file_A_5", 6000)],
            3: [("file_A_6", 1000), ("file_A_7", 2000)],
        },
        "host_B": {
            0: [("file_A_2", 3000)],
            1: [("file_A_3", 1000)],
            2: [("file_A_4", 500)],
            3: [("file_A_6", 1000), ("file_A_7", 2000)],
        },
    }
    prefix_size = 4242
    job_ids = [1]

    read_requests = generate_read_requests([inputs], prefix_size, job_ids)

    for host, worker in utils.flattened_keys(inputs):
        files = inputs[host][worker]
        requests = read_requests[host][worker]
        # (prefix read per file + halt) + (full read per file + halt).
        self.assertEqual(2 * (len(files) + 1), len(requests))

        cursor = 0
        # Phase zero: a fixed-size prefix read for each file.
        for path, _length in files:
            request = requests[cursor]
            self.assertEqual(job_ids, request["job_ids"])
            self.assertEqual(path, request["path"])
            self.assertEqual(0, request["offset"])
            self.assertEqual(0, request["type"])
            self.assertEqual(prefix_size, request["length"])
            cursor += 1
        # Phase zero ends with a halt request.
        self.assertEqual(1, requests[cursor]["type"])
        self.assertEqual(job_ids, requests[cursor]["job_ids"])
        cursor += 1
        # Phase one: a whole-file read for each file.
        for path, length in files:
            request = requests[cursor]
            self.assertEqual(job_ids, request["job_ids"])
            self.assertEqual(path, request["path"])
            self.assertEqual(0, request["offset"])
            self.assertEqual(0, request["type"])
            self.assertEqual(length, request["length"])
            cursor += 1
        # Phase one also ends with a halt request.
        self.assertEqual(1, requests[cursor]["type"])
        self.assertEqual(job_ids, requests[cursor]["job_ids"])
        cursor += 1
def generate_read_requests(
        job_inputs, phase_zero_sample_rate,
        phase_zero_sample_points_per_file, tuple_start_offset, job_ids,
        phases=(0, 1)):
    """Build per-worker read and halt requests for a set of jobs.

    Inputs that appear in more than one job are merged ("scan shared") so
    phase one reads each file once on behalf of all jobs that need it.

    Args:
        job_inputs: list, parallel to job_ids, of nested dicts mapping
            host -> worker -> list of (file_url, file_length) tuples.
        phase_zero_sample_rate: fraction of each file (<= 1.0) to read as
            phase zero sample data.
        phase_zero_sample_points_per_file: number of evenly spaced sample
            points to read from each file in phase zero. Values > 1 require
            a nonzero tuple_start_offset.
        tuple_start_offset: fixed tuple size in bytes; when > 0, sample
            lengths/offsets are rounded down to whole-tuple multiples.
        job_ids: list of job IDs, parallel to job_inputs.
        phases: iterable of phases (subset of {0, 1}) to emit requests for.
            Fixed: previously `list([0, 1])`, a mutable default argument;
            an immutable tuple default is equivalent because `phases` is
            only ever passed to sorted().

    Returns:
        utils.NestedDict mapping host -> worker -> ordered list of request
        dicts (phase zero sample reads + per-job halts, then shuffled phase
        one full reads + a final halt).
    """
    assert phase_zero_sample_rate <= 1.0,\
        "Cannot have a sample rate greater than 1. Got %f" % (
            phase_zero_sample_rate)

    # Merge inputs across jobs: file_info -> list of job IDs that read it.
    scan_shared_inputs = utils.NestedDict(3, list)

    for (job_id, worker_inputs) in itertools.izip(job_ids, job_inputs):
        for host, worker in utils.flattened_keys(worker_inputs):
            for file_info in worker_inputs[host][worker]:
                scan_shared_inputs[host][worker][file_info].append(job_id)

    read_requests = utils.NestedDict(2, list)
    phase_one_read_requests = utils.NestedDict(2, list)

    for phase in sorted(phases):
        if phase == 0:
            # Read file prefixes for phase zero sampling for each job.
            for job_id in job_ids:
                for host, worker, file_info in utils.flattened_keys(
                        scan_shared_inputs):
                    file_url, file_length = file_info

                    # Compute the number of sampled bytes from the sample
                    # rate.
                    sample_length = file_length * phase_zero_sample_rate

                    if phase_zero_sample_points_per_file > 1:
                        assert tuple_start_offset != 0, "Cannot sample " \
                            "multiple points per file without specifying a " \
                            "tuple start offset."

                    # Bytes per sample point, and the spacing between
                    # consecutive sample points within the file.
                    sample_length_per_sample_point = \
                        sample_length / phase_zero_sample_points_per_file
                    sample_point_offset = \
                        file_length / phase_zero_sample_points_per_file

                    # If we know tuple boundary offsets, force whole tuples.
                    if tuple_start_offset > 0:
                        sample_length_per_sample_point -= \
                            sample_length_per_sample_point % \
                            tuple_start_offset
                        sample_point_offset -= \
                            sample_point_offset % tuple_start_offset

                    # At this point sample length <= sample offset, and both
                    # are multiples of the tuple length for fixed size
                    # tuples.
                    for i in xrange(phase_zero_sample_points_per_file):
                        # Chunk the sample data into fixed size samples
                        # spread evenly across the file.
                        read_requests[host][worker].append(
                            generate_read_request(
                                [job_id], file_url, i * sample_point_offset,
                                sample_length_per_sample_point))

                # Add a halt request after all of this job's samples for
                # each worker.
                for host, worker in utils.flattened_keys(read_requests):
                    read_requests[host][worker].append(
                        generate_halt_request([job_id]))
        elif phase == 1:
            # One scan-shared full-file read per file, tagged with every
            # job that consumes it.
            for host, worker, file_info in utils.flattened_keys(
                    scan_shared_inputs):
                file_url, file_length = file_info
                file_jobs = scan_shared_inputs[host][worker][file_info]
                phase_one_read_requests[host][worker].append(
                    generate_read_request(
                        file_jobs, file_url, 0, file_length))

            for host, worker in utils.flattened_keys(
                    phase_one_read_requests):
                # Randomly permute input files in phase one.
                requests = list(phase_one_read_requests[host][worker])
                random.shuffle(requests)

                for request in requests:
                    read_requests[host][worker].append(request)

                read_requests[host][worker].append(
                    generate_halt_request(job_ids))

    return read_requests
def test_iterate(self):
    """Iterating flattened_keys yields each leaf's key path as a tuple."""
    data = {1: {2: 3, 4: 5}, 6: {7: 8}, 9: 10}
    expected = [(1, 2), (1, 4), (6, 7), (9,)]
    actual = [key_path for key_path in flattened_keys(data)]
    self.assertEqual(expected, actual)
def test_multi_job_scan_share(self):
    """Overlapping inputs from two jobs are merged into shared reads.

    Per worker the expected layout is: for each job, a prefix read per file
    then a halt; then one merged full read per file then a closing halt.
    """
    job_ids = [1, 2]
    phase_zero_prefix_size = 4242
    job_one_inputs = {
        "host_A": {
            0: [("file_A_1", 1000), ("file_A_2", 3000)],
            1: [("file_A_3", 1000)],
            2: [("file_A_4", 500), ("file_A_5", 6000)],
            3: [("file_A_6", 1000), ("file_A_7", 2000),
                ("file_A_8", 1000)],
        },
        "host_B": {
            0: [("file_A_2", 3000)],
            1: [("file_A_3", 1000)],
            2: [("file_A_4", 500)],
            3: [("file_A_6", 1000)],
        },
    }
    job_two_inputs = {
        "host_A": {
            0: [("file_A_1", 1000), ("file_A_2", 3000)],
            1: [("file_A_3", 1000)],
            2: [("file_A_4", 500), ("file_A_5", 6000)],
            3: [("file_A_6", 1000), ("file_A_7", 2000)],
        },
        "host_B": {
            0: [("file_A_2", 3000)],
            1: [("file_A_3", 1000)],
            2: [("file_A_4", 500)],
            3: [("file_A_7", 2000)],
        },
    }

    read_requests = generate_read_requests(
        [job_one_inputs, job_two_inputs], phase_zero_prefix_size, job_ids)

    # Expected file assignments after the scan-sharing merge:
    # (path, length, job IDs that read the file).
    merged = {
        "host_A": {
            0: [("file_A_1", 1000, [1, 2]), ("file_A_2", 3000, [1, 2])],
            1: [("file_A_3", 1000, [1, 2])],
            2: [("file_A_4", 500, [1, 2]), ("file_A_5", 6000, [1, 2])],
            3: [("file_A_6", 1000, [1, 2]), ("file_A_7", 2000, [1, 2]),
                ("file_A_8", 1000, [1])],
        },
        "host_B": {
            0: [("file_A_2", 3000, [1, 2])],
            1: [("file_A_3", 1000, [1, 2])],
            2: [("file_A_4", 500, [1, 2])],
            3: [("file_A_6", 1000, [1]), ("file_A_7", 2000, [2])],
        },
    }

    for host, worker in utils.flattened_keys(merged):
        assignments = merged[host][worker]
        worker_requests = read_requests[host][worker]
        # Two jobs x (file prefixes + halt) + (full reads + halt)
        # = 3 * (files + 1).
        self.assertEqual((len(assignments) + 1) * 3, len(worker_requests))

        idx = 0
        # Phase zero prefixes, once per job, each run ending in a halt.
        for job_id in job_ids:
            for path, _length, _jobs in assignments:
                request = worker_requests[idx]
                self.assertEqual(path, request["path"])
                self.assertEqual(phase_zero_prefix_size, request["length"])
                self.assertEqual([job_id], request["job_ids"])
                self.assertEqual(0, request["offset"])
                self.assertEqual(0, request["type"])
                idx += 1
            self.assertEqual(1, worker_requests[idx]["type"])
            self.assertEqual([job_id], worker_requests[idx]["job_ids"])
            idx += 1
        # Phase one: one merged full read per file.
        for path, length, jobs in assignments:
            request = worker_requests[idx]
            self.assertEqual(path, request["path"])
            self.assertEqual(length, request["length"])
            self.assertEqual(jobs, request["job_ids"])
            self.assertEqual(0, request["offset"])
            self.assertEqual(0, request["type"])
            idx += 1
        # Closing halt request covers all jobs.
        self.assertEqual(1, worker_requests[idx]["type"])
        self.assertEqual(job_ids, worker_requests[idx]["job_ids"])
def test_single_job(self):
    """Single-job requests: prefix reads, halt, full reads, halt."""
    worker_inputs = {
        "host_A": {
            0: [("file_A_1", 1000), ("file_A_2", 3000)],
            1: [("file_A_3", 1000)],
            2: [("file_A_4", 500), ("file_A_5", 6000)],
            3: [("file_A_6", 1000), ("file_A_7", 2000)],
        },
        "host_B": {
            0: [("file_A_2", 3000)],
            1: [("file_A_3", 1000)],
            2: [("file_A_4", 500)],
            3: [("file_A_6", 1000), ("file_A_7", 2000)],
        },
    }
    phase_zero_prefix_size = 4242
    job_ids = [1]

    read_requests = generate_read_requests(
        [worker_inputs], phase_zero_prefix_size, job_ids)

    for host, worker in utils.flattened_keys(worker_inputs):
        assigned_files = worker_inputs[host][worker]
        worker_requests = read_requests[host][worker]
        # One prefix read per file + halt, one full read per file + halt.
        self.assertEqual(
            2 * (len(assigned_files) + 1), len(worker_requests))

        idx = 0
        # Phase zero: prefix read per file.
        for path, _length in assigned_files:
            request = worker_requests[idx]
            self.assertEqual(job_ids, request["job_ids"])
            self.assertEqual(path, request["path"])
            self.assertEqual(0, request["offset"])
            self.assertEqual(0, request["type"])
            self.assertEqual(phase_zero_prefix_size, request["length"])
            idx += 1
        # Halt request for phase zero comes next.
        self.assertEqual(1, worker_requests[idx]["type"])
        self.assertEqual(job_ids, worker_requests[idx]["job_ids"])
        idx += 1
        # Phase one: full read per file.
        for path, file_length in assigned_files:
            request = worker_requests[idx]
            self.assertEqual(job_ids, request["job_ids"])
            self.assertEqual(path, request["path"])
            self.assertEqual(0, request["offset"])
            self.assertEqual(0, request["type"])
            self.assertEqual(file_length, request["length"])
            idx += 1
        # Phase one ends with its own halt request.
        self.assertEqual(1, worker_requests[idx]["type"])
        self.assertEqual(job_ids, worker_requests[idx]["job_ids"])
        idx += 1