Example #1
def reload_read_request_queues(job_description_file, job_ids, redis_host,
                               redis_port, redis_db, skip_phase_zero,
                               skip_phase_one, phase_zero_sample_size):

    with open(job_description_file, 'r') as fp:
        job_description = json.load(fp)

    coordinator_db = redis_utils.CoordinatorDB(redis_host, redis_port,
                                               redis_db)

    input_files = input_file_utils.gather_input_file_paths(
        coordinator_db, job_description["input_directory"])

    phases = []

    if not skip_phase_zero:
        phases.append(0)

    if not skip_phase_one:
        phases.append(1)

    read_requests = input_file_utils.generate_read_requests(
        input_files, phase_zero_sample_size, job_ids, phases)

    input_file_utils.load_read_requests(coordinator_db, read_requests)
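
The helper above re-derives the full set of read requests from a job description file and pushes them back into the coordinator's Redis-backed queues, honoring the phase-skip flags. A minimal command-line driver for it might look like the sketch below; the flag names and defaults are illustrative assumptions, not the project's actual interface.

import argparse

# Hypothetical driver for reload_read_request_queues(); flag names and
# defaults are assumptions chosen for illustration only.
def main():
    parser = argparse.ArgumentParser(
        description="Rebuild and reload per-worker read request queues")
    parser.add_argument("job_description_file",
                        help="path to the job description (JSON)")
    parser.add_argument("job_ids", nargs="+", type=int,
                        help="job ID(s) to generate read requests for")
    parser.add_argument("--redis_host", default="localhost")
    parser.add_argument("--redis_port", type=int, default=6379)
    parser.add_argument("--redis_db", type=int, default=0)
    parser.add_argument("--skip_phase_zero", action="store_true")
    parser.add_argument("--skip_phase_one", action="store_true")
    parser.add_argument("--phase_zero_sample_size", type=int, default=100000)
    args = parser.parse_args()

    reload_read_request_queues(
        args.job_description_file, args.job_ids, args.redis_host,
        args.redis_port, args.redis_db, args.skip_phase_zero,
        args.skip_phase_one, args.phase_zero_sample_size)

if __name__ == "__main__":
    main()
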
Example #2
def reload_read_request_queues(
    job_description_file, job_ids, redis_host, redis_port, redis_db,
    skip_phase_zero, skip_phase_one, phase_zero_sample_size):

    with open(job_description_file, 'r') as fp:
        job_description = json.load(fp)

    coordinator_db = redis_utils.CoordinatorDB(redis_host, redis_port, redis_db)

    input_files = input_file_utils.gather_input_file_paths(
        coordinator_db, job_description["input_directory"])

    phases = []

    if not skip_phase_zero:
        phases.append(0)

    if not skip_phase_one:
        phases.append(1)

    read_requests = input_file_utils.generate_read_requests(
        input_files, phase_zero_sample_size, job_ids, phases)

    input_file_utils.load_read_requests(coordinator_db, read_requests)
Example #3
    def test_multi_job_scan_share(self):
        job_ids = [1, 2]
        phase_zero_prefix_size = 4242

        worker_inputs = []
        worker_inputs.append({
                "host_A" : {
                    0 : [("file_A_1", 1000), ("file_A_2", 3000)],
                    1 : [("file_A_3", 1000)],
                    2 : [("file_A_4", 500), ("file_A_5", 6000)],
                    3 : [("file_A_6", 1000), ("file_A_7", 2000),
                         ("file_A_8", 1000)]
                    },
                "host_B" : {
                    0 : [("file_A_2", 3000)],
                    1 : [("file_A_3", 1000)],
                    2 : [("file_A_4", 500)],
                    3 : [("file_A_6", 1000)]
                    }
                })
        worker_inputs.append({
                "host_A" : {
                    0 : [("file_A_1", 1000), ("file_A_2", 3000)],
                    1 : [("file_A_3", 1000)],
                    2 : [("file_A_4", 500), ("file_A_5", 6000)],
                    3 : [("file_A_6", 1000), ("file_A_7", 2000)]
                    },
                "host_B" : {
                    0 : [("file_A_2", 3000)],
                    1 : [("file_A_3", 1000)],
                    2 : [("file_A_4", 500)],
                    3 : [("file_A_7", 2000)]
                    }
                })

        read_requests = generate_read_requests(
            worker_inputs, phase_zero_prefix_size, job_ids)

        # Expected file assignments after scan-sharing merge

        expected_assignments = {
            "host_A" : {
                0 : [("file_A_1", 1000, [1,2]), ("file_A_2", 3000, [1,2])],
                1 : [("file_A_3", 1000, [1,2])],
                2 : [("file_A_4", 500, [1,2]), ("file_A_5", 6000, [1,2])],
                3 : [("file_A_6", 1000, [1,2]), ("file_A_7", 2000, [1,2]),
                     ("file_A_8", 1000, [1])]
                },
            "host_B" : {
                0 : [("file_A_2", 3000, [1,2])],
                1 : [("file_A_3", 1000, [1,2])],
                2 : [("file_A_4", 500, [1,2])],
                3 : [("file_A_6", 1000, [1]), ("file_A_7", 2000, [2])]
                }
            }

        for host, worker in utils.flattened_keys(expected_assignments):
            assignments = expected_assignments[host][worker]
            reqs = read_requests[host][worker]

            self.assertEqual((len(assignments) + 1) * 3, len(reqs))

            req_index = 0

            # Should first have a phase zero prefix for each file
            for job_id in job_ids:
                for assignment in assignments:
                    read_request = reqs[req_index]

                    self.assertEqual(assignment[0], read_request["path"])
                    self.assertEqual(
                        phase_zero_prefix_size, read_request["length"])
                    self.assertEqual([job_id], read_request["job_ids"])
                    self.assertEqual(0, read_request["offset"])
                    self.assertEqual(0, read_request["type"])

                    req_index += 1

                # These read requests should be followed by a halt request
                self.assertEqual(1, reqs[req_index]["type"])
                self.assertEqual([job_id], reqs[req_index]["job_ids"])
                req_index += 1

            # Next, should have a full phase one read request for each file
            for assignment in assignments:
                read_request = reqs[req_index]

                self.assertEqual(assignment[0], read_request["path"])
                self.assertEqual(assignment[1], read_request["length"])
                self.assertEqual(assignment[2], read_request["job_ids"])
                self.assertEqual(0, read_request["offset"])
                self.assertEqual(0, read_request["type"])

                req_index += 1

            # These read requests should be followed by a halt request
            self.assertEqual(1, reqs[req_index]["type"])
            self.assertEqual(job_ids, reqs[req_index]["job_ids"])
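
The expected_assignments dict above captures the scan-sharing merge: when both jobs need the same file on the same worker, a single scan is issued and tagged with both job IDs, while files needed by only one job keep a single-element job list. The sketch below reconstructs that merge step from the test data; it is an illustration of the expected result, not the project's implementation inside generate_read_requests.

# Sketch of the scan-sharing merge implied by expected_assignments above:
# per-job (path, length) lists for each (host, worker) collapse into
# (path, length, [job_ids]) tuples so each distinct file is scanned once.
def merge_scan_shared_inputs(worker_inputs, job_ids):
    merged = {}
    for job_id, inputs in zip(job_ids, worker_inputs):
        for host, workers in inputs.items():
            for worker, files in workers.items():
                bucket = merged.setdefault(host, {}).setdefault(worker, {})
                for path, length in files:
                    length_and_jobs = bucket.setdefault(path, (length, []))
                    length_and_jobs[1].append(job_id)
    # Flatten back into ordered (path, length, job_ids) tuples per worker.
    return {
        host: {
            worker: [(path, length, jobs)
                     for path, (length, jobs) in files.items()]
            for worker, files in workers.items()}
        for host, workers in merged.items()}

Under Python 3.7+ dict ordering, calling merge_scan_shared_inputs(worker_inputs, job_ids) on the inputs above reproduces expected_assignments.
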
Example #4
    def test_single_job(self):
        worker_inputs = {
            "host_A" : {
                0 : [("file_A_1", 1000), ("file_A_2", 3000)],
                1 : [("file_A_3", 1000)],
                2 : [("file_A_4", 500), ("file_A_5", 6000)],
                3 : [("file_A_6", 1000), ("file_A_7", 2000)]
                },
            "host_B" : {
                0 : [("file_A_2", 3000)],
                1 : [("file_A_3", 1000)],
                2 : [("file_A_4", 500)],
                3 : [("file_A_6", 1000), ("file_A_7", 2000)]
                }
            }

        phase_zero_prefix_size = 4242
        job_ids = [1]

        read_requests = generate_read_requests(
            [worker_inputs], phase_zero_prefix_size, job_ids)

        for host, worker in utils.flattened_keys(worker_inputs):
            worker_reqs = read_requests[host][worker]

            self.assertEqual(
                2 * (len(worker_inputs[host][worker]) + 1), len(worker_reqs))

            req_index = 0

            # Should first have a phase zero prefix for each file
            for filename, length in worker_inputs[host][worker]:
                req = worker_reqs[req_index]

                self.assertEqual(job_ids, req["job_ids"])
                self.assertEqual(filename, req["path"])
                self.assertEqual(0, req["offset"])
                self.assertEqual(0, req["type"])
                self.assertEqual(phase_zero_prefix_size, req["length"])

                req_index += 1

            # Halt request for phase zero should come after that
            self.assertEqual(1, worker_reqs[req_index]["type"])
            self.assertEqual(job_ids, worker_reqs[req_index]["job_ids"])

            req_index += 1

            # Next, should have a full phase one read request for each file
            for filename, length in worker_inputs[host][worker]:
                req = worker_reqs[req_index]

                self.assertEqual(job_ids, req["job_ids"])
                self.assertEqual(filename, req["path"])
                self.assertEqual(0, req["offset"])
                self.assertEqual(0, req["type"])
                self.assertEqual(length, req["length"])

                req_index += 1

            # These read requests should be followed by a halt request
            self.assertEqual(1, worker_reqs[req_index]["type"])
            self.assertEqual(job_ids, worker_reqs[req_index]["job_ids"])

            req_index += 1
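
Taken together, the two tests pin down the shape of generate_read_requests' output: a host → worker → ordered list mapping, where each entry is a dict with "path", "offset", "length", "job_ids", and "type" keys (type 0 for a read, type 1 for a halt marker), with the phase zero prefix reads and their halt ahead of the full-length phase one reads and a final halt. The following sketch satisfies the single-job layout checked above; it is written purely from the assertions and ignores scan sharing, sampling options, and phase selection that the real function handles.

# Minimal single-job sketch inferred from test_single_job's assertions;
# not the project's implementation.
def generate_read_requests_single_job(worker_inputs, prefix_size, job_ids):
    read_requests = {}
    for host, workers in worker_inputs.items():
        read_requests[host] = {}
        for worker, files in workers.items():
            reqs = []
            # Phase zero: a prefix read per file, then a halt request.
            for path, length in files:
                reqs.append({"type": 0, "path": path, "offset": 0,
                             "length": prefix_size, "job_ids": job_ids})
            reqs.append({"type": 1, "job_ids": job_ids})
            # Phase one: a full-file read per file, then a halt request.
            for path, length in files:
                reqs.append({"type": 0, "path": path, "offset": 0,
                             "length": length, "job_ids": job_ids})
            reqs.append({"type": 1, "job_ids": job_ids})
            read_requests[host][worker] = reqs
    return read_requests
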
Example #5
    def test_multi_job_scan_share(self):
        job_ids = [1, 2]
        phase_zero_prefix_size = 4242

        worker_inputs = []
        worker_inputs.append({
            "host_A": {
                0: [("file_A_1", 1000), ("file_A_2", 3000)],
                1: [("file_A_3", 1000)],
                2: [("file_A_4", 500), ("file_A_5", 6000)],
                3: [("file_A_6", 1000), ("file_A_7", 2000), ("file_A_8", 1000)]
            },
            "host_B": {
                0: [("file_A_2", 3000)],
                1: [("file_A_3", 1000)],
                2: [("file_A_4", 500)],
                3: [("file_A_6", 1000)]
            }
        })
        worker_inputs.append({
            "host_A": {
                0: [("file_A_1", 1000), ("file_A_2", 3000)],
                1: [("file_A_3", 1000)],
                2: [("file_A_4", 500), ("file_A_5", 6000)],
                3: [("file_A_6", 1000), ("file_A_7", 2000)]
            },
            "host_B": {
                0: [("file_A_2", 3000)],
                1: [("file_A_3", 1000)],
                2: [("file_A_4", 500)],
                3: [("file_A_7", 2000)]
            }
        })

        read_requests = generate_read_requests(worker_inputs,
                                               phase_zero_prefix_size, job_ids)

        # Expected file assignments after scan-sharing merge

        expected_assignments = {
            "host_A": {
                0: [("file_A_1", 1000, [1, 2]), ("file_A_2", 3000, [1, 2])],
                1: [("file_A_3", 1000, [1, 2])],
                2: [("file_A_4", 500, [1, 2]), ("file_A_5", 6000, [1, 2])],
                3: [("file_A_6", 1000, [1, 2]), ("file_A_7", 2000, [1, 2]),
                    ("file_A_8", 1000, [1])]
            },
            "host_B": {
                0: [("file_A_2", 3000, [1, 2])],
                1: [("file_A_3", 1000, [1, 2])],
                2: [("file_A_4", 500, [1, 2])],
                3: [("file_A_6", 1000, [1]), ("file_A_7", 2000, [2])]
            }
        }

        for host, worker in utils.flattened_keys(expected_assignments):
            assignments = expected_assignments[host][worker]
            reqs = read_requests[host][worker]

            self.assertEqual((len(assignments) + 1) * 3, len(reqs))

            req_index = 0

            # Should first have a phase zero prefix for each file
            for job_id in job_ids:
                for assignment in assignments:
                    read_request = reqs[req_index]

                    self.assertEqual(assignment[0], read_request["path"])
                    self.assertEqual(phase_zero_prefix_size,
                                     read_request["length"])
                    self.assertEqual([job_id], read_request["job_ids"])
                    self.assertEqual(0, read_request["offset"])
                    self.assertEqual(0, read_request["type"])

                    req_index += 1

                # These read requests should be followed by a halt request
                self.assertEqual(1, reqs[req_index]["type"])
                self.assertEqual([job_id], reqs[req_index]["job_ids"])
                req_index += 1

            # Next, should have a full phase one read request for each file
            for assignment in assignments:
                read_request = reqs[req_index]

                self.assertEqual(assignment[0], read_request["path"])
                self.assertEqual(assignment[1], read_request["length"])
                self.assertEqual(assignment[2], read_request["job_ids"])
                self.assertEqual(0, read_request["offset"])
                self.assertEqual(0, read_request["type"])

                req_index += 1

            # These read requests should be followed by a halt request
            self.assertEqual(1, reqs[req_index]["type"])
            self.assertEqual(job_ids, reqs[req_index]["job_ids"])
Example #6
    def test_single_job(self):
        worker_inputs = {
            "host_A": {
                0: [("file_A_1", 1000), ("file_A_2", 3000)],
                1: [("file_A_3", 1000)],
                2: [("file_A_4", 500), ("file_A_5", 6000)],
                3: [("file_A_6", 1000), ("file_A_7", 2000)]
            },
            "host_B": {
                0: [("file_A_2", 3000)],
                1: [("file_A_3", 1000)],
                2: [("file_A_4", 500)],
                3: [("file_A_6", 1000), ("file_A_7", 2000)]
            }
        }

        phase_zero_prefix_size = 4242
        job_ids = [1]

        read_requests = generate_read_requests([worker_inputs],
                                               phase_zero_prefix_size, job_ids)

        for host, worker in utils.flattened_keys(worker_inputs):
            worker_reqs = read_requests[host][worker]

            self.assertEqual(2 * (len(worker_inputs[host][worker]) + 1),
                             len(worker_reqs))

            req_index = 0

            # Should first have a phase zero prefix for each file
            for filename, length in worker_inputs[host][worker]:
                req = worker_reqs[req_index]

                self.assertEqual(job_ids, req["job_ids"])
                self.assertEqual(filename, req["path"])
                self.assertEqual(0, req["offset"])
                self.assertEqual(0, req["type"])
                self.assertEqual(phase_zero_prefix_size, req["length"])

                req_index += 1

            # Halt request for phase zero should come after that
            self.assertEqual(1, worker_reqs[req_index]["type"])
            self.assertEqual(job_ids, worker_reqs[req_index]["job_ids"])

            req_index += 1

            # Next, should have a full phase one read request for each file
            for filename, length in worker_inputs[host][worker]:
                req = worker_reqs[req_index]

                self.assertEqual(job_ids, req["job_ids"])
                self.assertEqual(filename, req["path"])
                self.assertEqual(0, req["offset"])
                self.assertEqual(0, req["type"])
                self.assertEqual(length, req["length"])

                req_index += 1

            # These read requests should be followed by a halt request
            self.assertEqual(1, worker_reqs[req_index]["type"])
            self.assertEqual(job_ids, worker_reqs[req_index]["job_ids"])

            req_index += 1
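
Once generated, the requests are handed to load_read_requests, which enqueues them per worker through the CoordinatorDB wrapper. The snippet below is only a hypothetical illustration of that idea using redis-py directly; the key naming scheme and the use of a raw Redis client (instead of CoordinatorDB) are assumptions, not the project's actual storage layout.

import json
import redis

# Hypothetical: push each worker's request dicts onto a Redis list so that
# workers can pop them in FIFO order. Key names are invented for this sketch.
def load_read_requests_sketch(client, read_requests):
    for host, workers in read_requests.items():
        for worker, reqs in workers.items():
            queue_key = "read_requests:%s:%d" % (host, worker)
            for req in reqs:
                client.rpush(queue_key, json.dumps(req))

# Example wiring, assuming a local Redis instance:
# client = redis.StrictRedis(host="localhost", port=6379, db=0)
# load_read_requests_sketch(client, read_requests)
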
Example #7
    def run_batch(self, batch_jobs, batch_inputs):
        batch_id = self.coordinator_db.next_batch_id

        log.info("Running batch %d with the following job(s): %s" %
                 (batch_id, ', '.join(map(str, batch_jobs))))


        # Create log directory for the current batch
        batch_logs = create_batch_directory(self.log_directory, batch_id)

        # Copy description files to the log directory
        description_dir = os.path.join(
            os.path.dirname(__file__), os.pardir, os.pardir, os.pardir,
            "tritonsort", "mapreduce", "description")
        shutil.copy(os.path.join(description_dir, "stages.json"), batch_logs)
        shutil.copy(os.path.join(description_dir, "structure.json"), batch_logs)

        # Copy config file to log directory
        shutil.copy(self.config_file, batch_logs)

        self.ready_for_next_batch = False

        # Pull out relevant phase zero parameters
        phase_zero_sample_rate = 1 # Sample 100% by default
        if "SAMPLE_RATE" in self.config:
            phase_zero_sample_rate = float(self.config["SAMPLE_RATE"])
        phase_zero_sample_points_per_file = 1 # Sample prefixes by default
        if "SAMPLES_PER_FILE" in self.config:
            phase_zero_sample_points_per_file = \
                int(self.config["SAMPLES_PER_FILE"])
        fixed_key_length = None
        if "MAP_INPUT_FIXED_KEY_LENGTH" in self.config:
            fixed_key_length = int(self.config["MAP_INPUT_FIXED_KEY_LENGTH"])
        fixed_value_length = None
        if "MAP_INPUT_FIXED_VALUE_LENGTH" in self.config:
            fixed_value_length = \
                int(self.config["MAP_INPUT_FIXED_VALUE_LENGTH"])

        # If the application config file (yaml) or the job spec file (json)
        # skips a phase, we should not load read requests for that phase. The
        # job spec file should override the application config file.
        skip_phase_zero = 0
        skip_phase_one = 0
        skip_phase_two = 0
        skip_phase_three = 0
        if "SKIP_PHASE_ZERO" in self.config and self.config["SKIP_PHASE_ZERO"]:
            skip_phase_zero = 1
        if "SKIP_PHASE_ONE" in self.config and self.config["SKIP_PHASE_ONE"]:
            skip_phase_one = 1
        if "SKIP_PHASE_TWO" in self.config and self.config["SKIP_PHASE_TWO"]:
            skip_phase_two = 1
        if "SKIP_PHASE_THREE" in self.config and \
                self.config["SKIP_PHASE_THREE"]:
            skip_phase_three = 1

        # The run_job.py script verifies that all jobs in the batch have the
        # same value of these skip parameters in the job specs, so we can just
        # check the first one.
        for key, value in (
            self.coordinator_db.job_params(batch_jobs[0]).items()):
            if key == "SKIP_PHASE_ZERO":
                skip_phase_zero = value
            if key == "SKIP_PHASE_ONE":
                skip_phase_one = value
            if key == "SKIP_PHASE_TWO":
                skip_phase_two = value
            if key == "SKIP_PHASE_THREE":
                skip_phase_three = value
            if key == "MAP_INPUT_FIXED_KEY_LENGTH":
                fixed_key_length = int(value)
            if key == "MAP_INPUT_FIXED_VALUE_LENGTH":
                fixed_value_length = int(value)

        fixed_tuple_length = None
        if fixed_key_length is not None and fixed_value_length is not None:
            fixed_tuple_length = fixed_key_length + fixed_value_length

        use_replication = False
        if "OUTPUT_REPLICATION_LEVEL" in self.config and \
                int(self.config["OUTPUT_REPLICATION_LEVEL"]) > 1:
            use_replication = True

        phases = []
        if not skip_phase_zero:
            phases.append(0)
        if not skip_phase_one:
            phases.append(1)
        if not skip_phase_two and use_replication:
            # If we're using replication, phase two will have network transfer,
            # use barriers to guarantee sockets are connected.
            phases.append(2)
        if not skip_phase_three and use_replication:
            # If we're using replication, phase three will have network
            # transfer, use barriers to guarantee sockets are connected.
            phases.append(3)

        # Setup barriers
        self.coordinator_db.create_barriers(phases, batch_id, batch_jobs)

        # Generate read requests for the jobs in the batch
        read_requests = generate_read_requests(
            job_inputs=batch_inputs,
            phase_zero_sample_rate=phase_zero_sample_rate,
            phase_zero_sample_points_per_file=(
                phase_zero_sample_points_per_file),
            tuple_start_offset=fixed_tuple_length,
            job_ids=batch_jobs, phases=phases)

        # Load read requests into read request queue for each worker
        load_read_requests(self.coordinator_db, read_requests)

        start_time = time.time()
        # Mark phase zero as starting now.
        self.coordinator_db.begin_phase(batch_id, "phase_zero")
        self.batch_phase_info[batch_id] = ("phase_zero", 0, start_time)
        log.info("Running phase_zero...")
        print_keyboard_commands()

        for job_id in batch_jobs:
            self.coordinator_db.update_job_status(
                job_id, { "start_time" : str(start_time),
                          "batch_id" : batch_id,
                          "date" : time.asctime()})

        self.coordinator_db.add_jobs_to_batch(batch_id, batch_jobs)

        self.coordinator_db.mark_batch_incomplete(batch_id)

        # Setting current_batch will cause all node coordinators to start work
        # on that batch
        self.coordinator_db.add_batch_to_node_coordinator_batch_queues(batch_id)
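
run_batch() resolves its sampling, fixed record-length, phase-skip, and replication settings from the application config, with per-job spec parameters overriding the skip flags and record lengths. For reference, the keys it consults could be laid out as below; the values shown are illustrative assumptions, not recommended settings.

# Keys run_batch() reads from self.config (values here are only examples).
example_config = {
    "SAMPLE_RATE": 1.0,                  # fraction of input sampled in phase zero
    "SAMPLES_PER_FILE": 1,               # 1 = sample only a prefix of each file
    "MAP_INPUT_FIXED_KEY_LENGTH": 10,    # key + value lengths, if both set,
    "MAP_INPUT_FIXED_VALUE_LENGTH": 90,  # give the fixed tuple_start_offset
    "SKIP_PHASE_ZERO": 0,                # job spec values override these flags
    "SKIP_PHASE_ONE": 0,
    "SKIP_PHASE_TWO": 0,
    "SKIP_PHASE_THREE": 0,
    "OUTPUT_REPLICATION_LEVEL": 1,       # > 1 adds barriers for phases two/three
}
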
Example #8
    def run_batch(self, batch_jobs, batch_inputs):
        batch_id = self.coordinator_db.next_batch_id

        log.info("Running batch %d with the following job(s): %s" %
                 (batch_id, ', '.join(map(str, batch_jobs))))

        # Create log directory for the current batch
        batch_logs = create_batch_directory(self.log_directory, batch_id)

        # Copy description files to the log directory
        description_dir = os.path.join(os.path.dirname(__file__), os.pardir,
                                       os.pardir, os.pardir, "tritonsort",
                                       "mapreduce", "description")
        shutil.copy(os.path.join(description_dir, "stages.json"), batch_logs)
        shutil.copy(os.path.join(description_dir, "structure.json"),
                    batch_logs)

        # Copy config file to log directory
        shutil.copy(self.config_file, batch_logs)

        self.ready_for_next_batch = False

        # Pull out relevant phase zero parameters
        phase_zero_sample_rate = 1  # Sample 100% by default
        if "SAMPLE_RATE" in self.config:
            phase_zero_sample_rate = float(self.config["SAMPLE_RATE"])
        phase_zero_sample_points_per_file = 1  # Sample prefixes by default
        if "SAMPLES_PER_FILE" in self.config:
            phase_zero_sample_points_per_file = \
                int(self.config["SAMPLES_PER_FILE"])
        fixed_key_length = None
        if "MAP_INPUT_FIXED_KEY_LENGTH" in self.config:
            fixed_key_length = int(self.config["MAP_INPUT_FIXED_KEY_LENGTH"])
        fixed_value_length = None
        if "MAP_INPUT_FIXED_VALUE_LENGTH" in self.config:
            fixed_value_length = \
                int(self.config["MAP_INPUT_FIXED_VALUE_LENGTH"])

        # If the application config file (yaml) or the job spec file (json)
        # skips a phase, we should not load read requests for that phase. The
        # job spec file should override the application config file.
        skip_phase_zero = 0
        skip_phase_one = 0
        skip_phase_two = 0
        skip_phase_three = 0
        if "SKIP_PHASE_ZERO" in self.config and self.config["SKIP_PHASE_ZERO"]:
            skip_phase_zero = 1
        if "SKIP_PHASE_ONE" in self.config and self.config["SKIP_PHASE_ONE"]:
            skip_phase_one = 1
        if "SKIP_PHASE_TWO" in self.config and self.config["SKIP_PHASE_TWO"]:
            skip_phase_two = 1
        if "SKIP_PHASE_THREE" in self.config and \
                self.config["SKIP_PHASE_THREE"]:
            skip_phase_three = 1

        # The run_job.py script verifies that all jobs in the batch have the
        # same value of these skip parameters in the job specs, so we can just
        # check the first one.
        for key, value in (self.coordinator_db.job_params(
                batch_jobs[0]).items()):
            if key == "SKIP_PHASE_ZERO":
                skip_phase_zero = value
            if key == "SKIP_PHASE_ONE":
                skip_phase_one = value
            if key == "SKIP_PHASE_TWO":
                skip_phase_two = value
            if key == "SKIP_PHASE_THREE":
                skip_phase_three = value
            if key == "MAP_INPUT_FIXED_KEY_LENGTH":
                fixed_key_length = int(value)
            if key == "MAP_INPUT_FIXED_VALUE_LENGTH":
                fixed_value_length = int(value)

        fixed_tuple_length = None
        if fixed_key_length is not None and fixed_value_length is not None:
            fixed_tuple_length = fixed_key_length + fixed_value_length

        use_replication = False
        if "OUTPUT_REPLICATION_LEVEL" in self.config and \
                int(self.config["OUTPUT_REPLICATION_LEVEL"]) > 1:
            use_replication = True

        phases = []
        if not skip_phase_zero:
            phases.append(0)
        if not skip_phase_one:
            phases.append(1)
        if not skip_phase_two and use_replication:
            # If we're using replication, phase two will have network transfer,
            # use barriers to guarantee sockets are connected.
            phases.append(2)
        if not skip_phase_three and use_replication:
            # If we're using replication, phase three will have network
            # transfer, use barriers to guarantee sockets are connected.
            phases.append(3)

        # Setup barriers
        self.coordinator_db.create_barriers(phases, batch_id, batch_jobs)

        # Generate read requests for the jobs in the batch
        read_requests = generate_read_requests(
            job_inputs=batch_inputs,
            phase_zero_sample_rate=phase_zero_sample_rate,
            phase_zero_sample_points_per_file=(
                phase_zero_sample_points_per_file),
            tuple_start_offset=fixed_tuple_length,
            job_ids=batch_jobs, phases=phases)

        # Load read requests into read request queue for each worker
        load_read_requests(self.coordinator_db, read_requests)

        start_time = time.time()
        # Mark phase zero as starting now.
        self.coordinator_db.begin_phase(batch_id, "phase_zero")
        self.batch_phase_info[batch_id] = ("phase_zero", 0, start_time)
        log.info("Running phase_zero...")
        print_keyboard_commands()

        for job_id in batch_jobs:
            self.coordinator_db.update_job_status(
                job_id, {
                    "start_time": str(start_time),
                    "batch_id": batch_id,
                    "date": time.asctime()
                })

        self.coordinator_db.add_jobs_to_batch(batch_id, batch_jobs)

        self.coordinator_db.mark_batch_incomplete(batch_id)

        # Setting current_batch will cause all node coordinators to start work
        # on that batch
        self.coordinator_db.add_batch_to_node_coordinator_batch_queues(
            batch_id)