Example #1
import json

# redis_utils and input_file_utils are helper modules from the surrounding
# project (the Redis-backed coordinator database and input-file utilities);
# the bare imports below assume they live alongside this script.
import input_file_utils
import redis_utils


def reload_read_request_queues(job_description_file, job_ids, redis_host,
                               redis_port, redis_db, skip_phase_zero,
                               skip_phase_one, phase_zero_sample_size):
    # Load the job description from its JSON file.
    with open(job_description_file, 'r') as fp:
        job_description = json.load(fp)

    coordinator_db = redis_utils.CoordinatorDB(redis_host, redis_port,
                                               redis_db)

    input_files = input_file_utils.gather_input_file_paths(
        coordinator_db, job_description["input_directory"])

    # Only regenerate read requests for the phases that are not being skipped.
    phases = []

    if not skip_phase_zero:
        phases.append(0)

    if not skip_phase_one:
        phases.append(1)

    read_requests = input_file_utils.generate_read_requests(
        input_files, phase_zero_sample_size, job_ids, phases)

    input_file_utils.load_read_requests(coordinator_db, read_requests)
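
A minimal sketch of how this helper could be driven from the command line; the argparse flags and defaults below are illustrative assumptions, not part of the original script.

import argparse

def main():
    parser = argparse.ArgumentParser(
        description="Reload read request queues for one or more jobs")
    parser.add_argument("job_description_file",
                        help="path to the job description JSON file")
    parser.add_argument("job_ids", type=int, nargs="+",
                        help="IDs of the jobs whose queues should be reloaded")
    parser.add_argument("--redis_host", default="localhost")
    parser.add_argument("--redis_port", type=int, default=6379)
    parser.add_argument("--redis_db", type=int, default=0)
    parser.add_argument("--skip_phase_zero", action="store_true")
    parser.add_argument("--skip_phase_one", action="store_true")
    parser.add_argument("--phase_zero_sample_size", type=int, default=100000)
    args = parser.parse_args()

    reload_read_request_queues(
        args.job_description_file, args.job_ids, args.redis_host,
        args.redis_port, args.redis_db, args.skip_phase_zero,
        args.skip_phase_one, args.phase_zero_sample_size)

if __name__ == "__main__":
    main()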
Example #2

    # setup_new_job is a method of the job-coordinator class; self.coordinator_db,
    # self.fail_job, gather_input_file_paths, and log come from that class and
    # its module.
    def setup_new_job(self, job_description, job_id):
        # Put intermediates and outputs for this job in their own
        # sub-directories.
        job_description["intermediate_directory"] += "/job_%d" % job_id
        job_description["output_directory"] += "/job_%d" % job_id

        self.coordinator_db.new_job_info(job_id, job_description)

        # Extract a list of all input files from the directory
        # corresponding to the input location
        input_dir = job_description["input_directory"]

        max_input_files_per_disk = None
        if "max_input_files_per_disk" in job_description:
            max_input_files_per_disk = \
                job_description["max_input_files_per_disk"]

        input_path_struct = gather_input_file_paths(
            self.coordinator_db, input_dir, max_input_files_per_disk)

        if input_path_struct is not None:
            (worker_inputs, total_input_size) = input_path_struct
        else:
            worker_inputs = None
            total_input_size = 0

        # The job can't run if the input directory couldn't be listed, if it
        # contains no files, or if all of its files are empty.
        input_files_error = (worker_inputs is None or
                             len(worker_inputs) == 0 or
                             total_input_size == 0)

        if worker_inputs is None:
            error_msg = "Unable to list input directory '%s'" % input_dir
        elif len(worker_inputs) == 0:
            error_msg = ("Didn't find any input files in directory '%s'"
                         % input_dir)
        elif total_input_size == 0:
            error_msg = "Total length of all input files is 0B"

        if input_files_error:
            self.fail_job(job_id, error_msg)
            log.error("Job %d failed: %s" % (job_id, error_msg))

            self.coordinator_db.update_job_status(
                job_id, {"fail_message": error_msg}, post_status="Failed")
            return None

        self.coordinator_db.update_job_status(
            job_id, {"total_input_size_bytes": total_input_size})

        return worker_inputs
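
The three failure branches above reduce to one classification step; the standalone sketch below isolates that logic so it can be exercised on its own (classify_input_error is an illustrative name, not part of the original module).

def classify_input_error(worker_inputs, total_input_size, input_dir):
    """Return an error message if the gathered inputs are unusable, else None."""
    if worker_inputs is None:
        return "Unable to list input directory '%s'" % input_dir
    if len(worker_inputs) == 0:
        return "Didn't find any input files in directory '%s'" % input_dir
    if total_input_size == 0:
        return "Total length of all input files is 0B"
    return None

if __name__ == "__main__":
    # Exercise the three failure cases and the success case.
    assert classify_input_error(None, 0, "/data").startswith("Unable to list")
    assert classify_input_error({}, 0, "/data").startswith("Didn't find")
    assert classify_input_error({"disk0": ["a.part"]}, 0,
                                "/data").startswith("Total length")
    assert classify_input_error({"disk0": ["a.part"]}, 10240, "/data") is None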