Example #1
    def test_tasks(self):
        model = self.model
        u = model.User(email="*****@*****.**", password="******")
        job = model.Job()
        task = model.Task(job=job, working_directory="/tmp", prepare_files_cmd="split.sh")
        job.user = u
        self.persist(u, job, task)

        loaded_task = model.session.query(model.Task).filter(model.Task.job == job).first()
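        # Task's prepare_files_cmd constructor argument is persisted on the
        # prepare_input_files_cmd column, which is what the assertion checks.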
        assert loaded_task.prepare_input_files_cmd == "split.sh"
Example #2
    def test_job_metrics(self):
        model = self.model
        u = model.User(email="*****@*****.**", password="******")
        job = model.Job()
        job.user = u
        job.tool_id = "cat1"

        job.add_metric("gx", "galaxy_slots", 5)
        job.add_metric("system", "system_name", "localhost")

        self.persist(u, job)

        task = model.Task(job=job, working_directory="/tmp", prepare_files_cmd="split.sh")
        task.add_metric("gx", "galaxy_slots", 5)
        task.add_metric("system", "system_name", "localhost")

        big_value = ":".join("%d" % i for i in range(2000))
        task.add_metric("env", "BIG_PATH", big_value)
        self.persist(task)
        # Ensure big values truncated
        assert len(task.text_metrics[1].metric_value) <= 1023
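
The truncation assertion above only exercises anything if the joined value is actually longer than the 1023-character cap noted in the comment; a quick standalone sanity check (hypothetical, not part of the test) confirms it is:

    # Hypothetical check: ":".join over range(2000) produces a string several
    # thousand characters long, well past the 1023-character limit asserted above.
    big_value = ":".join("%d" % i for i in range(2000))
    assert len(big_value) > 1023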
Example #3
import logging
import os

from galaxy import model, util

log = logging.getLogger(__name__)


def do_split(job_wrapper):
    parent_job = job_wrapper.get_job()
    working_directory = os.path.abspath(job_wrapper.working_directory)

    parallel_settings = job_wrapper.tool.parallelism.attributes
    # Syntax: split_inputs="input1,input2" shared_inputs="genome"
    # Designates inputs to be split or shared
    split_inputs = parallel_settings.get("split_inputs")
    if split_inputs is None:
        split_inputs = []
    else:
        split_inputs = [x.strip() for x in split_inputs.split(",")]

    shared_inputs = parallel_settings.get("shared_inputs")
    if shared_inputs is None:
        shared_inputs = []
    else:
        shared_inputs = [x.strip() for x in shared_inputs.split(",")]
    illegal_inputs = [x for x in shared_inputs if x in split_inputs]
    if len(illegal_inputs) > 0:
        raise Exception("Inputs have conflicting parallelism attributes: %s" %
                        str(illegal_inputs))

    subdir_index = [0]  # use a list to get around Python 2.x lame closure support
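    # Note: under Python 3 the same counter could be a plain int updated via
    # "nonlocal" inside get_new_working_directory_name(); the one-element list
    # is kept to mirror the original Python 2-era style.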
    task_dirs = []

    def get_new_working_directory_name():
        dir = os.path.join(working_directory, 'task_%d' % subdir_index[0])
        subdir_index[0] = subdir_index[0] + 1
        if not os.path.exists(dir):
            os.makedirs(dir)
        task_dirs.append(dir)
        return dir

    # For things like paired end alignment, we need two inputs to be split. Since all inputs to all
    # derived subtasks need to be correlated, allow only one input type to be split
    type_to_input_map = {}
    for input in parent_job.input_datasets:
        if input.name in split_inputs:
            type_to_input_map.setdefault(input.dataset.datatype,
                                         []).append(input.name)
        elif input.name in shared_inputs:
            pass  # pass original file name
        else:
            log_error = "The input '%s' does not define a method for implementing parallelism" % str(input.name)
            log.error(log_error)
            raise Exception(log_error)

    if len(type_to_input_map) > 1:
        log_error = "The multi splitter does not support splitting inputs of more than one type"
        log.error(log_error)
        raise Exception(log_error)

    # split the first one to build up the task directories
    input_datasets = []
    for input in parent_job.input_datasets:
        if input.name in split_inputs:
            this_input_files = job_wrapper.get_input_dataset_fnames(input.dataset)
            if len(this_input_files) > 1:
                log_error = "The input '%s' is composed of multiple files - splitting is not allowed" % str(input.name)
                log.error(log_error)
                raise Exception(log_error)
            input_datasets.append(input.dataset)

    input_type = next(iter(type_to_input_map.keys()))
    # DBTODO execute an external task to do the splitting, this should happen at refactor.
    # If the number of tasks is sufficiently high, we can use it to calculate job completion % and give a running status.
    try:
        input_type.split(input_datasets, get_new_working_directory_name, parallel_settings)
    except AttributeError:
        log_error = "The type '%s' does not define a method for splitting files" % str(input_type)
        log.error(log_error)
        raise
    log.debug('do_split created %d parts' % len(task_dirs))
    # next, after we know how many divisions there are, add the shared inputs via soft links
    for input in parent_job.input_datasets:
        if input and input.name in shared_inputs:
            names = job_wrapper.get_input_dataset_fnames(input.dataset)
            for dir in task_dirs:
                for file in names:
                    os.symlink(file, os.path.join(dir, os.path.basename(file)))
    tasks = []
    prepare_files = os.path.join(util.galaxy_directory(),
                                 'extract_dataset_parts.sh') + ' %s'
    for dir in task_dirs:
        task = model.Task(parent_job, dir, prepare_files % dir)
        tasks.append(task)
    return tasks
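
For reference, the parallelism settings that do_split reads via job_wrapper.tool.parallelism.attributes take roughly the shape sketched below; the input names are placeholders, not values from a real tool:

    # Illustrative sketch of the attributes dict consumed above; do_split reads
    # the split_inputs/shared_inputs keys directly and passes the full dict
    # through to input_type.split().
    parallel_settings = {
        "split_inputs": "input1,input2",  # comma-separated inputs split per task
        "shared_inputs": "genome",        # inputs symlinked into every task directory
    }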