import os
import sys
import time
import urlparse  # Python 2 module; Python 3 would use urllib.parse

# Logger and PipelineAPI come from the Cloud Pipeline python client; the
# module providing SampleSheetParser and the column constants is not shown
# in the original snippet, so that import path is an assumption.
from pipeline import Logger, PipelineAPI
from sample_sheet_parser import SampleSheetParser, SAMPLE_ID, SAMPLE_NAME, SAMPLE_PROJECT


class RunAnalyticalPipelinesTask(object):
    """Launches one analytical pipeline run per sample and waits for them all to finish."""

    def __init__(self, task, pipeline, version, instance_type, instance_disk):
        self.api = PipelineAPI(os.environ['API'], 'logs')
        self.task = task
        self.pipeline = self.api.find_pipeline(pipeline)
        self.version = version
        self.instance_type = instance_type
        self.instance_disk = instance_disk

    def run(self):
        analysis_folder = os.environ['ANALYSIS_FOLDER']
        machine_run_folder = os.environ['MACHINE_RUN_FOLDER']
        sample_sheet = os.environ['SAMPLE_SHEET']
        Logger.info('Starting analytical processing for sample sheet %s' %
                    sample_sheet,
                    task_name=self.task)
        samples = SampleSheetParser(
            sample_sheet,
            [SAMPLE_ID, SAMPLE_NAME, SAMPLE_PROJECT]).parse_sample_sheet()
        launched_runs = {}
        for sample in samples:
            Logger.info('Starting "%s" sample processing.' %
                        sample[SAMPLE_NAME],
                        task_name=self.task)
            launched_runs[sample[SAMPLE_NAME]] = self.__run_sample(
                sample[SAMPLE_NAME], analysis_folder, machine_run_folder)
        failed_runs = self.__wait_runs_completion(launched_runs)
        if failed_runs:
            for sample, run_id in failed_runs.iteritems():
                Logger.fail(
                    'Processing failed for sample "%s". Check run %d logs for more information.'
                    % (sample, run_id),
                    task_name=self.task)
            sys.exit(1)
        Logger.success("All samples processed successfully.",
                       task_name=self.task)

    def __run_sample(self, sample, analysis_folder, machine_run_folder):
        Logger.info(
            'Launching analytical pipeline "%s" with version "%s" for sample %s.'
            % (self.pipeline['name'], self.version, sample),
            task_name=self.task)
        read1, read2 = self.__fetch_reads(sample, analysis_folder,
                                          machine_run_folder)
        pipeline_params = {
            'SAMPLE': {
                'value': sample
            },
            'READ1': {
                'value': read1,
                'type': 'input'
            },
            'READ2': {
                'value': read2,
                'type': 'input'
            },
            'OUTPUT_FOLDER': {
                'value': analysis_folder,
                'type': 'output'
            }
        }
        run = self.api.launch_pipeline(self.pipeline['id'],
                                       self.version,
                                       pipeline_params,
                                       instance=self.instance_type,
                                       disk=self.instance_disk,
                                       parent_run_id=os.environ['RUN_ID'])
        return run['id']

    def __fetch_reads(self, sample, analysis_folder, machine_run_folder):
        # urlparse().path keeps its leading '/', so concatenating the two
        # trimmed parts below still yields a valid '<analysis>/<run>' prefix
        run_folder_name = urlparse.urlparse(machine_run_folder).path
        read_folder = self.__get_path_without_trailing_slash(analysis_folder) + \
                      self.__get_path_without_trailing_slash(run_folder_name) + \
                      '/PipelineInputData/FASTQ/'
        return os.path.join(read_folder,
                            sample + '_R1.fastq.gz'), os.path.join(
                                read_folder, sample + '_R2.fastq.gz')

    def __get_path_without_trailing_slash(self, path):
        return path[:-1] if path.endswith('/') else path

    def __wait_runs_completion(self, launched_runs):
        finished = {}
        failed = {}
        while True:
            for sample, run_id in launched_runs.iteritems():
                # skip runs that have already reached a terminal status
                if sample in finished:
                    continue
                current_status = self.api.load_run(run_id)['status']
                Logger.info('Processing sample: %s. Run %d status is %s.' %
                            (sample, run_id, current_status),
                            task_name=self.task)
                if current_status != 'RUNNING':
                    finished[sample] = run_id
                    if current_status != 'SUCCESS':
                        failed[sample] = run_id
            if len(finished) == len(launched_runs):
                Logger.info("Processing for all samples completed.",
                            task_name=self.task)
                return failed
            time.sleep(60)
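
A minimal usage sketch for the class above (not part of the original snippet). It assumes the parent run has already exported API, RUN_ID, ANALYSIS_FOLDER, MACHINE_RUN_FOLDER and SAMPLE_SHEET; the pipeline name, version and instance settings are illustrative placeholders.

if __name__ == '__main__':
    # all argument values below are hypothetical placeholders
    RunAnalyticalPipelinesTask(task='RunAnalyticalPipelines',
                               pipeline='analytical-pipeline',
                               version='draft',
                               instance_type='m5.large',
                               instance_disk='100').run()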
Example #2
class DemultiplexTask(object):
    """Launches the demultiplex pipeline and blocks until it finishes."""
    # relies on the same imports as the first example above

    def __init__(self, task, pipeline, version, instance_type, instance_disk):
        self.task = task
        self.pipeline_name = pipeline
        self.version = version
        self.instance_type = instance_type
        self.instance_disk = instance_disk
        self.api = PipelineAPI(os.environ['API'], 'logs')

    def run(self):
        Logger.info('Launching demultiplex pipeline "%s" with version "%s"' %
                    (self.pipeline_name, self.version),
                    task_name=self.task)
        pipeline = self.api.find_pipeline(self.pipeline_name)
        pipeline_params = {
            'MACHINE_RUN_FOLDER': {
                'value': os.environ['MACHINE_RUN_FOLDER'],
                'type': 'input'
            },
            'SAMPLE_SHEET': {
                'value': os.environ['SAMPLE_SHEET_ORIGINAL'],
                'type': 'input'
            },
            'ANALYSIS_FOLDER': {
                'value': os.environ['ANALYSIS_FOLDER'],
                'type': 'output'
            }
        }
        run = self.api.launch_pipeline(pipeline['id'],
                                       self.version,
                                       pipeline_params,
                                       instance=self.instance_type,
                                       disk=self.instance_disk,
                                       parent_run_id=os.environ['RUN_ID'])
        demultiplex_run_id = run['id']
        Logger.info('Launched demultiplex run %d.' % demultiplex_run_id,
                    task_name=self.task)
        Logger.info('Waiting for run %d to complete.' % demultiplex_run_id,
                    task_name=self.task)
        final_status = self.__wait_run_completion(demultiplex_run_id)
        if final_status != 'SUCCESS':
            Logger.fail(
                'Demultiplex processing did not complete successfully. '
                'Check run %d logs for more information.' % demultiplex_run_id,
                task_name=self.task)
            sys.exit(1)
        Logger.success('Demultiplex processing completed successfully.',
                       task_name=self.task)

    def __wait_run_completion(self, run_id):
        current_status = self.api.load_run(run_id)['status']
        while current_status == 'RUNNING':
            Logger.info('Run %d status is %s. Waiting for completion...' %
                        (run_id, current_status),
                        task_name=self.task)
            time.sleep(60)
            current_status = self.api.load_run(run_id)['status']
        Logger.info('Run %d finished with status %s' %
                    (run_id, current_status),
                    task_name=self.task)
        return current_status
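
A similar hedged usage sketch for DemultiplexTask (not part of the original code). It assumes API, RUN_ID, MACHINE_RUN_FOLDER, SAMPLE_SHEET_ORIGINAL and ANALYSIS_FOLDER are already set in the environment; all argument values are placeholders.

if __name__ == '__main__':
    # all argument values below are hypothetical placeholders
    DemultiplexTask(task='Demultiplex',
                    pipeline='demultiplex-pipeline',
                    version='draft',
                    instance_type='m5.xlarge',
                    instance_disk='200').run()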
Example #3
class AbstractPipelineLauncher(AbstractTask):
    # uses os, time, Logger and PipelineAPI (see the first example above);
    # AbstractTask and LoggedCommand come from the surrounding launcher
    # scripts, and their import path is not shown in this snippet
    TASK_NAME = "LaunchSampleProcessing"
    # important! the cmd template must be single-quoted to prevent shell parameter expansion
    CMD_TEMPLATE = "pipe run --yes --quiet --pipeline {pipe_id}@{version} --instance-disk {instance_disk} " \
                   "--instance-type {instance_type} --docker-image {docker_image} --cmd-template '{cmd}' " \
                   "--parent-id {parent}"
    SAMPLE_TEMPLATE = " --sample_name {sample_name} --sample_id {sample_id}"
    POLL_TIMEOUT = 30
    RETRY_COUNT = 10
    SAMPLE_ID = "Sample_ID"
    SAMPLE_NAME = "Sample_Name"

    def __init__(self, run_files, param_names, run_id, pipe_id, version,
                 pipe_params, param_types):
        AbstractTask.__init__(self, self.TASK_NAME)
        self.samples_number = len(run_files)
        self.run_id = run_id
        self.run_files = run_files
        self.param_names = param_names
        self.pipe_id = pipe_id
        self.version = version
        self.api = PipelineAPI(os.environ['API'], 'logs')
        self.pipe_params = pipe_params
        self.child_id = None
        self.param_types = param_types

    def launch_pipeline(self,
                        params,
                        param_names,
                        instance_size,
                        instance_disk,
                        docker_image,
                        cmd,
                        sample=None):
        # no child run is active yet: start one on the parent node through the
        # API instead of spawning a separate run via the CLI
        if not self.child_run_active():
            self.launch_child_run(params,
                                  param_names,
                                  cmd,
                                  instance_size,
                                  instance_disk,
                                  docker_image,
                                  sample=sample)
            return

        command = self.CMD_TEMPLATE.format(pipe_id=self.pipe_id,
                                           version=self.version,
                                           instance_disk=instance_disk,
                                           instance_type=instance_size,
                                           docker_image=docker_image,
                                           cmd=cmd,
                                           parent=self.run_id)
        if sample:
            command = command + self.SAMPLE_TEMPLATE.format(
                sample_name=sample[self.SAMPLE_NAME],
                sample_id=sample[self.SAMPLE_ID])
        # add all pattern params
        index = 0
        for name in param_names:
            if sample:
                value = ','.join(params[name])
            else:
                value = params[index]
            command += " --{} input?{}".format(name, value)
            index = index + 1
        # add all other params
        for param, value in self.pipe_params.iteritems():
            if param.startswith('i_'):
                command += " --{} input?{}".format(
                    self.change_parameter_name(param), value)
            elif param.startswith('c_'):
                command += " --{} common?{}".format(
                    self.change_parameter_name(param), value)
            elif param.startswith('o_'):
                command += " --{} output?{}".format(
                    self.change_parameter_name(param), value)
            else:
                command += " --{} {}".format(param, value)

        Logger.info('Starting pipeline with command: "{}".'.format(command),
                    task_name=self.TASK_NAME)
        try:
            LoggedCommand(command, None, self.TASK_NAME).execute()
        except Exception as e:
            Logger.warn(
                "Failed to launch sample processing with command: '{}'. Error: '{}'."
                .format(command, e.message),
                task_name=self.TASK_NAME)

    def launch_child_run(self,
                         params,
                         param_names,
                         cmd,
                         instance_size,
                         instance_disk,
                         docker_image,
                         sample=None):
        run_params = {'parent-id': self.run_id}
        if sample:
            run_params['sample_name'] = sample[self.SAMPLE_NAME]
            run_params['sample_id'] = sample[self.SAMPLE_ID]
        index = 0
        # add all pattern params
        for name in param_names:
            if sample:
                value = ','.join(params[name])
            else:
                value = params[index]
            run_params[name] = {'value': value, 'type': 'input'}
            index = index + 1

        # add all other params
        for param, value in self.pipe_params.iteritems():
            param_type = None
            param_name = param
            real_value = self.normalize_value(value)
            if param.startswith('i_'):
                param_type = 'input'
                param_name = self.change_parameter_name(param)
            elif param.startswith('c_'):
                param_type = 'common'
                param_name = self.change_parameter_name(param)
            elif param.startswith('o_'):
                param_type = 'output'
                param_name = self.change_parameter_name(param)
            run_params[param_name] = {'value': real_value}
            if param_type is not None:
                run_params[param_name]['type'] = param_type
            else:
                run_params[param_name]['type'] = self.get_type_from_env(
                    param_name)
        Logger.info(
            "Starting child pipeline run on a parent node with parameters: '{}'."
            .format(str(run_params)),
            task_name=self.TASK_NAME)
        try:
            run = self.api.launch_pipeline(self.pipe_id,
                                           self.version,
                                           run_params,
                                           parent_node_id=self.run_id,
                                           cmd=cmd,
                                           instance=instance_size,
                                           disk=instance_disk,
                                           docker=docker_image)
            self.child_id = run['id']
        except Exception as e:
            Logger.warn(
                "Failed to launch sample processing with parameters: '{}'. Error: '{}'."
                .format(str(run_params), e.message),
                task_name=self.TASK_NAME)
            self.child_id = None

    # override point: subclasses may change how the prefixed (i_/c_/o_)
    # parameters are renamed for the batched pipeline
    @staticmethod
    def change_parameter_name(param):
        return param[2:]

    def get_running_samples(self):
        attempts = 0
        while attempts < self.RETRY_COUNT:
            try:
                child_runs = self.api.load_child_pipelines(self.run_id)
                count = 0
                for run in child_runs:
                    if run['status'] == 'RUNNING':
                        count = count + 1
                return count
            except Exception as e:
                Logger.warn("Failed to fetch running samples: {}.".format(
                    e.message),
                            task_name=self.TASK_NAME)
                attempts = attempts + 1
                time.sleep(self.POLL_TIMEOUT)
        Logger.fail("Exceeded maximum attempts to fetch running samples.")
        raise RuntimeError(
            "Exceeded maximum attempts to fetch running samples.")

    def child_run_active(self):
        if self.child_id is None:
            return False
        attempts = 0
        while attempts < self.RETRY_COUNT:
            try:
                run = self.api.load_run(self.child_id)
                return run['status'] == 'RUNNING'
            except Exception as e:
                Logger.warn(
                    "Failed to fetch child run ID '{}' status: {}.".format(
                        str(self.child_id), e.message),
                    task_name=self.TASK_NAME)
                attempts = attempts + 1
                time.sleep(self.POLL_TIMEOUT)
        Logger.fail("Exceeded maximum attempts to fetch child run status.")
        raise RuntimeError(
            "Exceeded maximum attempts to fetch child run status.")

    def wait_all_samples_finish(self):
        running = self.get_running_samples()
        while running != 0:
            time.sleep(self.POLL_TIMEOUT)
            running = self.get_running_samples()

    def get_type_from_env(self, param_name):
        # default to 'string' when no explicit type is recorded for the parameter
        if param_name not in self.param_types or not self.param_types[param_name]:
            return 'string'
        return self.param_types[param_name]

    # unescape environment-variable references that were escaped in the
    # source value ("\$" -> "$")
    def normalize_value(self, value):
        return value.replace("\\$", "$")
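
To show how the abstract class is meant to be driven, here is a minimal hypothetical subclass, assuming run_files holds one positional parameter list per sample; the class name and the launch() flow are assumptions, not part of the original code.

class SimplePipelineLauncher(AbstractPipelineLauncher):
    def launch(self, instance_size, instance_disk, docker_image, cmd):
        # launch one run per file set (positional params, no sample sheet),
        # then block until no child run reports RUNNING
        for file_params in self.run_files:
            self.launch_pipeline(file_params, self.param_names, instance_size,
                                 instance_disk, docker_image, cmd)
        self.wait_all_samples_finish()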