Example 1
class BatchRunner(object):
    def __init__(self, scratch_dir, workspace_url, callback_url, srv_wiz_url,
                 context):
        self.scratch_dir = scratch_dir
        self.workspace_url = workspace_url
        self.callback_url = callback_url
        self.srv_wiz_url = srv_wiz_url
        self.provenance = context.provenance()
        self.job_id = None
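        # rpc_context, when present, is expected to look like (illustrative,
        # not from the source): {'call_stack': [{'job_id': '...'}, ...]};
        # the first call-stack entry's job_id is used, else job_id stays None.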
        rpc_context = context.get('rpc_context')
        if rpc_context is not None and hasattr(rpc_context, 'get'):
            current_call_ctx = rpc_context.get('call_stack')
            if current_call_ctx:
                self.job_id = current_call_ctx[0].get('job_id')

        # from the provenance, extract out the version to run by exact hash if possible
        self.my_version = 'release'
        if len(self.provenance) > 0:
            if 'subactions' in self.provenance[0]:
                self.my_version = self.get_version_from_subactions(
                    'kb_BatchApp', self.provenance[0]['subactions'])
        print('Running kb_BatchApp version = ' + self.my_version)

        self.ws = Workspace(self.workspace_url)
        self.parallel_runner = KBParallel(self.callback_url, service_ver='dev')

    def get_version_from_subactions(self, module_name, subactions):
        # go through each subaction looking for this module
        if not subactions:
            return 'release'  # default to release if we can't find anything
        for sa in subactions:
            if 'name' in sa:
                if sa['name'] == module_name:
                    # local-docker-image implies that we are running in kb-test, so return 'dev'
                    if sa['commit'] == 'local-docker-image':
                        return 'dev'
                    # to check that it is a valid hash, make sure it is the right
                    # length and made up of valid hash characters
                    if re.match('[a-fA-F0-9]{40}$', sa['commit']):
                        return sa['commit']
        # again, default to setting this to release
        return 'release'
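
    # Illustrative subactions entry that would pin an exact version (the
    # commit hash below is a made-up example, not a real commit):
    #   [{'name': 'kb_BatchApp',
    #     'commit': 'a94a8fe5ccb19ba61c4c0873d391e987982fbbd3'}]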

    def run(self, params):
        self.validate_params(
            params
        )  # raises an exception if there's a failure. see that function for details.

        app_info = {
            'module_name': params['module_name'],
            'function_name': params['method_name'],
            'version': params['service_ver']
        }

        params_list = params.get('batch_params')
        print('Running on set of parameters =')
        pprint(params_list)

        tasks = []
        for input_params in params_list:
            tasks.append(
                self.build_single_execution_task(app_info, input_params))

        batch_run_params = {
            'tasks': tasks,
            'runner': 'parallel',
            'max_retries': 2
        }
        if self.job_id is not None:
            batch_run_params['parent_job_id'] = self.job_id

        # TODO check if this should be given in input
        batch_run_params['concurrent_local_tasks'] = 0
        batch_run_params['concurrent_njsw_tasks'] = 5

        print(
            "========================  BATCH_RUN_PARAMS  ====================")
        pprint(batch_run_params)
        print(
            "================================================================")

        batch_results = self.parallel_runner.run_batch(batch_run_params)
        print('Batch run results=')
        pprint(batch_results)

        results = {'batch_results': dict()}
        for result in batch_results['results']:
            job_id = result['result_package']['run_context']['job_id']
            results['batch_results'][job_id] = result
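        # results['batch_results'] maps each subjob's job_id to its full
        # KBParallel result entry, e.g. {'<job_id>': {'is_error': 0, ...}}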

        results['report_name'], results['report_ref'] = build_report(
            self.callback_url, self.scratch_dir, results['batch_results'],
            len(params_list), params['wsid'])

        return results

    def build_single_execution_task(self, app_info, params):
        task_params = copy.deepcopy(params.get('params')[0])

        retVal = {'parameters': task_params}
        retVal.update(app_info)
        return retVal

    def clean(self, run_output_info):
        """
        Not really necessary on a single run, but if we are running multiple local subjobs, we
        should clean up files that have already been saved back up to KBase.
        """
        pass

    def validate_params(self, params):
        """
        Things to validate.
        params.module_name and params.method_name are real (maybe just let that go and assume they're ok)
        params.wsid is a real workspace id and the current user has write-access.
        params.batch_params is a list with len > 0
        """
        if params.get("batch_params", None) is None or (
                isinstance(params["batch_params"], list)
                and len(params["batch_params"]) == 0):
            raise ValueError("batch_params must be a list with a length >= 1")
        if params.get("module_name") is None:
            raise ValueError(
                "module_name must be an existing KBase app module!")
        elif "." in params["module_name"] or "/" in params["module_name"]:
            raise ValueError(
                "module_name should just be the name of the module, NOT the full module.method"
            )
        if params.get("method_name") is None:
            raise ValueError(
                "method_name must be an existing KBase app method!")
        elif "." in params["method_name"] or "/" in params["method_name"]:
            raise ValueError(
                "method_name should just be the name of the method, NOT the full module.method"
            )
        if params.get("service_ver") is None or not isinstance(
                params["service_ver"], basestring):
            raise ValueError("service_ver must be a valid string!")
        if params.get("wsid") is None:
            raise ValueError(
                "A workspace id must be provided to associate each subjob!")
        return params
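
# Usage sketch (illustrative only: the context object and parameter values
# are hypothetical placeholders, and a live KBase callback server is
# required for run() to actually execute):
#
#   runner = BatchRunner(scratch_dir, workspace_url, callback_url,
#                        srv_wiz_url, ctx)
#   results = runner.run({
#       'module_name': 'SomeModule',        # hypothetical app module
#       'method_name': 'run_some_method',   # hypothetical app method
#       'service_ver': 'release',
#       'wsid': 12345,
#       'batch_params': [{'params': [{'an_input': 'a_value'}]}]
#   })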
Example 2
    def run_batch(self, reads_refs, params):
        """
        Runs HISAT2 in batch mode.
        reads_refs should be a list of dicts, where each looks like the following:
        {
            "ref": reads object reference,
            "condition": condition for that ref (string)
        }
        """
        # build task list and send it to KBParallel
        tasks = list()
        set_name = get_object_names(
            [params["sampleset_ref"]],
            self.workspace_url)[params["sampleset_ref"]]
        for idx, reads_ref in enumerate(reads_refs):
            single_param = dict(params)  # need a copy of the params
            single_param["build_report"] = 0
            single_param["sampleset_ref"] = reads_ref["ref"]
            if "condition" in reads_ref:
                single_param["condition"] = reads_ref["condition"]
            else:
                single_param["condition"] = "unspecified"

            tasks.append({
                "module_name": "kb_hisat2",
                "function_name": "run_hisat2",
                "version": self.my_version,
                "parameters": single_param
            })
        # uncomment the concurrent_*_tasks lines below for local testing
        batch_run_params = {
            "tasks": tasks,
            "runner": "parallel",
            # "concurrent_local_tasks": 3,
            # "concurrent_njsw_tasks": 0,
            "max_retries": 2
        }
        parallel_runner = KBParallel(self.callback_url)
        results = parallel_runner.run_batch(batch_run_params)["results"]
        alignment_items = list()
        alignments = dict()
        for idx, result in enumerate(results):
            # idx of the result is the same as the idx of the inputs AND reads_refs
            if result["is_error"] != 0:
                raise RuntimeError(
                    "Failed a parallel run of HISAT2! {}".format(
                        result["result_package"]["error"]))
            reads_ref = tasks[idx]["parameters"]["sampleset_ref"]
            alignment_obj = result["result_package"]["result"][0][
                "alignment_objs"][reads_ref]
            alignment_items.append({
                "ref": alignment_obj["ref"],
                "label": reads_refs[idx].get(
                    "condition", params.get("condition", "unspecified"))
            })
            alignments[reads_ref] = alignment_obj
        # build the final alignment set
        output_ref = self.upload_alignment_set(
            alignment_items, set_name + params["alignmentset_suffix"],
            params["ws_name"])
        return (alignments, output_ref)
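
    # Illustrative call (refs and conditions are hypothetical placeholders):
    #   reads_refs = [{"ref": "123/4/5", "condition": "control"},
    #                 {"ref": "123/6/7", "condition": "treated"}]
    #   alignments, output_ref = self.run_batch(reads_refs, params)
    # where params must also carry "sampleset_ref", "ws_name" and
    # "alignmentset_suffix" in addition to the per-run HISAT2 options.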
Example 3
class Bowtie2Aligner(object):
    def __init__(self, scratch_dir, workspace_url, callback_url, srv_wiz_url,
                 provenance):
        self.scratch_dir = scratch_dir
        self.workspace_url = workspace_url
        self.callback_url = callback_url
        self.srv_wiz_url = srv_wiz_url
        self.provenance = provenance

        # from the provenance, extract out the version to run by exact hash if possible
        self.my_version = 'release'
        if len(provenance) > 0:
            if 'subactions' in provenance[0]:
                self.my_version = self.get_version_from_subactions(
                    'kb_Bowtie2', provenance[0]['subactions'])
        print('Running kb_Bowtie2 version = ' + self.my_version)

        self.ws = Workspace(self.workspace_url)
        self.bowtie2 = Bowtie2Runner(self.scratch_dir)
        self.parallel_runner = KBParallel(self.callback_url)
        self.qualimap = kb_QualiMap(self.callback_url)

    def get_version_from_subactions(self, module_name, subactions):
        # go through each subaction looking for this module
        if not subactions:
            return 'release'  # default to release if we can't find anything
        for sa in subactions:
            if 'name' in sa:
                if sa['name'] == module_name:
                    # local-docker-image implies that we are running in kb-test, so return 'dev'
                    if sa['commit'] == 'local-docker-image':
                        return 'dev'
                    # to check that it is a valid hash, make sure it is the right
                    # length and made up of valid hash characters
                    if re.match('[a-fA-F0-9]{40}$', sa['commit']):
                        return sa['commit']
        # again, default to setting this to release
        return 'release'

    def align(self, params):
        validated_params = self.validate_params(params)
        input_info = self.determine_input_info(validated_params)
        # input info provides information on the input and tells us if we should
        # run as a single_library or as a set:
        #     input_info = {'run_mode': '', 'info': [..], 'ref': '55/1/2'}

        assembly_or_genome_ref = validated_params['assembly_or_genome_ref']

        if input_info['run_mode'] == 'single_library':
            if 'output_alignment_name' not in validated_params:
                suffix = validated_params.get('output_alignment_suffix',
                                              '_alignment')
                validated_params['output_alignment_name'] = \
                    input_info['info'][1] + suffix
            single_lib_result = self.single_reads_lib_run(
                input_info,
                assembly_or_genome_ref,
                validated_params,
                create_report=validated_params['create_report'])

            return single_lib_result

        if input_info['run_mode'] == 'sample_set':
            reads = self.fetch_reads_refs_from_sampleset(
                input_info['ref'], input_info['info'], validated_params)
            self.build_bowtie2_index(assembly_or_genome_ref,
                                     validated_params['output_workspace'])

            print('Running on set of reads=')
            pprint(reads)

            tasks = []
            for r in reads:
                tasks.append(
                    self.build_single_execution_task(
                        r['ref'], params, r['alignment_output_name'],
                        r['condition']))

            batch_run_params = {
                'tasks': tasks,
                'runner': 'parallel',
                'max_retries': 2
            }
            if validated_params['concurrent_local_tasks'] is not None:
                batch_run_params['concurrent_local_tasks'] = validated_params[
                    'concurrent_local_tasks']
            if validated_params['concurrent_njsw_tasks'] is not None:
                batch_run_params['concurrent_njsw_tasks'] = validated_params[
                    'concurrent_njsw_tasks']
            results = self.parallel_runner.run_batch(batch_run_params)
            print('Batch run results=')
            pprint(results)
            batch_result = self.process_batch_result(results, validated_params,
                                                     reads, input_info['info'])
            return batch_result

        raise ValueError('Improper run mode')

    def build_single_execution_task(self, reads_lib_ref, params, output_name,
                                    condition):
        task_params = copy.deepcopy(params)

        task_params['input_ref'] = reads_lib_ref
        task_params['output_alignment_name'] = output_name
        task_params['create_report'] = 0
        task_params['condition_label'] = condition

        return {
            'module_name': 'kb_Bowtie2',
            'function_name': 'align_reads_to_assembly_app',
            'version': self.my_version,
            'parameters': task_params
        }

    def single_reads_lib_run(self,
                             read_lib_info,
                             assembly_or_genome_ref,
                             validated_params,
                             create_report=False,
                             bowtie2_index_info=None):
        ''' run on a single reads library '''

        # download reads and prepare any bowtie2 index files
        input_configuration = self.prepare_single_run(
            read_lib_info, assembly_or_genome_ref, bowtie2_index_info,
            validated_params['output_workspace'])

        # run the actual program
        run_output_info = self.run_bowtie2_align_cli(input_configuration,
                                                     validated_params)

        # process the result and save the output
        upload_results = self.save_read_alignment_output(
            run_output_info, input_configuration, validated_params)
        run_output_info['upload_results'] = upload_results

        report_info = None
        if create_report:
            report_info = self.create_report_for_single_run(
                run_output_info, input_configuration, validated_params)

        self.clean(run_output_info)

        return {'output_info': run_output_info, 'report_info': report_info}

    def build_bowtie2_index(self, assembly_or_genome_ref, ws_for_cache):
        bowtie2IndexBuilder = Bowtie2IndexBuilder(self.scratch_dir,
                                                  self.workspace_url,
                                                  self.callback_url,
                                                  self.srv_wiz_url,
                                                  self.provenance)

        return bowtie2IndexBuilder.get_index({
            'ref': assembly_or_genome_ref,
            'ws_for_cache': ws_for_cache
        })

    def prepare_single_run(self, input_info, assembly_or_genome_ref,
                           bowtie2_index_info, ws_for_cache):
        ''' Given a reads ref and an assembly, setup the bowtie2 index '''
        # first setup the bowtie2 index of the assembly
        input_configuration = {'bowtie2_index_info': bowtie2_index_info}
        if not bowtie2_index_info:
            bowtie2IndexBuilder = Bowtie2IndexBuilder(self.scratch_dir,
                                                      self.workspace_url,
                                                      self.callback_url,
                                                      self.srv_wiz_url,
                                                      self.provenance)

            index_result = bowtie2IndexBuilder.get_index({
                'ref': assembly_or_genome_ref,
                'ws_for_cache': ws_for_cache
            })
            input_configuration['bowtie2_index_info'] = index_result

        # next download the reads
        read_lib_ref = input_info['ref']
        read_lib_info = input_info['info']
        reads_params = {
            'read_libraries': [read_lib_ref],
            'interleaved': 'false',
            'gzipped': None
        }
        ru = ReadsUtils(self.callback_url)
        reads = ru.download_reads(reads_params)['files']

        input_configuration['reads_lib_type'] = self.get_type_from_obj_info(
            read_lib_info).split('.')[1]
        input_configuration['reads_files'] = reads[read_lib_ref]
        input_configuration['reads_lib_ref'] = read_lib_ref

        return input_configuration
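
    # Illustrative input_configuration produced above (paths and the ref are
    # placeholders):
    #   {'bowtie2_index_info': {'output_dir': '/scratch/bt2_idx',
    #                           'index_files_basename': 'assembly'},
    #    'reads_lib_type': 'PairedEndLibrary',
    #    'reads_files': {'files': {'fwd': '/scratch/reads_fwd.fq',
    #                              'rev': '/scratch/reads_rev.fq'}},
    #    'reads_lib_ref': '11/22/33'}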

    def run_bowtie2_align_cli(self, input_configuration, validated_params):
        # pprint('======== input_configuration =====')
        # pprint(input_configuration)
        options = []
        run_output_info = {}

        # set the bowtie2 index location
        bt2_index_dir = input_configuration['bowtie2_index_info']['output_dir']
        bt2_index_basename = input_configuration['bowtie2_index_info'][
            'index_files_basename']
        options.extend(['-x', bt2_index_basename])

        # set the input reads
        if input_configuration['reads_lib_type'] == 'SingleEndLibrary':
            options.extend(
                ['-U', input_configuration['reads_files']['files']['fwd']])
            run_output_info['library_type'] = 'single_end'
        elif input_configuration['reads_lib_type'] == 'PairedEndLibrary':
            options.extend(
                ['-1', input_configuration['reads_files']['files']['fwd']])
            options.extend(
                ['-2', input_configuration['reads_files']['files']['rev']])
            run_output_info['library_type'] = 'paired_end'

        # setup the output file name
        output_dir = os.path.join(
            self.scratch_dir,
            'bowtie2_alignment_output_' + str(int(time.time() * 10000)))
        output_sam_file = os.path.join(output_dir, 'reads_alignment.sam')
        os.makedirs(output_dir)
        options.extend(['-S', output_sam_file])
        run_output_info['output_sam_file'] = output_sam_file
        run_output_info['output_dir'] = output_dir

        # parse all the other parameters
        if 'quality_score' in validated_params:
            options.append('--' + str(validated_params['quality_score']))

        if 'alignment_type' in validated_params:
            options.append('--' + str(validated_params['alignment_type']))

        if 'preset_options' in validated_params:
            if validated_params.get('alignment_type') == 'local':
                options.append(
                    '--' + str(validated_params['preset_options']) + '-local')
            else:
                options.append('--' + str(validated_params['preset_options']))

        if 'trim5' in validated_params:
            options.extend(['--trim5', str(validated_params['trim5'])])
        if 'trim3' in validated_params:
            options.extend(['--trim3', str(validated_params['trim3'])])
        if 'np' in validated_params:
            options.extend(['--np', str(validated_params['np'])])

        if 'minins' in validated_params:
            options.extend(['--minins', str(validated_params['minins'])])
        if 'maxins' in validated_params:
            options.extend(['--maxins', str(validated_params['maxins'])])

        # unfortunately, bowtie2 expects the index files to be in the current directory, and
        # you cannot configure it otherwise.  So run bowtie out of the index directory, but
        # place the output SAM file somewhere else
        self.bowtie2.run('bowtie2', options, cwd=bt2_index_dir)

        return run_output_info
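
    # For a paired-end library the assembled call resembles (illustrative,
    # with placeholder paths; the optional flags depend on validated_params):
    #   bowtie2 -x <index_basename> -1 <fwd.fastq> -2 <rev.fastq>
    #           -S <output_dir>/reads_alignment.sam --phred33 --local
    # executed with cwd set to bt2_index_dir so bowtie2 finds the index files.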

    def save_read_alignment_output(self, run_output_info, input_configuration,
                                   validated_params):
        rau = ReadsAlignmentUtils(self.callback_url)
        destination_ref = (validated_params['output_workspace'] + '/' +
                           validated_params['output_alignment_name'])
        condition = 'unknown'
        if 'condition_label' in validated_params:
            condition = validated_params['condition_label']
        upload_params = {
            'file_path': run_output_info['output_sam_file'],
            'destination_ref': destination_ref,
            'read_library_ref': input_configuration['reads_lib_ref'],
            'assembly_or_genome_ref': validated_params['assembly_or_genome_ref'],
            'condition': condition
        }
        upload_results = rau.upload_alignment(upload_params)
        return upload_results

    def clean(self, run_output_info):
        ''' Not really necessary on a single run, but if we are running multiple local subjobs, we
        should clean up files that have already been saved back up to kbase '''
        pass

    def create_report_for_single_run(self, run_output_info,
                                     input_configuration, validated_params):
        # first run qualimap
        qualimap_report = self.qualimap.run_bamqc(
            {'input_ref': run_output_info['upload_results']['obj_ref']})
        qc_result_zip_info = qualimap_report['qc_result_zip_info']

        # create report
        report_text = 'Ran on a single reads library.\n\n'
        alignment_info = self.get_obj_info(
            run_output_info['upload_results']['obj_ref'])
        report_text += 'Created ReadsAlignment: ' + str(alignment_info[1]) + '\n'
        report_text += '                        ' + \
            run_output_info['upload_results']['obj_ref'] + '\n'
        kbr = KBaseReport(self.callback_url)
        report_info = kbr.create_extended_report({
            'message': report_text,
            'objects_created': [{
                'ref': run_output_info['upload_results']['obj_ref'],
                'description': 'ReadsAlignment'
            }],
            'report_object_name': 'kb_Bowtie2_' + str(uuid.uuid4()),
            'direct_html_link_index': 0,
            'html_links': [{
                'shock_id': qc_result_zip_info['shock_id'],
                'name': qc_result_zip_info['index_html_file_name'],
                'label': qc_result_zip_info['name']
            }],
            'workspace_name': validated_params['output_workspace']
        })
        return {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }

    def process_batch_result(self, batch_result, validated_params, reads,
                             input_set_info):

        n_jobs = len(batch_result['results'])
        n_success = 0
        n_error = 0
        ran_locally = 0
        ran_njsw = 0

        # reads alignment set items
        items = []
        objects_created = []

        for k in range(0, len(batch_result['results'])):
            job = batch_result['results'][k]
            result_package = job['result_package']
            if job['is_error']:
                n_error += 1
            else:
                n_success += 1
                output_info = result_package['result'][0]['output_info']
                ra_ref = output_info['upload_results']['obj_ref']
                # Note: could add a label to the alignment here?
                items.append({'ref': ra_ref, 'label': reads[k]['condition']})
                objects_created.append({'ref': ra_ref})

            if result_package['run_context']['location'] == 'local':
                ran_locally += 1
            if result_package['run_context']['location'] == 'njsw':
                ran_njsw += 1

        # Save the alignment set
        alignment_set_data = {'description': '', 'items': items}
        alignment_set_save_params = {
            'data': alignment_set_data,
            'workspace': validated_params['output_workspace'],
            'output_object_name': (str(input_set_info[1]) +
                                   validated_params['output_obj_name_suffix'])
        }

        set_api = SetAPI(self.srv_wiz_url)
        save_result = set_api.save_reads_alignment_set_v1(
            alignment_set_save_params)
        print('Saved ReadsAlignment=')
        pprint(save_result)
        objects_created.append({
            'ref': save_result['set_ref'],
            'description': 'Set of all reads alignments generated'
        })
        set_name = save_result['set_info'][1]

        # run qualimap
        qualimap_report = self.qualimap.run_bamqc(
            {'input_ref': save_result['set_ref']})
        qc_result_zip_info = qualimap_report['qc_result_zip_info']

        # create the report
        report_text = 'Ran on SampleSet or ReadsSet.\n\n'
        report_text += 'Created ReadsAlignmentSet: ' + str(set_name) + '\n\n'
        report_text += 'Total ReadsLibraries = ' + str(n_jobs) + '\n'
        report_text += '        Successful runs = ' + str(n_success) + '\n'
        report_text += '            Failed runs = ' + str(n_error) + '\n'
        report_text += '       Ran on main node = ' + str(ran_locally) + '\n'
        report_text += '   Ran on remote worker = ' + str(ran_njsw) + '\n\n'

        print('Report text=')
        print(report_text)

        kbr = KBaseReport(self.callback_url)
        report_info = kbr.create_extended_report({
            'message': report_text,
            'objects_created': objects_created,
            'report_object_name': 'kb_Bowtie2_' + str(uuid.uuid4()),
            'direct_html_link_index': 0,
            'html_links': [{
                'shock_id': qc_result_zip_info['shock_id'],
                'name': qc_result_zip_info['index_html_file_name'],
                'label': qc_result_zip_info['name']
            }],
            'workspace_name': validated_params['output_workspace']
        })

        result = {
            'report_info': {
                'report_name': report_info['name'],
                'report_ref': report_info['ref']
            },
            'batch_output_info': batch_result
        }

        return result

    def validate_params(self, params):
        validated_params = {}

        required_string_fields = [
            'input_ref', 'assembly_or_genome_ref', 'output_obj_name_suffix',
            'output_workspace'
        ]
        for field in required_string_fields:
            if field in params and params[field]:
                validated_params[field] = params[field]
            else:
                raise ValueError('"' + field +
                                 '" field required to run bowtie2 aligner app')

        optional_fields = [
            'quality_score', 'alignment_type', 'preset_options', 'trim5',
            'trim3', 'condition_label', 'np', 'minins', 'maxins',
            'output_alignment_suffix', 'output_alignment_name'
        ]
        for field in optional_fields:
            if field in params and params[field] is not None:
                validated_params[field] = params[field]

        validated_params['create_report'] = True
        if 'create_report' in params and params['create_report'] is not None:
            if int(params['create_report']) == 1:
                validated_params['create_report'] = True
            elif int(params['create_report']) == 0:
                validated_params['create_report'] = False
            else:
                raise ValueError(
                    '"create_report" field, if present, should be set to a boolean value: 0 or 1'
                )

        validated_params['concurrent_local_tasks'] = None
        validated_params['concurrent_njsw_tasks'] = None

        if params.get('concurrent_local_tasks') is not None:
            validated_params['concurrent_local_tasks'] = int(
                params['concurrent_local_tasks'])
        if params.get('concurrent_njsw_tasks') is not None:
            validated_params['concurrent_njsw_tasks'] = int(
                params['concurrent_njsw_tasks'])

        return validated_params

    def fetch_reads_refs_from_sampleset(self, ref, info, validated_params):
        """
        Note: adapted from kbaseapps/kb_hisat2 - file_util.py

        From the given object ref, return a list of all reads objects that are a part of that
        object. E.g., if ref is a ReadsSet, return a list of all PairedEndLibrary or SingleEndLibrary
        refs that are a member of that ReadsSet. This is returned as a list of dictionaries as follows:
        {
            "ref": reads object reference,
            "condition": condition string associated with that reads object
        }
        The only required key is "ref"; all other keys may or may not be present, depending on
        the reads object or the object type of the initial ref. E.g., an RNASeqSampleSet might
        have condition info for each reads object, but a single PairedEndLibrary may not.
        Refs that are not sets raise a ValueError here; single reads libraries are routed to
        single_library mode upstream by determine_input_info.
        """
        obj_type = self.get_type_from_obj_info(info)
        refs = list()
        refs_for_ws_info = list()
        if "KBaseSets.ReadsSet" in obj_type or "KBaseRNASeq.RNASeqSampleSet" in obj_type:
            print("Looking up reads references in ReadsSet object")
            set_api = SetAPI(self.srv_wiz_url)
            reads_set = set_api.get_reads_set_v1({
                'ref': ref,
                'include_item_info': 0,
                'include_set_item_ref_paths': 1
            })

            for reads in reads_set["data"]["items"]:
                refs.append({
                    'ref': reads['ref_path'],
                    'condition': reads['label']
                })
                refs_for_ws_info.append({'ref': reads['ref_path']})
        else:
            raise ValueError("Unable to fetch reads reference from object {} "
                             "which is a {}".format(ref, obj_type))

        # get object info so we can name things properly
        infos = self.ws.get_object_info3({'objects': refs_for_ws_info})['infos']

        name_ext = '_alignment'
        if 'output_alignment_suffix' in validated_params \
                and validated_params['output_alignment_suffix'] is not None:
            ext = validated_params['output_alignment_suffix'].replace(' ', '')
            if ext:
                name_ext = ext

        unique_name_lookup = {}
        for k in range(0, len(refs)):
            refs[k]['info'] = infos[k]
            name = infos[k][1]
            if name not in unique_name_lookup:
                unique_name_lookup[name] = 1
            else:
                unique_name_lookup[name] += 1
                name = name + '_' + str(unique_name_lookup[name])
            name = name + name_ext
            refs[k]['alignment_output_name'] = name

        return refs

    def determine_input_info(self, validated_params):
        ''' get info on the input_ref object and determine if we run once or run on a set '''
        info = self.get_obj_info(validated_params['input_ref'])
        obj_type = self.get_type_from_obj_info(info)
        if obj_type in [
                'KBaseAssembly.PairedEndLibrary',
                'KBaseAssembly.SingleEndLibrary', 'KBaseFile.PairedEndLibrary',
                'KBaseFile.SingleEndLibrary'
        ]:
            return {
                'run_mode': 'single_library',
                'info': info,
                'ref': validated_params['input_ref']
            }
        if obj_type == 'KBaseRNASeq.RNASeqSampleSet':
            return {
                'run_mode': 'sample_set',
                'info': info,
                'ref': validated_params['input_ref']
            }
        if obj_type == 'KBaseSets.ReadsSet':
            return {
                'run_mode': 'sample_set',
                'info': info,
                'ref': validated_params['input_ref']
            }

        raise ValueError('Object type of input_ref is not valid, was: ' +
                         str(obj_type))

    def get_type_from_obj_info(self, info):
        return info[2].split('-')[0]

    def get_obj_info(self, ref):
        return self.ws.get_object_info3({'objects': [{'ref': ref}]})['infos'][0]
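
# Usage sketch (illustrative only: refs, names, URLs and provenance are
# hypothetical placeholders, and live KBase services are required to run):
#
#   aligner = Bowtie2Aligner(scratch_dir, workspace_url, callback_url,
#                            srv_wiz_url, provenance)
#   result = aligner.align({
#       'input_ref': '11/22/33',              # reads library or reads set
#       'assembly_or_genome_ref': '11/44/1',
#       'output_obj_name_suffix': '_alignment_set',
#       'output_workspace': 'my_workspace',
#       'create_report': 1
#   })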
Example 4
class BatchRunner(object):
    def __init__(self, scratch_dir, workspace_url, callback_url, srv_wiz_url,
                 provenance):
        self.scratch_dir = scratch_dir
        self.workspace_url = workspace_url
        self.callback_url = callback_url
        self.srv_wiz_url = srv_wiz_url
        self.provenance = provenance

        # from the provenance, extract out the version to run by exact hash if possible
        self.my_version = 'release'
        if len(provenance) > 0:
            if 'subactions' in provenance[0]:
                self.my_version = self.get_version_from_subactions(
                    'kb_BatchApp', provenance[0]['subactions'])
        print('Running kb_BatchApp version = ' + self.my_version)

        self.ws = Workspace(self.workspace_url)
        self.parallel_runner = KBParallel(self.callback_url)

    def get_version_from_subactions(self, module_name, subactions):
        # go through each subaction looking for this module
        if not subactions:
            return 'release'  # default to release if we can't find anything
        for sa in subactions:
            if 'name' in sa:
                if sa['name'] == module_name:
                    # local-docker-image implies that we are running in kb-test, so return 'dev'
                    if sa['commit'] == 'local-docker-image':
                        return 'dev'
                    # to check that it is a valid hash, make sure it is the right
                    # length and made up of valid hash characters
                    if re.match('[a-fA-F0-9]{40}$', sa['commit']):
                        return sa['commit']
        # again, default to setting this to release
        return 'release'

    def run(self, params):
        # validation is currently a pass-through (see validate_params below)
        validated_params = self.validate_params(params)
        num_params = len(validated_params.get('batch_params', []))

        app_info = {
            'module_name': validated_params.get('app_id'),
            'function_name': validated_params.get('method'),
            'version': validated_params.get('service_ver')
        }

        if num_params >= 1:
            params_list = validated_params.get('batch_params')
            print('Running on set of parameters =')
            pprint(params_list)

            tasks = []
            for input_params in params_list:
                tasks.append(
                    self.build_single_execution_task(app_info, input_params))

            batch_run_params = {
                'tasks': tasks,
                'runner': 'parallel',
                'max_retries': 2
            }

            # TODO check if this should be given in input
            batch_run_params['concurrent_local_tasks'] = 1
            batch_run_params['concurrent_njsw_tasks'] = 0

            print(
                "========================  BATCH_RUN_PARAMS  ===================="
            )
            pprint(batch_run_params)
            print(
                "================================================================"
            )

            results = self.parallel_runner.run_batch(batch_run_params)
            print('Batch run results=')
            pprint(results)

            return results

        raise ValueError('Improper number of method parameters')

    def build_single_execution_task(self, app_info, params):
        task_params = copy.deepcopy(params.get('params')[0])

        retVal = {'parameters': task_params}
        retVal.update(app_info)
        return retVal

    def clean(self, run_output_info):
        ''' Not really necessary on a single run, but if we are running multiple local subjobs, we
        should clean up files that have already been saved back up to kbase '''
        pass

    def validate_params(self, params):
        # TODO Add validation if needed
        return params
Example 5
class STAR_Aligner(object):
    def __init__(self, config, provenance):
        self.config = config
        self.workspace_url = config['workspace-url']
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.srv_wiz_url = config['srv-wiz-url']
        self.parallel_runner = KBParallel(self.callback_url)
        self.provenance = provenance
        self.star_utils = STARUtils(self.scratch, self.workspace_url,
                                    self.callback_url, self.srv_wiz_url,
                                    provenance)
        self.set_api_client = SetAPI(self.srv_wiz_url, service_ver='dev')
        self.qualimap = kb_QualiMap(self.callback_url, service_ver='dev')
        self.star_idx_dir = None
        self.star_out_dir = None

        # from the provenance, extract out the version to run by exact hash if possible
        self.my_version = 'release'
        if len(provenance) > 0:
            if 'subactions' in provenance[0]:
                self.my_version = self.get_version_from_subactions(
                    'kb_STAR', provenance[0]['subactions'])
        print('Running kb_STAR version = ' + self.my_version)

    def run_align(self, params):
        # 0. create the star folders
        if self.star_idx_dir is None:
            (idx_dir, out_dir) = self.star_utils.create_star_dirs(self.scratch)
            self.star_idx_dir = idx_dir
            self.star_out_dir = out_dir

        # 1. validate & process the input parameters
        validated_params = self.star_utils.process_params(params)
        input_obj_info = self.star_utils.determine_input_info(validated_params)

        # 2. convert the input parameters (from refs to file paths, especially)
        input_params = self.star_utils.convert_params(validated_params)

        returnVal = {"report_ref": None, "report_name": None}
        if input_obj_info['run_mode'] == 'single_library':
            returnVal = self.star_run_single(input_params)

        if input_obj_info['run_mode'] == 'sample_set':
            #returnVal = self.star_run_batch_parallel(input_params)
            returnVal = self.star_run_batch_sequential(input_params)

        return returnVal

    def star_run_single(self, input_params):
        """
        Performs a single run of STAR against a single reads reference. The rest of the info
        is taken from the params dict - see the spec for details.
        """
        log('--->\nrunning STAR_Aligner.star_run_single\n' +
            'params:\n{}'.format(json.dumps(input_params, indent=1)))

        # 0. get index
        self.get_index(input_params)

        # 1. Prepare for mapping
        rds = None
        reads_refs = input_params[STARUtils.SET_READS]
        for r in reads_refs:
            if r['ref'] == input_params[STARUtils.PARAM_IN_READS]:
                rds = r
                break

        reads_info = self.star_utils._get_reads_info(
            rds, input_params[STARUtils.PARAM_IN_READS])

        rds_name = rds['alignment_output_name'].replace(
            input_params['alignment_suffix'], '')

        alignment_objs = list()
        alignment_ref = None
        singlerun_output_info = {}
        report_info = {'name': None, 'ref': None}
        ret_val = None

        rds_files = list()
        ret_fwd = reads_info["file_fwd"]
        if ret_fwd is not None:
            rds_files.append(ret_fwd)
            if reads_info.get('file_rev', None) is not None:
                rds_files.append(reads_info['file_rev'])

        input_params[STARUtils.PARAM_IN_OUTFILE_PREFIX] = rds_name + '_'
        # 2. After all is set, do the alignment and upload the output.
        star_mp_ret = self.run_star_mapping(input_params, rds_files, rds_name)

        if star_mp_ret.get('star_output', None) is not None:
            bam_sort = ''
            if input_params.get('outSAMtype', None) == 'BAM':
                bam_sort = 'sortedByCoord'
            output_bam_file = '{}_Aligned.{}.out.bam'.format(
                rds_name, bam_sort)
            output_bam_file = os.path.join(star_mp_ret['star_output'],
                                           output_bam_file)

            # Upload the alignment
            upload_results = self.star_utils.upload_STARalignment(
                input_params, rds, reads_info, output_bam_file)
            alignment_ref = upload_results['obj_ref']
            alignment_obj = {
                'ref': alignment_ref,
                'name': rds['alignment_output_name']
            }
            alignment_objs.append({
                'reads_ref': rds['ref'],
                'AlignmentObj': alignment_obj
            })

            singlerun_output_info['index_dir'] = self.star_idx_dir
            singlerun_output_info['output_dir'] = star_mp_ret['star_output']
            singlerun_output_info['output_bam_file'] = output_bam_file
            singlerun_output_info['upload_results'] = upload_results

            if input_params.get("create_report", 0) == 1:
                report_info = self.star_utils.generate_report_for_single_run(
                    singlerun_output_info, input_params)

            ret_val = {
                'alignmentset_ref': None,
                'output_directory': singlerun_output_info['output_dir'],
                'output_info': singlerun_output_info,
                'alignment_objs': alignment_objs,
                'report_name': report_info['name'],
                'report_ref': report_info['ref']
            }
        else:
            ret_val = {
                'alignmentset_ref': None,
                'output_directory': None,
                'output_info': None,
                'alignment_objs': None,
                'report_name': None,
                'report_ref': None
            }

        if ret_fwd is not None:
            os.remove(ret_fwd)
            if reads_info.get('file_rev', None) is not None:
                os.remove(reads_info["file_rev"])

        return ret_val

    def star_run_batch_sequential(self, input_params):
        """
        star_run_batch_sequential: run the STAR alignment sequentially by looping over the reads
        """
        log('--->\nrunning STAR_Aligner.star_run_batch_sequential\n' +
            'params:\n{}'.format(json.dumps(input_params, indent=1)))

        self.get_index(input_params)

        reads_refs = input_params[STARUtils.SET_READS]

        single_input_params = copy.deepcopy(input_params)

        # 1. Run the mapping one by one
        alignment_items = []
        alignment_objs = []
        rds_names = []
        for r in reads_refs:
            single_input_params[STARUtils.PARAM_IN_READS] = r['ref']
            single_input_params['create_report'] = 0
            single_ret = self.star_run_single(single_input_params)

            item = single_ret['alignment_objs'][0]
            a_obj = item['AlignmentObj']
            r_ref = item['reads_ref']
            alignment_objs.append(item)
            alignment_items.append({
                'ref': a_obj['ref'],
                'label': r.get('condition',
                               single_input_params.get('condition',
                                                       'unspecified'))
            })

            rds_names.append(r['alignment_output_name'].replace(
                single_input_params['alignment_suffix'], ''))

        # 2. Process all the results after mapping is done
        (set_result, report_info) = self._batch_sequential_post_processing(
            alignment_items, rds_names, input_params)

        set_result['output_directory'] = self.star_out_dir

        result = {
            'alignmentset_ref': set_result['set_ref'],
            'output_info': set_result,
            'alignment_objs': alignment_objs,
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }

        return result

    def _batch_sequential_post_processing(self, alignment_items, rds_names,
                                          params):
        '''
        process the mapping results of all the reads in the readsset_ref
        '''
        # 1. Save the alignment set
        set_name_map = self.star_utils.get_object_names(
            [params[STARUtils.PARAM_IN_READS]])
        set_name = set_name_map[params[STARUtils.PARAM_IN_READS]]

        output_alignmentset_name = set_name + params['alignmentset_suffix']

        save_result = self.star_utils.upload_alignment_set(
            alignment_items, output_alignmentset_name,
            params['output_workspace'])

        result_obj_ref = save_result['set_ref']

        index_dir = os.path.join(self.scratch, STARUtils.STAR_IDX_DIR)
        output_dir = os.path.join(self.scratch, STARUtils.STAR_OUT_DIR)

        # 2. Extract the ReadsPerGene counts if necessary
        self._extract_readsPerGene(params, rds_names, output_dir)

        # 3. Reporting...
        report_info = {'name': None, 'ref': None}

        #run qualimap
        qualimap_report = self.qualimap.run_bamqc(
            {'input_ref': result_obj_ref})
        qc_result_zip_info = qualimap_report['qc_result_zip_info']
        qc_result = [{
            'shock_id': qc_result_zip_info['shock_id'],
            'name': qc_result_zip_info['index_html_file_name'],
            'label': qc_result_zip_info['name']
        }]

        # create the report
        report_text = 'Ran on SampleSet or ReadsSet.\n\n'
        report_text += 'Created ReadsAlignmentSet: ' + str(
            output_alignmentset_name) + '\n\n'

        report_info = self.star_utils._generate_star_report(
            result_obj_ref, report_text, qc_result, params['output_workspace'],
            index_dir, output_dir)

        return (save_result, report_info)

    def star_run_batch_parallel(self, input_params):
        """
        star_run_batch_parallel: run the STAR alignment in batch, in parallel via KBParallel
        """
        log('--->\nrunning STAR_Aligner.star_run_batch_parallel\n' +
            'params:\n{}'.format(json.dumps(input_params, indent=1)))

        reads_refs = input_params[STARUtils.SET_READS]

        # build task list and send it to KBParallel
        tasks = []
        for r in reads_refs:
            tasks.append(self.build_single_execution_task(r, input_params))

        batch_run_params = {
            'tasks': tasks,
            'runner': 'parallel',
            'max_retries': 2
        }

        if input_params.get('concurrent_local_tasks', None) is not None:
            batch_run_params['concurrent_local_tasks'] = input_params[
                'concurrent_local_tasks']
        if input_params.get('concurrent_njsw_tasks', None) is not None:
            batch_run_params['concurrent_njsw_tasks'] = input_params[
                'concurrent_njsw_tasks']

        results = self.parallel_runner.run_batch(batch_run_params)
        print('Batch run results=')
        pprint(results)

        batch_result = self.process_batch_result(results, input_params,
                                                 reads_refs)
        batch_result['output_directory'] = self.star_out_dir

        return batch_result

    def process_batch_result(self, batch_result, params, reads_refs):
        n_jobs = len(batch_result['results'])
        n_success = 0
        n_error = 0
        ran_locally = 0
        ran_njsw = 0

        set_name_map = self.star_utils.get_object_names(
            [params[STARUtils.PARAM_IN_READS]])
        set_name = set_name_map[params[STARUtils.PARAM_IN_READS]]

        # reads alignment set items
        alignment_items = []
        alignment_objs = []
        rds_names = []

        for k in range(0, len(batch_result['results'])):
            reads_ref = reads_refs[k]
            rds_names.append(reads_ref['alignment_output_name'].replace(
                params['alignment_suffix'], ''))

            job = batch_result['results'][k]
            result_package = job['result_package']
            if job['is_error']:
                n_error += 1
            else:
                n_success += 1
                output_info = result_package['result'][0]['output_info']
                ra_ref = output_info['upload_results']['obj_ref']
                alignment_items.append({
                    'ref': ra_ref,
                    'label': reads_ref.get('condition',
                                           params.get('condition',
                                                      'unspecified'))
                })
                alignment_objs.append({'ref': ra_ref})

            if result_package['run_context']['location'] == 'local':
                ran_locally += 1
            if result_package['run_context']['location'] == 'njsw':
                ran_njsw += 1

        # Save the alignment set
        output_alignmentset_name = set_name + params['alignmentset_suffix']
        save_result = self.star_utils.upload_alignment_set(
            alignment_items, output_alignmentset_name,
            params['output_workspace'])

        result_obj_ref = save_result['set_ref']

        index_dir = os.path.join(self.scratch, STARUtils.STAR_IDX_DIR)
        output_dir = os.path.join(self.scratch, STARUtils.STAR_OUT_DIR)

        # Extract the ReadsPerGene counts if necessary
        self._extract_readsPerGene(params, rds_names, output_dir)

        # Reporting...
        report_info = {'name': None, 'ref': None}

        #run qualimap
        qualimap_report = self.qualimap.run_bamqc(
            {'input_ref': result_obj_ref})
        qc_result_zip_info = qualimap_report['qc_result_zip_info']
        qc_result = [{
            'shock_id': qc_result_zip_info['shock_id'],
            'name': qc_result_zip_info['index_html_file_name'],
            'label': qc_result_zip_info['name']
        }]

        # create the report
        report_text = 'Ran on SampleSet or ReadsSet.\n\n'
        report_text += 'Created ReadsAlignmentSet: ' + str(
            output_alignmentset_name) + '\n\n'
        report_text += 'Total ReadsLibraries = ' + str(n_jobs) + '\n'
        report_text += '        Successful runs = ' + str(n_success) + '\n'
        report_text += '            Failed runs = ' + str(n_error) + '\n'
        report_text += '       Ran on main node = ' + str(ran_locally) + '\n'
        report_text += '   Ran on remote worker = ' + str(ran_njsw) + '\n\n'

        report_info = self.star_utils._generate_star_report(
            result_obj_ref, report_text, qc_result, params['output_workspace'],
            index_dir, output_dir)

        result = {
            'alignmentset_ref': result_obj_ref,
            'output_info': batch_result,
            'alignment_objs': alignment_objs,
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }

        return result

    def _extract_readsPerGene(self, params, rds_names, output_dir):
        # Extract the ReadsPerGene counts if 'quantMode' was set during the STAR run
        gene_count_files = []
        if (params.get('quantMode', None) is not None
                and (params['quantMode'] == 'Both'
                     or 'GeneCounts' in params['quantMode'])):
            for reads_name in rds_names:
                gene_count_files.append('{}/{}_ReadsPerGene.out.tab'.format(
                    reads_name, reads_name))

            extract_geneCount_matrix(gene_count_files, output_dir)
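
    # For a reads library named 'sampleA' the path collected above is, e.g.:
    #   sampleA/sampleA_ReadsPerGene.out.tab
    # (relative paths, presumably resolved against output_dir by
    # extract_geneCount_matrix)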

    def build_single_execution_task(self, rds_ref, params):
        # rds_ref is one reads_refs entry: {'ref': ..., 'condition': ...}
        task_params = copy.deepcopy(params)

        task_params[STARUtils.PARAM_IN_READS] = rds_ref['ref']
        task_params['create_report'] = 0
        task_params['condition'] = rds_ref.get('condition', 'unspecified')

        return {
            'module_name': 'STAR',
            'function_name': 'run_star',
            'version': self.my_version,
            #'version': 'dev',
            'parameters': task_params
        }

    def get_version_from_subactions(self, module_name, subactions):
        # go through each subaction looking for this module
        if not subactions:
            return 'dev'  # default here ('release' in production) when nothing is found
        for sa in subactions:
            if 'name' in sa:
                if sa['name'] == module_name:
                    # local-docker-image implies that we are running in kb-test, so return 'dev'
                    if sa['commit'] == 'local-docker-image':
                        return 'dev'
                    # to check that it is a valid hash, make sure it is the right
                    # length and made up of valid hash characters
                    if re.match('[a-fA-F0-9]{40}$', sa['commit']):
                        return sa['commit']
        # again, default to 'dev' here ('release' in production builds)
        return 'dev'

    def run_star_indexing(self, input_params):
        """
        Runs STAR in genomeGenerate mode to build the index files and directory for STAR mapping.
        It creates a directory as defined by self.star_idx_dir in the scratch area that houses
        the index files.
        """
        ret_params = copy.deepcopy(input_params)
        ret_params[STARUtils.PARAM_IN_STARMODE] = 'genomeGenerate'

        # build the indexing parameters
        params_idx = self.star_utils._get_indexing_params(
            ret_params, self.star_idx_dir)

        ret = 1
        try:
            if ret_params[STARUtils.PARAM_IN_STARMODE] == 'genomeGenerate':
                ret = self.star_utils._exec_indexing(params_idx)
            else:
                ret = 0
        except ValueError as eidx:
            log('STAR genome indexing raised error:\n')
            pprint(eidx)
            ret = 1

        return (ret, params_idx[STARUtils.STAR_IDX_DIR])

    def run_star_mapping(self, params, rds_files, rds_name):
        """
        Runs STAR in alignReads mode for STAR mapping.
        It creates a directory as defined by self.star_out_dir with a subfolder named after the reads
        """
        params_mp = self.star_utils._get_mapping_params(
            params, rds_files, rds_name, self.star_idx_dir, self.star_out_dir)

        retVal = {}
        params_mp[STARUtils.PARAM_IN_STARMODE] = 'alignReads'
        try:
            ret = self.star_utils._exec_mapping(params_mp)
            if ret != 0:
                raise ValueError(
                    'STAR mapping returned a non-zero exit code: {}'.format(
                        ret))
        except ValueError as emp:
            log('STAR mapping raised error:\n')
            pprint(emp)
            retVal = {'star_idx': self.star_idx_dir, 'star_output': None}
        else:  # STAR mapping succeeded; move on to saving and reporting
            retVal = {
                'star_idx': self.star_idx_dir,
                'star_output': params_mp.get('align_output')
            }

        return retVal

    def get_index(self, input_params):
        '''
        get_index: generate the index if not yet existing
        '''
        gnm_ref = input_params[STARUtils.PARAM_IN_GENOME]
        if input_params.get('sjdbGTFfile', None) is None:
            input_params['sjdbGTFfile'] = self.star_utils._get_genome_gtf_file(
                gnm_ref, self.star_idx_dir)

        if not os.path.isfile(
                os.path.join(self.star_idx_dir, 'genomeParameters.txt')):
            # fetch genome fasta and GTF from refs to file location(s)
            input_params[STARUtils.PARAM_IN_FASTA_FILES] = \
                self.star_utils._get_genome_fasta(gnm_ref)

            # generate the indices
            (idx_ret, idx_dir) = self.run_star_indexing(input_params)
            if idx_ret != 0:
                raise ValueError(
                    "Failed to generate genome indices, aborting...")