def calcSquareSum(self, ctx, params):
    """
    :param params: instance of type "CalcSquareSumParams"
       (===================== main =====================) -> structure:
       parameter "n" of Long
    :returns: instance of type "CalcSquareSumInputOutput" -> structure:
       parameter "square_sum" of Long
    """
    # ctx is the context object
    # return variables are: returnVal
    #BEGIN calcSquareSum
    kbp = KBParallel(os.environ['SDK_CALLBACK_URL'], token=ctx['token'])
    returnVal = kbp.run({
        'prepare_method': {
            'module_name': 'ParallelSquareSum',
            'method_name': 'calcSquareSumPrepare',
            'service_ver': 'dev'
        },
        'is_local': 1,
        'global_params': params,
        'time_limit': 1000000
    })
    #END calcSquareSum

    # At some point might do deeper type checking...
    if not isinstance(returnVal, dict):
        raise ValueError('Method calcSquareSum return value ' +
                         'returnVal is not type dict as required.')
    # return the results
    return [returnVal]

def manyHellos(self, ctx, input_params):
    """
    :param input_params: instance of type "ManyHellosInputParams"
       (hello_msg - what to print as the message, time_limit - how long the
       program will run, in seconds, workspace - used to store report(s).)
       -> structure: parameter "hello_msg" of String, parameter "num_jobs"
       of Long, parameter "time_limit" of Long, parameter "workspace" of
       String
    :returns: instance of type "ManyHellos_globalResult" -> structure:
       parameter "output" of String, parameter "jobs" of list of tuple of
       size 2: parameter "job_number" of Long, parameter "message" of String
    """
    # ctx is the context object
    # return variables are: returnVal
    #BEGIN manyHellos
    print("Hi this is manyHellos()!")
    print("hello_msg is ", input_params["hello_msg"])
    print("time_limit is ", input_params["time_limit"])
    print("num_jobs is ", input_params["num_jobs"])
    print("workspace is ", input_params["workspace"])

    kbp = KBParallel(os.environ['SDK_CALLBACK_URL'], token=ctx['token'])
    returnVal = kbp.run({
        'method': {
            'module_name': 'ManyHellos',
            'method_name': 'manyHellos',
            'service_ver': 'dev'
        },
        'is_local': 1,
        'global_params': {
            'msg': input_params["hello_msg"],
            'num_jobs': input_params["num_jobs"],
            'workspace': input_params["workspace"]
        },
        'time_limit': input_params["time_limit"]
    })
    print("this is manyHellos(), signing off! Bye!")
    #END manyHellos

    # At some point might do deeper type checking...
    if not isinstance(returnVal, dict):
        raise ValueError('Method manyHellos return value ' +
                         'returnVal is not type dict as required.')
    # return the results
    return [returnVal]

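
# The two narrative methods above differ only in which key names the work for
# KBParallel: calcSquareSum passes a 'prepare_method' (its *Prepare method
# presumably expands the request into subtasks), while manyHellos passes a
# 'method' directly. A minimal sketch of the shared kbp.run() input shape, with
# placeholder values; only keys that actually appear in the two examples are shown.
example_run_input = {
    'method': {                      # or 'prepare_method', as in calcSquareSum above
        'module_name': 'ManyHellos',
        'method_name': 'manyHellos',
        'service_ver': 'dev'
    },
    'is_local': 1,
    'global_params': {},             # passed through to the subjobs
    'time_limit': 1000000
}
# kbp = KBParallel(os.environ['SDK_CALLBACK_URL'], token=ctx['token'])
# returnVal = kbp.run(example_run_input)
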
def __init__(self, scratch_dir, workspace_url, callback_url, srv_wiz_url,
             provenance):
    self.scratch_dir = scratch_dir
    self.workspace_url = workspace_url
    self.callback_url = callback_url
    self.srv_wiz_url = srv_wiz_url
    self.provenance = provenance

    # from the provenance, extract out the version to run by exact hash if possible
    self.my_version = 'release'
    if len(provenance) > 0:
        if 'subactions' in provenance[0]:
            self.my_version = self.get_version_from_subactions(
                'kb_BatchApp', provenance[0]['subactions'])
    print('Running kb_BatchApp version = ' + self.my_version)

    self.ws = Workspace(self.workspace_url)
    self.parallel_runner = KBParallel(self.callback_url)

def __init__(self, config, provenance):
    self.config = config
    self.workspace_url = config['workspace-url']
    self.callback_url = os.environ['SDK_CALLBACK_URL']
    self.scratch = config['scratch']
    self.srv_wiz_url = config['srv-wiz-url']
    self.parallel_runner = KBParallel(self.callback_url)
    self.provenance = provenance
    self.star_utils = STARUtils(self.scratch, self.workspace_url,
                                self.callback_url, self.srv_wiz_url,
                                provenance)
    self.set_api_client = SetAPI(self.srv_wiz_url, service_ver='dev')
    self.qualimap = kb_QualiMap(self.callback_url, service_ver='dev')
    self.star_idx_dir = None
    self.star_out_dir = None

    # from the provenance, extract out the version to run by exact hash if possible
    self.my_version = 'release'
    if len(provenance) > 0:
        if 'subactions' in provenance[0]:
            self.my_version = self.get_version_from_subactions(
                'kb_STAR', provenance[0]['subactions'])
    print('Running kb_STAR version = ' + self.my_version)

def __init__(self, scratch_dir, workspace_url, callback_url, srv_wiz_url,
             context):
    self.scratch_dir = scratch_dir
    self.workspace_url = workspace_url
    self.callback_url = callback_url
    self.srv_wiz_url = srv_wiz_url
    self.provenance = context.provenance()
    self.job_id = None

    rpc_context = context.get('rpc_context')
    if rpc_context is not None and hasattr(rpc_context, 'get'):
        current_call_ctx = rpc_context.get('call_stack')
        if len(current_call_ctx):
            self.job_id = current_call_ctx[0].get('job_id')

    # from the provenance, extract out the version to run by exact hash if possible
    self.my_version = 'release'
    if len(self.provenance) > 0:
        if 'subactions' in self.provenance[0]:
            self.my_version = self.get_version_from_subactions(
                'kb_BatchApp', self.provenance[0]['subactions'])
    print('Running kb_BatchApp version = ' + self.my_version)

    self.ws = Workspace(self.workspace_url)
    self.parallel_runner = KBParallel(self.callback_url, service_ver='dev')

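
# The constructor above only probes a few fields of the SDK context object. A
# hedged sketch of the nested shape it expects, with illustrative values; in a
# real run the context is a MethodContext-like mapping and provenance() is a
# method rather than a plain key.
example_context_fields = {
    'provenance': [{
        'subactions': [                               # scanned by get_version_from_subactions()
            {'name': 'kb_BatchApp', 'commit': 'local-docker-image'}
        ]
    }],
    'rpc_context': {
        'call_stack': [{'job_id': 'JOB-1'}]           # first frame's job_id becomes self.job_id
    }
}
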
def __init__(self, scratch_dir, workspace_url, callback_url, srv_wiz_url,
             provenance):
    self.workspace_url = workspace_url
    self.callback_url = callback_url
    self.srv_wiz_url = srv_wiz_url
    self.au = AssemblyUtil(self.callback_url)
    self.dfu = DataFileUtil(self.callback_url, service_ver='beta')
    self.scratch = scratch_dir
    self.working_dir = scratch_dir
    self.prog_runner = Program_Runner(self.STAR_BIN, self.scratch)
    self.provenance = provenance
    self.ws_client = Workspace(self.workspace_url)

    self.parallel_runner = KBParallel(self.callback_url)
    self.qualimap = kb_QualiMap(self.callback_url, service_ver='dev')
    self.set_api_client = SetAPI(self.srv_wiz_url, service_ver='dev')
    self.eu = ExpressionUtils(self.callback_url, service_ver='beta')

class BatchRunner(object):
    def __init__(self, scratch_dir, workspace_url, callback_url, srv_wiz_url,
                 context):
        self.scratch_dir = scratch_dir
        self.workspace_url = workspace_url
        self.callback_url = callback_url
        self.srv_wiz_url = srv_wiz_url
        self.provenance = context.provenance()
        self.job_id = None

        rpc_context = context.get('rpc_context')
        if rpc_context is not None and hasattr(rpc_context, 'get'):
            current_call_ctx = rpc_context.get('call_stack')
            if len(current_call_ctx):
                self.job_id = current_call_ctx[0].get('job_id')

        # from the provenance, extract out the version to run by exact hash if possible
        self.my_version = 'release'
        if len(self.provenance) > 0:
            if 'subactions' in self.provenance[0]:
                self.my_version = self.get_version_from_subactions(
                    'kb_BatchApp', self.provenance[0]['subactions'])
        print('Running kb_BatchApp version = ' + self.my_version)

        self.ws = Workspace(self.workspace_url)
        self.parallel_runner = KBParallel(self.callback_url, service_ver='dev')

    def get_version_from_subactions(self, module_name, subactions):
        # go through each subaction looking for this module
        if not subactions:
            return 'release'  # default to release if we can't find anything
        for sa in subactions:
            if 'name' in sa:
                if sa['name'] == module_name:
                    # local-docker-image implies that we are running in kb-test, so return 'dev'
                    if sa['commit'] == 'local-docker-image':
                        return 'dev'
                    # to check that it is a valid hash, make sure it is the right
                    # length and made up of valid hash characters
                    if re.match('[a-fA-F0-9]{40}$', sa['commit']):
                        return sa['commit']
        # again, default to setting this to release
        return 'release'

    def run(self, params):
        # raises an exception if there's a failure; see that function for details
        self.validate_params(params)

        app_info = {
            'module_name': params['module_name'],
            'function_name': params['method_name'],
            'version': params['service_ver']
        }
        params_list = params.get('batch_params')
        print('Running on set of parameters =')
        pprint(params_list)

        tasks = []
        for input_params in params_list:
            tasks.append(
                self.build_single_execution_task(app_info, input_params))

        batch_run_params = {
            'tasks': tasks,
            'runner': 'parallel',
            'max_retries': 2
        }
        if self.job_id is not None:
            batch_run_params['parent_job_id'] = self.job_id
        # TODO check if this should be given in input
        batch_run_params['concurrent_local_tasks'] = 0
        batch_run_params['concurrent_njsw_tasks'] = 5

        print("======================== BATCH_RUN_PARAMS ====================")
        pprint(batch_run_params)
        print("================================================================")

        batch_results = self.parallel_runner.run_batch(batch_run_params)
        print('Batch run results=')
        pprint(batch_results)

        results = {'batch_results': dict()}
        for result in batch_results['results']:
            results['batch_results'][
                result['result_package']['run_context']['job_id']] = result
        results['report_name'], results['report_ref'] = build_report(
            self.callback_url, self.scratch_dir, results['batch_results'],
            len(params_list), params['wsid'])
        return results

    def build_single_execution_task(self, app_info, params):
        task_params = copy.deepcopy(params.get('params')[0])
        retVal = {'parameters': task_params}
        retVal.update(app_info)
        return retVal

    def clean(self, run_output_info):
        """
        Not really necessary on a single run, but if we are running multiple
        local subjobs, we should clean up files that have already been saved
        back up to KBase.
        """
        pass

    def validate_params(self, params):
        """
        Things to validate:
        * params.module_name and params.method_name are real (maybe just let
          that go and assume they're ok)
        * params.wsid is a real workspace id and the current user has
          write-access
        * params.batch_params is a list with len > 0
        """
        if params.get("batch_params", None) is None or (
                isinstance(params["batch_params"], list)
                and len(params["batch_params"]) == 0):
            raise ValueError("batch_params must be a list with a length >= 1")
        if params.get("module_name") is None:
            raise ValueError("module_name must be an existing KBase app module!")
        elif "." in params["module_name"] or "/" in params["module_name"]:
            raise ValueError(
                "module_name should just be the name of the module, NOT the full module.method")
        if params.get("method_name") is None:
            raise ValueError("method_name must be an existing KBase app method!")
        elif "." in params["method_name"] or "/" in params["method_name"]:
            raise ValueError(
                "method_name should just be the name of the method, NOT the full module.method")
        if params.get("service_ver") is None or not isinstance(
                params["service_ver"], basestring):
            raise ValueError("service_ver must be a valid string!")
        if params.get("wsid") is None:
            raise ValueError("A workspace id must be provided to associate each subjob!")
        return params

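
# A hedged sketch of what a call into this runner might look like, based only on
# the fields validate_params() and build_single_execution_task() read above; the
# module/method names, refs and workspace id are placeholders.
batch_app_params = {
    'module_name': 'SomeModule',        # module name only, no 'Module.method'
    'method_name': 'some_method',
    'service_ver': 'release',
    'wsid': 12345,                      # workspace id used when building the report
    'batch_params': [
        # one entry per subjob; build_single_execution_task() takes params['params'][0]
        {'params': [{'input_ref': '1/2/3', 'output_name': 'out_1'}]},
        {'params': [{'input_ref': '1/4/3', 'output_name': 'out_2'}]},
    ],
}
# runner = BatchRunner(scratch_dir, workspace_url, callback_url, srv_wiz_url, ctx)
# results = runner.run(batch_app_params)
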
def run_batch(self, reads_refs, params):
    """
    Runs HISAT2 in batch mode.
    reads_refs should be a list of dicts, where each looks like the following:
    {
        "ref": reads object reference,
        "condition": condition for that ref (string)
    }
    """
    # build task list and send it to KBParallel
    tasks = list()
    set_name = get_object_names(
        [params["sampleset_ref"]],
        self.workspace_url)[params["sampleset_ref"]]
    for idx, reads_ref in enumerate(reads_refs):
        single_param = dict(params)  # need a copy of the params
        single_param["build_report"] = 0
        single_param["sampleset_ref"] = reads_ref["ref"]
        if "condition" in reads_ref:
            single_param["condition"] = reads_ref["condition"]
        else:
            single_param["condition"] = "unspecified"

        tasks.append({
            "module_name": "kb_hisat2",
            "function_name": "run_hisat2",
            "version": self.my_version,
            "parameters": single_param
        })

    # UNCOMMENT BELOW FOR LOCAL TESTING
    batch_run_params = {
        "tasks": tasks,
        "runner": "parallel",
        # "concurrent_local_tasks": 3,
        # "concurrent_njsw_tasks": 0,
        "max_retries": 2
    }
    parallel_runner = KBParallel(self.callback_url)
    results = parallel_runner.run_batch(batch_run_params)["results"]
    alignment_items = list()
    alignments = dict()
    for idx, result in enumerate(results):
        # idx of the result is the same as the idx of the inputs AND reads_refs
        if result["is_error"] != 0:
            raise RuntimeError("Failed a parallel run of HISAT2! {}".format(
                result["result_package"]["error"]))
        reads_ref = tasks[idx]["parameters"]["sampleset_ref"]
        alignment_items.append({
            "ref": result["result_package"]["result"][0]["alignment_objs"][reads_ref]["ref"],
            "label": reads_refs[idx].get(
                "condition", params.get("condition", "unspecified"))
        })
        alignments[reads_ref] = result["result_package"]["result"][0][
            "alignment_objs"][reads_ref]

    # build the final alignment set
    output_ref = self.upload_alignment_set(
        alignment_items, set_name + params["alignmentset_suffix"],
        params["ws_name"])
    return (alignments, output_ref)

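
# The loop above (and the process_batch_result methods further down) walk the
# same KBParallel run_batch() result layout. A hedged sketch of that shape with
# made-up values; only the fields these snippets actually read are shown, and the
# contents of result_package['result'][0] are whatever the subjob's method
# returned (alignment_objs here, output_info in the Bowtie2/STAR examples below).
example_batch_results = {
    "results": [
        {
            "is_error": 0,                       # non-zero marks a failed subjob
            "result_package": {
                "error": None,                   # read only when is_error != 0 above
                "result": [{"alignment_objs": {"55/10/1": {"ref": "55/20/1"}}}],
                "run_context": {"location": "local", "job_id": "JOB-1"}
            }
        },
        # ... one entry per task, in the same order as the submitted task list
    ]
}
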
class BatchRunner(object):
    def __init__(self, scratch_dir, workspace_url, callback_url, srv_wiz_url,
                 provenance):
        self.scratch_dir = scratch_dir
        self.workspace_url = workspace_url
        self.callback_url = callback_url
        self.srv_wiz_url = srv_wiz_url
        self.provenance = provenance

        # from the provenance, extract out the version to run by exact hash if possible
        self.my_version = 'release'
        if len(provenance) > 0:
            if 'subactions' in provenance[0]:
                self.my_version = self.get_version_from_subactions(
                    'kb_BatchApp', provenance[0]['subactions'])
        print('Running kb_BatchApp version = ' + self.my_version)

        self.ws = Workspace(self.workspace_url)
        self.parallel_runner = KBParallel(self.callback_url)

    def get_version_from_subactions(self, module_name, subactions):
        # go through each subaction looking for this module
        if not subactions:
            return 'release'  # default to release if we can't find anything
        for sa in subactions:
            if 'name' in sa:
                if sa['name'] == module_name:
                    # local-docker-image implies that we are running in kb-test, so return 'dev'
                    if sa['commit'] == 'local-docker-image':
                        return 'dev'
                    # to check that it is a valid hash, make sure it is the right
                    # length and made up of valid hash characters
                    if re.match('[a-fA-F0-9]{40}$', sa['commit']):
                        return sa['commit']
        # again, default to setting this to release
        return 'release'

    def run(self, params):
        # validated_params = self.validate_params(params)
        validated_params = params
        num_params = len(validated_params.get('batch_params'))
        app_info = {
            'module_name': validated_params.get('app_id'),
            'function_name': validated_params.get('method'),
            'version': validated_params.get('service_ver')
        }

        if num_params >= 1:
            params_list = validated_params.get('batch_params')
            print('Running on set of parameters =')
            pprint(params_list)

            tasks = []
            for input_params in params_list:
                tasks.append(
                    self.build_single_execution_task(app_info, input_params))

            batch_run_params = {
                'tasks': tasks,
                'runner': 'parallel',
                'max_retries': 2
            }
            # TODO check if this should be given in input
            batch_run_params['concurrent_local_tasks'] = 1
            batch_run_params['concurrent_njsw_tasks'] = 0

            print("======================== BATCH_RUN_PARAMS ====================")
            pprint(batch_run_params)
            print("================================================================")

            results = self.parallel_runner.run_batch(batch_run_params)
            print('Batch run results=')
            pprint(results)
            return results

        raise ValueError('Improper number of method parameters')

    def build_single_execution_task(self, app_info, params):
        task_params = copy.deepcopy(params.get('params')[0])
        retVal = {'parameters': task_params}
        retVal.update(app_info)
        return retVal

    def clean(self, run_output_info):
        '''
        Not really necessary on a single run, but if we are running multiple
        local subjobs, we should clean up files that have already been saved
        back up to kbase
        '''
        pass

    def validate_params(self, params):
        # TODO Add validation if needed
        return params

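
# The runners above pin a subjob version by scanning provenance subactions. A
# minimal sketch of the structure get_version_from_subactions() walks, with
# placeholder values; only 'subactions', 'name' and 'commit' are read.
example_provenance = [{
    'subactions': [
        {'name': 'SomeOtherModule', 'commit': 'local-docker-image'},
        {'name': 'kb_BatchApp',
         'commit': 'a94a8fe5ccb19ba61c4c0873d391e987982fbbd3'},   # 40-char hash
    ]
}]
# get_version_from_subactions('kb_BatchApp', example_provenance[0]['subactions'])
# would return the 40-character commit hash, so the exact image is reused;
# 'local-docker-image' maps to 'dev', anything else falls back to 'release'.
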
class Bowtie2Aligner(object):
    def __init__(self, scratch_dir, workspace_url, callback_url, srv_wiz_url,
                 provenance):
        self.scratch_dir = scratch_dir
        self.workspace_url = workspace_url
        self.callback_url = callback_url
        self.srv_wiz_url = srv_wiz_url
        self.provenance = provenance

        # from the provenance, extract out the version to run by exact hash if possible
        self.my_version = 'release'
        if len(provenance) > 0:
            if 'subactions' in provenance[0]:
                self.my_version = self.get_version_from_subactions(
                    'kb_Bowtie2', provenance[0]['subactions'])
        print('Running kb_Bowtie2 version = ' + self.my_version)

        self.ws = Workspace(self.workspace_url)
        self.bowtie2 = Bowtie2Runner(self.scratch_dir)
        self.parallel_runner = KBParallel(self.callback_url)
        self.qualimap = kb_QualiMap(self.callback_url)

    def get_version_from_subactions(self, module_name, subactions):
        # go through each subaction looking for this module
        if not subactions:
            return 'release'  # default to release if we can't find anything
        for sa in subactions:
            if 'name' in sa:
                if sa['name'] == module_name:
                    # local-docker-image implies that we are running in kb-test, so return 'dev'
                    if sa['commit'] == 'local-docker-image':
                        return 'dev'
                    # to check that it is a valid hash, make sure it is the right
                    # length and made up of valid hash characters
                    if re.match('[a-fA-F0-9]{40}$', sa['commit']):
                        return sa['commit']
        # again, default to setting this to release
        return 'release'

    def align(self, params):
        validated_params = self.validate_params(params)
        input_info = self.determine_input_info(validated_params)
        # input info provides information on the input and tells us if we should
        # run as a single_library or as a set:
        #     input_info = {'run_mode': '', 'info': [..], 'ref': '55/1/2'}
        assembly_or_genome_ref = validated_params['assembly_or_genome_ref']

        if input_info['run_mode'] == 'single_library':
            if 'output_alignment_name' not in validated_params:
                suffix = '_alignment'
                if 'output_alignment_suffix' in validated_params:
                    suffix = validated_params['output_alignment_suffix']
                validated_params['output_alignment_name'] = input_info['info'][1] + suffix
            single_lib_result = self.single_reads_lib_run(
                input_info,
                assembly_or_genome_ref,
                validated_params,
                create_report=validated_params['create_report'])
            return single_lib_result

        if input_info['run_mode'] == 'sample_set':
            reads = self.fetch_reads_refs_from_sampleset(
                input_info['ref'], input_info['info'], validated_params)
            self.build_bowtie2_index(assembly_or_genome_ref,
                                     validated_params['output_workspace'])

            print('Running on set of reads=')
            pprint(reads)

            tasks = []
            for r in reads:
                tasks.append(
                    self.build_single_execution_task(
                        r['ref'], params, r['alignment_output_name'],
                        r['condition']))

            batch_run_params = {
                'tasks': tasks,
                'runner': 'parallel',
                'max_retries': 2
            }
            if validated_params['concurrent_local_tasks'] is not None:
                batch_run_params['concurrent_local_tasks'] = \
                    validated_params['concurrent_local_tasks']
            if validated_params['concurrent_njsw_tasks'] is not None:
                batch_run_params['concurrent_njsw_tasks'] = \
                    validated_params['concurrent_njsw_tasks']

            results = self.parallel_runner.run_batch(batch_run_params)
            print('Batch run results=')
            pprint(results)

            batch_result = self.process_batch_result(results, validated_params,
                                                     reads, input_info['info'])
            return batch_result

        raise ValueError('Improper run mode')

    def build_single_execution_task(self, reads_lib_ref, params, output_name,
                                    condition):
        task_params = copy.deepcopy(params)
        task_params['input_ref'] = reads_lib_ref
        task_params['output_alignment_name'] = output_name
        task_params['create_report'] = 0
        task_params['condition_label'] = condition

        return {
            'module_name': 'kb_Bowtie2',
            'function_name': 'align_reads_to_assembly_app',
            'version': self.my_version,
            'parameters': task_params
        }

    def single_reads_lib_run(self, read_lib_info, assembly_or_genome_ref,
                             validated_params, create_report=False,
                             bowtie2_index_info=None):
        ''' run on one reads '''
        # download reads and prepare any bowtie2 index files
        input_configuration = self.prepare_single_run(
            read_lib_info, assembly_or_genome_ref, bowtie2_index_info,
            validated_params['output_workspace'])

        # run the actual program
        run_output_info = self.run_bowtie2_align_cli(input_configuration,
                                                     validated_params)

        # process the result and save the output
        upload_results = self.save_read_alignment_output(
            run_output_info, input_configuration, validated_params)
        run_output_info['upload_results'] = upload_results

        report_info = None
        if create_report:
            report_info = self.create_report_for_single_run(
                run_output_info, input_configuration, validated_params)

        self.clean(run_output_info)

        return {'output_info': run_output_info, 'report_info': report_info}

    def build_bowtie2_index(self, assembly_or_genome_ref, ws_for_cache):
        bowtie2IndexBuilder = Bowtie2IndexBuilder(self.scratch_dir,
                                                  self.workspace_url,
                                                  self.callback_url,
                                                  self.srv_wiz_url,
                                                  self.provenance)
        return bowtie2IndexBuilder.get_index({
            'ref': assembly_or_genome_ref,
            'ws_for_cache': ws_for_cache
        })

    def prepare_single_run(self, input_info, assembly_or_genome_ref,
                           bowtie2_index_info, ws_for_cache):
        ''' Given a reads ref and an assembly, setup the bowtie2 index '''
        # first setup the bowtie2 index of the assembly
        input_configuration = {'bowtie2_index_info': bowtie2_index_info}
        if not bowtie2_index_info:
            bowtie2IndexBuilder = Bowtie2IndexBuilder(self.scratch_dir,
                                                      self.workspace_url,
                                                      self.callback_url,
                                                      self.srv_wiz_url,
                                                      self.provenance)
            index_result = bowtie2IndexBuilder.get_index({
                'ref': assembly_or_genome_ref,
                'ws_for_cache': ws_for_cache
            })
            input_configuration['bowtie2_index_info'] = index_result

        # next download the reads
        read_lib_ref = input_info['ref']
        read_lib_info = input_info['info']
        reads_params = {
            'read_libraries': [read_lib_ref],
            'interleaved': 'false',
            'gzipped': None
        }
        ru = ReadsUtils(self.callback_url)
        reads = ru.download_reads(reads_params)['files']

        input_configuration['reads_lib_type'] = \
            self.get_type_from_obj_info(read_lib_info).split('.')[1]
        input_configuration['reads_files'] = reads[read_lib_ref]
        input_configuration['reads_lib_ref'] = read_lib_ref

        return input_configuration

    def run_bowtie2_align_cli(self, input_configuration, validated_params):
        # pprint('======== input_configuration =====')
        # pprint(input_configuration)
        options = []
        run_output_info = {}

        # set the bowtie2 index location
        bt2_index_dir = input_configuration['bowtie2_index_info']['output_dir']
        bt2_index_basename = input_configuration['bowtie2_index_info']['index_files_basename']
        options.extend(['-x', bt2_index_basename])

        # set the input reads
        if input_configuration['reads_lib_type'] == 'SingleEndLibrary':
            options.extend(['-U', input_configuration['reads_files']['files']['fwd']])
            run_output_info['library_type'] = 'single_end'
        elif input_configuration['reads_lib_type'] == 'PairedEndLibrary':
            options.extend(['-1', input_configuration['reads_files']['files']['fwd']])
            options.extend(['-2', input_configuration['reads_files']['files']['rev']])
            run_output_info['library_type'] = 'paired_end'

        # setup the output file name
        output_dir = os.path.join(
            self.scratch_dir,
            'bowtie2_alignment_output_' + str(int(time.time() * 10000)))
        output_sam_file = os.path.join(output_dir, 'reads_alignment.sam')
        os.makedirs(output_dir)
        options.extend(['-S', output_sam_file])
        run_output_info['output_sam_file'] = output_sam_file
        run_output_info['output_dir'] = output_dir

        # parse all the other parameters
        if 'quality_score' in validated_params:
            options.append('--' + str(validated_params['quality_score']))
        if 'alignment_type' in validated_params:
            options.append('--' + str(validated_params['alignment_type']))
        if 'preset_options' in validated_params:
            if 'alignment_type' in validated_params and \
                    validated_params['alignment_type'] == 'local':
                options.append('--' + str(validated_params['preset_options'] + '-local'))
            else:
                options.append('--' + str(validated_params['preset_options']))
        if 'trim5' in validated_params:
            options.extend(['--trim5', str(validated_params['trim5'])])
        if 'trim3' in validated_params:
            options.extend(['--trim3', str(validated_params['trim3'])])
        if 'np' in validated_params:
            options.extend(['--np', str(validated_params['np'])])
        if 'minins' in validated_params:
            options.extend(['--minins', str(validated_params['minins'])])
        if 'maxins' in validated_params:
            options.extend(['--maxins', str(validated_params['maxins'])])

        # unfortunately, bowtie2 expects the index files to be in the current directory, and
        # you cannot configure it otherwise. So run bowtie out of the index directory, but
        # place the output SAM file somewhere else
        self.bowtie2.run('bowtie2', options, cwd=bt2_index_dir)

        return run_output_info

    def save_read_alignment_output(self, run_output_info, input_configuration,
                                   validated_params):
        rau = ReadsAlignmentUtils(self.callback_url)
        destination_ref = validated_params['output_workspace'] + '/' + \
            validated_params['output_alignment_name']
        condition = 'unknown'
        if 'condition_label' in validated_params:
            condition = validated_params['condition_label']
        upload_params = {
            'file_path': run_output_info['output_sam_file'],
            'destination_ref': destination_ref,
            'read_library_ref': input_configuration['reads_lib_ref'],
            'assembly_or_genome_ref': validated_params['assembly_or_genome_ref'],
            'condition': condition
        }
        upload_results = rau.upload_alignment(upload_params)
        return upload_results

    def clean(self, run_output_info):
        '''
        Not really necessary on a single run, but if we are running multiple
        local subjobs, we should clean up files that have already been saved
        back up to kbase
        '''
        pass

    def create_report_for_single_run(self, run_output_info, input_configuration,
                                     validated_params):
        # first run qualimap
        qualimap_report = self.qualimap.run_bamqc(
            {'input_ref': run_output_info['upload_results']['obj_ref']})
        qc_result_zip_info = qualimap_report['qc_result_zip_info']

        # create report
        report_text = 'Ran on a single reads library.\n\n'
        alignment_info = self.get_obj_info(
            run_output_info['upload_results']['obj_ref'])
        report_text += 'Created ReadsAlignment: ' + str(alignment_info[1]) + '\n'
        report_text += ' ' + run_output_info['upload_results']['obj_ref'] + '\n'

        kbr = KBaseReport(self.callback_url)
        report_info = kbr.create_extended_report({
            'message': report_text,
            'objects_created': [{
                'ref': run_output_info['upload_results']['obj_ref'],
                'description': 'ReadsAlignment'
            }],
            'report_object_name': 'kb_Bowtie2_' + str(uuid.uuid4()),
            'direct_html_link_index': 0,
            'html_links': [{
                'shock_id': qc_result_zip_info['shock_id'],
                'name': qc_result_zip_info['index_html_file_name'],
                'label': qc_result_zip_info['name']
            }],
            'workspace_name': validated_params['output_workspace']
        })
        return {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }
    def process_batch_result(self, batch_result, validated_params, reads,
                             input_set_info):
        n_jobs = len(batch_result['results'])
        n_success = 0
        n_error = 0
        ran_locally = 0
        ran_njsw = 0

        # reads alignment set items
        items = []
        objects_created = []

        for k in range(0, len(batch_result['results'])):
            job = batch_result['results'][k]
            result_package = job['result_package']
            if job['is_error']:
                n_error += 1
            else:
                n_success += 1
                output_info = result_package['result'][0]['output_info']
                ra_ref = output_info['upload_results']['obj_ref']
                # Note: could add a label to the alignment here?
                items.append({'ref': ra_ref, 'label': reads[k]['condition']})
                objects_created.append({'ref': ra_ref})

            if result_package['run_context']['location'] == 'local':
                ran_locally += 1
            if result_package['run_context']['location'] == 'njsw':
                ran_njsw += 1

        # Save the alignment set
        alignment_set_data = {'description': '', 'items': items}
        alignment_set_save_params = {
            'data': alignment_set_data,
            'workspace': validated_params['output_workspace'],
            'output_object_name':
                str(input_set_info[1]) + validated_params['output_obj_name_suffix']
        }

        set_api = SetAPI(self.srv_wiz_url)
        save_result = set_api.save_reads_alignment_set_v1(alignment_set_save_params)
        print('Saved ReadsAlignment=')
        pprint(save_result)
        objects_created.append({
            'ref': save_result['set_ref'],
            'description': 'Set of all reads alignments generated'
        })
        set_name = save_result['set_info'][1]

        # run qualimap
        qualimap_report = self.qualimap.run_bamqc(
            {'input_ref': save_result['set_ref']})
        qc_result_zip_info = qualimap_report['qc_result_zip_info']

        # create the report
        report_text = 'Ran on SampleSet or ReadsSet.\n\n'
        report_text += 'Created ReadsAlignmentSet: ' + str(set_name) + '\n\n'
        report_text += 'Total ReadsLibraries = ' + str(n_jobs) + '\n'
        report_text += ' Successful runs = ' + str(n_success) + '\n'
        report_text += ' Failed runs = ' + str(n_error) + '\n'
        report_text += ' Ran on main node = ' + str(ran_locally) + '\n'
        report_text += ' Ran on remote worker = ' + str(ran_njsw) + '\n\n'

        print('Report text=')
        print(report_text)

        kbr = KBaseReport(self.callback_url)
        report_info = kbr.create_extended_report({
            'message': report_text,
            'objects_created': objects_created,
            'report_object_name': 'kb_Bowtie2_' + str(uuid.uuid4()),
            'direct_html_link_index': 0,
            'html_links': [{
                'shock_id': qc_result_zip_info['shock_id'],
                'name': qc_result_zip_info['index_html_file_name'],
                'label': qc_result_zip_info['name']
            }],
            'workspace_name': validated_params['output_workspace']
        })

        result = {
            'report_info': {
                'report_name': report_info['name'],
                'report_ref': report_info['ref']
            }
        }
        result['batch_output_info'] = batch_result

        return result

    def validate_params(self, params):
        validated_params = {}

        required_string_fields = [
            'input_ref', 'assembly_or_genome_ref', 'output_obj_name_suffix',
            'output_workspace'
        ]
        for field in required_string_fields:
            if field in params and params[field]:
                validated_params[field] = params[field]
            else:
                raise ValueError('"' + field +
                                 '" field required to run bowtie2 aligner app')

        optional_fields = [
            'quality_score', 'alignment_type', 'preset_options', 'trim5',
            'trim3', 'condition_label', 'np', 'minins', 'maxins',
            'output_alignment_suffix', 'output_alignment_name'
        ]
        for field in optional_fields:
            if field in params:
                if params[field] is not None:
                    validated_params[field] = params[field]

        validated_params['create_report'] = True
        if 'create_report' in params and params['create_report'] is not None:
            if int(params['create_report']) == 1:
                validated_params['create_report'] = True
            elif int(params['create_report']) == 0:
                validated_params['create_report'] = False
            else:
                raise ValueError(
                    '"create_report" field, if present, should be set to a boolean value: 0 or 1')

        validated_params['concurrent_local_tasks'] = None
        validated_params['concurrent_njsw_tasks'] = None
        if 'concurrent_local_tasks' in params and params['concurrent_local_tasks'] is not None:
            validated_params['concurrent_local_tasks'] = int(params['concurrent_local_tasks'])
        if 'concurrent_njsw_tasks' in params and params['concurrent_njsw_tasks'] is not None:
            validated_params['concurrent_njsw_tasks'] = int(params['concurrent_njsw_tasks'])

        return validated_params

    def fetch_reads_refs_from_sampleset(self, ref, info, validated_params):
        """
        Note: adapted from kbaseapps/kb_hisat2 - file_util.py

        From the given object ref, return a list of all reads objects that are a
        part of that object. E.g., if ref is a ReadsSet, return a list of all
        PairedEndLibrary or SingleEndLibrary refs that are a member of that
        ReadsSet. This is returned as a list of dictionaries as follows:
        {
            "ref": reads object reference,
            "condition": condition string associated with that reads object
        }
        The only one required is "ref", all other keys may or may not be present,
        based on the reads object or object type in initial ref variable. E.g. a
        RNASeqSampleSet might have condition info for each reads object, but a
        single PairedEndLibrary may not have that info.

        If ref is already a Reads library, just returns a list with ref as a
        single element.
        """
        obj_type = self.get_type_from_obj_info(info)
        refs = list()
        refs_for_ws_info = list()
        if "KBaseSets.ReadsSet" in obj_type or "KBaseRNASeq.RNASeqSampleSet" in obj_type:
            print("Looking up reads references in ReadsSet object")
            set_api = SetAPI(self.srv_wiz_url)
            reads_set = set_api.get_reads_set_v1({
                'ref': ref,
                'include_item_info': 0,
                'include_set_item_ref_paths': 1
            })

            for reads in reads_set["data"]["items"]:
                refs.append({
                    'ref': reads['ref_path'],
                    'condition': reads['label']
                })
                refs_for_ws_info.append({'ref': reads['ref_path']})
        else:
            raise ValueError("Unable to fetch reads reference from object {} "
                             "which is a {}".format(ref, obj_type))

        # get object info so we can name things properly
        infos = self.ws.get_object_info3({'objects': refs_for_ws_info})['infos']

        name_ext = '_alignment'
        if 'output_alignment_suffix' in validated_params \
                and validated_params['output_alignment_suffix'] is not None:
            ext = validated_params['output_alignment_suffix'].replace(' ', '')
            if ext:
                name_ext = ext

        unique_name_lookup = {}
        for k in range(0, len(refs)):
            refs[k]['info'] = infos[k]
            name = infos[k][1]
            if name not in unique_name_lookup:
                unique_name_lookup[name] = 1
            else:
                unique_name_lookup[name] += 1
                name = name + '_' + str(unique_name_lookup[name])
            name = name + name_ext
            refs[k]['alignment_output_name'] = name

        return refs

    def determine_input_info(self, validated_params):
        ''' get info on the input_ref object and determine if we run once or run on a set '''
        info = self.get_obj_info(validated_params['input_ref'])
        obj_type = self.get_type_from_obj_info(info)
        if obj_type in [
                'KBaseAssembly.PairedEndLibrary', 'KBaseAssembly.SingleEndLibrary',
                'KBaseFile.PairedEndLibrary', 'KBaseFile.SingleEndLibrary'
        ]:
            return {
                'run_mode': 'single_library',
                'info': info,
                'ref': validated_params['input_ref']
            }
        if obj_type == 'KBaseRNASeq.RNASeqSampleSet':
            return {
                'run_mode': 'sample_set',
                'info': info,
                'ref': validated_params['input_ref']
            }
        if obj_type == 'KBaseSets.ReadsSet':
            return {
                'run_mode': 'sample_set',
                'info': info,
                'ref': validated_params['input_ref']
            }
        raise ValueError('Object type of input_ref is not valid, was: ' + str(obj_type))

    def get_type_from_obj_info(self, info):
        return info[2].split('-')[0]

    def get_obj_info(self, ref):
        return self.ws.get_object_info3({'objects': [{'ref': ref}]})['infos'][0]

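
# A hedged sketch of a parameter dict that would satisfy validate_params() in
# the Bowtie2Aligner class above; the refs, workspace name, and suffixes are
# placeholders, and the commented-out call shows the assumed entry point.
example_bowtie2_params = {
    'input_ref': '55/3/1',                 # reads library, ReadsSet, or RNASeqSampleSet
    'assembly_or_genome_ref': '55/2/1',
    'output_obj_name_suffix': '_alignment_set',
    'output_workspace': 'my_workspace',
    'output_alignment_suffix': '_alignment',
    'create_report': 1,
    'concurrent_local_tasks': 1,           # optional knobs passed through to KBParallel
    'concurrent_njsw_tasks': 2,
}
# aligner = Bowtie2Aligner(scratch_dir, workspace_url, callback_url, srv_wiz_url, provenance)
# result = aligner.align(example_bowtie2_params)
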
class STAR_Aligner(object):
    def __init__(self, config, provenance):
        self.config = config
        self.workspace_url = config['workspace-url']
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.srv_wiz_url = config['srv-wiz-url']
        self.parallel_runner = KBParallel(self.callback_url)
        self.provenance = provenance
        self.star_utils = STARUtils(self.scratch, self.workspace_url,
                                    self.callback_url, self.srv_wiz_url,
                                    provenance)
        self.set_api_client = SetAPI(self.srv_wiz_url, service_ver='dev')
        self.qualimap = kb_QualiMap(self.callback_url, service_ver='dev')
        self.star_idx_dir = None
        self.star_out_dir = None

        # from the provenance, extract out the version to run by exact hash if possible
        self.my_version = 'release'
        if len(provenance) > 0:
            if 'subactions' in provenance[0]:
                self.my_version = self.get_version_from_subactions(
                    'kb_STAR', provenance[0]['subactions'])
        print('Running kb_STAR version = ' + self.my_version)

    def run_align(self, params):
        # 0. create the star folders
        if self.star_idx_dir is None:
            (idx_dir, out_dir) = self.star_utils.create_star_dirs(self.scratch)
            self.star_idx_dir = idx_dir
            self.star_out_dir = out_dir

        # 1. validate & process the input parameters
        validated_params = self.star_utils.process_params(params)
        input_obj_info = self.star_utils.determine_input_info(validated_params)

        # 2. convert the input parameters (from refs to file paths, especially)
        input_params = self.star_utils.convert_params(validated_params)

        returnVal = {"report_ref": None, "report_name": None}

        if input_obj_info['run_mode'] == 'single_library':
            returnVal = self.star_run_single(input_params)

        if input_obj_info['run_mode'] == 'sample_set':
            # returnVal = self.star_run_batch_parallel(input_params)
            returnVal = self.star_run_batch_sequential(input_params)

        return returnVal

    def star_run_single(self, input_params):
        """
        Performs a single run of STAR against a single reads reference.
        The rest of the info is taken from the params dict - see the spec for details.
        """
        log('--->\nrunning STAR_Aligner.star_run_single\n' +
            'params:\n{}'.format(json.dumps(input_params, indent=1)))

        # 0. get index
        self.get_index(input_params)

        # 1. Prepare for mapping
        rds = None
        reads_refs = input_params[STARUtils.SET_READS]
        for r in reads_refs:
            if r['ref'] == input_params[STARUtils.PARAM_IN_READS]:
                rds = r
                break
        reads_info = self.star_utils._get_reads_info(
            rds, input_params[STARUtils.PARAM_IN_READS])
        rds_name = rds['alignment_output_name'].replace(
            input_params['alignment_suffix'], '')

        alignment_objs = list()
        alignment_ref = None
        singlerun_output_info = {}
        report_info = {'name': None, 'ref': None}
        ret_val = None

        rds_files = list()
        ret_fwd = reads_info["file_fwd"]
        if ret_fwd is not None:
            rds_files.append(ret_fwd)
            if reads_info.get('file_rev', None) is not None:
                rds_files.append(reads_info['file_rev'])

        input_params[STARUtils.PARAM_IN_OUTFILE_PREFIX] = rds_name + '_'

        # 2. After all is set, do the alignment and upload the output.
        star_mp_ret = self.run_star_mapping(input_params, rds_files, rds_name)

        if star_mp_ret.get('star_output', None) is not None:
            bam_sort = ''
            if input_params.get('outSAMtype', None) == 'BAM':
                bam_sort = 'sortedByCoord'
            output_bam_file = '{}_Aligned.{}.out.bam'.format(rds_name, bam_sort)
            output_bam_file = os.path.join(star_mp_ret['star_output'],
                                           output_bam_file)

            # Upload the alignment
            upload_results = self.star_utils.upload_STARalignment(
                input_params, rds, reads_info, output_bam_file)
            alignment_ref = upload_results['obj_ref']
            alignment_obj = {
                'ref': alignment_ref,
                'name': rds['alignment_output_name']
            }
            alignment_objs.append({
                'reads_ref': rds['ref'],
                'AlignmentObj': alignment_obj
            })

            singlerun_output_info['index_dir'] = self.star_idx_dir
            singlerun_output_info['output_dir'] = star_mp_ret['star_output']
            singlerun_output_info['output_bam_file'] = output_bam_file
            singlerun_output_info['upload_results'] = upload_results

            if input_params.get("create_report", 0) == 1:
                report_info = self.star_utils.generate_report_for_single_run(
                    singlerun_output_info, input_params)

            ret_val = {
                'alignmentset_ref': None,
                'output_directory': singlerun_output_info['output_dir'],
                'output_info': singlerun_output_info,
                'alignment_objs': alignment_objs,
                'report_name': report_info['name'],
                'report_ref': report_info['ref']
            }
        else:
            ret_val = {
                'alignmentset_ref': None,
                'output_directory': None,
                'output_info': None,
                'alignment_objs': None,
                'report_name': None,
                'report_ref': None
            }

        if ret_fwd is not None:
            os.remove(ret_fwd)
            if reads_info.get('file_rev', None) is not None:
                os.remove(reads_info["file_rev"])

        return ret_val

    def star_run_batch_sequential(self, input_params):
        """
        star_run_batch_sequential: running the STAR align by looping
        """
        log('--->\nrunning STAR_Aligner.star_run_batch_sequential\n' +
            'params:\n{}'.format(json.dumps(input_params, indent=1)))

        self.get_index(input_params)

        reads_refs = input_params[STARUtils.SET_READS]
        single_input_params = copy.deepcopy(input_params)

        # 1. Run the mapping one by one
        alignment_items = []
        alignment_objs = []
        rds_names = []
        for r in reads_refs:
            single_input_params[STARUtils.PARAM_IN_READS] = r['ref']
            single_input_params['create_report'] = 0
            single_ret = self.star_run_single(single_input_params)
            item = single_ret['alignment_objs'][0]
            a_obj = item['AlignmentObj']
            r_ref = item['reads_ref']
            alignment_objs.append(item)
            alignment_items.append({
                'ref': a_obj['ref'],
                'label': r.get('condition',
                               single_input_params.get('condition', 'unspecified'))
            })
            rds_names.append(r['alignment_output_name'].replace(
                single_input_params['alignment_suffix'], ''))

        # 2. Process all the results after mapping is done
        (set_result, report_info) = self._batch_sequential_post_processing(
            alignment_items, rds_names, input_params)
        set_result['output_directory'] = self.star_out_dir

        result = {
            'alignmentset_ref': set_result['set_ref'],
            'output_info': set_result,
            'alignment_objs': alignment_objs,
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }
        return result

    def _batch_sequential_post_processing(self, alignment_items, rds_names, params):
        ''' process the mapping results of all the reads in the readsset_ref '''
        # 1. Save the alignment set
        set_name_map = self.star_utils.get_object_names(
            [params[STARUtils.PARAM_IN_READS]])
        set_name = set_name_map[params[STARUtils.PARAM_IN_READS]]
        output_alignmentset_name = set_name + params['alignmentset_suffix']

        save_result = self.star_utils.upload_alignment_set(
            alignment_items, output_alignmentset_name, params['output_workspace'])
        result_obj_ref = save_result['set_ref']

        index_dir = os.path.join(self.scratch, STARUtils.STAR_IDX_DIR)
        output_dir = os.path.join(self.scratch, STARUtils.STAR_OUT_DIR)

        # 2. Extract the ReadsPerGene counts if necessary
        self._extract_readsPerGene(params, rds_names, output_dir)

        # 3. Reporting...
        report_info = {'name': None, 'ref': None}

        # run qualimap
        qualimap_report = self.qualimap.run_bamqc({'input_ref': result_obj_ref})
        qc_result_zip_info = qualimap_report['qc_result_zip_info']
        qc_result = [{
            'shock_id': qc_result_zip_info['shock_id'],
            'name': qc_result_zip_info['index_html_file_name'],
            'label': qc_result_zip_info['name']
        }]

        # create the report
        report_text = 'Ran on SampleSet or ReadsSet.\n\n'
        report_text += 'Created ReadsAlignmentSet: ' + str(output_alignmentset_name) + '\n\n'

        report_info = self.star_utils._generate_star_report(
            result_obj_ref, report_text, qc_result, params['output_workspace'],
            index_dir, output_dir)

        return (save_result, report_info)

    def star_run_batch_parallel(self, input_params):
        """
        star_run_batch_parallel: running the STAR align in batch, in parallel
        """
        log('--->\nrunning STAR_Aligner.star_run_batch_parallel\n' +
            'params:\n{}'.format(json.dumps(input_params, indent=1)))

        reads_refs = input_params[STARUtils.SET_READS]

        # build task list and send it to KBParallel
        tasks = []
        for r in reads_refs:
            tasks.append(
                self.build_single_execution_task(r['ref'], input_params))

        batch_run_params = {
            'tasks': tasks,
            'runner': 'parallel',
            'max_retries': 2
        }
        if input_params.get('concurrent_local_tasks', None) is not None:
            batch_run_params['concurrent_local_tasks'] = input_params['concurrent_local_tasks']
        if input_params.get('concurrent_njsw_tasks', None) is not None:
            batch_run_params['concurrent_njsw_tasks'] = input_params['concurrent_njsw_tasks']

        results = self.parallel_runner.run_batch(batch_run_params)
        print('Batch run results=')
        pprint(results)

        batch_result = self.process_batch_result(results, input_params, reads_refs)
        batch_result['output_directory'] = self.star_out_dir
        return batch_result

    def process_batch_result(self, batch_result, params, reads_refs):
        n_jobs = len(batch_result['results'])
        n_success = 0
        n_error = 0
        ran_locally = 0
        ran_njsw = 0

        set_name_map = self.star_utils.get_object_names(
            [params[STARUtils.PARAM_IN_READS]])
        set_name = set_name_map[params[STARUtils.PARAM_IN_READS]]

        # reads alignment set items
        alignment_items = []
        alignment_objs = []
        rds_names = []

        for k in range(0, len(batch_result['results'])):
            reads_ref = reads_refs[k]
            rds_names.append(reads_ref['alignment_output_name'].replace(
                params['alignment_suffix'], ''))
            job = batch_result['results'][k]
            result_package = job['result_package']
            if job['is_error']:
                n_error += 1
            else:
                n_success += 1
                output_info = result_package['result'][0]['output_info']
                ra_ref = output_info['upload_results']['obj_ref']
                alignment_items.append({
                    'ref': ra_ref,
                    'label': reads_ref.get(
                        'condition', params.get('condition', 'unspecified'))
                })
                alignment_objs.append({'ref': ra_ref})

            if result_package['run_context']['location'] == 'local':
                ran_locally += 1
            if result_package['run_context']['location'] == 'njsw':
                ran_njsw += 1

        # Save the alignment set
        output_alignmentset_name = set_name + params['alignmentset_suffix']
        save_result = self.star_utils.upload_alignment_set(
            alignment_items, output_alignmentset_name, params['output_workspace'])
        result_obj_ref = save_result['set_ref']

        index_dir = os.path.join(self.scratch, STARUtils.STAR_IDX_DIR)
        output_dir = os.path.join(self.scratch, STARUtils.STAR_OUT_DIR)

        # Extract the ReadsPerGene counts if necessary
        self._extract_readsPerGene(params, rds_names, output_dir)

        # Reporting...
        report_info = {'name': None, 'ref': None}

        # run qualimap
        qualimap_report = self.qualimap.run_bamqc({'input_ref': result_obj_ref})
        qc_result_zip_info = qualimap_report['qc_result_zip_info']
        qc_result = [{
            'shock_id': qc_result_zip_info['shock_id'],
            'name': qc_result_zip_info['index_html_file_name'],
            'label': qc_result_zip_info['name']
        }]

        # create the report
        report_text = 'Ran on SampleSet or ReadsSet.\n\n'
        report_text += 'Created ReadsAlignmentSet: ' + str(output_alignmentset_name) + '\n\n'
        report_text += 'Total ReadsLibraries = ' + str(n_jobs) + '\n'
        report_text += ' Successful runs = ' + str(n_success) + '\n'
        report_text += ' Failed runs = ' + str(n_error) + '\n'
        report_text += ' Ran on main node = ' + str(ran_locally) + '\n'
        report_text += ' Ran on remote worker = ' + str(ran_njsw) + '\n\n'

        report_info = self.star_utils._generate_star_report(
            result_obj_ref, report_text, qc_result, params['output_workspace'],
            index_dir, output_dir)

        result = {
            'alignmentset_ref': result_obj_ref,
            'output_info': batch_result,
            'alignment_objs': alignment_objs,
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }
        return result

    def _extract_readsPerGene(self, params, rds_names, output_dir):
        # Extract the ReadsPerGene counts if 'quantMode' was set during the STAR run
        gene_count_files = []
        if (params.get('quantMode', None) is not None
                and (params['quantMode'] == 'Both'
                     or 'GeneCounts' in params['quantMode'])):
            for reads_name in rds_names:
                gene_count_files.append('{}/{}_ReadsPerGene.out.tab'.format(
                    reads_name, reads_name))

            extract_geneCount_matrix(gene_count_files, output_dir)

    def build_single_execution_task(self, rds_ref, params):
        task_params = copy.deepcopy(params)
        task_params[STARUtils.PARAM_IN_READS] = rds_ref
        task_params['create_report'] = 0
        if 'condition' in rds_ref:
            task_params['condition'] = rds_ref['condition']
        else:
            task_params['condition'] = 'unspecified'

        return {
            'module_name': 'STAR',
            'function_name': 'run_star',
            'version': self.my_version,
            # 'version': 'dev',
            'parameters': task_params
        }

    def get_version_from_subactions(self, module_name, subactions):
        # go through each subaction looking for this module
        if not subactions:
            return 'dev'  # 'release'  # default to release if we can't find anything
        for sa in subactions:
            if 'name' in sa:
                if sa['name'] == module_name:
                    # local-docker-image implies that we are running in kb-test, so return 'dev'
                    if sa['commit'] == 'local-docker-image':
                        return 'dev'
                    # to check that it is a valid hash, make sure it is the right
                    # length and made up of valid hash characters
                    if re.match('[a-fA-F0-9]{40}$', sa['commit']):
                        return sa['commit']
        # again, default to setting this to release
        return 'dev'  # 'release'

    def run_star_indexing(self, input_params):
        """
        Runs STAR in genomeGenerate mode to build the index files and directory
        for STAR mapping. It creates a directory as defined by self.star_idx_dir
        in the scratch area that houses the index files.
        """
        ret_params = copy.deepcopy(input_params)
        ret_params[STARUtils.PARAM_IN_STARMODE] = 'genomeGenerate'

        # build the indexing parameters
        params_idx = self.star_utils._get_indexing_params(ret_params, self.star_idx_dir)

        ret = 1
        try:
            if ret_params[STARUtils.PARAM_IN_STARMODE] == 'genomeGenerate':
                ret = self.star_utils._exec_indexing(params_idx)
            else:
                ret = 0
            while ret != 0:
                time.sleep(1)
        except ValueError as eidx:
            log('STAR genome indexing raised error:\n')
            pprint(eidx)
        else:
            ret = 0

        return (ret, params_idx[STARUtils.STAR_IDX_DIR])

    def run_star_mapping(self, params, rds_files, rds_name):
        """
        Runs STAR in alignReads mode for STAR mapping.
        It creates a directory as defined by self.star_out_dir with a subfolder
        named after the reads
        """
        params_mp = self.star_utils._get_mapping_params(
            params, rds_files, rds_name, self.star_idx_dir, self.star_out_dir)

        retVal = {}
        params_mp[STARUtils.PARAM_IN_STARMODE] = 'alignReads'
        try:
            ret = self.star_utils._exec_mapping(params_mp)
            while ret != 0:
                time.sleep(1)
        except ValueError as emp:
            log('STAR mapping raised error:\n')
            pprint(emp)
            retVal = {'star_idx': self.star_idx_dir, 'star_output': None}
        else:
            # no exception raised by STAR mapping and STAR returned 0,
            # so move on to saving and reporting
            retVal = {
                'star_idx': self.star_idx_dir,
                'star_output': params_mp.get('align_output')
            }

        return retVal

    def get_index(self, input_params):
        '''
        get_index: generate the index if not yet existing
        '''
        gnm_ref = input_params[STARUtils.PARAM_IN_GENOME]
        if input_params.get('sjdbGTFfile', None) is None:
            input_params['sjdbGTFfile'] = self.star_utils._get_genome_gtf_file(
                gnm_ref, self.star_idx_dir)

        if not os.path.isfile(
                os.path.join(self.star_idx_dir, 'genomeParameters.txt')):
            # fetch genome fasta and GTF from refs to file location(s)
            input_params[STARUtils.PARAM_IN_FASTA_FILES] = \
                self.star_utils._get_genome_fasta(gnm_ref)
            # generate the indices
            (idx_ret, idx_dir) = self.run_star_indexing(input_params)
            if idx_ret != 0:
                raise ValueError("Failed to generate genome indices, aborting...")