def build_feature_set(self, params):
    self.validate_params(params,
                         {'output_feature_set', 'workspace_name'},
                         {'genome', 'feature_ids', 'feature_ids_custom',
                          'base_feature_sets', 'description'})
    feature_sources = ('feature_ids', 'feature_ids_custom', 'base_feature_sets')
    if not any([params.get(x) for x in feature_sources]):
        raise ValueError("You must supply at least one feature source: {}".format(
            ", ".join(feature_sources)))
    workspace_id = self.dfu.ws_name_to_id(params['workspace_name'])

    new_feature_set = self._build_fs_obj(params)
    save_object_params = {
        'id': workspace_id,
        'objects': [{'type': 'KBaseCollections.FeatureSet',
                     'data': new_feature_set,
                     'name': params['output_feature_set']}]}

    dfu_oi = self.dfu.save_objects(save_object_params)[0]
    feature_set_obj_ref = '{}/{}/{}'.format(dfu_oi[6], dfu_oi[0], dfu_oi[4])

    objects_created = [{'ref': feature_set_obj_ref,
                        'description': 'Feature Set'}]
    message = 'A new feature set containing {} features was created.'.format(
        len(new_feature_set['elements']))

    report_params = {'message': message,
                     'workspace_name': params['workspace_name'],
                     'objects_created': objects_created,
                     'report_object_name': 'kb_FeatureSetUtils_report_' + str(uuid.uuid4())}

    kbase_report_client = KBaseReport(self.callback_url)
    output = kbase_report_client.create_extended_report(report_params)

    return {'feature_set_ref': feature_set_obj_ref,
            'report_name': output['name'],
            'report_ref': output['ref']}
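# Usage sketch (hypothetical, not part of the module): minimal params for
# build_feature_set, assuming a configured instance `fs_builder` of the class
# above. The workspace, output name, and feature ids are placeholders; at
# least one of feature_ids / feature_ids_custom / base_feature_sets is required.
def _example_build_feature_set(fs_builder):
    return fs_builder.build_feature_set({
        'workspace_name': 'my_workspace',         # placeholder workspace
        'output_feature_set': 'my_feature_set',   # name for the saved FeatureSet
        'feature_ids': ['gene_1', 'gene_2'],      # one of the three feature sources
    })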
def filter_matrix_with_fs(self, params):
    self.validate_params(params, ('feature_set_ref', 'workspace_name',
                                  'expression_matrix_ref',
                                  'filtered_expression_matrix_suffix'))
    ret = self.dfu.get_objects(
        {'object_refs': [params['feature_set_ref']]})['data'][0]
    feature_set = ret['data']
    feature_set_name = ret['info'][1]

    feature_ids = set(feature_set['elements'].keys())

    filtered_matrix_ref = self._filter_expression_matrix(
        params['expression_matrix_ref'], feature_ids,
        params['workspace_name'], params['filtered_expression_matrix_suffix'])

    objects_created = [{'ref': filtered_matrix_ref,
                        'description': 'Filtered ExpressionMatrix Object'}]
    message = "Filtered Expression Matrix based on the {} feature ids present in {}"\
        .format(len(feature_ids), feature_set_name)

    report_params = {'message': message,
                     'workspace_name': params['workspace_name'],
                     'objects_created': objects_created,
                     'report_object_name': 'kb_FeatureSetUtils_report_' + str(uuid.uuid4())}

    kbase_report_client = KBaseReport(self.callback_url)
    output = kbase_report_client.create_extended_report(report_params)

    return {'filtered_expression_matrix_ref': filtered_matrix_ref,
            'report_name': output['name'],
            'report_ref': output['ref']}
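# Usage sketch (hypothetical): the UPAs and suffix below are placeholders.
# filter_matrix_with_fs keeps only the matrix rows whose ids appear in the
# FeatureSet and returns the new matrix ref plus the report name/ref.
def _example_filter_matrix(fs_util):
    return fs_util.filter_matrix_with_fs({
        'workspace_name': 'my_workspace',
        'feature_set_ref': '1/2/3',        # placeholder FeatureSet UPA
        'expression_matrix_ref': '1/4/5',  # placeholder ExpressionMatrix UPA
        'filtered_expression_matrix_suffix': '_filtered',
    })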
def _generate_report(self, expression_matrix_ref, workspace_name):
    """
    _generate_report: generate report
    """
    objects_created = [{'ref': expression_matrix_ref,
                        'description': 'Average ExpressionMatrix'}]

    report_params = {'message': '',
                     'workspace_name': workspace_name,
                     'objects_created': objects_created,
                     # 'html_links': output_html_files,
                     # 'direct_html_link_index': 0,
                     'html_window_height': 366,
                     'report_object_name': 'kb_ave_expr_matrix_report_' + str(uuid.uuid4())}

    kbase_report_client = KBaseReport(self.callback_url, token=self.token)
    output = kbase_report_client.create_extended_report(report_params)

    report_output = {'report_name': output['name'],
                     'report_ref': output['ref']}
    return report_output
def _generate_report(self, up_feature_set_ref_list, down_feature_set_ref_list,
                     filtered_expression_matrix_ref_list, workspace_name):
    """
    _generate_report: generate summary report
    """
    log('start creating report')

    output_html_files = self._generate_html_report(up_feature_set_ref_list,
                                                   down_feature_set_ref_list)

    objects_created = list()
    for up_feature_set_ref in up_feature_set_ref_list:
        objects_created += [{'ref': up_feature_set_ref,
                             'description': 'Upper FeatureSet Object'}]
    for down_feature_set_ref in down_feature_set_ref_list:
        objects_created += [{'ref': down_feature_set_ref,
                             'description': 'Lower FeatureSet Object'}]
    for filtered_expression_matrix_ref in filtered_expression_matrix_ref_list:
        objects_created += [{'ref': filtered_expression_matrix_ref,
                             'description': 'Filtered ExpressionMatrix Object'}]

    report_params = {'message': '',
                     'workspace_name': workspace_name,
                     'objects_created': objects_created,
                     'html_links': output_html_files,
                     'direct_html_link_index': 0,
                     'html_window_height': 333,
                     'report_object_name': 'kb_FeatureSetUtils_report_' + str(uuid.uuid4())}

    kbase_report_client = KBaseReport(self.callback_url)
    output = kbase_report_client.create_extended_report(report_params)

    report_output = {'report_name': output['name'],
                     'report_ref': output['ref']}
    return report_output
def run_CGView(self, ctx, params):
    """
    This example function accepts any number of parameters and returns results in a KBaseReport
    :param params: instance of mapping from String to unspecified object
    :returns: instance of type "ReportResults" -> structure: parameter
       "report_name" of String, parameter "report_ref" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_CGView
    print('Starting run_CGView function. Params=')
    print(params)

    # Validate that workspace_name and input_file are present
    print('Validating parameters.')
    if 'workspace_name' not in params:
        raise ValueError('Parameter workspace_name is not set in input arguments')
    workspace_name = params['workspace_name']
    if 'input_file' not in params:
        raise ValueError('Parameter input_file is not set in input arguments')
    input_file = params['input_file']

    # Set up the CGView Comparison Tool (CCT) project folder
    subprocess.call(
        "cd /opt/cgview_comparison_tool && ./update_cogs.sh && "
        "cgview_comparison_tool.pl -p project",
        shell=True)

    # Convert the Genome object to a GenBank file and add it to
    # project/reference_genome; CCT expects a .gbk extension, so rename it
    gfu = GenomeFileUtil(self.callback_url)
    gbk = gfu.genome_to_genbank({'genome_ref': input_file})
    gbk_file = gbk["genbank_file"]["file_path"]
    subprocess.call([
        "cp", gbk_file, "/opt/cgview_comparison_tool/project/reference_genome"
    ])
    base = ntpath.basename(gbk_file).rsplit(".", 1)[0]
    from_path = "/opt/cgview_comparison_tool/project/reference_genome/" + base + ".gbff"
    to_path = "/opt/cgview_comparison_tool/project/reference_genome/" + base + ".gbk"
    subprocess.call(["mv", from_path, to_path])

    # Generate the map from the GenBank file
    os.chdir("/opt/cgview_comparison_tool")
    proc = subprocess.Popen([
        "cgview_comparison_tool.pl", "-p", "/opt/cgview_comparison_tool/project"
    ], stdout=subprocess.PIPE)
    proc.wait()

    # Retrieve the map PNG and HTML from project/maps
    subprocess.call([
        "cp", "/opt/cgview_comparison_tool/project/maps/medium.png",
        self.shared_folder
    ])
    subprocess.call([
        "cp", "/opt/cgview_comparison_tool/project/maps/medium.html",
        self.shared_folder
    ])

    # Resize the image to a 900-pixel width, preserving the aspect ratio
    # (Image.ANTIALIAS; on Pillow >= 10 this is Image.LANCZOS)
    basewidth = 900
    img = Image.open('/opt/cgview_comparison_tool/project/maps/medium.png')
    wpercent = basewidth / float(img.size[0])
    hsize = int(float(img.size[1]) * wpercent)
    img = img.resize((basewidth, hsize), Image.ANTIALIAS)
    img.save('/opt/cgview_comparison_tool/project/maps/medium1.png', quality=95)
    subprocess.call([
        "cp", "/opt/cgview_comparison_tool/project/maps/medium1.png",
        self.shared_folder
    ])

    # Link the full-resolution PNG as a file and the HTML map as the report page
    png_path = os.path.join(self.shared_folder, 'medium.png')
    html_path = os.path.join(self.shared_folder, 'medium.html')
    png_dict = {'path': png_path, 'name': 'Circular_Genome_Map_PNG'}
    html_dict = {'path': html_path, 'name': 'Circular Genome Map'}

    report_client = KBaseReport(self.callback_url)
    report = report_client.create_extended_report({
        'direct_html_link_index': 0,
        'html_links': [html_dict],
        'file_links': [png_dict],
        'workspace_name': params['workspace_name'],
        'summary_window_height': 900,
        'html_window_height': 900
    })

    output = {
        'report_name': report['name'],
        'report_ref': report['ref'],
    }
    #END run_CGView

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method run_CGView return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def run_classify(self, ctx, params):
    """
    This example function accepts any number of parameters and returns results in a KBaseReport
    :param params: instance of mapping from String to unspecified object
    :returns: instance of type "ReportResults" -> structure: parameter
       "report_name" of String, parameter "report_ref" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_classify
    logging.info(params)

    params = Params(params)
    Var.params = params

    '''
    tmp/                                    `shared_folder`
    └── kb_rdp_clsf_<uuid>/                 `run_dir`
        ├── return/                         `return_dir`
        |   ├── cmd.txt
        |   ├── study_seqs.fna
        |   └── RDP_Classifier_output/      `out_dir`
        |       ├── out_allRank.tsv
        |       └── out_fixedRank.tsv
        └── report/                         `report_dir`
            ├── pie_hist.html
            ├── suburst.html
            └── report.html
    '''

    ##
    ## set up globals ds `Var` for this API-method run
    ## which involves making this API-method run's directory structure
    Var.update({
        'run_dir': os.path.join(self.shared_folder, 'kb_rdp_clsf_' + str(uuid.uuid4())),
        'dfu': DataFileUtil(self.callback_url),
        'ws': Workspace(self.workspace_url),
        'gapi': GenericsAPI(self.callback_url),
        'kbr': KBaseReport(self.callback_url),
        'warnings': [],
    })
    os.mkdir(Var.run_dir)

    Var.update({
        'return_dir': os.path.join(Var.run_dir, 'return'),
        'report_dir': os.path.join(Var.run_dir, 'report'),
    })
    os.mkdir(Var.return_dir)
    os.mkdir(Var.report_dir)

    Var.update({'out_dir': os.path.join(Var.return_dir, 'RDP_Classifier_output')})
    os.mkdir(Var.out_dir)

    # cat and gunzip SILVA refdata,
    # which has been split into ~99MB chunks to get onto Github
    #if params.is_custom():
    #    app_file.prep_refdata()

    # ## ### load objects #### #####

    amp_mat = AmpliconMatrix(params['amp_mat_upa'])
    row_attr_map_upa = amp_mat.obj.get('row_attributemapping_ref')
    create_row_attr_map = row_attr_map_upa is None
    row_attr_map = AttributeMapping(row_attr_map_upa, amp_mat=amp_mat)

    # ## ### cmd #### #####

    fasta_flpth = os.path.join(Var.return_dir, 'study_seqs.fna')
    Var.out_allRank_flpth = os.path.join(Var.out_dir, 'out_allRank.tsv')
    Var.out_shortSeq_flpth = os.path.join(
        Var.out_dir, 'out_unclassifiedShortSeqs.txt')  # seqs too short to classify

    shutil.copyfile(amp_mat.get_fasta(), fasta_flpth)

    cmd = ('java -Xmx4g -jar %s classify %s '
           % (Var.classifier_jar_flpth, fasta_flpth)
           + ' '.join(params.cli_args) + ' '
           + '--format allRank '
           + '--outputFile %s --shortseq_outfile %s'
             % (Var.out_allRank_flpth, Var.out_shortSeq_flpth))

    run_check(cmd)

    # ## ### extract classifications #### #####

    id2taxStr = app_file.get_fix_filtered_id2tax()

    # get ids of classified and unclassified seqs
    shortSeq_id_l = app_file.parse_shortSeq()  # sequences too short to get clsf
    classified_id_l = list(id2taxStr.keys())

    # make sure classifieds and shorts complement
    if Var.debug:
        ret = sorted(classified_id_l + shortSeq_id_l)
        mat = sorted(amp_mat.obj['data']['row_ids'])
        assert ret == mat, \
            'diff1: %s, diff2: %s' % (set(ret) - set(mat), set(mat) - set(ret))

    if len(classified_id_l) == 0:
        raise Exception('No sequences were long enough to be classified')

    # add in id->'' for unclassified seqs
    # so id2taxStr is complete and there are no KeyErrors later
    for shortSeq_id in shortSeq_id_l:
        id2taxStr[shortSeq_id] = ''

    # add to globals for testing
    Var.shortSeq_id_l = shortSeq_id_l

    # ## ### add to row AttributeMapping #### #####

    prose_args = params.get_prose_args()
    attribute = ('RDP Classifier Taxonomy (conf=%s, gene=%s)'
                 % (prose_args['conf'], prose_args['gene']))
    attribute_names = row_attr_map.get_attribute_names()
    if attribute in attribute_names:
        attribute = get_numbered_duplicate(attribute_names, attribute)
    source = 'RDP Classifier'

    ind, attribute = row_attr_map.add_attribute_slot(attribute, source)
    row_attr_map.update_attribute(ind, id2taxStr)

    # ## ### save obj #### #####

    amp_mat_output_name = Var.params['output_name']
    attr_map_output_name = (amp_mat_output_name + '.Amplicon_attributes'
                            if create_row_attr_map else None)

    row_attr_map_upa_new = row_attr_map.save(name=attr_map_output_name)

    amp_mat.obj['row_attributemapping_ref'] = row_attr_map_upa_new
    amp_mat_upa_new = amp_mat.save(amp_mat_output_name)

    objects_created = [
        dict(  # row AttrMap
            ref=row_attr_map_upa_new,
            description='%sAdded attribute `%s`' % (
                'Created. ' if create_row_attr_map else '',
                attribute,
            )),
        dict(  # AmpMat
            ref=amp_mat_upa_new,
            description='Updated amplicon AttributeMapping reference to `%s`'
                        % row_attr_map_upa_new),
    ]

    # testing
    if Var.debug:
        Var.update(dict(
            amp_mat=amp_mat,
            row_attr_map=row_attr_map,
        ))

    # ## ### html report #### #####

    hrw = report.HTMLReportWriter(cmd_l=[cmd])
    html_flpth = hrw.write()

    html_links = [{
        'path': Var.report_dir,
        'name': os.path.basename(html_flpth),
    }]

    # ## ### report #### #####

    file_links = [{
        'path': Var.run_dir,
        'name': 'RDP_Classifier_results.zip',
        'description': 'Input, output'
    }]

    params_report = {
        'warnings': Var.warnings,
        'objects_created': objects_created,
        'html_links': html_links,
        'direct_html_link_index': 0,
        'file_links': file_links,
        'workspace_id': params['workspace_id'],
        'html_window_height': Var.report_height,
    }

    # testing
    Var.params_report = params_report

    report_obj = Var.kbr.create_extended_report(params_report)

    output = {
        'report_name': report_obj['name'],
        'report_ref': report_obj['ref'],
    }
    #END run_classify

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method run_classify return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def run_picrust2_pipeline(self, ctx, params):
    """
    This example function accepts any number of parameters and returns results in a KBaseReport
    :param params: instance of mapping from String to unspecified object
    :returns: instance of type "ReportResults" -> structure: parameter
       "report_name" of String, parameter "report_ref" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_picrust2_pipeline
    logging.info(params)

    # ## ### params, app-globals, directories, etc #### #####

    logging.info('BEGINNING KB_PICRUST2. params: %s' % str(params))

    params = Params(params)

    dprint('params', run=locals())

    reset_Var()  # clear all fields but `debug`
    Var.update(
        params=params,
        dfu=DataFileUtil(self.callback_url),
        kbr=KBaseReport(self.callback_url),
        fpu=FunctionalProfileUtil(self.callback_url, service_ver='beta'),
        gapi=GenericsAPI(self.callback_url),
        shared_folder=self.shared_folder,
        run_dir=os.path.join(self.shared_folder, 'run_dir_picrust2_' + str(uuid.uuid4())),
        warnings=[],
        objects_created=[],
    )

    os.mkdir(Var.run_dir)  # for this API-method run

    Var.update(return_dir=os.path.join(Var.run_dir, 'return'))
    os.mkdir(Var.return_dir)  # for return input/output/logs etc.

    if Var.debug:
        with open(os.path.join(Var.run_dir, '#params'), 'w') as fh:
            json.dump(params.params, fh)

    # TODO document `run_dir` structure

    # ## ### obj #### #####

    # instantiate
    amp_mat = AmpliconMatrix(params['amplicon_matrix_upa'])
    if 'row_attributemapping_ref' in amp_mat.obj:
        row_attrmap = AttributeMapping(amp_mat.obj['row_attributemapping_ref'], amp_mat)
    else:
        msg = (
            "Input AmpliconMatrix "
            "does not have a row AttributeMapping to assign PICRUSt2 functions to.")
        logging.warning(msg)
        Var.warnings.append(msg)

    # validate input data
    amp_mat.validate_amplicon_abundance_data()

    # generate input files
    seq_flpth = os.path.join(Var.return_dir, 'study_seqs.fna')
    seq_abundance_table_flpth = os.path.join(Var.return_dir, 'study_seqs.tsv')

    amp_mat.to_fasta(seq_flpth)
    amp_mat.to_seq_abundance_table(seq_abundance_table_flpth)

    # objs should be app globals
    Var.amp_mat = amp_mat

    # ## ### args #### #####

    # TODO get tee functionality working in run_check
    # to avoid extra cmd

    Var.out_dir = os.path.join(Var.return_dir, 'PICRUSt2_output')
    log_flpth = os.path.join(Var.return_dir, 'log.txt')

    p = 4

    cmd_pipeline = ' '.join([
        'set -o pipefail &&',
        'source activate picrust2 &&',
        'picrust2_pipeline.py',
        '-s', seq_flpth,
        '-i', seq_abundance_table_flpth,
        '-o', Var.out_dir,
        '--per_sequence_contrib',
        '-p', str(p),
        '|& tee', log_flpth,
    ])

    cmd_description = ' \\\n'.join([
        'cd %s &&' % Var.out_dir,
        'source activate picrust2 &&',
        'add_descriptions.py -i EC_metagenome_out/pred_metagenome_unstrat.tsv.gz -m EC',
        ' -o EC_metagenome_out/pred_metagenome_unstrat_descrip.tsv.gz',
        '&&',
        'add_descriptions.py -i KO_metagenome_out/pred_metagenome_unstrat.tsv.gz -m KO',
        ' -o KO_metagenome_out/pred_metagenome_unstrat_descrip.tsv.gz',
        '&&',
        'add_descriptions.py -i pathways_out/path_abun_unstrat.tsv.gz -m METACYC',
        ' -o pathways_out/path_abun_unstrat_descrip.tsv.gz'
    ])

    get_cmd_func_l = lambda FUNC: [
        ('cd %s && ' % Var.out_dir +
         'source activate picrust2 && '
         f'hsp.py -i {FUNC} -t out.tre -o {FUNC}_predicted.tsv.gz -p {p}'),
        ('cd %s && ' % Var.out_dir +
         'source activate picrust2 && '
         'metagenome_pipeline.py '
         '-i ../%s ' % os.path.basename(seq_abundance_table_flpth) +
         '-m marker_predicted_and_nsti.tsv.gz '
         f'-f {FUNC}_predicted.tsv.gz '
         f'-o {FUNC}_metagenome_out')
    ] + ([] if FUNC == 'PHENO' else [  # no descriptions for IMG phenotype
        ('cd %s && ' % Var.out_dir +
         'source activate picrust2 && '
         f'add_descriptions.py -i {FUNC}_metagenome_out/pred_metagenome_unstrat.tsv.gz -m {FUNC} '
         f'-o {FUNC}_metagenome_out/pred_metagenome_unstrat_descrip.tsv.gz'),
    ])

    cmd_func_l = []
    for func in ['cog', 'pfam', 'tigrfam', 'pheno']:
        if params.getd(func) == 1:
            cmd_func_l.extend(get_cmd_func_l(func.upper()))

    # ## ### run #### #####

    run_check(cmd_pipeline)
    run_check(cmd_description)
    for cmd_func in cmd_func_l:
        run_check(cmd_func)

    # ## ### sanity checks #### #####

    if Var.debug:
        for func in Var.func_l:
            if not Var.params.getd(func):
                continue
            fp0 = os.path.join(Var.out_dir, Var.func_2_cfg[func]['relfp'][0])
            fp1 = os.path.join(Var.out_dir, Var.func_2_cfg[func]['relfp'][1])
            # Check dropped amplicons are the unaligned/distant ones (debug)
            appfile.check_dropped_amplicon_ids(fp0, amp_mat)
            # Check no samples dropped (debug)
            appfile.check_dropped_sample_ids(fp1, amp_mat)

    # ## ### update/save Amplicon workflow objects #### #####

    path_abun_predictions_tsv_gz_flpth = os.path.join(
        Var.out_dir, 'pathways_out/path_abun_predictions.tsv.gz')

    attribute = 'MetaCyc Predictions'
    source = 'PICRUSt2'

    # if row AttributeMapping,
    # update that and referencing objs
    if amp_mat.row_attrmap_upa is not None:

        # update row AttributeMapping with traits
        id2attr = appfile.parse_picrust2_traits(path_abun_predictions_tsv_gz_flpth)
        ind, attribute = row_attrmap.add_attribute_slot(attribute, source)
        row_attrmap.map_update_attribute(ind, id2attr)
        row_attrmap_upa_new = row_attrmap.save()

        # update AmpliconMatrix which references row AttributeMapping
        amp_mat.obj['row_attributemapping_ref'] = row_attrmap_upa_new
        amp_mat_upa_new = amp_mat.save(name=params['output_name'])

        Var.objects_created.extend([
            {
                'ref': row_attrmap_upa_new,
                'description': 'Added attribute `%s`' % attribute,
            },
            {
                'ref': amp_mat_upa_new,
                'description': 'Updated amplicon AttributeMapping reference to `%s`'
                               % row_attrmap_upa_new
            },
        ])

    # ## ### html report w/ heatmaps #### #####

    logging.info('Beginning report business')

    ##
    ## report
    Var.report_dir = os.path.join(Var.run_dir, 'report')

    report_html_flpth = report.HTMLReportWriter(
        [cmd_pipeline, cmd_description] + cmd_func_l,
    ).write()

    html_links = [{
        'path': Var.report_dir,
        'name': os.path.basename(report_html_flpth),
    }]

    # ## ### FunctionalProfile #### #####

    logging.info('Starting saving FunctionalProfiles if any')

    if Var.debug:
        # this makes mocking more flexible in case something makes a fake UPA
        FP_amp_mat_ref = params['amplicon_matrix_upa']
    else:
        # this AmpliconMatrix is the new one with the new AttributeMapping
        FP_amp_mat_ref = amp_mat_upa_new

    # gunzip TSVs out to another directory
    tsv_dir = os.path.join(Var.run_dir, 'decompressed_tsv')
    os.mkdir(tsv_dir)

    for func in Var.func_l:
        if not Var.params.getd(func):
            continue

        func_name = Var.func_2_cfg[func]['name']

        if Var.params.getd('create_amplicon_fps'):
            id = 'amplicon_' + func
            desc = 'Amplicon %s abundance' % func_name
            fp_src = os.path.join(Var.out_dir, Var.func_2_cfg[func]['relfp'][0])
            fp_dst = os.path.join(tsv_dir, id + '.tsv')
            gunzip(fp_src, fp_dst)
            upa = Var.fpu.import_func_profile(dict(
                workspace_id=Var.params['workspace_id'],
                func_profile_obj_name='%s.%s' % (Var.params['output_name'], id),
                original_matrix_ref=FP_amp_mat_ref,
                profile_file_path=fp_dst,
                profile_type='amplicon',
                profile_category='organism',
                data_epistemology='predicted',
                epistemology_method='PICRUSt2',
                description=desc,
            ))['func_profile_ref']
            Var.objects_created.append(dict(ref=upa, description=desc))

        if Var.params.getd('create_sample_fps'):
            id = 'metagenome_' + func
            desc = 'Metagenome %s abundance' % func_name
            fp_src = os.path.join(Var.out_dir, Var.func_2_cfg[func]['relfp'][1])
            fp_dst = os.path.join(tsv_dir, id + '.tsv')
            gunzip(fp_src, fp_dst)
            upa = Var.fpu.import_func_profile(dict(
                workspace_id=Var.params['workspace_id'],
                func_profile_obj_name='%s.%s' % (Var.params['output_name'], id),
                original_matrix_ref=FP_amp_mat_ref,
                profile_file_path=fp_dst,
                profile_type='mg',
                profile_category='community',
                data_epistemology='predicted',
                epistemology_method='PICRUSt2',
                description=desc,
            ))['func_profile_ref']
            Var.objects_created.append(dict(ref=upa, description=desc))

    # look at TSVs
    dprint(
        'ls -lh %s/*' % tsv_dir,
        #'file -i %s/*/*' % tsv_dir,
        run='cli')

    # ## ### return files #### #####

    file_links = [{
        'path': Var.return_dir,
        'name': 'PICRUSt2_results.zip',
        'description': 'Input, output, cmd, intermediate files, log'
    }]

    params_report = {
        'warnings': Var.warnings,
        'objects_created': Var.objects_created,
        'file_links': file_links,
        'html_links': html_links,
        'direct_html_link_index': 0,
        'report_object_name': 'kb_PICRUSt2_report',
        'workspace_name': params['workspace_name'],
        'html_window_height': report.REPORT_HEIGHT,
    }

    Var.params_report = params_report

    obj = Var.kbr.create_extended_report(params_report)

    output = {
        'report_name': obj['name'],
        'report_ref': obj['ref'],
    }
    #END run_picrust2_pipeline

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method run_picrust2_pipeline return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def update_matrix_attribute_mapping(self, params):
    dimension = params.get('dimension')
    if dimension not in ['col', 'row']:
        raise ValueError('Please use "col" or "row" for input dimension')

    workspace_name = params.get('workspace_name')

    old_matrix_ref = params.get('input_matrix_ref')
    old_matrix_obj = self.dfu.get_objects(
        {'object_refs': [old_matrix_ref]})['data'][0]
    old_matrix_info = old_matrix_obj['info']
    old_matrix_data = old_matrix_obj['data']
    old_am_ref = old_matrix_data.get('{}_attributemapping_ref'.format(dimension))

    if not isinstance(workspace_name, int):
        workspace_id = self.dfu.ws_name_to_id(workspace_name)
    else:
        workspace_id = workspace_name

    if not old_am_ref:
        raise ValueError(
            'Matrix object does not have {} attribute mapping'.format(dimension))

    new_am_ref = self.append_file_to_attribute_mapping(
        params['staging_file_subdir_path'], old_am_ref, workspace_id,
        params['output_am_obj_name'])['attribute_mapping_ref']

    old_matrix_data['{}_attributemapping_ref'.format(dimension)] = new_am_ref

    info = self.dfu.save_objects({
        "id": workspace_id,
        "objects": [{
            "type": old_matrix_info[2],
            "data": old_matrix_data,
            "name": params['output_matrix_obj_name']
        }]
    })[0]

    new_matrix_obj_ref = "%s/%s/%s" % (info[6], info[0], info[4])

    objects_created = [{'ref': new_am_ref,
                        'description': 'Updated Attribute Mapping'},
                       {'ref': new_matrix_obj_ref,
                        'description': 'Updated Matrix'}]

    report_params = {
        'message': '',
        'objects_created': objects_created,
        'workspace_name': workspace_name,
        'report_object_name': 'import_matrix_from_biom_' + str(uuid.uuid4())
    }

    kbase_report_client = KBaseReport(self.callback_url, token=self.token)
    output = kbase_report_client.create_extended_report(report_params)

    return {
        'new_matrix_obj_ref': new_matrix_obj_ref,
        'new_attribute_mapping_ref': new_am_ref,
        'report_name': output['name'],
        'report_ref': output['ref']
    }
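# Usage sketch (hypothetical): appends attributes from a staging-area TSV to
# the row AttributeMapping of a matrix, saving new copies of both objects.
# The names, UPA, and staging path are placeholders, assuming a configured
# instance `matrix_util` of the class above.
def _example_update_matrix_attribute_mapping(matrix_util):
    return matrix_util.update_matrix_attribute_mapping({
        'dimension': 'row',                            # 'row' or 'col'
        'workspace_name': 'my_workspace',
        'input_matrix_ref': '1/2/3',                   # placeholder matrix UPA
        'staging_file_subdir_path': 'attributes.tsv',  # placeholder staging path
        'output_am_obj_name': 'my_attribute_mapping',
        'output_matrix_obj_name': 'my_matrix_updated',
    })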
def find_motifs(self, ctx, params):
    """
    :param params: instance of type "find_motifs_params" (SS_ref -
       optional, used for exact genome locations if possible) ->
       structure: parameter "workspace_name" of String, parameter
       "fastapath" of String, parameter "motif_min_length" of Long,
       parameter "motif_max_length" of Long, parameter "SS_ref" of
       String, parameter "obj_name" of String, parameter "background" of Long
    :returns: instance of type "extract_output_params" -> structure:
       parameter "report_name" of String, parameter "report_ref" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN find_motifs
    if 'motif_min_length' not in params:
        params['motif_min_length'] = 8
    if 'motif_max_length' not in params:
        params['motif_max_length'] = 16
    motMin = params['motif_min_length']
    motMax = params['motif_max_length']

    promoterFastaFilePath = self.SSU.SeqSetToFasta({
        'ws_name': params['workspace_name'],
        'SS_ref': params['SS_ref']
    })['path']

    MEMEMotifCommand = self.MEU.build_meme_command(promoterFastaFilePath,
                                                   motMin, motMax,
                                                   params['background'])
    meme_out_path = self.MEU.run_meme_command(MEMEMotifCommand)

    meme_params = {
        'ws_name': params['workspace_name'],
        'format': 'MEME',
        'file': {'path': meme_out_path},
        'obj_name': params['obj_name'],
        'seq_set_ref': params['SS_ref']
    }

    # MOU.parseMotifSet with the same parameters will
    # return a dictionary of the motifset object that you save on your own
    #
    # MOU.saveMotifSet will save the object with DataFileUtils to
    # whatever workspace you specify in ws_name
    #
    # This function will also download the sequence set as a fasta to a
    # unique (uuid4) file name in the scratch directory
    obj_ref = self.MOU.saveMotifSet(meme_params)

    timestamp = str(int((datetime.utcnow() -
                         datetime.utcfromtimestamp(0)).total_seconds() * 1000))
    htmlDir = self.shared_folder + '/html' + timestamp
    os.mkdir(htmlDir)

    get_obj_params = {'object_refs': [obj_ref]}
    memeMotifSet = self.dfu.get_objects(get_obj_params)['data'][0]['data']
    self.GR.MakeMotifReport(htmlDir, memeMotifSet)

    try:
        html_upload_ret = self.dfu.file_to_shock({
            'file_path': htmlDir,
            'make_handle': 0,
            'pack': 'zip'
        })
    except Exception:
        raise ValueError('Error uploading HTML file: ' + str(htmlDir) + ' to shock')

    reportname = 'MEMEMotifFinder_report_' + str(uuid.uuid4())
    reportobj = {
        'objects_created': [{
            'ref': obj_ref,
            'description': 'Motif Set generated by MEME'
        }],
        'message': '',
        'direct_html': None,
        'direct_html_link_index': 0,
        'file_links': [],
        'html_links': [],
        'html_window_height': 220,
        'workspace_name': params['workspace_name'],
        'report_object_name': reportname
    }

    # attach to report obj
    reportobj['direct_html'] = ''
    reportobj['direct_html_link_index'] = 0
    reportobj['html_links'] = [{
        'shock_id': html_upload_ret['shock_id'],
        'name': 'index.html',
        'label': 'Save promoter_download.zip'
    }]

    report = KBaseReport(self.callback_url, token=ctx['token'])
    report_info = report.create_extended_report(reportobj)
    output = {
        'report_name': report_info['name'],
        'report_ref': report_info['ref']
    }
    #END find_motifs

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method find_motifs return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
class snp2gene:
    '''
    Module Name:
    snp2gene

    Module Description:
    A KBase module: snp2gene
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = "[email protected]:kbasecollaborations/snp2gene.git"
    GIT_COMMIT_HASH = "8dd593e96c4b37fcf91a719181389e1b04c0bb4a"

    #BEGIN_CLASS_HEADER
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.config = config
        self.config['callback_url'] = os.environ['SDK_CALLBACK_URL']
        callback_url = self.config['callback_url']
        self.shared_folder = config['scratch']
        self.ws_url = config['workspace-url']
        self.wsc = Workspace(self.ws_url)
        self.kbr = KBaseReport(callback_url)
        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)
        #END_CONSTRUCTOR
        pass

    def annotate_gwas_results(self, ctx, params):
        """
        annotate_gwas_results:
        inputs:
            file path to gwas results
            genome object - with reference to GFF file
        outputs:
            TSV file represented by shock/handle ids
        :param params: instance of type "annotate_gwas_input" -> structure:
           parameter "gwas_result_file" of type "file_path" (A valid file
           path), parameter "genome_obj" of type "genome_ref" (KBase style
           object reference X/Y/Z @id ws KBaseGenomes.Genome)
        :returns: instance of type "annotate_gwas_output" -> structure:
           parameter "snp_to_gene_list" of type "file_path" (A valid file path)
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN annotate_gwas_results
        gene_list = GFFUtils(self.config).annotate_GWAS_results(
            params['genome_obj'], params['gwas_result_file'])
        output = {'snp_to_gene_list': gene_list}
        #END annotate_gwas_results

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method annotate_gwas_results return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def annotate_gwas_results_app(self, ctx, params):
        """
        :param params: instance of type "annotate_gwas_app_input" ->
           structure: parameter "associations" of list of type
           "association_ref" (KBase style object reference X/Y/Z @id ws
           KBaseGwasData.Associations), parameter "p_value" of String,
           parameter "prefix" of String
        :returns: instance of type "annotate_gwas_app_output" -> structure:
           parameter "report_name" of String, parameter "report_ref" of
           String, parameter "featureset_obj" of type "featureset_ref"
           (KBase style object reference X/Y/Z @id ws
           KBaseCollections.FeatureSet)
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN annotate_gwas_results_app
        print(params)

        # TODO: Handle cases where there are no significant SNPs

        objects_created = []
        for association_ref in params['associations']:
            # follow the Association -> Variation -> Genome reference chain
            variation_ref = self.wsc.get_object_subset([{
                'included': ['/variation_id'],
                'ref': association_ref
            }])[0]['data']['variation_id']
            genome_ref = self.wsc.get_object_subset([{
                'included': ['/genome_ref'],
                'ref': variation_ref
            }])[0]['data']['genome_ref']

            featureset_obj = GFFUtils2(self.config).annotate_GWAS_results(
                genome_ref, association_ref, params['workspace_name'],
                params['prefix'], params['p_value'])

            objects_created.append({
                'ref': featureset_obj,
                'description': 'FeatureSet'
            })

        # Build the new gff before doing anything
        # Download the workspace object for association one at a time
        # Filter SNPs for p-value; if no snps show up, append this to warnings
        # Build the table structure needed for snp2gene
        # Run snp2gene algorithm and get final list.txt
        # Save as featureset. Find how to save featureset from genelist

        report_info = self.kbr.create_extended_report({
            'message': ' ',
            'objects_created': objects_created,
            'report_object_name': 'annotate_gwas_results_app_' + str(uuid.uuid4()),
            'workspace_name': params['workspace_name']
        })
        output = dict()
        output['report_name'] = report_info['name']
        output['report_ref'] = report_info['ref']
        print(output)
        #END annotate_gwas_results_app

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method annotate_gwas_results_app return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
def link_reads(self, ctx, params):
    """
    :param params: instance of type "LinkReadsParams" -> structure:
       parameter "workspace_name" of String, parameter "workspace_id" of
       String, parameter "sample_set_ref" of String, parameter "links" of
       list of type "ReadsLink" (Create links between samples and reads
       objects.) -> structure: parameter "sample_name" of String,
       parameter "reads_ref" of String
    :returns: instance of type "LinkReadsOutput" -> structure: parameter
       "report_name" of String, parameter "report_ref" of String,
       parameter "links" of list of unspecified object
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN link_reads
    logging.info(params)

    ss = SampleService(self.sample_url)

    sample_set_ref = params['sample_set_ref']
    sample_set_obj = self.dfu.get_objects(
        {'object_refs': [sample_set_ref]})['data'][0]['data']
    sample_name_2_info = {d['name']: d for d in sample_set_obj['samples']}

    links = [(d['sample_name'][0], d['reads_ref']) for d in params['links']]

    new_data_links = []
    for sample_name, reads_ref in links:
        sample_id = sample_name_2_info[sample_name]['id']
        version = sample_name_2_info[sample_name]['version']
        sample = ss.get_sample({
            'id': sample_id,
            'version': version,
        })
        ret = ss.create_data_link(dict(
            upa=reads_ref,
            id=sample_id,
            version=version,
            node=sample['node_tree'][0]['id'],
            update=1,
        ))
        new_data_links.append(ret)

    report_client = KBaseReport(self.callback_url)
    report_info = report_client.create_extended_report({
        'workspace_name': params['workspace_name'],
    })
    output = {
        'report_name': report_info['name'],
        'report_ref': report_info['ref'],
        'links': new_data_links,
    }
    #END link_reads

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method link_reads return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
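# Usage sketch (hypothetical): each entry in `links` pairs a sample name from
# the SampleSet with a reads object UPA. Note `sample_name` is passed as a
# one-element list, matching the d['sample_name'][0] access above. All refs
# are placeholders, assuming a configured Impl instance and context.
def _example_link_reads(impl, ctx):
    return impl.link_reads(ctx, {
        'workspace_name': 'my_workspace',
        'sample_set_ref': '1/2/3',  # placeholder SampleSet UPA
        'links': [
            {'sample_name': ['sample_A'], 'reads_ref': '1/4/5'},
            {'sample_name': ['sample_B'], 'reads_ref': '1/6/7'},
        ],
    })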
def run_FAPROTAX(self, ctx, params):
    """
    This example function accepts any number of parameters and returns results in a KBaseReport
    :param params: instance of mapping from String to unspecified object
    :returns: instance of type "ReportResults" -> structure: parameter
       "report_name" of String, parameter "report_ref" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_FAPROTAX
    logging.info(params)

    # carry over into globals `Var`, regardless of resetting, for all API-method runs
    Var.update({
        'params': Params(params),
        'shared_folder': self.shared_folder,
        # contains environment, for constructing Genome landing page url
        'kbase_endpoint': self.config['kbase-endpoint'],
        #---
        'ws': Workspace(self.workspace_url),
        'dfu': DataFileUtil(self.callback_url),  # instantiate here so within runtime of @patch
        'kbr': KBaseReport(self.callback_url),  # instantiate here so within runtime of @patch
        'gapi': GenericsAPI(self.callback_url),
        'fpu': FunctionalProfileUtil(self.callback_url, service_ver='beta'),  # TODO overhead?
        #---
        'warnings': [],
        #---
        'run_dir': os.path.join(self.shared_folder, 'kbfptx_' + str(uuid.uuid4())),
    })
    os.mkdir(Var.run_dir)

    Var.update({
        'return_dir': os.path.join(Var.run_dir, 'return'),
    })
    os.mkdir(Var.return_dir)

    # ## ### detect input type #### #####

    oi = Var.ws.get_object_info3(
        {'objects': [{'ref': params['input_upa']}]})['infos'][0]

    if oi[2].startswith('KBaseSearch.GenomeSet'):
        output = do_GenomeSet_workflow()
    elif oi[2].startswith('KBaseMatrices.AmpliconMatrix'):
        output = do_AmpliconMatrix_workflow()
    else:
        raise Exception('Unknown type `%s` for `input_upa`' % oi[2])
    #END run_FAPROTAX

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method run_FAPROTAX return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def find_motifs(self, ctx, params):
    """
    :param params: instance of type "find_motifs_params" -> structure:
       parameter "workspace_name" of String, parameter "fastapath" of
       String, parameter "motif_min_length" of Long, parameter
       "motif_max_length" of Long, parameter "SS_ref" of String,
       parameter "obj_name" of String
    :returns: instance of type "extract_output_params" -> structure:
       parameter "report_name" of String, parameter "report_ref" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN find_motifs
    if 'motif_length' not in params:
        params['motif_length'] = 8
    motLen = params['motif_length']

    promoterFastaFilePath = params['fastapath']

    MDU = MdscanUtil()
    MdscanMotifCommand = MDU.build_mdscan_motif_command(promoterFastaFilePath,
                                                        motLen,
                                                        params['background'])
    MDU.run_mdscan_command(MdscanMotifCommand)

    mdscan_out_path = '/kb/module/work/tmp/mdscan_out'
    mdscan_params = {'ws_name': params['workspace_name'],
                     'path': mdscan_out_path,
                     'obj_name': params['obj_name']}

    MOU = MotifUtils(self.callback_url)
    dfu = DataFileUtil(self.callback_url)

    obj_ref = MDU.UploadFromMdscan(self.callback_url, mdscan_params)[0]['obj_ref']
    MDU.write_obj_ref(mdscan_out_path, obj_ref)

    timestamp = str(int((datetime.utcnow() -
                         datetime.utcfromtimestamp(0)).total_seconds() * 1000))
    htmlDir = self.shared_folder + '/html' + timestamp
    os.mkdir(htmlDir)

    # count fasta records (two lines per record)
    lineCount = 0
    with open(promoterFastaFilePath, 'r') as pFile:
        for line in pFile:
            lineCount += 1
    numFeat = lineCount / 2

    with open(promoterFastaFilePath, 'r') as pFile:
        fileStr = pFile.read()
    promHtmlStr = '<html><body> ' + fileStr + ' </body></html>'
    with open(htmlDir + '/promoters.html', 'w') as promHTML:
        promHTML.write(promHtmlStr)

    get_obj_params = {'object_refs': [obj_ref]}
    mdscanMotifSet = dfu.get_objects(get_obj_params)['data'][0]['data']

    mr = MakeNewReport()
    mr.MakeReport(htmlDir, mdscanMotifSet)

    try:
        html_upload_ret = dfu.file_to_shock({'file_path': htmlDir,
                                             'make_handle': 0,
                                             'pack': 'zip'})
    except Exception:
        raise ValueError('error uploading HTML file to shock')

    reportName = 'MdscanMotifFinder_report_' + str(uuid.uuid4())
    reportObj = {'objects_created': [{'ref': obj_ref,
                                      'description': 'Motif Set generated by Mdscan'}],
                 'message': '',
                 'direct_html': None,
                 'direct_html_link_index': 0,
                 'file_links': [],
                 'html_links': [],
                 'html_window_height': 220,
                 'workspace_name': params['workspace_name'],
                 'report_object_name': reportName}

    # attach to report obj
    reportObj['direct_html'] = ''
    reportObj['direct_html_link_index'] = 0
    reportObj['html_links'] = [{'shock_id': html_upload_ret['shock_id'],
                                'name': 'index.html',
                                'label': 'Save promoter_download.zip'}]

    report = KBaseReport(self.callback_url, token=ctx['token'])
    report_info = report.create_extended_report(reportObj)
    output = {'report_name': report_info['name'],
              'report_ref': report_info['ref']}
    #END find_motifs

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method find_motifs return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def find_motifs(self, ctx, params):
    """
    :param params: instance of type "find_motifs_params" (SS_ref -
       optional, used for exact genome locations if possible) ->
       structure: parameter "workspace_name" of String, parameter
       "fastapath" of String, parameter "motif_min_length" of Long,
       parameter "motif_max_length" of Long, parameter "SS_ref" of String
    :returns: instance of type "extract_output_params" -> structure:
       parameter "report_name" of String, parameter "report_ref" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN find_motifs
    if 'motif_min_length' not in params:
        params['motif_min_length'] = 8
    if 'motif_max_length' not in params:
        params['motif_max_length'] = 16
    motMin = params['motif_min_length']
    motMax = params['motif_max_length']

    promoterFastaFilePath = params['fastapath']

    # build and run one Gibbs command per motif length in [motMin, motMax],
    # stepping by 2
    gibbsCommandList = []
    for i in range(motMin, motMax + 1, 2):
        gibbsCommandList.append(
            GU.build_gibbs_command(promoterFastaFilePath, i))
    for g in gibbsCommandList:
        GU.run_gibbs_command(g)

    gibbs_out_path = '/kb/module/work/tmp/gibbs'
    gibbs_params = {
        'ws_name': params['workspace_name'],
        'path': gibbs_out_path,
        'obj_name': params['obj_name']
    }

    MOU = MotifUtils(self.callback_url)
    dfu = DataFileUtil(self.callback_url)

    # if a SequenceSet is supplied, collect absolute genome locations
    locDict = {}
    if 'SS_ref' in params:
        get_ss_params = {'object_refs': [params['SS_ref']]}
        SS = dfu.get_objects(get_ss_params)['data'][0]['data']
        for s in SS['sequences']:
            if s['source'] is not None:
                locDict['sequence_id'] = {
                    'contig': s['source']['location'][0][0],
                    'start': str(s['source']['location'][0][1])
                }
    if len(locDict.keys()) > 0:
        gibbs_params['absolute_locations'] = locDict

    gibbs_params['min_len'] = motMin
    gibbs_params['max_len'] = motMax
    obj_ref = MOU.UploadFromGibbs(gibbs_params)['obj_ref']

    # we've got the object ref and the html building functions:
    # build the report, set up the return, make the report and return it
    timestamp = str(int((datetime.utcnow() -
                         datetime.utcfromtimestamp(0)).total_seconds() * 1000))
    htmlDir = self.shared_folder + '/html' + timestamp
    os.mkdir(htmlDir)

    # count fasta records (two lines per record)
    lineCount = 0
    with open(promoterFastaFilePath, 'r') as pFile:
        for line in pFile:
            lineCount += 1
    numFeat = lineCount / 2

    with open(promoterFastaFilePath, 'r') as pFile:
        fileStr = pFile.read()
    promHtmlStr = '<html><body> ' + fileStr + ' </body></html>'
    with open(htmlDir + '/promoters.html', 'w') as promHTML:
        promHTML.write(promHtmlStr)

    get_obj_params = {'object_refs': [obj_ref]}
    gibbsMotifSet = dfu.get_objects(get_obj_params)['data'][0]['data']
    MakeReport(htmlDir, gibbsMotifSet)
    # TODO: replace MakeReport with a call to motifset utils

    try:
        html_upload_ret = dfu.file_to_shock({
            'file_path': htmlDir,
            'make_handle': 0,
            'pack': 'zip'
        })
    except Exception:
        raise ValueError('error uploading HTML file to shock')

    # TODO: create a MotifSet object from the motif list (with correct
    # parameters and narrative support) and save it via DataFileUtil
    # save_objects, type 'KBaseGwasData.MotifSet'

    reportName = 'GibbsMotifFinder_report_' + str(uuid.uuid4())
    reportObj = {
        'objects_created': [{
            'ref': obj_ref,
            'description': 'Motif Set generated by Gibbs'
        }],
        'message': '',
        'direct_html': None,
        'direct_html_link_index': 0,
        'file_links': [],
        'html_links': [],
        'html_window_height': 220,
        'workspace_name': params['workspace_name'],
        'report_object_name': reportName
    }

    # attach to report obj
    reportObj['direct_html'] = ''
    reportObj['direct_html_link_index'] = 0
    reportObj['html_links'] = [{
        'shock_id': html_upload_ret['shock_id'],
        'name': 'index.html',
        'label': 'Save promoter_download.zip'
    }]

    report = KBaseReport(self.callback_url, token=ctx['token'])
    report_info = report.create_extended_report(reportObj)
    output = {
        'report_name': report_info['name'],
        'report_ref': report_info['ref']
    }
    #END find_motifs

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method find_motifs return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def ad_vina(self, ctx, params):
    """
    This example function accepts any number of parameters and returns results in a KBaseReport
    :param params: instance of mapping from String to unspecified object
    :returns: instance of type "ReportResults" -> structure: parameter
       "report_name" of String, parameter "report_ref" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN ad_vina
    VarStash.update({'ctx': ctx, 'workspace_id': params['workspace_id']})

    ctx_censored = ctx.copy()
    ctx_censored.pop('token')
    dprint('params', 'ctx_censored', run=locals())

    ## param validation and defaulting

    params_search_space = params['search_space']

    ## if center is specified, it must be completely specified
    key_center_l = ['center_' + ch for ch in list('xyz')]
    center_xyz = [params_search_space[key] for key in key_center_l]
    if any(center_xyz) and not all(center_xyz):
        raise ValueError(
            'INPUT ERROR: '
            'If any of center (i.e., center_x, center_y, center_z) is specified, '
            'all of center must be specified. '
            'Please try again')

    """
    ## must specify center to specify any of size
    key_size_l = ['size_' + ch for ch in list('xyz')]
    size_xyz = [params_search_space[key] for key in key_size_l]
    if not all(center_xyz) and any(size_xyz):
        raise ValueError(
            "INPUT ERROR: "
            "Must completely specify center (i.e., center_x, center_y, center_z) "
            "before specifying any of size (i.e., size_x, size_y, size_z). "
            "(Also, if any of size is unspecified, it will default to 30 Angstroms.) "
            "Please try again")
    """

    ## if center is specified, fill in the default size
    size_default = 30  # Angstroms
    key_size_l = ['size_' + ch for ch in list('xyz')]
    size_xyz = [params_search_space[key] for key in key_size_l]
    if all(center_xyz) and not all(size_xyz):
        for key_size in key_size_l:
            if not params_search_space.get(key_size):
                params_search_space[key_size] = size_default

    ## dl

    ps = ProteinStructure(params['pdb_ref'])
    ps.calc_center_size()
    ps.convert_to_pdbqt()

    cs = CompoundSet(params['ligand_list_ref'])
    cs.split_multiple_models()

    dprint(ChemKBaseObj.created_instances)

    ## params / run

    params_static = {'cpu': 4}
    params_default = {
        'num_modes': 1000,
        'energy_range': 10,
        'exhaustiveness': 20,
    }

    key_search_space_l = key_center_l + key_size_l
    key_misc_l = ['num_modes', 'energy_range', 'seed', 'exhaustiveness']

    out_pdbqt_filename_l = []
    log_filename_l = []

    ## for each ligand
    for ligand_name, ligand_pdbqt_filepath in zip(cs.pdbqt_compound_l,
                                                  cs.pdbqt_filepath_l):
        if params.get('skip_vina'):
            break

        run_name = ligand_name + '_vs_' + ps.name
        out_pdbqt_filename_l.append(run_name + '.pdbqt')
        log_filename_l.append(run_name + '.log')

        ## set up default params
        params_vina = {
            'receptor': ps.pdbqt_filepath,
            'ligand': ligand_pdbqt_filepath,
            'log': run_name + '.log',
            'out': run_name + '.pdbqt',
            **params_static,
            **params_default
        }
        for space_coords_name in ['center', 'size']:
            space_coords = getattr(ps, space_coords_name)
            for k, v in zip(list('xyz'), space_coords):
                params_vina[space_coords_name + '_' + k] = v

        ## check for search_space and misc params
        for key in key_misc_l:
            if params.get(key):
                params_vina[key] = params[key]
        for key in key_search_space_l:
            if params_search_space.get(key):
                params_vina[key] = params_search_space[key]

        ## assemble the vina command line from the collected params
        cmd = 'vina'
        for param, arg in params_vina.items():
            cmd += ' --' + param + ' ' + str(arg)

        retcode, stdout, stderr = dprint(
            cmd, run='cli',
            subproc_run_kwargs={'cwd': VarStash.shared_folder})

        if retcode != 0:
            raise RuntimeError(
                f"AutoDock terminated abnormally with error message: "
                f"[{stderr}] "
                "You can check logs (click 'Job Status' tab in upper right of cell) "
                "for more information")

        if params.get('skip_most_vina'):
            break

    ## html

    hb = HTMLBuilder(ps, cs)

    ## return directories

    def dir_to_shock(dir_path, name, description):
        '''
        For regular directories or html directories

        name - for regular directories: the name of the flat file returned to ui
               for html directories: the name of the html file
        '''
        dfu_fileToShock_ret = VarStash.dfu.file_to_shock({
            'file_path': dir_path,
            'make_handle': 0,
            'pack': 'zip',
        })
        dir_shockInfo = {
            'shock_id': dfu_fileToShock_ret['shock_id'],
            'name': name,
            'description': description
        }
        return dir_shockInfo

    # return files
    dir_retFiles_path = os.path.join(self.shared_folder, 'pdbqt_log_dir')
    os.mkdir(dir_retFiles_path)
    for filename in out_pdbqt_filename_l + log_filename_l:
        shutil.copyfile(os.path.join(self.shared_folder, filename),
                        os.path.join(dir_retFiles_path, filename))

    # so DataFileUtil doesn't crash over zipping an empty folder
    if len(os.listdir(dir_retFiles_path)) == 0:
        dprint(
            rf"echo 'Sorry, no files were generated' > "
            rf"{os.path.join(dir_retFiles_path, 'README')}",
            run='cli')

    dir_retFiles_shockInfo = dir_to_shock(
        dir_retFiles_path, 'pdbqt_log.zip',
        'Generated .pdbqt and log files')

    # html
    html_shockInfo = dir_to_shock(hb.html_dir, 'index.html',
                                  'HTML report for AutoDock Vina')

    ## report

    report_params = {
        'message': 'this is the report_params `message`',
        'warnings': ['this is the', 'report_params `warnings`'],
        'direct_html_link_index': 0,
        'html_links': [html_shockInfo],
        'file_links': [dir_retFiles_shockInfo],
        'report_object_name': 'autodock_vina' + self.suffix,
        'workspace_name': params['workspace_name'],
    }

    kbr = KBaseReport(self.callback_url)
    report_output = kbr.create_extended_report(report_params)

    output = {
        'report_name': report_output['name'],
        'report_ref': report_output['ref'],
    }
    #END ad_vina

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method ad_vina return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def find_motifs(self, ctx, params):
    """
    :param params: instance of type "find_motifs_params" -> structure:
       parameter "workspace_name" of String, parameter "fastapath" of
       String, parameter "prb" of Double, parameter "motif_length" of
       Long, parameter "obj_name" of String, parameter "mask_repeats" of Long
    :returns: instance of type "extract_output_params" -> structure:
       parameter "report_name" of String, parameter "report_ref" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN find_motifs
    if 'motif_length' not in params:
        params['motif_length'] = 20
    if 'prb' not in params:
        params['prb'] = 0.05
    motlen = params['motif_length']
    prb = params['prb']

    FastaFilePath = params['fastapath']

    mfu = mfmdUtil()
    mfmdMotifCommand = mfu.build_mfmd_command(FastaFilePath, motlen, prb,
                                              self.config)
    mfu.run_mfmd_command(mfmdMotifCommand)

    mfmd_out_path = '/kb/module/work/tmp/mfmd_out'
    mfmd_params = {
        'ws_name': params['workspace_name'],
        'path': mfmd_out_path,
        'location_path': mfmd_out_path + '/mfmd_out.txt',
        'obj_name': params['obj_name']
    }

    MOU = MotifUtils(self.callback_url)
    dfu = DataFileUtil(self.callback_url)

    # if a SequenceSet is supplied, collect absolute genome locations
    locDict = {}
    if 'SS_ref' in params:
        get_ss_params = {'object_refs': [params['SS_ref']]}
        SS = dfu.get_objects(get_ss_params)['data'][0]['data']
        for s in SS['sequences']:
            if s['source'] is not None:
                locDict['sequence_id'] = {
                    'contig': s['source']['location'][0][0],
                    'start': str(s['source']['location'][0][1])
                }
    if len(locDict.keys()) > 0:
        mfmd_params['absolute_locations'] = locDict

    mfmd_params['motlen'] = motlen
    mfmd_params['prb'] = prb

    obj_ref = mfu.UploadFrommfmd(self.callback_url, mfmd_params)[0]['obj_ref']
    mfu.write_obj_ref(mfmd_out_path, obj_ref)

    timestamp = str(int((datetime.utcnow() -
                         datetime.utcfromtimestamp(0)).total_seconds() * 1000))
    htmlDir = self.shared_folder + '/html' + timestamp
    os.mkdir(htmlDir)

    get_obj_params = {'object_refs': [obj_ref]}
    mfmdMotifSet = dfu.get_objects(get_obj_params)['data'][0]['data']

    mr = MakeNewReport()
    mr.MakeReport(htmlDir, mfmdMotifSet)

    try:
        html_upload_ret = dfu.file_to_shock({
            'file_path': htmlDir,
            'make_handle': 0,
            'pack': 'zip'
        })
    except Exception:
        raise ValueError('error uploading HTML file to shock')

    reportName = 'mfmdMotifFinder_report_' + str(uuid.uuid4())
    reportObj = {
        'objects_created': [{
            'ref': obj_ref,
            'description': 'Motif Set generated by mfmd'
        }],
        'message': '',
        'direct_html': None,
        'direct_html_link_index': 0,
        'file_links': [],
        'html_links': [],
        'html_window_height': 220,
        'workspace_name': params['workspace_name'],
        'report_object_name': reportName
    }

    # attach to report obj
    reportObj['direct_html'] = ''
    reportObj['direct_html_link_index'] = 0
    reportObj['html_links'] = [{
        'shock_id': html_upload_ret['shock_id'],
        'name': 'index.html',
        'label': 'Save promoter_download.zip'
    }]

    report = KBaseReport(self.callback_url, token=ctx['token'])
    report_info = report.create_extended_report(reportObj)
    output = {
        'report_name': report_info['name'],
        'report_ref': report_info['ref']
    }
    #END find_motifs

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method find_motifs return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def run_yangdar1en_ContigFilterDemo_max(self, ctx, params):
    """
    :param params: instance of mapping from String to unspecified object
    :returns: instance of type "ReportResults" -> structure: parameter
       "report_name" of String, parameter "report_ref" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_yangdar1en_ContigFilterDemo_max

    # Validate parameters
    for name in ['min_length', 'max_length', 'assembly_ref', 'workspace_name']:
        if name not in params:
            raise ValueError('Parameter "' + name + '" is required but missing')
    if not isinstance(params['min_length'], int) or (params['min_length'] < 0):
        raise ValueError('Min length must be a non-negative integer')
    if not isinstance(params['max_length'], int) or (params['max_length'] < 0):
        raise ValueError('Max length must be a non-negative integer')
    if not isinstance(params['assembly_ref'], str) or not len(params['assembly_ref']):
        raise ValueError('Pass in a valid assembly reference string')
    print(params['min_length'], params['max_length'], params['assembly_ref'])
    if params['max_length'] > 999999999:
        raise ValueError('Max length must be smaller than 999999999')

    # Download the assembly as a FASTA file
    assembly_util = AssemblyUtil(self.callback_url)
    fasta_file = assembly_util.get_assembly_as_fasta({'ref': params['assembly_ref']})

    # Parse the downloaded file in FASTA format
    parsed_assembly = SeqIO.parse(fasta_file['path'], 'fasta')
    min_length = params['min_length']
    max_length = params['max_length']

    # Keep a list of contigs whose length is within [min_length, max_length]
    good_contigs = []
    # total contigs regardless of length
    n_total = 0
    # total contigs within the length bounds
    n_remaining = 0
    for record in parsed_assembly:
        n_total += 1
        if min_length <= len(record.seq) <= max_length:
            good_contigs.append(record)
            n_remaining += 1

    # Create a file to hold the filtered data
    workspace_name = params['workspace_name']
    filtered_path = os.path.join(self.shared_folder, 'filtered.fasta')
    SeqIO.write(good_contigs, filtered_path, 'fasta')

    # Upload the filtered data to the workspace
    new_ref = assembly_util.save_assembly_from_fasta({
        'file': {'path': filtered_path},
        'workspace_name': workspace_name,
        'assembly_name': fasta_file['assembly_name']
    })

    # Create an output summary message for the report
    text_message = "".join([
        'Filtered assembly to ',
        str(n_remaining),
        ' contigs out of ',
        str(n_total)
    ])

    # Data for creating the report, referencing the assembly we uploaded
    report_data = {
        'objects_created': [
            {'ref': new_ref, 'description': 'Filtered contigs'}
        ],
        'text_message': text_message
    }

    # Initialize the report
    kbase_report = KBaseReport(self.callback_url)
    report = kbase_report.create({
        'report': report_data,
        'workspace_name': workspace_name
    })

    # Return the report reference and name in our results
    output = {
        'report_ref': report['ref'],
        'report_name': report['name'],
        'n_total': n_total,
        'n_remaining': n_remaining,
        'filtered_assembly_ref': new_ref
    }
    #END run_yangdar1en_ContigFilterDemo_max

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method run_yangdar1en_ContigFilterDemo_max return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
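# Usage sketch (hypothetical): filters an assembly to contigs whose lengths
# fall within [min_length, max_length]. The ref and workspace are placeholders,
# assuming a configured Impl instance and context; note the method itself does
# not check that min_length <= max_length.
def _example_contig_filter(impl, ctx):
    [result] = impl.run_yangdar1en_ContigFilterDemo_max(ctx, {
        'workspace_name': 'my_workspace',
        'assembly_ref': '1/2/3',  # placeholder Assembly UPA
        'min_length': 1000,
        'max_length': 1000000,
    })
    return result['n_remaining'], result['n_total']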
def run_kb_ReadSim(self, ctx, params): """ This example function accepts any number of parameters and returns results in a KBaseReport :param params: instance of type "Inparams" -> structure: parameter "workspace_name" of String, parameter "input_sample_set" of String, parameter "strain_info" of String, parameter "assembly_or_genome_ref" of String, parameter "base_error_rate" of String, parameter "outer_distance" of String, parameter "standard_deviation" of String, parameter "num_read_pairs" of String, parameter "len_first_read" of String, parameter "len_second_read" of String, parameter "mutation_rate" of String, parameter "frac_indels" of String, parameter "variation_object_name" of String, parameter "output_read_object" of String :returns: instance of type "ReportResults" -> structure: parameter "report_name" of String, parameter "report_ref" of String """ # ctx is the context object # return variables are: output #BEGIN run_kb_ReadSim output_dir = self.shared_folder print(params) self.su.validate_simreads_params(params) genome_or_assembly_ref = params['assembly_or_genome_ref'] obj_type = self.wsc.get_object_info3( {'objects': [{ 'ref': genome_or_assembly_ref }]})['infos'][0][2] if ('KBaseGenomes.Genome' in obj_type): genome_ref = genome_or_assembly_ref subset = self.wsc.get_object_subset([{ 'included': ['/assembly_ref'], 'ref': genome_ref }]) assembly_ref = subset[0]['data']['assembly_ref'] elif ('KBaseGenomeAnnotations.Assembly' in obj_type): assembly_ref = genome_or_assembly_ref else: raise ValueError(obj_type + ' is not the right input for this method. ' + 'Valid inputs include KBaseGenomes.Genome or ' + 'KBaseGenomeAnnotations.Assembly') self.du.download_genome(assembly_ref, output_dir) ref_genome = os.path.join(self.shared_folder, "ref_genome.fa") output_fwd_paired_file_path = os.path.join(self.shared_folder, "read1.fq") output_rev_paired_file_path = os.path.join(self.shared_folder, "read2.fq") self.eu.check_path_exists(ref_genome) self.su.simreads(ref_genome, output_fwd_paired_file_path, output_rev_paired_file_path, params) self.eu.check_path_exists(output_fwd_paired_file_path) self.eu.check_path_exists(output_rev_paired_file_path) retVal = self.ru.upload_reads({ 'wsname': params['workspace_name'], 'name': params['output_read_object'], 'sequencing_tech': 'illumina', 'fwd_file': output_fwd_paired_file_path, 'rev_file': output_rev_paired_file_path }) logfile = os.path.join(self.shared_folder, "variant.txt") self.eu.check_path_exists(logfile) vcf_file = self.su.format_vcf(logfile) self.eu.check_path_exists(vcf_file) save_variation_params = { 'workspace_name': params['workspace_name'], 'genome_or_assembly_ref': params['assembly_or_genome_ref'], 'sample_set_ref': params['input_sample_set'], 'sample_attribute_name': 'sample_attr', 'vcf_staging_file_path': vcf_file, 'variation_object_name': params['variation_object_name'] } self.vu.save_variation_from_vcf(save_variation_params) report = KBaseReport(self.callback_url) report_info = report.create({ 'report': { 'objects_created': [], 'text_message': 'Success' }, 'workspace_name': params['workspace_name'] }) output = { 'report_name': report_info['name'], 'report_ref': report_info['ref'], } #END run_kb_ReadSim # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError('Method run_kb_ReadSim return value ' + 'output is not type dict as required.') # return the results return [output]
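# A hedged sketch of the ref-resolution branch above: given a Genome or Assembly
# reference, return the underlying Assembly reference. `ws` is assumed to be a
# Workspace client like self.wsc; the error wording is illustrative.
def resolve_assembly_ref(ws, genome_or_assembly_ref):
    obj_type = ws.get_object_info3(
        {'objects': [{'ref': genome_or_assembly_ref}]})['infos'][0][2]
    if 'KBaseGenomes.Genome' in obj_type:
        # a Genome object carries a pointer to its Assembly; fetch just that field
        subset = ws.get_object_subset([
            {'included': ['/assembly_ref'], 'ref': genome_or_assembly_ref}])
        return subset[0]['data']['assembly_ref']
    if 'KBaseGenomeAnnotations.Assembly' in obj_type:
        return genome_or_assembly_ref
    raise ValueError(obj_type + ' is not a supported input type')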
def import_samples(self, ctx, params): """ :param params: instance of type "ImportSampleInputs" -> structure: parameter "sample_set_ref" of String, parameter "sample_file" of String, parameter "workspace_name" of String, parameter "workspace_id" of Long, parameter "file_format" of String, parameter "description" of String, parameter "set_name" of String, parameter "header_row_index" of Long, parameter "name_field" of String, parameter "output_format" of String, parameter "taxonomy_source" of String, parameter "num_otus" of Long, parameter "incl_seq" of Long, parameter "otu_prefix" of String, parameter "share_within_workspace" of Long, parameter "prevalidate" of Long, parameter "incl_input_in_output" of Long, parameter "ignore_warnings" of Long, parameter "keep_existing_samples" of Long :returns: instance of type "ImportSampleOutputs" -> structure: parameter "report_name" of String, parameter "report_ref" of String, parameter "sample_set" of type "SampleSet" -> structure: parameter "samples" of list of type "sample_info" -> structure: parameter "id" of type "sample_id", parameter "name" of String, parameter "description" of String, parameter "sample_set_ref" of String """ # ctx is the context object # return variables are: output #BEGIN import_samples print("Beginning sample import with the following parameters:") print(f"params -- {params}") sample_set = {"samples": []} # Check if we have an existing Sample Set as input # if so, download if params.get('sample_set_ref'): ret = self.dfu.get_objects( {'object_refs': [params['sample_set_ref']]})['data'][0] sample_set = ret['data'] if params.get('set_name'): set_name = params.get('set_name') else: set_name = ret['info'][1] save_ws_id = params['sample_set_ref'].split('/')[0] else: if not params.get('set_name'): raise ValueError( "A sample set name is required when a new SampleSet object is created." ) set_name = params['set_name'] save_ws_id = params.get('workspace_id') if params.get('header_row_index'): header_row_index = int(params["header_row_index"]) - 1 else: header_row_index = find_header_row(params.get('sample_file'), params.get('file_format')) username = ctx['user_id'] if str(params.get('file_format')).lower() not in [ 'enigma', 'sesar', 'kbase' ]: raise ValueError( f"Only SESAR, ENIGMA, and KBase formats are currently supported for importing samples. " f"File of format {params.get('file_format')} not supported.") mappings = { 'enigma': ENIGMA_mappings, 'sesar': SESAR_mappings, 'kbase': {} } sample_set, has_unignored_errors, errors, sample_data_json = import_samples_from_file( params, self.sample_url, self.workspace_url, self.callback_url, username, ctx['token'], mappings[str(params.get('file_format')).lower()].get('groups', []), mappings[str(params.get('file_format')).lower()].get( 'date_columns', []), mappings[str(params.get('file_format')).lower()].get( 'column_unit_regex', []), sample_set, header_row_index, aliases.get(params.get('file_format').lower(), {})) file_links = [] new_data_links = [] sample_set_ref = None # create UI to display the errors clearly html_link = _error_ui(errors, sample_data_json, has_unignored_errors, self.scratch) if not has_unignored_errors: # only save object if there are no errors obj_info = self.dfu.save_objects({ 'id': save_ws_id, 'objects': [{ "name": set_name, "type": "KBaseSets.SampleSet", "data": sample_set }] })[0] sample_set_ref = '/'.join( [str(obj_info[6]), str(obj_info[0]), str(obj_info[4])]) sample_file_name = os.path.basename( params['sample_file']).split('.')[0] + '_OTU' # create a data link between each sample and the sampleset ss = SampleService(self.sample_url) for idx, sample_info in enumerate(sample_set['samples']): sample_id = sample_info['id'] version = sample_info['version'] sample = ss.get_sample({ 'id': sample_id, 'version': version, }) ret = ss.create_data_link( dict( upa=sample_set_ref, id=sample_id, dataid='samples/{}'.format(idx), version=version, node=sample['node_tree'][0]['id'], update=1, )) new_data_links.append(ret) # -- Format outputs below -- # if output file format specified, add one to output if params.get('output_format') in ['csv', 'xls']: otu_path = sample_set_to_OTU_sheet(sample_set, sample_file_name, self.scratch, params) file_links.append({ 'path': otu_path, 'name': os.path.basename(otu_path), 'label': "OTU template file", 'description': "file with each column containing the assigned sample_id and sample " "name of each saved sample. Intended for uploading OTU data." }) if params.get('incl_input_in_output'): sample_file = params.get('sample_file') if not os.path.isfile(sample_file): # try prepending '/staging/' to file and check then if os.path.isfile(os.path.join('/staging', sample_file)): sample_file = os.path.join('/staging', sample_file) else: raise ValueError( f"Input file {sample_file} does not exist.") sample_file_copy = os.path.join(self.scratch, os.path.basename(sample_file)) shutil.copy(sample_file, sample_file_copy) file_links.append({ "path": sample_file_copy, "name": os.path.basename(sample_file_copy), "label": "Input Sample file", "description": "Input file provided to create the sample set." }) # create report report_client = KBaseReport(self.callback_url) report_data = { 'report_object_name': "SampleSet_import_report_" + str(uuid.uuid4()), 'workspace_name': params['workspace_name'] } if file_links: report_data['file_links'] = file_links if sample_set_ref: report_data['message'] = f"SampleSet object named \"{set_name}\" imported."
report_data['objects_created'] = [{'ref': sample_set_ref}] if html_link: report_data['html_links'] = [{ 'path': html_link, 'name': 'index.html', 'description': 'HTML Report for Sample Uploader' }] report_data['direct_html_link_index'] = 0 report_info = report_client.create_extended_report(report_data) output = { 'report_ref': report_info['ref'], 'report_name': report_info['name'], 'sample_set': sample_set, 'sample_set_ref': sample_set_ref, 'errors': errors, 'links': new_data_links } #END import_samples # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError('Method import_samples return value ' + 'output is not type dict as required.') # return the results return [output]
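# The positional indexing into obj_info above follows the standard KBase
# object_info tuple (0 = object id, 4 = version, 6 = workspace id); a tiny helper
# makes the UPA construction self-documenting. Purely illustrative.
def object_info_to_upa(obj_info):
    """Build a 'wsid/objid/version' UPA string from a KBase object_info tuple."""
    return '{}/{}/{}'.format(obj_info[6], obj_info[0], obj_info[4])

# object_info_to_upa([5, 'my_set', 'KBaseSets.SampleSet', '...', 3, 'user', 123])
# -> '123/5/3'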
def run_omreegalozpathway_completeness(self, ctx, params): """ This example function accepts any number of parameters and returns results in a KBaseReport :param params: instance of mapping from String to unspecified object :returns: instance of type "ReportResults" -> structure: parameter "report_name" of String, parameter "report_ref" of String """ # ctx is the context object # return variables are: output #BEGIN run_omreegalozpathway_completeness #Preparing report client report_client = KBaseReport(self.callback_url) token = os.environ.get('KB_AUTH_TOKEN', None) #Checking the input params if "main_input_ref" in params: main_input_ref = params['main_input_ref'] else: logging.info('main_input_ref is missing from params; the program must end.') raise Exception("main_input_ref not in params") #Creating the workspace client object ws = Workspace(self.ws_url, token=token) #Getting information about the main input ref obj_info = ws.get_object_info3({'objects': [{'ref': main_input_ref}]}) #Catching errors: if "infos" in obj_info: #Getting information from object reference number object_name = obj_info["infos"][0][1] object_type = obj_info["infos"][0][2] ws_name = obj_info["infos"][0][7] #Logging: logging.debug("Object Type: " + object_type) logging.debug("Object Name: " + object_name) logging.debug("Workspace Name: " + ws_name) else: logging.info( "The function ws.get_object_info3 failed to return the expected information. The program must abort." ) raise Exception("Could not find infos in obj_info") #We create the output file name and add information to it later. output_file_name = 'pathways_measurements' #This part is a hack, need to check type of data more accurately. if object_type[:17] == 'KBaseFBA.FBAModel': logging.info("Successfully recognized type as FBA Model") #Preparing the output file name which we return to the user output_file_name += '_fba_model' #Creating an fba tools object fba_t = fba_tools(self.callback_url) # Getting the TSV file from the object X = fba_t.export_model_as_tsv_file({"input_ref": main_input_ref}) # Logging logging.info( "the object output from fba tools export model as tsv file:") logging.info(X) #Locating where the reactions tsv was placed (Not well done- replace this with a robust form) reactions_file_path = os.path.join( self.shared_folder, object_name + '/' + object_name + '-reactions.tsv') #Preparing an output path for a future function output_path = os.path.join(self.shared_folder, output_file_name + '.tsv') #This function performs the percentage calculation work for FBAModel Object Types. html_path = reactions_file_to_pathway_reactions_and_percentages( reactions_file_path, output_path, object_name) # Using KBase Gene Families- Domain Annotation elif object_type[:34] == "KBaseGeneFamilies.DomainAnnotation": logging.info("Successfully recognized type as Domain Annotation") output_file_name += '_domain_annotation' #We get the object using workspace's get_objects2 function obj = ws.get_objects2({'objects': [{'ref': main_input_ref}]}) #Within the way the object dictionary is given, what we are looking for is in the location as follows: Y = obj['data'][0]['data']['data'] #Preparing the output file path for the Domain Annotation branch
output_file_path = os.path.join(self.shared_folder, output_file_name + '.tsv') #This function (written for the module) finds percentages of pathway completeness. html_path = TIGRFAM_file_to_pathway_reactions_and_percentages( Y, output_file_path, object_name) else: logging.info("Object type unknown") raise Exception( "Could not recognize ref to object- Check if object is FBA Model or Domain Annotation type. If so, the error is in the program, not the input - contact [email protected]." ) html_dict = [{"path": html_path, "name": 'Completeness_Table'}] #Preparing final report: report = report_client.create_extended_report({ 'direct_html_link_index': 0, 'message': 'Here are the pathway completeness results', 'workspace_name': ws_name, 'html_links': html_dict }) output = { 'report_name': report['name'], 'report_ref': report['ref'], } #END run_omreegalozpathway_completeness # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError( 'Method run_omreegalozpathway_completeness return value ' + 'output is not type dict as required.') # return the results return [output]
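# The prefix-slicing type checks above are flagged as a hack in the code's own
# comments; here is a sketch of a sturdier dispatch over the same two supported
# types. KBase type strings look like 'KBaseFBA.FBAModel-14.0', so startswith()
# on the unversioned name suffices.
def classify_input_type(object_type):
    if object_type.startswith('KBaseFBA.FBAModel'):
        return 'fba_model'
    if object_type.startswith('KBaseGeneFamilies.DomainAnnotation'):
        return 'domain_annotation'
    raise ValueError('Unsupported object type: ' + object_type)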
def run_CompMolNWChem(self, ctx, params): """ This example function accepts any number of parameters and returns results in a KBaseReport :param params: instance of mapping from String to unspecified object :returns: instance of type "ReportResults" -> structure: parameter "report_name" of String, parameter "report_ref" of String """ # ctx is the context object # return variables are: output #BEGIN run_CompMolNWChem # Initial Tests to Check for Proper Inputs for name in ['Input_File','calculation_type','workspace_name']: if name not in params: raise ValueError('Parameter "' + name + '"is required but missing') if not isinstance(params['Input_File'], str): raise ValueError('Input_File must be a string') # Load the tsv file into a compound set using DataFileUtil methods scratch_file_path = self.dfu.download_staging_file({'staging_file_subdir_path':params['Input_File']} ).get('copy_file_path') #print('Scratch File Path: ',scratch_file_path) mol2_file_dir = None ext = os.path.splitext(scratch_file_path)[1] file_name = os.path.basename(scratch_file_path) if ext == '.sdf': compounds = parse.read_sdf(scratch_file_path, mol2_file_dir=mol2_file_dir, callback_url=self.callback_url) elif ext == '.tsv': compounds = parse.read_tsv(scratch_file_path, mol2_file_dir=mol2_file_dir, callback_url=self.callback_url) #elif ext == '.csv': # compounds = parse.read_csv(scratch_file_path, # mol2_file_dir=mol2_file_dir, # callback_url=self.callback_url) #else: # raise ValueError('Invalid input file type. Expects .tsv or .sdf') #DEBUG:: #print('Compounds:',compounds) # compoundset = { # 'id': params['Input_File'], # 'name': params['Input_File'], # 'description': 'Compound Set produced from %s' % file_name, # 'compounds': compounds, # } # Finish Reading in Compound Set # Read ids and smiles from compound set for nwchem input # ids = [] # smiles = [] # for d in compounds: # ids.append(d['id']) # smiles.append(d['smiles']) #print(ids) #print(smiles) # Read the ids and structures of the compounds # its.inchi_to_dft(ids,smiles) #DEBUG:: #os.system('pwd') #os.system('ls') # length = len(ids) # for i in range(length): # os.chdir('./'+ids[i]+'/dft') # x = ids[i] + '_nwchem.out' #print('x:',x) # file1 = open(x, 'r') # nAtoms = mul.getNumberOfAtoms(file1) # energy = mul.getInternalEnergy0K(file1) # charge =mul.getMullikenCharge(file1,nAtoms) # file1.close() # mul.nAtoms = nAtoms # mul.E0K = energy # mul.calculate(ids[i]) from snakemake import snakemake reactionlist = scratch_file_path id_to_smiles = {} data = open('/kb/module/modelseed_test.csv','r') for lines in data.readlines(): id = lines.split(',')[0] smiles = lines.split(',')[1].rstrip() id_to_smiles[id] = smiles data.close() with open(reactionlist,'r') as f: reactions = f.readlines()[0].rstrip() reactant = reactions.split('=')[0].split('+') product = reactions.split('=')[1].split('+') metabolites = [] for each in reactant: each = each.strip() metabolites.append(each) for each in product: each = each.strip() metabolites.append(each) for molecule in metabolites: moldir = molecule if not os.path.exists(moldir): os.mkdir(moldir) initial_structure_dir = moldir + '/initial_structure' if not os.path.exists(initial_structure_dir): os.mkdir(initial_structure_dir) md_structure_dir = moldir + '/md' if not os.path.exists(md_structure_dir): os.mkdir(md_structure_dir) dft_structure_dir = moldir + '/dft' if not os.path.exists(dft_structure_dir): os.mkdir(dft_structure_dir) inchifile_str = initial_structure_dir + '/' + moldir + '.smiles' with open(inchifile_str,'w+') as f: 
f.write(id_to_smiles[moldir]) os.system('snakemake -p --cores 3 --snakefile snakemake-scripts/final_pipeline.snakemake -w 12000') # Build KBase Output. Should output entire /simulation directory and build a CompoundSet with Mol2 Files #result_directory = '/kb/module/snakemake-scripts' result_directory = '/kb/module/' ## Build CompoundSet with Mol2 Files... similarly to fetch_mol2_files_from_zinc (CompoundSetUtils).... # compoundset_copy = copy.deepcopy(compoundset) # count = 0 # for compound in compoundset_copy.get('compounds'): # if not compound.get('mol2_handle_ref'): # mol2_file_path = result_directory+compound.get('id') # SMILES = compound.get('smiles') # shutil.move(mol2_file_path,self.scratch) # os.chdir(self.scratch) # mol2_file_path = self.scratch + '/'+ compound.get('id')+'/dft/' + compound.get('id')+'_Mulliken.mol2' # handle_id = self.dfu.file_to_shock({'file_path': mol2_file_path, # 'make_handle': True})['handle']['hid'] # print('Handle ID:',handle_id) # compound['mol2_handle_ref'] = handle_id # count += 1 # if count: # message = 'Successfully fetched {} Mol2 files from Staging Path'.format(count) ## Create Extended Report output_files = self._generate_output_file_list(result_directory) report_params = { 'message': '', 'workspace_id': params['workspace_id'], 'objects_created': [], 'file_links': output_files, 'report_object_name': 'CompMolNWChem_report_' + str(uuid.uuid4())} report = KBaseReport(self.callback_url) report_info = report.create_extended_report(report_params) output = { 'report_name': report_info['name'], 'report_ref': report_info['ref'], } #END run_CompMolNWChem # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError('Method run_CompMolNWChem return value ' + 'output is not type dict as required.') # return the results return [output]
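# A dependency-free sketch of the reaction-line parsing done above, assuming a
# single 'A + B = C + D' style line as read from the reaction list file.
def parse_reaction_line(line):
    """Split an 'A + B = C + D' reaction string into a flat metabolite list."""
    reactants, products = line.rstrip().split('=')
    return [m.strip() for side in (reactants, products) for m in side.split('+')]

# parse_reaction_line('cpd00001 + cpd00002 = cpd00008 + cpd00009')
# -> ['cpd00001', 'cpd00002', 'cpd00008', 'cpd00009']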
def _save_output_to_kbase(self, io_params, app_params, output_dir, run_log, run_command): # TODO: insert the run_command into the output log # # read the output file list file_lookup = self._read_outputfile(os.path.join(output_dir, 'file-list.txt')) # save the new reads mapped_reads_ref = None unmapped_reads_ref = None objects_created = [] if 'mapped_reads_files' not in file_lookup: print('No mapped reads fastq file found in output. Not creating any mapped reads objects.') else: for file_name in file_lookup['mapped_reads_files'].split(','): mapped_reads_path = os.path.join(output_dir, file_name) mapped_reads_ref = upload_interleaved_reads( self.callback_url, mapped_reads_path, io_params['workspace_name'], file_name+'.reads', io_params.get('in_readslib_ref')) objects_created.append({ 'ref': mapped_reads_ref, 'description': 'Mapped reads library' }) if 'unmapped_reads_files' not in file_lookup: print('No unmapped reads fastq file found in output. Not creating any unmapped reads objects.') else: for file_name in file_lookup['unmapped_reads_files'].split(','): unmapped_reads_path = os.path.join(output_dir, file_name) unmapped_reads_ref = upload_interleaved_reads( self.callback_url, unmapped_reads_path, io_params['workspace_name'], file_name+'.reads', io_params.get('in_readslib_ref')) objects_created.append({ 'ref': unmapped_reads_ref, 'description': 'Unmapped reads library' }) # build the HTML report html_zipped = self._build_html_report(io_params.get('in_readslib_ref'), output_dir, file_lookup) file_links = self._build_file_report(output_dir, run_log) # save the report report_params = { 'message': '', 'objects_created': objects_created, 'direct_html_link_index': 0, 'html_links': [html_zipped], 'file_links': file_links, 'report_object_name': 'bbtools_bbmap_report_' + str(uuid.uuid4()), 'workspace_name': io_params['workspace_name'] } kr = KBaseReport(self.callback_url) report_output = kr.create_extended_report(report_params) return {'report_name': report_output['name'], 'report_ref': report_output['ref'], 'run_command': run_command}
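# _read_outputfile is defined elsewhere in the module; judging from the lookups
# above it returns a dict whose values are comma-separated file names. A plausible
# reader for a 'key=value' style file-list.txt is sketched here purely as an
# assumption about that format.
def read_output_file_list(path):
    lookup = {}
    with open(path) as fh:
        for line in fh:
            if '=' in line:
                key, value = line.strip().split('=', 1)
                lookup[key] = value
    return lookup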
def generate_report(callback_url, token, workspace_name, shared_folder: Path, virmatcher_output: Path): """ :param callback_url: :param token: Job token :param workspace_name: Workspace name :param shared_folder: KBase working directory on the node, used to save the HTML file :param virmatcher_output: VirMatcher proper final results directory, should have the summary file :return: """ html_template = Template("""<!DOCTYPE html> <html lang="en"> <head> <link href="https://netdna.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap.min.css" rel="stylesheet"> <link href="https://cdn.datatables.net/1.10.22/css/jquery.dataTables.min.css" rel="stylesheet"> <link href="https://cdn.datatables.net/buttons/1.5.2/css/buttons.dataTables.min.css" rel="stylesheet"> <link href="https://cdn.datatables.net/searchpanes/1.2.0/css/searchPanes.dataTables.min.css" rel="stylesheet"> <link href="https://cdn.datatables.net/select/1.3.1/css/select.dataTables.min.css" rel="stylesheet"> <script src="https://code.jquery.com/jquery-3.5.1.js" type="text/javascript"></script> <script src="https://cdn.datatables.net/1.10.22/js/jquery.dataTables.min.js" type="text/javascript"></script> <script src="https://cdn.datatables.net/buttons/1.6.4/js/dataTables.buttons.min.js" type="text/javascript"></script> <script src="https://cdn.datatables.net/buttons/1.6.4/js/buttons.flash.min.js" type="text/javascript"></script> <script src="https://cdnjs.cloudflare.com/ajax/libs/jszip/3.1.3/jszip.min.js" type="text/javascript"></script> <script src="https://cdnjs.cloudflare.com/ajax/libs/pdfmake/0.1.53/pdfmake.min.js" type="text/javascript"></script> <script src="https://cdnjs.cloudflare.com/ajax/libs/pdfmake/0.1.53/vfs_fonts.js" type="text/javascript"></script> <script src="https://cdn.datatables.net/buttons/1.6.4/js/buttons.html5.min.js" type="text/javascript"></script> <script src="https://cdn.datatables.net/buttons/1.6.4/js/buttons.print.min.js" type="text/javascript"></script> <script src="https://cdn.datatables.net/searchpanes/1.2.0/js/dataTables.searchPanes.min.js" type="text/javascript"></script> <script src="https://cdn.datatables.net/select/1.3.1/js/dataTables.select.min.js" type="text/javascript"></script> <style> tfoot input { width: 100%; padding: 3px; box-sizing: border-box; } </style> </head> <body> <div class="container"> <div> ${html_table} </div> </div> <script type="text/javascript"> $$(document).ready(function() { $$('#my_id tfoot th').each( function () { var title = $$(this).text(); $$(this).html( '<input type="text" placeholder="Search '+title+'" />' ); }); var table = $$('#my_id').DataTable({ buttons: [ 'copy', 'csv', 'excel', 'pdf', 'print'], scrollX: true, dom: 'lBfrtip' //P to B disables panes }); table.columns().every( function () { var that = this; $$( 'input', this.footer() ).on( 'keyup change', function () { if ( that.search() !== this.value ) { that .search( this.value ) .draw(); } }); } ); } ); </script> </body> </html>""") report = KBaseReport(callback_url, token=token) dfu = DataFileUtil(callback_url, token=token) virmatcher_fp = virmatcher_output / 'VirMatcher_Summary_Predictions.tsv' virmatcher_df = pd.read_csv(virmatcher_fp, header=0, index_col=None, delimiter='\t') # Set column ordering order = [ 'Original Viral population', 'Original Host', 'Final_score', 'CRISPR score', 'Prophage blast score', 'WIsH score', 'Number of CRISPR matches', 'Max number of end mismatches detected in any CRISPR spacer', 'Prophage blast percent identity', 'Prophage blast viral contig coverage', 'tRNA match', 'Max number of end 
mismatches detected in host tRNA', 'Non-promiscuous tRNA match score', 'WIsH p-value', # 'LogLikelihood', 'Viral population', 'Predicted host', ] order = [ele for ele in order if ele in virmatcher_df.columns.tolist()] # Some may not exist! virmatcher_df = virmatcher_df[order] html = virmatcher_df.to_html(index=False, classes='my_class table-striped" id = "my_id') # Need to file write below direct_html = html_template.substitute(html_table=html) # Find header so it can be copied to footer, as dataframe.to_html doesn't include footer start_header = Literal("<thead>") end_header = Literal("</thead>") text = start_header + SkipTo(end_header) new_text = '' for data, start_pos, end_pos in text.scanString(direct_html): new_text = ''.join(data).replace( ' style="text-align: right;"', '').replace( 'thead>', 'tfoot>\n ') + '\n</tfoot>' # Get start and end positions to insert new text end_tbody = Literal("</tbody>") end_table = Literal("</table>") insertion_pos = end_tbody + SkipTo(end_table) final_html = '' for data, start_pos, end_pos in insertion_pos.scanString(direct_html): final_html = direct_html[:start_pos + 8] + '\n' + new_text + direct_html[start_pos + 8:] output_dir = shared_folder / str(uuid.uuid4()) os.mkdir(output_dir) html_fp = output_dir / 'index.html' with open(html_fp, 'w') as html_fh: html_fh.write(final_html) report_shock_id = dfu.file_to_shock({ 'file_path': str(output_dir), 'pack': 'zip' })['shock_id'] html_report = [{ 'shock_id': report_shock_id, 'name': 'index.html', 'label': 'index.html', 'description': 'Summary report for VirMatcher' }] report_params = { 'message': 'Basic message to show in the report', 'workspace_name': workspace_name, 'html_links': html_report, 'direct_html_link_index': 0, 'report_object_name': f'VirMatcher_report_{str(uuid.uuid4())}', } report_output = report.create_extended_report(report_params) return report_output
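# The pyparsing scan above duplicates the table's <thead> into a <tfoot> so the
# DataTables per-column search inputs have somewhere to attach. The same
# transformation with plain string slicing, as an alternative sketch (assumes the
# rendered table contains exactly one thead and one tbody):
def add_footer_from_header(html):
    start = html.index('<thead>')
    end = html.index('</thead>') + len('</thead>')
    # replacing 'thead>' rewrites both the opening and closing tags
    footer = html[start:end].replace('thead>', 'tfoot>').replace(
        ' style="text-align: right;"', '')
    insert_at = html.index('</tbody>') + len('</tbody>')
    return html[:insert_at] + '\n' + footer + html[insert_at:]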
def run_simplebatch(self, ctx, params): """ This example function accepts any number of parameters and returns results in a KBaseReport :param params: instance of type "SimpleBatchParams" -> structure: parameter "batch_inputs" of type "batch_params" -> list of type "app_params" -> mapping from String to unspecified object, parameter "method_name" of String :returns: instance of type "ReportResults" -> structure: parameter "report_name" of String, parameter "report_ref" of String """ # ctx is the context object # return variables are: output #BEGIN run_simplebatch report = KBaseReport(self.callback_url) #TODO Always request WSID? #"simpleapp.simple_add" method_name = "simpleapp.simple_add" #params['method_name'] wsid = "TODO" #TODO Get Service_Ver service_ver = "dev" # the input type declared above calls this field "batch_inputs" batched_app_params = params['batch_inputs'] job_ids = [] statuses = [] for i, app_param in enumerate(batched_app_params): print(f"About to submit job with params {app_param}") rjp = { "method": method_name, "params": [app_param], "service_ver": service_ver, "wsid": wsid, "app_id": "RanWithBatch", } try: job_id = self.ee2.run_job(params=rjp) status = "queued" except Exception: job_id = "failed to submit" status = "failure" job_ids.append(job_id) statuses.append(status) #TODO Create table with refresh buttons or autorefresh, which uses cookie or environment # Send this as a report report_info = report.create({ 'report': { 'objects_created': [], 'text_message': 'Submitted {} job(s); statuses: {}'.format(len(job_ids), ', '.join(statuses)) }, 'workspace_name': params['workspace_name'] }) output = { 'report_name': report_info['name'], 'report_ref': report_info['ref'], } #END run_simplebatch # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError('Method run_simplebatch return value ' + 'output is not type dict as required.') # return the results return [output]
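# The submit-and-record loop above factored into a helper, assuming an EE2-style
# client whose run_job(params=...) returns a job id string; names are illustrative.
def submit_batch_jobs(ee2, method, app_params_list, wsid, service_ver='dev'):
    """Submit one job per params dict; return parallel lists of ids and statuses."""
    job_ids, statuses = [], []
    for app_params in app_params_list:
        rjp = {'method': method, 'params': [app_params],
               'service_ver': service_ver, 'wsid': wsid, 'app_id': 'RanWithBatch'}
        try:
            job_ids.append(ee2.run_job(params=rjp))
            statuses.append('queued')
        except Exception:
            # record the failure and keep submitting the remaining jobs
            job_ids.append('failed to submit')
            statuses.append('failure')
    return job_ids, statuses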
def run_MotifSuite(self, ctx, params): """ This example function accepts any number of parameters and returns results in a KBaseReport :param params: instance of type "motifsuite_seq_input" -> structure: parameter "workspace_name" of String, parameter "genome_ref" of String, parameter "SS_ref" of String, parameter "promoter_length" of Long, parameter "motif_min_length" of Long, parameter "motif_max_length" of Long, parameter "obj_name" of String, parameter "prb" of Double, parameter "motif_length" of Long, parameter "background" of Long, parameter "mask_repeats" of Long, parameter "background_group" of mapping from String to String, parameter "threshold" of Double, parameter "proportion" of Double :returns: instance of type "ReportResults" -> structure: parameter "report_name" of String, parameter "report_ref" of String """ # ctx is the context object # return variables are: output #BEGIN run_MotifSuite mfmd_obj = MotifFindermfmd(self.callback_url) homer_obj = MotifFinderHomer(self.callback_url) meme_obj = MotifFinderMEME(self.callback_url) gibbs_obj = MotifFinderGibbs(self.callback_url) ensemble_obj = MotifEnsemble(self.callback_url) mdscan_obj = MotifFinderMdscan(self.callback_url) sampler_obj = MotifFinderSampler(self.callback_url) # each finder runs in its own process, but start() followed immediately by join() executes them one after another p1 = Process(target=homer_obj.DiscoverMotifsFromSequenceSet, args=(params,)) p1.start() p1.join() p2 = Process(target=mfmd_obj.DiscoverMotifsFromSequenceSet, args=(params,)) p2.start() p2.join() p3 = Process(target=meme_obj.DiscoverMotifsFromSequenceSet, args=(params,)) p3.start() p3.join() p4 = Process(target=gibbs_obj.DiscoverMotifsFromSequenceSet, args=(params,)) p4.start() p4.join() p5 = Process(target=mdscan_obj.DiscoverMotifsFromSequenceSet, args=(params,)) p5.start() p5.join() p6 = Process(target=sampler_obj.DiscoverMotifsFromSequenceSet, args=(params,)) p6.start() p6.join() MSU = MotifSuiteUtil() params['motifset_refs'] = MSU.get_obj_refs() #params['motifset_refs'] =['29716/72/131','29716/72/132','29716/72/133','29716/72/134','29716/72/135','29716/72/136'] #params['motifset_refs'] =['29716/72/131','29716/72/132','29716/72/133'] print(params['motifset_refs']) #result = ensemble_obj.MotifEnsemble(params) #print('Ensemble RESULT:') #print(result) dms = DownloadMotifSets() MotifSetDict = dms.DownloadMotifSet(params['motifset_refs'], self.callback_url) matchSets = [] threshold = float(params['threshold']) fmu = FastaUtils() for i, MSR1 in enumerate(MotifSetDict.keys()): for j, motif1 in enumerate(MotifSetDict[MSR1]['Motifs']): for k, MSR2 in enumerate(MotifSetDict.keys()): if k > i: for l, motif2 in enumerate(MotifSetDict[MSR2]['Motifs']): if fmu.CompareMotifsBP(motif1, motif2, threshold): found1 = False found2 = False index1 = -1 index2 = -1 for m, mset in enumerate(matchSets): if (MSR1, j) in mset: found1 = True index1 = m if (MSR2, l) in mset: found2 = True index2 = m if not found1 and found2: matchSets[index2].add((MSR1, j)) elif not found2 and found1: matchSets[index1].add((MSR2, l)) elif found1 and found2: if index1 != index2: # set.union returns a new set, so merge in place instead matchSets[index1].update(matchSets[index2]) matchSets.pop(index2) else: matchSets.append(set([(MSR1, j), (MSR2, l)])) numMotifSets = len(params['motifset_refs']) proportion = float(params['proportion']) KeepSets = [] print('NUM MATCHSETS********') print(len(matchSets)) for i, mset in enumerate(matchSets): uniqueRefs = set(pair[0] for pair in mset) if float(len(uniqueRefs)) / numMotifSets >= proportion: KeepSets.append(i) print(len(KeepSets)) ESO = {} for ref in MotifSetDict: ESO['Condition'] = MotifSetDict[ref]['Condition'] ESO['SequenceSet_ref'] = MotifSetDict[ref]['SequenceSet_ref'] ESO['Alphabet'] = deepcopy(MotifSetDict[ref]['Alphabet']) ESO['Background'] = deepcopy(MotifSetDict[ref]['Background']) break ESO['Motifs'] = [] #Add motifs for keep in KeepSets: motif = fmu.merge(matchSets[keep], MotifSetDict) ESO['Motifs'].append(deepcopy(motif)) #upload new MSO dfu = DataFileUtil(self.callback_url) save_objects_params = {} save_objects_params['id'] = dfu.ws_name_to_id(params['workspace_name']) save_objects_params['objects'] = [{'type': 'KBaseGeneRegulation.MotifSet', 'data': ESO, 'name': 'EnsembleMotifSet'}] info = dfu.save_objects(save_objects_params)[0] obj_ref = "%s/%s/%s" % (info[6], info[0], info[4]) htmlDir = self.shared_folder + '/ensemble_html' os.mkdir(htmlDir) mr = MakeNewReport() mr.MakeReport(htmlDir, ESO) try: html_upload_ret = dfu.file_to_shock({'file_path': htmlDir, 'make_handle': 0, 'pack': 'zip'}) except Exception as err: raise ValueError('error uploading HTML file to shock: ' + str(err)) reportName = 'MotifSuite_report_' + str(uuid.uuid4()) reportObj = {'objects_created': [{'ref': obj_ref, 'description': 'Ensemble Motif Set generated by MotifSuite'}], 'message': '', 'direct_html': '', 'direct_html_link_index': 0, 'file_links': [], 'html_links': [{'shock_id': html_upload_ret['shock_id'], 'name': 'index.html', 'label': 'Save promoter_download.zip'}], 'html_window_height': 220, 'workspace_name': params['workspace_name'], 'report_object_name': reportName} report = KBaseReport(self.callback_url, token=ctx['token']) report_info = report.create_extended_report(reportObj) output = { 'report_name': report_info['name'], 'report_ref': report_info['ref'] } #END run_MotifSuite # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError('Method run_MotifSuite return value ' + 'output is not type dict as required.') # return the results return [output]
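# The motif clustering above is a union-find-style merge: any two motifs whose
# similarity passes the threshold must land in the same set. A standalone sketch
# with a generic pairwise predicate, to make the index bookkeeping explicit.
def merge_match_sets(items, is_match):
    """items: hashables; is_match(a, b) -> bool. Returns a list of disjoint sets."""
    groups = []
    for i, a in enumerate(items):
        for b in items[i + 1:]:
            if not is_match(a, b):
                continue
            # collect every existing group touching either item, merge them all
            hits = [g for g in groups if a in g or b in g]
            merged = set([a, b]).union(*hits)
            groups = [g for g in groups if g not in hits]
            groups.append(merged)
    return groups

# merge_match_sets([1, 2, 3, 10, 11], lambda a, b: abs(a - b) == 1)
# -> [{1, 2, 3}, {10, 11}]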
def import_samples(self, ctx, params): """ :param params: instance of type "ImportSampleInputs" -> structure: parameter "sample_set_ref" of String, parameter "sample_file" of String, parameter "workspace_name" of String, parameter "workspace_id" of Long, parameter "file_format" of String, parameter "description" of String, parameter "set_name" of String, parameter "header_row_index" of Long, parameter "id_field" of String, parameter "output_format" of String, parameter "taxonomy_source" of String, parameter "num_otus" of Long, parameter "incl_seq" of Long, parameter "otu_prefix" of String, parameter "share_within_workspace" of Long :returns: instance of type "ImportSampleOutputs" -> structure: parameter "report_name" of String, parameter "report_ref" of String, parameter "sample_set" of type "SampleSet" -> structure: parameter "samples" of list of type "sample_info" -> structure: parameter "id" of type "sample_id", parameter "name" of String, parameter "description" of String, parameter "sample_set_ref" of String """ # ctx is the context object # return variables are: output #BEGIN import_samples print("Beginning sample import with the following parameters:") print(f"params -- {params}") sample_set = {"samples": []} if params.get('sample_set_ref'): ret = self.dfu.get_objects( {'object_refs': [params['sample_set_ref']]})['data'][0] sample_set = ret['data'] set_name = ret['info'][1] save_ws_id = params['sample_set_ref'].split('/')[0] else: if not params.get('set_name'): raise ValueError( "A sample set name is required when a new SampleSet object is created." ) set_name = params['set_name'] save_ws_id = params.get('workspace_id') # We subtract 1 from the user-facing header_row_index for zero indexing. if params.get('header_row_index'): header_row_index = int(params["header_row_index"]) - 1 else: header_row_index = 0 if params.get('file_format') == "SESAR": header_row_index = 1 username = ctx['user_id'] if params.get('file_format') == 'ENIGMA': # ENIGMA_mappings['verification_mapping'].update( # {key: ("is_string", []) for key in ENIGMA_mappings['basic_columns']} # ) sample_set = import_samples_from_file( params, self.sw_url, self.workspace_url, username, ctx['token'], ENIGMA_mappings['column_mapping'], ENIGMA_mappings.get('groups', []), ENIGMA_mappings['date_columns'], ENIGMA_mappings.get('column_unit_regex', []), sample_set, header_row_index) elif params.get('file_format') == 'SESAR': # SESAR_mappings['verification_mapping'].update( # {key: ("is_string", []) for key in SESAR_mappings['basic_columns']} # ) sample_set = import_samples_from_file( params, self.sw_url, self.workspace_url, username, ctx['token'], SESAR_mappings['column_mapping'], SESAR_mappings.get('groups', []), SESAR_mappings['date_columns'], SESAR_mappings.get('column_unit_regex', []), sample_set, header_row_index) elif params.get('file_format') == 'KBASE': sample_set = import_samples_from_file(params, self.sw_url, self.workspace_url, username, ctx['token'], {}, [], [], [], sample_set, header_row_index) else: raise ValueError( f"Only SESAR, ENIGMA, and KBASE formats are currently supported for importing samples. " f"File of format {params.get('file_format')} not supported.") obj_info = self.dfu.save_objects({ 'id': save_ws_id, 'objects': [{ "name": set_name, "type": "KBaseSets.SampleSet", "data": sample_set }] })[0] sample_set_ref = '/'.join( [str(obj_info[6]), str(obj_info[0]), str(obj_info[4])]) sample_file_name = os.path.basename( params['sample_file']).split('.')[0] + '_OTU' # -- Format outputs below -- # if output file format specified, add one to output if params.get('output_format') in ['csv', 'xls']: otu_path = sample_set_to_OTU_sheet(sample_set, sample_file_name, self.scratch, params) file_links = [{ 'path': otu_path, 'name': os.path.basename(otu_path), 'label': "OTU template file", 'description': "file with each column containing the assigned sample_id and sample " "name of each saved sample. Intended for uploading OTU data." }] else: file_links = [] if params.get('incl_input_in_output'): sample_file = params.get('sample_file') if not os.path.isfile(sample_file): # try prepending '/staging/' to file and check then if os.path.isfile(os.path.join('/staging', sample_file)): sample_file = os.path.join('/staging', sample_file) else: raise ValueError( f"Input file {sample_file} does not exist.") sample_file_copy = os.path.join(self.scratch, os.path.basename(sample_file)) shutil.copy(sample_file, sample_file_copy) file_links.append({ "path": sample_file_copy, "name": os.path.basename(sample_file_copy), "label": "Input Sample file", "description": "Input file provided to create the sample set." }) # create report report_client = KBaseReport(self.callback_url) report_name = "SampleSet_import_report_" + str(uuid.uuid4()) report_info = report_client.create_extended_report({ 'message': f"SampleSet object named \"{set_name}\" imported.", 'objects_created': [{ 'ref': sample_set_ref }], 'file_links': file_links, 'report_object_name': report_name, 'workspace_name': params['workspace_name'] }) output = { 'report_ref': report_info['ref'], 'report_name': report_info['name'], 'sample_set': sample_set, 'sample_set_ref': sample_set_ref } #END import_samples # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError('Method import_samples return value ' + 'output is not type dict as required.') # return the results return [output]
class CompareAnnotationsUtil: workdir = 'tmp/work/' staging_dir = "/staging/" datadir = "/kb/module/data/" def __init__(self, config): os.makedirs(self.workdir, exist_ok=True) self.config = config self.timestamp = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S") self.callback_url = config['SDK_CALLBACK_URL'] self.scratch = config['scratch'] self.genome_api = GenomeAnnotationAPI(self.callback_url) self.dfu = DataFileUtil(self.callback_url) self.gfu = GenomeFileUtil(self.callback_url) self.kbr = KBaseReport(self.callback_url) self.ws_client = Workspace(config["workspace-url"]) self.events = {} def get_ontology_events(self, params): if 'ontology_events' in self.genome: for event, ontology in enumerate(self.genome['ontology_events']): if 'description' not in ontology: ontology['description'] = ontology['method'] if ontology['description'] in params['annotations_to_compare'] or len(params['annotations_to_compare']) == 0: self.events[event] = {} ontology["id"] = mu.legacy_fix(ontology["id"]) for term in ontology: self.events[event][term] = ontology[term] else: logging.info("No ontology events in this genome!") # logging.info(self.events) def summarize_gto(self, params): summary = {"genes": {}, "terms": {}, "rxns": {}, "ontology_events": {}, "orphan_terms": {} } # add ontology events for ontology_event in self.events: summary['ontology_events'][ontology_event] = self.events[ontology_event] # add gene id to summary for feature in self.genome['features']: gene_id = feature['id'] summary["genes"][gene_id] = {"terms": {}, "rxns": {} } # get ontology term if "ontology_terms" in feature: for ontology_term_type in feature['ontology_terms']: ''' note, ontology_term_type might be a legacy term, and will need to be converted later after making the term_dict ''' # ontology_term_type = ontology_term_type.upper() # logging.info(ontology_term_type) # if ontology_term_type in mu.legacy_codes: # ontology_term_type = mu.legacy_codes[ontology_term_type] # logging.info(ontology_term_type) term_dict = feature['ontology_terms'][ontology_term_type] for term in term_dict: for ontology_event in term_dict[term]: # is this ontology event in the user-selected list? 
if ontology_event in self.events: rxn = "none" # get rxn, convert to upper case to make case insensitive ontology_type = summary['ontology_events'][ontology_event]['id'].upper( ) # fix annotation term to fit with style term = mu.standardize_annotation(term, ontology_type) # convert terms to rxns if term in self.translations[ontology_type]: rxn = self.translations[ontology_type][term] else: if ontology_event in summary["orphan_terms"]: summary["orphan_terms"][ontology_event].append(term) summary["orphan_terms"][ontology_event] = list( set(summary["orphan_terms"][ontology_event])) else: summary["orphan_terms"][ontology_event] = [term] # terms if term in summary["genes"][gene_id]['terms']: summary["genes"][gene_id]['terms'][term].append(ontology_event) else: summary["genes"][gene_id]['terms'][term] = [ontology_event] if term in summary['terms']: summary['terms'][term].append(ontology_event) summary['terms'][term] = list(set(summary['terms'][term])) else: summary['terms'][term] = [ontology_event] # rxns if rxn != "none": if rxn in summary["genes"][gene_id]['rxns']: summary["genes"][gene_id]['rxns'][rxn].append( ontology_event) else: summary["genes"][gene_id]['rxns'][rxn] = [ontology_event] if rxn in summary['rxns']: summary['rxns'][rxn].append(ontology_event) summary['rxns'][rxn] = list(set(summary['rxns'][rxn])) else: summary['rxns'][rxn] = [ontology_event] with open(os.path.join(self.scratch, "summary_dump.json"), 'w') as outfile: json.dump(summary, outfile, indent=2) return summary def html_summary(self, params, summary): # convert gto summary for this report html_summary_report = {} for ontology_event in summary['ontology_events']: html_summary_report[ontology_event] = {"gene": [], "term": [], "rxn": []} for gene in summary["genes"]: for term in summary["genes"][gene]['terms']: for ontology_event in summary["genes"][gene]['terms'][term]: html_summary_report[ontology_event]['gene'].append(gene) html_summary_report[ontology_event]['term'].append(term) html_summary_report[ontology_event]['gene'] = list( set(html_summary_report[ontology_event]['gene'])) html_summary_report[ontology_event]['term'] = list( set(html_summary_report[ontology_event]['term'])) for rxn in summary["genes"][gene]['rxns']: for ontology_event in summary["genes"][gene]['rxns'][rxn]: html_summary_report[ontology_event]['rxn'].append(rxn) html_summary_report[ontology_event]['gene'].append(gene) html_summary_report[ontology_event]['rxn'] = list( set(html_summary_report[ontology_event]['rxn'])) html_summary_report[ontology_event]['gene'] = list( set(html_summary_report[ontology_event]['gene'])) output_html_files = list() # Make report directory and copy over files output_directory = os.path.join(self.scratch, str(uuid.uuid4())) os.mkdir(output_directory) result_file_path = os.path.join(output_directory, 'compare_annotations_summary.html') # make html table_lines = [] table_lines.append(f'<h2>Compare Annotations</h2>') table_lines.append(f'<h3>Summary</h3>') table_lines.append( '<table cellspacing="0" cellpadding="3" border="1"><tr><th>EVENT</th><th>DESCRIPTION</th><th>TYPE</th><th>GENES</th><th>TERMS</th><th>RXNS</th></tr>') for event in sorted(html_summary_report.keys()): # RAST/PROKKA don't have descriptions, but they have methods description = self.events[event].get('description', self.events[event]['method']) type = self.events[event]['id'] genes_list = html_summary_report[event]['gene'] terms_list = html_summary_report[event]['term'] rxns_list = html_summary_report[event]['rxn'] table_lines.append('<tr><td>' + str(event) 
+ '</td><td>' + description + '</td><td>' + type + '</td><td>' + str( len(set(genes_list))) + '</td><td>' + str(len(terms_list)) + '</td><td>' + str(len(rxns_list)) + '</td></tr>') table_lines.append('</table>') # Write to file with open(result_file_path, 'w') as result_file: for line in table_lines: result_file.write(line + "\n") output_html_files.append( {'path': output_directory, 'name': os.path.basename(result_file_path), 'description': 'Summary Report'}) # bokeh plots totals_file_path = os.path.join(output_directory, 'totals.html') output_file(totals_file_path, title="Totals") totals = self.plot_totals(summary) save(totals) output_html_files.append( {'path': output_directory, 'name': os.path.basename(totals_file_path), 'description': 'Ontology Totals'}) csc_file_path = os.path.join(output_directory, 'csc.html') output_file(csc_file_path, title="CSC") csc = self.plot_csc2(summary) save(csc) output_html_files.append( {'path': output_directory, 'name': os.path.basename(csc_file_path), 'description': 'Cumulative Sum Plot'}) # finalize html reports report_params = { 'message': '', 'html_links': output_html_files, 'direct_html_link_index': 0, 'workspace_name': params['workspace_name'], 'report_object_name': f'compare_annotations_{uuid.uuid4()}'} output = self.kbr.create_extended_report(report_params) return {'report_name': output['name'], 'report_ref': output['ref']} # plotting functions def plot_totals(self, summary): descriptions = {} for o in summary["ontology_events"]: descriptions[o] = summary["ontology_events"][o].get( 'description', summary["ontology_events"][o]['method']) + '_' + str(o) logging.info(descriptions[o]) totals = {} for event in summary['ontology_events'].keys(): totals[str(event)] = {'genes': [], 'rxns': [], 'terms': []} # genes for gene in summary['genes']: for term in summary['genes'][gene]['terms']: for event in summary['genes'][gene]['terms'][term]: totals[str(event)]['genes'].append(gene) # terms for term in summary['terms']: for event in summary['terms'][term]: totals[str(event)]['terms'].append(term) # rxns for rxn in summary['rxns']: for event in summary['rxns'][rxn]: totals[str(event)]['rxns'].append(rxn) # sums events = [] types = ['genes', 'terms', 'rxns'] gene_counts = [] rxn_counts = [] term_counts = [] for event in totals: logging.info(event) events.append(descriptions[int(event)]) gene_counts.append(len(set(totals[event]['genes']))) rxn_counts.append(len(set(totals[event]['rxns']))) term_counts.append(len(set(totals[event]['terms']))) data = {'events': events, 'genes': gene_counts, 'terms': term_counts, 'rxns': rxn_counts } x = [(event, type) for event in events for type in types] counts = sum(zip(data['genes'], data['terms'], data['rxns']), ()) source = ColumnDataSource(data=dict(x=x, counts=counts)) p = figure(y_range=FactorRange(*x), plot_height=400, plot_width=1000, title="Unique Counts per Annotation Event", tools="wheel_zoom,box_zoom,reset,save") p.hbar(y='x', right='counts', height=0.9, source=source, line_color="black", fill_color=factor_cmap('x', palette=inferno(len(types)), factors=types, start=1, end=2)) p.x_range.start = 0 p.y_range.range_padding = 0.1 p.yaxis.major_label_orientation = "horizontal" p.yaxis.subgroup_label_orientation = "horizontal" p.yaxis.group_label_orientation = "horizontal" p.ygrid.grid_line_color = None p.title.text_font_size = '12pt' p.xaxis.major_label_text_font_size = "12pt" p.yaxis.major_label_text_font_size = "12pt" p.yaxis.group_text_font_size = "12pt" p.add_tools(HoverTool(tooltips=[("Type", "@x"), ("Count", 
"@counts")])) return(p) def plot_csc2(self, summary, summary_type="rxns"): descriptions = {} for o in summary["ontology_events"]: descriptions[o] = summary["ontology_events"][o].get( 'description', summary["ontology_events"][o]['method']) + ' (' + summary["ontology_events"][o]['id'] + ' #' + str(o) + ')' events = sorted(summary['ontology_events'].keys()) rxns = summary[summary_type] # convert to sets rxns_in_events = dict((int(el), set()) for el in events) for rxn in rxns: for event in rxns[rxn]: rxns_in_events[event].add(rxn) winning_sets = {} winning_order = [] baseline = 0 df = pd.DataFrame(columns=["E", "C", "T", "L", "R"]) # E=event, C=comparison, T=total, L=left, R=right for _ in range(len(rxns_in_events)): current_right = baseline current_left = baseline # get current winner longest_set_key = self.longest_set(rxns_in_events, winning_sets) # compare current winner to all past winners current = rxns_in_events[longest_set_key] for past_winner in winning_order: overlap = len(winning_sets[past_winner] & current) current_left -= overlap row = [descriptions[longest_set_key], # E descriptions[past_winner], # C overlap, # T current_left, # L current_left + overlap] # R df.loc[len(df)] = row current = current - winning_sets[past_winner] # process current winner row = [descriptions[longest_set_key], # E descriptions[longest_set_key], # C len(current), # T current_right, # L current_right + len(current)] # R df.loc[len(df)] = row # add to df baseline += len(current) # move current winner to past winners winning_sets[longest_set_key] = rxns_in_events[longest_set_key] winning_order.append(longest_set_key) rxns_in_events[longest_set_key] = set() source = ColumnDataSource(df) type1_colormap = factor_cmap('E', palette=viridis( len(df.E.unique())), factors=df.E.unique()) type2_colormap = factor_cmap('C', palette=viridis( len(df.C.unique())), factors=df.C.unique()) p = figure(y_range=df.E.unique().tolist()[::-1], # .tolist()[::-1] reverses the list. plot_height=300, plot_width=1000, title="Annotation events ranked by \'" + str(summary_type) + "\' contribution", tools="wheel_zoom,box_zoom,reset,save") p.hbar(y='E', height=0.9, left='L', right='R', source=source, fill_color=type2_colormap, line_color="black") p.add_tools(HoverTool(tooltips=[("Total", "@T"), ("Comparison", "@C")])) p.title.text_font_size = '12pt' p.xaxis.major_label_text_font_size = "12pt" p.yaxis.major_label_text_font_size = "12pt" return p def longest_set(self, s, w): s = s.copy() for event in s: for winner in w: s[event] = s[event] - w[winner] # https://stackoverflow.com/a/21839239 max_key, max_value = max(s.items(), key=lambda x: len(x[1])) return(max_key) def run(self, ctx, params): # collect some metadata self.genome = mu.get_genome(params['genome'], self.genome_api) self.get_ontology_events(params) self.translations = mu.get_translations(self.datadir) # summarize and make reports summary = self.summarize_gto(params) report = self.html_summary(params, summary) return report
def process_batch_result(self, batch_result, validated_params, reads, input_set_info): n_jobs = len(batch_result['results']) n_success = 0 n_error = 0 ran_locally = 0 ran_njsw = 0 # reads alignment set items items = [] objects_created = [] for k in range(0, len(batch_result['results'])): job = batch_result['results'][k] result_package = job['result_package'] if job['is_error']: n_error += 1 else: n_success += 1 output_info = result_package['result'][0]['output_info'] ra_ref = output_info['upload_results']['obj_ref'] # Note: could add a label to the alignment here? items.append({'ref': ra_ref, 'label': reads[k]['condition']}) objects_created.append({'ref': ra_ref}) if result_package['run_context']['location'] == 'local': ran_locally += 1 if result_package['run_context']['location'] == 'njsw': ran_njsw += 1 # Save the alignment set alignment_set_data = {'description': '', 'items': items} alignment_set_save_params = { 'data': alignment_set_data, 'workspace': validated_params['output_workspace'], 'output_object_name': str(input_set_info[1]) + validated_params['output_obj_name_suffix'] } set_api = SetAPI(self.srv_wiz_url) save_result = set_api.save_reads_alignment_set_v1( alignment_set_save_params) print('Saved ReadsAlignment=') pprint(save_result) objects_created.append({ 'ref': save_result['set_ref'], 'description': 'Set of all reads alignments generated' }) set_name = save_result['set_info'][1] # run qualimap qualimap_report = self.qualimap.run_bamqc( {'input_ref': save_result['set_ref']}) qc_result_zip_info = qualimap_report['qc_result_zip_info'] # create the report report_text = 'Ran on SampleSet or ReadsSet.\n\n' report_text += 'Created ReadsAlignmentSet: ' + str(set_name) + '\n\n' report_text += 'Total ReadsLibraries = ' + str(n_jobs) + '\n' report_text += '        Successful runs = ' + str(n_success) + '\n' report_text += '            Failed runs = ' + str(n_error) + '\n' report_text += '       Ran on main node = ' + str(ran_locally) + '\n' report_text += '   Ran on remote worker = ' + str(ran_njsw) + '\n\n' print('Report text=') print(report_text) kbr = KBaseReport(self.callback_url) report_info = kbr.create_extended_report({ 'message': report_text, 'objects_created': objects_created, 'report_object_name': 'kb_Bowtie2_' + str(uuid.uuid4()), 'direct_html_link_index': 0, 'html_links': [{ 'shock_id': qc_result_zip_info['shock_id'], 'name': qc_result_zip_info['index_html_file_name'], 'label': qc_result_zip_info['name'] }], 'workspace_name': validated_params['output_workspace'] }) result = { 'report_info': { 'report_name': report_info['name'], 'report_ref': report_info['ref'] } } result['batch_output_info'] = batch_result return result
def run_MotifSuite(self, ctx, params): """ This example function accepts any number of parameters and returns results in a KBaseReport :param params: instance of mapping from String to unspecified object :returns: instance of type "ReportResults" -> structure: parameter "report_name" of String, parameter "report_ref" of String """ # ctx is the context object # return variables are: output #BEGIN run_MotifSuite report = KBaseReport(self.callback_url) mfmd_obj = MotifFindermfmd(self.callback_url) homer_obj = MotifFinderHomer(self.callback_url) meme_obj = MotifFinderMEME(self.callback_url) gibbs_obj = MotifFinderGibbs(self.callback_url) ensemble_obj = MotifEnsemble(self.callback_url) '''result = homer_obj.DiscoverMotifsFromSequenceSet(params) print('Homer RESULT:') pprint(result)''' '''if os.path.exists('/kb/module/work/homer_out'): shutil.rmtree('/kb/module/work/homer_out') shutil.copytree('/kb/module/work/tmp/', '/kb/module/work/homer_out/') result = meme_obj.DiscoverMotifsFromSequenceSet(params) print('MEME RESULT:') pprint(result) ''' result = mfmd_obj.DiscoverMotifsFromSequenceSet(params) print('MFMD RESULT:') pprint(result) '''result = ensemble_obj.MotifEnsemble(params) print('Ensemble RESULT:') print(result) if os.path.exists('/kb/module/work/meme_out'): shutil.rmtree('/kb/module/work/meme_out') shutil.copytree('/kb/module/work/tmp/', '/kb/module/work/meme_out/') result = gibbs_obj.ExtractPromotersFromFeatureSetandDiscoverMotifs(params) print('Gibbs RESULT:') pprint(result) if os.path.exists('/kb/module/work/gibbs_out'): shutil.rmtree('/kb/module/work/gibbs_out') shutil.copytree('/kb/module/work/tmp/', '/kb/module/work/gibbs_out/') #fix issue for MotifFindermfmd in catalogue result = mfmd_obj.DiscoverMotifsFromSequenceSet(params) print('MFMD RESULT:') pprint(result) MSU=MotifSuiteUtil() params['motifset_refs']= MSU.get_obj_refs() result = ensemble_obj.MotifEnsemble(params) print('Ensemble RESULT:') print(result) ''' report_info = report.create({'report': {'objects_created':[], 'text_message': params['workspace_name']}, 'workspace_name': params['workspace_name']}) output = { 'report_name': report_info['name'], 'report_ref': report_info['ref'], } #END run_MotifSuite # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError('Method run_MotifSuite return value ' + 'output is not type dict as required.') # return the results return [output]
def run_bsadkhinContigFilter(self, ctx, params): """ This example function accepts any number of parameters and returns results in a KBaseReport :param params: instance of mapping from String to unspecified object :returns: instance of type "ReportResults" -> structure: parameter "report_name" of String, parameter "report_ref" of String """ # ctx is the context object # return variables are: output #BEGIN run_bsadkhinContigFilter # Print statements to stdout/stderr are captured and available as the App log logging.info('Starting run_bsadkhinContigFilter function. Params=' + pformat(params)) # Step 1 - Parse/examine the parameters and catch any errors # It is important to check that parameters exist and are defined, and that nice error # messages are returned to users. Parameter values go through basic validation when # defined in a Narrative App, but advanced users or other SDK developers can call # this function directly, so validation is still important. logging.info('Validating parameters.') if 'workspace_name' not in params: raise ValueError( 'Parameter workspace_name is not set in input arguments') workspace_name = params['workspace_name'] if 'assembly_input_ref' not in params: raise ValueError( 'Parameter assembly_input_ref is not set in input arguments') assembly_input_ref = params['assembly_input_ref'] if 'min_length' not in params: raise ValueError( 'Parameter min_length is not set in input arguments') min_length_orig = params['min_length'] min_length = None try: min_length = int(min_length_orig) except ValueError: raise ValueError( 'Cannot parse integer from min_length parameter (' + str(min_length_orig) + ')') if min_length < 0: raise ValueError('min_length parameter cannot be negative (' + str(min_length) + ')') # Step 2 - Download the input data as a Fasta and # We can use the AssemblyUtils module to download a FASTA file from our Assembly data object. # The return object gives us the path to the file that was created. logging.info('Downloading Assembly data as a Fasta file.') assemblyUtil = AssemblyUtil(self.callback_url) fasta_file = assemblyUtil.get_assembly_as_fasta( {'ref': assembly_input_ref}) # Step 3 - Actually perform the filter operation, saving the good contigs to a new fasta file. # We can use BioPython to parse the Fasta file and build and save the output to a file. 
good_contigs = []
n_total = 0
n_remaining = 0
for record in SeqIO.parse(fasta_file['path'], 'fasta'):
    n_total += 1
    if len(record.seq) >= min_length:
        good_contigs.append(record)
        n_remaining += 1
logging.info('Filtered Assembly to ' + str(n_remaining) + ' contigs out of ' + str(n_total))
filtered_fasta_file = os.path.join(self.shared_folder, 'filtered.fasta')
SeqIO.write(good_contigs, filtered_fasta_file, 'fasta')

# Step 4 - Save the new Assembly back to the system
logging.info('Uploading filtered Assembly data.')
new_assembly = assemblyUtil.save_assembly_from_fasta({
    'file': {'path': filtered_fasta_file},
    'workspace_name': workspace_name,
    'assembly_name': fasta_file['assembly_name']
})

# Step 5 - Build a Report and return
reportObj = {
    'objects_created': [{'ref': new_assembly, 'description': 'Filtered contigs'}],
    'text_message': 'Filtered Assembly to ' + str(n_remaining) + ' contigs out of ' + str(n_total)
}
report = KBaseReport(self.callback_url)
report_info = report.create({'report': reportObj,
                             'workspace_name': params['workspace_name']})

# Step 6 - Construct the output to send back
output = {
    'report_name': report_info['name'],
    'report_ref': report_info['ref'],
    'assembly_output': new_assembly,
    'n_initial_contigs': n_total,
    'n_contigs_removed': n_total - n_remaining,
    'n_contigs_remaining': n_remaining
}
logging.info('returning:' + pformat(output))
#END run_bsadkhinContigFilter

# At some point might do deeper type checking...
if not isinstance(output, dict):
    raise ValueError('Method run_bsadkhinContigFilter return value ' +
                     'output is not type dict as required.')
# return the results
return [output]
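# --- Illustrative sketch (not part of the module): the Step 3 length filter
# above as a stand-alone function, runnable outside the SDK container.
# The file names in the usage comment are hypothetical.
from Bio import SeqIO

def filter_contigs_by_length(fasta_in, fasta_out, min_length):
    """Keep only records whose sequence is at least min_length bases long."""
    kept = [record for record in SeqIO.parse(fasta_in, 'fasta')
            if len(record.seq) >= min_length]
    SeqIO.write(kept, fasta_out, 'fasta')
    return len(kept)

# e.g. n_kept = filter_contigs_by_length('assembly.fasta', 'filtered.fasta', 1000)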
def MotifEnsemble(self, ctx, params):
    """
    :param params: instance of type "EnsembleParams" (Internal workflow:
       1. Input - list of motifsets, workspace, threshold consensus
       2. Download MotifSets -> Utils function
       3. Assign motif ids by position in list
       Use refs to identify MSOs internally!
       Dictionary of motifsets  key: ref, val set
       list of match sets: each item in the set is a tuple of (ref, index)
       for each motifset:                 <- enumerate to avoid duplicate
           for each motif in motifset:
               for each other motifset:   <- enumerate to avoid duplicate
                   for each motif in other:
                       compare(motif1, motif2):
                       if motifs same:
                           search list of sets for motif1:
                               if found, add motif2 if not in
                               if not found, search list of sets for motif2:
                                   if found, add motif1
                                   else add a new set with motif1 + motif2)
       -> structure: parameter "motifset_refs" of list of String, parameter
       "workspace_name" of String, parameter "threshold" of Double
    :returns: instance of type "Ensemble_out" -> structure: parameter
       "motifset_ref" of String
    """
    # ctx is the context object
    # return variables are: out
    #BEGIN MotifEnsemble
    # TODO: ERROR CHECK (MULTIPLE MOTIFSETS, NONEMPTY, SSREF are the same, etc.)
    MotifSetDict = DownloadMotifSet(params['motifset_refs'], self.callback_url)

    matchSets = []
    threshold = float(params['threshold'])
    for i, MSR1 in enumerate(MotifSetDict.keys()):
        for j, motif1 in enumerate(MotifSetDict[MSR1]['Motifs']):
            for k, MSR2 in enumerate(MotifSetDict.keys()):
                if k > i:
                    for l, motif2 in enumerate(MotifSetDict[MSR2]['Motifs']):
                        if CompareMotifsBP(motif1, motif2, threshold):
                            found1 = False
                            found2 = False
                            index1 = -1
                            index2 = -1
                            for m, mset in enumerate(matchSets):
                                if (MSR1, j) in mset:
                                    found1 = True
                                    index1 = m
                                if (MSR2, l) in mset:
                                    found2 = True
                                    index2 = m
                            if not found1 and found2:
                                matchSets[index2].add((MSR1, j))
                            elif not found2 and found1:
                                matchSets[index1].add((MSR2, l))
                            elif found1 and found2:
                                if index1 != index2:
                                    # merge in place: set.union() returns a
                                    # new set and would leave matchSets unchanged
                                    matchSets[index1] |= matchSets[index2]
                                    matchSets.pop(index2)
                            else:
                                matchSets.append(set([(MSR1, j), (MSR2, l)]))

    numMotifSets = len(params['motifset_refs'])
    threshold = float(params['proportion'])
    KeepSets = []
    print('NUM MATCHSETS********')
    print(len(matchSets))
    for i, mset in enumerate(matchSets):
        uniqueRefs = {}
        for motif_key in mset:
            if motif_key[0] not in uniqueRefs:
                uniqueRefs[motif_key[0]] = motif_key[0]
        if float(len(uniqueRefs.keys())) / numMotifSets >= threshold:
            KeepSets.append(i)
    print(len(KeepSets))

    # handle duplicates...
    # for i, tuple1 in enumerate(matchSets):
    #     for j, tuple2 in enumerate(matchSets):
    #         if j > i:
    #             if tuple1[0] == tuple2[0]:
    #                 handle this.... how...?
# merge locations if they're different
# pick one motif by default (p-val)
# run motif compare to ensure they're actually similar enough
#     print('duplicate')

# create new MSO
ESO = {}
for ref in MotifSetDict:
    ESO['Condition'] = MotifSetDict[ref]['Condition']
    ESO['SequenceSet_ref'] = MotifSetDict[ref]['SequenceSet_ref']
    ESO['Alphabet'] = deepcopy(MotifSetDict[ref]['Alphabet'])
    ESO['Background'] = deepcopy(MotifSetDict[ref]['Background'])
    break
ESO['Motifs'] = []
# Add motifs
for keep in KeepSets:
    motif = merge(matchSets[keep], MotifSetDict)
    ESO['Motifs'].append(deepcopy(motif))

# upload new MSO
dfu = DataFileUtil(self.callback_url)
save_objects_params = {}
save_objects_params['id'] = dfu.ws_name_to_id(params['workspace_name'])
save_objects_params['objects'] = [{'type': 'KBaseGwasData.MotifSet',
                                   'data': ESO,
                                   'name': 'EnsembleMotifSet'}]
info = dfu.save_objects(save_objects_params)[0]
obj_ref = "%s/%s/%s" % (info[6], info[0], info[4])

# create report
htmlDir = self.shared_folder + '/ensemble_html'
os.mkdir(htmlDir)
MakeReport(htmlDir, ESO)

try:
    html_upload_ret = dfu.file_to_shock({'file_path': htmlDir,
                                         'make_handle': 0,
                                         'pack': 'zip'})
except Exception:
    raise ValueError('error uploading HTML file to shock')

# Create motif set object from MotifList
# TODO set parameters correctly
# add narrative support to set
# MSO = {}
# MSO['Condition'] = 'Temp'
# MSO['FeatureSet_ref'] = '123'
# MSO['Motifs'] = []
# MSO['Alphabet'] = ['A', 'C', 'G', 'T']
# MSO['Background'] = {}
# for letter in MSO['Alphabet']:
#     MSO['Background'][letter] = 0.0
# MSU.parseMotifList(fullMotifList, MSO)
# objname = 'MotifSet' + str(int((datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds() * 1000))
# Pass motif set into this
# save_objects_params = {}
# save_objects_params['id'] = dfu.ws_name_to_id(params['workspace_name'])
# save_objects_params['objects'] = [{'type': 'KBaseGwasData.MotifSet', 'data': MSO, 'name': objname}]
# info = dfu.save_objects(save_objects_params)[0]
# motif_set_ref = "%s/%s/%s" % (info[6], info[0], info[4])
# object_upload_ret = dfu.file_to_shock()

reportName = 'MEMEMotifFinder_report_' + str(uuid.uuid4())
reportObj = {'objects_created': [{'ref': obj_ref,
                                  'description': 'Motif Set generated by MEME'}],
             'message': '',
             'direct_html': None,
             'direct_html_link_index': 0,
             'file_links': [],
             'html_links': [],
             'html_window_height': 220,
             'workspace_name': params['workspace_name'],
             'report_object_name': reportName
             }

# attach to report obj
reportObj['direct_html'] = ''
reportObj['direct_html_link_index'] = 0
reportObj['html_links'] = [{'shock_id': html_upload_ret['shock_id'],
                            'name': 'index.html',
                            'label': 'Save promoter_download.zip'
                            }]

report = KBaseReport(self.callback_url, token=ctx['token'])
report_info = report.create_extended_report(reportObj)

out = {
    'report_name': report_info['name'],
    'report_ref': report_info['ref']
}
#END MotifEnsemble

# At some point might do deeper type checking...
if not isinstance(out, dict):
    raise ValueError('Method MotifEnsemble return value ' +
                     'out is not type dict as required.')
# return the results
return [out]
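# --- Illustrative sketch (not part of the module): the match-set grouping
# used by MotifEnsemble above, isolated. Motifs are keyed by
# (motifset_ref, index); each similar pair either joins an existing set,
# merges two sets, or starts a new one. The input pairs are assumed to come
# from a comparator such as CompareMotifsBP().
def group_matches(similar_pairs):
    match_sets = []
    for a, b in similar_pairs:
        idx_a = next((i for i, s in enumerate(match_sets) if a in s), None)
        idx_b = next((i for i, s in enumerate(match_sets) if b in s), None)
        if idx_a is None and idx_b is None:
            match_sets.append({a, b})               # start a new group
        elif idx_b is None:
            match_sets[idx_a].add(b)
        elif idx_a is None:
            match_sets[idx_b].add(a)
        elif idx_a != idx_b:
            match_sets[idx_a] |= match_sets[idx_b]  # in-place union
            match_sets.pop(idx_b)
    return match_sets

# group_matches([(('msA', 0), ('msB', 2)), (('msB', 2), ('msC', 1))])
# -> [{('msA', 0), ('msB', 2), ('msC', 1)}]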
class QualiMapRunner: QUALIMAP_PATH = '/kb/module/qualimap-bin/qualimap' JAVA_MEM_DEFAULT_SIZE = '16G' LARGE_BAM_FILE_SIZE = 20 * 1024 * 1024 * 1024 # 20 GB TIMEOUT = 72 * 60 * 60 # 72 hours def _get_file_size(self, file_path): file_size = os.path.getsize(file_path) print('File size: {} -- {}'.format(file_size, file_path)) return file_size def _large_file(self, file_path): filename, file_extension = os.path.splitext(file_path) multiplier = 0 if file_extension == '.txt': total_file_size = 0 with open(file_path, 'r') as f: for line in f: bam_file_path = line.split('\t')[1] total_file_size += self._get_file_size(bam_file_path) print('Total file size: {}'.format(total_file_size)) multiplier = int(total_file_size) // int(self.LARGE_BAM_FILE_SIZE) else: multiplier = int(self._get_file_size(file_path)) // int( self.LARGE_BAM_FILE_SIZE) print('setting number of windows multiplier to: {}'.format(multiplier)) return multiplier def _timeout_handler(self, signum, frame): print('Signal handler called with signal', signum) raise ValueError('QualiMap takes too long') def __init__(self, scratch_dir, callback_url, workspace_url, srv_wiz_url): self.scratch_dir = scratch_dir self.rau = ReadsAlignmentUtils(callback_url) self.kbr = KBaseReport(callback_url) self.dfu = DataFileUtil(callback_url) self.gfu = GenomeFileUtil(callback_url) self.set_api = SetAPI(srv_wiz_url) self.ws = Workspace(workspace_url) self.valid_commands = ['bamqc', 'multi-bamqc'] def run_app(self, params): self.validate_params(params) print('Validated Params = ') pprint(params) run_info = self.get_run_info(params) if run_info.get('mode') not in ['single', 'multi']: raise ValueError( 'Error in fetching the type to determine run settings.') run_error = False try: signal.signal(signal.SIGALRM, self._timeout_handler) signal.alarm(self.TIMEOUT) if run_info['mode'] == 'single': result = self.run_bamqc(params['input_ref'], run_info['input_info']) elif run_info['mode'] == 'multi': result = self.run_multi_sample_qc(params['input_ref'], run_info['input_info']) signal.alarm(0) except Exception: run_error = True workdir = os.path.join(self.scratch_dir, 'qualimap_' + str(int(time.time() * 10000))) os.makedirs(workdir) with open(os.path.join(workdir, 'qualimapReport.html'), 'w') as report: report.write('<html><body><p></p></body></html>') package_info = self.package_output_folder( workdir, 'QualiMap_report', 'EMPTY HTML report directory for QualiMap BAM QC', 'qualimapReport.html') result = { 'qc_result_folder_path': workdir, 'qc_result_zip_info': package_info, 'shock_id': None } error_msg = 'Running QualiMap returned an error:\n{}\n'.format( traceback.format_exc()) error_msg += 'Generating simple report instead\n' print(error_msg) if params['create_report']: result = self.create_report(result, params['output_workspace'], run_error, params['input_ref']) return result def create_report(self, result, output_workspace, run_error=None, input_ref=None): if run_error: objects_created = [] info = self.get_obj_info(input_ref) obj_type = self.get_type_from_obj_info(info) if obj_type in ['KBaseRNASeq.RNASeqAlignment']: objects_created.append({ 'ref': input_ref, 'description': 'Alignment' }) if obj_type in [ 'KBaseRNASeq.RNASeqAlignmentSet', 'KBaseSets.ReadsAlignmentSet' ]: objects_created.append({ 'ref': input_ref, 'description': 'AlignmentSet' }) reads_alignment_info = self.get_alignments_from_set(input_ref) for alignment in reads_alignment_info: alignment_ref = alignment.get('ref') objects_created.append({ 'ref': alignment_ref, 'description': 'Alignment' }) 
    report_info = self.kbr.create_extended_report({
        'message': ' ',
        'objects_created': objects_created,
        'report_object_name': 'qualimap_report' + str(uuid.uuid4()),
        'workspace_name': output_workspace
    })
    result['report_name'] = report_info['name']
    result['report_ref'] = report_info['ref']
    return result

qc_result_zip_info = result['qc_result_zip_info']
report_info = self.kbr.create_extended_report({
    'message': '',
    'objects_created': [],
    'direct_html_link_index': 0,
    'html_links': [{
        'shock_id': qc_result_zip_info['shock_id'],
        'name': qc_result_zip_info['index_html_file_name'],
        'label': qc_result_zip_info['name']
    }],
    'report_object_name': 'qualimap_report' + str(uuid.uuid4()),
    'workspace_name': output_workspace
})
result['report_name'] = report_info['name']
result['report_ref'] = report_info['ref']
return result

def get_gtf_file(self, input_ref, set_op=False):
    print('Start fetching GFF file from genome')
    if set_op:
        set_data = self.set_api.get_reads_alignment_set_v1({
            'ref': input_ref,
            'include_item_info': 1
        })
        input_ref = set_data['data']['items'][0]['ref']
    obj_data = self.dfu.get_objects({"object_refs": [input_ref]})['data'][0]['data']
    genome_ref = obj_data.get('genome_id')
    if not genome_ref:
        raise ValueError('Alignment is not associated with a Genome object')
    result_directory = os.path.join(self.scratch_dir, str(uuid.uuid4()))
    os.makedirs(result_directory)
    genome_gtf_file = self.gfu.genome_to_gff({
        'genome_ref': genome_ref,
        'is_gtf': True,
        'target_dir': result_directory
    })['file_path']
    return genome_gtf_file

def run_bamqc(self, input_ref, input_info):
    # download the input and setup a working dir
    alignment_info = self.rau.download_alignment({'source_ref': input_ref})
    bam_file_path = self.find_my_bam_file(alignment_info['destination_dir'])
    try:
        gtf_file = self.get_gtf_file(input_ref)
    except Exception:
        gtf_file = ''

    workdir = os.path.join(self.scratch_dir,
                           'qualimap_' + str(int(time.time() * 10000)))
    options = ['-bam', bam_file_path, '-c', '-outdir', workdir,
               '-outformat', 'html']
    if gtf_file:
        options += ['-gff', gtf_file]
    options.append('--java-mem-size={}'.format(
        self.JAVA_MEM_DEFAULT_SIZE))  # always use large mem
    multiplier = self._large_file(bam_file_path)
    if multiplier:
        window_size = multiplier * 400
        print(f'using larger window size: {window_size} and Java memory: '
              f'{self.JAVA_MEM_DEFAULT_SIZE}')
        # pass the flag and its value as separate argv tokens; a single
        # '-nw 400' string is not parsed when run without a shell
        options.extend(['-nw', str(window_size)])  # increase size of windows

    self.run_cli_command('bamqc', options)

    package_info = self.package_output_folder(
        workdir, 'QualiMap_report',
        'HTML report directory for QualiMap BAM QC', 'qualimapReport.html')

    return {
        'qc_result_folder_path': workdir,
        'qc_result_zip_info': package_info
    }

def run_multi_sample_qc(self, input_ref, input_info):
    # download the input and setup a working dir
    reads_alignment_info = self.get_alignments_from_set(input_ref)
    try:
        gtf_file = self.get_gtf_file(input_ref, set_op=True)
    except Exception:
        gtf_file = ''

    suffix = 'qualimap_' + str(int(time.time() * 10000))
    workdir = os.path.join(self.scratch_dir, suffix)
    os.makedirs(workdir)

    input_file_path = self.create_multi_qualimap_cfg(reads_alignment_info, workdir)

    options = ['-d', input_file_path, '-r', '-c', '-outdir', workdir,
               '-outformat', 'html']
    if gtf_file:
        options += ['-gff', gtf_file]

    multiplier = self._large_file(input_file_path)
    if multiplier:
        window_size = multiplier * 400
        print(f'using larger window size: {window_size} and Java memory: '
              f'{self.JAVA_MEM_DEFAULT_SIZE}')
        # same fix as run_bamqc: flag and value as separate tokens
        options.extend(['-nw', str(window_size)])  # increase size of windows
options.append(f'--java-mem-size={self.JAVA_MEM_DEFAULT_SIZE}') self.run_cli_command('multi-bamqc', options) package_info = self.package_output_folder( workdir, 'QualiMap_report', 'HTML report directory for QualiMap Multi-sample BAM QC', 'multisampleBamQcReport.html') return { 'qc_result_folder_path': workdir, 'qc_result_zip_info': package_info } def get_alignments_from_set(self, alignment_set_ref): set_data = self.set_api.get_reads_alignment_set_v1({ 'ref': alignment_set_ref, 'include_item_info': 1 }) items = set_data['data']['items'] reads_alignment_data = [] for alignment in items: alignment_info = self.rau.download_alignment( {'source_ref': alignment['ref']}) bam_file_path = self.find_my_bam_file( alignment_info['destination_dir']) label = None if 'label' in alignment: label = alignment['label'] # remove spacing in label label = '_'.join(label.split(' ')) reads_alignment_data.append({ 'bam_file_path': bam_file_path, 'ref': alignment['ref'], 'label': label, 'info': alignment['info'] }) return reads_alignment_data def create_multi_qualimap_cfg(self, reads_alignment_info, workdir): # Group by labels if there is at least one defined print('reads_alignment_info: {}'.format(reads_alignment_info)) use_labels = False for alignment in reads_alignment_info: if alignment['label']: use_labels = True break # write the file input_file_path = os.path.join(workdir, 'multi_input.txt') print('Start generating: {}'.format(input_file_path)) input_file = open(input_file_path, 'w') name_lookup = {} for alignment in reads_alignment_info: name = alignment['info'][1] if name in name_lookup: name_lookup[name] += 1 name = name + '_' + str(name_lookup[name]) else: name_lookup[name] = 1 input_file.write(name + '\t' + alignment['bam_file_path']) if use_labels: if alignment['label']: input_file.write('\t' + alignment['label']) else: input_file.write('\tunlabeled') input_file.write('\n') input_file.close() with open(input_file_path, 'r') as f: print('Generated: {}'.format(input_file_path)) print(f.read()) return input_file_path def get_run_info(self, params): info = self.get_obj_info(params['input_ref']) obj_type = self.get_type_from_obj_info(info) if obj_type in ['KBaseRNASeq.RNASeqAlignment']: return {'mode': 'single', 'input_info': info} if obj_type in [ 'KBaseRNASeq.RNASeqAlignmentSet', 'KBaseSets.ReadsAlignmentSet' ]: return {'mode': 'multi', 'input_info': info} raise ValueError('Object type of input_ref is not valid, was: ' + str(obj_type)) def validate_params(self, params): if 'input_ref' not in params: raise ValueError( 'required parameter field "input_ref" was not set') create_report = False if 'create_report' in params: if int(params['create_report']) == 1: if 'output_workspace' not in params: raise ValueError( 'If "create_report" was set, then "output_workspace" is required' ) if not params['output_workspace']: raise ValueError( 'If "create_report" was set, then "output_workspace" is required' ) create_report = True params['create_report'] = create_report def run_cli_command(self, command, options, cwd=None): if command not in self.valid_commands: raise ValueError('Invalid QualiMap command: ' + str(command)) command = [self.QUALIMAP_PATH, command] + options print('Running: ' + ' '.join(command)) if not cwd: cwd = self.scratch_dir p = subprocess.Popen(command, cwd=cwd, shell=False) exitCode = p.wait() if exitCode == 0: print('Success, exit code was: ' + str(exitCode)) else: raise ValueError('Error running command: ' + ' '.join(command) + '\n' + 'Exit Code: ' + str(exitCode)) def find_my_bam_file(self, 
dirpath): bam_path = None for f in os.listdir(dirpath): fullpath = os.path.join(dirpath, f) if os.path.isfile(fullpath) and f.lower().endswith('.bam'): if bam_path is not None: raise ValueError( 'Error! Too many BAM files were downloaded for this alignment!' ) bam_path = fullpath if bam_path is None: raise ValueError( 'Error! No BAM files were downloaded for this alignment!') return bam_path def package_output_folder(self, folder_path, zip_file_name, zip_file_description, index_html_file): """ Simple utility for packaging a folder and saving to shock """ output = self.dfu.file_to_shock({ 'file_path': folder_path, 'make_handle': 0, 'pack': 'zip' }) return { 'shock_id': output['shock_id'], 'name': zip_file_name, 'description': zip_file_description, 'index_html_file_name': index_html_file } def get_type_from_obj_info(self, info): return info[2].split('-')[0] def get_obj_info(self, ref): return self.ws.get_object_info3({'objects': [{ 'ref': ref }]})['infos'][0]
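# --- Illustrative note (not part of the module): create_multi_qualimap_cfg()
# above writes a plain tab-separated config for 'multi-bamqc', one alignment
# per line, with a third column only when at least one alignment in the set
# carries a label. Names and paths below are hypothetical:
#
#   sample_1<TAB>/scratch/aln_1/accepted_hits.bam<TAB>treatment
#   sample_2<TAB>/scratch/aln_2/accepted_hits.bam<TAB>unlabeled
#
example_cfg_row = 'sample_1' + '\t' + '/scratch/aln_1/accepted_hits.bam' + '\t' + 'treatment'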
def run_VariationAnalyzer(self, ctx, params): """ This example function accepts any number of parameters and returns results in a KBaseReport :param params: instance of type "InputParams" -> structure: parameter "obj_name" of String, parameter "workspace_name" of String, parameter "fastq_ref" of String, parameter "map_qual" of Long, parameter "base_qual" of Long, parameter "min_cov" of Long, parameter "min_qual" of Long :returns: instance of type "ReportResults" -> structure: parameter "report_name" of String, parameter "report_ref" of String """ # ctx is the context object # return variables are: output #BEGIN run_VariationAnalyzer self.su.validate_params(params) logging.info("Downloading Fastq File") fastq_file = self.dfu._stage_input_file(params['fastq_ref'], "paired_end") logging.info("Downloading assembly file") genome_assembly = self.dfu.download_genome( params['genome_or_assembly_ref']) self.su.deinterleave(fastq_file['files']['fwd'], self.shared_folder) sample_name = "snippy_output" #hardcoded to match with attribute mapping file snippy_output = self.shared_folder + "/" + sample_name cmd = self.su.build_snippy_command(genome_assembly['path'], snippy_output, self.shared_folder) self.su.run_snippy_command(cmd) params[ 'vcf_staging_file_path'] = self.shared_folder + "/" + sample_name + "/snps.vcf" self.vu.save_variation_from_vcf(params) report = KBaseReport(self.callback_url) report_info = report.create({ 'report': { 'objects_created': [], 'text_message': params['fastq_ref'] }, 'workspace_name': params['workspace_name'] }) output = { 'report_name': report_info['name'], 'report_ref': report_info['ref'], } #END run_VariationAnalyzer # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError('Method run_VariationAnalyzer return value ' + 'output is not type dict as required.') # return the results return [output]
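# --- Illustrative sketch (not part of the module): the SnippyUtils helper
# called above (self.su.deinterleave) is defined elsewhere; this is an
# assumed, minimal version of what splitting an interleaved FASTQ entails
# (4 lines per record, mates alternating). Output file names are hypothetical.
import os

def deinterleave_fastq(interleaved_path, out_dir):
    fwd_path = os.path.join(out_dir, 'reads_1.fastq')
    rev_path = os.path.join(out_dir, 'reads_2.fastq')
    with open(interleaved_path) as src, \
            open(fwd_path, 'w') as fwd, open(rev_path, 'w') as rev:
        for i, line in enumerate(src):
            # records alternate: lines 0-3 forward mate, lines 4-7 reverse mate
            (fwd if (i // 4) % 2 == 0 else rev).write(line)
    return fwd_path, rev_path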
class nmdc_mg_assembly:
    def __init__(self, callback_url, scratch, wdl='../../metaAssembly/'):
        self.callback_url = callback_url
        self.scratch = scratch
        self.special = special(self.callback_url)
        self.ru = ReadsUtils(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.report = KBaseReport(self.callback_url)
        self.wdl_base = wdl

    def validate_params(self, params):
        pass

    def fetch_reads_files(self, reads_upas):
        """
        From a list of reads UPAs, uses ReadsUtils to fetch the reads as files.
        Returns them as a dictionary from reads_upa -> filename
        """
        if reads_upas is None:
            raise ValueError("reads_upas must be a list of UPAs")
        if len(reads_upas) == 0:
            raise ValueError("reads_upas must contain at least one UPA")
        reads_info = self.ru.download_reads(({
            'read_libraries': reads_upas,
            'interleaved': 'true',
            'gzipped': None
        }))['files']
        file_set = dict()
        for reads in reads_info:
            file_set[reads] = reads_info[reads]['files']['fwd']
        return file_set

    def run_wdl(self, rf):
        print(os.getcwd())
        wdl_files = ['jgi_assembly.wdl']
        for f in wdl_files:
            src = self.wdl_base + f
            dst = './' + f
            shutil.copy(src, dst)
        ins = {
            "jgi_metaASM.input_file": [rf.replace(self.scratch, './')],
            "jgi_metaASM.rename_contig_prefix": "contig",
            "jgi_metaASM.outdir": "/out/"
        }
        input_file = os.path.join(self.scratch, 'inputs.json')
        with open(input_file, 'w') as f:
            f.write(json.dumps(ins))
        p = {'workflow': wdl_files[0], 'inputs': 'inputs.json'}
        res = self.special.wdl(p)
        print('wdl: ' + str(res))

    def upload_assembly(self, file_path, workspace_name, assembly_name):
        """
        Uploads the assembly file at file_path to KBase, generates an Assembly
        object, and returns the generated UPA.
        """
        if not file_path:
            raise ValueError("file_path must be defined")
        if not os.path.exists(file_path):
            raise ValueError("The given assembly file '{}' does not exist".format(file_path))
        if not workspace_name:
            raise ValueError("workspace_name must be defined")
        if not assembly_name:
            raise ValueError("assembly_name must be defined")
        assembly_upa = self.au.save_assembly_from_fasta({
            "file": {"path": file_path},
            "workspace_name": workspace_name,
            "assembly_name": assembly_name
        })
        return assembly_upa

    def _upload_pipeline_result(self, pipeline_result, workspace_name, assembly_name,
                                filtered_reads_name=None, cleaned_reads_name=None,
                                skip_rqcfilter=False, input_reads=None):
        """
        This is very tricky and uploads (optionally!) a few things under different cases.
        1. Uploads assembly - this always happens after a successful run.
        2. Cleaned reads - passed RQCFilter / BFC / SeqTK - optional, if
           cleaned_reads_name isn't None
        3. Filtered reads - passed RQCFilter - optional, if filtered_reads_name
           isn't None AND skip_rqcfilter is False
        returns a dict of UPAs with the following keys:
        - assembly_upa - the assembly (always)
        - filtered_reads_upa - the RQCFiltered reads (optionally)
        - cleaned_reads_upa - the RQCFiltered -> BFC -> SeqTK cleaned reads (optional)
        """
        # upload the assembly
        uploaded_assy_upa = self.file_util.upload_assembly(
            pipeline_result["spades"]["contigs_file"], workspace_name, assembly_name)
        upload_result = {"assembly_upa": uploaded_assy_upa}
        # upload filtered reads if we didn't skip RQCFilter (otherwise it's just a copy)
        if filtered_reads_name and not skip_rqcfilter:
            # unzip the cleaned reads because ReadsUtils won't do it for us.
decompressed_reads = os.path.join(self.output_dir, "filtered_reads.fastq") pigz_command = "{} -d -c {} > {}".format( PIGZ, pipeline_result["rqcfilter"]["filtered_fastq_file"], decompressed_reads) p = subprocess.Popen(pigz_command, cwd=self.scratch_dir, shell=True) exit_code = p.wait() if exit_code != 0: raise RuntimeError( "Unable to decompress filtered reads for validation! Can't upload them, either!" ) filtered_reads_upa = self.file_util.upload_reads( decompressed_reads, workspace_name, filtered_reads_name, input_reads) upload_result["filtered_reads_upa"] = filtered_reads_upa # upload the cleaned reads if cleaned_reads_name: # unzip the cleaned reads because ReadsUtils won't do it for us. decompressed_reads = os.path.join(self.output_dir, "cleaned_reads.fastq") pigz_command = "{} -d -c {} > {}".format( PIGZ, pipeline_result["seqtk"]["cleaned_reads"], decompressed_reads) p = subprocess.Popen(pigz_command, cwd=self.scratch_dir, shell=True) exit_code = p.wait() if exit_code != 0: raise RuntimeError( "Unable to decompress cleaned reads for validation! Can't upload them, either!" ) cleaned_reads_upa = self.file_util.upload_reads( decompressed_reads, workspace_name, cleaned_reads_name, input_reads) upload_result["cleaned_reads_upa"] = cleaned_reads_upa return upload_result def assemble(self, params): self.validate_params(params) workspace_name = params['workspace_name'] assembly_name = params['output_assembly_name'] # Stage Data files = self.fetch_reads_files([params["reads_upa"]]) reads_files = list(files.values()) # Run WDL self.run_wdl(reads_files[0]) # Check if things ran mfile = os.path.join(self.scratch, 'meta.json') print(mfile) if not os.path.exists(mfile): raise OSError("Failed to run workflow") with open(mfile) as f: pipeline_output = json.loads(f.read()) out = pipeline_output["calls"]["jgi_metaASM.create_agp"][0]["outputs"] print(out) # Generate Output Objects contigs_fn = out['outcontigs'] upa = self.upload_assembly(contigs_fn, workspace_name, assembly_name) upload_kwargs = {} print("upload complete") # Do report report_info = self.report.create({ 'report': { 'objects_created': [], 'text_message': "Assemble metagenomic reads" }, 'workspace_name': workspace_name }) return { 'report_name': report_info['name'], 'report_ref': report_info['ref'], }
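# --- Illustrative sketch (not part of the module): the meta.json shape that
# assemble() above reads after the WDL run; only the keys it actually
# dereferences are shown, and the contig path is hypothetical.
example_meta = {
    "calls": {
        "jgi_metaASM.create_agp": [
            {"outputs": {"outcontigs": "/out/assembly.contigs.fasta"}}
        ]
    }
}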
def run_Gblocks(self, ctx, params): """ Method for trimming MSAs of either DNA or PROTEIN sequences ** ** input_type: MSA ** output_type: MSA :param params: instance of type "Gblocks_Params" (Gblocks Input Params) -> structure: parameter "workspace_name" of type "workspace_name" (** The workspace object refs are of form: ** ** objects = ws.get_objects([{'ref': params['workspace_id']+'/'+params['obj_name']}]) ** ** "ref" means the entire name combining the workspace id and the object name ** "id" is a numerical identifier of the workspace or object, and should just be used for workspace ** "name" is a string identifier of a workspace or object. This is received from Narrative.), parameter "desc" of String, parameter "input_ref" of type "data_obj_ref", parameter "output_name" of type "data_obj_name", parameter "trim_level" of Long, parameter "min_seqs_for_conserved" of Long, parameter "min_seqs_for_flank" of Long, parameter "max_pos_contig_nonconserved" of Long, parameter "min_block_len" of Long, parameter "remove_mask_positions_flag" of Long :returns: instance of type "Gblocks_Output" (Gblocks Output) -> structure: parameter "report_name" of type "data_obj_name", parameter "report_ref" of type "data_obj_ref" """ # ctx is the context object # return variables are: returnVal #BEGIN run_Gblocks console = [] invalid_msgs = [] self.log(console,'Running run_Gblocks with params=') self.log(console, "\n"+pformat(params)) report = '' # report = 'Running run_Gblocks with params=' # report += "\n"+pformat(params) #### do some basic checks # if 'workspace_name' not in params: raise ValueError('workspace_name parameter is required') if 'input_ref' not in params: raise ValueError('input_ref parameter is required') if 'output_name' not in params: raise ValueError('output_name parameter is required') #### Get the input_ref MSA object ## try: ws = workspaceService(self.workspaceURL, token=ctx['token']) objects = ws.get_objects([{'ref': params['input_ref']}]) data = objects[0]['data'] info = objects[0]['info'] input_name = info[1] input_type_name = info[2].split('.')[1].split('-')[0] except Exception as e: raise ValueError('Unable to fetch input_ref object from workspace: ' + str(e)) #to get the full stack trace: traceback.format_exc() if input_type_name == 'MSA': MSA_in = data row_order = [] default_row_labels = dict() if 'row_order' in MSA_in.keys(): row_order = MSA_in['row_order'] else: row_order = sorted(MSA_in['alignment'].keys()) if 'default_row_labels' in MSA_in.keys(): default_row_labels = MSA_in['default_row_labels'] else: for row_id in row_order: default_row_labels[row_id] = row_id if len(row_order) < 2: self.log(invalid_msgs,"must have multiple records in MSA: "+params['input_ref']) # export features to FASTA file input_MSA_file_path = os.path.join(self.scratch, input_name+".fasta") self.log(console, 'writing fasta file: '+input_MSA_file_path) records = [] for row_id in row_order: #self.log(console,"row_id: '"+row_id+"'") # DEBUG #self.log(console,"alignment: '"+MSA_in['alignment'][row_id]+"'") # DEBUG # using SeqIO makes multiline sequences. 
(Gblocks doesn't care, but FastTree doesn't like multiline, and I don't care enough to change code) #record = SeqRecord(Seq(MSA_in['alignment'][row_id]), id=row_id, description=default_row_labels[row_id]) #records.append(record) #SeqIO.write(records, input_MSA_file_path, "fasta") records.extend(['>'+row_id, MSA_in['alignment'][row_id] ]) with open(input_MSA_file_path,'w',0) as input_MSA_file_handle: input_MSA_file_handle.write("\n".join(records)+"\n") # Determine whether nuc or protein sequences # NUC_MSA_pattern = re.compile("^[\.\-_ACGTUXNRYSWKMBDHVacgtuxnryswkmbdhv \t\n]+$") all_seqs_nuc = True for row_id in row_order: #self.log(console, row_id+": '"+MSA_in['alignment'][row_id]+"'") if NUC_MSA_pattern.match(MSA_in['alignment'][row_id]) == None: all_seqs_nuc = False break # Missing proper input_type # else: raise ValueError('Cannot yet handle input_ref type of: '+type_name) # DEBUG: check the MSA file contents # with open(input_MSA_file_path, 'r', 0) as input_MSA_file_handle: # for line in input_MSA_file_handle: # #self.log(console,"MSA_LINE: '"+line+"'") # too big for console # self.log(invalid_msgs,"MSA_LINE: '"+line+"'") # validate input data # N_seqs = 0 L_first_seq = 0 with open(input_MSA_file_path, 'r', 0) as input_MSA_file_handle: for line in input_MSA_file_handle: if line.startswith('>'): N_seqs += 1 continue if L_first_seq == 0: for c in line: if c != '-' and c != ' ' and c != "\n": L_first_seq += 1 # min_seqs_for_conserved if 'min_seqs_for_conserved' in params and params['min_seqs_for_conserved'] != None and int(params['min_seqs_for_conserved']) != 0: if int(params['min_seqs_for_conserved']) < int(0.5*N_seqs)+1: self.log(invalid_msgs,"Min Seqs for Conserved Pos ("+str(params['min_seqs_for_conserved'])+") must be >= N/2+1 (N="+str(N_seqs)+", N/2+1="+str(int(0.5*N_seqs)+1)+")\n") if int(params['min_seqs_for_conserved']) > int(params['min_seqs_for_flank']): self.log(invalid_msgs,"Min Seqs for Conserved Pos ("+str(params['min_seqs_for_conserved'])+") must be <= Min Seqs for Flank Pos ("+str(params['min_seqs_for_flank'])+")\n") # min_seqs_for_flank if 'min_seqs_for_flank' in params and params['min_seqs_for_flank'] != None and int(params['min_seqs_for_flank']) != 0: if int(params['min_seqs_for_flank']) > N_seqs: self.log(invalid_msgs,"Min Seqs for Flank Pos ("+str(params['min_seqs_for_flank'])+") must be <= N (N="+str(N_seqs)+")\n") # max_pos_contig_nonconserved if 'max_pos_contig_nonconserved' in params and params['max_pos_contig_nonconserved'] != None and int(params['max_pos_contig_nonconserved']) != 0: if int(params['max_pos_contig_nonconserved']) < 0: self.log(invalid_msgs,"Max Num Non-Conserved Pos ("+str(params['max_pos_contig_nonconserved'])+") must be >= 0"+"\n") if int(params['max_pos_contig_nonconserved']) > L_first_seq or int(params['max_pos_contig_nonconserved']) >= 32000: self.log(invalid_msgs,"Max Num Non-Conserved Pos ("+str(params['max_pos_contig_nonconserved'])+") must be <= L first seq ("+str(L_first_seq)+") and < 32000\n") # min_block_len if 'min_block_len' in params and params['min_block_len'] != None and int(params['min_block_len']) != 0: if int(params['min_block_len']) < 2: self.log(invalid_msgs,"Min Block Len ("+str(params['min_block_len'])+") must be >= 2"+"\n") if int(params['min_block_len']) > L_first_seq or int(params['min_block_len']) >= 32000: self.log(invalid_msgs,"Min Block Len ("+str(params['min_block_len'])+") must be <= L first seq ("+str(L_first_seq)+") and < 32000\n") # trim_level if 'trim_level' in params and params['trim_level'] != None and 
int(params['trim_level']) != 0: if int(params['trim_level']) < 0 or int(params['trim_level']) > 2: self.log(invalid_msgs,"Trim Level ("+str(params['trim_level'])+") must be >= 0 and <= 2"+"\n") if len(invalid_msgs) > 0: # load the method provenance from the context object self.log(console,"SETTING PROVENANCE") # DEBUG provenance = [{}] if 'provenance' in ctx: provenance = ctx['provenance'] # add additional info to provenance here, in this case the input data object reference provenance[0]['input_ws_objects'] = [] provenance[0]['input_ws_objects'].append(params['input_ref']) provenance[0]['service'] = 'kb_gblocks' provenance[0]['method'] = 'run_Gblocks' # report report += "FAILURE\n\n"+"\n".join(invalid_msgs)+"\n" reportObj = { 'objects_created':[], 'text_message':report } reportName = 'gblocks_report_'+str(uuid.uuid4()) report_obj_info = ws.save_objects({ # 'id':info[6], 'workspace':params['workspace_name'], 'objects':[ { 'type':'KBaseReport.Report', 'data':reportObj, 'name':reportName, 'meta':{}, 'hidden':1, 'provenance':provenance } ] })[0] self.log(console,"BUILDING RETURN OBJECT") returnVal = { 'report_name': reportName, 'report_ref': str(report_obj_info[6]) + '/' + str(report_obj_info[0]) + '/' + str(report_obj_info[4]) # 'output_ref': None } self.log(console,"run_Gblocks DONE") return [returnVal] ### Construct the command # # e.g. # for "0.5" gaps: cat "o\n<MSA_file>\nb\n5\ng\nm\nq\n" | Gblocks # for "all" gaps: cat "o\n<MSA_file>\nb\n5\n5\ng\nm\nq\n" | Gblocks # gblocks_cmd = [self.GBLOCKS_bin] # check for necessary files if not os.path.isfile(self.GBLOCKS_bin): raise ValueError("no such file '"+self.GBLOCKS_bin+"'") if not os.path.isfile(input_MSA_file_path): raise ValueError("no such file '"+input_MSA_file_path+"'") if not os.path.getsize(input_MSA_file_path) > 0: raise ValueError("empty file '"+input_MSA_file_path+"'") # DEBUG # with open(input_MSA_file_path,'r',0) as input_MSA_file_handle: # for line in input_MSA_file_handle: # #self.log(console,"MSA LINE: '"+line+"'") # too big for console # self.log(invalid_msgs,"MSA LINE: '"+line+"'") # set the output path timestamp = int((datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds()*1000) output_dir = os.path.join(self.scratch,'output.'+str(timestamp)) if not os.path.exists(output_dir): os.makedirs(output_dir) # Gblocks names output blocks MSA by appending "-gb" to input file #output_GBLOCKS_file_path = os.path.join(output_dir, input_name+'-gb') output_GBLOCKS_file_path = input_MSA_file_path+'-gb' output_aln_file_path = output_GBLOCKS_file_path # Gblocks is interactive and only accepts args from pipe input #if 'arg' in params and params['arg'] != None and params['arg'] != 0: # fasttree_cmd.append('-arg') # fasttree_cmd.append(val) # Run GBLOCKS, capture output as it happens # self.log(console, 'RUNNING GBLOCKS:') self.log(console, ' '+' '.join(gblocks_cmd)) # report += "\n"+'running GBLOCKS:'+"\n" # report += ' '+' '.join(gblocks_cmd)+"\n" # FastTree requires shell=True in order to see input data env = os.environ.copy() #joined_fasttree_cmd = ' '.join(fasttree_cmd) # redirect out doesn't work with subprocess unless you join command first #p = subprocess.Popen([joined_fasttree_cmd], \ p = subprocess.Popen(gblocks_cmd, \ cwd = self.scratch, \ stdin = subprocess.PIPE, \ stdout = subprocess.PIPE, \ stderr = subprocess.PIPE, \ shell = True, \ env = env) # executable = '/bin/bash' ) # write commands to process # # for "0.5" gaps: cat "o\n<MSA_file>\nb\n5\ng\nm\nq\n" | Gblocks # for "all" gaps: cat 
"o\n<MSA_file>\nb\n5\n5\ng\nm\nq\n" | Gblocks p.stdin.write("o"+"\n") # open MSA file p.stdin.write(input_MSA_file_path+"\n") if 'trim_level' in params and params['trim_level'] != None and int(params['trim_level']) != 0: p.stdin.write("b"+"\n") if int(params['trim_level']) >= 1: self.log (console,"changing trim level") p.stdin.write("5"+"\n") # set to "half" if int(params['trim_level']) == 2: self.log (console,"changing trim level") p.stdin.write("5"+"\n") # set to "all" elif int(params['trim_level']) > 2: raise ValueError ("trim_level ("+str(params['trim_level'])+") was not between 0-2") p.stdin.write("m"+"\n") # flank must precede conserved because it acts us upper bound for acceptable conserved values if 'min_seqs_for_flank' in params and params['min_seqs_for_flank'] != None and int(params['min_seqs_for_flank']) != 0: self.log (console,"changing min_seqs_for_flank") p.stdin.write("b"+"\n") p.stdin.write("2"+"\n") p.stdin.write(str(params['min_seqs_for_flank'])+"\n") p.stdin.write("m"+"\n") if 'min_seqs_for_conserved' in params and params['min_seqs_for_conserved'] != None and int(params['min_seqs_for_conserved']) != 0: self.log (console,"changing min_seqs_for_conserved") p.stdin.write("b"+"\n") p.stdin.write("1"+"\n") p.stdin.write(str(params['min_seqs_for_conserved'])+"\n") p.stdin.write("m"+"\n") if 'max_pos_contig_nonconserved' in params and params['max_pos_contig_nonconserved'] != None and int(params['max_pos_contig_nonconserved']) > -1: self.log (console,"changing max_pos_contig_nonconserved") p.stdin.write("b"+"\n") p.stdin.write("3"+"\n") p.stdin.write(str(params['max_pos_contig_nonconserved'])+"\n") p.stdin.write("m"+"\n") if 'min_block_len' in params and params['min_block_len'] != None and params['min_block_len'] != 0: self.log (console,"changing min_block_len") p.stdin.write("b"+"\n") p.stdin.write("4"+"\n") p.stdin.write(str(params['min_block_len'])+"\n") p.stdin.write("m"+"\n") p.stdin.write("g"+"\n") # get blocks p.stdin.write("q"+"\n") # quit p.stdin.close() p.wait() # Read output # while True: line = p.stdout.readline() #line = p.stderr.readline() if not line: break self.log(console, line.replace('\n', '')) p.stdout.close() #p.stderr.close() p.wait() self.log(console, 'return code: ' + str(p.returncode)) # if p.returncode != 0: if p.returncode != 1: raise ValueError('Error running GBLOCKS, return code: '+str(p.returncode) + '\n\n'+ '\n'.join(console)) # Check that GBLOCKS produced output # if not os.path.isfile(output_GBLOCKS_file_path): raise ValueError("failed to create GBLOCKS output: "+output_GBLOCKS_file_path) elif not os.path.getsize(output_GBLOCKS_file_path) > 0: raise ValueError("created empty file for GBLOCKS output: "+output_GBLOCKS_file_path) # load the method provenance from the context object # self.log(console,"SETTING PROVENANCE") # DEBUG provenance = [{}] if 'provenance' in ctx: provenance = ctx['provenance'] # add additional info to provenance here, in this case the input data object reference provenance[0]['input_ws_objects'] = [] provenance[0]['input_ws_objects'].append(params['input_ref']) provenance[0]['service'] = 'kb_gblocks' provenance[0]['method'] = 'run_Gblocks' # reformat output to single-line FASTA MSA and check that output not empty (often happens when param combinations don't produce viable blocks # output_fasta_buf = [] id_order = [] this_id = None ids = dict() alignment = dict() L_alignment = 0; L_alignment_set = False with open(output_GBLOCKS_file_path,'r',0) as output_GBLOCKS_file_handle: for line in output_GBLOCKS_file_handle: line = 
        line.rstrip()
        if line.startswith('>'):
            this_id = line[1:]
            output_fasta_buf.append('>' + re.sub('\s', '_', default_row_labels[this_id]))
            id_order.append(this_id)
            alignment[this_id] = ''
            if L_alignment != 0 and not L_alignment_set:
                L_alignment_set = True
            continue
        output_fasta_buf.append(line)
        for c in line:
            if c != ' ' and c != "\n":
                alignment[this_id] += c
                if not L_alignment_set:
                    L_alignment += 1

if L_alignment == 0:
    self.log(invalid_msgs, "params produced no blocks. Consider changing to less stringent values")
else:
    if 'remove_mask_positions_flag' in params and \
            params['remove_mask_positions_flag'] != None and \
            params['remove_mask_positions_flag'] != '' and \
            params['remove_mask_positions_flag'] == 1:
        self.log(console, "removing mask positions")
        mask = []
        new_alignment = dict()
        for i in range(0, L_alignment):
            # grow the mask with append(); a position is masked out when the
            # first row shows a gap or an X at that column
            mask_char = '+'
            if alignment[id_order[0]][i] == '-' \
                    or alignment[id_order[0]][i] == 'X' \
                    or alignment[id_order[0]][i] == 'x':
                mask_char = '-'
            mask.append(mask_char)
        for row_id in id_order:
            new_alignment[row_id] = ''
            for i, c in enumerate(alignment[row_id]):
                if mask[i] == '+':
                    new_alignment[row_id] += c
        alignment = new_alignment
        L_alignment = len(alignment[id_order[0]])

# write fasta with tidied ids
output_MSA_file_path = os.path.join(output_dir, params['output_name'] + '.fasta')
with open(output_MSA_file_path, 'w', 0) as output_MSA_file_handle:
    output_MSA_file_handle.write("\n".join(output_fasta_buf) + "\n")

# Upload results
#
if len(invalid_msgs) == 0:
    self.log(console, "UPLOADING RESULTS")  # DEBUG

    # Didn't write file
    # with open(output_MSA_file_path,'r',0) as output_MSA_file_handle:
    #     output_MSA_buf = output_MSA_file_handle.read()
    # output_MSA_buf = output_MSA_buf.rstrip()
    # self.log(console,"\nMSA:\n"+output_MSA_buf+"\n")

    # Build output_MSA structure
    #   first extract old info from MSA (labels, ws_refs, etc.)
    #
    MSA_out = dict()
    for key in MSA_in.keys():
        MSA_out[key] = MSA_in[key]

    # then replace with new info
    #
    MSA_out['alignment'] = alignment
    MSA_out['name'] = params['output_name']
    MSA_out['alignment_length'] = alignment_length = L_alignment
    MSA_name = params['output_name']
    MSA_description = ''
    if 'desc' in params and params['desc'] != None and params['desc'] != '':
        MSA_out['desc'] = MSA_description = params['desc']

    # Store MSA_out
    #
    new_obj_info = ws.save_objects({
        'workspace': params['workspace_name'],
        'objects': [{
            'type': 'KBaseTrees.MSA',
            'data': MSA_out,
            'name': params['output_name'],
            'meta': {},
            'provenance': provenance
        }]
    })[0]

    # create CLW formatted output file
    max_row_width = 60
    id_aln_gap_width = 1
    gap_chars = ''
    for sp_i in range(id_aln_gap_width):
        gap_chars += ' '
    # DNA
    if all_seqs_nuc:
        strong_groups = {'AG': True, 'CTU': True}
        weak_groups = None
    # PROTEINS
    else:
        strong_groups = {'AST': True, 'EKNQ': True, 'HKNQ': True, 'DENQ': True,
                         'HKQR': True, 'ILMV': True, 'FILM': True, 'HY': True,
                         'FWY': True}
        weak_groups = {'ACS': True, 'ATV': True, 'AGS': True, 'KNST': True,
                       'APST': True, 'DGNS': True, 'DEKNQS': True,
                       'DEHKNQ': True, 'EHKNQR': True, 'FILMV': True,
                       'FHY': True}

    clw_buf = []
    clw_buf.append('CLUSTALW format of GBLOCKS trimmed MSA ' + MSA_name + ': ' + MSA_description)
    clw_buf.append('')
    long_id_len = 0
    aln_pos_by_id = dict()
    for row_id in row_order:
        aln_pos_by_id[row_id] = 0
        row_id_disp = default_row_labels[row_id]
        if long_id_len < len(row_id_disp):
            long_id_len = len(row_id_disp)
    full_row_cnt = alignment_length // max_row_width
    if alignment_length % max_row_width == 0:
        full_row_cnt -= 1
    for chunk_i in range(full_row_cnt + 1):
        for row_id in row_order:
            row_id_disp = re.sub('\s', '_', default_row_labels[row_id])
            for sp_i in range(long_id_len - len(row_id_disp)):
                row_id_disp += ' '
            aln_chunk_upper_bound = (chunk_i + 1) * max_row_width
            if aln_chunk_upper_bound > alignment_length:
                aln_chunk_upper_bound = alignment_length
            aln_chunk = alignment[row_id][chunk_i * max_row_width:aln_chunk_upper_bound]
            for c in aln_chunk:
                if c != '-':
                    aln_pos_by_id[row_id] += 1
            clw_buf.append(row_id_disp + gap_chars + aln_chunk + ' ' + str(aln_pos_by_id[row_id]))

        # conservation line
        cons_line = ''
        for pos_i in range(chunk_i * max_row_width, aln_chunk_upper_bound):
            col_chars = dict()
            seq_cnt = 0
            for row_id in row_order:
                char = alignment[row_id][pos_i]
                if char != '-':
                    seq_cnt += 1
                    col_chars[char] = True
            if seq_cnt <= 1:
                cons_char = ' '
            elif len(col_chars.keys()) == 1:
                cons_char = '*'
            else:
                strong = False
                for strong_group in strong_groups.keys():
                    this_strong_group = True
                    for seen_char in col_chars.keys():
                        if seen_char not in strong_group:
                            this_strong_group = False
                            break
                    if this_strong_group:
                        strong = True
                        break
                if not strong:
                    weak = False
                    if weak_groups != None:
                        for weak_group in weak_groups.keys():
                            this_weak_group = True
                            for seen_char in col_chars.keys():
                                if seen_char not in weak_group:
                                    this_weak_group = False
                                    break
                            if this_weak_group:
                                weak = True
                if strong:
                    cons_char = ':'
                elif weak:
                    cons_char = '.'
                else:
                    cons_char = ' '
            cons_line += cons_char

        lead_space = ''
        for sp_i in range(long_id_len):
            lead_space += ' '
        lead_space += gap_chars
        clw_buf.append(lead_space + cons_line)
        clw_buf.append('')

    # write clw to file
    clw_buf_str = "\n".join(clw_buf) + "\n"
    output_clw_file_path = os.path.join(output_dir, input_name + '-MSA.clw')
    with open(output_clw_file_path, "w", 0) as output_clw_file_handle:
        output_clw_file_handle.write(clw_buf_str)

    # upload GBLOCKS FASTA output to SHOCK for file_links
    dfu = DFUClient(self.callbackURL)
    try:
        output_upload_ret = dfu.file_to_shock({'file_path': output_aln_file_path,
                                               'make_handle': 0})
    except Exception:
        raise ValueError('error loading aln_out file to shock')
    # upload GBLOCKS CLW output to SHOCK for file_links
    try:
        output_clw_upload_ret = dfu.file_to_shock({'file_path': output_clw_file_path,
                                                   'make_handle': 0})
    except Exception:
        raise ValueError('error loading clw_out file to shock')

    # make HTML reports
    #
    # HERE

    # build output report object
    #
    self.log(console, "BUILDING REPORT")  # DEBUG
    reportName = 'gblocks_report_' + str(uuid.uuid4())
    reportObj = {'objects_created': [{'ref': params['workspace_name'] + '/' + params['output_name'],
                                      'description': 'GBLOCKS MSA'}],
                 'message': clw_buf_str,
                 'direct_html': '',
                 'file_links': [],
                 'html_links': [],
                 'workspace_name': params['workspace_name'],
                 'report_object_name': reportName
                 }
    reportObj['file_links'] = [{'shock_id': output_upload_ret['shock_id'],
                                'name': params['output_name'] + '-GBLOCKS.FASTA',
                                'label': 'GBLOCKS-trimmed MSA FASTA'
                                },
                               {'shock_id': output_clw_upload_ret['shock_id'],
                                'name': params['output_name'] + '-GBLOCKS.CLW',
                                'label': 'GBLOCKS-trimmed MSA CLUSTALW'
                                }]

    # save report object
    #
    SERVICE_VER = 'release'
    reportClient = KBaseReport(self.callbackURL, token=ctx['token'], service_ver=SERVICE_VER)
    report_info = reportClient.create_extended_report(reportObj)

else:  # len(invalid_msgs) > 0
    reportName = 'gblocks_report_' + str(uuid.uuid4())
    report += "FAILURE:\n\n" + "\n".join(invalid_msgs) + "\n"
    reportObj = {
        'objects_created': [],
        'text_message': report
    }
    ws = workspaceService(self.workspaceURL, token=ctx['token'])
    report_obj_info = ws.save_objects({
        'workspace': params['workspace_name'],
        'objects': [
            {
                'type': 'KBaseReport.Report',
                'data': reportObj,
                'name': reportName,
                'meta': {},
                'hidden': 1,
                'provenance': provenance
            }
        ]
    })[0]
    report_info = dict()
    report_info['name'] = report_obj_info[1]
    report_info['ref'] = str(report_obj_info[6]) + '/' + str(report_obj_info[0]) + '/' + str(report_obj_info[4])

# done
returnVal = {
    'report_name': report_info['name'],
    'report_ref': report_info['ref']
}
self.log(console, "run_Gblocks DONE")
#END run_Gblocks

# At some point might do deeper type checking...
if not isinstance(returnVal, dict):
    raise ValueError('Method run_Gblocks return value ' +
                     'returnVal is not type dict as required.')
# return the results
return [returnVal]
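# --- Illustrative sketch (not part of the module): run_Gblocks above drives
# Gblocks' interactive menu by writing keystrokes to its stdin rather than
# passing CLI flags. The same pattern with a hypothetical interactive tool:
import subprocess

p = subprocess.Popen(['some_interactive_tool'],
                     stdin=subprocess.PIPE,
                     stdout=subprocess.PIPE,
                     stderr=subprocess.PIPE)
for keystroke in ['o', 'input.fasta', 'g', 'q']:  # menu choices, then quit
    p.stdin.write((keystroke + '\n').encode())
p.stdin.close()
p.wait()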