class MotifSaver:
    """Save KBaseGeneRegulation.MotifSet data to a workspace via DataFileUtil."""

    _MOTIFSET_TYPE = 'KBaseGeneRegulation.MotifSet'

    def __init__(self, callback, scratch):
        """Keep the scratch path, create the DataFileUtil client and set up logging.

        :param callback: callback URL handed to DataFileUtil
        :param scratch: scratch directory path (stored, not used by this class)
        """
        self.scratch = scratch
        self.dfu = DataFileUtil(callback)
        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)

    def _save_single(self, motifset, params):
        """Save one motif set dict under a random UUID name and return its reference.

        The reference is built from positions 6, 0 and 4 of the object info
        returned by ``save_objects``.
        """
        obj = self.dfu.save_objects({
            'id': self.dfu.ws_name_to_id(params['ws_name']),
            'objects': [{
                'type': self._MOTIFSET_TYPE,
                'data': motifset,
                'name': str(uuid.uuid4())
            }]
        })[0]
        return str(obj[6]) + "/" + str(obj[0]) + "/" + str(obj[4])

    def saveMotifSet(self, motifset, params):
        """Save a motif set to the workspace named in ``params['ws_name']``.

        :param motifset: a single motif set dict, or a list of them. For a
            list only the FIRST element is saved (list support is still a
            TODO); a warning is logged when further elements are ignored.
        :param params: dict that must contain ``ws_name``
        :returns: reference string of the saved object
        :raises ValueError: if ``motifset`` is an empty list or is neither a
            list nor a dict
        """
        if isinstance(motifset, list):
            logging.info('Saving multiple motifset objects...')
            # TODO: accept lists of constructed motif set object
            # TODO: check if list is a save_objects list or list of motifsets process accordingly
            # TODO: accept list of object names
            if not motifset:
                # Fail before touching the workspace instead of hitting a bare
                # IndexError on motifset[0].
                raise ValueError('Input to motif saver is an empty list; '
                                 'nothing to save')
            if len(motifset) > 1:
                logging.warning(
                    'Only the first of %d motifset objects will be saved',
                    len(motifset))
            return self._save_single(motifset[0], params)

        if isinstance(motifset, dict):
            logging.info('Saving a single motifset object...')
            # TODO: accept object name
            return self._save_single(motifset, params)

        raise ValueError(
            'Input to motif saver should be either: \n'
            '1. a list of constructed KBaseGeneRegulation.MotifSet objects (dictionary)\n'
            '2. a single KBaseGeneRegulation.MotifSet object (dictionary)')
def test_AssemblySet_input(self):
    """Run get_fastas on a KBaseSets.AssemblySet built from one FASTA assembly.

    An assembly is saved from a FASTA file, wrapped in an AssemblySet object
    that is saved to the test workspace, and the set's reference is passed to
    ``get_fastas``. The result is fetched but not asserted on.
    """
    dfu = DataFileUtil(self.callback_url)

    # Resolve the workspace used by this test run
    wsName = self.getWsName()
    workspace_id = dfu.ws_name_to_id(wsName)

    # FASTA to assembly object
    fasta_file = {
        "path": "/kb/module/work/tmp/NC_021490.fasta",
        "assembly_name": "test_assembly"
    }
    ref = self.getImpl().save_assembly_from_fasta(self.ctx, {
        "file": fasta_file,
        "workspace_name": wsName,
        "assembly_name": "test_assembly"
    })

    # Assembly set payload holding the single assembly saved above
    set_item = {"label": "assemblySetTest", "ref": ref[0]}
    assembly_set_data = {"description": " ", "items": [set_item]}

    # Save the assembly set through DataFileUtil
    saved = dfu.save_objects({
        'id': workspace_id,
        'objects': [{
            "type": "KBaseSets.AssemblySet",
            "data": assembly_set_data,
            "name": "Assembly_Test"
        }]
    })
    info = saved[0]
    assembly_set_ref = [f"{info[6]}/{info[0]}/{info[4]}"]

    # Get FASTA
    ret = self.getImpl().get_fastas(self.callback_url, assembly_set_ref)
def UploadFromMdscan(self, callback_url, params):
    """
    Parse MDscan output and save it as a KBaseGeneRegulation.MotifSet object.

    :param callback_url: callback URL used to create the DataFileUtil client
    :param params: instance of type "UploadmfmdInParams" -> structure:
       parameter "path" of String, parameter "ws_name" of String,
       parameter "obj_name" of String
    :returns: instance of type "UploadOutput" -> structure: parameter
       "obj_ref" of String
    """
    # return variables are: output
    #BEGIN UploadFromMdscan
    print('Extracting motifs')
    motifList = self.parse_mdscan_output(params['path'])
    print(motifList)

    # The parsed result is stored as-is as the MotifSet object data
    MSO = motifList

    dfu = DataFileUtil(callback_url)
    save_objects_params = {
        'id': dfu.ws_name_to_id(params['ws_name']),
        'objects': [{
            'type': 'KBaseGeneRegulation.MotifSet',
            'data': MSO,
            'name': params['obj_name']
        }]
    }

    info = dfu.save_objects(save_objects_params)[0]
    print('SAVED OBJECT')
    print(info)

    # Reference is built from positions 6, 0 and 4 of the returned object info
    motif_set_ref = "%s/%s/%s" % (info[6], info[0], info[4])
    print(motif_set_ref)

    output = {'obj_ref': motif_set_ref}
    print(output)
    #END UploadFromMdscan

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method UploadFromMdscan return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def upload_pangenome(cb_url, scratch, Pangenome, workspace_name, pangenome_name):
    """
    Save a Pangenome object to a workspace through DataFileUtil.

    params:
        cb_url         : callback url
        scratch        : folder path to Pangenome object (currently unused;
                         kept so existing callers keep working)
        Pangenome      : KBaseGenomes.Pangenome like object
        workspace_name : workspace name, or a workspace id given either as an
                         int or as a digit-only string
        pangenome_name : Pangenome display name

    Returns:
        dict with
        pangenome_ref  : Pangenome workspace reference
        pangenome_info : info on pangenome object
    """
    dfu = DataFileUtil(cb_url)

    if isinstance(workspace_name, int):
        workspace_id = workspace_name
    elif workspace_name.isdigit():
        # A digit-only string is a workspace id; send it as a number rather
        # than forwarding the string unchanged.
        workspace_id = int(workspace_name)
    else:
        workspace_id = dfu.ws_name_to_id(workspace_name)

    save_params = {
        'id': workspace_id,
        'objects': [{
            'type': 'KBaseGenomes.Pangenome',
            'data': Pangenome,
            'name': pangenome_name,
            'meta': {},
            'hidden': 0
        }]
    }

    info = dfu.save_objects(save_params)[0]

    # Reference is built from positions 6, 0 and 4 of the returned object info
    ref = "{}/{}/{}".format(info[6], info[0], info[4])
    print("Pangenome saved to {}".format(ref))

    return {'pangenome_ref': ref, 'pangenome_info': info}
def test_metagenome_binned_input(self):
    """Run get_fastas on a KBaseMetagenomes.BinnedContigs object.

    The BinnedContigs JSON fixture and its assembly FASTA are copied into the
    work directory, the FASTA is saved as an assembly, the fixture is pointed
    at that assembly and saved, and the resulting reference is passed to
    ``get_fastas``. The result is fetched but not asserted on.
    """
    # Setup
    path = "data/binnedContigs.json"
    ws_path = '/kb/module/work/tmp'
    assembly_path = "data/CCESR16_SPAdes.assembly.fa"
    shutil.copy2(path, ws_path)
    shutil.copy2(assembly_path, ws_path)

    dfu = DataFileUtil(self.callback_url)
    wsName = self.getWsName()
    ws_id = dfu.ws_name_to_id(wsName)

    # FASTA to assembly object
    Fasta_assembly_dict = {
        "path": '/kb/module/work/tmp/CCESR16_SPAdes.assembly.fa',
        "assembly_name": "meta_assembly"
    }
    assembly_params = {
        "file": Fasta_assembly_dict,
        "workspace_name": wsName,
        "assembly_name": "test_assembly"
    }
    meta_assembly_ref = self.getImpl().save_assembly_from_fasta(
        self.ctx, assembly_params)[0]

    # Load the BinnedContigs fixture and point it at the assembly saved above.
    # The context manager closes the file handle once the JSON is loaded.
    with open(path) as meta_fh:
        meta_data = json.load(meta_fh)
    meta_data['assembly_ref'] = meta_assembly_ref
    meta_dict = [{
        'name': 'Meta_test',
        'type': 'KBaseMetagenomes.BinnedContigs',
        'data': meta_data
    }]

    # Create the BinnedContigs object in the workspace with save_objects
    binned_obj = dfu.save_objects({'id': ws_id, 'objects': meta_dict})
    binned_obj_info = binned_obj[0]
    binned_obj_ref = str(binned_obj_info[6]) + '/' + str(
        binned_obj_info[0]) + '/' + str(binned_obj_info[4])

    # Get FASTA
    ret = self.getImpl().get_fastas(self.callback_url, [binned_obj_ref])
class PDBUtil:
    """Import PDB files as KBaseStructure.ModelProteinStructure objects and
    export the PDB file behind such an object."""

    def _validate_import_pdb_file_params(self, params):
        """
        _validate_import_pdb_file_params:
            validates params passed to import_model_pdb_file and resolves the
            path of the PDB file to import

        The file source is taken from the first truthy entry among
        'input_file_path', 'input_shock_id' (downloaded into scratch) and
        'input_staging_file_path' (downloaded from staging).

        :returns: tuple (file_path, workspace_name, structure_name)
        :raises ValueError: if 'structure_name' or 'workspace_name' is missing,
            or if none of the three file sources is supplied
        """
        # check for required parameters
        for p in ['structure_name', 'workspace_name']:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

        if params.get('input_file_path'):
            file_path = params.get('input_file_path')
        elif params.get('input_shock_id'):
            file_path = self.dfu.shock_to_file(
                {'shock_id': params['input_shock_id'],
                 'file_path': self.scratch}).get('file_path')
        elif params.get('input_staging_file_path'):
            file_path = self.dfu.download_staging_file(
                {'staging_file_subdir_path': params.get('input_staging_file_path')}
            ).get('copy_file_path')
        else:
            error_msg = "Must supply either a input_shock_id or input_file_path "
            error_msg += "or input_staging_file_path"
            raise ValueError(error_msg)

        return file_path, params.get('workspace_name'), params.get('structure_name')

    def _file_to_data(self, file_path):
        """Do the PDB conversion

        Parses the file with the `parser` and `ppb` objects (defined outside
        this class) and returns a dict with the file base name, chain /
        residue / atom counts and a 'protein' entry holding the concatenated
        peptide sequence and its md5.
        """
        structure = parser.get_structure("test", file_path)
        model = structure[0]
        chain_no = 0
        res_no = 0
        atom_no = 0
        pp_list = []

        # Chains are counted across every model in the structure
        for model in structure:
            for chain in model:
                chain_no += 1

        # NOTE(review): `model` is whatever the loop above left bound (the
        # last model), so residues/atoms are counted for that model only -
        # confirm this is intended for multi-model files.
        for residue in model.get_residues():
            if PDB.is_aa(residue):
                res_no += 1
            for atom in residue.get_atoms():
                atom_no += 1

        # Concatenate the sequences of all peptides into one string
        for pp in ppb.build_peptides(structure):
            pp_list.append(str(pp.get_sequence()))
        seq = ''.join(pp_list)

        return {
            'name': os.path.basename(file_path),
            'num_chains': chain_no,
            'num_residues': res_no,
            'num_atoms': atom_no,
            'protein': {
                'id': os.path.basename(file_path),
                'sequence': seq,
                'md5': hashlib.md5(seq.encode()).hexdigest()
            },
        }

    def _get_pdb_shock_id(self, obj_ref):
        """Return the shock id for the PDB file"""
        obj_data = self.dfu.get_objects({"object_refs": [obj_ref]})['data'][0]['data']
        return self.hs.hids_to_handles([obj_data['pdb_handle']])[0]['id']

    def _upload_to_shock(self, file_path):
        """
        _upload_to_shock: upload target file to shock using DataFileUtil

        Returns the 'hid' of the handle created for the (gzip packed) upload.
        """
        logging.info('Start uploading file to shock: {}'.format(file_path))

        file_to_shock_params = {
            'file_path': file_path,
            'pack': 'gzip',
            'make_handle': True,
        }
        shock_id = self.dfu.file_to_shock(file_to_shock_params)['handle']['hid']

        return shock_id

    def _generate_html_report(self, header_str, table_str):
        """
        _generate_html_report: fill the viewer template and upload it to shock

        Writes 'search.html' into a fresh UUID-named directory under scratch,
        with '//HEADER_STR' and '//TABLE_STR' in the template replaced by the
        given strings, zips the directory to shock and returns a one-element
        list describing the uploaded report.
        """
        #TODO: make this work with the PDB viewer

        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        # Create the directory directly; this class defines no _mkdir_p helper
        os.makedirs(output_directory, exist_ok=True)
        result_file_path = os.path.join(output_directory, 'search.html')

        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(os.path.dirname(__file__), 'templates', 'viewer_template.html'),
                      'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace('//HEADER_STR', header_str)
                report_template = report_template.replace('//TABLE_STR', table_str)
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({'file_path': output_directory,
                                                  'pack': 'zip'})['shock_id']

        html_report.append({'shock_id': report_shock_id,
                            'name': os.path.basename(result_file_path),
                            'label': os.path.basename(result_file_path),
                            'description': 'HTML summary report for Search Matrix App'})

        return html_report

    def _generate_report(self, pdb_obj_ref, workspace_name):
        """
        _generate_report: generate summary report

        Creates an extended KBaseReport listing the imported object and
        returns {'report_name': ..., 'report_ref': ...}.
        """
        # included as an example. Replace with your own implementation
        # output_html_files = self._generate_html_report(header_str, table_str)

        report_params = {'message': 'You uploaded a PDB file!',
                         #'html_links': output_html_files,
                         #'direct_html_link_index': 0,
                         'objects_created': [{'ref': pdb_obj_ref,
                                              'description': 'Imported PDB'}],
                         'workspace_name': workspace_name,
                         'report_object_name': 'import_pdb_from_staging_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output

    def __init__(self, config):
        """Read connection settings from `config` and create service clients.

        Reads 'SDK_CALLBACK_URL', 'scratch', 'KB_AUTH_TOKEN' and
        'handle-service-url' from `config`.
        """
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.hs = AbstractHandle(config['handle-service-url'])

    def import_model_pdb_file(self, params):
        """Import a PDB file as a KBaseStructure.ModelProteinStructure object.

        :param params: see _validate_import_pdb_file_params; an optional
            'description' is stored as the object's 'user_data'
        :returns: dict with 'structure_obj_ref', 'report_name' and 'report_ref'
        """
        file_path, workspace_name, pdb_name = self._validate_import_pdb_file_params(params)

        # An int is taken to already be a workspace id; anything else is
        # resolved as a workspace name.
        if not isinstance(workspace_name, int):
            workspace_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            workspace_id = workspace_name

        data = self._file_to_data(file_path)
        data['pdb_handle'] = self._upload_to_shock(file_path)
        data['user_data'] = params.get('description', '')
        logging.info(data)

        info = self.dfu.save_objects({
            'id': workspace_id,
            'objects': [
                {'type': 'KBaseStructure.ModelProteinStructure',
                 'name': pdb_name,
                 'data': data}]
        })[0]
        obj_ref = f"{info[6]}/{info[0]}/{info[4]}"

        returnVal = {'structure_obj_ref': obj_ref}

        report_output = self._generate_report(obj_ref, workspace_name)

        returnVal.update(report_output)

        return returnVal

    def export_pdb(self, params):
        """Return {'shock_id': ...} for the PDB file behind params['input_ref'].

        :raises ValueError: if 'input_ref' is not in params
        """
        if "input_ref" not in params:
            raise ValueError("input_ref not in supplied params")

        return {'shock_id': self._get_pdb_shock_id(params['input_ref'])}

    def structure_to_pdb_file(self, params):
        """Download the PDB file behind params['input_ref'] to a directory.

        The file is fetched from shock into params['destination_dir'] with
        'unpack': 'uncompress'.

        :returns: {'file_path': path of the downloaded file}
        :raises ValueError: if 'input_ref' or 'destination_dir' is missing
        """
        if "input_ref" not in params:
            raise ValueError("input_ref not in supplied params")
        if "destination_dir" not in params:
            raise ValueError("destination_dir not in supplied params")

        shock_id = self._get_pdb_shock_id(params['input_ref'])
        file_path = self.dfu.shock_to_file({
            'shock_id': shock_id,
            'file_path': params['destination_dir'],
            'unpack': 'uncompress'
        })['file_path']

        return {'file_path': file_path}
class plant_fba:
    '''
    Module Name:
    plant_fba

    Module Description:
    A KBase module: plant_fba
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "1.1.1"
    GIT_URL = "git@github.com:kbaseapps/plant_fba.git"
    GIT_COMMIT_HASH = "6f0b5af5a458c5158b9f0007399653a256edcd14"

    #BEGIN_CLASS_HEADER
    def convert_search_role(self, role):
        """Normalize a role name into the key used for role lookups.

        Surrounding whitespace and all spaces are removed, the text is
        lower-cased, and any parenthesized EC number such as '(ec2.7.1.11)'
        or '(ec3.4.-.-)' is stripped. Every EC field may hold several digits
        or dashes.
        """
        searchrole = role

        #Remove spaces
        searchrole = searchrole.strip()
        searchrole = searchrole.replace(' ', '')

        #Make all lowercase
        searchrole = searchrole.lower()

        #Remove EC and parentheses
        searchrole = re.sub(r'\(ec[\d-]+\.[\d-]+\.[\d-]+\.[\d-]+\)', '',
                            searchrole)

        return searchrole

    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        # Callback URL and token come from the environment, scratch from config
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.token = os.environ['KB_AUTH_TOKEN']
        self.shared_folder = config['scratch']
        self.config = config
        self.dfu = DataFileUtil(self.callback_url)
        #END_CONSTRUCTOR
        pass

    def integrate_abundances_with_metabolism(self, ctx, input_params):
        """
        :param input_params: instance of type "IntegrateAbundancesParams"
           (@optional input_columns) -> structure: parameter "input_ws" of
           String, parameter "input_expression_matrix" of String, parameter
           "input_fbamodel" of String, parameter "input_columns" of String,
           parameter "output_reaction_matrix" of String
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output_report
        #BEGIN integrate_abundances_with_metabolism

        # All of the work is delegated to IntegrateAppImpl
        app = IntegrateAppImpl(self.config, ctx, input_params)
        output_report = app.integrate_abundances_with_metabolism()

        #END integrate_abundances_with_metabolism

        # At some point might do deeper type checking...
        if not isinstance(output_report, dict):
            raise ValueError(
                'Method integrate_abundances_with_metabolism return value ' +
                'output_report is not type dict as required.')
        # return the results
        return [output_report]

    def reconstruct_plant_metabolism(self, ctx, input_params):
        """
        :param input_params: instance of type "ReconstructMetabolismParams"
           -> structure: parameter "input_ws" of String, parameter
           "input_genome" of String, parameter "output_ws" of String,
           parameter "output_fbamodel" of String, parameter "template" of
           String, parameter "template_ws" of String
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output_report
        #BEGIN reconstruct_plant_metabolism

        #Compile biochemistry information
        # Column 3 of compartments.txt maps to column 0 (abbreviation lookup),
        # column 0 maps to column 2 (label lookup).
        abbrev_cpt_dict = dict()
        cpt_name_dict = dict()
        with open('/kb/module/data/compartments.txt') as fh:
            for line in fh.readlines():
                line = line.strip('\r\n')
                array = line.split('\t')
                abbrev_cpt_dict[array[3]] = array[0]
                cpt_name_dict[array[0]] = array[2]

        # Fetch and parse biochemistry data
        with open(
                os.path.join("/kb/module/ModelSEEDDatabase", "Biochemistry",
                             "reactions.json")) as msd_rxn_fh:
            MSD_reactions = json.load(msd_rxn_fh)
        MSD_reactions_dict = dict()
        for entry in MSD_reactions:
            MSD_reactions_dict[entry['id']] = entry

        with open(
                os.path.join("/kb/module/ModelSEEDDatabase", "Biochemistry",
                             "compounds.json")) as msd_rxn_fh:
            MSD_compounds = json.load(msd_rxn_fh)
        MSD_compounds_dict = dict()
        for entry in MSD_compounds:
            MSD_compounds_dict[entry['id']] = entry

        # Retrieve Template, and compile indexes of roles and complexes
        if ('template_ws' not in input_params
                or input_params['template_ws'] == ''):
            input_params['template_ws'] = 'NewKBaseModelTemplates'

        if ('template' not in input_params or input_params['template'] == ''):
            input_params['template'] = 'PlantModelTemplate'

        template_ref = input_params['template_ws'] + '/' + input_params[
            'template']
        template_obj = self.dfu.get_objects({'object_refs':
                                             [template_ref]})['data'][0]

        # searchroles_dict: normalized role name -> role id
        # roles_dict: role id -> template role
        searchroles_dict = dict()
        roles_dict = dict()
        for role in template_obj['data']['roles']:
            searchrole = self.convert_search_role(role['name'])
            searchroles_dict[searchrole] = role['id']
            roles_dict[role['id']] = role

        complex_dict = dict()
        for cpx in template_obj['data']['complexes']:
            complex_dict[cpx['id']] = cpx

        #Retrieve Genome annotation as dict
        # role_cpt_ftr_dict: role id -> compartment -> feature id -> 1
        role_cpt_ftr_dict = dict()
        genome_ref = input_params['input_ws'] + '/' + input_params[
            'input_genome']
        genome_obj = self.dfu.get_objects({'object_refs':
                                           [genome_ref]})['data'][0]
        for feature in genome_obj['data']['features']:
            if ('functions' in feature and len(feature['functions']) > 0):
                for function_comment in feature['functions']:

                    #Split for comments and retrieve compartments
                    function_cpt_list = [
                        part.strip() for part in function_comment.split("#")
                    ]

                    # First piece is the function itself, the rest are
                    # compartments
                    function = function_cpt_list.pop(0)
                    roles = re.split(r"\s*;\s+|\s+[\@\/]\s+", function)
                    for role in roles:

                        searchrole = self.convert_search_role(role)
                        if (searchrole not in searchroles_dict):
                            continue

                        role_id = searchroles_dict[searchrole]
                        if (role_id not in role_cpt_ftr_dict):
                            role_cpt_ftr_dict[role_id] = dict()

                        #Defaults to cytosol
                        if (len(function_cpt_list) == 0):
                            function_cpt_list.append('cytosol')

                        for cpt in function_cpt_list:
                            abbrev_cpt = cpt
                            if (cpt not in abbrev_cpt_dict):
                                print(
                                    "No compartmental abbreviation found for "
                                    + cpt)
                            else:
                                abbrev_cpt = abbrev_cpt_dict[cpt]

                            if (abbrev_cpt not in role_cpt_ftr_dict[role_id]):
                                role_cpt_ftr_dict[role_id][abbrev_cpt] = dict()

                            role_cpt_ftr_dict[role_id][abbrev_cpt][
                                feature['id']] = 1

        #Default dictionaries for objects needed for a model reaction
        default_mdlcpt_dict = {
            'id': 'u0',
            'label': 'unknown',
            'pH': 7,
            'potential': 0,
            'compartmentIndex': 0,
            'compartment_ref': '~//'
        }

        default_mdlcpd_dict = {
            'id': '',
            'charge': 0,
            'formula': '',
            'name': '',
            'compound_ref': '',
            'modelcompartment_ref': '~/modelcompartments/id/u0'
        }

        default_mdlrxn_dict = {
            'id': '',
            'direction': '',
            'protons': 0,
            'name': '',
            'reaction_ref': '',
            'probability': 0,
            'modelcompartment_ref': '',
            'modelReactionReagents': [],
            'modelReactionProteins': []
        }

        #Lookup dictionaries for compartments and compounds, to avoid duplicating them
        mdlcpts_dict = dict()
        mdlcpds_dict = dict()

        #Reaction complexes for the generated table
        rxncplxs_dict = dict()

        #Create New, but Empty Plant Reconstruction
        new_model_obj = {
            'id': input_params['output_fbamodel'],
            'type': "GenomeScale",
            'source': "KBase",
            'source_id': "PlantSEED_v2",
            'template_ref': template_ref,
            'genome_ref': genome_ref,
            'name': input_params['output_fbamodel'],
            'modelreactions': [],
            'modelcompounds': [],
            'modelcompartments': [],
            'biomasses': [],
            'gapgens': [],
            'gapfillings': []
        }

        for template_rxn in template_obj['data']['reactions']:
            if (template_rxn['type'] == 'gapfilling'):
                continue

            template_rxn_cpt = template_rxn['templatecompartment_ref'].split(
                '/')[-1]

            proteins_list = list()
            prots_str_list = list()
            #complex_ref and source are optional fields
            default_protein_dict = {
                'note': template_rxn['type'],
                'complex_ref': '',
                'modelReactionProteinSubunits': []
            }
            for cpx_ref in template_rxn['templatecomplex_refs']:
                cpx_id = cpx_ref.split('/')[-1]
                model_complex_ref = "~/template/complexes/id/" + cpx_id

                new_protein_dict = copy.deepcopy(default_protein_dict)
                new_protein_dict['complex_ref'] = model_complex_ref

                complex_present = False
                subunits_list = list()
                default_subunit_dict = {
                    'role': '',
                    'triggering': 0,
                    'optionalSubunit': 0,
                    'note': '',
                    'feature_refs': []
                }
                matched_role_dict = dict()

                for cpxrole in complex_dict[cpx_id]['complexroles']:
                    role_id = cpxrole['templaterole_ref'].split('/')[-1]

                    if (role_id in role_cpt_ftr_dict):
                        for role_cpt in role_cpt_ftr_dict[role_id]:
                            # A subunit is added only for a triggering role
                            # annotated in the reaction's own compartment
                            role_cpt_present = False
                            if (template_rxn_cpt == role_cpt
                                    and cpxrole['triggering'] == 1):
                                complex_present = True
                                role_cpt_present = True

                            if role_cpt_present:
                                new_subunit_dict = copy.deepcopy(
                                    default_subunit_dict)
                                new_subunit_dict['triggering'] = cpxrole[
                                    'triggering']
                                new_subunit_dict['optionalSubunit'] = cpxrole[
                                    'optional_role']
                                new_subunit_dict['role'] = roles_dict[role_id][
                                    'name']

                                if (len(roles_dict[role_id]['features']) > 0):
                                    new_subunit_dict[
                                        'note'] = 'Features characterized and annotated'
                                else:
                                    #This never happens as of Fall 2019
                                    print("Warning: " +
                                          roles_dict[role_id]['name'] +
                                          " is apparently uncharacterized!")
                                    new_subunit_dict[
                                        'note'] = 'Features uncharacterized but annotated'

                                for ftr in role_cpt_ftr_dict[role_id][
                                        role_cpt]:
                                    feature_ref = "~/genome/features/id/" + ftr
                                    new_subunit_dict['feature_refs'].append(
                                        feature_ref)

                                matched_role_dict[role_id] = 1
                                subunits_list.append(new_subunit_dict)

                    if (role_id not in role_cpt_ftr_dict
                            and template_rxn['type'] == 'universal'):
                        #This should still be added, with zero features to indicate the universality of the role in plant primary metabolism
                        new_subunit_dict = copy.deepcopy(default_subunit_dict)
                        new_subunit_dict['triggering'] = cpxrole['triggering']
                        new_subunit_dict['optionalSubunit'] = cpxrole[
                            'optional_role']
                        new_subunit_dict['role'] = roles_dict[role_id]['name']

                        #Un-necessary, but explicitly stated
                        new_subunit_dict['feature_refs'] = []

                        if (len(roles_dict[role_id]['features']) == 0):
                            new_subunit_dict[
                                'note'] = 'Features uncharacterized and unannotated'
                        else:
                            #As of Fall 2019, this includes two reactions
                            new_subunit_dict[
                                'note'] = "Features characterized but unannotated"
                            print("Missing annotation: ", cpx_id, role_id,
                                  roles_dict[role_id])

                        matched_role_dict[role_id] = 1
                        subunits_list.append(new_subunit_dict)

                if complex_present:
                    #Check to see if members of a detected protein complex are missing
                    #and add them if so, to round off the complex
                    #This will only happen to a complex that is conditional (see above)
                    for cpxrole in complex_dict[cpx_id]['complexroles']:
                        role_id = cpxrole['templaterole_ref'].split('/')[-1]

                        if (role_id not in matched_role_dict):
                            print("Gapfilling complex: ", cpx_id,
                                  roles_dict[role_id])
                            new_subunit_dict = copy.deepcopy(
                                default_subunit_dict)
                            new_subunit_dict['triggering'] = cpxrole[
                                'triggering']
                            new_subunit_dict['optionalSubunit'] = cpxrole[
                                'optional_role']
                            new_subunit_dict[
                                'note'] = "Complex-based-gapfilling"
                            subunits_list.append(new_subunit_dict)

                if (len(subunits_list) > 0):
                    new_protein_dict[
                        'modelReactionProteinSubunits'] = subunits_list

                    #Store features and subunits as complex string for table
                    # "(f1, f2)" per subunit, "[...]" per complex
                    subs_str_list = list()
                    for subunit in subunits_list:
                        ftrs_str_list = list()
                        for ftr_ref in subunit['feature_refs']:
                            ftr = ftr_ref.split('/')[-1]
                            ftrs_str_list.append(ftr)
                        ftr_str = "(" + ", ".join(ftrs_str_list) + ")"
                        subs_str_list.append(ftr_str)
                    sub_str = "[" + ", ".join(subs_str_list) + "]"
                    prots_str_list.append(sub_str)

                    proteins_list.append(new_protein_dict)

            prot_str = ", ".join(prots_str_list)

            #This is important, we need to use role-based annotation to determine whether
            #a reaction should even be added to the model
            if (template_rxn['type'] == 'conditional'
                    and len(proteins_list) == 0):
                continue

            #If the check passes, then, here, we instantiate the actual reaction that goes into the model
            new_mdlrxn_id = template_rxn['id'] + '0'
            new_mdlcpt_id = template_rxn_cpt + '0'
            base_rxn_id = template_rxn['id'].split('_')[0]

            #For table
            rxncplxs_dict[new_mdlrxn_id] = prot_str

            new_mdlrxn_dict = copy.deepcopy(default_mdlrxn_dict)
            new_mdlrxn_dict['id'] = new_mdlrxn_id

            # Fall back to the base reaction id when there is no abbreviation
            new_mdlrxn_dict['name'] = MSD_reactions_dict[base_rxn_id][
                'abbreviation']
            if (MSD_reactions_dict[base_rxn_id]['abbreviation'] == ""):
                new_mdlrxn_dict['name'] = base_rxn_id

            new_mdlrxn_dict['direction'] = template_rxn['direction']
            new_mdlrxn_dict[
                'reaction_ref'] = '~/template/reactions/id/' + template_rxn[
                    'id']
            new_mdlrxn_dict[
                'modelcompartment_ref'] = '~/modelcompartments/id/' + new_mdlcpt_id

            #Here we check and instantiate a new modelcompartment
            if (new_mdlcpt_id not in mdlcpts_dict):
                new_mdlcpt_dict = copy.deepcopy(default_mdlcpt_dict)
                new_mdlcpt_dict['id'] = new_mdlcpt_id
                new_mdlcpt_dict['label'] = cpt_name_dict[template_rxn_cpt]
                new_mdlcpt_dict[
                    'compartment_ref'] = '~/template/compartments/id/' + template_rxn_cpt
                mdlcpts_dict[new_mdlcpt_id] = new_mdlcpt_dict

            #Add Proteins as previously determined
            new_mdlrxn_dict['modelReactionProteins'] = proteins_list

            #Add Reagents
            for template_rgt in template_rxn['templateReactionReagents']:
                template_rgt_cpd_cpt_id = template_rgt[
                    'templatecompcompound_ref'].split('/')[-1]
                (template_rgt_cpd,
                 template_rgt_cpt) = template_rgt_cpd_cpt_id.split('_')

                #Check and add new model compartment
                new_mdlcpt_id = template_rgt_cpt + '0'
                if (new_mdlcpt_id not in mdlcpts_dict):
                    new_mdlcpt_dict = copy.deepcopy(default_mdlcpt_dict)
                    new_mdlcpt_dict['id'] = new_mdlcpt_id
                    new_mdlcpt_dict['label'] = cpt_name_dict[template_rgt_cpt]
                    new_mdlcpt_dict[
                        'compartment_ref'] = '~/template/compartments/id/' + template_rgt_cpt
                    mdlcpts_dict[new_mdlcpt_id] = new_mdlcpt_dict

                #Add new model compounds
                new_mdlcpd_id = template_rgt_cpd_cpt_id + '0'
                base_cpd_id = template_rgt_cpd_cpt_id.split('_')[0]

                if (new_mdlcpd_id not in mdlcpds_dict):
                    new_mdlcpd_dict = copy.deepcopy(default_mdlcpd_dict)
                    new_mdlcpd_dict['id'] = new_mdlcpd_id
                    new_mdlcpd_dict['name'] = MSD_compounds_dict[base_cpd_id][
                        'name']

                    new_mdlcpd_dict['charge'] = float(
                        MSD_compounds_dict[base_cpd_id]['charge'])
                    new_mdlcpd_dict['formula'] = MSD_compounds_dict[
                        base_cpd_id]['formula']
                    if (MSD_compounds_dict[base_cpd_id]['formula'] == ""
                            or MSD_compounds_dict[base_cpd_id]['formula'] is None):
                        # A missing formula is reported and stored as ""
                        print("Formula: ", base_cpd_id,
                              MSD_compounds_dict[base_cpd_id])
                        new_mdlcpd_dict['formula'] = ""

                    new_mdlcpd_dict[
                        'compound_ref'] = '~/template/compounds/id/' + template_rgt_cpd
                    new_mdlcpd_dict[
                        'modelcompartment_ref'] = '~/modelcompartments/id/' + new_mdlcpt_id
                    mdlcpds_dict[new_mdlcpd_id] = new_mdlcpd_dict

                new_rgt_dict = {
                    'coefficient': template_rgt['coefficient'],
                    'modelcompound_ref': '~/modelcompounds/id/' + new_mdlcpd_id
                }

                new_mdlrxn_dict['modelReactionReagents'].append(new_rgt_dict)

            new_model_obj['modelreactions'].append(new_mdlrxn_dict)

        #Having populated with list of reactions and biomass (to come), then add all compartments and compounds
        for cpt_id in mdlcpts_dict:
            new_model_obj['modelcompartments'].append(mdlcpts_dict[cpt_id])

        #Last, but key modelcompound is the biomass, need to add it explicitly
        biocpd_id = "cpd11416"
        mdlbiocpd_dict = copy.deepcopy(default_mdlcpd_dict)
        mdlbiocpd_dict['id'] = biocpd_id + '_c0'
        mdlbiocpd_dict['name'] = 'Biomass'
        mdlbiocpd_dict['compound_ref'] = "~/template/compounds/id/" + biocpd_id
        mdlbiocpd_dict['modelcompartment_ref'] = "~/modelcompartments/id/c0"
        mdlcpds_dict[mdlbiocpd_dict['id']] = mdlbiocpd_dict

        for cpd_id in mdlcpds_dict:
            new_model_obj['modelcompounds'].append(mdlcpds_dict[cpd_id])

        default_biomass_dict = {
            'id': 'bio1',
            'name': 'Plant leaf biomass',
            'other': 1,
            'dna': 0,
            'rna': 0,
            'protein': 0,
            'cellwall': 0,
            'lipid': 0,
            'cofactor': 0,
            'energy': 0,
            'biomasscompounds': []
        }

        default_biocpd_dict = {'modelcompound_ref': '', 'coefficient': 0}

        for template_biomass in template_obj['data']['biomasses']:
            new_template_biomass = copy.deepcopy(default_biomass_dict)
            new_template_biomass['id'] = template_biomass['id']
            new_template_biomass['name'] = template_biomass['name']

            for entry in [
                    'dna', 'rna', 'protein', 'cellwall', 'lipid', 'cofactor',
                    'energy', 'other'
            ]:
                new_template_biomass[entry] = template_biomass[entry]

            for template_cpd in template_biomass['templateBiomassComponents']:
                new_biocpd_dict = copy.deepcopy(default_biocpd_dict)
                mdlcpd_id = template_cpd['templatecompcompound_ref'].split(
                    '/')[-1] + '0'
                # Biomass components whose compound is not in the model are
                # reported and skipped
                if (mdlcpd_id not in mdlcpds_dict):
                    print("Missing: ", template_cpd)
                    continue
                new_biocpd_dict[
                    'modelcompound_ref'] = '~/modelcompounds/id/' + mdlcpd_id
                new_biocpd_dict['coefficient'] = template_cpd['coefficient']
                new_template_biomass['biomasscompounds'].append(
                    new_biocpd_dict)

            new_model_obj['biomasses'].append(new_template_biomass)

        print("Saving metabolic reconstruction")
        model_ws_object = {
            'type': 'KBaseFBA.FBAModel',
            'name': input_params['output_fbamodel'],
            'data': new_model_obj
        }

        # The model is saved to the input workspace unless output_ws is given
        if ('output_ws' not in input_params
                or input_params['output_ws'] == ''):
            input_params['output_ws'] = input_params['input_ws']

        ws_id = self.dfu.ws_name_to_id(input_params['output_ws'])
        saved_model_list = self.dfu.save_objects({
            'id': ws_id,
            'objects': [model_ws_object]
        })[0]

        #Compose report string
        html_string = "<html><head><title>Reconstruct Plant Metabolism Report</title></head><body>"
        html_string += "<h2>Reconstruct Plant Metabolism Report</h2>"
        html_string += "<p>The \"Reconstruct Plant Metabolism\" app has finished running, "
        html_string += "reconstructing the primary metabolism from the "
        html_string += "enzymatic annotations in " + input_params[
            'input_genome'] + "</p>"
        html_string += "<p>Below we present the table of compartmentalized reactions in the metabolic reconstruction, "
        html_string += "it is similar to what you can see in the FBAModel viewer widget that appears "
        html_string += "below the report, but it has some additional information. Each row in the table is unique "
        html_string += "to each combination of reaction and compartment.</p>"
        html_string += "<p><ul>"
        html_string += "<li><b>Subsystems and Classes:</b> The table contains the metabolic subsystems and "
        html_string += "the general class of metabolism they fall into.</li>"
        html_string += "<li><b>Metabolic functions and EC numbers:</b> The table contains the original enzymatic "
        html_string += "annotation ('Roles') and their EC numbers that were associated with each biochemical reaction.</li>"
        html_string += "<li><b>Complexes:</b> The table contains the genes that were annotated with the metabolic functions. "
        html_string += "These genes that are associated with each reaction can be seen in the FBAModel viewer widget, but here "
        html_string += " one can see how they may be organized into protein complexes. Each set of parentheses '()' "
        html_string += "represents a single protein subunit (which may be the entire enzyme, or part of a large enzymatic "
        html_string += "complex). Each set of square brackets '[]' represents an entire enzyme, regardless of how many "
        html_string += "subunits it consists of. Each reaction may be catalyzed by different enzymes, each in turn composed "
        html_string += "of different subunits. The complexes reflect how the enzymes were curated in <i>Arabidopsis thaliana</i> "
        html_string += " so if any complex is shown to be empty, this means that the enzymatic annotation was not propagated "
        html_string += "from the original Arabidopsis gene. The original Arabidopsis curation also included protein localization "
        html_string += "so if a reaction has empty complexes in some compartments as opposed to others, this is an indication "
        html_string += "that annotation was only propagated for some localized Arabidopsis enzymes, and not others."
        html_string += "</ul></p>"

        # Fetch PlantSEED Data
        with open(
                os.path.join("/kb/module/PlantSEED", "Data/PlantSEED_v3",
                             "PlantSEED_Roles.json")) as plsd_fh:
            PS_Roles = json.load(plsd_fh)

        plantseed = FetchPlantSEEDImpl()
        reactions_data = plantseed.fetch_reactions(PS_Roles)

        table = GenerateTableImpl()
        table_html_string = table.generate_table(reactions_data,
                                                 complexes=rxncplxs_dict)

        with open(
                os.path.join(
                    '/kb/module/data', 'app_report_templates',
                    'integrate_abundances_report_tables_template.html')
        ) as report_template_file:
            report_template_string = report_template_file.read()

        # Generate and insert html Title
        report_template_string = report_template_string.replace(
            '*TITLE*', input_params['output_fbamodel'])

        # Insert html table
        table_report_string = report_template_string.replace(
            '*TABLES*', html_string + table_html_string)

        #Make folder for report files
        uuid_string = str(uuid.uuid4())
        report_file_path = os.path.join(self.shared_folder, uuid_string)
        os.mkdir(report_file_path)

        #Write html files
        with open(os.path.join(report_file_path, "index.html"),
                  'w') as index_file:
            index_file.write(table_report_string)

        #Cache it in shock as an archive
        upload_info = self.dfu.file_to_shock({
            'file_path': report_file_path,
            'pack': 'zip'
        })

        #Prepare report parameters
        report_params = {
            'direct_html_link_index':
            0,  #Use to refer to index of 'html_links'
            'workspace_name': input_params['input_ws'],
            'report_object_name': 'plant_fba_' + uuid_string,
            'objects_created': [],
            'html_links': []
        }

        #Html Link object
        html_link = {
            'shock_id': upload_info['shock_id'],
            'name': 'index.html',
            'label': 'html files',
            'description': 'HTML files'
        }
        report_params['html_links'].append(html_link)

        #Objects created object
        saved_model_ref = "{}/{}/{}".format(saved_model_list[6],
                                            saved_model_list[0],
                                            saved_model_list[4])
        saved_model_desc = "FBAModel: " + input_params['output_fbamodel']
        report_params['objects_created'].append({
            'ref': saved_model_ref,
            'description': saved_model_desc
        })

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        report_client_output = kbase_report_client.create_extended_report(
            report_params)

        output_report = dict()
        output_report['report_name'] = report_client_output['name']
        output_report['report_ref'] = report_client_output['ref']

        #END reconstruct_plant_metabolism

        # At some point might do deeper type checking...
        if not isinstance(output_report, dict):
            raise ValueError(
                'Method reconstruct_plant_metabolism return value ' +
                'output_report is not type dict as required.')
        # return the results
        return [output_report]

    def status(self, ctx):
        """Return a one-element list with the module state and version info."""
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
class VariationUtil:
    '''
    Module Name:
    VariationUtil

    Module Description:
    A KBase module: VariationUtil
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.4"
    GIT_URL = ""
    GIT_COMMIT_HASH = "2a4c2dbc058b702811c967997e7100c834e755d4"

    #BEGIN_CLASS_HEADER
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        # TODO: Make sure we need to define config just once
        # TODO: Change the code tp match this style
        # NOTE: the caller's config dict is extended in place with the
        # callback URL, auth token and workspace URL, because the same dict
        # is later handed to StrainInfo in save_variation_from_vcf.
        self.config = config
        self.config['SDK_CALLBACK_URL'] = os.environ['SDK_CALLBACK_URL']
        self.config['KB_AUTH_TOKEN'] = os.environ['KB_AUTH_TOKEN']
        self.scratch = config['scratch']
        self.config['ws_url'] = config['workspace-url']
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.shared_folder = config['scratch']
        self.hr = htmlreportutils()
        self.ws_url = config['workspace-url']
        self.wsc = Workspace(self.ws_url)
        self.dfu = DataFileUtil(self.callback_url)
        self.shock_url = config['shock-url']
        self.sw_url = config['srv-wiz-url']
        #END_CONSTRUCTOR
        pass

    def save_variation_from_vcf(self, ctx, params):
        """
        Save a variation (and trait?) object to Kbase given a reference genome, object output name,
        Variant Call Format (VCF) file, and sample attribute file.
        :param params: instance of type "save_variation_input" (## funcdef
           save_variation_from_vcf ## required input params:
           genome_or_assembly_ref: KBaseGenomes.Genome or
           KBaseGenomeAnnotations.Assembly object reference *** variation
           input data *** vcf_staging_file_path: path to location data
           associated with samples variation_object_name: output name for
           KBase variation object *** sample input data ***
           sample_attribute_ref: x/y/z reference to kbase sample attribute
           optional params: NA output report: report_name report_ref HTML
           visualization: Manhattan plot *** Visualization *** plot_maf:
           generate histogram of minor allele frequencies plot_hwe: generate
           histogram of Hardy-Weinberg Equilibrium p-values) -> structure:
           parameter "workspace_name" of String, parameter
           "genome_or_assembly_ref" of type "obj_ref" (An X/Y/Z style
           reference), parameter "vcf_staging_file_path" of type "filepath"
           (KBase file path to staging files), parameter
           "variation_object_name" of String, parameter
           "sample_attribute_ref" of type "obj_ref" (An X/Y/Z style reference)
        :returns: instance of type "save_variation_output" -> structure:
           parameter "variation_ref" of String, parameter "report_name" of
           String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: report
        #BEGIN save_variation_from_vcf

        # The implementation reads these keys from params: workspace_name,
        # genome_or_assembly_ref, vcf_staging_file_path, sample_set_ref,
        # sample_attribute_name and variation_object_name.
        #
        # Validate sample_set_ref up front: it is indexed directly when the
        # strain info parameters are built, so a check placed after that point
        # could never fire (a KeyError would be raised first), and failing
        # here avoids doing the remote lookups and VCF processing for nothing.
        if 'sample_set_ref' not in params:
            raise ValueError('sample_set_ref not found in params')

        # Get workspace id (resolved once and reused when saving the object)
        workspace = params['workspace_name']
        ws_id = self.dfu.ws_name_to_id(workspace)

        genome_ref = None
        assembly_ref = None

        # 1) Find whether the input is a genome or assembly
        #    and get genome_ref and assembly_ref
        genome_or_assembly_ref = params['genome_or_assembly_ref']
        obj_type = self.wsc.get_object_info3(
            {'objects': [{'ref': genome_or_assembly_ref}]})['infos'][0][2]

        if 'KBaseGenomes.Genome' in obj_type:
            genome_ref = genome_or_assembly_ref
            subset = self.wsc.get_object_subset([{
                'included': ['/assembly_ref'],
                'ref': genome_ref
            }])
            assembly_ref = subset[0]['data']['assembly_ref']
        elif 'KBaseGenomeAnnotations.Assembly' in obj_type:
            assembly_ref = genome_or_assembly_ref
        else:
            raise ValueError(obj_type + ' is not the right input for this method. ' +
                             'Valid input include KBaseGenomes.Genome or ' +
                             'KBaseGenomeAnnotations.Assembly ')

        # 2) Validate VCF, compress, and build VCF index
        logging.info("Validating VCF, Compressing VCF and Indexing VCF")
        vcf_utils = VCFUtils({"scratch": self.scratch})
        vcf_compressed, vcf_index, vcf_strain_ids = \
            vcf_utils.validate_compress_and_index_vcf({
                'vcf_staging_file_path': params['vcf_staging_file_path']
            })

        if vcf_index is None:
            raise ValueError(
                "No result obtained after compression and indexing step")
        logging.info("vcf compressed :" + str(vcf_compressed))
        logging.info("vcf index :" + str(vcf_index))
        logging.info("vcf strain ids :" + str(vcf_strain_ids))

        # Get strain info
        # TODO: Remove hard coded stuff
        strain_info_params = {
            "ws_id": ws_id,
            "vcf_strain_ids": vcf_strain_ids,
            "sample_set_ref": params["sample_set_ref"],
            "sample_attribute_name": params["sample_attribute_name"]
        }
        si = StrainInfo(self.config)
        sample_attribute_ref, strains = si.sample_strain_info(strain_info_params)
        logging.info("sample attribute ref: %s", sample_attribute_ref)
        logging.info("strains: %s", strains)

        # 3) Create json for variation object. In a following step genomic_indexes will be
        # added to this json before it is saved as Variation object
        vcf_to_variation_params = {
            "vcf_compressed": vcf_compressed,
            "vcf_index": vcf_index,
            "assembly_ref": assembly_ref
        }
        if genome_ref is not None:
            vcf_to_variation_params['genome_ref'] = genome_ref

        vtv = VCFToVariation({"ws_url": self.ws_url, "scratch": self.scratch})
        variation_object_data = vtv.generate_variation_object_data(
            vcf_to_variation_params)

        # Append sample information
        if not sample_attribute_ref:
            raise ValueError('sample attribute ref not found')
        variation_object_data['sample_attribute_ref'] = sample_attribute_ref

        if not strains:
            raise ValueError('strains not found')
        variation_object_data['strains'] = strains

        # Presence of sample_set_ref was verified at the top of the method.
        variation_object_data['sample_set_ref'] = params['sample_set_ref']

        # 4) Prepare the JBrowse report, which also yields the genomic indexes
        jbrowse_config = {
            "ws_url": self.ws_url,
            "scratch": self.scratch,
            "sw_url": self.sw_url,
            "shock_url": self.shock_url
        }
        jbrowse_params = {
            "vcf_path": vcf_compressed,
            "assembly_ref": assembly_ref,
            "binsize": 10000,
            "vcf_shock_id": variation_object_data['vcf_handle']['id'],
            "vcf_index_shock_id": variation_object_data['vcf_index_handle']['id']
        }
        if genome_ref is not None:
            jbrowse_params["genome_ref"] = genome_ref

        jb = JbrowseUtil(jbrowse_config)
        jbrowse_report = jb.prepare_jbrowse_report(jbrowse_params)

        # 5) Now we have the genomic indices and we have all the information needed to save
        # the variation object
        # TODO: Take out the genomic_indexes field from the object spec
        # TODO: Take out the vcf_handle stuff not needed
        variation_object_data['genomic_indexes'] = jbrowse_report['genomic_indexes']

        var_obj = self.dfu.save_objects({
            'id': ws_id,
            'objects': [{
                'type': 'KBaseGwasData.Variations',
                'data': variation_object_data,
                'name': params['variation_object_name']
            }]
        })[0]

        # Workspace reference in "workspace_id/object_id/version" form
        var_obj_ref = str(var_obj[6]) + "/" + str(var_obj[0]) + "/" + str(var_obj[4])
        logging.info("variation object ref: %s", var_obj_ref)

        # 6) Build Variation report
        # This is a simple report
        created_objects = [{
            "ref": var_obj_ref,
            "description": "Variation Object"
        }]

        report_config = {
            "ws_url": self.ws_url,
            "scratch": self.scratch,
        }
        vr = VariationReport(report_config)
        htmlreport_dir = vr.create_variation_report({"variation_ref": var_obj_ref})

        report = self.hr.create_html_report(htmlreport_dir, workspace, created_objects)
        report['variation_ref'] = var_obj_ref
        logging.info("report: %s", report)
        #END save_variation_from_vcf

        # At some point might do deeper type checking...
        if not isinstance(report, dict):
            raise ValueError('Method save_variation_from_vcf return value ' +
                             'report is not type dict as required.')
        # return the results
        return [report]

    def export_variation_as_vcf(self, ctx, params):
        """
        Export KBase variation object as Variant Call Format (VCF) file
        :param params: instance of type "export_variation_input" (## funcdef
           export_variation_as_vcf ## required input params: Variation object
           reference optional params: NA output report: Shock id pointing to
           exported vcf file) -> structure: parameter "input_var_ref" of type
           "obj_ref" (An X/Y/Z style reference)
        :returns: instance of type "export_variation_output" -> structure:
           parameter "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_variation_as_vcf
        vtv = VariationToVCF(self.callback_url, self.shared_folder)
        output = vtv.export_as_vcf(params)
        #END export_variation_as_vcf

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method export_variation_as_vcf return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def get_variation_as_vcf(self, ctx, params):
        """
        Given a reference to a variation object, and output name: return a Variant Call Format (VCF)
        file path and name.
        :param params: instance of type "get_variation_input" (## funcdef
           get_variation_as_vcf ## required input params: Variation object
           reference output file name optional params: NA output report: path
           to returned vcf name of variation object) -> structure: parameter
           "variation_ref" of type "obj_ref" (An X/Y/Z style reference),
           parameter "filename" of String
        :returns: instance of type "get_variation_output" -> structure:
           parameter "path" of type "filepath" (KBase file path to staging
           files), parameter "variation_name" of String
        """
        # ctx is the context object
        # return variables are: file
        #BEGIN get_variation_as_vcf
        vtv = VariationToVCF(self.callback_url, self.shared_folder)
        file = vtv.variation_to_vcf(params)
        #END get_variation_as_vcf

        # At some point might do deeper type checking...
        if not isinstance(file, dict):
            raise ValueError('Method get_variation_as_vcf return value ' +
                             'file is not type dict as required.')
        # return the results
        return [file]

    def status(self, ctx):
        """Return a one-element list holding the module state and version info."""
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
class MatrixUtil:
    """Helpers and entry points to import, export, filter, search and
    standardize KBase matrix objects."""

    def _validate_import_matrix_from_excel_params(self, params):
        """
        _validate_import_matrix_from_excel_params:
            validates params passed to import_matrix_from_excel method

        Returns a tuple (obj_type, file_path, workspace_name, matrix_name,
        refs, scale), where refs holds every param whose key contains "_ref".
        Raises ValueError on a missing required param, an unknown object type
        or scale, or when no input file source is supplied.
        """
        logging.info('start validating import_matrix_from_excel params')

        # check for required parameters
        for p in ['obj_type', 'matrix_name', 'workspace_name', 'scale']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

        obj_type = params.get('obj_type')
        if obj_type not in self.matrix_types:
            raise ValueError('Unknown matrix object type: {}'.format(obj_type))

        scale = params.get('scale')
        if scale not in SCALE_TYPES:
            raise ValueError('Unknown scale type: {}'.format(scale))

        # The file can come from a local path, a shock node or the staging area
        if params.get('input_file_path'):
            file_path = params.get('input_file_path')
        elif params.get('input_shock_id'):
            file_path = self.dfu.shock_to_file({
                'shock_id': params['input_shock_id'],
                'file_path': self.scratch
            }).get('file_path')
        elif params.get('input_staging_file_path'):
            file_path = self.dfu.download_staging_file({
                'staging_file_subdir_path': params.get('input_staging_file_path')
            }).get('copy_file_path')
        else:
            error_msg = "Must supply either a input_shock_id or input_file_path "
            error_msg += "or input_staging_file_path"
            raise ValueError(error_msg)

        refs = {k: v for k, v in params.items() if "_ref" in k}

        return (obj_type, file_path, params.get('workspace_name'),
                params.get('matrix_name'), refs, scale)

    def _upload_to_shock(self, file_path):
        """
        _upload_to_shock: upload target file to shock using DataFileUtil
        """
        logging.info('Start uploading file to shock: {}'.format(file_path))

        file_to_shock_params = {'file_path': file_path, 'pack': 'zip'}
        shock_id = self.dfu.file_to_shock(file_to_shock_params).get('shock_id')

        return shock_id

    @staticmethod
    def _mkdir_p(path):
        """
        _mkdir_p: make directory for given path

        Does nothing for an empty path or an already existing directory; an
        OSError is still raised when the path exists but is not a directory.
        """
        if not path:
            return
        os.makedirs(path, exist_ok=True)

    @staticmethod
    def _find_between(s, start, end):
        """
        _find_between: find string in between start and end

        start and end are interpolated into a regular expression, so callers
        must pass regex patterns (escaped where necessary). Raises ValueError
        when s does not contain the delimiters.
        """
        match = re.search('{}(.*){}'.format(start, end), s)
        if match is None:
            raise ValueError(
                'Cannot find text between "{}" and "{}" in "{}"'.format(
                    start, end, s))
        return match.group(1)

    @staticmethod
    def _write_mapping_sheet(file_path, sheet_name, mapping, index):
        """
        _write_mapping_sheet: write mapping to sheet

        index is a pair of column headers: the first for the mapping keys,
        the second for the mapping values.
        """
        df_dict = collections.OrderedDict()

        df_dict[index[0]] = []
        df_dict[index[1]] = []

        for key, value in mapping.items():
            df_dict.get(index[0]).append(key)
            df_dict.get(index[1]).append(value)

        df = pd.DataFrame.from_dict(df_dict)

        with pd.ExcelWriter(file_path, engine='openpyxl') as writer:
            # load the existing workbook so the new sheet is added to it
            writer.book = load_workbook(file_path)
            df.to_excel(writer, sheet_name=sheet_name)

    def _generate_report(self, matrix_obj_ref, workspace_name):
        """
        _generate_report: generate summary report
        """
        report_params = {
            'message': '',
            'objects_created': [{
                'ref': matrix_obj_ref,
                'description': 'Imported Matrix'
            }],
            'workspace_name': workspace_name,
            'report_object_name': 'import_matrix_from_excel_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    @staticmethod
    def _process_mapping_sheet(file_path, sheet_name):
        """
        _process_mapping: process mapping sheet

        Returns a dict built from the first two columns of the sheet, or an
        empty dict when reading the sheet raises XLRDError.
        """
        try:
            df = pd.read_excel(file_path, sheet_name=sheet_name, dtype='str')
        except XLRDError:
            return dict()
        else:
            mapping = {value[0]: value[1] for value in df.values.tolist()}

        return mapping

    def _process_attribute_mapping_sheet(self, file_path, sheet_name,
                                         matrix_name, workspace_id):
        """
        _process_attribute_mapping_sheet: process attribute_mapping sheet

        Returns the reference of the created attribute mapping object, or ''
        when reading the sheet raises XLRDError.
        """
        try:
            df = pd.read_excel(file_path, sheet_name=sheet_name)
        except XLRDError:
            return ''
        else:
            obj_name = f'{matrix_name}_{sheet_name}'
            result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
            self._mkdir_p(result_directory)
            file_path = os.path.join(result_directory,
                                     '{}.xlsx'.format(obj_name))
            df.to_excel(file_path)
            import_attribute_mapping_params = {
                'output_obj_name': obj_name,
                'output_ws_id': workspace_id,
                'input_file_path': file_path
            }

            ref = self.attr_util.file_to_attribute_mapping(
                import_attribute_mapping_params)

            return ref.get('attribute_mapping_ref')

    @staticmethod
    def _file_to_df(file_path):
        """
        Parse the file into a data frame with string row and column labels.

        Tries, in order: the excel sheet named "data", the first excel sheet,
        then a delimited text file with a sniffed separator. Raises ValueError
        when none of them can be read.
        """
        logging.info('start parsing file content to data frame')

        try:
            df = pd.read_excel(file_path, sheet_name='data', index_col=0)

        except XLRDError:
            try:
                df = pd.read_excel(file_path, index_col=0)
                logging.warning(
                    'WARNING: A sheet named "data" was not found in the attached file,'
                    ' proceeding with the first sheet as the data sheet.')

            except XLRDError:

                try:
                    reader = pd.read_csv(file_path, sep=None, iterator=True)
                    inferred_sep = reader._engine.data.dialect.delimiter
                    df = pd.read_csv(file_path, sep=inferred_sep, index_col=0)
                except Exception:
                    raise ValueError(
                        'Cannot parse file. Please provide valide tsv, excel or csv file'
                    )

        df.index = df.index.astype('str')
        df.columns = df.columns.astype('str')
        # fill NA with "None" so that they are properly represented as nulls in the KBase Object
        df = df.where((pd.notnull(df)), None)

        return df

    def _file_to_data(self, file_path, refs, matrix_name, workspace_id):
        """
        Build the matrix object data from the file content and supplied refs.

        NOTE: the refs dict is used as the base of the returned data and is
        therefore extended in place.
        """
        logging.info('Start reading and converting excel file data')
        data = refs

        df = self._file_to_df(file_path)

        matrix_data = {
            'row_ids': df.index.tolist(),
            'col_ids': df.columns.tolist(),
            'values': df.values.tolist()
        }

        data.update({'data': matrix_data})
        data.update(
            self._get_axis_attributes('col', matrix_data, refs, file_path,
                                      matrix_name, workspace_id))
        data.update(
            self._get_axis_attributes('row', matrix_data, refs, file_path,
                                      matrix_name, workspace_id))

        # processing metadata
        metadata = self._process_mapping_sheet(file_path, 'metadata')
        data['attributes'] = {}
        data['search_attributes'] = []
        for k, v in metadata.items():
            k = k.strip()
            v = v.strip()
            if k in TYPE_ATTRIBUTES:
                data[k] = v
            else:
                data['attributes'][k] = v
                data['search_attributes'].append(" | ".join((k, v)))

        return data

    def _get_axis_attributes(self, axis, matrix_data, refs, file_path,
                             matrix_name, workspace_id):
        """Get the row/col_attributemapping and mapping of ids, validating as needed"""
        # Parameter specified mappings should take precedence over tabs in excel so only process
        # if attributemapping_ref is missing:
        attr_data = {}

        if refs.get(f'{axis}_attributemapping_ref'):
            attributemapping_ref = refs[f'{axis}_attributemapping_ref']
        else:
            attributemapping_ref = self._process_attribute_mapping_sheet(
                file_path, f'{axis}_attribute_mapping', matrix_name,
                workspace_id)

        if attributemapping_ref:
            attr_data[f'{axis}_attributemapping_ref'] = attributemapping_ref

        # col/row_mappings may not be supplied
        id_mapping = self._process_mapping_sheet(file_path, f'{axis}_mapping')
        if id_mapping:
            attr_data[f'{axis}_mapping'] = id_mapping
        # if no mapping, axis ids must match the attribute mapping
        elif attributemapping_ref:
            am_data = self.dfu.get_objects(
                {'object_refs': [attributemapping_ref]})['data'][0]['data']
            axis_ids = matrix_data[f'{axis}_ids']
            unmatched_ids = set(axis_ids) - set(am_data['instances'].keys())
            if unmatched_ids:
                name = "Column" if axis == 'col' else "Row"
                # sorted so the message is deterministic for the same input
                raise ValueError(
                    f"The following {name} IDs from the uploaded matrix do not match "
                    f"the supplied {name} attribute mapping: "
                    f"{', '.join(sorted(unmatched_ids))}"
                    f"\nPlease verify the input data or upload an excel file with a "
                    f"{name} mapping tab.")
            else:
                # just gen the IDs in this matrix
                attr_data[f'{axis}_mapping'] = {x: x for x in axis_ids}

        return attr_data

    @staticmethod
    def _build_header_str(attribute_names):  #not going to be used
        """
        Build the HTML header row: a "Feature ID" cell followed by one cell
        per attribute name. With no attribute names the single "Feature ID"
        cell takes the full width instead of dividing by zero.
        """
        width = 100.0 / len(attribute_names) if attribute_names else 100.0

        parts = ['<tr class="header">',
                 '<th style="width:{0:.2f}%;">Feature ID</th>'.format(width)]

        for attribute_name in attribute_names:
            parts.append('<th style="width:{0:.2f}%;"'.format(width))
            parts.append('>{}</th>'.format(attribute_name))

        parts.append('</tr>')

        return ''.join(parts)

    def _build_html_str(self, row_mapping, attributemapping_data,
                        row_ids):  #not going to be used
        """
        Build the header row and the table rows used to fill the search
        report template. Only features listed in row_ids get a row.
        """
        logging.info('Start building html replacement')

        attribute_names = [
            attributes.get('attribute')
            for attributes in attributemapping_data.get('attributes')
        ]

        header_str = self._build_header_str(attribute_names)

        table_parts = []

        instances = attributemapping_data.get('instances')

        for feature_id, attribute_id in row_mapping.items():
            if feature_id in row_ids:
                feature_instances = instances.get(attribute_id)

                table_parts.append('<tr>')
                table_parts.append('<td>{}</td>'.format(feature_id))

                for feature_instance in feature_instances:
                    table_parts.append('<td>{}</td>'.format(feature_instance))
                table_parts.append('</tr>')

        return header_str, ''.join(table_parts)

    def _generate_search_html_report(self, header_str, table_str):
        """
        Generate the search html report: fill the template, zip the output
        directory to shock and return the html link description list.
        """
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'search.html')

        template_dir = os.path.join(os.path.dirname(__file__), 'templates')
        shutil.copy2(os.path.join(template_dir, 'kbase_icon.png'),
                     output_directory)
        shutil.copy2(os.path.join(template_dir, 'search_icon.png'),
                     output_directory)

        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(template_dir, 'search_template.html'),
                      'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '//HEADER_STR', header_str)
                report_template = report_template.replace(
                    '//TABLE_STR', table_str)
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({
            'file_path': output_directory,
            'pack': 'zip'
        })['shock_id']

        html_report.append({
            'shock_id': report_shock_id,
            'name': os.path.basename(result_file_path),
            'label': os.path.basename(result_file_path),
            'description': 'HTML summary report for Search Matrix App'
        })

        return html_report

    def _generate_search_report(self, header_str, table_str, workspace_name):
        """Create the extended KBase report wrapping the search html page."""
        logging.info('Start creating report')

        output_html_files = self._generate_search_html_report(
            header_str, table_str)

        report_params = {
            'message': '',
            'workspace_name': workspace_name,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 366,
            'report_object_name': 'kb_matrix_filter_report_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    @staticmethod
    def _filter_value_data(value_data, remove_ids, dimension):
        """Filters a value matrix based on column or row ids"""

        def _norm_id(_id):
            return _id.replace(" ", "_")

        val_df = pd.DataFrame(value_data['values'],
                              index=value_data['row_ids'],
                              columns=value_data['col_ids'],
                              dtype='object')

        # ids are dropped both as given and with spaces replaced by "_";
        # ids that are not present are ignored
        if dimension == 'row':
            filtered_df = val_df.drop(remove_ids, axis=0, errors='ignore')
            filtered_df = filtered_df.drop([_norm_id(x) for x in remove_ids],
                                           axis=0, errors='ignore')
        elif dimension == 'col':
            filtered_df = val_df.drop(remove_ids, axis=1, errors='ignore')
            filtered_df = filtered_df.drop([_norm_id(x) for x in remove_ids],
                                           axis=1, errors='ignore')
        else:
            raise ValueError('Unexpected dimension: {}'.format(dimension))

        filtered_value_data = {
            "values": filtered_df.values.tolist(),
            "col_ids": list(filtered_df.columns),
            "row_ids": list(filtered_df.index),
        }

        return filtered_value_data

    def _standardize_df(self, df, with_mean=True, with_std=True):
        """
        Return a new data frame with the values of df run through a
        StandardScaler; missing values are treated as 0. The input data
        frame is left untouched.
        """
        logging.info("Standardizing matrix data")

        # fillna without inplace so the caller's data frame is not modified
        x_train = df.fillna(0).values

        scaler = preprocessing.StandardScaler(with_mean=with_mean,
                                              with_std=with_std).fit(x_train)

        standardized_values = scaler.transform(x_train)

        standardize_df = pd.DataFrame(index=df.index,
                                      columns=df.columns,
                                      data=standardized_values)

        return standardize_df

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.data_util = DataUtil(config)
        self.attr_util = AttributesUtil(config)
        # type strings are reduced from "Module.Type-version" to "Type"
        self.matrix_types = [
            x.split(".")[1].split('-')[0]
            for x in self.data_util.list_generic_types()
        ]

    def standardize_matrix(self, params):
        """
        standardize a matrix

        arguments:
        input_matrix_ref: object reference of a matrix
        workspace_name: workspace name (or id) the new matrix is saved to

        optional arguments:
        new_matrix_name: name of the new matrix object; defaults to the input
                         matrix name suffixed with the current local time
        with_mean, with_std: forwarded to the scaler (default 1)
        """
        input_matrix_ref = params.get('input_matrix_ref')
        workspace_name = params.get('workspace_name')
        new_matrix_name = params.get('new_matrix_name')
        with_mean = params.get('with_mean', 1)
        with_std = params.get('with_std', 1)

        if not isinstance(workspace_name, int):
            workspace_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            workspace_id = workspace_name

        input_matrix_obj = self.dfu.get_objects(
            {'object_refs': [input_matrix_ref]})['data'][0]
        input_matrix_info = input_matrix_obj['info']
        input_matrix_name = input_matrix_info[1]
        input_matrix_data = input_matrix_obj['data']

        if not new_matrix_name:
            current_time = time.localtime()
            new_matrix_name = input_matrix_name + time.strftime(
                '_%H_%M_%S_%Y_%m_%d', current_time)

        data_matrix = self.data_util.fetch_data({
            'obj_ref': input_matrix_ref
        }).get('data_matrix')
        df = pd.read_json(data_matrix)

        standardize_df = self._standardize_df(df, with_mean, with_std)

        new_matrix_data = {
            'row_ids': df.index.tolist(),
            'col_ids': df.columns.tolist(),
            'values': standardize_df.values.tolist()
        }

        input_matrix_data['data'] = new_matrix_data

        logging.info("Saving new standardized matrix object")
        info = self.dfu.save_objects({
            "id": workspace_id,
            "objects": [{
                "type": input_matrix_info[2],
                "data": input_matrix_data,
                "name": new_matrix_name
            }]
        })[0]

        new_matrix_obj_ref = "%s/%s/%s" % (info[6], info[0], info[4])

        objects_created = [{
            'ref': new_matrix_obj_ref,
            'description': 'Standardized Matrix'
        }]

        report_params = {
            'message': '',
            'objects_created': objects_created,
            'workspace_name': workspace_name,
            'report_object_name': 'import_matrix_from_biom_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        return {
            'new_matrix_obj_ref': new_matrix_obj_ref,
            'report_name': output['name'],
            'report_ref': output['ref']
        }

    def filter_matrix(self, params):  #not going to be used
        """
        filter_matrix: create sub-matrix based on input feature_ids

        arguments:
        matrix_obj_ref: object reference of a matrix
        workspace_name: workspace name
        remove_ids: comma separated string of ids to remove
        dimension: 'row' or 'col', the axis the ids are removed from
        filtered_matrix_name: name of newly created filtered matrix object
        """
        matrix_obj_ref = params.get('matrix_obj_ref')
        workspace_name = params.get('workspace_name')
        remove_ids = params.get('remove_ids')
        dimension = params.get('dimension')
        filtered_matrix_name = params.get('filtered_matrix_name')

        matrix_source = self.dfu.get_objects({"object_refs": [matrix_obj_ref]})['data'][0]
        matrix_info = matrix_source.get('info')
        matrix_data = matrix_source.get('data')

        # raw strings: the delimiters are regex patterns for a literal "."
        # and "-" around the type name in "Module.Type-version"
        matrix_type = self._find_between(matrix_info[2], r'\.', r'\-')

        value_data = matrix_data.get('data')
        remove_ids = [x.strip() for x in remove_ids.split(',')]
        filtered_value_data = self._filter_value_data(value_data, remove_ids,
                                                      dimension)

        # if the matrix has changed shape, update the mappings
        if len(filtered_value_data['row_ids']) < len(
                matrix_data['data']['row_ids']):
            if matrix_data.get('row_mapping'):
                matrix_data['row_mapping'] = {
                    k: matrix_data['row_mapping'][k]
                    for k in filtered_value_data['row_ids']
                }
            if matrix_data.get('feature_mapping'):
                matrix_data['feature_mapping'] = {
                    k: matrix_data['feature_mapping'][k]
                    for k in filtered_value_data['row_ids']
                }

        if len(filtered_value_data['col_ids']) < len(
                matrix_data['data']['col_ids']):
            if matrix_data.get('col_mapping'):
                matrix_data['col_mapping'] = {
                    k: matrix_data['col_mapping'][k]
                    for k in filtered_value_data['col_ids']
                }

        matrix_data['data'] = filtered_value_data

        if not isinstance(workspace_name, int):
            workspace_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            workspace_id = workspace_name

        filtered_matrix_obj_ref = self.data_util.save_object({
            'obj_type': 'KBaseMatrices.{}'.format(matrix_type),
            'obj_name': filtered_matrix_name,
            'data': matrix_data,
            'workspace_name': workspace_id
        })['obj_ref']

        returnVal = {'matrix_obj_refs': [filtered_matrix_obj_ref]}

        report_output = self._generate_report(filtered_matrix_obj_ref,
                                              workspace_name)

        returnVal.update(report_output)

        return returnVal

    def search_matrix(self, params):  #not going to be used
        """
        search_matrix: generate a HTML report that allows users to select feature ids

        arguments:
        matrix_obj_ref: object reference of a matrix
        workspace_name: workspace name
        """
        matrix_obj_ref = params.get('matrix_obj_ref')
        workspace_name = params.get('workspace_name')

        matrix_source = self.dfu.get_objects({"object_refs": [matrix_obj_ref]})['data'][0]
        matrix_data = matrix_source.get('data')

        row_mapping = matrix_data.get('row_mapping')
        row_attributemapping_ref = matrix_data.get('row_attributemapping_ref')

        row_ids = matrix_data['data']['row_ids']

        if not (row_mapping and row_attributemapping_ref):
            raise ValueError(
                'Matrix obejct is missing either row_mapping or row_attributemapping_ref'
            )

        attributemapping_data = self.dfu.get_objects(
            {"object_refs": [row_attributemapping_ref]})['data'][0]['data']

        header_str, table_str = self._build_html_str(row_mapping,
                                                     attributemapping_data,
                                                     row_ids)

        returnVal = self._generate_search_report(header_str, table_str,
                                                 workspace_name)

        return returnVal

    def import_matrix_from_excel(self, params):
        """
        import_matrix_from_excel: import matrix object from excel

        arguments:
        obj_type: one of ExpressionMatrix, FitnessMatrix, DifferentialExpressionMatrix
        matrix_name: matrix object name
        workspace_name: workspace name matrix object to be saved to
        scale: scale type of the matrix values
        input_shock_id: file shock id
        or
        input_file_path: absolute file path
        or
        input_staging_file_path: staging area file path

        optional arguments:
        col_attributemapping_ref: column AttributeMapping reference
        row_attributemapping_ref: row AttributeMapping reference
        genome_ref: genome reference
        matrix_obj_ref: Matrix reference
        description: description stored on the matrix object
        """
        (obj_type, file_path, workspace_name, matrix_name, refs,
         scale) = self._validate_import_matrix_from_excel_params(params)

        if not isinstance(workspace_name, int):
            workspace_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            workspace_id = workspace_name

        data = self._file_to_data(file_path, refs, matrix_name, workspace_id)
        data['scale'] = scale
        if params.get('description'):
            data['description'] = params['description']

        matrix_obj_ref = self.data_util.save_object({
            'obj_type': 'KBaseMatrices.{}'.format(obj_type),
            'obj_name': matrix_name,
            'data': data,
            'workspace_name': workspace_id
        })['obj_ref']

        returnVal = {'matrix_obj_ref': matrix_obj_ref}

        report_output = self._generate_report(matrix_obj_ref, workspace_name)

        returnVal.update(report_output)

        return returnVal

    def export_matrix(self, params):
        """
        export_matrix: universal downloader for matrix data object

        arguments:
        obj_ref: generics object reference ("input_ref" is accepted as an alias)

        optional arguments:
        generics_module: select the generics data to be retrieved from
                        e.g. for an given data type like below:
                        typedef structure {
                          FloatMatrix2D data;
                          condition_set_ref condition_set_ref;
                        } SomeGenericsMatrix;
                        and only data is needed
                        generics_module should be
                        {'data': 'FloatMatrix2D'}

        returns a dict with the shock_id of the uploaded excel file
        """
        logging.info('Start exporting matrix')

        if 'input_ref' in params:
            params['obj_ref'] = params.pop('input_ref')

        obj_source = self.dfu.get_objects(
            {"object_refs": [params.get('obj_ref')]})['data'][0]
        obj_data = obj_source.get('data')

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)
        file_path = os.path.join(result_directory,
                                 '{}.xlsx'.format(obj_source.get('info')[1]))

        data_matrix = self.data_util.fetch_data(params).get('data_matrix')
        df = pd.read_json(data_matrix)

        df.to_excel(file_path, sheet_name='data')

        # each mapping goes to its own sheet and is removed from obj_data so
        # that it is not repeated in the metadata sheet
        if obj_data.get('col_mapping'):
            self._write_mapping_sheet(file_path, 'col_mapping',
                                      obj_data.get('col_mapping'),
                                      ['col_name', 'instance_name'])
            obj_data.pop('col_mapping')

        if obj_data.get('row_mapping'):
            self._write_mapping_sheet(file_path, 'row_mapping',
                                      obj_data.get('row_mapping'),
                                      ['row_name', 'instance_name'])
            obj_data.pop('row_mapping')

        try:
            obj_data.pop('data')
        except KeyError:
            logging.warning('Missing key [data]')

        obj_data.update(obj_data.get('attributes', {}))  # flatten for printing

        self._write_mapping_sheet(file_path, 'metadata', obj_data,
                                  ['name', 'value'])

        shock_id = self._upload_to_shock(file_path)

        return {'shock_id': shock_id}
def UploadFrommfmd(callback_url, params):
    """
    Parse an MFMD output file and save the result as a
    KBaseGeneRegulation.MotifSet workspace object.

    The value returned by ``parse_mfmd_output`` is stored as the object
    data without further modification.

    :param callback_url: URL handed to the DataFileUtil client
    :param params: instance of type "UploadmfmdInParams" -> structure:
       parameter "path" of String, parameter "ws_name" of String,
       parameter "obj_name" of String
    :returns: one-element list holding an instance of type "UploadOutput"
       -> structure: parameter "obj_ref" of String
    :raises KeyError: if "path", "ws_name" or "obj_name" is missing
    :raises ValueError: if the produced output is not a dict
    """
    # return variables are: output
    #BEGIN UploadFrommfmd
    print('Extracting motifs')
    motif_set = parse_mfmd_output(params['path'])
    print(motif_set)

    dfu = DataFileUtil(callback_url)
    save_objects_params = {
        'id': dfu.ws_name_to_id(params['ws_name']),
        'objects': [{
            'type': 'KBaseGeneRegulation.MotifSet',
            'data': motif_set,
            'name': params['obj_name'],
        }],
    }

    info = dfu.save_objects(save_objects_params)[0]
    print('SAVED OBJECT')
    print(info)

    # Reference is assembled from positions 6, 0 and 4 of the returned info.
    motif_set_ref = "%s/%s/%s" % (info[6], info[0], info[4])
    print(motif_set_ref)

    output = {'obj_ref': motif_set_ref}
    print(output)
    #END UploadFrommfmd

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method UploadFrommfmd return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
class AttributesUtil:
    """Import, extend and export KBaseExperiments.AttributeMapping data.

    Also converts AttributeMapping and ClusterSet objects to DataFrames
    for TSV / Excel export.
    """

    def __init__(self, config):
        """Store service endpoints from ``config`` and build the clients."""
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.srv_wiz_url = config['srv-wiz-url']
        self.scratch = config['scratch']
        self.dfu = DataFileUtil(self.callback_url)
        self.kbse = KBaseSearchEngine(config['search-url'])
        self.data_util = DataUtil(config)
        self.wsClient = workspaceService(self.ws_url, token=self.token)
        self.DEFAULT_ONTOLOGY_ID = "Custom:Term"
        self.DEFAULT_UNIT_ID = "Custom:Unit"
        self.ONT_LABEL_DEL = " - "
        self.ONT_TERM_DEL = ":"

    @staticmethod
    def validate_params(params, expected, opt_param=()):
        """Validates that required parameters are present.
        Warns if unexpected parameters appear

        :param params: mapping of supplied parameters
        :param expected: iterable of required keys
        :param opt_param: iterable of optional keys
        :raises ValueError: if any required key is missing
        """
        expected = set(expected)
        opt_param = set(opt_param)
        missing = expected - set(params)
        if missing:
            # sorted() keeps the message stable from run to run
            raise ValueError(
                "Required keys {} not in supplied parameters".format(
                    ", ".join(sorted(missing))))
        defined_param = expected | opt_param
        for param in params:
            if param not in defined_param:
                logging.warning(
                    "Unexpected parameter {} supplied".format(param))

    def file_to_attribute_mapping(self, params):
        """Convert a user supplied file to an AttributeMapping object.

        :param params: needs 'input_file_path' or 'input_shock_id', plus
            'output_ws_id' and 'output_obj_name'
        :returns: dict with key 'attribute_mapping_ref'
        :raises ValueError: if neither input key is supplied
        """
        if 'input_file_path' in params:
            scratch_file_path = params['input_file_path']
        elif 'input_shock_id' in params:
            scratch_file_path = self.dfu.shock_to_file({
                'shock_id': params['input_shock_id'],
                'file_path': self.scratch
            }).get('file_path')
        else:
            raise ValueError(
                "Must supply either a input_shock_id or input_file_path")
        attr_mapping = self._file_to_am_obj(scratch_file_path)
        info = self.dfu.save_objects({
            "id": params['output_ws_id'],
            "objects": [{
                "type": "KBaseExperiments.AttributeMapping",
                "data": attr_mapping,
                "name": params['output_obj_name']
            }]
        })[0]
        return {
            "attribute_mapping_ref": "%s/%s/%s" % (info[6], info[0], info[4])
        }

    def append_file_to_attribute_mapping(self, staging_file_subdir_path,
                                         old_am_ref, output_ws_id,
                                         new_am_name=None):
        """Append an attribute mapping file to an existing attribute
        mapping object and save the result as a new object.

        :param staging_file_subdir_path: staging-area path of the file
        :param old_am_ref: reference of the AttributeMapping to extend
        :param output_ws_id: workspace id the new object is saved to
        :param new_am_name: name for the new object; when falsy, the old
            name plus a local timestamp suffix is used
        :returns: dict with key 'attribute_mapping_ref'
        """
        download_staging_file_params = {
            'staging_file_subdir_path': staging_file_subdir_path
        }
        scratch_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')
        append_am_data = self._file_to_am_obj(scratch_file_path)

        old_am_obj = self.dfu.get_objects(
            {'object_refs': [old_am_ref]})['data'][0]
        old_am_name = old_am_obj['info'][1]
        old_am_data = old_am_obj['data']

        new_am_data = self._check_and_append_am_data(old_am_data,
                                                     append_am_data)
        if not new_am_name:
            current_time = time.localtime()
            new_am_name = old_am_name + time.strftime(
                '_%H_%M_%S_%Y_%m_%d', current_time)

        info = self.dfu.save_objects({
            "id": output_ws_id,
            "objects": [{
                "type": "KBaseExperiments.AttributeMapping",
                "data": new_am_data,
                "name": new_am_name
            }]
        })[0]
        return {
            "attribute_mapping_ref": "%s/%s/%s" % (info[6], info[0], info[4])
        }

    def update_matrix_attribute_mapping(self, params):
        """Append a staged file to a matrix's row or column attribute
        mapping, save a matrix copy pointing at it, and create a report.

        :param params: reads 'dimension' ("col" or "row"),
            'workspace_name' (name, or an int used directly as the id),
            'input_matrix_ref', 'staging_file_subdir_path',
            'output_am_obj_name' and 'output_matrix_obj_name'
        :returns: dict with 'new_matrix_obj_ref',
            'new_attribute_mapping_ref', 'report_name' and 'report_ref'
        :raises ValueError: on a bad dimension or when the matrix has no
            attribute mapping for that dimension
        """
        dimension = params.get('dimension')
        if dimension not in ['col', 'row']:
            raise ValueError('Please use "col" or "row" for input dimension')

        workspace_name = params.get('workspace_name')
        old_matrix_ref = params.get('input_matrix_ref')
        old_matrix_obj = self.dfu.get_objects(
            {'object_refs': [old_matrix_ref]})['data'][0]
        old_matrix_info = old_matrix_obj['info']
        old_matrix_data = old_matrix_obj['data']
        am_ref_key = '{}_attributemapping_ref'.format(dimension)
        old_am_ref = old_matrix_data.get(am_ref_key)

        if not isinstance(workspace_name, int):
            workspace_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            workspace_id = workspace_name

        if not old_am_ref:
            raise ValueError(
                'Matrix object does not have {} attribute mapping'.format(
                    dimension))

        new_am_ref = self.append_file_to_attribute_mapping(
            params['staging_file_subdir_path'], old_am_ref, workspace_id,
            params['output_am_obj_name'])['attribute_mapping_ref']
        old_matrix_data[am_ref_key] = new_am_ref

        info = self.dfu.save_objects({
            "id": workspace_id,
            "objects": [{
                "type": old_matrix_info[2],
                "data": old_matrix_data,
                "name": params['output_matrix_obj_name']
            }]
        })[0]
        new_matrix_obj_ref = "%s/%s/%s" % (info[6], info[0], info[4])

        objects_created = [{
            'ref': new_am_ref,
            'description': 'Updated Attribute Mapping'
        }, {
            'ref': new_matrix_obj_ref,
            'description': 'Updated Matrix'
        }]
        report_params = {
            'message': '',
            'objects_created': objects_created,
            'workspace_name': workspace_name,
            'report_object_name':
                'import_matrix_from_biom_' + str(uuid.uuid4())
        }
        kbase_report_client = KBaseReport(self.callback_url,
                                          token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        return {
            'new_matrix_obj_ref': new_matrix_obj_ref,
            'new_attribute_mapping_ref': new_am_ref,
            'report_name': output['name'],
            'report_ref': output['ref']
        }

    def _check_and_append_am_data(self, old_am_data, append_am_data):
        """Return new AttributeMapping data with the appended attributes
        and instance values added; ``old_am_data`` is not modified.

        :raises ValueError: if an attribute name occurs in both inputs,
            or an existing instance is absent from the appended data
        """
        exclude_keys = {'attributes', 'instances'}
        new_am_data = {
            k: old_am_data[k] for k in old_am_data.keys() - exclude_keys
        }

        old_attrs = old_am_data.get('attributes')
        old_insts = old_am_data.get('instances')
        append_attrs = append_am_data.get('attributes')
        append_insts = append_am_data.get('instances')

        # checking duplicate attributes
        old_attrs_names = [old_attr.get('attribute')
                           for old_attr in old_attrs]
        append_attrs_names = [append_attr.get('attribute')
                              for append_attr in append_attrs]
        duplicate_attrs = set(old_attrs_names).intersection(
            append_attrs_names)
        if duplicate_attrs:
            error_msg = 'Duplicate attribute mappings: [{}]'.format(
                duplicate_attrs)
            raise ValueError(error_msg)

        # checking missing instances
        missing_inst = old_insts.keys() - append_insts.keys()
        if missing_inst:
            error_msg = (
                'Appended attribute mapping misses [{}] instances'.format(
                    missing_inst))
            raise ValueError(error_msg)

        new_am_data['attributes'] = old_attrs + append_attrs

        # deepcopy so extending the value lists leaves the input intact
        new_insts = deepcopy(old_insts)
        for inst_name, val in new_insts.items():
            val.extend(append_insts.get(inst_name))
        new_am_data['instances'] = new_insts

        return new_am_data

    def _am_data_to_df(self, data):
        """
        Converts AttributeMapping object data to a dataframe

        Attribute columns get display headers (e.g. ``attribute_ont_id``
        becomes ``Attribute ontology id``), the header form that
        ``_file_to_am_obj`` / ``_df_to_am_obj`` read back.
        """
        attributes = pd.DataFrame(data['attributes'])
        # rename() returns a new frame; the result has to be kept for the
        # headers to change.
        attributes = attributes.rename(
            columns=lambda x: x.replace("ont", "ontology").capitalize()
            .replace("_", " "))
        instances = pd.DataFrame(data['instances'])
        return attributes.join(instances)

    def _clusterset_data_to_df(self, data):
        """
        Converts a cluster set object data to a dataframe

        Loads the matrix referenced by ``data['original_data']`` and adds
        a 'cluster' column holding each item's cluster position.
        """
        original_matrix_ref = data.get('original_data')
        data_matrix = self.data_util.fetch_data({
            'obj_ref': original_matrix_ref
        }).get('data_matrix')
        data_df = pd.read_json(data_matrix)

        clusters = data.get('clusters')
        id_names = [
            item_id
            for cluster in clusters
            for item_id in cluster.get('id_to_data_position').keys()
        ]
        if set(data_df.columns.tolist()) == set(id_names):
            # cluster is based on columns
            data_df = data_df.T

        cluster_names = [None] * data_df.index.size
        for cluster_id, cluster in enumerate(clusters):
            for item_id in cluster.get('id_to_data_position').keys():
                cluster_names[data_df.index.get_loc(item_id)] = cluster_id
        data_df['cluster'] = cluster_names

        return data_df

    def _ws_obj_to_df(self, input_ref):
        """Converts workspace obj to a DataFrame

        :returns: tuple (object name, DataFrame, object type string)
        :raises ValueError: if the object is neither an AttributeMapping
            nor a ClusterSet
        """
        res = self.dfu.get_objects({'object_refs': [input_ref]})['data'][0]
        name = res['info'][1]
        obj_type = res['info'][2]

        if "KBaseExperiments.AttributeMapping" in obj_type:
            cs_df = self._am_data_to_df(res['data'])
        elif "KBaseExperiments.ClusterSet" in obj_type:
            cs_df = self._clusterset_data_to_df(res['data'])
        else:
            err_msg = 'Ooops! [{}] is not supported.\n'.format(obj_type)
            err_msg += ('Please supply KBaseExperiments.AttributeMapping '
                        'or KBaseExperiments.ClusterSet')
            # raise the assembled message, not the literal "err_msg"
            raise ValueError(err_msg)

        return name, cs_df, obj_type

    def _file_to_am_obj(self, scratch_file_path):
        """Read an Excel or delimited text file into AttributeMapping
        data.

        When the second column header is "attribute ontology id" the
        file goes to ``_df_to_am_obj``; otherwise to
        ``_isa_df_to_am_object``.
        """
        try:
            df = pd.read_excel(scratch_file_path, dtype='str')
        except XLRDError:
            # sep=None asks pandas to sniff the delimiter, which only the
            # python engine supports; name it explicitly.
            df = pd.read_csv(scratch_file_path, sep=None, dtype='str',
                             engine='python')
        df = df.replace('nan', '')
        if df.columns[1].lower() == "attribute ontology id":
            am_obj = self._df_to_am_obj(df)
        else:
            am_obj = self._isa_df_to_am_object(df)
        return am_obj

    def _df_to_am_obj(self, am_df):
        """Converts a dataframe from a user file to AttributeMapping data

        Columns whose header matches "unit" or "attribute" describe the
        attributes; every remaining column is one instance.

        :raises ValueError: on an empty frame, when no instance column
            remains, or when there is no 'attribute' column
        """
        if not len(am_df):
            raise ValueError("No attributes in supplied files")

        attribute_df = am_df.filter(regex="[Uu]nit|[Aa]ttribute")
        instance_df = am_df.drop(attribute_df.columns, axis=1)
        if not len(instance_df.columns):
            raise ValueError(
                "Unable to find any instance columns in supplied file")

        # Work on a renamed copy so the added 'source' column never
        # touches a view of am_df.
        attribute_df = attribute_df.rename(
            columns=lambda x: x.lower().replace(" ontology ",
                                                "_ont_").strip())
        if "attribute" not in attribute_df.columns:
            raise ValueError(
                "Unable to find a 'attribute' column in supplied file")
        attribute_df['source'] = 'upload'

        attribute_fields = ('attribute', 'unit', 'attribute_ont_id',
                            'unit_ont_id', 'source')
        attributes = attribute_df.filter(
            items=attribute_fields).to_dict('records')
        print(attributes)

        self._validate_attribute_values(
            am_df.set_index(attribute_df.attribute).iterrows())

        attribute_mapping = {
            'ontology_mapping_method': "User Curation",
            'attributes': [self._add_ontology_info(f) for f in attributes],
            'instances': instance_df.to_dict('list')
        }
        return attribute_mapping

    def _isa_df_to_am_object(self, isa_df):
        """Convert a frame whose columns are attributes and whose rows
        are instances into AttributeMapping data.

        The ID column is the first of 'Sample Name', 'Assay Name' or the
        first column that has no duplicated values; ``isa_df`` is
        re-indexed in place on it.

        :raises ValueError: if none of those columns is unique per row
        """
        skip_columns = {
            'Raw Data File', 'Derived Data File', 'Array Data File',
            'Image File'
        }

        if 'Sample Name' in isa_df.columns and not any(
                isa_df['Sample Name'].duplicated()):
            isa_df.set_index('Sample Name', inplace=True)
        elif 'Assay Name' in isa_df.columns and not any(
                isa_df['Assay Name'].duplicated()):
            isa_df.set_index('Assay Name', inplace=True)
        elif not any(isa_df[isa_df.columns[0]].duplicated()):
            logging.warning(f'Using {isa_df.columns[0]} as ID column')
            isa_df.set_index(isa_df.columns[0], inplace=True)
        else:
            raise ValueError(
                "Unable to detect an ID column that was unigue for each row. "
                f"Considered 'Sample Names', 'Assay Names' and {isa_df.columns[0]}"
            )

        # DataFrame.items() gives the same (column, Series) pairs as
        # iteritems(), which pandas 2 removed.
        self._validate_attribute_values(isa_df.items())

        attribute_mapping = {
            'ontology_mapping_method': "User Curation - ISA format"
        }
        attribute_mapping['attributes'], new_skip_cols = (
            self._get_attributes_from_isa(isa_df, skip_columns))
        reduced_isa = isa_df.drop(columns=new_skip_cols, errors='ignore')
        attribute_mapping['instances'] = reduced_isa.T.to_dict('list')

        return attribute_mapping

    def _validate_attribute_values(self, attribute_series):
        """Run the matching ``AttributeValidation`` validator, when one
        exists, on each (attribute name, values) pair.

        :raises ValueError: if any validator reports errors; details are
            written to the log first
        """
        errors = {}
        for attr, vals in attribute_series:
            try:
                validator = getattr(AttributeValidation, attr)
                attr_errors = validator(vals)
                if attr_errors:
                    errors[attr] = attr_errors
            except AttributeError:
                continue

        if errors:
            for attr, attr_errors in errors.items():
                # Explicit '+': adjacent literals would merge and become
                # the join separator instead of the message header.
                logging.error(
                    f'Attribute {attr} had the following validation errors:\n'
                    + "\n".join(attr_errors) + '\n')
            raise ValueError(
                f'The following attributes failed validation: {", ".join(errors)}'
                f'\n See the log for details')

    def _get_attributes_from_isa(self, isa_df, skip_columns):
        """Build attribute dicts from ISA-style columns.

        'Unit', 'Term Source REF' and 'Term Accession Number' columns are
        attached to the attribute column preceding them and added to
        ``skip_columns`` (the passed set is mutated). 'Term Accession
        Number' columns of ``isa_df`` are rewritten in place.

        :returns: tuple (list of attribute dicts, skip_columns)
        :raises ValueError: if an attribute has more than one unit
            column, unit value or unit ontology id
        """
        attributes = []
        # associate attribute columns with the other columns that relate
        # to them
        for col in isa_df.columns:
            if col.startswith('Term Source REF'):
                skip_columns.add(col)
                last_attr = attributes[-1]
                if '_unit' in last_attr:
                    last_attr['_unit_ont'] = col
                else:
                    last_attr['_val_ont'] = col

            elif col.startswith('Term Accession Number'):
                # If the term Accession is a web link only grab the last
                # bit. Similarly, sometimes the number is prefixed with
                # the term source e.x. UO_0000012
                isa_df[col] = isa_df[col].map(
                    lambda x: x.split("/")[-1].split("_")[-1])
                skip_columns.add(col)
                last_attr = attributes[-1]
                if '_unit' in last_attr:
                    last_attr['_unit_accession'] = col
                else:
                    last_attr['_val_accession'] = col

            elif col.startswith('Unit'):
                skip_columns.add(col)
                last_attr = attributes[-1]
                # '_unit' is the key set below; 'unit' only exists after
                # the second loop, so testing it could never fire here.
                if '_unit' in last_attr:
                    raise ValueError(
                        "More than one unit column is supplied for attribute {}"
                        .format(last_attr['attribute']))
                last_attr['_unit'] = col

            elif col not in skip_columns:
                split_col = col.split("|", 1)
                if len(split_col) > 1:
                    attributes.append({
                        "attribute": split_col[0],
                        "attribute_ont_id": split_col[1],
                        "source": "upload"
                    })
                else:
                    attributes.append({"attribute": col,
                                       "source": "upload"})

        # handle the categories for each attribute
        for i, attribute in enumerate(attributes):
            if '_val_accession' in attribute:
                category_df = isa_df[[
                    attribute['attribute'],
                    attribute.pop('_val_ont'),
                    attribute.pop('_val_accession')
                ]].drop_duplicates()
                category_df['attribute_ont_id'] = (
                    category_df.iloc[:, 1].str.cat(
                        category_df.iloc[:, 2], sep=":"))
                category_df['value'] = category_df[attribute['attribute']]
                cats = category_df.set_index(attribute['attribute'])[[
                    'value', 'attribute_ont_id'
                ]].to_dict('index')
                attribute['categories'] = {
                    k: self._add_ontology_info(v) for k, v in cats.items()
                }

            if '_unit' in attribute:
                units = isa_df[attribute.pop('_unit')].unique()
                if len(units) > 1:
                    raise ValueError(
                        "More than one unit type is supplied for attribute {}: {}"
                        .format(attribute['attribute'], units))
                attribute['unit'] = units[0]

                # '_unit_ont' is only ever set once '_unit' exists
                if '_unit_ont' in attribute:
                    unit_ont = isa_df[attribute.pop('_unit_ont')].str.cat(
                        isa_df[attribute.pop('_unit_accession')],
                        sep=":").unique()
                    # check the ontology ids themselves, not the units
                    if len(unit_ont) > 1:
                        raise ValueError(
                            "More than one unit ontology is supplied for attribute "
                            "{}: {}".format(attribute['attribute'], unit_ont))
                    attribute['unit_ont_id'] = unit_ont[0]

            attributes[i] = self._add_ontology_info(attribute)

        return attributes, skip_columns

    def _search_ontologies(self, term, closest=False):
        """
        Match to an existing KBase ontology term

        :param term: Text to match
        :param closest: if false, term must exactly match an ontology ID
        :return: dict(ontology_ref, id), or None when nothing is found
        """
        params = {
            "object_types": ["OntologyTerm"],
            "match_filter": {
                "lookup_in_keys": {
                    "id": {
                        "value": term
                    }
                }
            },
            "access_filter": {
                "with_private": 0,
                "with_public": 1
            },
            "pagination": {
                "count": 1
            },
            "post_processing": {
                "skip_data": 1
            }
        }
        if closest:
            params['match_filter'] = {"full_text_in_all": term}

        res = self.kbse.search_objects(params)
        if not res['objects']:
            return None

        term = res['objects'][0]
        return {
            "ontology_ref": term['guid'].split(":")[1],
            "id": term['key_props']['id']
        }

    def _add_ontology_info(self, attribute):
        """Searches KBASE ontologies for terms matching the user supplied
        attributes and units. Add the references if found

        Optional unit keys with an empty-string value are dropped. Ids
        with no match are kept unless they are empty or just ":".

        :returns: a new attribute dict; the argument is not modified
        """
        optionals = {
            "unit",
            "unit_ont_id",
            "unit_ont_ref",
        }
        attribute = {
            k: v for k, v in attribute.items()
            if k not in optionals or v != ""
        }

        ont_info = self._search_ontologies(
            attribute.get('attribute_ont_id', "").replace("_", ":"))
        if ont_info:
            attribute['attribute_ont_ref'] = ont_info['ontology_ref']
            attribute['attribute_ont_id'] = ont_info['id']
        elif not attribute.get(
                'attribute_ont_id') or attribute['attribute_ont_id'] == ":":
            attribute.pop('attribute_ont_id', None)

        if attribute.get('unit'):
            ont_info = self._search_ontologies(
                attribute.get('unit_ont_id', '').replace("_", ":"))
            if ont_info:
                attribute['unit_ont_ref'] = ont_info['ontology_ref']
                attribute['unit_ont_id'] = ont_info['id']
            # Test the unit's own id: checking 'attribute_ont_id' here
            # raised KeyError when the unit had no ontology id.
            elif not attribute.get(
                    'unit_ont_id') or attribute['unit_ont_id'] == ":":
                attribute.pop('unit_ont_id', None)

        return attribute

    def to_tsv(self, params):
        """Write the object at ``params['input_ref']`` as a TSV file in
        ``params['destination_dir']``.

        :returns: tuple (object name, {'file_path': path})
        """
        files = {}
        _id, df, obj_type = self._ws_obj_to_df(params['input_ref'])
        files['file_path'] = os.path.join(params['destination_dir'],
                                          _id + ".tsv")
        df.to_csv(files['file_path'], sep="\t", index=False)
        return _id, files

    def to_excel(self, params):
        """Write the object at ``params['input_ref']`` as an Excel file
        in ``params['destination_dir']``.

        :returns: tuple (object name, {'file_path': path})
        """
        files = {}
        _id, df, obj_type = self._ws_obj_to_df(params['input_ref'])
        files['file_path'] = os.path.join(params['destination_dir'],
                                          _id + ".xlsx")

        # The context manager writes the workbook on exit;
        # ExcelWriter.save() no longer exists in pandas 2.
        with pd.ExcelWriter(files['file_path']) as writer:
            if "KBaseExperiments.AttributeMapping" in obj_type:
                df.to_excel(writer, sheet_name="Attributes", index=False)
            elif "KBaseExperiments.ClusterSet" in obj_type:
                df.to_excel(writer, sheet_name="ClusterSet", index=True)
            # else is checked in `_ws_obj_to_df`

        return _id, files

    def export(self, file, name, input_ref):
        """Saves a set of files to SHOCK for export

        Moves ``file`` into a new uniquely named directory under scratch
        and packages that directory for download.

        :returns: dict with key 'shock_id'
        """
        export_package_dir = os.path.join(self.scratch,
                                          name + str(uuid.uuid4()))
        os.makedirs(export_package_dir)
        shutil.move(file,
                    os.path.join(export_package_dir,
                                 os.path.basename(file)))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [input_ref]
        })

        return {'shock_id': package_details['shock_id']}
def run_FamaGenomeProfiling(self, ctx, params):
    """
    Run genome functional profiling module of Fama.

    Every reference in params['genome_ref'] must point at a Genome or a
    GenomeSet; a GenomeSet is expanded into its member genomes. Proteins
    of each genome are exported to FASTA, the Fama protein profiling
    pipeline is run, the resulting feature set is saved, HTML and Krona
    outputs are uploaded, and a KBase report is created.

    :param params: instance of type "FamaGenomeProfilingParams"
       (Parameters for genome functional profiling. workspace_name - the
       name of the workspace for input/output genome_refs - references to
       a genome object ref_dataset - the name of Fama reference dataset
       output_result_name - the name of the output DomainAnnotation) ->
       structure: parameter "workspace_name" of String, parameter
       "genome_ref" of list of String, parameter "ref_dataset" of String,
       parameter "output_feature_set_name" of String, parameter
       "output_annotation_name" of String
    :returns: instance of type "ReportResults" (Output report parameters
       report_name - the name of the report object report_ref - the
       reference to the report object) -> structure: parameter
       "report_name" of String, parameter "report_ref" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_FamaGenomeProfiling
    # Import protein sequences from input genome_ref
    ws_client = Workspace(self.ws_url)
    input_genome_refs = params['genome_ref']
    fama_reference = params['ref_dataset']
    input_proteins = {}
    name2ref = {}

    def fetch(ref):
        # Fetch one workspace object by reference.
        return ws_client.get_objects2({'objects': [{'ref': ref}]})['data'][0]

    def register(name, ref, data, duplicate_message):
        # Record one genome; its name must not have been seen before.
        if name in name2ref:
            raise ServerError(duplicate_message)
        name2ref[name] = ref
        fasta = genome_proteins_to_fasta(data, self.shared_folder)
        input_proteins[name] = {}
        input_proteins[name]['fwd'] = fasta

    for input_genome_ref in input_genome_refs:
        fetched = fetch(input_genome_ref)
        obj_data = fetched['data']
        obj_name = fetched['info'][1]
        # type string looks like "Module.Type-version"; keep only "Type"
        obj_type = fetched['info'][2].split('.')[1].split('-')[0]

        if obj_type == 'GenomeSet':
            print('GenomeSet data', obj_data)
            if 'elements' in obj_data:
                member_refs = [
                    item['ref'] for item in obj_data['elements'].values()
                ]
            elif 'items' in obj_data:
                member_refs = [item['ref'] for item in obj_data['items']]
            else:
                member_refs = []
            for sub_obj_ref in member_refs:
                member = fetch(sub_obj_ref)
                genome_name = member['info'][1]
                register(
                    genome_name, sub_obj_ref, member['data'],
                    'All input genome names must be unique. Check ' +
                    genome_name)
        elif obj_type == 'Genome':
            register(obj_name, input_genome_ref, obj_data,
                     'All input genome names must be unique')
        else:
            raise ServerError('Incompatible object: ' + input_genome_ref +
                              ' (' + obj_name + ')')

    self.log('Input sequence files:', str(input_proteins))
    self.log('reference: ', fama_reference)

    # Run Fama
    fama_params = {
        'input_proteins': input_proteins,
        'work_dir': self.shared_folder,
        'reference': fama_reference,
        'ws_name': params['workspace_name'],
        'ws_client': ws_client,
        'featureset_name': params['output_feature_set_name'],
        'annotation_prefix': params['output_annotation_name'],
        'name2ref': name2ref
    }
    fama_output = protein_functional_profiling_pipeline(fama_params)
    objects_created = fama_output['objects_created']

    dfu = DataFileUtil(self.callback_url)
    workspace_id = dfu.ws_name_to_id(params['workspace_name'])
    object_type = 'KBaseCollections.FeatureSet'
    feature_set_object = {
        'type': object_type,
        'data': fama_output['feature_set_data'],
        'name': params['output_feature_set_name']
    }
    save_object_params = {
        'id': workspace_id,
        'objects': [feature_set_object]
    }
    try:
        dfu_oi = dfu.save_objects(save_object_params)[0]
    except ServerError as dfue:
        # not really any way to test this block
        self.log('Logging exception saving feature set')
        self.log(str(dfue))
        raise

    feature_set_obj_ref = '/'.join(
        str(part) for part in (dfu_oi[6], dfu_oi[0], dfu_oi[4]))
    objects_created.append({
        'ref': feature_set_obj_ref,
        'description': 'Filtered genome features'
    })
    self.log('FeatureSet saved to ' + feature_set_obj_ref)

    # Write HTML output to workspace
    message = 'Fama protein functional profiling finished successfully'
    try:
        dfu_output = dfu.file_to_shock(
            {'file_path': fama_output['html_report']})
    except ServerError as dfue:
        # not really any way to test this block
        self.log('Logging exception loading results to shock')
        self.log(str(dfue))
        raise
    self.log('HTML report saved: ' + str(dfu_output))

    html_links = [{
        'shock_id': dfu_output['shock_id'],
        'description': 'HTML report for Fama App',
        'name': 'fama_report.html',
        'label': 'Fama_report'
    }]

    krona_charts = fama_output['krona_charts']
    for krona_file in krona_charts:
        try:
            dfu_output = dfu.file_to_shock({'file_path': krona_file})
            html_links.append({
                'shock_id': dfu_output['shock_id'],
                'description': 'Krona chart for function taxonomy profile',
                'name': krona_charts[krona_file][0],
                'label': krona_charts[krona_file][1]
            })
        except ServerError as dfue:
            # not really any way to test this block
            self.log('Logging exception loading results to shock')
            self.log(str(dfue))
            raise
        self.log('Krona chart saved: ' + str(dfu_output))

    # Save report
    report_params = {
        'message': message,
        'objects_created': objects_created,
        'direct_html_link_index': 0,
        'html_links': html_links,
        'file_links': fama_output['report_files'],
        'report_object_name': 'fama_profiling_report_' + str(uuid.uuid4()),
        'workspace_name': params['workspace_name'],
        'html_window_height': 460
    }
    try:
        self.log('Call KBaseReport at ' + str(self.callback_url))
        report = KBaseReport(self.callback_url)
        self.log('Ready to save KBase report: ' + str(report_params))
        report_info = report.create_extended_report(report_params)
    except ServerError as kre:
        # not really any way to test this block
        self.log('Logging exception saving report')
        self.log(str(kre))
        raise

    report_info['report_params'] = report_params
    self.log('KBase report saved: ' + str(report_info))
    output = {
        'report_name': report_info['name'],
        'report_ref': report_info['ref']
    }
    #END run_FamaGenomeProfiling

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method run_FamaGenomeProfiling return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def run_MotifSuite(self, ctx, params):
    """
    Run all motif finders, build an ensemble MotifSet from motifs that
    several finders agree on, save it and create a KBaseReport.

    Steps visible in this method:
      1. each finder's DiscoverMotifsFromSequenceSet(params) is run in
         its own process, one after another;
      2. the motif set references reported by MotifSuiteUtil are
         downloaded;
      3. motifs from different motif sets that match under
         params['threshold'] are grouped together;
      4. groups whose members come from at least params['proportion'] of
         the motif sets are merged into one motif each;
      5. the ensemble is saved as 'EnsembleMotifSet' and an HTML report
         is uploaded.

    ``params`` is modified: 'motifset_refs' is added.

    :param params: instance of type "motifsuite_seq_input" -> structure:
       parameter "workspace_name" of String, parameter "genome_ref" of
       String, parameter "SS_ref" of String, parameter "promoter_length"
       of Long, parameter "motif_min_length" of Long, parameter
       "motif_max_length" of Long, parameter "obj_name" of String,
       parameter "prb" of Double, parameter "motif_length" of Long,
       parameter "background" of Long, parameter "mask_repeats" of Long,
       parameter "background_group" of mapping from String to String,
       parameter "threshold" of Double, parameter "proportion" of Double
    :returns: instance of type "ReportResults" -> structure: parameter
       "report_name" of String, parameter "report_ref" of String
    :raises ValueError: if the HTML directory cannot be uploaded
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_MotifSuite
    finders = (
        MotifFinderHomer(self.callback_url),
        MotifFindermfmd(self.callback_url),
        MotifFinderMEME(self.callback_url),
        MotifFinderGibbs(self.callback_url),
        MotifFinderMdscan(self.callback_url),
        MotifFinderSampler(self.callback_url),
    )
    # Each finder is started and joined before the next one starts, so
    # the finders run sequentially.
    for finder in finders:
        worker = Process(target=finder.DiscoverMotifsFromSequenceSet,
                         args=(params,))
        worker.start()
        worker.join()

    MSU = MotifSuiteUtil()
    params['motifset_refs'] = MSU.get_obj_refs()
    print(params['motifset_refs'])

    dms = DownloadMotifSets()
    MotifSetDict = dms.DownloadMotifSet(params['motifset_refs'],
                                        self.callback_url)

    # Group matching motifs. A member is (motif set ref, motif index).
    matchSets = []
    threshold = float(params['threshold'])
    fmu = FastaUtils()
    refs = list(MotifSetDict.keys())
    for i, MSR1 in enumerate(refs):
        for j, motif1 in enumerate(MotifSetDict[MSR1]['Motifs']):
            # only compare against motif sets that come later
            for MSR2 in refs[i + 1:]:
                for l, motif2 in enumerate(MotifSetDict[MSR2]['Motifs']):
                    if not fmu.CompareMotifsBP(motif1, motif2, threshold):
                        continue
                    member1 = (MSR1, j)
                    member2 = (MSR2, l)
                    index1 = -1
                    index2 = -1
                    for m, mset in enumerate(matchSets):
                        if member1 in mset:
                            index1 = m
                        if member2 in mset:
                            index2 = m
                    if index1 < 0 and index2 < 0:
                        matchSets.append({member1, member2})
                    elif index1 < 0:
                        matchSets[index2].add(member1)
                    elif index2 < 0:
                        matchSets[index1].add(member2)
                    elif index1 != index2:
                        # Merge in place: set.union() returns a new set,
                        # so the popped group's members would be lost.
                        matchSets[index1].update(matchSets[index2])
                        matchSets.pop(index2)

    # Keep groups supported by a large enough share of the motif sets.
    numMotifSets = len(params['motifset_refs'])
    proportion = float(params['proportion'])
    KeepSets = []
    print('NUM MATCHSETS********')
    print(len(matchSets))
    for i, mset in enumerate(matchSets):
        unique_refs = {member[0] for member in mset}
        if float(len(unique_refs)) / numMotifSets >= proportion:
            KeepSets.append(i)
    print(len(KeepSets))

    # Set-level fields are copied from the first downloaded motif set.
    ESO = {}
    first_set = next(iter(MotifSetDict.values()), None)
    if first_set is not None:
        ESO['Condition'] = first_set['Condition']
        ESO['SequenceSet_ref'] = first_set['SequenceSet_ref']
        ESO['Alphabet'] = deepcopy(first_set['Alphabet'])
        ESO['Background'] = deepcopy(first_set['Background'])

    ESO['Motifs'] = []
    # Add motifs
    for keep in KeepSets:
        motif = fmu.merge(matchSets[keep], MotifSetDict)
        ESO['Motifs'].append(deepcopy(motif))

    # upload new MSO
    dfu = DataFileUtil(self.callback_url)
    save_objects_params = {
        'id': dfu.ws_name_to_id(params['workspace_name']),
        'objects': [{
            'type': 'KBaseGeneRegulation.MotifSet',
            'data': ESO,
            'name': 'EnsembleMotifSet',
        }],
    }
    info = dfu.save_objects(save_objects_params)[0]
    obj_ref = "%s/%s/%s" % (info[6], info[0], info[4])

    htmlDir = os.path.join(self.shared_folder, 'ensemble_html')
    # exist_ok: a second call with the same shared folder must not crash
    os.makedirs(htmlDir, exist_ok=True)
    mr = MakeNewReport()
    mr.MakeReport(htmlDir, ESO)

    try:
        html_upload_ret = dfu.file_to_shock({'file_path': htmlDir,
                                             'make_handle': 0,
                                             'pack': 'zip'})
    except Exception as err:
        # same exception type and message as before, cause now chained
        raise ValueError('error uploading HTML file to shock') from err

    reportName = 'MEMEMotifFinder_report_' + str(uuid.uuid4())
    reportObj = {
        'objects_created': [{'ref': obj_ref,
                             'description': 'Motif Set generated by MEME'}],
        'message': '',
        'direct_html': '',
        'direct_html_link_index': 0,
        'file_links': [],
        'html_links': [{'shock_id': html_upload_ret['shock_id'],
                        'name': 'index.html',
                        'label': 'Save promoter_download.zip'}],
        'html_window_height': 220,
        'workspace_name': params['workspace_name'],
        'report_object_name': reportName
    }

    report = KBaseReport(self.callback_url, token=ctx['token'])
    report_info = report.create_extended_report(reportObj)
    output = {
        'report_name': report_info['name'],
        'report_ref': report_info['ref']
    }
    #END run_MotifSuite

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method run_MotifSuite return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
class BiomUtil:
    """Import amplicon data (BIOM / TSV / FASTA) as Matrix and AmpliconSet objects.

    The public entry points are ``import_matrix_from_biom`` and
    ``export_amplicon_set_tsv``; everything else is a private helper.
    """

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path

        Does nothing for an empty/None path. An already existing directory is
        not an error; an existing non-directory at ``path`` raises OSError.
        """
        if not path:
            return
        os.makedirs(path, exist_ok=True)

    def __init__(self, config):
        """Build the service clients and utilities from the SDK ``config`` dict.

        Reads the keys 'SDK_CALLBACK_URL', 'scratch', 'KB_AUTH_TOKEN',
        'taxon-workspace-name' and 'search-url' from ``config``.
        """
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.data_util = DataUtil(config)
        self.attr_util = AttributesUtil(config)
        self.matrix_util = MatrixUtil(config)
        # Type names look like "Module.TypeName-version"; keep only "TypeName".
        self.matrix_types = [x.split(".")[1].split('-')[0]
                             for x in self.data_util.list_generic_types()]
        self.taxon_wsname = config['taxon-workspace-name']
        self.kbse = KBaseSearchEngine(config['search-url'])

    def _read_tsv(self, tsv_file, error_msg):
        """Read a delimited text file into a DataFrame indexed by its first column.

        The delimiter is sniffed by pandas first and the file is then re-read
        with that delimiter. Any failure is reported as ValueError(error_msg).
        """
        try:
            reader = pd.read_csv(tsv_file, sep=None, iterator=True)
            # NOTE: relies on a private pandas attribute to get the sniffed delimiter.
            inferred_sep = reader._engine.data.dialect.delimiter
            return pd.read_csv(tsv_file, sep=inferred_sep, index_col=0)
        except Exception:
            raise ValueError(error_msg)

    def import_matrix_from_biom(self, params):
        """
        Import a matrix and its amplicon set from staged BIOM/TSV/FASTA files.

        required params (checked in _process_params):
        obj_type: matrix type name, must be one of self.matrix_types
        matrix_name: matrix object name
        workspace_name: workspace name (or an int workspace id) to save to
        scale: one of SCALE_TYPES
        amplicon_set_name: AmpliconSet object name

        exactly one file group is used, checked in this order:
        biom_tsv: {'biom_file_biom_tsv', 'tsv_file_biom_tsv'}
        biom_fasta: {'biom_file_biom_fasta', 'fasta_file_biom_fasta'}
        tsv_fasta: {'tsv_file_tsv_fasta', 'fasta_file_tsv_fasta',
                    'metadata_keys_tsv_fasta' (optional, comma separated)}
        tsv: {'tsv_file_tsv', 'metadata_keys_tsv' (optional, comma separated)}

        optional params:
        description: free text stored on both objects
        any key containing "_ref" (e.g. col_attributemapping_ref,
        row_attributemapping_ref, reads_set_ref) is copied into the matrix data

        returns a dict with 'matrix_obj_ref', 'amplicon_set_obj_ref',
        'report_name' and 'report_ref'.
        """
        (biom_file, tsv_file, fasta_file, mode, metadata_keys) = self._process_params(params)

        workspace_name = params.get('workspace_name')
        matrix_name = params.get('matrix_name')
        amplicon_set_name = params.get('amplicon_set_name')
        obj_type = params.get('obj_type')
        scale = params.get('scale')
        description = params.get('description')
        refs = {k: v for k, v in params.items() if "_ref" in k}

        # An int is taken to be a workspace id already; anything else is resolved.
        if not isinstance(workspace_name, int):
            workspace_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            workspace_id = workspace_name

        amplicon_data = self._file_to_amplicon_data(biom_file, tsv_file, mode, refs, matrix_name,
                                                    workspace_id, scale, description,
                                                    metadata_keys)

        # Only attribute mappings created by this import are listed in the report.
        new_row_attr_ref = None
        if not params.get('row_attributemapping_ref'):
            new_row_attr_ref = amplicon_data.get('row_attributemapping_ref')

        new_col_attr_ref = None
        if not params.get('col_attributemapping_ref'):
            new_col_attr_ref = amplicon_data.get('col_attributemapping_ref')

        logging.info('start saving Matrix object: {}'.format(matrix_name))
        matrix_obj_ref = self.data_util.save_object({
            'obj_type': 'KBaseMatrices.{}'.format(obj_type),
            'obj_name': matrix_name,
            'data': amplicon_data,
            'workspace_name': workspace_id})['obj_ref']

        amplicon_set_data = self._file_to_amplicon_set_data(biom_file, tsv_file, fasta_file, mode,
                                                            refs, description, matrix_obj_ref)

        logging.info('start saving AmpliconSet object: {}'.format(amplicon_set_name))
        amplicon_set_obj_ref = self.data_util.save_object({
            'obj_type': 'KBaseExperiments.AmpliconSet',
            'obj_name': amplicon_set_name,
            'data': amplicon_set_data,
            'workspace_name': workspace_id})['obj_ref']

        # The matrix and the amplicon set reference each other, so the matrix is
        # saved a second time once the amplicon set exists.
        logging.info('start resaving Matrix object with amplicon set: {}'.format(matrix_name))
        amplicon_data['amplicon_set_ref'] = '{}/{}'.format(workspace_id, amplicon_set_name)
        matrix_obj_ref = self.data_util.save_object({
            'obj_type': 'KBaseMatrices.{}'.format(obj_type),
            'obj_name': matrix_name,
            'data': amplicon_data,
            'workspace_name': workspace_id})['obj_ref']

        returnVal = {'matrix_obj_ref': matrix_obj_ref,
                     'amplicon_set_obj_ref': amplicon_set_obj_ref}

        report_output = self._generate_report(matrix_obj_ref, amplicon_set_obj_ref,
                                              new_row_attr_ref, new_col_attr_ref, workspace_name)

        returnVal.update(report_output)

        return returnVal

    def _process_params(self, params):
        """Validate ``params`` and download the staged input files.

        Returns a tuple (biom_file, tsv_file, fasta_file, mode, metadata_keys)
        where unused file slots are None, ``mode`` is one of 'biom_tsv',
        'biom_fasta', 'tsv_fasta', 'tsv', and ``metadata_keys`` is a
        de-duplicated list (order not guaranteed).

        Raises ValueError on a missing required parameter, unknown obj_type or
        scale, an incomplete file group, or when no file group is given.
        """
        logging.info('start validating import_matrix_from_biom params')

        # check for required parameters
        for p in ['obj_type', 'matrix_name', 'workspace_name', 'scale', 'amplicon_set_name']:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

        obj_type = params.get('obj_type')
        if obj_type not in self.matrix_types:
            raise ValueError('Unknown matrix object type: {}'.format(obj_type))

        scale = params.get('scale')
        if scale not in SCALE_TYPES:
            raise ValueError('Unknown scale type: {}'.format(scale))

        biom_file = None
        tsv_file = None
        fasta_file = None
        # Copy: the keys are extended below and must not leak into the shared
        # module-level default list.
        metadata_keys = list(DEFAULT_META_KEYS)

        if params.get('biom_tsv'):
            biom_tsv = params.get('biom_tsv')
            biom_file = biom_tsv.get('biom_file_biom_tsv')
            tsv_file = biom_tsv.get('tsv_file_biom_tsv')

            if not (biom_file and tsv_file):
                raise ValueError('missing BIOM or TSV file')

            biom_file = self.dfu.download_staging_file(
                {'staging_file_subdir_path': biom_file}).get('copy_file_path')

            tsv_file = self.dfu.download_staging_file(
                {'staging_file_subdir_path': tsv_file}).get('copy_file_path')
            mode = 'biom_tsv'
        elif params.get('biom_fasta'):
            biom_fasta = params.get('biom_fasta')
            biom_file = biom_fasta.get('biom_file_biom_fasta')
            fasta_file = biom_fasta.get('fasta_file_biom_fasta')

            if not (biom_file and fasta_file):
                raise ValueError('missing BIOM or FASTA file')

            biom_file = self.dfu.download_staging_file(
                {'staging_file_subdir_path': biom_file}).get('copy_file_path')

            fasta_file = self.dfu.download_staging_file(
                {'staging_file_subdir_path': fasta_file}).get('copy_file_path')
            mode = 'biom_fasta'
        elif params.get('tsv_fasta'):
            tsv_fasta = params.get('tsv_fasta')
            tsv_file = tsv_fasta.get('tsv_file_tsv_fasta')
            fasta_file = tsv_fasta.get('fasta_file_tsv_fasta')

            if not (tsv_file and fasta_file):
                raise ValueError('missing TSV or FASTA file')

            tsv_file = self.dfu.download_staging_file(
                {'staging_file_subdir_path': tsv_file}).get('copy_file_path')

            fasta_file = self.dfu.download_staging_file(
                {'staging_file_subdir_path': fasta_file}).get('copy_file_path')

            metadata_keys_str = tsv_fasta.get('metadata_keys_tsv_fasta')
            if metadata_keys_str:
                metadata_keys += [x.strip() for x in metadata_keys_str.split(',')]
            mode = 'tsv_fasta'
        elif params.get('tsv'):
            tsv = params.get('tsv')
            tsv_file = tsv.get('tsv_file_tsv')

            if not tsv_file:
                raise ValueError('missing TSV file')

            tsv_file = self.dfu.download_staging_file(
                {'staging_file_subdir_path': tsv_file}).get('copy_file_path')

            metadata_keys_str = tsv.get('metadata_keys_tsv')
            if metadata_keys_str:
                metadata_keys += [x.strip() for x in metadata_keys_str.split(',')]

            mode = 'tsv'
        else:
            raise ValueError('missing valide file group type in parameters')

        return (biom_file, tsv_file, fasta_file, mode, list(set(metadata_keys)))

    def _retrieve_value(self, biom_metadata_dict, tsv_metadata_df, key, required=False):
        """Look ``key`` up in the BIOM metadata first, then in the TSV metadata.

        The membership test uses ``key`` as given; the value is then fetched
        from a copy of the container whose keys were lower-cased. Returns None
        when the key is in neither container, or raises ValueError instead if
        ``required`` is true.
        """
        if key in biom_metadata_dict:
            return {k.lower(): v for k, v in biom_metadata_dict.items()}.get(key)
        elif key in tsv_metadata_df:
            return {k.lower(): v for k, v in tsv_metadata_df.items()}.get(key)
        elif required:
            raise ValueError('missing necessary [{}] from file'.format(key))
        else:
            return None

    def _search_taxon(self, scientific_name):
        """
        logic borrowed from: GFU.GenomeInterface
        https://github.com/kbaseapps/GenomeFileUtil/blob/master/lib/GenomeFileUtil/core/GenomeInterface.py#L216

        Searches public taxon objects by scientific name, then by alias, and
        returns the object name of the first hit (e.g. '561_taxon') or None.
        """
        taxon_id = None

        search_params = {
            "object_types": ["taxon"],
            "match_filter": {
                "lookup_in_keys": {
                    "scientific_name": {"value": scientific_name}},
                "exclude_subobjects": 1
            },
            "access_filter": {
                "with_private": 0,
                "with_public": 1
            },
            "sorting_rules": [{
                "is_object_property": 0,
                "property": "timestamp",
                "ascending": 0
            }]
        }

        objects = self.kbse.search_objects(search_params)['objects']

        if not objects:
            # Nothing under scientific_name: retry the same search against aliases.
            search_params['match_filter']['lookup_in_keys'] = {
                "aliases": {"value": scientific_name}
            }
            objects = self.kbse.search_objects(search_params)['objects']
        if objects:
            taxon_id = objects[0].get('object_name')

        return taxon_id

    def _fetch_taxon_level(self, taxon_char):
        """Map a rank prefix such as 'k' or 'g__...' to its rank name.

        Only the first character is used (case-insensitive). Unknown or empty
        prefixes give 'Unknown'.
        """
        taxon_level_mapping = {'l': 'Life', 'd': 'Domain', 'k': 'Kingdom', 'p': 'Phylum',
                               'c': 'Class', 'o': 'Order', 'f': 'Family', 'g': 'Genus',
                               's': 'Species'}

        # Slicing instead of indexing so that an empty prefix cannot raise IndexError.
        return taxon_level_mapping.get(taxon_char[:1].lower(), 'Unknown')

    def _fetch_taxonomy(self, datarow):
        """Build the taxonomy dict for one observation.

        ``datarow`` is a mapping-like row of metadata, e.g.
        {'taxonomy': ['k__Bacteria', ..., 'g__Escherichia', 's__']}.
        A string lineage is split on a sniffed delimiter. The result always has
        'lineage'; 'score', 'taxonomy_source' and 'species_name' are copied when
        truthy; 'taxon_ref', 'taxon_id', 'scientific_name' and 'taxon_level'
        are added for the most specific lineage entry found by _search_taxon.
        """
        lineage = self._retrieve_value([], datarow, 'taxonomy')
        if isinstance(lineage, str):
            try:
                delimiter = csv.Sniffer().sniff(lineage).delimiter
            except csv.Error:
                # No delimiter could be determined (e.g. a single entry):
                # treat the whole string as one lineage item.
                stripped = lineage.strip()
                lineage = [stripped] if stripped else []
            else:
                lineage = [x.strip() for x in lineage.split(delimiter)]
        elif lineage is None:
            # Row carries no taxonomy at all; nothing to search for.
            lineage = []

        taxonomy = {'lineage': lineage}

        for key in ['score', 'taxonomy_source', 'species_name']:
            val = self._retrieve_value([], datarow, key)
            if val:
                taxonomy[key] = val

        # Walk from the most specific rank upwards and stop at the first hit.
        for item in lineage[::-1]:
            scientific_name = item.split('_')[-1]
            taxon_level_char = item.split('_')[0]
            if scientific_name:
                taxon_id = self._search_taxon(scientific_name)
                if taxon_id:
                    taxon_ref = f"{self.taxon_wsname}/{taxon_id}"
                    taxon_level = self._fetch_taxon_level(taxon_level_char)

                    taxonomy.update({'taxon_ref': taxon_ref,
                                     'taxon_id': taxon_id,
                                     'scientific_name': scientific_name,
                                     'taxon_level': taxon_level})
                    break

        return taxonomy

    def _retrieve_tsv_amplicon_set_data(self, tsv_file):
        """Build the amplicons dict from a TSV file alone.

        The TSV must have a 'consensus_sequence' column. Returns
        {observation_id: {'consensus_sequence': ..., 'taxonomy': {...}}}.
        """
        amplicons = dict()

        logging.info('start parsing TSV file')
        df = self._read_tsv(tsv_file, 'Cannot parse file. Please provide valide TSV file')

        if 'consensus_sequence' not in df.columns.tolist():
            raise ValueError('TSV file does not include consensus_sequence')

        logging.info('start processing each row in TSV')
        for observation_id in df.index:
            taxonomy = self._fetch_taxonomy(df.loc[observation_id])

            amplicon = {'consensus_sequence': df.loc[observation_id, 'consensus_sequence'],
                        'taxonomy': taxonomy}

            amplicons.update({observation_id: amplicon})

        logging.info('finished parsing TSV file')

        return amplicons

    def _retrieve_tsv_fasta_amplicon_set_data(self, tsv_file, fasta_file):
        """Build the amplicons dict from a TSV (taxonomy) and a FASTA (sequences).

        Every TSV row id must be present in the FASTA file, otherwise
        ValueError is raised. Returns the same shape as
        _retrieve_tsv_amplicon_set_data.
        """
        amplicons = dict()
        try:
            logging.info('start parsing FASTA file')
            fastq_dict = SeqIO.index(fasta_file, "fasta")
        except Exception:
            raise ValueError('Cannot parse file. Please provide valide FASTA file')

        logging.info('start parsing TSV file')
        df = self._read_tsv(tsv_file, 'Cannot parse file. Please provide valide TSV file')

        logging.info('start processing files')
        for observation_id in df.index:
            if observation_id not in fastq_dict:
                raise ValueError('FASTA file does not have [{}] OTU id'.format(observation_id))

            taxonomy = self._fetch_taxonomy(df.loc[observation_id])

            amplicon = {'consensus_sequence': str(fastq_dict.get(observation_id).seq),
                        'taxonomy': taxonomy}
            amplicons.update({observation_id: amplicon})

        logging.info('finished processing files')
        return amplicons

    def _retrieve_biom_fasta_amplicon_set_data(self, biom_file, fasta_file):
        """Build the amplicons dict from a BIOM (taxonomy) and a FASTA (sequences).

        Taxonomy comes from the BIOM observation metadata at the same position
        as the observation id. Every BIOM observation id must be present in the
        FASTA file, otherwise ValueError is raised.
        """
        amplicons = dict()
        try:
            logging.info('start parsing FASTA file')
            fastq_dict = SeqIO.index(fasta_file, "fasta")
        except Exception:
            raise ValueError('Cannot parse file. Please provide valide FASTA file')

        logging.info('start parsing BIOM file')
        table = biom.load_table(biom_file)

        observation_ids = table._observation_ids.tolist()
        observation_metadata = table._observation_metadata

        logging.info('start processing files')
        for index, observation_id in enumerate(observation_ids):
            if observation_id not in fastq_dict:
                raise ValueError('FASTA file does not have [{}] OTU id'.format(observation_id))

            taxonomy = self._fetch_taxonomy(observation_metadata[index])

            amplicon = {'consensus_sequence': str(fastq_dict.get(observation_id).seq),
                        'taxonomy': taxonomy}

            amplicons.update({observation_id: amplicon})

        logging.info('finished processing files')
        return amplicons

    def _retrieve_biom_tsv_amplicon_set_data(self, biom_file, tsv_file):
        """Build the amplicons dict for the BIOM observation ids using TSV rows.

        Both the sequence ('consensus_sequence' column) and the taxonomy are
        read from the TSV; the BIOM file only supplies the observation ids.
        Every BIOM observation id must be a TSV row id, otherwise ValueError.
        """
        amplicons = dict()

        logging.info('start parsing TSV file')
        df = self._read_tsv(tsv_file, 'Cannot parse file. Please provide valide tsv file')

        if 'consensus_sequence' not in df.columns.tolist():
            raise ValueError('TSV file does not include consensus_sequence')

        logging.info('start parsing BIOM file')
        table = biom.load_table(biom_file)

        observation_ids = table._observation_ids.tolist()

        logging.info('start processing files')
        for observation_id in observation_ids:
            if observation_id not in df.index:
                raise ValueError('TSV file does not have [{}] OTU id'.format(observation_id))

            taxonomy = self._fetch_taxonomy(df.loc[observation_id])

            amplicon = {'consensus_sequence': df.loc[observation_id, 'consensus_sequence'],
                        'taxonomy': taxonomy}

            amplicons.update({observation_id: amplicon})

        logging.info('finished processing files')
        return amplicons

    def _file_to_amplicon_set_data(self, biom_file, tsv_file, fasta_file, mode, refs, description,
                                   matrix_obj_ref):
        """Assemble the AmpliconSet object data for the given input ``mode``.

        Returns a dict with 'amplicons' and 'amplicon_matrix_ref' (the first two
        '/'-separated parts of ``matrix_obj_ref``), plus 'reads_set_ref' and
        'description' when provided. Raises ValueError for an unknown mode.
        """
        logging.info('start parsing amplicon_set_data')

        amplicon_set_data = dict()

        if mode == 'biom_tsv':
            amplicons = self._retrieve_biom_tsv_amplicon_set_data(biom_file, tsv_file)
        elif mode == 'biom_fasta':
            amplicons = self._retrieve_biom_fasta_amplicon_set_data(biom_file, fasta_file)
        elif mode == 'tsv_fasta':
            amplicons = self._retrieve_tsv_fasta_amplicon_set_data(tsv_file, fasta_file)
        elif mode == 'tsv':
            amplicons = self._retrieve_tsv_amplicon_set_data(tsv_file)
        else:
            raise ValueError('error parsing _file_to_amplicon_set_data, mode: {}'.format(mode))

        amplicon_set_data.update({'amplicons': amplicons})

        if 'reads_set_ref' in refs:
            amplicon_set_data['reads_set_ref'] = refs.get('reads_set_ref')

        if description:
            amplicon_set_data['description'] = description

        # Drop the version part of the reference ("ws/obj/ver" -> "ws/obj").
        matrix_obj_ref_array = matrix_obj_ref.split('/')
        amplicon_set_data['amplicon_matrix_ref'] = '{}/{}'.format(matrix_obj_ref_array[0],
                                                                  matrix_obj_ref_array[1])

        return amplicon_set_data

    def _file_to_amplicon_data(self, biom_file, tsv_file, mode, refs, matrix_name, workspace_id,
                               scale, description, metadata_keys=None):
        """Assemble the Matrix object data from a BIOM file or a TSV file.

        Modes starting with 'biom' read the matrix from ``biom_file``; modes
        starting with 'tsv' read it from ``tsv_file``, moving any columns named
        in ``metadata_keys`` out of the numeric matrix. The result starts from a
        copy of ``refs`` and adds the row/col mapping entries, 'attributes',
        'data', 'search_attributes', 'scale' and (if given) 'description'.

        Raises ValueError for an unknown mode, an unparsable TSV, a missing
        'consensus_sequence' column in 'tsv' mode, or non-numeric matrix values.
        """
        # Copy so the caller's refs dict is not modified as a side effect.
        amplicon_data = dict(refs)

        if mode.startswith('biom'):
            logging.info('start parsing BIOM file for matrix data')
            table = biom.load_table(biom_file)
            observation_metadata = table._observation_metadata
            sample_metadata = table._sample_metadata

            matrix_data = {'row_ids': table._observation_ids.tolist(),
                           'col_ids': table._sample_ids.tolist(),
                           'values': table.matrix_data.toarray().tolist()}

            logging.info('start building attribute mapping object')
            amplicon_data.update(self.get_attribute_mapping("row", observation_metadata,
                                                            matrix_data, matrix_name, refs,
                                                            workspace_id))
            amplicon_data.update(self.get_attribute_mapping("col", sample_metadata,
                                                            matrix_data, matrix_name, refs,
                                                            workspace_id))

            amplicon_data['attributes'] = {}
            for k in ('create_date', 'generated_by'):
                val = getattr(table, k)
                if not val:
                    continue
                if isinstance(val, bytes):
                    amplicon_data['attributes'][k] = val.decode('utf-8')
                else:
                    amplicon_data['attributes'][k] = str(val)
        elif mode.startswith('tsv'):
            observation_metadata = None
            sample_metadata = None

            logging.info('start parsing TSV file for matrix data')
            df = self._read_tsv(tsv_file, 'Cannot parse file. Please provide valide tsv file')

            metadata_df = None
            if metadata_keys:
                shared_metadata_keys = list(set(metadata_keys) & set(df.columns))
                if mode == 'tsv' and 'consensus_sequence' not in shared_metadata_keys:
                    raise ValueError('TSV file does not include consensus_sequence')
                if shared_metadata_keys:
                    # Metadata columns become row attributes, not matrix values.
                    metadata_df = df[shared_metadata_keys]
                    df.drop(columns=shared_metadata_keys, inplace=True)
            try:
                df = df.astype(float)
            except ValueError:
                err_msg = 'Found some non-float values. Matrix contains only numeric values\n'
                err_msg += 'Please list any non-numeric column names in  Metadata Keys field'
                raise ValueError(err_msg)
            df.fillna(0, inplace=True)
            matrix_data = {'row_ids': df.index.tolist(),
                           'col_ids': df.columns.tolist(),
                           'values': df.values.tolist()}

            logging.info('start building attribute mapping object')
            amplicon_data.update(self.get_attribute_mapping("row", observation_metadata,
                                                            matrix_data, matrix_name, refs,
                                                            workspace_id, metadata_df))
            amplicon_data.update(self.get_attribute_mapping("col", sample_metadata,
                                                            matrix_data, matrix_name, refs,
                                                            workspace_id))

            amplicon_data['attributes'] = {}
        else:
            raise ValueError('error parsing _file_to_amplicon_data, mode: {}'.format(mode))

        amplicon_data.update({'data': matrix_data})

        amplicon_data['search_attributes'] = [f'{k}|{v}' for k, v in
                                              amplicon_data['attributes'].items()]

        amplicon_data['scale'] = scale
        if description:
            amplicon_data['description'] = description

        return amplicon_data

    def get_attribute_mapping(self, axis, metadata, matrix_data, matrix_name, refs,
                              workspace_id, metadata_df=None):
        """
        getting mapping data based on refs or metadata or metadata_df

        ``axis`` is 'row' or 'col'. Sources are tried in this order:
        1. refs['<axis>_attributemapping_ref']: the existing mapping must cover
           every axis id, otherwise ValueError; only '<axis>_mapping' is returned.
        2. ``metadata`` (BIOM metadata): a new AttributeMapping is saved.
        3. ``metadata_df`` (TSV metadata columns): a new AttributeMapping is saved.
        For 2 and 3 both '<axis>_attributemapping_ref' and '<axis>_mapping' are
        returned. With no source at all an empty dict is returned.
        """
        mapping_data = {}
        axis_ids = matrix_data[f'{axis}_ids']
        if refs.get(f'{axis}_attributemapping_ref'):
            am_data = self.dfu.get_objects(
                {'object_refs': [refs[f'{axis}_attributemapping_ref']]}
            )['data'][0]['data']
            unmatched_ids = set(axis_ids) - set(am_data['instances'].keys())
            if unmatched_ids:
                name = "Column" if axis == 'col' else "Row"
                raise ValueError(f"The following {name} IDs from the uploaded matrix do not match "
                                 f"the supplied {name} attribute mapping: {', '.join(unmatched_ids)}"
                                 f"\nPlease verify the input data or upload an excel file with a "
                                 f"{name} mapping tab.")
            else:
                mapping_data[f'{axis}_mapping'] = {x: x for x in axis_ids}
        elif metadata:
            name = matrix_name + "_{}_attributes".format(axis)
            mapping_data[f'{axis}_attributemapping_ref'] = self._metadata_to_attribute_mapping(
                axis_ids, metadata, name, workspace_id)
            # if coming from biom file, metadata and axis IDs are guaranteed to match
            mapping_data[f'{axis}_mapping'] = {x: x for x in axis_ids}
        elif metadata_df is not None:
            name = matrix_name + "_{}_attributes".format(axis)
            mapping_data[f'{axis}_attributemapping_ref'] = self._meta_df_to_attribute_mapping(
                axis_ids, metadata_df, name, workspace_id)
            mapping_data[f'{axis}_mapping'] = {x: x for x in axis_ids}

        return mapping_data

    def _meta_df_to_attribute_mapping(self, axis_ids, metadata_df, obj_name, ws_id):
        """Save an AttributeMapping built from TSV metadata columns.

        Each column of ``metadata_df`` becomes an attribute and each id in
        ``axis_ids`` an instance. Returns the new object's reference string
        built from the saved object info.
        """
        data = {'ontology_mapping_method': "TSV file", 'instances': {}}
        attribute_keys = metadata_df.columns.tolist()
        data['attributes'] = [{'attribute': key, 'source': 'upload'} for key in attribute_keys]

        for axis_id in axis_ids:
            data['instances'][axis_id] = metadata_df.loc[axis_id].tolist()

        logging.info('start saving AttributeMapping object: {}'.format(obj_name))
        info = self.dfu.save_objects({
            "id": ws_id,
            "objects": [{
                "type": "KBaseExperiments.AttributeMapping",
                "data": data,
                "name": obj_name
            }]
        })[0]

        return f'{info[6]}/{info[0]}/{info[4]}'

    def _metadata_to_attribute_mapping(self, instances, metadata, obj_name, ws_id):
        """Save an AttributeMapping built from BIOM metadata.

        The attribute names are the sorted union of the keys found in the first
        25 metadata entries; each instance stores ``str(meta[attr])`` for every
        attribute. Returns the new object's reference string.
        """
        data = {'ontology_mapping_method': "BIOM file", 'instances': {}}
        # Only a sample of the entries is scanned to discover the attribute names.
        sample_set = metadata[:25]
        metadata_keys = sorted({k for m_dict in sample_set for k in m_dict})
        data['attributes'] = [{'attribute': key, 'source': 'upload'} for key in metadata_keys]
        for inst, meta in zip(instances, metadata):
            data['instances'][inst] = [str(meta[attr]) for attr in metadata_keys]

        logging.info('start saving AttributeMapping object: {}'.format(obj_name))
        info = self.dfu.save_objects({
            "id": ws_id,
            "objects": [{
                "type": "KBaseExperiments.AttributeMapping",
                "data": data,
                "name": obj_name
            }]
        })[0]

        return f'{info[6]}/{info[0]}/{info[4]}'

    def _generate_report(self, matrix_obj_ref, amplicon_set_obj_ref, new_row_attr_ref,
                         new_col_attr_ref, workspace_name):
        """
        _generate_report: generate summary report

        Lists the matrix and amplicon set, plus the row/column attribute
        mappings when their refs are given. Returns
        {'report_name': ..., 'report_ref': ...}.
        """

        objects_created = [{'ref': matrix_obj_ref, 'description': 'Imported Amplicon Matrix'},
                           {'ref': amplicon_set_obj_ref, 'description': 'Imported Amplicon Set'}]

        if new_row_attr_ref:
            objects_created.append({'ref': new_row_attr_ref,
                                    'description': 'Imported Amplicons(Row) Attribute Mapping'})

        if new_col_attr_ref:
            objects_created.append({'ref': new_col_attr_ref,
                                    'description': 'Imported Samples(Column) Attribute Mapping'})

        report_params = {'message': '',
                         'objects_created': objects_created,
                         'workspace_name': workspace_name,
                         'report_object_name': 'import_matrix_from_biom_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output

    def _df_to_tsv(self, amplicon_set_df, result_dir, amplicon_set_ref):
        """Write ``amplicon_set_df`` to '<result_dir>/<amplicon set name>.tsv'.

        The file name comes from the object info of ``amplicon_set_ref``.
        Returns the path of the written file.
        """
        logging.info('writting amplicon set data frame to tsv file')
        amplicon_set_obj = self.dfu.get_objects({'object_refs': [amplicon_set_ref]})['data'][0]
        amplicon_set_info = amplicon_set_obj['info']
        amplicon_set_name = amplicon_set_info[1]

        file_path = os.path.join(result_dir, amplicon_set_name + ".tsv")

        amplicon_set_df.to_csv(file_path, sep='\t', index=True, header=True)

        return file_path

    def _amplicon_set_to_df(self, amplicon_set_ref):
        """Return the amplicon matrix joined with per-amplicon metadata.

        Rows are the matrix row ids; the matrix columns are followed by
        taxonomy (lineage), taxon_id, taxon_ref, taxon_level, score,
        taxonomy_source, species_name and consensus_sequence.
        """
        logging.info('converting amplicon set to data frame')
        am_set_data = self.dfu.get_objects({'object_refs': [amplicon_set_ref]})['data'][0]['data']

        amplicon_matrix_ref = am_set_data.get('amplicon_matrix_ref')
        matrix_data = self.dfu.get_objects({'object_refs': [amplicon_matrix_ref]})['data'][0]['data']
        matrix_value_data = matrix_data.get('data')

        index = matrix_value_data.get('row_ids')
        columns = matrix_value_data.get('col_ids')
        values = matrix_value_data.get('values')

        df = pd.DataFrame(values, index=index, columns=columns)

        amplicons = am_set_data.get('amplicons')

        meta_index = list()

        meta_columns = ['taxonomy', 'taxon_id', 'taxon_ref', 'taxon_level', 'score',
                        'taxonomy_source', 'species_name', 'consensus_sequence']
        meta_values = list()
        for otu_id, amplicon in amplicons.items():
            meta_index.append(otu_id)

            taxonomy_data = amplicon.get('taxonomy')

            taxonomy = taxonomy_data.get('lineage')
            taxon_id = taxonomy_data.get('taxon_id')
            taxon_ref = taxonomy_data.get('taxon_ref')
            taxon_level = taxonomy_data.get('taxon_level')
            score = taxonomy_data.get('score')
            taxonomy_source = taxonomy_data.get('taxonomy_source')
            species_name = taxonomy_data.get('species_name')

            consensus_sequence = amplicon.get('consensus_sequence')

            meta_values.append([taxonomy, taxon_id, taxon_ref, taxon_level, score, taxonomy_source,
                                species_name, consensus_sequence])

        meta_df = pd.DataFrame(meta_values, index=meta_index, columns=meta_columns)

        # Left join keeps every matrix row even if it has no amplicon entry.
        merged_df = df.merge(meta_df, left_index=True, right_index=True, how='left',
                             validate='one_to_one')

        return merged_df

    def export_amplicon_set_tsv(self, params):
        """
        export AmpliconSet as TSV

        params: {'input_ref': <AmpliconSet reference>}
        returns: {'shock_id': <id of the packaged download>}
        """
        logging.info('start exporting amplicon set object')
        amplicon_set_ref = params.get('input_ref')

        amplicon_set_df = self._amplicon_set_to_df(amplicon_set_ref)

        result_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_dir)

        self._df_to_tsv(amplicon_set_df, result_dir, amplicon_set_ref)

        package_details = self.dfu.package_for_download({
            'file_path': result_dir,
            'ws_refs': [amplicon_set_ref]
        })

        return {'shock_id': package_details['shock_id']}
class expsfileuploadUtil:
    """Import an experiments ("exps") TSV file from staging into a workspace.

    The file is checked for its required columns, copied into the shared
    folder, uploaded to Shock through DataFileUtil and finally saved as a
    ``KBasePoolTSV.Experiments`` object.
    """

    # Keys that must be supplied in ``params`` by the user.
    _REQUIRED_USER_PARAMS = (
        "username",
        "staging_file_name",
        "genome_ref",
        "description",
        "output_name",
    )
    # Keys that ``upload_expsfile`` reads from ``params`` only after the
    # file has already been uploaded; they are checked up front instead.
    _REQUIRED_RUNTIME_PARAMS = ("workspace_id", "ws_obj")

    def __init__(self, params):
        """Store ``params`` and set up the DataFileUtil client and folders.

        Args:
            params: dict of import parameters; ``shared_folder`` is read
                here, the remaining keys are read by ``upload_expsfile``.

        Raises:
            KeyError: if ``SDK_CALLBACK_URL`` is not in the environment or
                ``shared_folder`` is not in ``params``.
        """
        self.params = params
        self.callback_url = os.environ["SDK_CALLBACK_URL"]
        self.dfu = DataFileUtil(self.callback_url)
        self.data_folder = os.path.abspath("/kb/module/data/")
        # This is where files from staging area exist
        self.staging_folder = os.path.abspath("/staging/")
        self.shared_folder = params["shared_folder"]
        self.scratch_folder = os.path.join(params["shared_folder"], "scratch")

    def upload_expsfile(self):
        """Validate, upload and save the experiments file.

        Steps:
            1. Validate ``self.params``.
            2. Locate the file under the staging folder and check that it
               is a well-formed exps table.
            3. Copy it into the shared folder under ``output_name``.
            4. Upload the copy to Shock (gzip-packed, with a handle).
            5. Save a ``KBasePoolTSV.Experiments`` object that records the
               handle, the column headers and the related genome.

        ``params`` should include: username, staging_file_name, genome_ref,
        description, output_name, workspace_id and ws_obj.

        Returns:
            dict with keys ``Name``, ``Type`` and ``Date`` taken from the
            saved object's info tuple.

        Raises:
            ValueError: if a required parameter is missing.
            Exception: if the staging folder does not exist or the file is
                not a well-formed exps table.
        """
        print("params: ", self.params)
        self.validate_import_expsfile_from_staging_params()

        # Fail before anything is uploaded: both keys are only read after
        # the file_to_shock call below, so a missing one would otherwise
        # surface as a late KeyError and leave an orphaned upload behind.
        for key in self._REQUIRED_RUNTIME_PARAMS:
            if key not in self.params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(key))

        # Name of file in staging: (file name or absolute path?)
        staging_exps_fp_name = self.params["staging_file_name"]
        # Output name of exps file:
        expsfile_name = self.params["output_name"]

        print("expsfile_name: ", expsfile_name)
        print("top dir /:", os.listdir("/"))
        print("/kb/module/:", os.listdir("/kb/module"))
        if not os.path.exists(self.staging_folder):
            raise Exception("Staging dir does not exist yet!")
        print("Succesfully recognized staging directory")

        # This is the path to the exps file
        expsfile_fp = os.path.join(self.staging_folder, staging_exps_fp_name)

        # Check correctness of the exps file; the set names are not needed
        # for the saved object.
        column_header_list, num_rows, _set_names = self.check_exps_file(
            expsfile_fp)

        # Copy the file from staging into the shared folder
        new_exps_fp = os.path.join(self.shared_folder, expsfile_name)
        shutil.copyfile(expsfile_fp, new_exps_fp)
        expsfile_fp = new_exps_fp

        # Create the handle for the object
        file_to_shock_result = self.dfu.file_to_shock(
            {"file_path": expsfile_fp, "make_handle": True, "pack": "gzip"}
        )
        res_handle = file_to_shock_result["handle"]

        # Creation timestamp recorded on the object
        date_time = datetime.datetime.utcnow()

        # The data for the object
        exps_data = {
            "file_type": "KBasePoolTSV.Experiments",
            "expsfile": res_handle["hid"],
            # below should be shock
            "handle_type": res_handle["type"],
            "shock_url": res_handle["url"],
            "shock_node_id": res_handle["id"],
            "compression_type": "gzip",
            "file_name": res_handle["file_name"],
            "utc_created": str(date_time),
            "column_header_list": column_header_list,
            "num_lines": str(num_rows),
            "related_genome_ref": self.params["genome_ref"],
            "related_organism_scientific_name": self.get_genome_organism_name(
                self.params["genome_ref"]
            ),
            "description": self.params["description"],
        }

        save_object_params = {
            "id": self.params["workspace_id"],
            "objects": [
                {
                    "type": "KBasePoolTSV.Experiments",
                    "data": exps_data,
                    "name": expsfile_name,
                }
            ],
        }
        # save_objects returns a list of object_infos
        dfu_object_info = self.dfu.save_objects(save_object_params)[0]
        print("dfu_object_info: ")
        print(dfu_object_info)
        return {
            "Name": dfu_object_info[1],
            "Type": dfu_object_info[2],
            "Date": dfu_object_info[3],
        }

    def validate_import_expsfile_from_staging_params(self):
        """Check that every user-supplied parameter is present.

        Raises:
            ValueError: naming the first missing parameter.
        """
        for p in self._REQUIRED_USER_PARAMS:
            if p not in self.params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def check_exps_file(self, expsfile_fp):
        """Check that ``expsfile_fp`` has the columns an exps file needs.

        Returns:
            ``[cols, num_rows, setNames]`` exactly as returned by
            ``read_table``.
        """
        required = [
            "SetName",
            "Index",
            "Description",
            "Date_pool_expt_started",
        ]
        cols, num_rows, setNames = self.read_table(expsfile_fp, required)
        return [cols, num_rows, setNames]

    def read_table(self, fp, required):
        """Read a tab-separated file and validate its shape.

        Args:
            fp: path of the TSV file.
            required: column names that must appear in the header line.

        Returns:
            ``[cols, n, setNames]`` where ``cols`` is the list of header
            names, ``n`` is the number of pieces obtained by splitting the
            file text on newlines, and ``setNames`` holds the value of the
            ``SetName`` column for every non-empty data line (column 0 when
            the header has no ``SetName`` column).

        Raises:
            Exception: if a required column is missing or a data line has
                a different number of fields than the header.
        """
        with open(fp, "r") as f:
            file_list = f.read().split("\n")
        header_line = file_list[0]

        # Check for Mac Style Files.
        # NOTE: the file is opened in text mode with default newline
        # handling, which already translates "\r" and "\r\n" to "\n", so
        # this guard is not expected to trigger; it is kept unchanged.
        if re.search(r"\t", header_line) and re.search(r"\r", header_line):
            raise Exception(
                (
                    "Tab-delimited input file {} is a Mac-style text file "
                    "which is not supported.\n"
                    "Use\ndos2unix -c mac {}\n to convert it to a Unix "
                    "text file.\n"
                ).format(fp, fp)
            )

        cols = header_line.split("\t")
        cols_dict = {name: idx for idx, name in enumerate(cols)}
        for field in required:
            if field not in cols_dict:
                raise Exception(
                    "No field {} in {}. Must include fields".format(field, fp)
                    + "\n{}".format(" ".join(required))
                )

        # Required columns are matched by name and may appear in any
        # order, so look the SetName column up by name as well instead of
        # assuming it is the first one.
        set_name_idx = cols_dict.get("SetName", 0)

        # This is unique to Experiments
        setNames = []
        for line_no in range(1, len(file_list)):
            line = file_list[line_no]
            # skip empty lines (e.g. the one after a trailing newline)
            if len(line) == 0:
                continue
            line = re.sub(r"[\r\n]+$", "", line)
            split_line = line.split("\t")
            if len(split_line) != len(cols):
                raise Exception(
                    "Wrong number of columns in:\n{}\nin {} l:{}".format(
                        line, fp, line_no)
                )
            setNames.append(split_line[set_name_idx])

        # NOTE(review): len(file_list) also counts the header and the empty
        # string after a trailing newline; it is stored as "num_lines" by
        # upload_expsfile, so it is left unchanged -- confirm intended value.
        return [cols, len(file_list), setNames]

    def get_genome_organism_name(self, genome_ref):
        """Return the ``scientific_name`` of the genome at ``genome_ref``.

        The lookup uses the workspace client passed in as
        ``params["ws_obj"]`` and requests only the ``scientific_name``
        field of the object.
        """
        ws = self.params["ws_obj"]
        res = ws.get_objects2(
            {"objects": [{"ref": genome_ref, "included": ["scientific_name"]}]}
        )
        scientific_name = res["data"][0]["data"]["scientific_name"]
        return scientific_name
class PDBUtil: # “Expect Value” threshold to restrict which alignments will be significant E_VALUE_THRESH = 1e-20 # BLAST sequence identity threshold to determine which pdb structures will be # matched to a KBase genome/feature B_IDENTITY_THRESH = 0.6 def _validate_import_pdb_file_params(self, params): """ _validate_import_pdb_file_params: validates input params to import_model_pdb_file and import_experiment_pdb_file """ # check for required parameters for p in ['structure_name', 'workspace_name']: if p not in params: raise ValueError( '"{}" parameter is required, but missing'.format(p)) if params.get('input_file_path'): file_path = params.get('input_file_path') elif params.get('input_shock_id'): file_path = self.dfu.shock_to_file({ 'shock_id': params['input_shock_id'], 'file_path': self.scratch }).get('file_path') elif params.get('input_staging_file_path'): file_path = self.dfu.download_staging_file({ 'staging_file_subdir_path': params.get('input_staging_file_path') }).get('copy_file_path') else: error_msg = "Must supply either a input_shock_id or input_file_path " error_msg += "or input_staging_file_path" raise ValueError(error_msg) return file_path, params.get('workspace_name'), params.get( 'structure_name') def _model_file_to_data(self, file_path, params): """ _model_file_to_data: Do the PDB conversion--parse the model pdb file for creating a pdb data object """ logging.info( f'Parsing pdb file {file_path} to a pdb structure with params: {params}' ) parser = PDB.PDBParser(PERMISSIVE=1) pdb1 = file_path pp_no = 0 data = {} try: structure = parser.get_structure("test", pdb1) except (RuntimeError, TypeError, KeyError, ValueError) as e: logging.info(f'PDBParser errored with message: {e.message}') raise else: ppb = PPBuilder() for pp in ppb.build_peptides(structure): pp_no += 1 # logging.info(f'Getting pdb structure data for {structure}!') (compound, source) = self._get_compound_source(structure) (num_models, model_ids) = self._get_models_from_structure(structure) 
(num_chains, chain_ids) = self._get_chains_from_structure(structure) (num_residues, residue_ids) = self._get_residues_from_structure(structure) (num_atoms, atom_ids) = self._get_atoms_from_structure(structure) model = structure[0] protein_data = self._get_proteins_by_structure( structure, model.get_id(), file_path) (protein_data, params) = self._match_features(params, protein_data) pdb_info = params.get('pdb_info', None) if pdb_info and pdb_info.get('sequence_identities', None): data = { 'name': structure.header.get('name', ''), 'num_chains': num_chains, 'num_residues': num_residues, 'num_atoms': num_atoms, 'compound': compound, 'source': source, 'proteins': protein_data } else: logging.info( f'Parsing pdb file {file_path} failed to match KBase genome/features!' ) data = {} finally: return data, pp_no, params def _exp_file_to_data(self, file_path, params): """ _exp_file_to_data: Do the PDB conversion--parse the experiment pdb file for creating a pdb data object """ logging.info( f'Parsing pdb file {file_path} to a pdb structure with params: {params}' ) parser = PDB.MMCIFParser() cif = file_path pp_no = 0 mmcif_data = None try: structure = parser.get_structure("PHA-L", cif) except (RuntimeError, TypeError, KeyError, ValueError) as e: logging.info(f'MMCIFParser errored with message: {e.message}') raise else: ppb = PPBuilder() for pp in ppb.build_peptides(structure): pp_no += 1 struc_name = structure.header.get('name', '') hd = self._upload_to_shock(file_path) # logging.info(f'Getting pdb structure data for {structure}!') (cpd, src) = self._get_compound_source(structure) (num_models, model_ids) = self._get_models_from_structure(structure) (num_chains, chain_ids) = self._get_chains_from_structure(structure) (num_residues, residue_ids) = self._get_residues_from_structure(structure) (num_atoms, atom_ids) = self._get_atoms_from_structure(structure) protein_data = self._get_proteins_by_structure( structure, model_ids[0], file_path) (protein_data, params) = 
self._match_features(params, protein_data) pdb_info = params.get('pdb_info', None) if pdb_info and pdb_info.get('sequence_identities', None): mmcif_data = { 'name': struc_name, 'head': structure.header.get('head', ''), 'rcsb_id': structure.header.get('rcsb_id', ''), 'deposition_date': structure.header.get('deposition_date', ''), 'release_date': structure.header.get('release_date', ''), 'structure_method': structure.header.get('structure_method', ''), 'resolution': structure.header.get('resolution', 0.0), 'structure_reference': structure.header.get('structure_reference', []), 'keywords': structure.header.get('keywords', ''), 'author': structure.header.get('author', ''), 'compound': cpd, 'source': src, 'num_models': num_models, 'num_chains': num_chains, 'num_residues': num_residues, 'num_atoms': num_atoms, 'num_het_atoms': structure.header.get('num_het_atoms', 0), 'num_water_atoms': structure.header.get('num_water_atoms', 0), 'num_disordered_atoms': structure.header.get('num_disordered_atoms', 0), 'num_disordered_residues': structure.header.get('num_disordered_residues', 0), 'pdb_handle': hd, 'mmcif_handle': hd, 'xml_handle': hd, 'proteins': protein_data } else: mmcif_data = {} logging.info( f'Parsing pdb file {file_path} failed to match KBase genome/features!' 
) finally: return mmcif_data, pp_no, params def _match_features(self, params, protein_data): """ _match_features: match the protein_translation in feature_id with chain sequences in protein_data and compute the seq_identity and determine the exact_match example (in appdev): genome_obj = '57196/6/1', genome_name = 'Synthetic_bacterium_JCVI_Syn3.0_genome' feature_id = 'JCVISYN3_0004_CDS_1', feature_type = 'CDS' OR feature_id = 'JCVISYN3_0004', feature_type = 'gene' """ pdb_info = params.get('pdb_info', None) if pdb_info: kb_feature_type = '' kb_feature_seq = '' genome_name = pdb_info['genome_name'] narr_id = pdb_info['narrative_id'] feature_id = pdb_info['feature_id'] logging.info( f"Looking up for feature {feature_id} in genome {genome_name}'s features" ) # 1. Get the genome's features and reference (gn_ref, kb_genome_features) = self._get_genome_ref_features( narr_id, genome_name) if not gn_ref: logging.info( f"Given genome {genome_name} does not exist in workspace {narr_id}!" ) return protein_data, params pdb_info['genome_ref'] = gn_ref # 2. Match the genome features with the specified feature_id to obtain feature sequence for feat in kb_genome_features: if feat['id'] == feature_id: logging.info( f'Found genome feature match for {feature_id}') kb_feature_type = self._get_feature_type(feat) kb_feature_seq = feat.get('protein_translation', '') break pdb_info['feature_type'] = kb_feature_type # 3. 
Call self._compute_sequence_identity with the feature sequence and the the pdb # proteins' translations to to get the seq_identity and exact_match if kb_feature_seq: logging.info( f"Finding seq_identity and exact_match for feature {feature_id}" f" in genome {genome_name}'s features...") pdb_chain_ids = [] pdb_model_ids = [] pdb_seq_idens = [] pdb_exact_matches = [] for prot in protein_data: seq_idens, seq_mats = self._compute_sequence_identity( kb_feature_seq, prot.get('sequence', '')) if seq_idens: seq_idens.sort() max_iden = seq_idens.pop() if max_iden >= self.B_IDENTITY_THRESH: # get the good matches prot['seq_identity'] = max_iden prot['exact_match'] = 1 if max_iden > 0.99 else 0 prot['genome_ref'] = gn_ref prot['feature_id'] = feature_id prot['feature_type'] = kb_feature_type pdb_chain_ids.append(prot['chain_id']) pdb_model_ids.append(str(prot['model_id'])) pdb_seq_idens.append(str(prot['seq_identity'])) pdb_exact_matches.append(str(prot['exact_match'])) if pdb_seq_idens: pdb_info['sequence_identities'] = ','.join(pdb_seq_idens) if pdb_chain_ids: pdb_info['chain_ids'] = ','.join(pdb_chain_ids) if pdb_model_ids: pdb_info['model_ids'] = ','.join(pdb_model_ids) if pdb_exact_matches: pdb_info['exact_matches'] = ','.join(pdb_exact_matches) else: logging.info( f'Found NO feature in genome that matches with {feature_id}' ) else: logging.info( 'NO KBase genome/feature object info were given for uploading') return protein_data, params def _compute_sequence_identity(self, seq1, seq2): """ _compute_sequence_identity: Given two input sequences, do a blast identity check and then compute and return the matching percentage. 
""" # Create two sequence files Seq1 = SeqRecord(Seq(seq1), id="query_seq") Seq2 = SeqRecord(Seq(seq2), id="subject_seq") blast_dir = os.path.join(self.scratch, str(uuid.uuid4())) os.mkdir(blast_dir) query_seq = os.path.join(blast_dir, 'seq_qry.fasta') subject_seq = os.path.join(blast_dir, 'seq_sbj.fasta') SeqIO.write(Seq1, query_seq, "fasta") SeqIO.write(Seq2, subject_seq, "fasta") # on my laptop: blastp_path = '/Users/qzhang/miniconda3/bin/blastp' blastp_path = 'blastp' output_file_path = os.path.join(blast_dir, 'blast_output.xml') # Build the BLASTp command blastp_cmd = [blastp_path] blastp_cmd.append('-out') blastp_cmd.append(output_file_path) blastp_cmd.append('-outfmt') blastp_cmd.append('5') blastp_cmd.append('-query') blastp_cmd.append(query_seq) blastp_cmd.append('-subject') blastp_cmd.append(subject_seq) # Run BLASTp and parse the output as XML and then parse the xml file for identity matches exact_matches = [] idens = [] try: p = subprocess.Popen(blastp_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) output, errors = p.communicate() if not output: logging.info(f'BLASTp returned: {p.returncode}') logging.info(f'OK> output: {output}') if errors: e = subprocess.CalledProcessError(p.returncode, blastp_cmd, output=output) raise e except OSError as e: logging.info(f'OSError > {e.errno}') logging.info(f'OSError > {e.strerror}') logging.info(f'OSError > {e.filename}') except subprocess.CalledProcessError as e: logging.info(f'CalledError > {e.returncode}') logging.info(f'CalledError > {e.output}') except: logging.info(f'Unexpected error > {sys.exc_info()[0]}') else: with open(output_file_path) as blast_fhd: blast_record = NCBIXML.read(blast_fhd) if blast_record: logging.info(f'query: {blast_record.query[:100]}') for alignment in blast_record.alignments: for hsp in alignment.hsps: if hsp.expect < self.E_VALUE_THRESH: logging.info('****Alignment****') logging.info(f'sequence: {alignment.title}') logging.info(f'length: 
{alignment.length}') logging.info(f'e value: {hsp.expect}') logging.info(f'hsp query: {hsp.query}') logging.info(f'hsp match: {hsp.match}') logging.info(f'hsp subject: {hsp.sbjct}') logging.info( f'hsp identities: {hsp.identities}') logging.info(f'hsp positives: {hsp.positives}') iden = round(hsp.identities / hsp.positives, 6) logging.info(f'identity={iden}') idens.append(iden) if hsp.positives == hsp.identities: exact_matches.append(alignment.title[:100]) return idens, exact_matches def _get_genome_ref_features(self, narr_id, genome_name): """ _get_genome_ref_features: Get the genome reference and features for genome_name """ genome_ref = '' genome_features = [] (genome_info, genome_data) = self._get_object_info_data(narr_id, genome_name) if genome_info and genome_data: genome_ref = '/'.join( [str(narr_id), str(genome_info[0]), str(genome_info[4])]) genome_features = genome_data['features'] return (genome_ref, genome_features) def _get_feature_type(self, feature_obj): """ _get_feature_type: Get the type for the feature object of given feature_obj """ feat_type = feature_obj.get('type', '') if not feat_type: if feature_obj.get('protein_translation'): feat_type = 'gene' else: feat_type = 'other' return feat_type def _get_object_info_data(self, narr_id, obj_name): """ _get_object_info_data: Get the object info/data with given obj_name in narrative narr_id """ obj_info = None obj_data = None if narr_id and obj_name: try: obj_data_res = self.ws_client.get_objects2( {'objects': [{ 'wsid': narr_id, 'name': obj_name }]})['data'][0] obj_info = obj_data_res['info'] obj_data = obj_data_res['data'] except: logging.info( f'No object with name {obj_name} exists in workspace {narr_id}' ) logging.info( f'Unexpected error occurred while getting object for {obj_name}' ) pass return (obj_info, obj_data) def _get_atoms_from_structure(self, pdb_structure): """ _get_atoms_from_structure: Given a pdb_structure object, parse atoms into a list of atoms and return it """ atom_ids = [] 
num_atoms = 0 my_residues = pdb_structure.get_residues() for r_ele in my_residues: for a_ele in r_ele.get_atoms(): num_atoms += 1 atom_ids.append(a_ele.get_id()) return (num_atoms, atom_ids) def _get_residues_from_structure(self, pdb_structure): """ _get_residues_from_structure: Given a pdb_structure object, parse residues into a list and return it """ res_ids = [] num_res = 0 my_res = pdb_structure.get_residues() for r_ele in my_res: if PDB.is_aa(r_ele): num_res += 1 res_ids.append(r_ele.get_id()) return (num_res, res_ids) def _get_chains_from_structure(self, pdb_structure): """ _get_chains: Given a pdb_structure object, parse chain ids into a list and return it """ chain_ids = [] num_chains = 0 my_chains = pdb_structure.get_chains() for c_ele in my_chains: if (c_ele): num_chains += 1 chain_ids.append(c_ele.get_id()) return (num_chains, chain_ids) def _get_models_from_structure(self, pdb_structure): """ _get_models_from_structure: Given a pdb_structure object, parse model ids into a list and return it """ model_ids = [] num_models = 0 my_models = pdb_structure.get_models() for m_ele in my_models: if (m_ele): num_models += 1 model_ids.append(m_ele.get_id()) return (num_models, model_ids) def _get_compound_source(self, structure): """ _get_compound_source: Parse data from given structure for compound and source """ cpd_dict = dict() cpd = structure.header.get('compound', {}) # logging.info(f'Compound:\n {cpd}') if cpd and cpd.get('1'): cpd_dict = cpd.get('1') src_dict = dict() src = structure.header.get('source', {}) # logging.info(f'Source:\n {src}') if src and src.get('1'): src_dict = src.get('1') return (cpd_dict, src_dict) def _get_proteins_by_structure(self, pdb_structure, model, file_path): """ _get_proteins_by_structure: Given a pdb_structure, parse the essential protein data """ ppb = PPBuilder() protein_data = [] # Parse for the chain_id and chain sequence for c_ele in pdb_structure.get_chains(): if (c_ele): c_ppd_list = [] for c_ppd in 
ppb.build_peptides(c_ele): c_pp_seq = str(c_ppd.get_sequence()) c_ppd_list.append(c_pp_seq) c_seq = ''.join(c_ppd_list) protein_data.append({ 'id': os.path.basename(file_path), 'model_id': model, 'chain_id': c_ele.get_id(), 'sequence': c_seq, 'md5': hashlib.md5(c_seq.encode()).hexdigest() }) return protein_data def _validate_file(self, file_path): """ _validate_file: Check if file_path is accessable, if yes, return the handle """ try: fh = open(file_path, 'r') except IOError as e: if e.errno == errno.ENOENT: # No such file or directory raise ValueError(f'"{file_path}" does not exist!') elif e.errno == errno.EACCES: # Permission denied raise ValueError(f'"{file_path}" cannot be read!') else: raise ValueError(f'"{e.strerror}" error occurred') else: fh.close() return True def _dfu_get_objects(self, obj_ref): """ _dfu_get_objects: call dfu.get_objects to return object data and info """ obj = self.dfu.get_objects({"object_refs": [obj_ref]})['data'][0] return obj['data'], obj['info'] def _get_pdb_shock_id(self, obj_ref): """ _get_pdb_shock_id: Return the shock id for the PDB file """ obj_data, obj_info = self._dfu_get_objects(obj_ref) return self.hs.hids_to_handles([obj_data['pdb_handle']])[0]['id'] def _upload_to_shock(self, file_path): """ _upload_to_shock: upload target file to shock using DataFileUtil """ logging.info(f'Start uploading file to shock: {file_path}') file_to_shock_params = { 'file_path': file_path, 'pack': 'gzip', 'make_handle': True, } shock_id = self.dfu.file_to_shock( file_to_shock_params)['handle']['hid'] return shock_id def _generate_report_html(self, pdb_name, pdb_path): """ _generate_report_html: generates the HTML for the upload report """ html_report = list() # Make report directory and copy over files output_directory = os.path.join(self.scratch, str(uuid.uuid4())) os.mkdir(output_directory) result_file_path = os.path.join(output_directory, 'viewer.html') new_pdb_path = os.path.join(output_directory, os.path.basename(pdb_path)) 
shutil.copy(pdb_path, new_pdb_path) # Fill in template HTML with open( os.path.join(os.path.dirname(__file__), 'templates', 'viewer_template.html')) as report_template_file: report_template = report_template_file.read()\ .replace('*PDB_NAME*', pdb_name)\ .replace('*PDB_PATH*', os.path.basename(pdb_path)) with open(result_file_path, 'w') as result_file: result_file.write(report_template) html_report.append({ 'path': output_directory, 'name': os.path.basename(result_file_path), 'description': 'HTML report for PDB upload' }) return html_report def _generate_report(self, method_name, pdb_obj_ref, workspace_name, n_poly_pep, pdb_name, pdb_path): """ _generate_report: generate summary report for upload """ output_html_files = self._generate_report_html(pdb_name, pdb_path) report_params = { 'message': f'You uploaded a PDB file. {n_poly_pep} polypeptides detected.', 'html_links': output_html_files, 'direct_html_link_index': 0, 'objects_created': [{ 'ref': pdb_obj_ref, 'description': 'Imported PDB' }], 'workspace_name': workspace_name, 'report_object_name': method_name + '_report_' + str(uuid.uuid4()) } kbase_report_client = KBaseReport(self.callback_url, token=self.token) output = kbase_report_client.create_extended_report(report_params) report_output = { 'report_name': output['name'], 'report_ref': output['ref'] } return report_output def _validate_batch_import_pdbs_params(self, params): """ _validate_batch_import_pdbs_params: validates params passed to batch_import_pdbs method """ # check for required parameters for p in [ 'structures_name', 'workspace_name', 'metadata_staging_file_path' ]: if p not in params: raise ValueError(f'"{p}" parameter is required, but missing') # metadata_staging_file_path must be from the staging area--must have the staging dir prefix if params.get('metadata_staging_file_path', None): staging_file_path = self.dfu.download_staging_file({ 'staging_file_subdir_path': params.get('metadata_staging_file_path') }).get('copy_file_path') return 
(staging_file_path, params['workspace_name'], params['structures_name']) else: error_msg = "Must supply a 'metadata_staging_file_path'" raise ValueError(error_msg) def _read_file_by_type(self, file_path): """ _read_file_by_type: read the file given by file_path depending on its type, return a DataFrame object """ logging.info(f'Reading input from file: {file_path}...') if not self._validate_file(file_path): raise ValueError('Input file is invalid or not found!') df = None file_ext = pathlib.Path(file_path).suffix try: # read the data from file_path depending on its extension if 'csv' in file_ext: df = pd.read_csv(file_path) elif 'tsv' in file_ext: df = pd.read_csv(file_path, '\t') elif 'xls' in file_ext or 'od' in file_ext: # handle xls, xlsx, xlsm, xlsb, odf, ods and odt file extensions df = pd.read_excel(file_path, index_col=None, engine='openpyxl') else: # invalid file type error_msg = "Invalid input file type, only 'csv/tsv/xlsx' are accepted!" raise ValueError(error_msg) # strip off the leading and trailing whitespaces of the column names df.columns = df.columns.str.strip() except (RuntimeError, TypeError, KeyError, ValueError, WorkspaceError) as e: logging.info( f'Reading file {file_path} errored with message: {e.message} and data: {e.data}' ) raise return df def _parse_metadata_file(self, metadata_file_path, ws_id): """ _parse_metadata_file: From metadata_file_path, a spreadsheet file, sort out the model_pdb_file_paths, exp_pdb_file_paths and the kbase_meta_data return: lists model_pdb_file_paths, exp_pdb_file_paths and dict kbase_meta_data """ logging.info( f'parsing metadata from input file {metadata_file_path}...') required_columns = [ 'Narrative ID', 'Object name (Genome AMA feature set)', 'Feature ID', 'PDB filename', 'Is model', 'From RCSB' ] pdb_file_paths = list() narrative_ids = list() genome_names = list() feature_ids = list() # df_meta_data is a Panda DataFrame object df_meta_data = self._read_file_by_type(metadata_file_path) df_col_list = 
df_meta_data.columns.values.tolist() # check if required columns are read in correctly for col in required_columns: if col not in df_col_list: missing_required = f"Required column '{col}' is missing!" raise ValueError(missing_required) df_indexes = df_meta_data.columns for i in range(len(df_meta_data[df_indexes[0]])): narr_id = int(df_meta_data[df_indexes[0]][i]) if not pd.isna(narr_id): narrative_ids.append(narr_id) else: missing_narr_id = "Please fill all the rows in column 'Narrative ID'!" raise ValueError(missing_narr_id) obj_name = df_meta_data[df_indexes[1]][i] if not pd.isna(obj_name): genome_names.append(obj_name) else: missing_obj_name = "Please fill all the rows in column 'Object name'!" raise ValueError(missing_obj_name) feat_id = df_meta_data[df_indexes[2]][i] if not pd.isna(feat_id): feature_ids.append(feat_id) else: missing_feature_id = f"Please fill all the rows in column '{required_columns[2]}'!" raise ValueError(missing_feature_id) pdb_fn = df_meta_data[df_indexes[3]][ i] # pdb_fn does not have staging dir prefix if pd.isna(pdb_fn): missing_pdb_file = f"Please fill all the rows in column '{required_columns[3]}'!" raise ValueError(missing_pdb_file) (struct_name, ext) = os.path.splitext(os.path.basename(pdb_fn)) from_rcsb = df_meta_data[df_indexes[5]][ i] # pdb file source, default to 'yes' if pd.isna(from_rcsb): from_rcsb = 'yes' is_model = df_meta_data[df_indexes[4]][i] if not pd.isna(is_model): pdb_file_paths.append({ 'file_path': pdb_fn, 'structure_name': struct_name, 'narrative_id': narr_id, 'genome_name': obj_name, 'feature_id': feat_id, 'is_model': 'y' in is_model or 'Y' in is_model, 'from_rcsb': 'y' in from_rcsb or 'Y' in from_rcsb }) else: missing_pdb_md = f"Please fill all the rows in columns '{required_columns[4]}'!" raise ValueError(missing_pdb_md) if not pdb_file_paths: error_msg = "No PDB file info is provided!" 
raise ValueError(error_msg) return (pdb_file_paths, narrative_ids, genome_names, feature_ids) def _generate_batch_report(self, workspace_name, structs_ref, structs_name, pdb_infos, failed_pdbs): """ _generate_batch_report: generate summary report for upload """ output_html_files = self._generate_batch_report_html( structs_name, pdb_infos) description = ( f'Imported PDBs into a ProteinStructures object "{structs_ref}", ' f'named "{structs_name}".') if failed_pdbs: failed_files = ','.join(failed_pdbs) description += f' These files "{failed_files}" failed to load.' report_params = { 'message': f'You have uploaded a batch of PDB files into {structs_name}.', 'html_links': output_html_files, 'direct_html_link_index': 0, 'objects_created': [{ 'ref': structs_ref, 'description': description }], 'workspace_name': workspace_name, 'report_object_name': 'batch_import_pdb_files_report_' + str(uuid.uuid4()) } kbase_report_client = KBaseReport(self.callback_url, token=self.token) output = kbase_report_client.create_extended_report(report_params) report_output = { 'report_name': output['name'], 'report_ref': output['ref'] } return report_output def _write_pdb_htmls(self, output_dir, succ_pdb_infos): """ _write_pdb_htmls: write the batch pdb info as a jQuery DataTable into HTML files """ pdb_html = '' srv_domain = urlparse( self.shock_url).netloc # parse url to get the domain portion srv_base_url = f'https://{srv_domain}' logging.info(f'Get the url for building the anchors: {srv_base_url}') dir_name = os.path.dirname(__file__) molstar_html_file = os.path.join(dir_name, 'templates', 'molstar_viewer.html') molstar_js_file = os.path.join(dir_name, 'templates', 'molstar.js') molstar_css_file = os.path.join(dir_name, 'templates', 'molstar.css') shutil.copy(molstar_html_file, os.path.join(output_dir, 'molstar_viewer.html')) shutil.copy(molstar_js_file, os.path.join(output_dir, 'molstar.js')) shutil.copy(molstar_css_file, os.path.join(output_dir, 'molstar.css')) for succ_pdb in 
succ_pdb_infos: row_html = '<tr>' file_path = succ_pdb['file_path'] pdb_file_path = succ_pdb[ 'scratch_path'] # This is the scratch path for this pdb file new_pdb_path = os.path.join(output_dir, os.path.basename(file_path)) shutil.copy(pdb_file_path, new_pdb_path) struct_nm = succ_pdb['structure_name'].upper() genome_name = succ_pdb['genome_name'] genome_ref = succ_pdb['genome_ref'] feat_id = succ_pdb['feature_id'] feat_type = succ_pdb['feature_type'] src_rcsb = succ_pdb['from_rcsb'] pdb_chains = [] pdb_models = [] seq_idens = [] if succ_pdb.get('chain_ids', None): pdb_chains = succ_pdb['chain_ids'].split() if succ_pdb.get('model_ids', None): pdb_models = succ_pdb['model_ids'].split() if succ_pdb.get('sequence_identities', None): seq_idens = succ_pdb['sequence_identities'].split() if src_rcsb: row_html += ( f'<td>{struct_nm}<a href="https://www.rcsb.org/3d-view/{struct_nm}"' f' target="_blank"> RCSB Structure</a></td>') else: row_html += (f'<td>{struct_nm}<a href="./molstar_viewer.html"' f' or <a href="molstar_viewer.html"' f' target="_blank"> MolStar Viewer</a></td>') row_html += (f'<td><a href="{srv_base_url}/#dataview/{genome_ref}"' f' target="_blank">{genome_name}</a></td>' f'<td>{feat_id}</td><td>{feat_type}</td>') row_html += f'<td>{pdb_models}</td>' row_html += f'<td>{pdb_chains}</td>' row_html += f'<td>{seq_idens}</td>' row_html += '</tr>' pdb_html += row_html return pdb_html def _generate_batch_report_html(self, prot_structs_name, succ_pdb_infos): """ _generate_batch_report_html: generates the HTML for the upload report """ html_report = list() # Make report directory and copy over uploaded pdb files output_directory = os.path.join(self.scratch, str(uuid.uuid4())) os.mkdir(output_directory) # Create the template html file for reporting batch-uploaded pdb files batch_html_report_path = os.path.join(output_directory, 'batch_pdb_viewer.html') pdb_html = self._write_pdb_htmls(output_directory, succ_pdb_infos) # Fetch & fill in detailed info into template HTML 
with open( os.path.join( os.path.dirname(__file__), 'templates', 'batch_pdb_template.html')) as batch_template_html: batch_html_report = batch_template_html.read()\ .replace('<!--replace this content-->', pdb_html) with open(batch_html_report_path, 'w') as html_report_file: html_report_file.write(batch_html_report) print( f'Full batch_html_report has been written to {batch_html_report_path}' ) html_report.append({ 'path': output_directory, 'name': os.path.basename(batch_html_report_path), 'description': 'HTML report for PDB upload' }) return html_report def __init__(self, config): self.callback_url = config['SDK_CALLBACK_URL'] self.scratch = config['scratch'] self.token = config['KB_AUTH_TOKEN'] self.user_id = config['USER_ID'] self.dfu = DataFileUtil(self.callback_url) self.hs = AbstractHandle(config['handle-service-url']) self.ws_client = Workspace(config['workspace-url']) self.shock_url = config['shock-url'] def import_model_pdb_file(self, params, create_report=True): """ import_model_pdb_file: upload an experiment pdb file and convert into a KBaseStructure.ModelProteinStructure object """ logging.info( f'import_model_pdb_file to a pdb data structure with params: {params}' ) # file_path is the pdb file's working area path (after dfu.download_staging_file call) file_path, workspace_name, pdb_name = self._validate_import_pdb_file_params( params) (data, n_polypeptides, params) = self._model_file_to_data(file_path, params) if not data: logging.info( f'PDB file {file_path} import with "Import ModelProteinStructure" failed!' 
) return {}, {} data['pdb_handle'] = self._upload_to_shock(file_path) data['user_data'] = params.get('description', '') pdb_info = params.get('pdb_info', None) if pdb_info: pdb_info['scratch_path'] = file_path logging.info(f'Model structure data:{data}') return data, pdb_info def import_experiment_pdb_file(self, params, create_report=True): """ import_experiment_pdb_file: upload an experiment pdb file and convert into a KBaseStructure.ExperimentalProteinStructure object """ logging.info( f'import_experiment_pdb_file to a pdb structure with params: {params}' ) # file_path is the pdb file's working area path (after dfu.download_staging_file call) file_path, workspace_name, mmcif_name = self._validate_import_pdb_file_params( params) # Parse the experimental pdb file for an experimental data structure (data, n_polypeptides, params) = self._exp_file_to_data(file_path, params) if not data: logging.info( f'Import {file_path} with "Import ExperimentalProteinStructure" failed!' ) return {}, {} data['pdb_handle'] = self._upload_to_shock(file_path) data['user_data'] = params.get('description', '') pdb_info = params.get('pdb_info', None) if pdb_info: pdb_info['scratch_path'] = file_path logging.info(data) return data, pdb_info def _export_pdb(self, params): """ _export_pdb: return the shock_id of the uploaded pdb object """ if "input_ref" not in params: raise ValueError("'input_ref' not in supplied params") return {'shock_id': self._get_pdb_shock_id(params['input_ref'])} def _structure_to_pdb_file(self, params): """ _structure_to_pdb_file: get the file path for the given pdb object """ if "input_ref" not in params: raise ValueError("input_ref not in supplied params") if "destination_dir" not in params: raise ValueError("destination_dir not in supplied params") shock_id = self._get_pdb_shock_id(params['input_ref']) file_path = self.dfu.shock_to_file({ 'shock_id': shock_id, 'file_path': params['destination_dir'], 'unpack': 'uncompress' })['file_path'] return {'file_path': 
file_path} def export_pdb_structures(self, params): """ export_pdb_structures: return the shock_ids of the ProteinStructures object """ if 'input_ref' not in params: raise ValueError("'input_ref' not in supplied params") model_pdbs = [] exp_pdbs = [] # shock_ids = [] for m_pdb in model_pdbs: pass for e_pdb in exp_pdbs: pass return {'shock_id': self._get_pdb_shock_id(params['input_ref'])} def batch_import_pdbs(self, params): """ batch_import_pdbs: upload two sets of pdb files and create a KBaseStructure.ProteinStructures object required params: metadata_staging_file_path: a metafile from the user's staging area that must be a subdirectory file path in staging area, e.g., /data/bulk/user_name/metadata_staging_file_path staging_file_subdir_path is metadata_staging_file_path structures_name: name of the ProteinStructures object to be generated workspace_name: workspace name that the protein structure(s) will be saved return: structures_ref: return ProteinStructures object reference report_name: name of generated report (if any) report_ref: report reference (if any) 1. call _validate_batch_import_pdbs_params to validate input params 2. call _parse_metadata to parse for model_pdb_files, exp_pdb_files and kbase_meta_data 3. call import_model_pdb_file on each entry in model_pdb_paths, and call import_experiment_pdb_file on each entry in exp_pdb_paths 4. assemble the data for a ProteinStructures and save the data object 5. 
call _generate_batch_report to generate a report for batch_import_pdbs' result """ (metadata_file_path, workspace_name, structures_name) = self._validate_batch_import_pdbs_params(params) if not isinstance(workspace_name, int): workspace_id = self.dfu.ws_name_to_id(workspace_name) else: workspace_id = workspace_name params['workspace_id'] = workspace_id (pdb_file_paths, narrative_ids, genome_names, feature_ids) = self._parse_metadata_file(metadata_file_path, workspace_id) model_pdb_objects = list() exp_pdb_objects = list() pdb_infos = list() successful_files = list() failed_files = list() protein_structures = dict() total_structures = 0 pdb_params = {} # loop through the list of pdb_file_paths for pdb in pdb_file_paths: pdb_params['pdb_info'] = pdb pdb_params['input_staging_file_path'] = pdb['file_path'] pdb_params['input_file_path'] = None pdb_params['input_shock_id'] = None pdb_params['workspace_name'] = workspace_name pdb_params['structure_name'] = pdb['structure_name'] if pdb['is_model']: model_pdb_data, pdb_info = self.import_model_pdb_file( pdb_params, False) if model_pdb_data: model_pdb_objects.append(model_pdb_data) pdb_infos.append(pdb_info) successful_files.append(pdb['file_path']) total_structures += 1 else: failed_files.append(pdb['file_path']) else: exp_pdb_data, pdb_info = self.import_experiment_pdb_file( pdb_params, False) if exp_pdb_data: exp_pdb_objects.append(exp_pdb_data) pdb_infos.append(pdb_info) successful_files.append(pdb['file_path']) total_structures += 1 else: failed_files.append(pdb['file_path']) if not model_pdb_objects: logging.info("No model pdb structure was created/saved!") return {} protein_structures['model_structures'] = model_pdb_objects protein_structures['experimental_structures'] = exp_pdb_objects protein_structures['total_structures'] = total_structures protein_structures['description'] = ( f'Created {total_structures} ' f'structures in {structures_name}') logging.info( f'ProteinStructures data structure to be 
saved:\n{protein_structures}' ) returnVal = {} try: info = self.dfu.save_objects({ 'id': workspace_id, 'objects': [{ 'type': 'KBaseStructure.ProteinStructures', 'name': structures_name, 'data': protein_structures }] })[0] except (RuntimeError, TypeError, KeyError, ValueError, WorkspaceError) as e: err_msg = f'DFU.save_objects errored with message: {e.message} and data: {e.data}' logging.info(err_msg) raise ValueError(err_msg) else: structs_ref = f"{info[6]}/{info[0]}/{info[4]}" returnVal = {'structures_ref': structs_ref} report_output = self._generate_batch_report( workspace_name, structs_ref, structures_name, pdb_infos, failed_files) returnVal.update(report_output) finally: return returnVal
class poolfileuploadUtil:
    """Upload a pool file from the staging area as a KBasePoolTSV.PoolFile object."""

    def __init__(self, params):
        """
        :param params: dict of app parameters; 'shared_folder' is read here,
                       the remaining keys are read by the other methods
        """
        self.params = params
        self.callback_url = os.environ["SDK_CALLBACK_URL"]
        self.dfu = DataFileUtil(self.callback_url)
        self.data_folder = os.path.abspath("/kb/module/data/")
        # This is where files from staging area exist
        self.staging_folder = os.path.abspath("/staging/")
        self.shared_folder = params["shared_folder"]
        self.scratch_folder = os.path.join(params["shared_folder"], "scratch")

    def upload_poolfile(self):
        """
        The upload method

        We perform a number of steps:
        Get name of poolfile as it is in staging.
        Find the poolfile in /staging/poolfile_name
        Get the output name for the poolfile
        Get the column headers for the pool file for
            data and testing purposes. Should be len 12.
        Test if poolfile is well-formed.
        We send the file to shock using dfu.
        We get the handle and save the object with all
            the necessary information- including related genome.

        Returns:
            dict with the keys "Name", "Type" and "Date", taken from
            positions 1, 2 and 3 of the saved object's info.
        """
        print("params: ", self.params)
        self.validate_import_poolfile_from_staging_params()

        # Name of file in staging:
        staging_pool_fp_name = self.params["staging_file_name"]

        # Output name of pool file:
        poolfile_name = self.params["output_name"]

        print("poolfile_name: ", poolfile_name)
        print("top dir /:", os.listdir("/"))
        print("/kb/module/:", os.listdir("/kb/module"))
        if not os.path.exists(self.staging_folder):
            raise Exception("Staging dir does not exist yet! Error will be thrown")
        else:
            print("Succesfully recognized staging directory")

        # This is the path to the pool file
        poolfile_fp = os.path.join(self.staging_folder, staging_pool_fp_name)

        # We check correctness of pool file
        column_header_list, num_lines = self.check_pool_file(poolfile_fp)
        if len(column_header_list) != 12:
            print(
                "WARNING: Number of columns is not 12 as expected: {}".format(
                    len(column_header_list)
                )
            )

        # We copy the file from staging into the shared folder
        new_pool_fp = os.path.join(self.shared_folder, poolfile_name)
        shutil.copyfile(poolfile_fp, new_pool_fp)
        poolfile_fp = new_pool_fp

        # We create the handle for the object:
        file_to_shock_result = self.dfu.file_to_shock(
            {"file_path": poolfile_fp, "make_handle": True, "pack": "gzip"}
        )
        # The following var res_handle only created for simplification of code
        res_handle = file_to_shock_result["handle"]

        # Creation time is stored with the object
        date_time = datetime.datetime.utcnow()
        fastq_refs = []

        # We create the data for the object
        pool_data = {
            "file_type": "KBasePoolTSV.PoolFile",
            "poolfile": res_handle["hid"],
            # below should be shock
            "handle_type": res_handle["type"],
            "shock_url": res_handle["url"],
            "shock_node_id": res_handle["id"],
            "compression_type": "gzip",
            "column_header_list": column_header_list,
            "num_lines": str(num_lines),
            "fastqs_used": fastq_refs,
            "file_name": res_handle["file_name"],
            "utc_created": str(date_time),
            "related_genome_ref": self.params["genome_ref"],
            "related_organism_scientific_name": self.get_genome_organism_name(
                self.params["genome_ref"]
            ),
            "description": "Manual Upload: " + self.params["description"],
        }

        # To get workspace id:
        ws_id = self.params["workspace_id"]
        save_object_params = {
            "id": ws_id,
            "objects": [
                {
                    "type": "KBasePoolTSV.PoolFile",
                    "data": pool_data,
                    "name": poolfile_name,
                }
            ],
        }
        # save_objects returns a list of object_infos
        dfu_object_info = self.dfu.save_objects(save_object_params)[0]
        print("dfu_object_info: ")
        print(dfu_object_info)
        return {
            "Name": dfu_object_info[1],
            "Type": dfu_object_info[2],
            "Date": dfu_object_info[3],
        }

    def validate_import_poolfile_from_staging_params(self):
        """Raise ValueError when one of the required parameters is missing."""
        # check for required parameters
        for p in [
            "username",
            "staging_file_name",
            "genome_ref",
            "description",
            "output_name",
        ]:
            if p not in self.params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

    def check_pool_file(self, poolfile_fp):
        """
        We check the pool file by initializing into dict format

        The function "init_pool_dict" runs the tests to see if the file is
        correct.

        Returns:
            [column_header_list, num_lines]
        Raises:
            Exception: when the pool file fails a check; the message names
                the underlying problem and the original error is chained.
        """
        # Parse pool file and check for errors
        test_vars_dict = {"poolfile": poolfile_fp, "report_dict": {"warnings": []}}
        try:
            col_header_list, num_lines = self.init_pool_dict(test_vars_dict)
        except Exception as err:
            msg = "Pool file seems to have errors - Please check and reupload."
            logging.warning(msg)
            # Keep the reason instead of re-raising an empty Exception
            raise Exception("{} Cause: {}".format(msg, err)) from err

        return [col_header_list, num_lines]

    def init_pool_dict(self, vars_dict):
        """
        Read the pool file named by vars_dict["poolfile"] and check every line.

        Returns:
            [column_header_list, num_lines], num_lines counting the header.
        Raises:
            Exception: on an empty file, an invalid line or a file without
                any pool entry.
        """
        # pool dict is rcbarcode to [barcode, scaffold, strand, pos]
        pool = {}
        with open(vars_dict["poolfile"], "r") as f:
            header_str = f.readline()
            if header_str == '':
                raise Exception("Issue with pool file - first line empty")
            num_lines = 1
            column_header_list = [x.strip() for x in header_str.split("\t")]
            for crnt_line in f:
                num_lines += 1
                # Drop only the line terminator: trailing tabs delimit empty
                # columns and must survive. (The former bare
                # `crnt_line.rstrip()` threw its result away.)
                pool = self.check_pool_line_and_add_to_pool_dict(
                    crnt_line.rstrip("\r\n"), pool, vars_dict
                )
        if not pool:
            raise Exception("No entries in pool file")

        return [column_header_list, num_lines]

    def check_pool_line_and_add_to_pool_dict(self, pool_line, pool, vars_dict):
        """
        For a pool line to be correct it has to follow a few rules.

        We care about the first 7 columns of each pool line.
        The first line in the file is the headers, and the first 7 are
        barcode, rcbarcode, nTot, n, scaffold, strand, pos
        Both the barcodes and rcbarcodes must be entirely made up of
        characters from "ACTG". Position must be made up of any number
        of digits (including 0). Strand is from "+","-","".
        If the rcbarcode already exists in the pool, then there is a
        problem with the pool file. Each rcbarcode must be unique.

        A line with fewer than 7 columns is recorded as a warning in
        vars_dict["report_dict"]["warnings"] and skipped.

        Returns:
            the pool dict, with the entry of this line added when valid.
        Raises:
            Exception: on an invalid column or a duplicate rcbarcode.
        """
        # We get first 7 columns of pool_line (out of 12) and remove spaces.
        # str.replace returns a new string, so the result has to be kept.
        split_pool_line = [
            col.replace(" ", "") for col in pool_line.split("\t")[:7]
        ]

        if len(split_pool_line) < 7:
            warning_text = "pool file line with less than 7 tabs:\n{}".format(pool_line)
            vars_dict["report_dict"]["warnings"].append(warning_text)
            logging.warning(warning_text)
            return pool

        barcode, rcbarcode, _, _, scaffold, strand, pos = split_pool_line

        if barcode == "barcode":
            # Header line
            return pool

        if not re.search(r"^[ACGT]+$", barcode):
            logging.debug(len(barcode))
            raise Exception("Invalid barcode: |{}|".format(barcode))
        if not re.search(r"^[ACGT]+$", rcbarcode):
            raise Exception("Invalid rcbarcode: |{}|".format(rcbarcode))
        if not (pos == "" or re.search(r"^\d+$", pos)):
            raise Exception("Invalid position: |{}|".format(pos))
        if strand not in ("+", "-", ""):
            raise Exception("Invalid strand: |{}|".format(strand))
        if rcbarcode in pool:
            raise Exception("Duplicate rcbarcode.")
        pool[rcbarcode] = [barcode, scaffold, strand, pos]
        return pool

    def get_genome_organism_name(self, genome_ref):
        """Return the 'scientific_name' field of the genome object at genome_ref."""
        # Getting the organism name using WorkspaceClient
        ws = self.params['ws_obj']
        res = ws.get_objects2(
            {
                "objects": [
                    {
                        "ref": genome_ref,
                        "included": ["scientific_name"],
                    }
                ]
            }
        )
        scientific_name = res["data"][0]["data"]["scientific_name"]
        return scientific_name
class StrainInfo:
    """Derive strain information and an AttributeMapping from a sample set."""

    def __init__(self, config):
        """
        :param config: dict providing 'scratch'; it is also handed to
                       SampleServiceUtil. The callback url and the auth token
                       are read from the environment.
        """
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.token = os.environ['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.sampleservice_util = SampleServiceUtil(config)

    def _sampleset_to_strain_info(self, sample_set_ref, vcf_strain_ids):
        '''
        Look up every vcf strain id among the samples of a sample set.

        :param sample_set_ref: reference of the sample set object
        :param vcf_strain_ids: strain ids as they appear in the vcf file
        :return: list of {"name", "sample_id", "version"} dicts;
                 order is the same as the order of vcf_strain_ids
        :raises ValueError: when vcf_strain_ids contains duplicates or ids
                            that are not sample names of the sample set
        '''
        sample_set = self.dfu.get_objects(
            {"object_refs": [sample_set_ref]})['data'][0]['data']

        sample_dict = {
            sample['name']: {
                "name": sample['name'],
                "sample_id": sample['id'],
                "version": sample['version']
            }
            for sample in sample_set['samples']
        }

        # Named strain_info so the local does not shadow the class itself.
        strain_info = []
        missing_strains = []
        duplicated_strains = []
        seen_strains = set()

        for strain in vcf_strain_ids:
            if strain in seen_strains:
                duplicated_strains.append(strain)
                continue
            seen_strains.add(strain)
            if strain in sample_dict:
                strain_info.append(sample_dict[strain])
            else:
                missing_strains.append(strain)

        if duplicated_strains:
            dup_strains = ", ".join(duplicated_strains)
            raise ValueError(
                f'duplicated strain ids need to be fixed in vcf file - {dup_strains}'
            )

        if missing_strains:
            strains_not_found = ", ".join(missing_strains)
            raise ValueError(
                f'Missing strains from sample set {strains_not_found}')

        return strain_info

    def _sample_set_to_attribute_mapping(self, axis_ids, sample_set_ref,
                                         obj_name, ws_id):
        '''
        Build an AttributeMapping from the sample set and save it.

        :param axis_ids: ids that must all be instances of the mapping
        :param sample_set_ref: reference of the sample set object
        :param obj_name: name of the AttributeMapping object to save
        :param ws_id: id of the workspace to save into
        :return: reference string of the saved AttributeMapping object
        :raises ValueError: when an axis id is not an instance of the mapping
        '''
        am_data = self.sampleservice_util.sample_set_to_attribute_mapping(
            sample_set_ref)

        unmatched_ids = set(axis_ids) - set(am_data['instances'])
        if unmatched_ids:
            name = "Column"
            # Sorted so that the message is the same on every run.
            unmatched = ', '.join(sorted(map(str, unmatched_ids)))
            raise ValueError(
                f"The following {name} IDs from the uploaded matrix do not match "
                f"the supplied {name} attribute mapping: {unmatched}"
                f"\nPlease verify the input data or upload an excel file with a "
                f"{name} mapping tab.")

        logging.info(
            'start saving AttributeMapping object: {}'.format(obj_name))
        info = self.dfu.save_objects({
            "id": ws_id,
            "objects": [{
                "type": "KBaseExperiments.AttributeMapping",
                "data": am_data,
                "name": obj_name
            }]
        })[0]

        return "{}/{}/{}".format(info[6], info[0], info[4])

    def sample_strain_info(self, params):
        '''
        Save the attribute mapping, then collect the strain info.

        :param params: dict with the keys "vcf_strain_ids", "sample_set_ref",
                       "ws_id" and "sample_attribute_name"
        :return: tuple (sample_attribute_ref, strains)
        '''
        vcf_strain_ids = params["vcf_strain_ids"]
        sample_set_ref = params["sample_set_ref"]
        ws_id = params["ws_id"]
        obj_name = params["sample_attribute_name"]

        sample_attribute_ref = self._sample_set_to_attribute_mapping(
            vcf_strain_ids, sample_set_ref, obj_name, ws_id)
        strains = self._sampleset_to_strain_info(sample_set_ref,
                                                 vcf_strain_ids)
        return (sample_attribute_ref, strains)
class ReadsAlignmentUtils: ''' Module Name: ReadsAlignmentUtils Module Description: A KBase module: ReadsAlignmentUtils This module is intended for use by Aligners and Assemblers to upload and download alignment files. The alignment may be uploaded as a sam or bam file. If a sam file is given, it is converted to the sorted bam format and saved. Upon downloading, optional parameters may be provided to get files in sam and bai formats from the downloaded bam file. This utility also generates stats from the stored alignment. ''' ######## WARNING FOR GEVENT USERS ####### noqa # Since asynchronous IO can lead to methods - even the same method - # interrupting each other, you must be *very* careful when using global # state. A method could easily clobber the state set by another while # the latter method is running. ######################################### noqa VERSION = "0.3.6" GIT_URL = "https://github.com/kbaseapps/ReadsAlignmentUtils.git" GIT_COMMIT_HASH = "75ef2c24694c056dfca71859d6f344ccff7d4725" #BEGIN_CLASS_HEADER PARAM_IN_FILE = 'file_path' PARAM_IN_SRC_REF = 'source_ref' PARAM_IN_DST_REF = 'destination_ref' PARAM_IN_CONDITION = 'condition' PARAM_IN_READ_LIB_REF = 'read_library_ref' PARAM_IN_ASM_GEN_REF = 'assembly_or_genome_ref' PARAM_IN_ALIGNED_USING = 'aligned_using' PARAM_IN_ALIGNER_VER = 'aligner_version' PARAM_IN_ALIGNER_OPTS = 'aligner_opts' PARAM_IN_REPLICATE_ID = 'replicate_id' PARAM_IN_PLATFORM = 'platform' PARAM_IN_BOWTIE2_INDEX = 'bowtie2_index' PARAM_IN_SAMPLESET_REF = 'sampleset_ref' PARAM_IN_MAPPED_SAMPLE_ID = 'mapped_sample_id' PARAM_IN_DOWNLOAD_SAM = 'downloadSAM' PARAM_IN_DOWNLOAD_BAI = 'downloadBAI' PARAM_IN_VALIDATE = 'validate' INVALID_WS_OBJ_NAME_RE = re.compile('[^\\w\\|._-]') INVALID_WS_NAME_RE = re.compile('[^\\w:._-]') def _get_file_path_info(self, file_path): """ Given a file path, returns the directory, file name, file base and file extension """ dir, file_name = os.path.split(file_path) file_base, file_ext = 
os.path.splitext(file_name) return dir, file_name, file_base, file_ext def _mkdir_p(self, path): """ _mkdir_p: make directory for given path """ if not path: return try: os.makedirs(path) except OSError as exc: if exc.errno == errno.EEXIST and os.path.isdir(path): pass else: raise def _check_required_param(self, in_params, param_list): """ Checks if each of the params in the list are in the input params """ for param in param_list: if (param not in in_params or not in_params[param]): raise ValueError('{} parameter is required'.format(param)) def _proc_ws_obj_params(self, ctx, params): """ Checks the validity of workspace and object params and returns them """ dst_ref = params.get(self.PARAM_IN_DST_REF) ws_name_id, obj_name_id = os.path.split(dst_ref) if not bool(ws_name_id.strip()) or ws_name_id == '/': raise ValueError("Workspace name or id is required in " + self.PARAM_IN_DST_REF) if not bool(obj_name_id.strip()): raise ValueError("Object name or id is required in " + self.PARAM_IN_DST_REF) if not isinstance(ws_name_id, int): try: ws_name_id = self.dfu.ws_name_to_id(ws_name_id) except DFUError as se: prefix = se.message.split('.')[0] raise ValueError(prefix) self.__LOGGER.info('Obtained workspace name/id ' + str(ws_name_id)) return ws_name_id, obj_name_id def _get_ws_info(self, obj_ref): ws = Workspace(self.ws_url) try: info = ws.get_object_info_new({'objects': [{'ref': obj_ref}]})[0] except WorkspaceError as wse: self.__LOGGER.error('Logging workspace exception') self.__LOGGER.error(str(wse)) raise return info def _proc_upload_alignment_params(self, ctx, params): """ Checks the presence and validity of upload alignment params """ self._check_required_param(params, [ self.PARAM_IN_DST_REF, self.PARAM_IN_FILE, self.PARAM_IN_CONDITION, self.PARAM_IN_READ_LIB_REF, self.PARAM_IN_ASM_GEN_REF ]) ws_name_id, obj_name_id = self._proc_ws_obj_params(ctx, params) file_path = params.get(self.PARAM_IN_FILE) if not (os.path.isfile(file_path)): raise ValueError('File does not 
exist: ' + file_path) lib_type = self._get_ws_info(params.get(self.PARAM_IN_READ_LIB_REF))[2] if lib_type.startswith('KBaseFile.SingleEndLibrary') or \ lib_type.startswith('KBaseFile.PairedEndLibrary') or \ lib_type.startswith('KBaseAssembly.SingleEndLibrary') or \ lib_type.startswith('KBaseAssembly.PairedEndLibrary'): pass else: raise ValueError(self.PARAM_IN_READ_LIB_REF + ' parameter should be of type' + ' KBaseFile.SingleEndLibrary or' + ' KBaseFile.PairedEndLibrary or' + ' KBaseAssembly.SingleEndLibrary or' + ' KBaseAssembly.PairedEndLibrary') obj_type = self._get_ws_info(params.get(self.PARAM_IN_ASM_GEN_REF))[2] if obj_type.startswith('KBaseGenomes.Genome') or \ obj_type.startswith('KBaseGenomeAnnotations.Assembly') or \ obj_type.startswith('KBaseGenomes.ContigSet'): pass else: raise ValueError(self.PARAM_IN_ASM_GEN_REF + ' parameter should be of type' + ' KBaseGenomes.Genome or' + ' KBaseGenomeAnnotations.Assembly or' + ' KBaseGenomes.ContigSet') return ws_name_id, obj_name_id, file_path, lib_type def _get_aligner_stats(self, bam_file): """ Gets the aligner stats from BAM file How we compute this stats: For each segment (line) in SAM/BAM file: we take the first element as `reads_id` the second element as `flag` if the last bit (0x1) of flag is `1`: we treat this segment as paired end reads otherwise: we treat this segment as single end reads For single end reads: if the 3rd last bit (0x8) of flag is `1`: we increment unmapped_reads_count else: we treat this `reads_id` as mapped for all mapped `reads_ids`" if it appears only once: we treat this `reads_id` as `singletons` else: we treat this `reads_id` as `multiple_alignments` lastly, total_reads = unmapped_reads_count + identical mapped `reads_id` For paired end reads: if the 7th last bit (0x40) of flag is `1`: if the 3rd last bit (0x8) of flag is `1`: we increment unmapped_left_reads_count else: we treat this `reads_id` as mapped if the 8th last bit ( 0x80) of flag is `1`: if the 3rd last bit (0x8) of flag 
is `1`: we increment unmapped_right_reads_count else: we treat this `reads_id` as mapped for all mapped `reads_ids`" if it appears only once: we treat this `reads_id` as `singletons` else: we treat this `reads_id` as `multiple_alignments` lastly, total_reads = unmapped_left_reads_count + unmapped_right_reads_count + identical mapped `reads_id` """ path, file = os.path.split(bam_file) self.__LOGGER.info('Start to generate aligner stats') start_time = time.time() infile = pysam.AlignmentFile(bam_file, 'r') properly_paired = 0 unmapped_reads_count = 0 unmapped_left_reads_count = 0 unmapped_right_reads_count = 0 mapped_reads_ids = [] mapped_left_reads_ids = [] mapped_right_reads_ids = [] paired = False for alignment in infile: seg = alignment.to_string().split('\t') reads_id = seg[0] flag = "0000000" + "{0:b}".format(int(seg[1])) if flag[-1] == '1': paired = True if paired: # process paired end sequence if flag[-7] == '1': # first sequence of a pair if flag[-3] == '1': unmapped_left_reads_count += 1 else: mapped_left_reads_ids.append(reads_id) if flag[-8] == '1': # second sequence of a pair if flag[-3] == '1': unmapped_right_reads_count += 1 else: mapped_right_reads_ids.append(reads_id) if flag[-2] == '1': properly_paired += 1 else: # process single end sequence if flag[-3] == '1': unmapped_reads_count += 1 else: mapped_reads_ids.append(reads_id) if flag[-2] == '1': properly_paired += 1 infile.close() if paired: mapped_reads_ids = mapped_left_reads_ids + mapped_right_reads_ids unmapped_reads_count = unmapped_left_reads_count + unmapped_right_reads_count mapped_reads_ids_counter = Counter(mapped_reads_ids) mapped_reads_count = len(list(mapped_reads_ids_counter)) singletons = list(mapped_reads_ids_counter.values()).count(1) multiple_alignments = mapped_reads_count - singletons total_reads = unmapped_reads_count + mapped_reads_count properly_paired = properly_paired // 2 else: mapped_reads_ids_counter = Counter(mapped_reads_ids) mapped_reads_count = 
len(list(mapped_reads_ids_counter)) singletons = list(mapped_reads_ids_counter.values()).count(1) multiple_alignments = mapped_reads_count - singletons total_reads = unmapped_reads_count + mapped_reads_count try: alignment_rate = round( float(mapped_reads_count) / total_reads * 100, 3) except ZeroDivisionError: alignment_rate = 0 if alignment_rate > 100: alignment_rate = 100.0 elapsed_time = time.time() - start_time self.__LOGGER.info('Used: {}'.format( time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))) stats_data = { "alignment_rate": alignment_rate, "mapped_reads": mapped_reads_count, "multiple_alignments": multiple_alignments, "properly_paired": properly_paired, "singletons": singletons, "total_reads": total_reads, "unmapped_reads": unmapped_reads_count } return stats_data def _validate(self, params): samt = SamTools(self.config, self.__LOGGER) if 'ignore' in params: path, file = os.path.split(params['file_path']) rval = samt.validate(ifile=file, ipath=path, ignore=params['ignore']) else: path, file = os.path.split(params['file_path']) rval = samt.validate(ifile=file, ipath=path) return rval #END_CLASS_HEADER # config contains contents of config file in a hash or None if it couldn't # be found def __init__(self, config): #BEGIN_CONSTRUCTOR self.config = config self.__LOGGER = logging.getLogger('KBaseRNASeq') if 'log_level' in config: self.__LOGGER.setLevel(config['log_level']) else: self.__LOGGER.setLevel(logging.INFO) streamHandler = logging.StreamHandler(sys.stdout) formatter = logging.Formatter( "%(asctime)s - %(filename)s - %(lineno)d - \ %(levelname)s - %(message)s") formatter.converter = time.gmtime streamHandler.setFormatter(formatter) self.__LOGGER.addHandler(streamHandler) self.__LOGGER.info("Logger was set") script_utils.check_sys_stat(self.__LOGGER) self.scratch = config['scratch'] self.callback_url = os.environ['SDK_CALLBACK_URL'] self.ws_url = config['workspace-url'] self.dfu = DataFileUtil(self.callback_url) self.samtools = SamTools(config) 
#END_CONSTRUCTOR
pass

def validate_alignment(self, ctx, params):
    """
    :param params: instance of type "ValidateAlignmentParams" (* Input
       parameters for validating a reads alignment. For validation errors
       to ignore, see
       http://broadinstitute.github.io/picard/command-line-overview.html#V
       alidateSamFile) -> structure: parameter "file_path" of String,
       parameter "ignore" of list of String
    :returns: instance of type "ValidateAlignmentOutput" (* Results from
       validate alignment *) -> structure: parameter "validated" of type
       "boolean" (A boolean - 0 for false, 1 for true. @range (0, 1))
    """
    # ctx is the context object
    # return variables are: returnVal
    #BEGIN validate_alignment
    # self._validate returns 0 when the file passed validation.
    rval = self._validate(params)
    returnVal = {'validated': rval == 0}
    #END validate_alignment

    # At some point might do deeper type checking...
    if not isinstance(returnVal, dict):
        raise ValueError('Method validate_alignment return value ' +
                         'returnVal is not type dict as required.')
    # return the results
    return [returnVal]

def upload_alignment(self, ctx, params):
    """
    Validates and uploads the reads alignment

    How we compute BAM stats:

    For each segment (line) in SAM/BAM file:
        we take the first element as `reads_id`
                the second element as `flag`

        if the last bit (0x1) of flag is `1`:
            we treat this segment as paired end reads
        otherwise:
            we treat this segment as single end reads

    For single end reads:
        if the 3rd last bit (0x8) of flag is `1`:
            we increment unmapped_reads_count
        else:
            we treat this `reads_id` as mapped

        for all mapped `reads_ids`
            if it appears only once:
                we treat this `reads_id` as `singletons`
            else:
                we treat this `reads_id` as `multiple_alignments`

        lastly, total_reads = unmapped_reads_count + identical mapped `reads_id`

    For paired end reads:
        if the 7th last bit (0x40) of flag is `1`:
            if the 3rd last bit (0x8) of flag is `1`:
                we increment unmapped_left_reads_count
            else:
                we treat this `reads_id` as mapped

        if the 8th last bit ( 0x80) of flag is `1`:
            if the 3rd last bit (0x8) of flag is `1`:
                we increment unmapped_right_reads_count
            else:
                we treat this `reads_id` as mapped

        for all mapped `reads_ids`
            if it appears only once:
                we treat this `reads_id` as `singletons`
            else:
                we treat this `reads_id` as `multiple_alignments`

        lastly, total_reads = unmapped_left_reads_count +
                              unmapped_right_reads_count +
                              identical mapped `reads_id`

    :param params: instance of type "UploadAlignmentParams" (* Required
       input parameters for uploading a reads alignment string
       destination_ref - object reference of alignment destination. The
       object ref is 'ws_name_or_id/obj_name_or_id' where ws_name_or_id
       is the workspace name or id and obj_name_or_id is the object name
       or id file_path - File with the path of the sam or bam file to be
       uploaded. If a sam file is provided, it will be converted to the
       sorted bam format before being saved read_library_ref - workspace
       object ref of the read sample used to make the alignment file
       condition - assembly_or_genome_ref - workspace object ref of
       genome assembly or genome object that was used to build the
       alignment *) -> structure: parameter "destination_ref" of String,
       parameter "file_path" of String, parameter "read_library_ref" of
       String, parameter "condition" of String, parameter
       "assembly_or_genome_ref" of String, parameter "aligned_using" of
       String, parameter "aligner_version" of String, parameter
       "aligner_opts" of mapping from String to String, parameter
       "replicate_id" of String, parameter "platform" of String,
       parameter "bowtie2_index" of type "ws_bowtieIndex_id", parameter
       "sampleset_ref" of type "ws_Sampleset_ref", parameter
       "mapped_sample_id" of mapping from String to mapping from String
       to String, parameter "validate" of type "boolean" (A boolean - 0
       for false, 1 for true. @range (0, 1)), parameter "ignore" of list
       of String
    :returns: instance of type "UploadAlignmentOutput" (* Output from
       uploading a reads alignment *) -> structure: parameter "obj_ref"
       of String
    """
    # ctx is the context object
    # return variables are: returnVal
    #BEGIN upload_alignment
    self.__LOGGER.info(
        'Starting upload Reads Alignment, parsing parameters ')
    pprint(params)

    ws_name_id, obj_name_id, file_path, lib_type = \
        self._proc_upload_alignment_params(ctx, params)

    dir_path, file_name, file_base, file_ext = self._get_file_path_info(
        file_path)

    # "validate" is declared as a 0/1 boolean in the spec, so test
    # truthiness (as download_alignment does). An identity check against
    # True would silently skip validation when the caller passes 1.
    if params.get(self.PARAM_IN_VALIDATE):
        if self._validate(params) == 1:
            raise Exception('{0} failed validation'.format(file_path))

    # SAM input is converted to a sorted BAM next to the original file;
    # BAM input is uploaded as-is.
    bam_file = file_path
    if file_ext.lower() == '.sam':
        bam_file = os.path.join(dir_path, file_base + '.bam')
        self.samtools.convert_sam_to_sorted_bam(ifile=file_name,
                                                ipath=dir_path,
                                                ofile=bam_file)

    uploaded_file = self.dfu.file_to_shock({
        'file_path': bam_file,
        'make_handle': 1
    })
    file_handle = uploaded_file['handle']
    file_size = uploaded_file['size']

    # NOTE(review): stats are computed from the original input path, not
    # from the converted BAM -- confirm this is intended.
    aligner_stats = self._get_aligner_stats(file_path)
    aligner_data = {
        'file': file_handle,
        'size': file_size,
        'condition': params.get(self.PARAM_IN_CONDITION),
        'read_sample_id': params.get(self.PARAM_IN_READ_LIB_REF),
        'library_type': lib_type,
        'genome_id': params.get(self.PARAM_IN_ASM_GEN_REF),
        'alignment_stats': aligner_stats
    }

    # Optional parameters are copied into the object only when supplied
    # with a non-None value.
    optional_params = [
        self.PARAM_IN_ALIGNED_USING, self.PARAM_IN_ALIGNER_VER,
        self.PARAM_IN_ALIGNER_OPTS, self.PARAM_IN_REPLICATE_ID,
        self.PARAM_IN_PLATFORM, self.PARAM_IN_BOWTIE2_INDEX,
        self.PARAM_IN_SAMPLESET_REF, self.PARAM_IN_MAPPED_SAMPLE_ID
    ]
    for opt_param in optional_params:
        if params.get(opt_param) is not None:
            aligner_data[opt_param] = params[opt_param]

    self.__LOGGER.info('========= Adding extra_provenance_refs')
    self.__LOGGER.info(params.get(self.PARAM_IN_READ_LIB_REF))
    self.__LOGGER.info(params.get(self.PARAM_IN_ASM_GEN_REF))
    self.__LOGGER.info('=======================================')

    res = self.dfu.save_objects({
        "id": ws_name_id,
        "objects": [{
            "type": "KBaseRNASeq.RNASeqAlignment",
            "data": aligner_data,
            "name": obj_name_id,
            "extra_provenance_input_refs": [
                params.get(self.PARAM_IN_READ_LIB_REF),
                params.get(self.PARAM_IN_ASM_GEN_REF)
            ]
        }]
    })[0]
    self.__LOGGER.info('save complete')

    # The reference is assembled from positions 6, 0 and 4 of the
    # returned object info.
    returnVal = {
        'obj_ref': str(res[6]) + '/' + str(res[0]) + '/' + str(res[4])
    }

    self.__LOGGER.info('Uploaded object: ')
    self.__LOGGER.info(returnVal)
    #END upload_alignment

    # At some point might do deeper type checking...
    if not isinstance(returnVal, dict):
        raise ValueError('Method upload_alignment return value ' +
                         'returnVal is not type dict as required.')
    # return the results
    return [returnVal]

def download_alignment(self, ctx, params):
    """
    Downloads alignment files in .bam, .sam and .bai formats. Also
    downloads alignment stats *
    :param params: instance of type "DownloadAlignmentParams" (* Required
       input parameters for downloading a reads alignment string
       source_ref - object reference of alignment source. The object ref
       is 'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the
       workspace name or id and obj_name_or_id is the object name or id
       *) -> structure: parameter "source_ref" of String, parameter
       "downloadSAM" of type "boolean" (A boolean - 0 for false, 1 for
       true. @range (0, 1)), parameter "downloadBAI" of type "boolean" (A
       boolean - 0 for false, 1 for true. @range (0, 1)), parameter
       "validate" of type "boolean" (A boolean - 0 for false, 1 for true.
       @range (0, 1)), parameter "ignore" of list of String
    :returns: instance of type "DownloadAlignmentOutput" (* The output of
       the download method. *) -> structure: parameter "destination_dir"
       of String, parameter "stats" of type "AlignmentStats" ->
       structure: parameter "properly_paired" of Long, parameter
       "multiple_alignments" of Long, parameter "singletons" of Long,
       parameter "alignment_rate" of Double, parameter "unmapped_reads"
       of Long, parameter "mapped_reads" of Long, parameter "total_reads"
       of Long
    """
    # ctx is the context object
    # return variables are: returnVal
    #BEGIN download_alignment
    self.__LOGGER.info('Running download_alignment with params:\n' +
                       pformat(params))

    inref = params.get(self.PARAM_IN_SRC_REF)
    if not inref:
        raise ValueError('{} parameter is required'.format(
            self.PARAM_IN_SRC_REF))

    try:
        alignment = self.dfu.get_objects({'object_refs': [inref]})['data']
    except DFUError as e:
        self.__LOGGER.error(
            'Logging stacktrace from workspace exception:\n' + e.data)
        raise

    # set the output dir (unique per call so downloads never collide)
    uuid_str = str(uuid.uuid4())
    output_dir = os.path.join(self.scratch, 'download_' + uuid_str)
    self._mkdir_p(output_dir)

    file_ret = self.dfu.shock_to_file({
        'shock_id': alignment[0]['data']['file']['id'],
        'file_path': output_dir
    })
    downloaded_path = file_ret.get('file_path')
    if zipfile.is_zipfile(downloaded_path):
        with zipfile.ZipFile(downloaded_path) as z:
            z.extractall(output_dir)

    # Drop any archives so only the alignment files remain in output_dir.
    for zip_path in glob.glob(output_dir + '/*.zip'):
        os.remove(zip_path)

    bam_files = glob.glob(output_dir + '/*.bam')
    if not bam_files:
        raise ValueError("Alignment object does not contain a bam file")

    for bam_file_path in bam_files:
        dir_path, file_name, file_base, file_ext = \
            self._get_file_path_info(bam_file_path)

        if params.get(self.PARAM_IN_VALIDATE, False):
            # NOTE(review): the caller's "ignore" list is not forwarded
            # here, unlike upload_alignment which passes the full params.
            validate_params = {'file_path': bam_file_path}
            if self._validate(validate_params) == 1:
                raise Exception(
                    '{0} failed validation'.format(bam_file_path))

        if params.get(self.PARAM_IN_DOWNLOAD_BAI, False):
            bai_file = file_base + '.bai'
            bai_file_path = os.path.join(output_dir, bai_file)
            self.samtools.create_bai_from_bam(ifile=file_name,
                                              ipath=output_dir,
                                              ofile=bai_file)
            if not os.path.isfile(bai_file_path):
                raise ValueError('Error creating {}'.format(bai_file_path))

        if params.get(self.PARAM_IN_DOWNLOAD_SAM, False):
            sam_file = file_base + '.sam'
            sam_file_path = os.path.join(output_dir, sam_file)
            self.samtools.convert_bam_to_sam(ifile=file_name,
                                             ipath=output_dir,
                                             ofile=sam_file)
            if not os.path.isfile(sam_file_path):
                raise ValueError('Error creating {}'.format(sam_file_path))

    returnVal = {
        'destination_dir': output_dir,
        'stats': alignment[0]['data']['alignment_stats']
    }
    #END download_alignment

    # At some point might do deeper type checking...
    if not isinstance(returnVal, dict):
        raise ValueError('Method download_alignment return value ' +
                         'returnVal is not type dict as required.')
    # return the results
    return [returnVal]

def export_alignment(self, ctx, params):
    """
    Wrapper function for use by in-narrative downloaders to download
    alignments from shock *
    :param params: instance of type "ExportParams" (* Required input
       parameters for exporting a reads alignment string source_ref -
       object reference of alignment source. The object ref is
       'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the
       workspace name or id and obj_name_or_id is the object name or id
       *) -> structure: parameter "source_ref" of String, parameter
       "exportSAM" of type "boolean" (A boolean - 0 for false, 1 for
       true. @range (0, 1)), parameter "exportBAI" of type "boolean" (A
       boolean - 0 for false, 1 for true. @range (0, 1)), parameter
       "validate" of type "boolean" (A boolean - 0 for false, 1 for true.
       @range (0, 1)), parameter "ignore" of list of String
    :returns: instance of type "ExportOutput" -> structure: parameter
       "shock_id" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN export_alignment
    inref = params.get(self.PARAM_IN_SRC_REF)
    if not inref:
        raise ValueError('{} parameter is required'.format(
            self.PARAM_IN_SRC_REF))

    needs_download = (params.get(self.PARAM_IN_VALIDATE, False) or
                      params.get('exportBAI', False) or
                      params.get('exportSAM', False))
    if needs_download:
        # Need to validate or convert files. Use download_alignment.
        # "export*" keys are renamed to the "download*" keys that
        # download_alignment reads; all other keys pass through.
        download_params = {
            key.replace('export', 'download'): val
            for key, val in params.items()
        }
        download_retVal = self.download_alignment(ctx, download_params)[0]
        export_dir = download_retVal['destination_dir']

        # package and load to shock
        ret = self.dfu.package_for_download({
            'file_path': export_dir,
            'ws_refs': [inref]
        })
        output = {'shock_id': ret['shock_id']}
    else:
        # Nothing to convert: return shock id from the object.
        try:
            alignment = self.dfu.get_objects({'object_refs': [inref]})['data']
        except DFUError as e:
            self.__LOGGER.error(
                'Logging stacktrace from workspace exception:\n' + e.data)
            raise
        output = {'shock_id': alignment[0]['data']['file']['id']}
    #END export_alignment

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method export_alignment return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]

def status(self, ctx):
    """Return the service state together with version and git metadata."""
    #BEGIN_STATUS
    returnVal = {
        'state': "OK",
        'message': "",
        'version': self.VERSION,
        'git_url': self.GIT_URL,
        'git_commit_hash': self.GIT_COMMIT_HASH
    }
    #END_STATUS
    return [returnVal]
class poolcountfileuploadUtil:
    """Upload a poolcount TSV from the staging area as a KBasePoolTSV.PoolCount object."""

    def __init__(self, params):
        """Store the upload parameters and set up the DataFileUtil client.

        Reads SDK_CALLBACK_URL from the environment and requires
        params["shared_folder"].
        """
        self.params = params
        self.callback_url = os.environ["SDK_CALLBACK_URL"]
        self.dfu = DataFileUtil(self.callback_url)
        self.data_folder = os.path.abspath("/kb/module/data/")
        # This is where files from staging area exist
        self.staging_folder = os.path.abspath("/staging/")
        self.shared_folder = params["shared_folder"]
        self.scratch_folder = os.path.join(params["shared_folder"], "scratch")

    def upload_poolcountfile(self):
        """
        The upload method

        We perform a number of steps:
        Get name of poolcount file as it is in staging.
        Find the poolcount file in /staging/poolcount_name
        Get the output name for the poolcount file
        Get the column headers for the pool count file for
            data and testing purposes. Test if poolcount file is well-formed.
        NOTE: We use output_name as set_name - it is important that
            these are equivalent.
        We send the file to shock using dfu.
        We get the handle and save the object with all
            the necessary information - including related genome.

        params should include:
            username, staging_file_name, genome_ref, description,
            output_name, ws_obj, workspace_id

        Returns:
            dict with keys "Name", "Type" and "Date" taken from positions
            1, 2 and 3 of the saved object's info.

        Raises:
            ValueError: if a required parameter is missing.
            Exception: if the staging directory does not exist or the
                poolcount file is malformed.
        """
        print("params: ", self.params)
        self.validate_import_file_from_staging_params()

        # Name of file in staging (Not path):
        staging_fp_name = self.params["staging_file_name"]

        # Output name of poolcount file:
        poolcount_name = self.params["output_name"]
        print("Output pool count name: ", poolcount_name)

        if not os.path.exists(self.staging_folder):
            raise Exception("Staging dir does not exist yet!")
        print("Succesfully recognized staging directory")

        # This is the path to the pool file in staging
        poolcount_fp = os.path.join(self.staging_folder, staging_fp_name)

        # We check correctness of pool file in staging
        column_header_list, num_lines = self.check_poolcount_file(poolcount_fp)

        # We copy the file from staging into the shared folder under its
        # output name; that copy is what gets sent to shock.
        poolcount_scratch_fp = os.path.join(self.shared_folder, poolcount_name)
        shutil.copyfile(poolcount_fp, poolcount_scratch_fp)

        # We create the KBase handle for the object:
        file_to_shock_result = self.dfu.file_to_shock({
            "file_path": poolcount_scratch_fp,
            "make_handle": True,
            "pack": "gzip"
        })
        # The following var res_handle only created for simplification of code
        res_handle = file_to_shock_result["handle"]

        # Keep track of our own datetime
        date_time = datetime.datetime.utcnow()

        fastq_refs = []

        # We create the data for the object
        poolcount_data = {
            "file_type": "KBasePoolTSV.PoolCount",
            "poolcount": res_handle["hid"],
            # below should be shock
            "handle_type": res_handle["type"],
            "shock_url": res_handle["url"],
            "shock_node_id": res_handle["id"],
            "compression_type": "gzip",
            "column_header_list": column_header_list,
            "fastqs_used": fastq_refs,
            "file_name": res_handle["file_name"],
            "utc_created": str(date_time),
            "set_name": self.params['output_name'],
            "num_lines": str(num_lines),
            "related_genome_ref": self.params["genome_ref"],
            "related_organism_scientific_name":
                self.get_genome_organism_name(self.params["genome_ref"]),
            "description": "Manual Upload: " + self.params["description"],
        }

        # To get workspace id:
        ws_id = self.params["workspace_id"]
        save_object_params = {
            "id": ws_id,
            "objects": [{
                "type": "KBasePoolTSV.PoolCount",
                "data": poolcount_data,
                "name": self.params['output_name'],
            }],
        }

        # save_objects returns a list of object_infos
        dfu_object_info = self.dfu.save_objects(save_object_params)[0]
        print("dfu_object_info: ")
        print(dfu_object_info)

        return {
            "Name": dfu_object_info[1],
            "Type": dfu_object_info[2],
            "Date": dfu_object_info[3],
        }

    def check_poolcount_file(self, poolcount_fp):
        """
        Check the header of a poolcount file and count its lines.

        Currently a weak test - only the header line is inspected. The
        file is streamed line by line rather than loaded into memory.

        Args:
            poolcount_fp: path to the tab-separated poolcount file.

        Returns:
            [fields, num_lines] where fields is the list of header
            columns and num_lines is the number of newline-separated
            segments in the file (so a file ending with a newline counts
            one trailing empty segment).

        Raises:
            Exception: if the header line is empty, has fewer than six
                fields, or its first five fields are not the expected
                names in order.
        """
        # Expected fields
        exp_f = "barcode rcbarcode scaffold strand pos".split(" ")

        header_line = None
        newline_count = 0
        with open(poolcount_fp, "r") as f:
            for line in f:
                if header_line is None:
                    header_line = line.rstrip("\n")
                if line.endswith("\n"):
                    newline_count += 1
        if header_line is None:
            # Empty file: treated the same as an empty header line.
            header_line = ''
        # Same value as len(text.split('\n')) on the whole file content.
        num_lines = newline_count + 1

        if header_line == '':
            raise Exception("File format incorrect: " + poolcount_fp)

        fields = header_line.split("\t")
        # Five fixed columns plus at least one more column are required.
        if len(fields) < 6:
            raise Exception("Too few fields in " + poolcount_fp)
        for expected, actual in zip(exp_f, fields):
            if actual != expected:
                raise Exception("Expected {} but field is {}".format(
                    expected, actual))

        return [fields, num_lines]

    def validate_import_file_from_staging_params(self):
        """Raise ValueError for the first required key missing from self.params."""
        # check for required parameters
        required = [
            "username", "staging_file_name", "genome_ref", "description",
            "output_name", "ws_obj", "workspace_id"
        ]
        for p in required:
            if p not in self.params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def get_genome_organism_name(self, genome_ref):
        """Return the genome's "scientific_name", fetched via params['ws_obj']."""
        # Getting the organism name using WorkspaceClient
        ws = self.params['ws_obj']
        res = ws.get_objects2({
            "objects": [{
                "ref": genome_ref,
                "included": ["scientific_name"],
            }]
        })
        scientific_name = res["data"][0]["data"]["scientific_name"]
        return scientific_name
class DataUtil:
    """Validate generics data against type-description constraints and save it."""

    @staticmethod
    def _find_between(s, start, end):
        """
        _find_between: find string in between start and end

        start and end are inserted into a regular expression as-is, so
        callers must pass them already escaped.
        """
        return re.search('{}(.*){}'.format(start, end), s).group(1)

    @staticmethod
    def _constraint_key(in_value):
        """Return the top-level data key that a constraint value refers to."""
        if in_value.startswith('values'):
            # e.g. "values(a.b)" -> "a"
            return re.search(r'\((.*)\)', in_value).group(1).split('.')[0]
        if ':' in in_value:
            # e.g. "ref_field:path.in.ref" -> "ref_field"
            return in_value.split(':')[0]
        return in_value.split('.')[0]

    def _find_constraints(self, obj_type):
        """
        _find_constraints: retrieve constraints (@contains, rowsum, unique,
        conditionally_required)

        Constraints are read from the type description returned by the
        workspace client: for each tag, every description line starting
        with "@<tag>" contributes its whitespace-separated tokens after
        the tag.
        """
        type_info = self.wsClient.get_type_info(obj_type)
        type_desc = type_info.get('description')
        desc_lines = type_desc.split("\n")

        constraints = {}
        for tag in ('contains', 'rowsum', 'unique', 'conditionally_required'):
            constraints[tag] = [line.strip().split()[1:] for line in desc_lines
                                if line.startswith(f'@{tag}')]

        return constraints

    def _filter_constraints(self, constraints, data):
        """filters out constraints with missing keys

        A 'contains' constraint is kept only when at least one of its
        right-hand values refers to a key present in data. The 'contains'
        list is updated in place and the same constraints dict is returned.
        """
        contains_constraints = constraints.get('contains')

        contains_constraints[:] = [
            constraint for constraint in contains_constraints
            if any(self._constraint_key(in_value) in data
                   for in_value in constraint[1:])
        ]

        return constraints

    def _retrieve_value(self, data, value):
        """Parse the provided 'data' object to retrieve the item in 'value'.

        Supported forms of value:
            "set(a,b,c)"      -> the literal comma-separated items
            "values(x.y)"     -> the values of the mapping at data.x.y
            "ref:path.to.key" -> path looked up in the workspace object
                                 referenced by data.ref
            "x.y"             -> the items (keys, for a mapping) at data.x.y
        """
        logging.info('Getting value for {}'.format(value))
        retrieve_data = []
        m_data = DotMap(data)
        if value.startswith('set('):
            retrieve_data = value[4:-1].split(",")
        elif value.startswith('values('):  # TODO: nested values e.g. values(values(ids))
            search_value = re.search(r'\((.*)\)', value).group(1)
            m_data_cp = m_data.copy()
            for attr in search_value.split('.'):
                m_data_cp = getattr(m_data_cp, attr)
            retrieve_data = list(m_data_cp.values())
        elif ':' in value:
            obj_ref = getattr(m_data, value.split(':')[0])
            if obj_ref:
                included = value.split(':')[1]
                included = '/' + included.replace('.', '/')
                ref_data = self.wsClient.get_objects2(
                    {'objects': [{'ref': obj_ref,
                                  'included': [included]}]})['data'][0]['data']
                m_ref_data = DotMap(ref_data)
                if ref_data:
                    if '*' not in included:
                        for key in included.split('/')[1:]:
                            m_ref_data = getattr(m_ref_data, key)
                    else:
                        # TODO: only works for 2 level nested data like '/features/[*]/id'
                        keys = included.split('/')[1:]
                        m_ref_data = [x.get(keys[2]) for x in ref_data.get(keys[0])]

                    retrieve_data = list(m_ref_data)
        else:
            m_data_cp = m_data.copy()
            for attr in value.split('.'):
                m_data_cp = getattr(m_data_cp, attr)
            retrieve_data = list(m_data_cp)

        logging.info('Retrieved value (first 20):\n{}\n'.format(retrieve_data[:20]))

        return retrieve_data

    def _validate(self, constraints, data):
        """
        _validate: validate data

        Checks the 'unique', 'contains' and 'conditionally_required'
        constraints and returns (validated, failed_constraints), where
        failed_constraints maps each constraint kind to its failures.
        """
        validated = True
        failed_constraints = defaultdict(list)

        # unique: the retrieved values must not contain duplicates
        for unique_constraint in constraints.get('unique'):
            retrieved_value = self._retrieve_value(data, unique_constraint[0])
            if len(set(retrieved_value)) != len(retrieved_value):
                validated = False
                failed_constraints['unique'].append(unique_constraint[0])

        # contains: the first value must be a subset of the union of the rest
        for contains_constraint in constraints.get('contains'):
            value = contains_constraint[0]
            retrieved_in_values = []
            for in_value in contains_constraint[1:]:
                retrieved_in_values += self._retrieve_value(data, in_value)
            if not (set(self._retrieve_value(data, value)) <= set(retrieved_in_values)):
                validated = False
                failed_constraints['contains'].append(" ".join(contains_constraint))

        # conditionally_required: if the trigger key is present, so must the others be
        for conditional_constraint in constraints.get('conditionally_required'):
            trigger = conditional_constraint[0]
            required_keys = conditional_constraint[1:]
            if trigger in data:
                missing_keys = [key for key in required_keys if key not in data]
                if missing_keys:
                    validated = False
                    failed_constraints['conditionally_required'].append(
                        (trigger, required_keys, missing_keys))

        return validated, failed_constraints

    @staticmethod
    def _mkdir_p(path):
        """
        _mkdir_p: make directory for given path

        Does nothing for an empty path or an existing directory; still
        raises if the path exists but is not a directory.
        """
        if not path:
            return
        os.makedirs(path, exist_ok=True)

    @staticmethod
    def _raise_validation_error(params, validate):
        """Raise a meaningful error message for failed validation"""
        logging.error('Data failed type checking')
        failed_constraints = validate.get('failed_constraints')
        error_msg = ['Object {} failed type checking:'.format(params.get('obj_name'))]
        if failed_constraints.get('unique'):
            unique_values = failed_constraints.get('unique')
            error_msg.append('Object should have unique field: {}'.format(unique_values))
        if failed_constraints.get('contains'):
            for contained_value in failed_constraints.get('contains'):
                # Each entry is "<subset> <superset ...>" joined by spaces.
                subset_value = contained_value.split(' ')[0]
                super_value = ' '.join(contained_value.split(' ')[1:])
                if 'col_mapping' in super_value:
                    error_msg.append('Column attribute mapping instances should contain all '
                                     'column index from original data')

                if 'row_mapping' in super_value:
                    error_msg.append('Row attribute mapping instances should contain all row '
                                     'index from original data')

                error_msg.append('Object field [{}] should contain field [{}]'.format(
                    super_value,
                    subset_value))
        for failure in failed_constraints.get('conditionally_required', []):
            error_msg.append('If object field "{}" is present than object field(s) {} should '
                             'also be present. Object is missing {}'.format(*failure))
        raise ValueError('\n'.join(error_msg))

    def __init__(self, config):
        """Create the service clients from the URLs and token in config."""
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.scratch = config['scratch']
        self.serviceWizardURL = config['srv-wiz-url']
        self.wsClient = workspaceService(self.ws_url, token=self.token)
        self.dfu = DataFileUtil(self.callback_url)
        self.generics_service = GenericsService(self.serviceWizardURL)
        self.ws_large_data = WsLargeDataIO(self.callback_url)

    def list_generic_types(self, params=None):
        """
        *Not yet exposed in spec*
        list_generic_types: lists the current valid generics types

        arguments:
            none

        return:
            A list of generic types in the current environment
        """
        returnVal = [x['type_def'] for module in GENERICS_MODULES
                     for x in self.wsClient.get_all_type_info(module)]
        return returnVal

    def fetch_data(self, params):
        """
        fetch_data: fetch generics data as pandas dataframe for a generics data object

        arguments:
        obj_ref: generics object reference

        optional arguments:
        generics_module: the generics data module to be retrieved from
                        e.g. for an given data type like below:
                        typedef structure {
                          FloatMatrix2D data;
                          condition_set_ref condition_set_ref;
                        } SomeGenericsMatrix;
                        generics_module should be
                        {'data': 'FloatMatrix2D',
                         'condition_set_ref': 'condition_set_ref'}

        return:
        data_matrix: a pandas dataframe in json format
        """
        for p in ['obj_ref']:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

        return self.generics_service.fetch_data(params)

    def validate_data(self, params):
        """
        validate_data: validate data

        arguments:
        obj_type: obj type e.g.: 'KBaseMatrices.ExpressionMatrix-1.1'
        data: obj data to be validated

        return:
        validated: True or False
        failed_constraints: failures grouped by constraint kind
        """
        constraints = self._find_constraints(params.get('obj_type'))
        data = params.get('data')

        constraints = self._filter_constraints(constraints, data)

        validated, failed_constraints = self._validate(constraints, data)

        return {'validated': validated,
                'failed_constraints': failed_constraints}

    def save_object(self, params):
        """
        save_object: validate data constraints and save matrix object

        arguments:
        obj_type: saving object data type
        obj_name: saving object name
        data: data to be saved
        workspace_name: workspace name matrix object to be saved to
        workspace_id: (optional) workspace id; takes precedence over
                      workspace_name when truthy

        return:
        obj_ref: object reference

        raises:
        ValueError: if the data fails constraint validation or a numeric
                    field cannot be cast to its expected type
        """
        logging.info('Starting validating and saving object data')

        # Resolve the unversioned type name to the module's current full
        # type string (the entry whose name between '.' and '-' matches).
        obj_type = params.get('obj_type').split('-')[0]

        module_name = obj_type.split('.')[0]
        type_name = obj_type.split('.')[1]

        types = self.wsClient.get_module_info({'mod': module_name}).get('types')

        for module_type in types:
            if self._find_between(module_type, r'\.', r'\-') == type_name:
                obj_type = module_type
                break

        # Falsy values (None, '', 0, empty containers) are dropped before saving.
        data = {k: v for k, v in params.get('data').items() if v}
        validate = self.validate_data({'obj_type': obj_type,
                                       'data': data})

        if not validate.get('validated'):
            self._raise_validation_error(params, validate)

        # make sure users with shared object have access to the handle file upon saving
        handle = data.get('sequencing_file_handle')
        if handle:
            output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
            logging.info('Downloading consensus sequence file in {}'.format(output_directory))
            self._mkdir_p(output_directory)
            # Download into the per-call directory created above (the
            # one the log line names) so files from different calls
            # cannot overwrite each other in the shared scratch root.
            matrix_fasta_file = self.dfu.shock_to_file({
                'handle_id': handle,
                'file_path': output_directory}).get('file_path')
            logging.info('Saving consensus sequence file to shock: {}'.format(matrix_fasta_file))
            handle_id = self.dfu.file_to_shock({'file_path': matrix_fasta_file,
                                                'make_handle': True})['handle']['hid']
            data['sequencing_file_handle'] = handle_id

        # cast data: (converter, name used in the log line, wording used
        # in the error message, fields to convert)
        casts = (
            (int, 'int', 'an integer',
             ['sequencing_quality_filter_cutoff', 'read_length_cutoff']),
            (float, 'float', 'a float',
             ['barcode_error_rate', 'sequence_error_cutoff', 'clustering_cutoff']),
        )
        for cast, cast_name, expected, data_names in casts:
            for data_name in data_names:
                if data_name not in data:
                    continue
                try:
                    logging.info('Casting {} to {}'.format(data_name, cast_name))
                    data[data_name] = cast(data[data_name])
                except Exception as e:
                    err_msg = 'Unexpected data type {}. '.format(data_name)
                    err_msg += 'Data type {} requests {} to be {} value. '.format(
                        obj_type, data_name, expected)
                    err_msg += 'Provided [{}] {} instead'.format(
                        type(data[data_name]), data[data_name])
                    raise ValueError(err_msg) from e

        ws_name_id = params.get('workspace_id')
        workspace_name = params.get('workspace_name')
        if not ws_name_id:
            # workspace_name may already be a numeric id
            if not isinstance(workspace_name, int):
                ws_name_id = self.dfu.ws_name_to_id(workspace_name)
            else:
                ws_name_id = workspace_name

        try:
            logging.info('Starting saving object via DataFileUtil')
            info = self.dfu.save_objects({
                "id": ws_name_id,
                "objects": [{
                    "type": obj_type,
                    "data": data,
                    "name": params.get('obj_name')
                }]
            })[0]
        except Exception:
            # Deliberate best-effort fallback: retry by handing the data
            # over as a JSON file through WsLargeDataIO.
            logging.info('Saving object via DataFileUtil failed')
            logging.info('Starting saving object via WsLargeDataIO')
            data_path = os.path.join(self.scratch,
                                     params.get('obj_name') + "_" + str(uuid.uuid4()) + ".json")
            # Close (and thereby flush) the file before the other
            # service reads it.
            with open(data_path, 'w') as data_file:
                json.dump(data, data_file)

            info = self.ws_large_data.save_objects({
                "id": ws_name_id,
                "objects": [{
                    "type": obj_type,
                    "data_json_file": data_path,
                    "name": params.get('obj_name')
                }]
            })[0]

        return {"obj_ref": "%s/%s/%s" % (info[6], info[0], info[4])}
def MotifEnsemble(self, ctx, params):
    """
    :param params: instance of type "EnsembleParams" (Internal workflow:
       1. Input - list of motifsets , workspace, threshold consensus 2.
       Download MotifSets -> Utils function 3. Assign motif ids by
       position in list Use refs to identify MSOs internally! Dictionary
       of motifsets key: ref, val set list of match sets: each item in
       the set is a tuple of (ref,index) for each motifset: <- enumerate
       to avoid duplicate for each motif in motifset for each other
       motifset: <- enumerate to avoid duplicate for each motif in other:
       compare(motif1,motif2): if motifs same: search list of sets for
       motif1: if found add motif2 if not in if not found search list of
       sets for motif2: if found add motif1 else add a new set with
       motif1 + motif2) -> structure: parameter "motifset_refs" of list
       of String, parameter "workspace_name" of String, parameter
       "threshold" of Double
    :returns: instance of type "Ensemble_out" -> structure: parameter
       "motifset_ref" of String
    """
    # ctx is the context object
    # return variables are: out
    #BEGIN MotifEnsemble
    #TODO: ERROR CHECK (MULTIPLE MOTIFSETS, NONEMPTY, SSREF are the same, etc.)

    MotifSetDict = DownloadMotifSet(params['motifset_refs'], self.callback_url)

    # Each match set groups (motifset_ref, motif_index) pairs judged to
    # be the same motif by CompareMotifsBP.
    matchSets = []
    threshold = float(params['threshold'])

    for i, MSR1 in enumerate(MotifSetDict.keys()):
        for j, motif1 in enumerate(MotifSetDict[MSR1]['Motifs']):
            for k, MSR2 in enumerate(MotifSetDict.keys()):
                # Only compare against later motif sets so each pair is
                # examined once.
                if k <= i:
                    continue
                for l, motif2 in enumerate(MotifSetDict[MSR2]['Motifs']):
                    if not CompareMotifsBP(motif1, motif2, threshold):
                        continue
                    found1 = False
                    found2 = False
                    index1 = -1
                    index2 = -1
                    for m, mset in enumerate(matchSets):
                        if (MSR1, j) in mset:
                            found1 = True
                            index1 = m
                        if (MSR2, l) in mset:
                            found2 = True
                            index2 = m
                    if not found1 and found2:
                        matchSets[index2].add((MSR1, j))
                    elif not found2 and found1:
                        matchSets[index1].add((MSR2, l))
                    elif found1 and found2:
                        if index1 != index2:
                            # Merge in place: set.union() returns a new
                            # set, so its result was discarded and the
                            # members of the popped set were lost.
                            matchSets[index1].update(matchSets[index2])
                            matchSets.pop(index2)
                    else:
                        matchSets.append({(MSR1, j), (MSR2, l)})

    # Keep a match set only when the share of input motif sets that
    # contribute to it reaches the requested proportion.
    numMotifSets = len(params['motifset_refs'])
    threshold = float(params['proportion'])
    KeepSets = []
    print('NUM MATCHSETS********')
    print(len(matchSets))
    for i, mset in enumerate(matchSets):
        uniqueRefs = {member[0] for member in mset}
        if float(len(uniqueRefs)) / numMotifSets >= threshold:
            KeepSets.append(i)
    print(len(KeepSets))

    # TODO: handle duplicates (several motifs from the same motif set in
    # one match set): merge locations if they differ, pick one motif by
    # default (p-val), re-run motif compare to confirm similarity.

    #create new MSO
    # Set-level metadata is taken from the first downloaded motif set.
    ESO = {}
    first_set = next(iter(MotifSetDict.values()), None)
    if first_set is not None:
        ESO['Condition'] = first_set['Condition']
        ESO['SequenceSet_ref'] = first_set['SequenceSet_ref']
        ESO['Alphabet'] = deepcopy(first_set['Alphabet'])
        ESO['Background'] = deepcopy(first_set['Background'])
    ESO['Motifs'] = []

    #Add motifs
    for keep in KeepSets:
        motif = merge(matchSets[keep], MotifSetDict)
        ESO['Motifs'].append(deepcopy(motif))

    #upload new MSO
    dfu = DataFileUtil(self.callback_url)
    save_objects_params = {
        'id': dfu.ws_name_to_id(params['workspace_name']),
        'objects': [{'type': 'KBaseGwasData.MotifSet',
                     'data': ESO,
                     'name': 'EnsembleMotifSet'}],
    }

    info = dfu.save_objects(save_objects_params)[0]
    obj_ref = "%s/%s/%s" % (info[6], info[0], info[4])

    #create report
    htmlDir = self.shared_folder + '/ensemble_html'
    os.mkdir(htmlDir)
    MakeReport(htmlDir, ESO)

    try:
        html_upload_ret = dfu.file_to_shock({'file_path': htmlDir,
                                             'make_handle': 0,
                                             'pack': 'zip'})
    except Exception as e:
        # Chain the cause so the real upload failure stays visible.
        raise ValueError('error uploading HTML file to shock') from e

    reportName = 'MEMEMotifFinder_report_' + str(uuid.uuid4())

    reportObj = {'objects_created': [{'ref': obj_ref,
                                      'description': 'Motif Set generated by MEME'}],
                 'message': '',
                 'direct_html': '',
                 'direct_html_link_index': 0,
                 'file_links': [],
                 'html_links': [{'shock_id': html_upload_ret['shock_id'],
                                 'name': 'index.html',
                                 'label': 'Save promoter_download.zip'}],
                 'html_window_height': 220,
                 'workspace_name': params['workspace_name'],
                 'report_object_name': reportName
                 }

    report = KBaseReport(self.callback_url, token=ctx['token'])
    report_info = report.create_extended_report(reportObj)
    # NOTE(review): the docstring above advertises "motifset_ref", but
    # the method returns report_name/report_ref -- confirm against the spec.
    out = {'report_name': report_info['name'], 'report_ref': report_info['ref']}
    #END MotifEnsemble

    # At some point might do deeper type checking...
    if not isinstance(out, dict):
        raise ValueError('Method MotifEnsemble return value ' +
                         'out is not type dict as required.')
    # return the results
    return [out]
class IntegrateAppImpl:
    """Integrate ExpressionMatrix abundances with the reactions of an FBAModel.

    The public entry point is ``integrate_abundances_with_metabolism``. It
    loads both objects through ``DataFileUtil``, derives one expression score
    per reaction and condition, saves the result as a
    ``KBaseMatrices.ReactionMatrix`` and creates an HTML report.
    """

    @staticmethod
    def _validate_params(params, required, optional=None):
        """Validates that required parameters are present.

        Warns (through ``logging``) if unexpected parameters appear.

        :param params: mapping of supplied parameters
        :param required: iterable of keys that must be in ``params``
        :param optional: iterable of keys that may be in ``params``
        :raises ValueError: if any required key is missing; the missing keys
            are listed in sorted order so the message is deterministic
        """
        required = set(required)
        # ``None`` replaces the former shared ``set()`` default argument.
        optional = set() if optional is None else set(optional)
        missing = required - set(params)
        if missing:
            raise ValueError(
                "Required keys {} not in supplied parameters".format(
                    ", ".join(sorted(missing))))
        defined_param = required | optional
        for param in params:
            if param not in defined_param:
                logging.warning(
                    "Unexpected parameter {} supplied".format(param))

    def _build_figure(self, file_path, figure_matrix):
        """Save the figure grid as an html file inside ``file_path``.

        :returns: the file name (not the full path) of the written html file
        """
        # Make figure matrix html file and embed
        file_name = 'integrated_scatterplot_output.html'
        figure_html_path = os.path.join(file_path, file_name)
        output_file(figure_html_path)
        save(grid(figure_matrix))
        return file_name

    def _build_table(self, table_dict, stats_df):
        """Render the per-complex results as an html ``<table>`` string.

        :param table_dict: nested dict ``complex -> compartment -> reaction``
            whose leaves are lists of ``[score, feature]`` pairs, as built by
            ``_integrate_abundances``. It is not modified.
        :param stats_df: DataFrame indexed by ``<reaction>_<compartment>``
            with ``mahalanobis`` and ``pvalue`` columns
        :returns: the html table as a single string
        """
        html_lines = list()
        html_lines.append('<table class="table table-bordered table-striped">')

        header_list = [
            "Enzymes", "Compartments", "Reactions", "EC numbers", "Subsystems"
        ] + self.conditions_ids + ["Mahalanobis distance", "p-value"]

        html_lines.append('<thead>')
        internal_header_line = "</td><td>".join(header_list)
        html_lines.append('<tr><td>' + internal_header_line + '</td></tr>')
        html_lines.append('</thead>')

        html_lines.append("<tbody>")
        for complex_row in sorted(table_dict.keys()):
            print_row = True
            cpts = ", ".join(sorted(table_dict[complex_row]))
            ecs = []
            subsystems = []
            reactions = []
            conditions = []
            mahalanobis_dist = "0.00"
            pvalue = "0.00"

            for cpt in table_dict[complex_row]:
                for rxn in table_dict[complex_row][cpt]:
                    if rxn not in reactions:
                        reactions.append(rxn)

                    # The first non-empty list of scores found is the one shown
                    if not conditions:
                        conditions = table_dict[complex_row][cpt][rxn]

                    if rxn in self.reactions_data:
                        for ss in self.reactions_data[rxn]['subsystems']:
                            ss = ss.replace("_", " ")
                            ss = ss.replace(" in plants", "")
                            if ss not in subsystems:
                                subsystems.append(ss)
                        for ec in self.reactions_data[rxn]['ecs']:
                            if ec not in ecs:
                                ecs.append(ec)

                    stats_key = rxn + '_' + cpt
                    str_md = "0.00"
                    str_pv = "0.00"
                    # Raw distance kept for the warning below, so that a
                    # missing reaction cannot trigger a failing lookup there.
                    raw_md = None
                    if stats_key not in stats_df.index:
                        print("MISSING REACTION: ", complex_row, rxn + "_" + cpt)
                        print_row = False
                    else:
                        raw_md = stats_df.loc[stats_key]['mahalanobis']
                        raw_pv = stats_df.loc[stats_key]['pvalue']
                        str_md = "{0:.2f}".format(raw_md)
                        str_pv = "{0:.2f}".format(raw_pv)
                        # Tiny p-values are shown in scientific notation
                        if str_pv == "0.00":
                            str_pv = "{0:.2e}".format(raw_pv)

                    if (mahalanobis_dist != "0.00"
                            and str_md != mahalanobis_dist):
                        print(
                            "WARNING: CHANGING STATS FOR SAME PROTEIN COMPLEXES\n"
                        )
                        print(
                            "===================================================\n\n"
                        )
                        print(complex_row, cpts, rxn, conditions, raw_md,
                              mahalanobis_dist, "\n")
                        print(
                            "===================================================\n\n"
                        )

                    mahalanobis_dist = str_md
                    pvalue = str_pv

            reactions = ", ".join(sorted(reactions))
            subsystems = ", ".join(sorted(subsystems))
            ecs = ", ".join(sorted(ecs))

            # Format into new strings instead of overwriting the scores held
            # by table_dict, so the caller's data survives and this method
            # can be called more than once on the same input.
            conditions_strings = list()
            for entry in conditions:
                formatted_score = "{0:.2f}".format(entry[0])
                conditions_strings.append(
                    " | ".join([formatted_score] + list(entry[1:])))

            # some complexes may have zero features predicted
            if print_row:
                html_lines.append("<tr>")
                internal_row_line = "</td><td>".join(
                    [complex_row, cpts, reactions, ecs, subsystems] +
                    conditions_strings + [mahalanobis_dist, pvalue])
                html_lines.append("<td>" + internal_row_line + "</td>")
                html_lines.append("</tr>")

        html_lines.append("</tbody>")
        html_lines.append("</table>")

        return "\n".join(html_lines)

    def _build_report(self, figure_matrix, table_dict, stats_df,
                      saved_object_list, workspace_name):
        """
        _build_report: generate summary report

        Creates the report directory under ``self.scratch``, writes the html
        files into it and creates the extended report through ``self.kbr``.

        :returns: dict with ``report_name`` and ``report_ref``
        """
        # Make report directory and copy over files
        report_file_path = os.path.join(self.scratch, self.report_uuid)
        os.mkdir(report_file_path)

        table_html_string = self._build_table(table_dict, stats_df)

        # The scatterplot figure is only produced for two or more conditions
        if len(self.conditions_ids) > 1:
            figure_html_file = self._build_figure(report_file_path,
                                                  figure_matrix)
            output_html_files = self._generate_report_html(
                report_file_path,
                figure_html_file=figure_html_file,
                table_string=table_html_string)
        else:
            output_html_files = self._generate_report_html(
                report_file_path, table_string=table_html_string)

        report_params = {
            'direct_html_link_index': 0,  # Use to refer to index of 'html_links'
            'workspace_name': workspace_name,
            'report_object_name': 'plant_fba_' + self.report_uuid,
            'objects_created': saved_object_list,
            'html_links': output_html_files
        }

        output = self.kbr.create_extended_report(report_params)

        return {'report_name': output['name'], 'report_ref': output['ref']}

    def _generate_report_html(self,
                              file_path,
                              figure_html_file=None,
                              table_string=None):
        """
        _generate_report_html: generates the HTML for the upload report

        Writes the table page and the summary ``index.html`` into
        ``file_path``, uploads the directory as a zip through
        ``self.dfu.file_to_shock`` and returns the html link dicts.

        :param file_path: existing directory to write the html files into
        :param figure_html_file: name of the figure html file; it is used
            when more than one condition is selected
        :param table_string: html table to insert; ``None`` inserts nothing
        :returns: list of html link dicts (figure first when present, then
            table)
        """
        html_report_list = list()
        has_figure = len(self.conditions_ids) > 1
        template_dir = os.path.join('/kb/module/data', 'app_report_templates')

        ##############################################################
        # Write table html file
        ##############################################################
        # Read in template html
        with open(
                os.path.join(
                    template_dir,
                    'integrate_abundances_report_tables_template.html')
        ) as report_template_file:
            report_template_string = report_template_file.read()

        # Generate and Insert html title
        title_string = "-".join(
            [self.input_params['input_expression_matrix']] +
            self.conditions_ids)
        report_template_string = report_template_string.replace(
            '*TITLE*', title_string)

        # Insert html table; str.replace() rejects None, so the default
        # table_string is treated as an empty table.
        table_report_string = report_template_string.replace(
            '*TABLES*', '' if table_string is None else table_string)

        # Write html file
        table_html_file = "integrated_table_output.html"
        with open(os.path.join(file_path, table_html_file),
                  'w') as table_file:
            table_file.write(table_report_string)

        ##############################################################
        # Write summary index.html file
        ##############################################################
        # Begin composing html
        html_lines = list()
        html_lines.append(
            '<h3 style="text-align: center">Integrate Abundances with Metabolism Report</h3>'
        )
        html_lines.append(
            "<p>The \"Integrate Abundances with Metabolism\" app has finished running.</br>"
        )
        html_lines.append("The app integrated the values from the <b>" +
                          self.input_params['input_expression_matrix'] +
                          "</b> ExpressionMatrix")
        html_lines.append(" with the <b>" +
                          self.input_params['input_fbamodel'] +
                          "</b> FBAModel</br>")
        html_lines.append(
            "Specifically, the app integrated the values from these chosen conditions in the ExpressionMatrix: <b>"
            + "</b>, <b>".join(self.conditions_ids) + "</b></br>")
        html_lines.append(
            "The results of the integration are stored in the <b>" +
            self.input_params['output_reaction_matrix'] +
            "</b> ReactionMatrix.</p><br/>")
        html_lines.append(
            'The results of the integration are also tabulated in this <a href="'
            + table_html_file + '" target="_blank">Table</a></br>')

        if has_figure:
            html_lines.append(
                'The results of the integration can be also be visualized in these <a href="'
                + figure_html_file + '" target="_blank">Scatterplots</a>')

        # Read in template html
        with open(
                os.path.join(template_dir,
                             'integrate_abundances_report_template.html')
        ) as report_template_file:
            report_template_string = report_template_file.read()

        # Insert html
        summary_report_string = report_template_string.replace(
            '*TEXT*', "\n".join(html_lines))

        summary_html_file = "index.html"
        with open(os.path.join(file_path, summary_html_file),
                  'w') as index_file:
            index_file.write(summary_report_string)

        ##############################################################
        # Upload files and compose html report object
        ##############################################################
        # Cache it in shock as an archive
        upload_info = self.dfu.file_to_shock({
            'file_path': file_path,
            'pack': 'zip'
        })

        # HTML Link objects; every link points at the same uploaded archive
        if has_figure:
            # Figures
            html_report_list.append({
                'shock_id': upload_info['shock_id'],
                'name': figure_html_file,
                'label':
                'Scatterplot figures generated by Integrate Abundances with Metabolism app',
                'description':
                'Scatterplot figures generated by Integrate Abundances with Metabolism app'
            })

        # Table
        html_report_list.append({
            'shock_id': upload_info['shock_id'],
            'name': table_html_file,
            'label':
            'HTML table generated by Integrate Abundances with Metabolism app',
            'description':
            'HTML table generated by Integrate Abundances with Metabolism app'
        })

        return html_report_list

    def _load_fbamodel(self, model_ref):
        """Fetch the FBAModel object and index its reactions by id.

        :returns: ``[model_obj, lookup]`` where ``lookup`` maps each model
            reaction id to its position in ``modelreactions``
        """
        model_obj = self.dfu.get_objects({'object_refs':
                                          [model_ref]})['data'][0]
        model_reactions = model_obj['data']['modelreactions']

        print("Number of reactions: " + str(len(model_reactions)))

        # If an id occurs more than once, the last position is kept
        model_reaction_lookup_dict = {
            reaction['id']: index
            for index, reaction in enumerate(model_reactions)
        }

        return [model_obj, model_reaction_lookup_dict]

    def _load_expression_matrix(self, expdata_ref):
        """Fetch the ExpressionMatrix object and index its rows and columns.

        Sets ``self.conditions_ids`` to all matrix columns when no columns
        were selected beforehand.

        :returns: ``[expdata_obj, features_ids, feature_lookup_dict,
            condition_lookup_dict]``
        """
        expdata_obj = self.dfu.get_objects({'object_refs':
                                            [expdata_ref]})['data'][0]
        conditions_ids = expdata_obj['data']['data']['col_ids']
        features_ids = expdata_obj['data']['data']['row_ids']

        feature_lookup_dict = {
            feature: index
            for index, feature in enumerate(features_ids)
        }
        condition_lookup_dict = {
            condition: index
            for index, condition in enumerate(conditions_ids)
        }

        if not self.conditions_ids:
            self.conditions_ids = conditions_ids

        return [
            expdata_obj, features_ids, feature_lookup_dict,
            condition_lookup_dict
        ]

    def _reaction_subsystems(self, reaction_id):
        """Return the subsystems recorded for a ``<reaction>_<compartment>`` id.

        Reactions without an entry in ``self.reactions_data`` yield an empty
        list, matching the membership check done in ``_build_table``.
        """
        base_rxn = reaction_id.split('_')[0]
        return self.reactions_data.get(base_rxn, {}).get('subsystems', [])

    def _compile_genome_scores(self, data, conditions_indices):
        """Collect, per condition, the scores of fully scored features.

        :param data: 2D matrix (features x matrix columns) of scores
        :param conditions_indices: maps a condition id to its column index
        :returns: dict ``condition -> list of scores``; a feature contributes
            only if none of its selected scores rounds to ``0.00``
        """
        feature_comparison_dict = dict()

        for feature_row in data:
            scores_dict = dict()
            for condition in self.conditions_ids:
                # Retrieve value from 2D matrix
                score = feature_row[conditions_indices[condition]]

                # Force into string for easier comparison
                if "{0:.2f}".format(score) == "0.00":
                    continue

                scores_dict[condition] = score

            # Here we skip features where there aren't enough scores
            # (should be same number of conditions)
            if len(scores_dict) < len(self.conditions_ids):
                continue

            for condition, score in scores_dict.items():
                feature_comparison_dict.setdefault(condition,
                                                   list()).append(score)

        return feature_comparison_dict

    def _compile_model_scores_percentiles(self, data):
        """Compute reaction scores and percentile ranks per condition.

        Reads ``self.conditions_ids``, ``self.reactions_ids`` and
        ``self.reactions_data``; appends every retained reaction id to
        ``self.mh_reactions_ids``.

        :param data: 2D matrix (reactions x selected conditions) as returned
            by ``_integrate_abundances``
        :returns: ``[reaction_score_comparison_dict,
            reaction_percentile_comparison_dict]``. The second dict has one
            entry for ``'All'`` and one per subsystem, each holding a list
            per condition plus ``reactions``, ``color``, ``size``,
            ``tooltip`` and ``fill_alpha`` lists.
        """
        # Reactions without any expression score carry this placeholder
        unscored = float(-sys.maxsize - 1)

        # First, for each condition, "bin" identical scores (to two decimal
        # points) and compute the percentile rank of each bin
        model_conditions_score_pct_dicts = dict()
        for condition_index, condition in enumerate(self.conditions_ids):
            score_reaction_dict = dict()

            # The counting of features is done independently because we skip
            # scores of zero (which affects how percentile rank distributes)
            n_ftrs = 0
            for reaction_index, reaction_row in enumerate(data):
                # Retrieve value from 2D matrix
                score = reaction_row[condition_index]

                # Many reactions are not assigned a score, and instead have
                # a default tiny score
                if score == unscored:
                    continue

                # Force into string for easier comparison
                str_score = "{0:.2f}".format(score)

                # Skip the relatively large number of reactions that have a
                # value of zero to prevent the percentile rank skewing
                # towards zero
                if str_score == "0.00":
                    continue

                n_ftrs += 1
                score_reaction_dict.setdefault(str_score,
                                               list()).append(reaction_index)

            percentiles = model_conditions_score_pct_dicts.setdefault(
                condition, dict())

            # Percentile rank of a bin: reactions strictly below it plus half
            # of the bin itself, over all counted reactions
            less_than_score_ftrs_count = 0
            for str_score in sorted(score_reaction_dict, key=float):
                n_score_ftrs = len(score_reaction_dict[str_score])
                cumulative_n_score_ftrs = float(
                    less_than_score_ftrs_count) + float(n_score_ftrs) * 0.5
                percentiles[str_score] = cumulative_n_score_ftrs / float(
                    n_ftrs)
                less_than_score_ftrs_count += n_score_ftrs

        # This next part re-iterates through the data to compose the dicts
        # that become ColumnDataStores, and also with default values

        # The reaction_percentile_comparison_dict is for the reaction
        # percentile plot
        reaction_percentile_comparison_dict = {'All': dict()}

        # The reaction_score_comparison_dict works for the genome features plot
        reaction_score_comparison_dict = dict()

        for reaction_index, reaction_row in enumerate(data):
            scores_dict = dict()
            for condition_index, condition in enumerate(self.conditions_ids):
                # Retrieve value from 2D matrix
                score = reaction_row[condition_index]

                # Many reactions are not assigned a score, and instead a
                # default tiny score
                if score == unscored:
                    continue

                scores_dict[condition] = score

            # Here we skip reactions where there aren't enough scores
            # (should be same number of conditions)
            if len(scores_dict) < len(self.conditions_ids):
                continue

            rxn_id = self.reactions_ids[reaction_index]
            rxn_subsystems = self._reaction_subsystems(rxn_id)

            for condition, score in scores_dict.items():
                # Collect reaction scores
                reaction_score_comparison_dict.setdefault(
                    condition, list()).append(score)

                # Force into string for easier comparison
                str_score = "{0:.2f}".format(score)

                # We skip zero scores when computing the percentiles
                # So we have to check for them here
                condition_pct = 0.00
                if str_score != '0.00':
                    condition_pct = model_conditions_score_pct_dicts[
                        condition][str_score]

                # Collect reaction percentiles, overall and per subsystem
                for category in ['All'] + list(rxn_subsystems):
                    columns = reaction_percentile_comparison_dict.setdefault(
                        category, dict())
                    columns.setdefault(condition,
                                       list()).append(condition_pct)
                    # The id is recorded once per reaction, not per condition
                    category_reactions = columns.setdefault(
                        'reactions', list())
                    if rxn_id not in category_reactions:
                        category_reactions.append(rxn_id)

            self.mh_reactions_ids.append(rxn_id)

        # We set the default values here at the end of the loop because we
        # don't know how many reactions there will be for each category
        for category, columns in reaction_percentile_comparison_dict.items():
            for key in ['color', 'size', 'tooltip', 'fill_alpha']:
                columns[key] = list()

            is_all = (category == 'All')
            for index in range(len(columns[self.conditions_ids[0]])):
                columns['fill_alpha'].append(1.0)

                # format string of subsystems for tooltip
                rxn = columns['reactions'][index]
                ss_string = ", ".join(self._reaction_subsystems(rxn))
                columns['tooltip'].append(rxn + ", " + ss_string)

                columns['color'].append('black' if is_all else 'red')
                columns['size'].append(6 if is_all else 8)

        return [
            reaction_score_comparison_dict,
            reaction_percentile_comparison_dict
        ]

    def _compile_mahalanobis_dist_pvalue(self, data, threshold):
        """Compute squared Mahalanobis distances and chi-squared p-values.

        :param data: mapping with one list of values per condition in
            ``self.conditions_ids``; rows are labelled with
            ``self.mh_reactions_ids``
        :param threshold: rows with a p-value below it are reported as
            outliers
        :returns: ``[data_df, outliers]``; both are indexed by ``reactions``
            and carry ``mahalanobis`` and ``pvalue`` columns
        """
        data_df = pd.DataFrame(data,
                               columns=self.conditions_ids,
                               index=self.mh_reactions_ids)

        # Recipe described here:
        # https://www.machinelearningplus.com/statistics/mahalanobis-distance/

        # Covariance matrix via numpy
        cov_mat = np.cov(data_df.values.T)

        # Inverse covariance matrix via scipy.linalg
        # It won't accept a 1x1 matrix hence the if/else
        if len(self.conditions_ids) > 1:
            inv_cov_mat = sp.linalg.inv(cov_mat)
        else:
            inv_cov_mat = 1 / cov_mat

        # Centre every column on its own mean. DataFrame.mean() is explicit
        # about that, whereas np.mean(DataFrame) collapses to one overall
        # scalar on pandas >= 2.0.
        data_minus_mean = (data_df - data_df.mean()).values
        left_term = np.dot(data_minus_mean, inv_cov_mat)

        # Row-wise dot product; equal to the diagonal of
        # left_term . data_minus_mean^T without building the n x n matrix
        data_df['mahalanobis'] = np.sum(left_term * data_minus_mean, axis=1)

        # chi-squared p-values with one degree of freedom
        # NOTE(review): the degrees of freedom are fixed at 1 regardless of
        # the number of conditions - confirm this is intended.
        data_df['pvalue'] = 1 - sp.stats.chi2.cdf(data_df['mahalanobis'], 1)

        # find the outliers below a given threshold, i.e. p < 0.01
        outliers = data_df.loc[data_df.pvalue < threshold]

        # this is used when you want to just plot the p-values alone
        data_df.index.name = 'reactions'
        outliers.index.name = 'reactions'

        # Need to return the mapping between reactions and the p-values
        return [data_df, outliers]

    def _integrate_abundances(self, model_obj, feature_lookup_dict,
                              expdata_obj, condition_indices):
        """Derive one expression score per model reaction and condition.

        For each reaction and condition the score is the maximum over its
        proteins of the minimum over each protein's subunits of the maximum
        over each subunit's features. Reactions without any score get
        ``float(-sys.maxsize - 1)``.

        Side effects: writes ``output.txt`` (one tab-separated line per
        reaction and condition that has proteins) and ``rxn01486.txt``
        (debug trace, empty unless tracing is switched on) into
        ``self.scratch``, and sets ``self.reactions_ids``.

        :param model_obj: FBAModel object as returned by ``_load_fbamodel``
        :param feature_lookup_dict: maps a feature id to its matrix row
        :param expdata_obj: ExpressionMatrix object
        :param condition_indices: maps a condition id to its matrix column
        :returns: ``(reaction_values_matrix, model_complexes_dict)``
        """
        reaction_values_matrix = list()
        reactions_ids = list()
        model_complexes_dict = dict()
        expression_values = expdata_obj['data']['data']['values']

        # Both files are closed by the with-statement even if a lookup fails
        with open(self.scratch + '/output.txt', 'w') as fh, \
                open(self.scratch + '/rxn01486.txt', 'w') as fh2:
            # Switch on for selected reactions to trace them into fh2
            print_data = False
            for mdlrxn_obj in model_obj['data']['modelreactions']:
                reactions_ids.append(mdlrxn_obj['id'])
                [base_rxn, cpt_id] = mdlrxn_obj['id'].split('_')

                rxndata_row = list()
                for condition in self.conditions_ids:
                    condition_index = condition_indices[condition]

                    # Maximal gene expression for a reaction
                    reaction_score = ['nan', ""]
                    prots_str_list = list()
                    for prt in mdlrxn_obj['modelReactionProteins']:

                        # Minimal gene expression for a complex
                        complex_score = ['nan', ""]
                        subs_str_list = list()
                        for sbnt in prt['modelReactionProteinSubunits']:

                            # Maximal gene expression for a subunit
                            subunit_score = ['nan', ""]
                            ftrs_str_list = list()
                            for feature in sbnt['feature_refs']:
                                # Only the last path element is the id
                                feature = feature.split('/')[-1]
                                ftrs_str_list.append(feature)
                                feature_index = feature_lookup_dict[feature]

                                ftr_score = expression_values[feature_index][
                                    condition_index]

                                if print_data:
                                    fh2.write(mdlrxn_obj['id'] + ':' +
                                              feature + ':' +
                                              str(ftr_score) + '\n')

                                if (subunit_score[0] == 'nan'
                                        or subunit_score[0] < ftr_score):
                                    subunit_score = [ftr_score, feature]

                            if print_data:
                                # write() takes a single string
                                fh2.write(str(subunit_score) + '\n')

                            subs_str_list.append(
                                "(" + ", ".join(ftrs_str_list) + ")")

                            if (subunit_score[0] != 'nan'
                                    and (complex_score[0] == 'nan' or
                                         complex_score[0] > subunit_score[0])):
                                complex_score[0] = subunit_score[0]
                                complex_score[1] = subunit_score[1]

                        if print_data:
                            fh2.write(str(complex_score) + '\n')

                        prots_str_list.append(
                            "[" + ", ".join(subs_str_list) + "]")

                        if (complex_score[0] != 'nan'
                                and (reaction_score[0] == 'nan'
                                     or reaction_score[0] < complex_score[0])):
                            reaction_score[0] = complex_score[0]
                            reaction_score[1] = complex_score[1]

                    if reaction_score[0] == 'nan':
                        reaction_score[0] = float(-sys.maxsize - 1)

                    if print_data:
                        fh2.write(condition + ':' + str(reaction_score[0]) +
                                  '(' + reaction_score[1] + ')\n')

                    # Putting together dict for table; complexes without any
                    # subunit or feature are left out
                    proteins_string = ', '.join(prots_str_list)
                    if (len(prots_str_list) > 0 and proteins_string != "[]"
                            and proteins_string != "[()]"):
                        complex_scores = model_complexes_dict.setdefault(
                            proteins_string, dict()).setdefault(
                                cpt_id, dict()).setdefault(base_rxn, list())

                        fh.write('\t'.join([
                            condition, proteins_string, cpt_id, base_rxn,
                            str(reaction_score[0]), reaction_score[1], '\n'
                        ]))

                        complex_scores.append(reaction_score)

                    rxndata_row.append(reaction_score[0])

                print_data = False

                reaction_values_matrix.append(rxndata_row)

        self.reactions_ids = reactions_ids
        return (reaction_values_matrix, model_complexes_dict)

    def __init__(self, config, ctx, input_params):
        """Set up service clients, selected conditions and reaction data.

        :param config: mapping that provides the ``scratch`` directory
        :param ctx: call context (not used here)
        :param input_params: app parameters; a comma-separated
            ``input_columns`` string is replaced in place by a list
        """
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.dfu = DataFileUtil(self.callback_url)
        self.kbr = KBaseReport(self.callback_url)
        self.scratch = config['scratch']
        self.report_uuid = str(uuid.uuid4())

        # There is a bug in the UI that won't let me collect a
        # a clean list of conditions, so I have to parse them
        # from a comma-separated string. A value that already is a
        # list is used as it is.
        columns = input_params.get("input_columns") \
            if "input_columns" in input_params else None
        if isinstance(columns, str) and columns != "":
            input_params["input_columns"] = columns.split(',')

        self.input_params = input_params

        # set in _load_expression_matrix()
        self.conditions_ids = list()

        # this is an optional parameter, but restricts the
        # number of chosen columns in the matrix
        if ('input_columns' in input_params
                and len(input_params['input_columns']) > 0):
            self.conditions_ids = input_params['input_columns']

        # set in _integrate_abundances()
        self.reactions_ids = list()

        # set in _compile_model_scores_percentiles
        self.mh_reactions_ids = list()

        with open(
                os.path.join("/kb/module/PlantSEED", "Data/PlantSEED_v3",
                             "PlantSEED_Roles.json")) as plsd_fh:
            PS_Roles = json.load(plsd_fh)

        plantseed = FetchPlantSEEDImpl()
        self.reactions_data = plantseed.fetch_reactions(PS_Roles)

    def integrate_abundances_with_metabolism(self):
        """Run the integration, save the ReactionMatrix and build the report.

        :returns: dict with ``report_name`` and ``report_ref``
        :raises ValueError: if a required input parameter is missing
        """
        self._validate_params(
            self.input_params, {
                'input_ws', 'input_fbamodel', 'input_expression_matrix',
                'output_reaction_matrix'
            }, {'input_columns'})

        ##############################################################
        # Load model and expression objects
        ##############################################################
        workspace = self.input_params['input_ws']
        model_ref = workspace + '/' + self.input_params['input_fbamodel']
        model_obj = self._load_fbamodel(model_ref)[0]

        # The columns / conditions_ids are set in this function if not set
        # via user parameter
        expression_ref = workspace + '/' + self.input_params[
            'input_expression_matrix']
        [expdata_obj, _, feature_index,
         condition_index] = self._load_expression_matrix(expression_ref)

        ##############################################################
        # Extract expression abundances for use in first scatter plot
        ##############################################################
        feature_comparison_dict = self._compile_genome_scores(
            expdata_obj['data']['data']['values'], condition_index)

        ####################################################################
        # Actually integrate abundances and build new ReactionMatrix object
        ####################################################################
        (reaction_values_matrix,
         model_complexes_dict) = self._integrate_abundances(
             model_obj, feature_index, expdata_obj, condition_index)

        rxndata_obj = {
            'row_ids': self.reactions_ids,
            'col_ids': self.conditions_ids,
            'values': reaction_values_matrix
        }

        ######################################################################
        # Extract / organize reaction expression scores for use in first and
        # second scatter plot
        ######################################################################
        [reaction_scores_dict, reaction_percentiles_dict
         ] = self._compile_model_scores_percentiles(reaction_values_matrix)

        ######################################################################
        # Multi-variate mahalanobis distances computed along with outliers
        # depending on chi-squared p-value of 0.01
        ######################################################################
        mahal_dist_df = self._compile_mahalanobis_dist_pvalue(
            reaction_percentiles_dict['All'], 0.01)[0]

        ##############################################################
        # Figure generator
        ##############################################################
        # Every subsystem is selectable; 'All' is always drawn
        subsystem_select_list = ["None"] + sorted(
            category for category in reaction_percentiles_dict
            if category != 'All')

        figure_generator = GenerateFigureImpl()
        figure_grid = figure_generator.generate_figure(
            self.conditions_ids,
            category_select=subsystem_select_list,
            genome_features=feature_comparison_dict,
            reaction_scores=reaction_scores_dict,
            reaction_percentiles=reaction_percentiles_dict)

        ##############################################################
        # Finishing and Saving ReactionMatrix
        ##############################################################
        ReactionMatrix_obj = {
            'type': 'KBaseMatrices.ReactionMatrix',
            'name': self.input_params['output_reaction_matrix'],
            'data': {
                'scale': 'raw',
                'description': 'reaction expression score',
                'fbamodel_ref': model_ref,
                'expression_ref': expression_ref,
                'data': rxndata_obj
            }
        }

        ws_id = self.dfu.ws_name_to_id(workspace)
        saved_matrix_dict = self.dfu.save_objects({
            'id': ws_id,
            'objects': [ReactionMatrix_obj]
        })[0]
        saved_matrix_ref = "{}/{}/{}".format(saved_matrix_dict[6],
                                             saved_matrix_dict[0],
                                             saved_matrix_dict[4])
        saved_matrix_desc = "Reaction matrix: " + self.input_params[
            'output_reaction_matrix']

        #####################################################################
        # Building the report with figures, tables, and saved_objects
        # We pass in a dict where each key is a row for the table
        #####################################################################
        output_object_files = [{
            'ref': saved_matrix_ref,
            'description': saved_matrix_desc
        }]

        return self._build_report(figure_grid, model_complexes_dict,
                                  mahal_dist_df, output_object_files,
                                  workspace)
class FeatureSetBuilder:
    """Create FeatureSet and filtered ExpressionMatrix objects plus their reports.

    The public entry points are ``upload_featureset_from_diff_expr``,
    ``filter_matrix_with_fs`` and ``build_feature_set``; everything prefixed
    with an underscore is an implementation detail.
    """

    # Cell values that mark a missing number in the differential expression CSV.
    _NULL_VALUES = frozenset({'NA', 'null', ''})

    # Column order of the intermediate differential expression CSV file.
    _DIFF_EXPR_FIELDNAMES = ('gene_id', 'log2_fold_change', 'p_value', 'q_value')

    def __init__(self, config):
        """Read service URLs / token / scratch dir from ``config`` and build clients."""
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.ws = Workspace(self.ws_url, token=self.token)
        self.dfu = DataFileUtil(self.callback_url)
        self.gsu = GenomeSearchUtil(self.callback_url)
        self.scratch = config['scratch']

    # ------------------------------------------------------------------
    # small helpers
    # ------------------------------------------------------------------

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path

        Does nothing for an empty path or an already existing directory; an
        existing non-directory at ``path`` still raises ``OSError``.
        """
        if not path:
            return
        os.makedirs(path, exist_ok=True)

    def _xor(self, a, b):
        """Return True when exactly one of ``a`` / ``b`` is truthy."""
        return bool(a) != bool(b)

    @staticmethod
    def _split_labels(label_string):
        """Split a comma separated label string into a list of stripped labels."""
        return [x.strip() for x in label_string.split(',')]

    def _resolve_workspace_id(self, workspace_name):
        """Return ``workspace_name`` unchanged if it is already an id, else look it up.

        An ``int`` or an all-digit string is treated as a workspace id; any
        other value is resolved through ``DataFileUtil.ws_name_to_id``.
        """
        if isinstance(workspace_name, int) or workspace_name.isdigit():
            return workspace_name
        return self.dfu.ws_name_to_id(workspace_name)

    @staticmethod
    def _object_ref(object_info):
        """Format a workspace object_info tuple as ``wsid/objid/version``."""
        return "{}/{}/{}".format(object_info[6], object_info[0], object_info[4])

    # ------------------------------------------------------------------
    # parameter validation
    # ------------------------------------------------------------------

    def _validate_upload_featureset_from_diff_expr_params(self, params):
        """
        _validate_upload_featureset_from_diff_expr_params:
                validates params passed to upload_featureset_from_diff_expr method

        Raises ValueError when a required key is missing or when
        ``fold_scale_type`` is supplied with a value other than "logarithm".
        """
        log('start validating upload_featureset_from_diff_expr params')

        # check for required parameters
        for p in ['diff_expression_ref', 'workspace_name',
                  'p_cutoff', 'q_cutoff', 'fold_change_cutoff']:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

        p = params.get('fold_scale_type')
        if p and p != 'logarithm':
            raise ValueError('"fold_scale_type" parameter must be set to "logarithm", if used')

    @staticmethod
    def validate_params(params, expected, opt_param=()):
        """Validates that required parameters are present. Warns if unexpected parameters appear

        :param params: mapping of supplied parameters
        :param expected: iterable of required keys
        :param opt_param: iterable of optional keys (immutable default on purpose)
        :raises ValueError: if any required key is absent from ``params``
        """
        expected = set(expected)
        opt_param = set(opt_param)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError("Required keys {} not in supplied parameters"
                             .format(", ".join(expected - pkeys)))
        defined_param = expected | opt_param
        for param in params:
            if param not in defined_param:
                logging.warning("Unexpected parameter {} supplied".format(param))

    # ------------------------------------------------------------------
    # reporting
    # ------------------------------------------------------------------

    def _generate_report(self, up_feature_set_ref_list, down_feature_set_ref_list,
                         filtered_expression_matrix_ref_list, workspace_name):
        """
        _generate_report: generate summary report

        Returns a dict with ``report_name`` and ``report_ref``.
        """
        log('start creating report')

        output_html_files = self._generate_html_report(up_feature_set_ref_list,
                                                       down_feature_set_ref_list)

        objects_created = list()
        for ref_list, description in (
                (up_feature_set_ref_list, 'Upper FeatureSet Object'),
                (down_feature_set_ref_list, 'Lower FeatureSet Object'),
                (filtered_expression_matrix_ref_list, 'Filtered ExpressionMatrix Object')):
            objects_created += [{'ref': ref, 'description': description}
                                for ref in ref_list]

        report_params = {'message': '',
                         'workspace_name': workspace_name,
                         'objects_created': objects_created,
                         'html_links': output_html_files,
                         'direct_html_link_index': 0,
                         'html_window_height': 333,
                         'report_object_name': 'kb_FeatureSetUtils_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output

    def _feature_set_table_rows(self, feature_set_ref_list):
        """Return one HTML table row (name, feature count) per FeatureSet reference."""
        rows = ''
        for feature_set_ref in feature_set_ref_list:
            feature_set_obj = self.ws.get_objects2(
                {'objects': [{'ref': feature_set_ref}]})['data'][0]
            feature_set_name = feature_set_obj['info'][1]
            elements = feature_set_obj['data'].get('elements')
            rows += '<tr><td>{}</td><td>{}</td></tr>'.format(feature_set_name,
                                                             len(elements.keys()))
        return rows

    def _generate_html_report(self, up_feature_set_ref_list, down_feature_set_ref_list):
        """
        _generate_html_report: generate html summary report

        Fills the ``report_template.html`` that sits next to this module and
        writes the result to ``report.html`` in a fresh scratch sub-directory.
        Returns a one-element list describing that file.
        """
        log('start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'report.html')

        upper_feature_content = self._feature_set_table_rows(up_feature_set_ref_list)
        lower_feature_content = self._feature_set_table_rows(down_feature_set_ref_list)

        # Read the template before opening the output file so a missing
        # template does not leave an empty report.html behind.
        template_path = os.path.join(os.path.dirname(__file__), 'report_template.html')
        with open(template_path, 'r') as report_template_file:
            report_template = report_template_file.read()

        report_template = report_template.replace('<tr><td>Upper_FeatureSet</td></tr>',
                                                  upper_feature_content)
        report_template = report_template.replace('<tr><td>Lower_FeatureSet</td></tr>',
                                                  lower_feature_content)

        with open(result_file_path, 'w') as result_file:
            result_file.write(report_template)

        html_report.append({'path': result_file_path,
                            'name': os.path.basename(result_file_path),
                            'label': os.path.basename(result_file_path),
                            'description': 'HTML summary report'})
        return html_report

    # ------------------------------------------------------------------
    # differential expression processing
    # ------------------------------------------------------------------

    def _process_diff_expression(self, diff_expression_set_ref, result_directory,
                                 condition_label_pair):
        """
        _process_diff_expression: process differential expression object info

        Writes ``gene_results.csv`` into ``result_directory`` containing the
        rows of every set item whose two labels are both in
        ``condition_label_pair``.

        Returns (csv path, genome ref, differential expression matrix ref) of
        the last matching item.
        Raises ValueError when no set item matches the condition pair
        (previously this surfaced as an UnboundLocalError).
        """
        log('start processing differential expression object')

        diff_expr_set_data = self.ws.get_objects2(
            {'objects': [{'ref': diff_expression_set_ref}]})['data'][0]['data']
        set_items = diff_expr_set_data['items']

        diff_expr_matrix_file = os.path.join(result_directory, 'gene_results.csv')

        genome_id = None
        selected_diff_expression_ref = None

        # newline='' is what the csv module asks for on files it writes.
        with open(diff_expr_matrix_file, 'w', newline='') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=list(self._DIFF_EXPR_FIELDNAMES))
            writer.writeheader()

            for set_item in set_items:
                label_list = self._split_labels(set_item['label'])
                condition_1 = label_list[0]
                condition_2 = label_list[1]
                if not (condition_1 in condition_label_pair
                        and condition_2 in condition_label_pair):
                    # Skip before fetching: only matching matrices are needed.
                    continue

                diff_expression_ref = set_item['ref']
                diff_expression_data = self.ws.get_objects2(
                    {'objects': [{'ref': diff_expression_ref}]})['data'][0]['data']

                genome_id = diff_expression_data['genome_ref']
                matrix_data = diff_expression_data['data']
                selected_diff_expression_ref = diff_expression_ref

                row_ids = matrix_data.get('row_ids')
                row_values = matrix_data.get('values')
                for pos, row_id in enumerate(row_ids):
                    row_value = row_values[pos]
                    writer.writerow({'gene_id': row_id,
                                     'log2_fold_change': row_value[0],
                                     'p_value': row_value[1],
                                     'q_value': row_value[2]})

        if selected_diff_expression_ref is None:
            raise ValueError(
                'No differential expression matrix found for condition pair: {}'.format(
                    condition_label_pair))

        return diff_expr_matrix_file, genome_id, selected_diff_expression_ref

    def _generate_feature_set(self, feature_ids, genome_id, workspace_name, feature_set_name):
        """
        _generate_feature_set: generate FeatureSet object

        KBaseCollections.FeatureSet type:
        typedef structure {
            string description;
            list<feature_id> element_ordering;
            mapping<feature_id, list<genome_ref>> elements;
        } FeatureSet;

        Returns the reference of the saved object as "wsid/objid/version".
        """
        log('start saving KBaseCollections.FeatureSet object')

        workspace_id = self._resolve_workspace_id(workspace_name)

        elements = {feature_id: [genome_id] for feature_id in feature_ids}
        feature_set_data = {'description': 'Generated FeatureSet from DifferentialExpression',
                            'element_ordering': feature_ids,
                            'elements': elements}

        object_type = 'KBaseCollections.FeatureSet'
        save_object_params = {
            'id': workspace_id,
            'objects': [{'type': object_type,
                         'data': feature_set_data,
                         'name': feature_set_name}]}

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        return self._object_ref(dfu_oi)

    def _process_matrix_file(self, diff_expr_matrix_file, comp_p_value, comp_q_value,
                             comp_fold_change_cutoff):
        """
        _process_matrix_file: filter matrix file by given cutoffs

        A row is "up" when p <= comp_p_value, q <= comp_q_value and
        log2 fold change >= |cutoff|; it is "down" when the fold change is
        <= -|cutoff|. Rows with a missing ('NA', 'null', '') value are ignored.

        Returns (up_feature_ids, down_feature_ids), each de-duplicated and in
        first-seen file order.
        """
        log('start processing matrix file')

        # Accept numeric strings as well as numbers for the cutoffs.
        comp_p_value = float(comp_p_value)
        comp_q_value = float(comp_q_value)
        comp_fold_change_cutoff = abs(float(comp_fold_change_cutoff))

        up_feature_ids = []
        down_feature_ids = []

        with open(diff_expr_matrix_file, 'r', newline='') as file:
            reader = csv.DictReader(file)

            for row in reader:
                row_p_value = row['p_value']
                row_q_value = row['q_value']
                row_fold_change = row['log2_fold_change']

                if self._NULL_VALUES.intersection((row_p_value, row_q_value,
                                                   row_fold_change)):
                    continue

                # Written as "not (<= and <=)" so NaN values fail the test,
                # and the fold change is only parsed for significant rows.
                if not (float(row_p_value) <= comp_p_value
                        and float(row_q_value) <= comp_q_value):
                    continue

                fold_change = float(row_fold_change)
                if fold_change >= comp_fold_change_cutoff:
                    up_feature_ids.append(row['gene_id'])
                elif fold_change <= -comp_fold_change_cutoff:
                    down_feature_ids.append(row['gene_id'])

        # dict.fromkeys de-duplicates while keeping a deterministic order.
        return list(dict.fromkeys(up_feature_ids)), list(dict.fromkeys(down_feature_ids))

    def _filter_expression_matrix(self, expression_matrix_ref, feature_ids,
                                  workspace_name, filtered_expression_matrix_suffix="",
                                  diff_expression_matrix_ref=None,
                                  filtered_expression_matrix_name=None):
        """
        _filter_expression_matrix: generated filtered expression matrix

        Keeps only the rows of the source matrix whose id is in
        ``feature_ids`` and saves the result under the source object's type.
        When no explicit name is given, an "expression matrix" fragment in the
        source name is replaced by the suffix, otherwise the suffix is appended.

        Returns the reference of the saved object as "wsid/objid/version".
        """
        log('start saving ExpressionMatrix object')

        workspace_id = self._resolve_workspace_id(workspace_name)

        expression_matrix_obj = self.dfu.get_objects(
            {'object_refs': [expression_matrix_ref]})['data'][0]

        expression_matrix_info = expression_matrix_obj['info']
        expression_matrix_data = expression_matrix_obj['data']
        expression_matrix_name = expression_matrix_info[1]

        if not filtered_expression_matrix_name:
            if re.match(r'.*_*[Ee]xpression_*[Mm]atrix', expression_matrix_name):
                filtered_expression_matrix_name = re.sub(r'_*[Ee]xpression_*[Mm]atrix',
                                                         filtered_expression_matrix_suffix,
                                                         expression_matrix_name)
            else:
                filtered_expression_matrix_name = expression_matrix_name + \
                    filtered_expression_matrix_suffix

        filtered_expression_matrix_data = expression_matrix_data.copy()

        data = filtered_expression_matrix_data['data']
        row_ids = data['row_ids']
        values = data['values']

        # Set lookup keeps the row scan linear even for long id lists.
        wanted_ids = set(feature_ids)
        kept_positions = [pos for pos, row_id in enumerate(row_ids) if row_id in wanted_ids]

        filtered_data = data.copy()
        filtered_data['row_ids'] = [row_ids[pos] for pos in kept_positions]
        filtered_data['values'] = [values[pos] for pos in kept_positions]
        filtered_expression_matrix_data['data'] = filtered_data

        expression_obj = {'type': expression_matrix_info[2],
                          'data': filtered_expression_matrix_data,
                          'name': filtered_expression_matrix_name}
        # we now save the filtering DEM in a EM field added for this purpose
        if diff_expression_matrix_ref:
            expression_obj['data']['diff_expr_matrix_ref'] = diff_expression_matrix_ref
            expression_obj['extra_provenance_input_refs'] = [diff_expression_matrix_ref]

        save_object_params = {
            'id': workspace_id,
            'objects': [expression_obj]}

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        return self._object_ref(dfu_oi)

    # ------------------------------------------------------------------
    # condition labels
    # ------------------------------------------------------------------

    def _check_input_labels(self, condition_pairs, available_condition_labels):
        """
        _check_input_labels: check input condition pairs

        Returns True when every pair names two different, available labels;
        raises ValueError otherwise.
        """
        checked = True
        for condition_pair in condition_pairs:

            label_string = condition_pair['label_string'][0].strip()
            label_list = self._split_labels(label_string)
            first_label = label_list[0]
            second_label = label_list[1]

            for label in (first_label, second_label):
                if label not in available_condition_labels:
                    error_msg = 'Condition: {} is not availalbe. '.format(label)
                    error_msg += 'Available conditions: {}'.format(available_condition_labels)
                    raise ValueError(error_msg)

            if first_label == second_label:
                raise ValueError('Input conditions are the same')

        return checked

    def _get_condition_labels(self, diff_expression_set_ref):
        """
        _get_condition_labels: get all possible condition label pairs

        Returns (list of label lists, one per set item; set of all labels seen).
        """
        log('getting all possible condition pairs')

        condition_label_pairs = list()
        available_condition_labels = set()
        diff_expression_set_obj = self.ws.get_objects2(
            {'objects': [{'ref': diff_expression_set_ref}]})['data'][0]
        diff_expression_set_data = diff_expression_set_obj['data']
        items = diff_expression_set_data.get('items')
        for item in items:
            label_list = self._split_labels(item['label'])
            condition_label_pairs.append(label_list)
            available_condition_labels |= set(label_list)

        log('all possible condition pairs:\n{}'.format(condition_label_pairs))

        return condition_label_pairs, available_condition_labels

    # ------------------------------------------------------------------
    # feature set construction
    # ------------------------------------------------------------------

    def _get_feature_ids(self, genome_ref, ids):
        """
        _get_feature_ids: get feature ids from genome

        Returns the set of feature ids the genome search reports for ``ids``.
        """
        genome_features = self.gsu.search({'ref': genome_ref,
                                           'limit': len(ids),
                                           'structured_query': {"$or": [{"feature_id": x}
                                                                        for x in ids]},
                                           'sort_by': [['feature_id', True]]})['features']

        features_ids = set(feature.get('feature_id') for feature in genome_features)

        return features_ids

    def _build_fs_obj(self, params):
        """Assemble FeatureSet data from base feature sets and/or explicit feature ids.

        ``genome`` is only needed when ``feature_ids`` / ``feature_ids_custom``
        are supplied; merging base feature sets alone works without it.

        :raises ValueError: if feature ids are given without a genome, or a
            feature id is not found in the supplied genome
        """
        new_feature_set = {
            'description': '',
            'element_ordering': [],
            'elements': {}
        }
        elements = new_feature_set['elements']
        genome_ref = params.get('genome')

        if params.get('base_feature_sets', []) and None not in params['base_feature_sets']:
            base_feature_sets = self.dfu.get_objects(
                {'object_refs': params['base_feature_sets']}
            )['data']
            for ret in base_feature_sets:
                base_set = ret['data']
                base_set_name = ret['info'][1]

                new_feature_set['element_ordering'] += [x for x in base_set['element_ordering']
                                                        if x not in elements]
                for element, genome_refs in base_set['elements'].items():
                    if element in elements:
                        elements[element] += [x for x in genome_refs
                                              if x not in elements[element]]
                    else:
                        # Copy so later appends never touch the fetched object.
                        elements[element] = list(genome_refs)
                new_feature_set['description'] += 'From FeatureSet {}: {}\n'.format(
                    base_set_name, base_set.get('description'))

        new_feature_ids = []
        if params.get('feature_ids'):
            if isinstance(params['feature_ids'], str):
                new_feature_ids += params['feature_ids'].split(',')
            else:
                new_feature_ids += params['feature_ids']
        if params.get('feature_ids_custom'):
            new_feature_ids += params['feature_ids_custom'].split(',')

        if new_feature_ids:
            if not genome_ref:
                raise ValueError(
                    'A "genome" parameter is required when feature ids are supplied')
            genome_feature_ids = self._get_feature_ids(genome_ref, new_feature_ids)
        for new_feature in new_feature_ids:
            if new_feature not in genome_feature_ids:
                raise ValueError('Feature ID {} does not exist in the supplied genome {}'.format(
                    new_feature, genome_ref))
            if new_feature in elements:
                if genome_ref not in elements[new_feature]:
                    elements[new_feature].append(genome_ref)
            else:
                elements[new_feature] = [genome_ref]
                new_feature_set['element_ordering'].append(new_feature)

        if params.get('description'):
            new_feature_set['description'] = params['description']

        return new_feature_set

    # ------------------------------------------------------------------
    # public entry points
    # ------------------------------------------------------------------

    def upload_featureset_from_diff_expr(self, params):
        """
        upload_featureset_from_diff_expr: create FeatureSet from RNASeqDifferentialExpression
                                          based on given threshold cutoffs

        required params:
        diff_expression_ref: DifferetialExpressionMatrixSet object reference
        p_cutoff: p value cutoff
        q_cutoff: q value cutoff
        fold_change_cutoff: fold change cutoff
        workspace_name: the name of the workspace it gets saved to

        exactly one of:
        run_all_combinations: process every condition pair found in the set
        condition_pairs: list of {'label_string': ['cond1, cond2']} selections

        optional params:
        expression_matrix_ref: ExpressionMatrix object reference; when given, a
                               filtered ExpressionMatrix is saved per condition pair
        fold_scale_type: must be "logarithm" if supplied
        feature_set_suffix: Result FeatureSet object name suffix
        filtered_expression_matrix_suffix: Result ExpressionMatrix object name suffix

        return:
        result_directory: folder path that holds all files generated
        up_feature_set_ref_list: list of generated upper FeatureSet object reference
        down_feature_set_ref_list: list of generated down FeatureSet object reference
        filtered_expression_matrix_ref_list: list of generated filtered ExpressionMatrix object ref
        report_name: report name generated by KBaseReport
        report_ref: report reference generated by KBaseReport
        """

        self._validate_upload_featureset_from_diff_expr_params(params)

        diff_expression_set_ref = params.get('diff_expression_ref')
        diff_expression_set_info = self.ws.get_object_info3(
            {"objects": [{"ref": diff_expression_set_ref}]})['infos'][0]
        diff_expression_set_name = diff_expression_set_info[1]

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)

        (available_condition_label_pairs,
         available_condition_labels) = self._get_condition_labels(diff_expression_set_ref)

        run_all_combinations = params.get('run_all_combinations')
        condition_pairs = params.get('condition_pairs')
        if not self._xor(run_all_combinations, condition_pairs):
            error_msg = "Invalid input:\nselect 'Run All Paired Condition Combinations' "
            error_msg += "or provide partial condition pairs. Don't do both or neither"
            raise ValueError(error_msg)

        if run_all_combinations:
            condition_label_pairs = available_condition_label_pairs
        else:
            # Raises ValueError on any unknown or duplicated label.
            self._check_input_labels(condition_pairs, available_condition_labels)
            condition_label_pairs = [
                self._split_labels(condition_pair['label_string'][0].strip())
                for condition_pair in condition_pairs]

        workspace_name = params.get('workspace_name')
        expression_matrix_ref = params.get('expression_matrix_ref')
        feature_set_suffix = params.get('feature_set_suffix', "")
        # A missing suffix used to crash with "str + None"; treat it as empty.
        matrix_suffix = params.get('filtered_expression_matrix_suffix') or ''

        up_feature_set_ref_list = list()
        down_feature_set_ref_list = list()
        filtered_expression_matrix_ref_list = list()

        for condition_label_pair in condition_label_pairs:
            condition_string = '-'.join(reversed(condition_label_pair))
            diff_expr_matrix_file, genome_id, diff_expr_matrix_ref = \
                self._process_diff_expression(diff_expression_set_ref,
                                              result_directory,
                                              condition_label_pair)
            up_feature_ids, down_feature_ids = self._process_matrix_file(
                diff_expr_matrix_file,
                params.get('p_cutoff'),
                params.get('q_cutoff'),
                params.get('fold_change_cutoff'))

            if expression_matrix_ref:
                filtered_em_name = _sanitize_name(condition_string) + matrix_suffix
                filtered_expression_matrix_ref = self._filter_expression_matrix(
                    expression_matrix_ref,
                    up_feature_ids + down_feature_ids,
                    workspace_name, "",
                    diff_expr_matrix_ref,
                    filtered_em_name)
                filtered_expression_matrix_ref_list.append(filtered_expression_matrix_ref)

            up_feature_set_name = "{}_{}_up{}".format(
                diff_expression_set_name, _sanitize_name(condition_string), feature_set_suffix)
            up_feature_set_ref = self._generate_feature_set(up_feature_ids,
                                                            genome_id,
                                                            workspace_name,
                                                            up_feature_set_name)
            up_feature_set_ref_list.append(up_feature_set_ref)

            down_feature_set_name = "{}_{}_down{}".format(
                diff_expression_set_name, _sanitize_name(condition_string), feature_set_suffix)
            down_feature_set_ref = self._generate_feature_set(down_feature_ids,
                                                              genome_id,
                                                              workspace_name,
                                                              down_feature_set_name)
            down_feature_set_ref_list.append(down_feature_set_ref)

        returnVal = {'result_directory': result_directory,
                     'up_feature_set_ref_list': up_feature_set_ref_list,
                     'down_feature_set_ref_list': down_feature_set_ref_list,
                     'filtered_expression_matrix_ref_list': filtered_expression_matrix_ref_list}

        report_output = self._generate_report(up_feature_set_ref_list,
                                              down_feature_set_ref_list,
                                              filtered_expression_matrix_ref_list,
                                              workspace_name)
        returnVal.update(report_output)

        return returnVal

    def filter_matrix_with_fs(self, params):
        """Save a copy of an ExpressionMatrix restricted to a FeatureSet's feature ids.

        Required params: feature_set_ref, workspace_name, expression_matrix_ref,
        filtered_expression_matrix_suffix.

        Returns a dict with filtered_expression_matrix_ref, report_name, report_ref.
        """
        self.validate_params(params, ('feature_set_ref', 'workspace_name',
                                      'expression_matrix_ref',
                                      'filtered_expression_matrix_suffix'))
        ret = self.dfu.get_objects(
            {'object_refs': [params['feature_set_ref']]}
        )['data'][0]
        feature_set = ret['data']
        feature_set_name = ret['info'][1]
        feature_ids = set(feature_set['elements'].keys())
        filtered_matrix_ref = self._filter_expression_matrix(
            params['expression_matrix_ref'], feature_ids, params['workspace_name'],
            params['filtered_expression_matrix_suffix'])

        objects_created = [{'ref': filtered_matrix_ref,
                            'description': 'Filtered ExpressionMatrix Object'}]
        message = "Filtered Expression Matrix based of the {} feature ids present in {}"\
            .format(len(feature_ids), feature_set_name)

        report_params = {'message': message,
                         'workspace_name': params['workspace_name'],
                         'objects_created': objects_created,
                         'report_object_name': 'kb_FeatureSetUtils_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        return {'filtered_expression_matrix_ref': filtered_matrix_ref,
                'report_name': output['name'], 'report_ref': output['ref']}

    def build_feature_set(self, params):
        """Create and save a FeatureSet from feature ids and/or existing FeatureSets.

        Required params: output_feature_set, workspace_name.
        Optional params: genome, feature_ids, feature_ids_custom,
        base_feature_sets, description. At least one of the three feature
        sources must be non-empty.

        Returns a dict with feature_set_ref, report_name, report_ref.
        """
        self.validate_params(params, {'output_feature_set', 'workspace_name', },
                             {'genome', 'feature_ids', 'feature_ids_custom', 'base_feature_sets',
                              'description'})
        feature_sources = ('feature_ids', 'feature_ids_custom', 'base_feature_sets')
        if not any(params.get(x) for x in feature_sources):
            raise ValueError("You must supply at least one feature source: {}".format(
                ", ".join(feature_sources)))
        workspace_id = self.dfu.ws_name_to_id(params['workspace_name'])

        new_feature_set = self._build_fs_obj(params)
        save_object_params = {
            'id': workspace_id,
            'objects': [{'type': 'KBaseCollections.FeatureSet',
                         'data': new_feature_set,
                         'name': params['output_feature_set']}]}

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        feature_set_obj_ref = '{}/{}/{}'.format(dfu_oi[6], dfu_oi[0], dfu_oi[4])

        objects_created = [{'ref': feature_set_obj_ref,
                            'description': 'Feature Set'}]
        message = 'A new feature set containing {} features was created.'.format(
            len(new_feature_set['elements']))

        report_params = {'message': message,
                         'workspace_name': params['workspace_name'],
                         'objects_created': objects_created,
                         'report_object_name': 'kb_FeatureSetUtils_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        return {'feature_set_ref': feature_set_obj_ref,
                'report_name': output['name'], 'report_ref': output['ref']}
def test_update_taxon_assignments_valid(self):
    """
    Exercise add, overwrite and removal of a taxon assignment through
    update_taxon_assignments on a copy of the reference object.
    """
    taxon_key, taxon_val, taxon_val_new = (str(uuid4()) for _ in range(3))

    # Pull the source object so it can be copied into the test workspace
    dfu = DataFileUtil(self.callbackURL)
    source = dfu.get_objects(
        {'object_refs': [f"{_WORKSPACE_NAME}/{_OBJECT_NAME}"]})['data'][0]
    obj_data = source['data']

    # Swap the genbank handle for one owned by the test user
    handle_service = HandleService(self.handleURL)
    old_handle = handle_service.hids_to_handles([obj_data['genbank_handle_ref']])[0]
    owned_node = dfu.own_shock_node({'shock_id': old_handle['id'], 'make_handle': 1})
    obj_data['genbank_handle_ref'] = owned_node['handle']['hid']

    # Store the copy in the test workspace
    saved_info = dfu.save_objects({
        'id': dfu.ws_name_to_id(self.wsName),
        'objects': [{
            'type': source['info'][2],
            'data': obj_data,
            'name': 'GCF_002287175.1'
        }]
    })[0]
    new_ws_id = saved_info[6]
    new_obj_id = saved_info[0]
    selector = {
        'wsid': new_ws_id,
        'objid': new_obj_id,
        'included': ['/taxon_assignments']
    }

    def apply_update(**changes):
        # One service call against the copied object
        request = {'workspace_id': new_ws_id, 'object_id': new_obj_id}
        request.update(changes)
        self.serviceImpl.update_taxon_assignments(self.ctx, request)

    def current_assignments():
        # Re-read only the taxon_assignments subtree of the copied object
        fetched = self.wsClient.get_objects2({'objects': [selector]})
        return fetched['data'][0]['data']['taxon_assignments']

    # Add a new assignment and check the mapping
    apply_update(taxon_assignments={taxon_key: taxon_val})
    assignments = current_assignments()
    self.assertTrue(taxon_key in assignments)
    self.assertEqual(assignments[taxon_key], taxon_val)

    # Overwrite the assignment that was just added
    apply_update(taxon_assignments={taxon_key: taxon_val_new})
    assignments = current_assignments()
    self.assertTrue(taxon_key in assignments)
    self.assertEqual(assignments[taxon_key], taxon_val_new)

    # Remove the assignment again
    apply_update(remove_assignments=[taxon_key])
    assignments = current_assignments()
    self.assertTrue(taxon_key not in assignments)
    self.assertEqual(assignments.get(taxon_key), None)
class sample_uploader: ''' Module Name: sample_uploader Module Description: A KBase module: sample_uploader ''' ######## WARNING FOR GEVENT USERS ####### noqa # Since asynchronous IO can lead to methods - even the same method - # interrupting each other, you must be *very* careful when using global # state. A method could easily clobber the state set by another while # the latter method is running. ######################################### noqa VERSION = "0.0.14" GIT_URL = "[email protected]:Tianhao-Gu/sample_uploader.git" GIT_COMMIT_HASH = "fddb10ca67368def8437569f8157b71b59f41e1c" #BEGIN_CLASS_HEADER #END_CLASS_HEADER # config contains contents of config file in a hash or None if it couldn't # be found def __init__(self, config): #BEGIN_CONSTRUCTOR self.callback_url = os.environ['SDK_CALLBACK_URL'] self.workspace_url = config['workspace-url'] self.scratch = config['scratch'] # janky, but works for now self.sw_url = config.get('kbase-endpoint') + '/service_wizard' self.dfu = DataFileUtil(url=self.callback_url) logging.basicConfig(format='%(created)s %(levelname)s: %(message)s', level=logging.INFO) #END_CONSTRUCTOR pass def import_samples(self, ctx, params): """ :param params: instance of type "ImportSampleInputs" -> structure: parameter "sample_set_ref" of String, parameter "sample_file" of String, parameter "workspace_name" of String, parameter "workspace_id" of Long, parameter "file_format" of String, parameter "description" of String, parameter "set_name" of String, parameter "header_row_index" of Long, parameter "id_field" of String, parameter "output_format" of String, parameter "taxonomy_source" of String, parameter "num_otus" of Long, parameter "incl_seq" of Long, parameter "otu_prefix" of String, parameter "share_within_workspace" of Long, parameter "prevalidate" of Long, parameter "incl_input_in_output" of Long :returns: instance of type "ImportSampleOutputs" -> structure: parameter "report_name" of String, parameter "report_ref" of String, parameter 
"sample_set" of type "SampleSet" -> structure: parameter "samples" of list of type "sample_info" -> structure: parameter "id" of type "sample_id", parameter "name" of String, parameter "description" of String, parameter "sample_set_ref" of String """ # ctx is the context object # return variables are: output #BEGIN import_samples print(f"Beginning sample import with following parameters:") print(f"params -- {params}") sample_set = {"samples": []} # Check if we have an existing Sample Set as input # if so, download if params.get('sample_set_ref'): ret = self.dfu.get_objects( {'object_refs': [params['sample_set_ref']]})['data'][0] sample_set = ret['data'] set_name = ret['info'][1] save_ws_id = params['sample_set_ref'].split('/')[0] else: if not params.get('set_name'): raise ValueError( f"Sample set name required, when new SampleSet object is created." ) set_name = params['set_name'] save_ws_id = params.get('workspace_id') if params.get('header_row_index'): header_row_index = int(params["header_row_index"]) - 1 else: header_row_index = 0 if params.get('file_format') == "SESAR": header_row_index = 1 username = ctx['user_id'] if params.get('file_format') == 'ENIGMA': # ENIGMA_mappings['verification_mapping'].update( # {key: ("is_string", []) for key in ENIGMA_mappings['basic_columns']} # ) sample_set, errors = import_samples_from_file( params, self.sw_url, self.workspace_url, username, ctx['token'], ENIGMA_mappings['column_mapping'], ENIGMA_mappings.get('groups', []), ENIGMA_mappings['date_columns'], ENIGMA_mappings.get('column_unit_regex', []), sample_set, header_row_index) elif params.get('file_format') == 'SESAR': # SESAR_mappings['verification_mapping'].update( # {key: ("is_string", []) for key in SESAR_mappings['basic_columns']} # ) sample_set, errors = import_samples_from_file( params, self.sw_url, self.workspace_url, username, ctx['token'], SESAR_mappings['column_mapping'], SESAR_mappings.get('groups', []), SESAR_mappings['date_columns'], 
SESAR_mappings.get('column_unit_regex', []), sample_set, header_row_index) elif params.get('file_format') == 'KBASE': sample_set, errors = import_samples_from_file( params, self.sw_url, self.workspace_url, username, ctx['token'], {}, [], [], [], sample_set, header_row_index) else: raise ValueError( f"Only SESAR and ENIGMA formats are currently supported for importing samples. " "File of format {params.get('file_format')} not supported.") file_links = [] sample_set_ref = None html_link = None if errors: # create UI to display the errors clearly html_link = _error_ui(errors, self.scratch) else: # only save object if there are no errors obj_info = self.dfu.save_objects({ 'id': save_ws_id, 'objects': [{ "name": set_name, "type": "KBaseSets.SampleSet", "data": sample_set }] })[0] sample_set_ref = '/'.join( [str(obj_info[6]), str(obj_info[0]), str(obj_info[4])]) sample_file_name = os.path.basename( params['sample_file']).split('.')[0] + '_OTU' # -- Format outputs below -- # if output file format specified, add one to output if params.get('output_format') in ['csv', 'xls']: otu_path = sample_set_to_OTU_sheet(sample_set, sample_file_name, self.scratch, params) file_links.append({ 'path': otu_path, 'name': os.path.basename(otu_path), 'label': "OTU template file", 'description': "file with each column containing the assigned sample_id and sample " "name of each saved sample. Intended for uploading OTU data." 
}) if params.get('incl_input_in_output'): sample_file = params.get('sample_file') if not os.path.isfile(sample_file): # try prepending '/staging/' to file and check then if os.path.isfile(os.path.join('/staging', sample_file)): sample_file = os.path.join('/staging', sample_file) else: raise ValueError( f"input file {sample_file} does not exist.") sample_file_copy = os.path.join(self.scratch, os.path.basename(sample_file)) shutil.copy(sample_file, sample_file_copy) file_links.append({ "path": sample_file_copy, "name": os.path.basename(sample_file_copy), "label": "Input Sample file", "description": "Input file provided to create the sample set." }) # create report report_client = KBaseReport(self.callback_url) report_data = { 'report_object_name': "SampleSet_import_report_" + str(uuid.uuid4()), 'workspace_name': params['workspace_name'] } if file_links: report_data['file_links'] = file_links if sample_set_ref: report_data[ 'message'] = f"SampleSet object named \"{set_name}\" imported." report_data['objects_created'] = [{'ref': sample_set_ref}] if html_link: report_data['html_links'] = [{ 'path': html_link, 'name': 'index.html', 'description': 'Sample Set Import Error ui' }] report_data['direct_html_link_index'] = 0 report_info = report_client.create_extended_report(report_data) output = { 'report_ref': report_info['ref'], 'report_name': report_info['name'], 'sample_set': sample_set, 'sample_set_ref': sample_set_ref, 'errors': errors } #END import_samples # At some point might do deeper type checking... 
if not isinstance(output, dict): raise ValueError('Method import_samples return value ' + 'output is not type dict as required.') # return the results return [output] def import_samples_from_IGSN(self, ctx, params): """ :param params: instance of type "ImportSampleIGSNInputs" -> structure: parameter "sample_set_ref" of String, parameter "igsns" of list of String, parameter "workspace_name" of String, parameter "workspace_id" of Long, parameter "description" of String, parameter "set_name" of String, parameter "output_format" of String, parameter "taxonomy_source" of String, parameter "num_otus" of Long, parameter "incl_seq" of Long, parameter "otu_prefix" of String, parameter "share_within_workspace" of Long, parameter "prevalidate" of Long, parameter "incl_input_in_output" of Long :returns: instance of type "ImportSampleOutputs" -> structure: parameter "report_name" of String, parameter "report_ref" of String, parameter "sample_set" of type "SampleSet" -> structure: parameter "samples" of list of type "sample_info" -> structure: parameter "id" of type "sample_id", parameter "name" of String, parameter "description" of String, parameter "sample_set_ref" of String """ # ctx is the context object # return variables are: output #BEGIN import_samples_from_IGSN igsns = params.get('igsns') if not igsns: raise ValueError('Please provide IGSNs') if isinstance(igsns, str): if igsns.isalnum(): # single igsn given e.g. 'IEAWH0001' igsns = [igsns] else: # multiple igsn given e.g. 
'IEAWH0001, GEE0000O4' or 'IEAWH0001; GEE0000O4' delimiter = csv.Sniffer().sniff(igsns).delimiter igsns = [x.strip() for x in igsns.split(delimiter)] logging.info('Start importing samples from IGSNs: {}'.format(igsns)) sample_file_name = 'isgn_sample_{}.csv'.format(str(uuid.uuid4())) sample_file_dir = os.path.join(self.scratch, str(uuid.uuid4())) os.makedirs(sample_file_dir) sample_file = os.path.join(sample_file_dir, sample_file_name) igsns_to_csv(igsns, sample_file) params['sample_file'] = sample_file params['file_format'] = 'SESAR' output = self.import_samples(ctx, params)[0] #END import_samples_from_IGSN # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError('Method import_samples_from_IGSN return value ' + 'output is not type dict as required.') # return the results return [output] def generate_OTU_sheet(self, ctx, params): """ :param params: instance of type "GenerateOTUSheetParams" (Generate a customized OTU worksheet using a SampleSet input to generate the appropriate columns.) 
-> structure: parameter "workspace_name" of String, parameter "workspace_id" of Long, parameter "sample_set_ref" of String, parameter "output_name" of String, parameter "output_format" of String, parameter "num_otus" of Long, parameter "taxonomy_source" of String, parameter "incl_seq" of Long, parameter "otu_prefix" of String :returns: instance of type "GenerateOTUSheetOutputs" -> structure: parameter "report_name" of String, parameter "report_ref" of String """ # ctx is the context object # return variables are: output #BEGIN generate_OTU_sheet # first we download sampleset sample_set_ref = params.get('sample_set_ref') ret = self.dfu.get_objects({'object_refs': [sample_set_ref]})['data'][0] sample_set = ret['data'] if params.get('output_name'): output_name = params.get('output_name') else: # if output_name not specified use name of sample_set as output + "_OTUs" output_name = ret['info'][1] + "_OTUs" otu_path = sample_set_to_OTU_sheet(sample_set, output_name, self.scratch, params) report_client = KBaseReport(self.callback_url) report_name = "Generate_OTU_sheet_report_" + str(uuid.uuid4()) report_info = report_client.create_extended_report({ 'file_links': [{ 'path': otu_path, 'name': os.path.basename(otu_path), 'label': "CSV with headers for OTU", 'description': "CSV file with each column containing the assigned sample_id and sample " "name of each saved sample. Intended for uploading OTU data." }], 'report_object_name': report_name, 'workspace_name': params['workspace_name'] }) output = { 'report_ref': report_info['ref'], 'report_name': report_info['name'], } #END generate_OTU_sheet # At some point might do deeper type checking... 
if not isinstance(output, dict): raise ValueError('Method generate_OTU_sheet return value ' + 'output is not type dict as required.') # return the results return [output] def update_sample_set_acls(self, ctx, params): """ :param params: instance of type "update_sample_set_acls_params" -> structure: parameter "workspace_name" of String, parameter "workspace_id" of Long, parameter "sample_set_ref" of String, parameter "new_users" of list of String, parameter "is_reader" of Long, parameter "is_writer" of Long, parameter "is_admin" of Long, parameter "share_within_workspace" of Long :returns: instance of type "update_sample_set_acls_output" -> structure: parameter "status" of String """ # ctx is the context object # return variables are: output #BEGIN update_sample_set_acls # first get sample_set object sample_set_ref = params.get('sample_set_ref') ret = self.dfu.get_objects({'object_refs': [sample_set_ref]})['data'][0] sample_set = ret['data'] sample_url = get_sample_service_url(self.sw_url) acls = {'read': [], 'write': [], 'admin': []} if params.get('share_within_workspace'): acls = get_workspace_user_perms(self.workspace_url, params.get('workspace_id'), ctx['token'], ctx['user_id'], acls) for new_user in params.get('new_users'): if params.get('is_admin'): acls['admin'].append(new_user) elif params.get('is_writer'): acls['write'].append(new_user) elif params.get('is_reader'): acls['read'].append(new_user) for sample in sample_set['samples']: sample_id = sample['id'] status = update_acls(sample_url, sample_id, acls, ctx['token']) output = {"status": status} #END update_sample_set_acls # At some point might do deeper type checking... 
if not isinstance(output, dict): raise ValueError('Method update_sample_set_acls return value ' + 'output is not type dict as required.') # return the results return [output] def export_samples(self, ctx, params): """ :param params: instance of type "ExportParams" (export function for samples) -> structure: parameter "input_ref" of String, parameter "file_format" of String :returns: instance of type "ExportOutput" -> structure: parameter "shock_id" of String """ # ctx is the context object # return variables are: output #BEGIN export_samples if not params.get('input_ref'): raise ValueError(f"variable input_ref required") sample_set_ref = params.get('input_ref') output_file_format = params.get('file_format', 'SESAR') ret = self.dfu.get_objects({'object_refs': [sample_set_ref]})['data'][0] sample_set = ret['data'] sample_set_name = ret['info'][1] sample_url = get_sample_service_url(self.sw_url) export_package_dir = os.path.join(self.scratch, "output") if not os.path.isdir(export_package_dir): os.mkdir(export_package_dir) output_file = os.path.join(export_package_dir, '_'.join(sample_set_name.split()) + ".csv") sample_set_to_output(sample_set, sample_url, ctx['token'], output_file, output_file_format) # package it up package_details = self.dfu.package_for_download({ 'file_path': export_package_dir, 'ws_refs': [params['input_ref']] }) output = { 'shock_id': package_details['shock_id'], 'result_dir': export_package_dir } #END export_samples # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError('Method export_samples return value ' + 'output is not type dict as required.') # return the results return [output] def link_reads(self, ctx, params): """ :param params: instance of type "LinkReadsParams" -> structure: parameter "workspace_name" of String, parameter "workspace_id" of String, parameter "sample_set_ref" of String, parameter "links" of list of type "ReadsLink" (Create links between samples and reads objects.) 
-> structure: parameter "sample_name" of String, parameter "reads_ref" of String :returns: instance of type "LinkReadsOutput" -> structure: parameter "report_name" of String, parameter "report_ref" of String, parameter "links" of list of unspecified object """ # ctx is the context object # return variables are: output #BEGIN link_reads logging.info(params) ss = SampleService(self.sw_url, service_ver='dev') sample_set_ref = params['sample_set_ref'] sample_set_obj = self.dfu.get_objects( {'object_refs': [sample_set_ref]})['data'][0]['data'] sample_name_2_info = {d['name']: d for d in sample_set_obj['samples']} links = [(d['sample_name'][0], d['reads_ref']) for d in params['links']] new_data_links = [] for sample_name, reads_ref in links: sample_id = sample_name_2_info[sample_name]['id'] version = sample_name_2_info[sample_name]['version'] sample = ss.get_sample({ 'id': sample_id, 'version': version, }) ret = ss.create_data_link( dict( upa=reads_ref, id=sample_id, version=version, node=sample['node_tree'][0]['id'], update=1, )) new_data_links.append(ret) report_client = KBaseReport(self.callback_url) report_info = report_client.create_extended_report({ 'workspace_name': params['workspace_name'], }) output = { 'report_name': report_info['name'], 'report_ref': report_info['ref'], 'links': new_data_links, } #END link_reads # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError('Method link_reads return value ' + 'output is not type dict as required.') # return the results return [output] def status(self, ctx): #BEGIN_STATUS returnVal = { 'state': "OK", 'message': "", 'version': self.VERSION, 'git_url': self.GIT_URL, 'git_commit_hash': self.GIT_COMMIT_HASH } #END_STATUS return [returnVal]
class FileUtil:
    """Import alignment files as KBaseTrees.MSA objects and export them again."""

    def __init__(self, config):
        """Read service settings from *config* and create the DataFileUtil client.

        Reads the keys 'SDK_CALLBACK_URL', 'scratch' and 'KB_AUTH_TOKEN'.
        """
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)

    def _validate_import_file_params(self, params):
        """
        _validate_import_file_params:
            check the params passed to import_fasta_file and locate the input file

        The file is taken from the first of 'input_file_path',
        'input_shock_id' (downloaded into scratch) or
        'input_staging_file_path' (downloaded from staging) that is set.

        return: (file_path, workspace_name, msa_name)
        raises: ValueError when a required parameter or every file source is missing
        """
        # check for required parameters
        for p in ['msa_name', 'workspace_name']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

        if params.get('input_file_path'):
            file_path = params.get('input_file_path')
        elif params.get('input_shock_id'):
            file_path = self.dfu.shock_to_file({
                'shock_id': params['input_shock_id'],
                'file_path': self.scratch
            }).get('file_path')
        elif params.get('input_staging_file_path'):
            file_path = self.dfu.download_staging_file({
                'staging_file_subdir_path': params.get('input_staging_file_path')
            }).get('copy_file_path')
        else:
            error_msg = "Must supply either a input_shock_id or input_file_path "
            error_msg += "or input_staging_file_path"
            raise ValueError(error_msg)

        return file_path, params['workspace_name'], params['msa_name']

    def _upload_to_shock(self, file_path):
        """
        _upload_to_shock: upload target file to shock using DataFileUtil

        return: the shock id of the uploaded (gzip-packed) file
        """
        logging.info('Start uploading file to shock: {}'.format(file_path))

        file_to_shock_params = {
            'file_path': file_path,
            'pack': 'gzip',
            'make_handle': True,
        }
        shock_id = self.dfu.file_to_shock(file_to_shock_params)['shock_id']

        return shock_id

    @staticmethod
    def _infer_seq_type(msa):
        """Return "dna" if every character is one of A, C, G, T or '-', else "protein".

        *msa* is any iterable of records exposing a ``seq`` attribute. The
        comparison is case-insensitive so that lower-case nucleotide
        alignments are not mistaken for protein.
        """
        dna_set = {"A", "C", "G", "T", "-"}
        seq_chars = {
            char.upper() for record in msa for char in str(record.seq)
        }
        return "protein" if seq_chars - dna_set else "dna"

    def _file_to_data(self, file_path, format='fasta'):
        """Parse an alignment file into MSA object data.

        return: (data, message) where data holds 'alignment',
                'default_row_labels', 'row_order', 'alignment_length' and
                'sequence_type', and message is a one-line summary
        """
        data = {
            'alignment': {},
            'default_row_labels': {},
            'row_order': [],
        }

        msa = AlignIO.read(file_path, format)
        data['alignment_length'] = msa.get_alignment_length()
        data['sequence_type'] = self._infer_seq_type(msa)

        for record in msa:
            data['row_order'].append(record.id)
            data['default_row_labels'][record.id] = record.description
            data['alignment'][record.id] = str(record.seq)

        message = f'A Multiple Sequence Alignment with {len(data["alignment"])} sequences and ' \
                  f'an alignment length of {data["alignment_length"]} was produced'

        return data, message

    def _generate_report(self, msa_ref, workspace_name, message):
        """
        _generate_report: generate summary report for upload

        return: dict with 'report_name' and 'report_ref'
        """
        report_params = {
            'message': message,
            'objects_created': [{
                'ref': msa_ref,
                'description': 'Imported MSA'
            }],
            'workspace_name': workspace_name,
            'report_object_name': f'import_msa_file_{uuid.uuid4()}'
        }

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def _get_object(self, params):
        """Fetch the object named by params['input_ref']; return (name, data)."""
        ret = self.dfu.get_objects({'object_refs': [params['input_ref']]})['data'][0]
        obj_name = ret['info'][1]
        obj_data = ret['data']
        return obj_name, obj_data

    def import_fasta_file(self, params):
        """Import an alignment file as a KBaseTrees.MSA object and report on it.

        Optional params: 'file_format' (default 'fasta') and 'description'
        (default ''). An int 'workspace_name' is used directly as the
        workspace id.

        return: dict with 'msa_obj_ref', 'report_name' and 'report_ref'
        """
        file_path, workspace_name, msa_name = self._validate_import_file_params(
            params)

        if not isinstance(workspace_name, int):
            workspace_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            workspace_id = workspace_name

        data, message = self._file_to_data(file_path,
                                           params.get('file_format', 'fasta'))
        data['description'] = params.get('description', '')

        info = self.dfu.save_objects({
            'id': workspace_id,
            'objects': [{
                'type': 'KBaseTrees.MSA',
                'name': msa_name,
                'data': data
            }]
        })[0]
        obj_ref = f"{info[6]}/{info[0]}/{info[4]}"

        returnVal = {'msa_obj_ref': obj_ref}

        report_output = self._generate_report(obj_ref, workspace_name, message)
        returnVal.update(report_output)

        return returnVal

    def msa_to_file(self, params, file_type='fasta'):
        """Write the MSA object params['input_ref'] to a file of *file_type*.

        return: {'file_path': path of the written file}
        raises: ValueError when 'input_ref' or 'destination_dir' is missing
        """
        if "input_ref" not in params:
            raise ValueError("input_ref not in supplied params")
        if "destination_dir" not in params:
            raise ValueError("destination_dir not in supplied params")

        obj_name, obj_data = self._get_object(params)
        alignment = obj_data['alignment']
        # Fall back to the alignment's own key order when no explicit row
        # order is stored (the previous fallback passed the uncalled
        # ``dict.keys`` method, which cannot be iterated).
        keys = obj_data['row_order'] if 'row_order' in obj_data else list(alignment)
        row_labels = obj_data.get('default_row_labels', {})
        # NOTE(review): 'destination_dir' is required above but the file is
        # written into scratch; left unchanged - confirm which is intended.
        file_path = os.path.join(self.scratch, f'{obj_name}.{file_type}')
        seq_type = generic_protein if obj_data.get(
            'sequence_type') == "protein" else generic_dna

        msa = MultipleSeqAlignment([
            SeqRecord(Seq(alignment[key], seq_type),
                      id=key,
                      # rows without a stored label get an empty description
                      description=row_labels.get(key, ''))
            for key in keys
        ])
        AlignIO.write(msa, file_path, file_type)

        return {'file_path': file_path}

    def msa_to_clustal_file(self, params):
        """Not implemented."""
        raise NotImplementedError

    def export_file(self, params, file_type='fasta'):
        """Write the MSA to a file and upload it to shock.

        Creates a fresh directory under scratch and stores it in
        params['destination_dir'] (the caller's dict is modified).

        return: {'shock_id': id of the uploaded file}
        """
        params['destination_dir'] = os.path.join(self.scratch,
                                                 str(uuid.uuid4()))
        os.mkdir(params['destination_dir'])

        file_path = self.msa_to_file(params, file_type)['file_path']

        return {'shock_id': self._upload_to_shock(file_path)}
class DataUtil:
    """Validate generics data against type-spec constraints and save it."""

    def __init__(self, config):
        """Read service settings from *config* and create the service clients.

        Reads the keys 'workspace-url', 'SDK_CALLBACK_URL', 'KB_AUTH_TOKEN',
        'scratch' and 'srv-wiz-url'.
        """
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.scratch = config['scratch']
        self.serviceWizardURL = config['srv-wiz-url']
        self.wsClient = workspaceService(self.ws_url, token=self.token)
        self.dfu = DataFileUtil(self.callback_url)
        self.generics_service = GenericsService(self.serviceWizardURL)

    @staticmethod
    def _find_between(s, start, end):
        """
        _find_between: find string in between start and end

        start and end are inserted into a regular expression unescaped, so
        callers pass regex fragments (e.g. r'\\.').
        """
        return re.search('{}(.*){}'.format(start, end), s).group(1)

    @staticmethod
    def _values_argument(expression):
        """Return the text between the outermost parentheses of *expression*."""
        return re.search(r'\((.*)\)', expression).group(1)

    def _find_constraints(self, obj_type):
        """
        _find_constraints: retrieve constraints (@contains, rowsum, unique,
                           conditionally_required)

        Each description line starting with '@<tag>' contributes the list of
        whitespace-separated words that follow the tag.

        return: dict mapping every tag to a (possibly empty) list of word lists
        """
        type_info = self.wsClient.get_type_info(obj_type)
        type_desc = type_info.get('description')
        constraints = {}
        for tag in ('contains', 'rowsum', 'unique', 'conditionally_required'):
            constraints[tag] = [
                line.strip().split()[1:] for line in type_desc.split("\n")
                if line.startswith(f'@{tag}')
            ]

        return constraints

    def _filter_constraints(self, constraints, data):
        """filters out constraints with missing keys

        A 'contains' constraint is dropped when none of its container
        expressions refers to a top-level key present in *data*. The
        'contains' list is modified in place and *constraints* is returned.
        """
        contains_constraints = constraints.get('contains') or []

        filtered_constraints = []
        for contains_constraint in contains_constraints:
            in_values = contains_constraint[1:]
            missing_key = True
            for in_value in in_values:
                # Reduce each expression to the top-level data key it reads.
                if in_value.startswith('values'):
                    key = self._values_argument(in_value).split('.')[0]
                elif ':' in in_value:
                    key = in_value.split(':')[0]
                else:
                    key = in_value.split('.')[0]

                if key in data:
                    missing_key = False
                    break

            if missing_key:
                filtered_constraints.append(contains_constraint)

        # Remove after the scan so the list is not changed while iterating it.
        for x in filtered_constraints:
            contains_constraints.remove(x)

        return constraints

    def _retrieve_value(self, data, value):
        """Parse the provided 'data' object to retrieve the item in 'value'.

        Supported forms of *value*:
          'set(a,b)'        -> the literal items between the parentheses
          'values(a.b)'     -> the values of the mapping at data.a.b
          'ref_key:a.b'     -> path /a/b of the object referenced by data[ref_key]
          'a.b'             -> the items at data.a.b
        """
        logging.info('Getting value for {}'.format(value))
        retrieve_data = []
        m_data = DotMap(data)
        if value.startswith('set('):
            retrieve_data = value[4:-1].split(",")
        elif value.startswith('values('):  # TODO: nested values e.g. values(values(ids))
            unique_list = self._values_argument(value).split('.')
            m_data_cp = m_data.copy()
            for attr in unique_list:
                m_data_cp = getattr(m_data_cp, attr)
            retrieve_data = list(m_data_cp.values())
        elif ':' in value:
            obj_ref = getattr(m_data, value.split(':')[0])
            if obj_ref:
                included = value.split(':')[1]
                included = '/' + included.replace('.', '/')
                ref_data = self.wsClient.get_objects2(
                    {'objects': [{
                        'ref': obj_ref,
                        'included': [included]
                    }]})['data'][0]['data']
                m_ref_data = DotMap(ref_data)
                if ref_data:
                    if '*' not in included:
                        for key in included.split('/')[1:]:
                            m_ref_data = getattr(m_ref_data, key)
                    else:
                        keys = included.split('/')[1:]
                        # TODO: only works for 2 level nested data like '/features/[*]/id'
                        m_ref_data = [
                            x.get(keys[2]) for x in ref_data.get(keys[0])
                        ]

                retrieve_data = list(m_ref_data)
        else:
            unique_list = value.split('.')
            m_data_cp = m_data.copy()
            for attr in unique_list:
                m_data_cp = getattr(m_data_cp, attr)
            retrieve_data = list(m_data_cp)

        logging.info('Retrieved value (first 20):\n{}\n'.format(
            retrieve_data[:20]))

        return retrieve_data

    def _validate(self, constraints, data):
        """
        _validate: validate data

        Checks the 'unique', 'contains' and 'conditionally_required'
        constraints; a missing category is treated as empty.

        return: (validated, failed_constraints) where failed_constraints maps
                a constraint category to the list of failures found
        """
        validated = True
        failed_constraints = defaultdict(list)

        # unique: the retrieved items must not repeat
        for unique_constraint in constraints.get('unique') or []:
            retrieved_value = self._retrieve_value(data, unique_constraint[0])
            if len(set(retrieved_value)) != len(retrieved_value):
                validated = False
                failed_constraints['unique'].append(unique_constraint[0])

        # contains: the first expression must be a subset of the union of the rest
        for contains_constraint in constraints.get('contains') or []:
            value = contains_constraint[0]
            in_values = contains_constraint[1:]
            retrieved_in_values = []
            for in_value in in_values:
                retrieved_in_values += self._retrieve_value(data, in_value)
            if not (set(self._retrieve_value(data, value)) <=
                    set(retrieved_in_values)):
                validated = False
                failed_constraints['contains'].append(
                    " ".join(contains_constraint))

        # conditionally_required: if the trigger key exists, so must the others
        for conditional_constraint in constraints.get('conditionally_required') or []:
            trigger = conditional_constraint[0]
            required_keys = conditional_constraint[1:]
            if trigger in data:
                missing_keys = [
                    key for key in required_keys if key not in data
                ]
                if missing_keys:
                    validated = False
                    failed_constraints['conditionally_required'].append(
                        (trigger, required_keys, missing_keys))

        return validated, failed_constraints

    @staticmethod
    def _raise_validation_error(params, validate):
        """Raise a meaningful error message for failed validation

        raises: ValueError, always, with one line per failed constraint
        """
        logging.error('Data failed type checking')
        failed_constraints = validate.get('failed_constraints')
        error_msg = [
            'Object {} failed type checking:'.format(params.get('obj_name'))
        ]
        if failed_constraints.get('unique'):
            unique_values = failed_constraints.get('unique')
            error_msg.append(
                'Object should have unique field: {}'.format(unique_values))
        if failed_constraints.get('contains'):
            contained_values = failed_constraints.get('contains')
            for contained_value in contained_values:
                # Entries are space-joined: first word is the subset
                # expression, the rest are the containers.
                subset_value = contained_value.split(' ')[0]
                super_value = ' '.join(contained_value.split(' ')[1:])
                if 'col_mapping' in super_value:
                    error_msg.append(
                        'Column attribute mapping instances should contain all '
                        'column index from original data')

                if 'row_mapping' in super_value:
                    error_msg.append(
                        'Row attribute mapping instances should contain all row '
                        'index from original data')

                error_msg.append(
                    'Object field [{}] should contain field [{}]'.format(
                        super_value, subset_value))
        for failure in failed_constraints.get('conditionally_required', []):
            error_msg.append(
                'If object field "{}" is present than object field(s) {} should '
                'also be present. Object is missing {}'.format(*failure))
        raise ValueError('\n'.join(error_msg))

    def list_generic_types(self, params=None):
        """
        *Not yet exposed in spec*
        list_generic_types: lists the current valid generics types

        arguments:
            none

        return:
            A list of generic types in the current environment
        """
        returnVal = [
            x['type_def'] for module in GENERICS_MODULES
            for x in self.wsClient.get_all_type_info(module)
        ]
        return returnVal

    def fetch_data(self, params):
        """
        fetch_data: fetch generics data as pandas dataframe for a generics data object

        arguments:
        obj_ref: generics object reference

        optional arguments:
        generics_module: the generics data module to be retrieved from
                        e.g. for an given data type like below:
                        typedef structure {
                          FloatMatrix2D data;
                          condition_set_ref condition_set_ref;
                        } SomeGenericsMatrix;
                        generics_module should be
                        {'data': 'FloatMatrix2D',
                         'condition_set_ref': 'condition_set_ref'}

        return:
        data_matrix: a pandas dataframe in json format
        """
        for p in ['obj_ref']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

        return self.generics_service.fetch_data(params)

    def validate_data(self, params):
        """
        validate_data: validate data

        arguments:
        obj_type: obj type e.g.: 'KBaseMatrices.ExpressionMatrix-1.1'
        data: obj data to be validated

        return:
        validated: True or False
        failed_constraints: failures found, keyed by constraint category
        """
        constraints = self._find_constraints(params.get('obj_type'))
        data = params.get('data')

        constraints = self._filter_constraints(constraints, data)

        validated, failed_constraints = self._validate(constraints, data)

        return {
            'validated': validated,
            'failed_constraints': failed_constraints
        }

    def save_object(self, params):
        """
        save_object: validate data constraints and save matrix object

        arguments:
        obj_type: saving object data type
        obj_name: saving object name
        data: data to be saved
        workspace_name: workspace name matrix object to be saved to

        return:
        obj_ref: object reference
        """
        logging.info('Starting saving object')

        obj_type = params.get('obj_type')

        module_name = obj_type.split('.')[0]
        type_name = obj_type.split('.')[1]

        types = self.wsClient.get_module_info({
            'mod': module_name
        }).get('types')

        # Swap the requested type for the module's matching versioned type
        # string (the text between '.' and '-' is the bare type name).
        for module_type in types:
            if self._find_between(module_type, r'\.', r'\-') == type_name:
                obj_type = module_type
                break

        # Fields with falsy values (None, '', 0, empty containers) are not saved.
        data = {k: v for k, v in params.get('data').items() if v}
        validate = self.validate_data({'obj_type': obj_type, 'data': data})

        if not validate.get('validated'):
            self._raise_validation_error(params, validate)

        # An int workspace_name is used directly as the workspace id.
        workspace_name = params.get('workspace_name')
        if not isinstance(workspace_name, int):
            ws_name_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            ws_name_id = workspace_name

        info = self.dfu.save_objects({
            "id": ws_name_id,
            "objects": [{
                "type": obj_type,
                "data": data,
                "name": params.get('obj_name')
            }]
        })[0]

        return {"obj_ref": "%s/%s/%s" % (info[6], info[0], info[4])}
class VCFToVariation: def __init__(self, config, scratch, callback_url ): self.scratch = config['scratch'] self.ws_url = config['workspace-url'] self.callback_url = os.environ['SDK_CALLBACK_URL'] self.dfu = DataFileUtil(self.callback_url) self.wsc = Workspace(self.ws_url) self.scratch = scratch self.callback_url = callback_url self.au = AssemblyUtil(self.callback_url) self.gapi = GenericsAPI(self.callback_url) def _parse_vcf_data(self, params): vcf_filepath = self._stage_input(params) # file is validated by this point, can assume vcf_filepath is valid reader = vcf.Reader(open(vcf_filepath, 'r')) version = float(reader.metadata['fileformat'][4:6]) genotypes = reader.samples chromosomes = [] contigs = {} totalvars = 0 for record in reader: totalvars += 1 if record.CHROM not in chromosomes: chromosomes.append(record.CHROM) if record.CHROM not in contigs.keys(): passvar = 1 if not record.FILTER else 0 contigs[record.CHROM] = { 'contig_id': record.CHROM, 'totalvariants': 1, 'passvariants': passvar, 'length': int(record.affected_end-record.affected_start), } else: contigs[record.CHROM]['totalvariants'] += 1 if not record.FILTER: contigs[record.CHROM]['passvariants'] += 1 vcf_info = { 'version': version, 'contigs': contigs, 'total_variants': totalvars, 'genotype_ids': genotypes, 'chromosome_ids': chromosomes, 'file_ref': vcf_filepath } return vcf_info def _validate_vcf_to_sample(self, vcf_genotypes, sample_ids): genos_not_found = [] vgenotypes = [x.upper().strip() for x in vcf_genotypes] sids = [x.upper().strip() for x in sample_ids] for geno in vgenotypes: if geno not in sids: genos_not_found.append(geno) if not genos_not_found: return True else: return genos_not_found def _chk_if_vcf_ids_in_assembly(self, vcf_chromosomes, assembly_chromosomes): chromos_not_in_assembly = [] pp(assembly_chromosomes) for chromo in vcf_chromosomes: if chromo not in assembly_chromosomes: chromos_not_in_assembly.append(chromo) if not chromos_not_in_assembly: return True else: return 
chromos_not_in_assembly def _get_vcf_version(self, vcf_filepath): with(gzip.open if is_gz_file(vcf_filepath) else open)(vcf_filepath, 'rt') as vcf: line = vcf.readline() tokens = line.split('=') if not (tokens[0].startswith('##fileformat')): log("Invalid VCF. ##fileformat line in meta is improperly formatted.") raise ValueError("Invalid VCF. ##fileformat line in meta is improperly formatted. " "Check VCF file specifications: https://samtools.github.io/hts-specs/") vcf_version = float(tokens[1][-4:].rstrip()) return vcf_version def validate_vcf(self, params): if 'genome_or_assembly_ref' not in params: raise ValueError('Genome or Assembly reference not in input parameters: \n\n'+params) if 'vcf_staging_file_path' not in params: raise ValueError('VCF staging file path not in input parameters: \n\n' + params) vcf_filepath = self._stage_input(params) vcf_version = self._get_vcf_version(vcf_filepath) # setup directorys for validation output validation_output_dir = os.path.join(self.scratch, 'validation_' + str(uuid.uuid4())) os.mkdir(validation_output_dir) # vcftools (vcf-validator) supports VCF v4.0-4.2 # https://github.com/vcftools/vcftools # EBIvariation/vcf-validator (vcf_validator_linux) supports VCF v4.1-4.3 # https://github.com/EBIvariation/vcf-validator # vcftools is only to validate VCF v4.0 if vcf_version >= 4.1: print("Using vcf_validator_linux...") validator_cmd = ["vcf_validator_linux"] validator_cmd.append("-i") validator_cmd.append(vcf_filepath) validator_cmd.append("-l") validator_cmd.append('error') print("VCF version "+str(vcf_version)+".") elif vcf_version >= 4.0: print("Using vcftools to validate...") validator_cmd = ["vcf-validator"] validator_cmd.append(vcf_filepath) print("VCF version 4.0.") else: raise ValueError('VCF Version not in file, or fileformat line malformatted, or not version >=4.0. file format line must be the ' 'first line of vcf file and in appropriate syntax. 
Check VCF file specifications: ' 'https://samtools.github.io/hts-specs/') print("Validator command: {}".format(validator_cmd)) p = subprocess.Popen(validator_cmd, cwd=self.scratch, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=False) validator_output = [] while True: line = p.stdout.readline() if not line: break if line.decode("utf-8").strip().startswith('[info]'): validator_output.append(line.decode("utf-8")) out, err = p.communicate() validation_output_filename = os.path.join(validation_output_dir, 'vcf_validation.txt') file_output_chk = [] try: if validator_output[0][:6] == '[info]': # validation by vcf_validator_linux validation_output_filename = validator_output[1].split(' ')[6].strip('\n') vo = validator_output[2].split(' ') file_output_chk = ''.join(vo[9:]).strip('\n') if not os.path.exists(validation_output_filename): raise ValueError(validation_output_filename+' does not exist!') if not file_output_chk == 'isvalid': print('\n'.join(validator_output)) raise ValueError('\n'.join(validator_output)) #TODO: more detailed validation parsing for vcf_validator_linux else: if validator_output: with open(validation_output_filename, 'w') as f: for line in validator_output: f.write(str(line)) f.close() print('\n'.join(validator_output)) raise ValueError('\n'.join(validator_output)) else: with open(validation_output_filename, 'w') as f: f.write("vcftools used to validate vcf file:\n"+vcf_filepath+"\n\File is validate as of vcf spec v4.0") f.close() # TODO: more detailed validation parsing for vcftools except IndexError: # if vcf file < v4.1, and valid it will produce index error on line 132 if validator_output: with open(validation_output_filename, 'w') as f: for line in validator_output: f.write(str(line)) f.close() print('\n'.join(validator_output)) raise ValueError('\n'.join(validator_output)) else: with open(validation_output_filename, 'w') as f: f.write("vcftools used to validate vcf file:\n" + vcf_filepath + "\n\File is validate as of vcf spec v4.0") 
f.close() if not os.path.exists(validation_output_filename): print('Validator did not generate log file!') raise SystemError("Validator did not generate a log file.") log("Validator output filepath: {}".format(validation_output_filename)) log("Return code from validator {}".format(p.returncode)) return validation_output_filename def _stage_input(self, params): # extract file location from input ui parameters if params['vcf_staging_file_path'].startswith('/kb/module/test/'): # variation utils unit test vcf_local_file_path = params['vcf_staging_file_path'] if vcf_local_file_path.endswith('.gz'): with gzip.open(vcf_local_file_path, 'rb') as f_in: with open(vcf_local_file_path[:-3], 'wb') as f_out: shutil.copyfileobj(f_in, f_out) vcf_local_file_path = vcf_local_file_path[:-3] else: staging_dir = '/staging' vcf_local_file_path = os.path.join(staging_dir, params['vcf_staging_file_path']) if not os.path.exists(vcf_local_file_path): raise OSError('VCF input path does not exist, or is not readable') orig_file_path = os.path.join(self.scratch, 'original_' + os.path.basename(vcf_local_file_path)) print(f'VCF: {vcf_local_file_path} Orig: {orig_file_path}') self.original_file = shutil.copy(vcf_local_file_path, orig_file_path) # TODO: use data file utils here, upload vcf to shock, use dfu. 
if is_gz_file(vcf_local_file_path): # /staging is read only, therefore have to copy before uncompressing if not vcf_local_file_path == os.path.join(self.scratch, params['vcf_staging_file_path']): copy = shutil.copy(vcf_local_file_path, os.path.join(self.scratch,params['vcf_staging_file_path'])) unpack = self.dfu.unpack_file({'file_path': copy}) else: unpack = {} unpack['file_path'] = os.path.join(self.scratch,params['vcf_staging_file_path']) params['vcf_local_file_path'] = unpack['file_path'] return unpack['file_path'] else: params['vcf_local_file_path'] = vcf_local_file_path return vcf_local_file_path def _create_sample_attribute_file(self, vcf_file, sample_attribute_mapping_file): """ function for creating sample attribute mapping file. """ try: with open (vcf_file, 'r') as vcf_handle: Lines = vcf_handle.readlines() for line in Lines: if(line.startswith("#CHROM")): header = line.lstrip().split("\t") try: with open (sample_attribute_mapping_file, 'w') as attribute_mapping_handle: attribute_mapping_handle.write("Attribute\tAttribute ontology ID\tUnit\tUnit ontology ID") for i in range(9,len(header)): attribute_mapping_handle.write("\t"+header[i]) #attribute_mapping_handle.write("\n") attribute_mapping_handle.write("label\t\t\t") for j in range(9,len(header)): attribute_mapping_handle.write("\t"+header[j]) #attribute_mapping_handle.write("\n") except IOError: print("Could not write to file:", sample_attribute_mapping_file) except IOError: print("Could not read file:", vcf_file) def _validate_assembly_ids(self, params): # All chromosome ids from the vcf should be in assembly # but not all assembly chromosome ids should be in vcf if ('genome_ref' in params): subset = self.wsc.get_object_subset([{ 'included': ['/assembly_ref'], 'ref': params['genome_or_assembly_ref'] }]) self.vcf_info['assembly_ref'] = subset[0]['data']['assembly_ref'] if ('assembly_ref' in params): self.vcf_info['assembly_ref'] = params['assembly_ref'] assembly_chromosome_ids_call = 
self.wsc.get_object_subset([{ 'included': ['/contigs'], 'ref': self.vcf_info['assembly_ref'] }]) assembly_chromosomes = assembly_chromosome_ids_call[0]['data']['contigs'].keys() vcf_chromosomes = self.vcf_info['chromosome_ids'] chk_assembly_ids = self._chk_if_vcf_ids_in_assembly(vcf_chromosomes, assembly_chromosomes) if isinstance(chk_assembly_ids, list): failed_ids = ' '.join(chk_assembly_ids) print(f'VCF contig ids: {failed_ids} are not present in assembly.') raise ValueError(f'VCF contig ids: {failed_ids} are not present in assembly.') return assembly_chromosomes def _validate_sample_ids(self, params): # All samples within the VCF file need to be in sample attribute list vcf_genotypes = self.vcf_info['genotype_ids'] sample_ids_subset = self.wsc.get_object_subset([{ 'included': ['/instances'], 'ref': params['sample_attribute_ref'] }]) sample_ids = sample_ids_subset[0]['data']['instances'].keys() validate_genotypes = self._validate_vcf_to_sample(vcf_genotypes, sample_ids) if isinstance(validate_genotypes, list): failed_genos = ' '.join(validate_genotypes) print(f'VCF genotypes: {failed_genos} are not present in sample attribute mapping.') raise ValueError(f'VCF genotypes: {failed_genos} are not present in sample attribute mapping.') return sample_ids def _construct_contig_info(self, params): """ KBaseGwasData.Variations type spec /* Contig variation data contig_id - contig identifier totalvariants - total number of variants in each contig passvariants - total number of variants that pass quality variation filter in contig length - length of contig from assembly data */ typdef structure { string contig_id; int totalvariants; int passvariants; int length; // from assembly } contig_info; """ assembly_chromosome_dict = self.wsc.get_object_subset([{ 'included': ['/contigs'], 'ref': self.vcf_info['assembly_ref'] }])[0]['data']['contigs'] contigs = [] contig_infos = self.vcf_info['contigs'] for contig_id in contig_infos: length_contig = 
assembly_chromosome_dict[contig_id].get("length") contig_infos[contig_id]["length"] = length_contig contigs.append(contig_infos[contig_id]) return contigs def _bgzip_vcf(self, vcf_filepath): if not os.path.exists(vcf_filepath): print (vcf_filepath + " does not exist") zip_cmd = ["bgzip", vcf_filepath] p = subprocess.Popen(zip_cmd, cwd=self.scratch, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=False) out, err = p.communicate() bgzip_file_path = vcf_filepath + ".gz" print (bgzip_file_path) return bgzip_file_path def _index_vcf(self, bgzip_file): output_dir = self.scratch bgzip_filepath = os.path.join(self.scratch, bgzip_file) if not os.path.exists(bgzip_filepath): print (bgzip_filepath + " does not exist") index_cmd = ["tabix", "-p", "vcf", bgzip_filepath] p = subprocess.Popen(index_cmd, cwd=self.scratch, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=False) out, err = p.communicate() index_file_path = bgzip_filepath + ".tbi" return index_file_path def _index_assembly(self, assembly_file): if not os.path.exists(assembly_file): print (assembly_file + " does not exist") logging.info("indexing assembly file") assembly_index_cmd = ["samtools", "faidx", assembly_file] print(assembly_index_cmd) p = subprocess.Popen(assembly_index_cmd, cwd=self.scratch, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=False) out, err = p.communicate() logging.info("indexing of assembly file done!") return assembly_file + ".fai" def _download_assembly(self, assembly_ref): file = self.au.get_assembly_as_fasta({ 'ref': assembly_ref }) return file def _construct_variation(self, params, contigs_info): """ KBaseGwasData.Variations type spec /* Variation object data structure num_genotypes - number of total genotypes within variant file num_variants - number of total variants within variant file contigs - list of contig ids and variant information attribute_ref - KBase reference to attribute mapping workspace object genome_ref - KBase reference to genome workspace object 
assembly_ref - KBase reference to assemebly workspace object vcf_handle_ref - VCF handle reference to VCF file @optional genome_ref */ typedef structure { int numgenotypes; int numvariants; list<contig_info> contigs; attribute_ref population; // KBaseExperiments.AttributeMapping genome_ref genome_ref; // KBaseGenomes.Genome assembly_ref assemby_ref; // KBaseGenomeAnnotations.Assembly vcf_handle_ref vcf_handle_ref; } Variations; :param params: KBase ui input parameters :param population: previoiusly constructed sample population data :return: constructed variation object (dictionary) """ if not self.vcf_info['file_ref'].startswith(self.scratch): new_vcf_file = os.path.join(self.scratch, os.path.basename(self.vcf_info['file_ref'])) self.vcf_info['file_ref'] = shutil.copy(self.vcf_info['file_ref'], new_vcf_file) vcf_staged_file = self.original_file bgzip_file_path = self._bgzip_vcf(vcf_staged_file) vcf_shock_file_ref = self.dfu.file_to_shock( {'file_path': bgzip_file_path, 'make_handle': 1} ) compare_md5_local_with_shock(bgzip_file_path, vcf_shock_file_ref) index_file_path = self._index_vcf(bgzip_file_path) vcf_index_shock_file_ref = self.dfu.file_to_shock( {'file_path': index_file_path, 'make_handle': 1} ) compare_md5_local_with_shock(index_file_path, vcf_index_shock_file_ref) assembly_file_path = self._download_assembly(self.vcf_info['assembly_ref'])['path'] assembly_index_file_path = self._index_assembly(assembly_file_path) assembly_index_shock_file_ref = self.dfu.file_to_shock( {'file_path': assembly_index_file_path, 'make_handle': 1} ) compare_md5_local_with_shock(assembly_index_file_path, assembly_index_shock_file_ref) variation_obj = { 'numgenotypes': int(len(self.vcf_info['genotype_ids'])), 'numvariants': int(self.vcf_info['total_variants']), 'contigs': contigs_info, 'population': params['sample_attribute_ref'], # TYPE SPEC CHANGE: need to change type spec to assembly_ref instead of assemby_ref 'assemby_ref': self.vcf_info['assembly_ref'], 'vcf_handle_ref': 
vcf_shock_file_ref['handle']['hid'], 'vcf_handle' : vcf_shock_file_ref['handle'], 'vcf_index_handle_ref': vcf_index_shock_file_ref['handle']['hid'], 'vcf_index_handle': vcf_index_shock_file_ref['handle'], 'assembly_index_handle_ref': assembly_index_shock_file_ref['handle']['hid'], 'assembly_index_handle': assembly_index_shock_file_ref['handle'] } if 'genome_ref' in params: variation_obj['genome_ref'] = params['genome_ref'] return variation_obj def _save_var_obj(self, params, var): """ :param params: :param var: :return: DataFileUtils object_info: objid - the numerical id of the object. name - the name of the object. type - the type of the object. save_date - the save date of the object. ver - the version of the object. saved_by - the user that saved or copied the object. wsid - the id of the workspace containing the object. workspace - the name of the workspace containing the object. chsum - the md5 checksum of the object. size - the size of the object in bytes. meta - arbitrary user-supplied metadata about the object. 
""" print('Saving Variation to workspace...\n') if var: if not 'variation_object_name' in params: var_obj_name = 'variation_'+str(uuid.uuid4()) else: var_obj_name = params['variation_object_name'] var_obj_info = self.dfu.save_objects({ 'id': self.dfu.ws_name_to_id(params['workspace_name']), 'objects': [{ 'type': 'KBaseGwasData.Variations', 'data': var, 'name': var_obj_name }] })[0] return var_obj_info else: raise ValueError('Variation object blank, cannot not save to workspace!') def _validate_sample_attribute_ref(self, params): #params["sample_attribute_ref"] = '' #just for testing if not params['sample_attribute_ref']: sample_attribute_mapping_file = os.path.join(self.scratch ,"sample_attribute.tsv") #hardcoded for testing self._create_sample_attribute_file(params['vcf_local_file_path'], sample_attribute_mapping_file) logging.info("Uploading sample attribute file to ref") vcf_sample_attribute_shock_file_ref = self.dfu.file_to_shock( {'file_path': sample_attribute_mapping_file, 'make_handle': 1} ) shock_id = vcf_sample_attribute_shock_file_ref['shock_id'] ws_id = self.dfu.ws_name_to_id(params['workspace_name']) import_params = { 'input_shock_id' : shock_id, 'output_ws_id': ws_id, 'output_obj_name': 'Sample_attribute'} ret = self.gapi.file_to_attribute_mapping(import_params) params['sample_attribute_ref'] = ret['attribute_mapping_ref'] def import_vcf(self, params): # VCF validation # VCF file validation file_valid_result = self.validate_vcf(params) self._validate_sample_attribute_ref(params) # VCF file parsing self.vcf_info = self._parse_vcf_data(params) # Validate vcf chromosome ids against assembly chromosome ids self._validate_assembly_ids(params) # Validate vcf genotypes against sample meta data ids self._validate_sample_ids(params) # Variation object construction # construct contigs_info contigs_info = self._construct_contig_info(params) # construct variation var = self._construct_variation(params, contigs_info) # Save variation object to workspace 
var_wksp_obj = self._save_var_obj(params, var) return [var_wksp_obj, var]
class GFFUtils2:
    """Annotate GWAS association results with gene information from a genome.

    The workflow (see ``annotate_GWAS_results``) builds a sorted, bgzip-compressed
    and tabix-indexed GFF file from the genome's features, looks up every
    association result position in it, and saves the matched gene ids as a
    ``KBaseCollections.FeatureSet`` object.

    ``self.genome_dir`` and ``self.sorted_gff`` are only set by
    ``annotate_GWAS_results``; the helper methods that read them must not be
    called before it.
    """

    def __init__(self, config):
        """Store the service endpoints from ``config`` and create the clients.

        :param config: mapping providing 'callback_url', 'scratch' and
            'workspace-url'.
        """
        self.callback_url = config['callback_url']
        self.shared_folder = config['scratch']
        self.ws_url = config['workspace-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.gsu = GenomeSearchUtil(self.callback_url)
        self.wsc = Workspace(self.ws_url)

    def _prep_gff(self, gff_file):
        """Sort ``gff_file`` and bgzip-compress the result.

        Lines starting with '#' are emitted first, the remaining records are
        sorted by column 1 and then numerically by column 4. The output is
        written to 'out.gff' inside ``self.genome_dir`` and then compressed by
        ``bgzip``.

        :param gff_file: path of the GFF file to sort.
        :returns: path of the compressed file ('out.gff.gz' in ``self.genome_dir``).
        """
        # Local import: only needed here, to quote the path for the shell.
        import shlex

        outfile = os.path.join(self.genome_dir, 'out.gff')
        # The command needs a shell (subshell + pipe), so the path is quoted to
        # keep spaces or shell metacharacters in it from being interpreted.
        quoted = shlex.quote(str(gff_file))
        sortcmd = f'(grep ^"#" {quoted}; grep -v ^"#" {quoted} | sort -k1,1 -k4,4n)'
        with open(outfile, 'w') as sorted_handle:
            subprocess.Popen(sortcmd, shell=True, stdout=sorted_handle).communicate()

        subprocess.Popen(['bgzip', 'out.gff'], cwd=self.genome_dir).communicate()
        return outfile + '.gz'

    def _construct_gff_from_json(self, json, gff_file_path, contig_base_lengths):
        """Write the gene features in ``json`` to ``gff_file_path`` as GFF lines.

        Only features whose 'feature_type' is 'gene' (case-insensitive) are
        written. Each line carries, in order: contig id, 'KBase', 'gene',
        start, start + length, '.', strand, the position offset by the
        contig's base length, and the attributes ('ID=...' plus
        ';FUNCTION=...' when the feature has a function).

        :param json: iterable of feature dictionaries.
        :param gff_file_path: path of the GFF file to create.
        :param contig_base_lengths: mapping of contig id to the offset added
            to positions on that contig.
        :returns: ``gff_file_path``.
        :raises KeyError: if a feature's contig id cannot be matched in
            ``contig_base_lengths`` under any of the tried spellings.
        :raises FileNotFoundError: if the file does not exist after writing.
        """
        with open(gff_file_path, 'w') as gff_handle:
            for feature in json:
                if feature['feature_type'].strip().upper() != 'GENE':
                    continue

                location = feature['location'][0]
                start = int(location['start'])
                end = start + int(location['length'])
                contig_id = str(location['contig_id'])

                metainfo = "ID=" + feature['feature_id']
                # A missing 'function' key is treated like an empty function.
                if feature.get('function'):
                    metainfo += ';FUNCTION=' + feature['function']
                # Ontology terms are intentionally not written for now.

                # TODO: Fix Plink reassignment of Chr prefixes
                # Contig ids may differ between genome and assembly only by
                # capitalisation or a 'Chr'/'Chr0' prefix; try each spelling.
                candidates = (contig_id,
                              contig_id.capitalize(),
                              'Chr' + contig_id,
                              'Chr0' + contig_id)
                for candidate in candidates:
                    if candidate in contig_base_lengths:
                        global_pos = int(contig_base_lengths[candidate]) + start
                        break
                else:
                    pp(contig_base_lengths)
                    pp(contig_id)
                    raise KeyError(contig_id)

                columns = [str(location['contig_id']), 'KBase', 'gene',
                           str(location['start']), str(end), '.',
                           str(location['strand']), str(global_pos),
                           str(metainfo)]
                gff_handle.write('\t'.join(columns) + '\n')

        if os.path.exists(gff_file_path):
            return gff_file_path
        raise FileNotFoundError('Unable to create GFF file form genome JSON.')

    def _process_tabix_results(self, queryresult):
        """Extract gene id and function from one tabix result row.

        Column 8 of ``queryresult`` holds the ';'-separated attributes as
        written by ``_construct_gff_from_json``: the first entry with its
        3-character 'ID=' prefix, the optional second entry with its
        9-character 'FUNCTION=' prefix.

        :param queryresult: indexable tabix result row.
        :returns: list ``[gene_id, "NA", function]``; ``function`` is "NA"
            when the row has no second attribute.
        """
        queryinfo = queryresult[8].split(';')
        gene_id = clean_tsv_data(queryinfo[0][3:])
        if len(queryinfo) >= 2:
            function = clean_tsv_data(queryinfo[1][9:])
        else:
            function = "NA"
        return [gene_id, "NA", function]

    def find_gene_info(self, row):
        """Look up the gene overlapping (or neighbouring) one GWAS result row.

        The position is queried against ``self.sorted_gff`` using the
        chromosome name as given, then prefixed with 'chr', then with 'chr0'.
        If none of these hits, a window of 500 bases on either side of the
        position is searched (with the unprefixed name) and a hit is reported
        as a neighbouring gene.

        :param row: mapping-like row with 'CHR' and 'POS' entries.
        :returns: ``pd.Series`` indexed by 'GENEID', 'NEIGHBORGENE', 'FUNCTION';
            all 'NA' when nothing is found.
        """
        columns = ['GENEID', 'NEIGHBORGENE', 'FUNCTION']
        chrom = row["CHR"]
        pos = int(row["POS"])

        # str() so numeric chromosome names (as parsed by pandas) can be prefixed.
        for name in (chrom, 'chr' + str(chrom), 'chr0' + str(chrom)):
            hit = next(tabix_query(self.sorted_gff, name, pos, pos), None)
            if hit is not None:
                return pd.Series(self._process_tabix_results(hit), index=columns)

        window_start = max(pos - 500, 0)
        neighbour = next(
            tabix_query(self.sorted_gff, chrom, window_start, pos + 500), None)
        if neighbour is None:
            return pd.Series(['NA', 'NA', 'NA'], index=columns)

        gene_id, _, function = self._process_tabix_results(neighbour)
        # A window hit is a neighbouring gene, so the id moves to NEIGHBORGENE.
        return pd.Series(['NA', gene_id, function], index=columns)

    def get_gwas_result_file(self, association_ref, association_name, p_value):
        """Write the association results passing ``p_value`` to a TSV file.

        The file has the header 'CHR SNP POS P BP' (tab separated); BP repeats
        the POS value. Results whose p-value (4th entry) is greater than
        ``p_value`` are left out.

        :param association_ref: workspace reference of the association object.
        :param association_name: file name to use inside ``self.genome_dir``.
        :param p_value: inclusive upper bound for reported p-values.
        :returns: path of the written file.
        """
        association_obj = self.dfu.get_objects(
            {'object_refs': [association_ref]})['data'][0]
        association_results = \
            association_obj['data']["association_details"][0]["association_results"]

        lines = ["CHR\tSNP\tPOS\tP\tBP\n"]
        for variation in association_results:
            if float(variation[3]) > float(p_value):
                continue
            lines.append(
                "\t".join(str(variation[i]) for i in (0, 1, 2, 3, 2)) + "\n")

        filepath = os.path.join(self.genome_dir, association_name)
        with open(filepath, "w") as result_handle:
            result_handle.write("".join(lines))
        return filepath

    def build_featureset(self, filepath, genome_ref, description, workspace_name,
                         association_name, prefix):
        """Save the genes listed in an annotated results file as a FeatureSet.

        Columns 6 and 7 (GENEID and NEIGHBORGENE) of every line are collected,
        skipping the header names and 'NA'; each gene is recorded once, in
        order of first appearance.

        :param filepath: annotated, tab-separated GWAS results file.
        :param genome_ref: genome reference stored with every element.
        :param description: FeatureSet description.
        :param workspace_name: workspace to save the object in.
        :param association_name: appended to ``prefix`` to form the object name.
        :param prefix: object name prefix.
        :returns: reference of the saved object as 'wsid/objid/version'.
        """
        skip_words = ("GENEID", "NEIGHBORGENE", "NA")
        element_ordering = []
        elements = {}

        with open(filepath, 'r') as reader:
            for line in reader:
                fields = line.split("\t")
                # Checked one after the other so a gene appearing in both
                # columns of the same row is still only listed once.
                for gene in (fields[5], fields[6]):
                    if gene not in skip_words and gene not in elements:
                        element_ordering.append(gene)
                        elements[gene] = [genome_ref]

        featureset = {
            'description': description,
            'element_ordering': element_ordering,
            'elements': elements,
        }

        save_info = self.dfu.save_objects({
            'id': self.dfu.ws_name_to_id(workspace_name),
            'objects': [{
                'type': 'KBaseCollections.FeatureSet',
                'data': featureset,
                'name': prefix + str(association_name)
            }]
        })[0]

        return "{0}/{1}/{2}".format(save_info[6], save_info[0], save_info[4])

    def annotate_GWAS_results(self, genome_ref, association_ref, workspace_name,
                              prefix, p_value):
        """Annotate association results with genes and save them as a FeatureSet.

        A per-genome directory is created under the scratch folder. If it does
        not yet contain 'out.gff.gz', the genome features are fetched, written
        as GFF, sorted, compressed and indexed. The association results that
        pass ``p_value`` are then annotated row by row and written to
        'final_<association name>' next to the raw results file.

        :param genome_ref: workspace reference of the genome.
        :param association_ref: workspace reference of the association object.
        :param workspace_name: workspace to save the FeatureSet in.
        :param prefix: FeatureSet object name prefix.
        :param p_value: inclusive upper bound for reported p-values.
        :returns: reference of the saved FeatureSet object.
        """
        # TODO: Send outfile to prep gff function instead of hard-coding it
        # TODO: Remove hard coded stuff and create new directory for each test function
        self.genome_dir_name = "_".join(genome_ref.split("/"))
        self.genome_dir = os.path.join(self.shared_folder, self.genome_dir_name)
        os.makedirs(self.genome_dir, exist_ok=True)

        sorted_gff_path = os.path.join(self.genome_dir, 'out.gff.gz')
        self.sorted_gff = sorted_gff_path

        if not os.path.exists(sorted_gff_path):
            # First ask how many features exist so all of them can be fetched.
            feature_num = self.gsu.search({'ref': genome_ref})['num_found']

            # get genome features for gff construction
            genome_features = self.gsu.search({
                'ref': genome_ref,
                'limit': feature_num,
            })['features']

            assembly_ref = self.wsc.get_object_subset([{
                'included': ['/assembly_ref'],
                'ref': genome_ref
            }])[0]['data']['assembly_ref']

            # get assembly contigs for base length calculations
            assembly_contigs = self.wsc.get_object_subset([{
                'included': ['/contigs'],
                'ref': assembly_ref
            }])[0]['data']['contigs']

            # Each contig's offset is the summed length of the contigs that
            # sort before it.
            contig_base_lengths = {}
            prev_length = 0
            for contig in sorted(assembly_contigs):
                contig_base_lengths[contig] = prev_length
                prev_length += assembly_contigs[contig]['length']

            gff_file = os.path.join(self.genome_dir, 'constructed.gff')
            constructed_gff = self._construct_gff_from_json(
                genome_features, gff_file, contig_base_lengths)
            self.sorted_gff = self._prep_gff(constructed_gff)
            tabix_index(self.sorted_gff)

        obj_info = self.wsc.get_object_info3({"objects": [{"ref": association_ref}]})
        association_name = obj_info["infos"][0][1]

        gwas_results_file = self.get_gwas_result_file(
            association_ref, association_name, p_value)

        gwas_results = pd.read_csv(gwas_results_file, sep='\t')
        gwas_results[['GENEID', 'NEIGHBORGENE', 'FUNCTION']] = \
            gwas_results.apply(self.find_gene_info, axis=1)

        new_results_path = os.path.join(
            os.path.dirname(os.path.abspath(gwas_results_file)),
            'final_' + association_name)
        gwas_results.to_csv(path_or_buf=new_results_path, sep='\t', index=False)

        description = "Genelist for GWAS results of trait " + association_name
        return self.build_featureset(
            new_results_path, genome_ref, description, workspace_name,
            association_name, prefix)
class sample_uploader:
    '''
    Module Name:
    sample_uploader

    Module Description:
    A KBase module: sample_uploader
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.12"
    GIT_URL = "https://github.com/kbaseapps/sample_uploader"
    GIT_COMMIT_HASH = "5134b679279c84128b0ca5b684fa75dacf7dba59"

    #BEGIN_CLASS_HEADER
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.workspace_url = config['workspace-url']
        self.scratch = config['scratch']
        # janky, but works for now
        self.sw_url = config.get('kbase-endpoint') + '/service_wizard'
        self.dfu = DataFileUtil(url=self.callback_url)
        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)
        #END_CONSTRUCTOR
        pass

    def import_samples(self, ctx, params):
        """
        Import samples from a file, save them as a KBaseSets.SampleSet object
        (new, or updating the one given by "sample_set_ref") and create a
        report, optionally with an OTU template and a copy of the input file.

        :param params: instance of type "ImportSampleInputs" -> structure:
           parameter "sample_set_ref" of String, parameter "sample_file" of
           String, parameter "workspace_name" of String, parameter
           "workspace_id" of Long, parameter "file_format" of String,
           parameter "description" of String, parameter "set_name" of String,
           parameter "header_row_index" of Long, parameter "id_field" of
           String, parameter "output_format" of String, parameter
           "taxonomy_source" of String, parameter "num_otus" of Long,
           parameter "incl_seq" of Long, parameter "otu_prefix" of String,
           parameter "share_within_workspace" of Long
        :returns: instance of type "ImportSampleOutputs" -> structure:
           parameter "report_name" of String, parameter "report_ref" of
           String, parameter "sample_set" of type "SampleSet" -> structure:
           parameter "samples" of list of type "sample_info" -> structure:
           parameter "id" of type "sample_id", parameter "name" of String,
           parameter "description" of String, parameter "sample_set_ref" of
           String
        :raises ValueError: if no "set_name" is given for a new sample set,
           if "file_format" is not one of ENIGMA, SESAR or KBASE, or if the
           input file requested for the report cannot be found.
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN import_samples
        print(f"Beginning sample import with following parameters:")
        print(f"params -- {params}")
        sample_set = {"samples": []}
        if params.get('sample_set_ref'):
            # Updating an existing set: reuse its data, name and workspace.
            ret = self.dfu.get_objects(
                {'object_refs': [params['sample_set_ref']]})['data'][0]
            sample_set = ret['data']
            set_name = ret['info'][1]
            save_ws_id = params['sample_set_ref'].split('/')[0]
        else:
            if not params.get('set_name'):
                raise ValueError(
                    f"Sample set name required, when new SampleSet object is created."
                )
            set_name = params['set_name']
            save_ws_id = params.get('workspace_id')

        file_format = params.get('file_format')
        if params.get('header_row_index'):
            # We subtract by 1 for zero indexing.
            header_row_index = int(params["header_row_index"]) - 1
        else:
            # Default header row: second row for SESAR files, first otherwise.
            header_row_index = 1 if file_format == "SESAR" else 0

        username = ctx['user_id']

        # Pick the column handling for the format; KBASE files need none.
        # The mapping tables are only looked up for the format that uses them.
        if file_format == 'ENIGMA':
            mappings = ENIGMA_mappings
        elif file_format == 'SESAR':
            mappings = SESAR_mappings
        elif file_format == 'KBASE':
            mappings = None
        else:
            raise ValueError(
                f"Only SESAR and ENIGMA formats are currently supported for importing samples. "
                f"File of format {file_format} not supported.")

        if mappings is None:
            column_mapping, groups, date_columns, column_unit_regex = {}, [], [], []
        else:
            column_mapping = mappings['column_mapping']
            groups = mappings.get('groups', [])
            date_columns = mappings['date_columns']
            column_unit_regex = mappings.get('column_unit_regex', [])

        sample_set = import_samples_from_file(
            params,
            self.sw_url,
            self.workspace_url,
            username,
            ctx['token'],
            column_mapping,
            groups,
            date_columns,
            column_unit_regex,
            sample_set,
            header_row_index)

        obj_info = self.dfu.save_objects({
            'id': save_ws_id,
            'objects': [{
                "name": set_name,
                "type": "KBaseSets.SampleSet",
                "data": sample_set
            }]
        })[0]

        sample_set_ref = '/'.join(
            [str(obj_info[6]), str(obj_info[0]), str(obj_info[4])])
        sample_file_name = os.path.basename(
            params['sample_file']).split('.')[0] + '_OTU'

        # -- Format outputs below --
        # if output file format specified, add one to output
        if params.get('output_format') in ['csv', 'xls']:
            otu_path = sample_set_to_OTU_sheet(sample_set, sample_file_name,
                                               self.scratch, params)
            file_links = [{
                'path': otu_path,
                'name': os.path.basename(otu_path),
                'label': "OTU template file",
                'description': "file with each column containing the assigned sample_id and sample "
                               "name of each saved sample. Intended for uploading OTU data."
            }]
        else:
            file_links = []

        if params.get('incl_input_in_output'):
            sample_file = params.get('sample_file')
            if not os.path.isfile(sample_file):
                # try prepending '/staging/' to file and check then
                if os.path.isfile(os.path.join('/staging', sample_file)):
                    sample_file = os.path.join('/staging', sample_file)
                else:
                    raise ValueError(
                        f"input file {sample_file} does not exist.")
            sample_file_copy = os.path.join(self.scratch,
                                            os.path.basename(sample_file))
            shutil.copy(sample_file, sample_file_copy)
            file_links.append({
                "path": sample_file_copy,
                "name": os.path.basename(sample_file_copy),
                "label": "Input Sample file",
                "description": "Input file provided to create the sample set."
            })

        # create report
        report_client = KBaseReport(self.callback_url)
        report_name = "SampleSet_import_report_" + str(uuid.uuid4())
        report_info = report_client.create_extended_report({
            'message': f"SampleSet object named \"{set_name}\" imported.",
            'objects_created': [{
                'ref': sample_set_ref
            }],
            'file_links': file_links,
            'report_object_name': report_name,
            'workspace_name': params['workspace_name']
        })
        output = {
            'report_ref': report_info['ref'],
            'report_name': report_info['name'],
            'sample_set': sample_set,
            'sample_set_ref': sample_set_ref
        }
        #END import_samples

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method import_samples return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def generate_OTU_sheet(self, ctx, params):
        """
        :param params: instance of type "GenerateOTUSheetParams" (Generate a
           customized OTU worksheet using a SampleSet input to generate the
           appropriate columns.) -> structure: parameter "workspace_name" of
           String, parameter "workspace_id" of Long, parameter
           "sample_set_ref" of String, parameter "output_name" of String,
           parameter "output_format" of String, parameter "num_otus" of Long,
           parameter "taxonomy_source" of String, parameter "incl_seq" of
           Long, parameter "otu_prefix" of String
        :returns: instance of type "GenerateOTUSheetOutputs" -> structure:
           parameter "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN generate_OTU_sheet
        # first we download sampleset
        sample_set_ref = params.get('sample_set_ref')
        ret = self.dfu.get_objects({'object_refs': [sample_set_ref]})['data'][0]
        sample_set = ret['data']
        # if output_name not specified use name of sample_set as output + "_OTUs"
        output_name = params.get('output_name') or ret['info'][1] + "_OTUs"
        otu_path = sample_set_to_OTU_sheet(sample_set, output_name,
                                           self.scratch, params)
        report_client = KBaseReport(self.callback_url)
        report_name = "Generate_OTU_sheet_report_" + str(uuid.uuid4())
        report_info = report_client.create_extended_report({
            'file_links': [{
                'path': otu_path,
                'name': os.path.basename(otu_path),
                'label': "CSV with headers for OTU",
                'description': "CSV file with each column containing the assigned sample_id and sample "
                               "name of each saved sample. Intended for uploading OTU data."
            }],
            'report_object_name': report_name,
            'workspace_name': params['workspace_name']
        })
        output = {
            'report_ref': report_info['ref'],
            'report_name': report_info['name'],
        }
        #END generate_OTU_sheet

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method generate_OTU_sheet return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def update_sample_set_acls(self, ctx, params):
        """
        Apply one set of ACLs to every sample of a sample set.

        :param params: instance of type "update_sample_set_acls_params" ->
           structure: parameter "workspace_name" of String, parameter
           "workspace_id" of Long, parameter "sample_set_ref" of String,
           parameter "new_users" of list of String, parameter "is_reader" of
           Long, parameter "is_writer" of Long, parameter "is_admin" of Long,
           parameter "share_within_workspace" of Long
        :returns: instance of type "update_sample_set_acls_output" ->
           structure: parameter "status" of String (the status returned for
           the last updated sample; empty when the set has no samples)
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN update_sample_set_acls

        # first get sample_set object
        sample_set_ref = params.get('sample_set_ref')
        ret = self.dfu.get_objects({'object_refs': [sample_set_ref]})['data'][0]
        sample_set = ret['data']
        sample_url = get_sample_service_url(self.sw_url)

        acls = {'read': [], 'write': [], 'admin': []}

        if params.get('share_within_workspace'):
            acls = get_workspace_user_perms(self.workspace_url,
                                            params.get('workspace_id'),
                                            ctx['token'], ctx['user_id'], acls)

        # Every new user gets the single highest permission that was requested.
        if params.get('is_admin'):
            level = 'admin'
        elif params.get('is_writer'):
            level = 'write'
        elif params.get('is_reader'):
            level = 'read'
        else:
            level = None
        if level is not None:
            # "new_users" may be absent; treat that as no users to add.
            acls[level].extend(params.get('new_users') or [])

        # Stays empty when the sample set has no samples to update.
        status = ""
        for sample in sample_set['samples']:
            status = update_acls(sample_url, sample['id'], acls, ctx['token'])
        output = {"status": status}
        #END update_sample_set_acls

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method update_sample_set_acls return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def export_samples(self, ctx, params):
        """
        :param params: instance of type "ExportParams" (export function for
           samples) -> structure: parameter "input_ref" of String, parameter
           "file_format" of String
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        :raises ValueError: if "input_ref" is missing or empty.
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_samples
        if not params.get('input_ref'):
            raise ValueError(f"variable input_ref required")
        sample_set_ref = params.get('input_ref')
        output_file_format = params.get('file_format', 'SESAR')

        ret = self.dfu.get_objects({'object_refs': [sample_set_ref]})['data'][0]
        sample_set = ret['data']
        sample_set_name = ret['info'][1]
        sample_url = get_sample_service_url(self.sw_url)

        export_package_dir = os.path.join(self.scratch, "output")
        if not os.path.isdir(export_package_dir):
            os.mkdir(export_package_dir)
        # Whitespace in the set name is replaced by underscores for the file name.
        output_file = os.path.join(export_package_dir,
                                   '_'.join(sample_set_name.split()) + ".csv")

        sample_set_to_output(sample_set, sample_url, ctx['token'],
                             output_file, output_file_format)

        # package it up
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [params['input_ref']]
        })

        output = {
            'shock_id': package_details['shock_id'],
            'result_dir': export_package_dir
        }
        #END export_samples

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method export_samples return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def link_reads(self, ctx, params):
        """
        Create links between samples and reads objects
        :param params: instance of mapping from String to unspecified object
           (read here: "sample_set_ref", "workspace_name" and "links", a list
           of mappings with "sample_name" and "reads_ref")
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN link_reads
        ss = SampleService(self.sw_url, token=ctx['token'], service_ver='beta')

        sample_set_ref = params['sample_set_ref']
        sample_set = SampleSet(self.dfu, sample_set_ref)
        links = [(d['sample_name'], d['reads_ref']) for d in params['links']]

        for sample_name, reads_ref in links:
            node_id, version, sample_id = sample_set.get_sample_info(
                sample_name)
            ss.create_data_link(dict(
                upa=reads_ref,
                id=sample_id,
                version=version,
                node=node_id,
                update=1,
            ))

        report_client = KBaseReport(self.callback_url)
        report_info = report_client.create_extended_report({
            'workspace_name': params['workspace_name'],
        })
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
        }
        #END link_reads

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method link_reads return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]