def fetch_narrative_data(endpt: str, token: str, ws_id: int, outdir: str) -> int:
    ws = Workspace(url=endpt + "ws", token=token)
    ws_info = ws.get_workspace_info({"id": ws_id})
    ws_meta = ws_info[8]

    # Narrative object
    narr_id = ws_meta["narrative"]
    narr_obj = ws.get_objects2(
        {"objects": [{"ref": f"{ws_id}/{narr_id}"}]})["data"][0]
    narr_ver = narr_obj["info"][4]
    narr_outpath = os.path.join(
        outdir, f"narrative-{ws_id}.{narr_id}.{narr_ver}.json")
    with open(narr_outpath, "w") as fout:
        json.dump(narr_obj, fout, indent=4)

    # Report objects
    for cell in narr_obj["data"]["cells"]:
        if "kbase" in cell["metadata"]:
            meta = cell["metadata"]["kbase"]
            if "appCell" in meta:
                # jobState may be missing or None; default to an empty dict so
                # the membership tests below can't raise a TypeError
                job_state = meta["appCell"].get("exec", {}).get("jobState") or {}
                result = []
                if "result" in job_state:
                    result = job_state["result"]
                elif "job_output" in job_state and "result" in job_state["job_output"]:
                    result = job_state["job_output"]["result"]
                if len(result) > 0 and "report_ref" in result[0]:
                    report_data = ws.get_objects2(
                        {"objects": [{"ref": result[0]["report_ref"]}]})["data"][0]
                    report_info = report_data["info"]
                    ref_dots = f"{report_info[6]}.{report_info[0]}.{report_info[4]}"
                    report_path = os.path.join(outdir, f"report-{ref_dots}.json")
                    with open(report_path, "w") as fout:
                        json.dump(report_data, fout, indent=4)

    # List objects results
    service = NarrativeService(url=endpt + "service_wizard", token=token)
    # service = ServiceClient(url=endpt + "service_wizard", use_url_lookup=True, token=token)
    ws_data = service.list_objects_with_sets({
        "ws_id": ws_id,
        "includeMetadata": 1
    })
    # ws_data = service.sync_call(
    #     "NarrativeService.list_objects_with_sets",
    #     [{"ws_id": ws_id, "includeMetadata": 1}]
    # )[0]
    data_outpath = os.path.join(outdir, f"objects-{ws_id}.json")
    with open(data_outpath, "w") as fout:
        json.dump(ws_data, fout, indent=4)
    return 0
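# Usage sketch for fetch_narrative_data (illustrative only, not from the
# original source). The services root URL and the workspace id 12345 are
# placeholders; KB_AUTH_TOKEN is the usual KBase token environment variable.
import os
import tempfile

endpoint = "https://kbase.us/services/"        # placeholder services root
token = os.environ.get("KB_AUTH_TOKEN", "")
dump_dir = tempfile.mkdtemp(prefix="narrative_dump_")
# rc = fetch_narrative_data(endpoint, token, 12345, dump_dir)
# print(f"wrote narrative, report, and object listings to {dump_dir}")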
def read_narrative(ref: NarrativeRef, ws_client: Workspace) -> Dict:
    """
    Fetches a Narrative object from the Workspace and returns its document data.

    This is mainly a wrapper around Workspace.get_objects2() that validates the
    fetched object's type and returns only the 'data' portion of the object.

    Can raise the following errors:
        ValueError - if ref doesn't point at a Narrative object
        WorkspaceError - if there's a Workspace issue (e.g. ref isn't valid, or
            the token isn't valid)

    :param ref: a NarrativeRef identifying the Narrative to fetch
    :param ws_client: an authenticated Workspace client
    """
    try:
        narr_data = ws_client.get_objects2({'objects': [{'ref': str(ref)}]})
        nar = narr_data['data'][0]
        _validate_narr_type(nar['info'][2], ref)
        # nar['data'] = update_narrative(nar['data'])
        return nar['data']
    except ServerError as err:
        raise WorkspaceError(err, ref.wsid)
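# Illustrative wrapper (not from the original source) showing how the error
# contract of read_narrative above can be consumed; `ref` is assumed to be a
# NarrativeRef built elsewhere in the codebase.
def count_narrative_cells(ref, ws_client):
    """Return the narrative's cell count, or None if it can't be fetched."""
    try:
        narrative = read_narrative(ref, ws_client)
        return len(narrative.get("cells", []))
    except ValueError:
        print(f"{ref} does not point at a Narrative object")
    except WorkspaceError as err:
        print(f"workspace error while fetching {ref}: {err}")
    return None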
def load_fastas(config, scratch, upa):
    '''
    Given a reference to a Genome, GenomeSet, ContigSet, or Assembly object,
    return a list of (fasta_path, upa) tuples for the underlying assemblies.
    '''
    dfu = DataFileUtil(config['callback_url'])
    au = AssemblyUtil(config['callback_url'])
    ws = Workspace(config['workspace-url'])

    obj_data = dfu.get_objects({"object_refs": [upa]})['data'][0]
    obj_type = obj_data['info'][2]

    if 'KBaseSets.GenomeSet' in obj_type:
        upas = [gsi['ref'] for gsi in obj_data['data']['items']]
    elif 'KBaseSearch.GenomeSet' in obj_type:
        upas = [gse['ref'] for gse in obj_data['data']['elements'].values()]
    elif "KBaseGenomes.Genome" in obj_type:
        upas = [upa]
    elif "KBaseGenomes.ContigSet" in obj_type or "KBaseGenomeAnnotations.Assembly" in obj_type:
        # in this case we use the assembly file util to get the fasta file
        faf = au.get_assembly_as_fasta({"ref": upa})
        return [(faf['path'], upa)]

    fasta_paths = []
    for genome_upa in upas:
        # build a reference path through the set object so the genome stays
        # accessible even without direct access to it
        if upa != genome_upa:
            genome_upa = upa + ';' + genome_upa
        genome_data = ws.get_objects2(
            {'objects': [{"ref": genome_upa}]})['data'][0]['data']
        target_upa = genome_data.get('contigset_ref') or genome_data.get('assembly_ref')
        assembly_upa = genome_upa + ';' + target_upa
        faf = au.get_assembly_as_fasta({"ref": assembly_upa})
        fasta_paths.append((faf['path'], assembly_upa))

    return fasta_paths
def build_bin_summary_file_from_binnedcontigs_obj(self, input_ref, bin_dir,
                                                  bin_basename, fasta_extension):

    # read bin info from obj
    ws = Workspace(self.ws_url)
    try:
        binned_contig_obj = ws.get_objects2(
            {'objects': [{'ref': input_ref}]})['data'][0]['data']
    except Exception as e:
        raise ValueError('Unable to fetch ' + str(input_ref) +
                         ' object from workspace: ' + str(e))
        # to get the full stack trace: traceback.format_exc()
    bin_summary_info = dict()

    # bid in object is full name of contig fasta file. want just the number
    for bin_item in binned_contig_obj['bins']:
        #print ("BIN_ITEM[bid]: "+bin_item['bid'])  # DEBUG
        bin_ID = re.sub(r'^[^.]+\.', '',
                        bin_item['bid'].replace('.' + fasta_extension, ''))
        #print ("BIN_ID: "+bin_ID)  # DEBUG
        bin_summary_info[bin_ID] = {
            'n_contigs': bin_item['n_contigs'],
            'gc': round(100.0 * float(bin_item['gc']), 1),
            'sum_contig_len': bin_item['sum_contig_len'],
            'cov': round(100.0 * float(bin_item['cov']), 1)
        }

    # write summary file for just those bins present in bin_dir
    header_line = ['Bin name', 'Completeness', 'Genome size', 'GC content']
    bin_fasta_files_by_bin_ID = self.get_bin_fasta_files(bin_dir, fasta_extension)
    bin_IDs = []
    for bin_ID in sorted(bin_fasta_files_by_bin_ID.keys()):
        bin_ID = re.sub(r'^[^.]+\.', '',
                        bin_ID.replace('.' + fasta_extension, ''))
        bin_IDs.append(bin_ID)
    summary_file_path = os.path.join(bin_dir, bin_basename + '.' + 'summary')

    print("writing filtered binned contigs summary file " + summary_file_path)
    with open(summary_file_path, 'w') as summary_file_handle:
        print("\t".join(header_line))
        summary_file_handle.write("\t".join(header_line) + "\n")
        for bin_ID in bin_IDs:
            #print ("EXAMINING BIN SUMMARY INFO FOR BIN_ID: "+bin_ID)  # DEBUG
            bin_summary_info_line = [
                bin_basename + '.' + str(bin_ID) + '.' + fasta_extension,
                str(bin_summary_info[bin_ID]['cov']) + '%',
                str(bin_summary_info[bin_ID]['sum_contig_len']),
                str(bin_summary_info[bin_ID]['gc'])
            ]
            print("\t".join(bin_summary_info_line))
            summary_file_handle.write("\t".join(bin_summary_info_line) + "\n")

    return summary_file_path
def load_fastas(config, scratch: str, upa: str):
    '''
    Returns list of (fasta_path, upa)
    '''
    dfu = DataFileUtil(config['callback_url'])
    au = AssemblyUtil(config['callback_url'])
    mgu = MetagenomeUtils(config['callback_url'])
    ws = Workspace(config['workspace-url'])

    obj_data = dfu.get_objects({"object_refs": [upa]})['data'][0]
    obj_type = obj_data['info'][2]

    if 'KBaseSets.GenomeSet' in obj_type:
        upas = [gsi['ref'] for gsi in obj_data['data']['items']]
    elif 'KBaseSearch.GenomeSet' in obj_type:
        upas = [gse['ref'] for gse in obj_data['data']['elements'].values()]
    elif "KBaseGenomes.Genome" in obj_type:
        upas = [upa]
    elif "KBaseGenomes.ContigSet" in obj_type or "KBaseGenomeAnnotations.Assembly" in obj_type:
        # in this case we use the assembly file util to get the fasta file
        faf = au.get_assembly_as_fasta({"ref": upa})
        return [(faf['path'], upa)]
    elif "KBaseSets.AssemblySet" in obj_type:
        fasta_paths = []
        for item_upa in obj_data['data']['items']:
            faf = au.get_assembly_as_fasta({"ref": item_upa['ref']})
            fasta_paths.append((faf['path'], item_upa['ref']))
        return fasta_paths
    elif 'KBaseMetagenomes.BinnedContigs' in obj_type:
        fasta_paths = []
        bin_file_dir = mgu.binned_contigs_to_file({
            'input_ref': upa,
            'save_to_shock': 0
        })['bin_file_directory']
        for (dirpath, dirnames, filenames) in os.walk(bin_file_dir):
            for fasta_file in filenames:
                fasta_path = os.path.join(scratch, fasta_file)
                fasta_path = os.path.splitext(fasta_path)[0] + ".fa"
                copyfile(os.path.join(bin_file_dir, fasta_file), fasta_path)
                # Should I verify that the bins have contigs?
                # is it possible to have empty bins?
                fasta_paths.append((fasta_path, upa))
            break
        return fasta_paths
    else:
        raise ValueError('Input genome/metagenome reference has unhandled type: ' + obj_type)

    fasta_paths = []
    for genome_upa in upas:
        genome_data = ws.get_objects2(
            {'objects': [{"ref": genome_upa}]})['data'][0]['data']
        assembly_upa = genome_upa + ';' + str(
            genome_data.get('contigset_ref') or genome_data.get('assembly_ref'))
        faf = au.get_assembly_as_fasta({'ref': assembly_upa})
        fasta_paths.append((faf['path'], assembly_upa))

    return fasta_paths
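# Usage sketch for load_fastas (illustrative only). The config keys match the
# ones the function reads; the URLs and the UPA "123/4/5" are placeholders.
example_config = {
    'callback_url': os.environ.get('SDK_CALLBACK_URL', ''),
    'workspace-url': 'https://kbase.us/services/ws',   # placeholder
}
# for fasta_path, ref in load_fastas(example_config, '/kb/module/work/tmp', '123/4/5'):
#     print(f"{ref} -> {fasta_path}")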
def check_assembly_cache(self, ref, token): ws = Workspace(self.ws_url, token=token) info = ws.get_object_info3({"objects": [{"ref": ref}]})['infos'][0] inner_chsum = info[8] index_file = os.path.join(self.assembly_index_dir, inner_chsum + self.ASSEMBLY_SUFFIX + ".tsv.gz") if not os.path.isfile(index_file): if self.debug: print(" Loading WS object...") t1 = time.time() if 'KBaseGenomeAnnotations.Assembly' in info[2]: included = ["/contigs"] assembly_data = ws.get_objects2( {'objects': [{'ref': ref, 'included': included}]})['data'][0]['data'] contigs = list(assembly_data['contigs'].values()) self.save_assembly_tsv(contigs, inner_chsum) elif 'KBaseGenomes.ContigSet' in info[2]: included = ["/contigs/[*]/id", "/contigs/[*]/length", "/contigs/[*]/md5", "/contigs/[*]/description"] cs_data = ws.get_objects2( {'objects': [{'ref': ref, 'included': included}]})['data'][0]['data'] contigs = [] for c in cs_data['contigs']: this_contig_data = {'contig_id': ''} if 'id' in c: this_contig_data['contig_id'] = c['id'] if 'md5' in c: this_contig_data['md5'] = c['md5'] if 'length' in c: this_contig_data['length'] = c['length'] if 'description' in c: this_contig_data['description'] = c['description'] contigs.append(this_contig_data) self.save_assembly_tsv(contigs, inner_chsum) else: raise ValueError('The "ref" is not an Assembly or ContigSet data object. ' 'It was a ' + info[2]) if self.debug: print(f" (time={time.time() - t1})") return inner_chsum
def read_assembly_ref_from_binnedcontigs(self, input_ref): ws = Workspace(self.ws_url) try: binned_contig_obj = ws.get_objects2({'objects':[{'ref':input_ref}]})['data'][0]['data'] except Exception as e: raise ValueError('Unable to fetch '+str(input_ref)+' object from workspace: ' + str(e)) #to get the full stack trace: traceback.format_exc() return binned_contig_obj['assembly_ref']
def search_orthologs_from_pangenome(self, token, ref, query, sort_by, start, limit, num_found): search_object = 'orthologs' info_included = [ 'id', 'type', 'function', 'md5', 'protein_translation', 'orthologs' ] table_indexer = TableIndexer(token, self.ws_url) ret = table_indexer.run_search(ref, self.pangenome_index_dir, self.ORTHOLOGS_SUFFIX, search_object, info_included, query, sort_by, start, limit, num_found, self.debug) for orthologs in ret['orthologs']: orthologs_string = orthologs['orthologs'] if orthologs_string: orthologs['orthologs'] = list(eval(orthologs_string)) if not isinstance(orthologs['orthologs'][0], list): orthologs['orthologs'] = [orthologs['orthologs']] ws = Workspace(self.ws_url, token=token) genome_feature_function_map = {} for orthologs in ret['orthologs']: for orthologs_obj in orthologs['orthologs']: gene_id = orthologs_obj[0] if gene_id in genome_feature_function_map: orthologs_obj.append( genome_feature_function_map.get(gene_id)) else: included = ["/features/[*]/function", "/features/[*]/id"] object_info = ws.get_objects2({ 'objects': [{ 'ref': orthologs_obj[2], 'included': included }] })['data'][0]['data'] for feature in object_info['features']: genome_feature_function_map.update( {feature.get('id'): feature.get('function')}) orthologs_obj.append( genome_feature_function_map.get(gene_id)) return ret
def check_object_cache(self, ref, search_object, info_included, index_dir,
                       object_suffix, debug):
    ws = Workspace(self.ws_url, token=self.token)
    info = ws.get_object_info3({"objects": [{"ref": ref}]})['infos'][0]
    inner_chsum = info[8]
    index_file = os.path.join(index_dir,
                              inner_chsum + object_suffix + ".tsv.gz")
    if not os.path.isfile(index_file):
        if debug:
            print("    Loading WS object...")
            t1 = time.time()
        included = self.build_info_included(search_object, info_included)
        # 'obj' rather than 'object', so the builtin isn't shadowed
        obj = ws.get_objects2({'objects': [{'ref': ref,
                                            'included': included}]})['data'][0]['data']
        self.save_object_tsv(obj[search_object], inner_chsum, info_included,
                             index_dir, object_suffix)
        if debug:
            print("    (time=" + str(time.time() - t1) + ")")
    return inner_chsum
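# The cache helpers above share one idea: key the on-disk index file off the
# object's stored checksum (info[8]) so it is rebuilt only when the object's
# bytes change. A minimal stand-alone version of that pattern (illustrative,
# stdlib only; `build` is a caller-supplied writer function):
import os

def cached_index_path(index_dir, checksum, suffix, build):
    """Return the path of a checksum-keyed index file, building it on a miss."""
    path = os.path.join(index_dir, checksum + suffix + ".tsv.gz")
    if not os.path.isfile(path):
        os.makedirs(index_dir, exist_ok=True)
        build(path)   # writes the index file; skipped on subsequent calls
    return path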
def fetch_pangenome_summary( pangenome_ref: str, workspace_url: str, token: str) -> dict: """ Construct a summary data object for a single pangenome, used in the "simple_summary" method. Args: pangenome_ref: Workspace reference to the pangenome object workspace_url: URL of the Workspace being used in the current env token: authorization token for fetching the data Returns: A python object adhering to the SimpleSummaryResult type in PanGenomeAPI.spec """ ws_client = Workspace(workspace_url, token=token) # Download the full pangenome workspace dataset resp = ws_client.get_objects2({ 'objects': [{'ref': pangenome_ref}] }) data = resp['data'][0]['data'] # Fetch the object infos for each genome genome_refs = [{"ref": ref} for ref in data["genome_refs"]] genome_infos = ws_client.get_object_info3({ "objects": genome_refs, "includeMetadata": 1 })["infos"] name_mapping = _genome_name_mapping(genome_infos) ret = { "pangenome_id": data["id"], "genomes_count": len(data["genome_refs"]), "genes": _count_genes(data), "families": _count_families(data), "genomes": _genome_counts(data, genome_infos, name_mapping), "shared_family_map": _shared_family_map(data, name_mapping), "genome_ref_name_map": name_mapping, } return ret
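# Usage sketch for fetch_pangenome_summary (illustrative only; the ref and
# URL are placeholders, and the token is read the way KBase SDK apps do).
# summary = fetch_pangenome_summary(
#     pangenome_ref="123/4/5",
#     workspace_url="https://kbase.us/services/ws",
#     token=os.environ.get("KB_AUTH_TOKEN", ""),
# )
# print(summary["pangenome_id"], summary["genomes_count"])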
def fetch_fasta_from_genome(genome_ref, ws_url, callback_url):
    """
    Returns an assembly or contigset as FASTA.
    """
    if not check_ref_type(genome_ref, ['KBaseGenomes.Genome'], ws_url):
        raise ValueError(
            "The given genome_ref {} is not a KBaseGenomes.Genome type!".format(genome_ref))
    # test if genome references an assembly type
    # do get_objects2 without data. get list of refs
    ws = Workspace(ws_url)
    genome_obj_info = ws.get_objects2({
        'objects': [{'ref': genome_ref}],
        'no_data': 1
    })
    # get the list of genome refs from the returned info.
    # if there are no refs (or something funky with the return), this will be an empty list.
    # this WILL fail if data is an empty list. But it shouldn't be, and we know because
    # we have a real genome reference, or get_objects2 would fail.
    genome_obj_refs = genome_obj_info.get('data', [{}])[0].get('refs', [])

    # see which of those are of an appropriate type (ContigSet or Assembly), if any.
    assembly_ref = list()
    ref_params = [{'ref': genome_ref + ";" + x} for x in genome_obj_refs]
    ref_info = ws.get_object_info3({'objects': ref_params})
    for idx, info in enumerate(ref_info.get('infos')):
        if "KBaseGenomeAnnotations.Assembly" in info[2] or "KBaseGenomes.ContigSet" in info[2]:
            assembly_ref.append(";".join(ref_info.get('paths')[idx]))

    if len(assembly_ref) == 1:
        return fetch_fasta_from_assembly(assembly_ref[0], ws_url, callback_url)
    else:
        raise ValueError(
            "Expected exactly one assembly associated with the given genome ref {}, "
            "but found {}. Unable to continue.".format(genome_ref, len(assembly_ref)))
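# Illustrative batch helper (not from the original source) built on
# fetch_fasta_from_genome above; it skips genomes whose assembly can't be
# resolved instead of aborting the whole batch.
def fasta_for_genomes(genome_refs, ws_url, callback_url):
    """Map each genome ref to a FASTA path, skipping unresolvable genomes."""
    paths = {}
    for ref in genome_refs:
        try:
            paths[ref] = fetch_fasta_from_genome(ref, ws_url, callback_url)
        except ValueError as err:
            print(f"skipping {ref}: {err}")
    return paths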
def _attributemapping_index(ws_url, upa, parent_upa):
    """Build a search index document for an AttributeMapping object."""
    ws = Workspace(ws_url)
    obj = ws.get_objects2({'objects': [{
        'ref': parent_upa + ";" + upa
    }]})['data'][0]
    data = obj['data']
    doc = {
        "attributes": [],
        "attribute_ontology_ids": [],
        "attribute_units": [],
        "attribute_unit_ontology_ids": [],
        "attribute_values": [],
        "attribute_value_ontology_ids": [],
        "instances": data['instances'],
        "num_attributes": len(data['attributes']),
        "num_instances": len(data['instances']),
    }
    for attr in data['attributes']:
        doc['attributes'].append(attr['attribute'])
        if 'attribute_ont_id' in attr:
            doc['attribute_ontology_ids'].append(attr['attribute_ont_id'])
        if 'unit' in attr:
            doc['attribute_units'].append(attr['unit'])
        # unit ontology ids live under 'unit_ont_id' in the spec
        if 'unit_ont_id' in attr:
            doc['attribute_unit_ontology_ids'].append(attr['unit_ont_id'])
        if 'categories' in attr:
            doc['attribute_values'].extend(attr['categories'].keys())
            # categories is a mapping; take ontology ids from its values,
            # not its keys
            doc['attribute_value_ontology_ids'].extend(
                x['attribute_ont_id'] for x in attr['categories'].values()
                if 'attribute_ont_id' in x)
    return {
        'doc': doc,
        'sub_id': str(upa_delimeter.join(list(upa.split('/')))),
        'sub_type': "atrrmapping"
    }
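# Shape sketch (illustrative only): for an AttributeMapping with a single
# attribute ("growth_temp", unit "K") and two instances, the loop above
# yields a doc along these lines. This mirrors the code; it is not captured
# output from a real object.
example_doc = {
    "attributes": ["growth_temp"],
    "attribute_ontology_ids": [],
    "attribute_units": ["K"],
    "attribute_unit_ontology_ids": [],
    "attribute_values": [],
    "attribute_value_ontology_ids": [],
    "instances": {"sample1": ["300"], "sample2": ["310"]},
    "num_attributes": 1,
    "num_instances": 2,
}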
class WorkspaceAdminUtils:
    def __init__(self, config):
        wsurl = config.get('workspace-url')
        self.atoken = config.get('workspace-admin-token')
        self.noadmin = False
        if self.atoken is None or self.atoken == '':
            self.noadmin = True
            self.atoken = os.environ.get('KB_AUTH_TOKEN', None)
        self.ws = Workspace(wsurl, token=self.atoken)

    def list_objects(self, params):
        """
        Provide something that acts like a standard listObjects
        """
        if self.noadmin:
            return self.ws.list_objects(params)
        return self.ws.administer({'command': 'listObjects', 'params': params})

    def get_objects2(self, params):
        """
        Provide something that acts like a standard getObjects
        """
        if self.noadmin:
            return self.ws.get_objects2(params)
        return self.ws.administer({'command': 'getObjects', 'params': params})

    def get_workspace_info(self, params):
        """
        Provide something that acts like a standard getWorkspaceInfo
        """
        if self.noadmin:
            return self.ws.get_workspace_info(params)
        return self.ws.administer({
            'command': 'getWorkspaceInfo',
            'params': params
        })
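# Illustrative helper (not from the original source) showing WorkspaceAdminUtils
# in use: count objects by type in one workspace, going through the admin
# interface when an admin token is configured.
from collections import Counter

def count_types(config, ws_id):
    """Tally object types in workspace `ws_id` via WorkspaceAdminUtils."""
    wau = WorkspaceAdminUtils(config)
    infos = wau.list_objects({'ids': [ws_id]})
    return Counter(info[2].split('-')[0] for info in infos)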
def run_generate_metadata_report(self, ctx, params): """ This example function accepts any number of parameters and returns results in a KBaseReport :param params: instance of mapping from String to unspecified object :returns: instance of type "ReportResults" -> structure: parameter "report_name" of String, parameter "report_ref" of String """ # ctx is the context object # return variables are: output #BEGIN run_generate_metadata_report object_type = params['object_type'] workspace_name = params['workspace_name'] ws = Workspace(self.ws_url) print(params) objects_in_workspace = ws.list_objects({ 'workspaces': [workspace_name], 'type': object_type }) object_names = sorted([j[1] for j in objects_in_workspace]) d = dict() if (object_type == 'KBaseRNASeq.RNASeqAlignment'): for object_name in object_names: alignment_stats = ws.get_objects2({ 'objects': [{ 'workspace': workspace_name, 'name': object_name }] })['data'][0]['data']['alignment_stats'] metadata_keys = alignment_stats.keys() object_pd = pd.Series(alignment_stats, index=metadata_keys) d[object_name] = object_pd else: for object_name in object_names: obj_meta_data = ws.get_object_info3( { 'objects': [{ 'workspace': workspace_name, 'name': object_name }], 'includeMetadata': 1 }, ) metadata = obj_meta_data.get('infos')[0][10] metadata_keys = metadata.keys() object_pd = pd.Series(metadata, index=metadata_keys) d[object_name] = object_pd df = pd.DataFrame(d) htmlDir = os.path.join(self.shared_folder, str(uuid.uuid4())) self._mkdir_p(htmlDir) report_file_path = os.path.join(htmlDir, "index.html") #df.to_html(report_file_path) self.write_pd_html(df.T, report_file_path) try: html_upload_ret = self.dfu.file_to_shock({ 'file_path': htmlDir, 'make_handle': 0, 'pack': 'zip' }) except Exception: raise ValueError('Error uploading HTML file: ' + str(htmlDir) + ' to shock') reportname = 'generate_metadata_report_' + str(uuid.uuid4()) reportobj = { 'message': '', 'direct_html': None, 'direct_html_link_index': 0, 'file_links': [], 'html_links': [], 'html_window_height': 500, 'workspace_name': params['workspace_name'], 'report_object_name': reportname } # attach to report obj reportobj['direct_html'] = '' reportobj['direct_html_link_index'] = 0 reportobj['html_links'] = [{ 'shock_id': html_upload_ret['shock_id'], 'name': 'index.html', 'label': 'index.html' }] report = KBaseReport(self.callback_url, token=ctx['token']) report_info = report.create_extended_report(reportobj) output = { 'report_name': report_info['name'], 'report_ref': report_info['ref'] } print(output) #END run_generate_metadata_report # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError( 'Method run_generate_metadata_report return value ' + 'output is not type dict as required.') # return the results return [output]
class dN_dS_ratio: ''' Module Name: dN_dS_ratio Module Description: A KBase module: dN_dS_ratio ''' ######## WARNING FOR GEVENT USERS ####### noqa # Since asynchronous IO can lead to methods - even the same method - # interrupting each other, you must be *very* careful when using global # state. A method could easily clobber the state set by another while # the latter method is running. ######################################### noqa VERSION = "0.0.1" GIT_URL = "" GIT_COMMIT_HASH = "" #BEGIN_CLASS_HEADER #END_CLASS_HEADER # config contains contents of config file in a hash or None if it couldn't # be found def __init__(self, config): #BEGIN_CONSTRUCTOR self.callback_url = os.environ['SDK_CALLBACK_URL'] self.shared_folder = config['scratch'] self.du = DownloadUtils(self.callback_url) self.pu = DnDs_Utils() self.dpu = Data_Process_Utils() self.hu = htmlreportutils() self.config = config logging.basicConfig(format='%(created)s %(levelname)s: %(message)s', level=logging.INFO) #END_CONSTRUCTOR pass def run_dN_dS_ratio(self, ctx, params): ''' This example function accepts any number of parameters and returns results in a KBaseReport :param params: instance of mapping from String to unspecified object :returns: instance of type "ReportResults" -> structure: parameter "report_name" of String, parameter "report_ref" of String ''' # ctx is the context object # return variables are: output #BEGIN run_dN_dS_ratio print(params) self.dpu.validate_params(params) workspace = params['workspace_name'] output_dir = os.path.join(self.shared_folder, str(uuid.uuid4())) os.mkdir(output_dir) self.ws_url = self.config['workspace-url'] self.ws = Workspace(url=self.ws_url, token=ctx['token']) variation_ref = params['variation_ref'] variation = self.du.get_variation(variation_ref) #self.du.tabix_index(variation) variation_obj = self.ws.get_objects2( {'objects': [{ 'ref': variation_ref }]})['data'][0] data = self.ws.get_objects2({ 'objects': [{ "ref": variation_ref, 'included': ['/sample_set_ref'] }] })['data'][0]['data'] sample_set_ref = data['sample_set_ref'] assembly_ref = variation_obj['data']['assembly_ref'] assembly_path = self.du.get_assembly(assembly_ref, output_dir) gff_ref = params['genome_ref'] gff_path = self.du.get_gff(gff_ref) gene_id = params['gene_id'] gff_subsample_path = os.path.join(output_dir, "sub_sample.gff") self.dpu.filter_gff(gene_id, gff_path, gff_subsample_path) with open(gff_subsample_path, 'r') as f: line = f.readline() rec = line.split("\t") chrom = rec[0] start = rec[3] end = rec[4] sub_sample_vcf = os.path.join(output_dir, "sub_sample.vcf") self.dpu.index_vcf_file(variation) self.dpu.tabix_query(variation, chrom, start, end, sub_sample_vcf) assembly_path = output_dir + '/ref_genome.fa' variation = output_dir + '/sub_sample.vcf' gff_path = output_dir + '/sub_sample.gff' sequence = self.pu.read_refseq(assembly_path) print(sequence) var_list = self.pu.read_vcf(variation, sequence) print(var_list) var_file = os.path.join(output_dir, "variant_info.tsv") with open(var_file, 'w') as variant_tmp_file: var_temp = csv.writer(variant_tmp_file, delimiter='\t') var_temp.writerow([ "#chr", "ref", "alt", "pos", "codon number", "pos in codon", "codon start", "codon", "mutation type", "coverage" ]) for var_gene_list in var_list: var_temp.writerow(var_gene_list) gff_data = self.pu.read_gff_file(gff_path) codon_list = self.pu.get_triplets(sequence, gff_data) codon_result_file = os.path.join(output_dir, "codon_results_temp.tsv") corrected_codon_result_file = os.path.join( output_dir, 
"corrected_variant_info.tsv") with open(codon_result_file, 'w') as cdr_tmp_file: cdr_temp = csv.writer(cdr_tmp_file, delimiter='\t') cdr_temp.writerow([ "#chr", "gene", "codon", "codon start", "codon end", "codon positions", "codon number", "N", "S" ]) for gene_codon_list in codon_list: for codon in gene_codon_list: cdr_temp.writerow(codon) merged_list = self.pu.merge_files(corrected_codon_result_file, codon_result_file, var_file) all_possible_codon = self.pu.get_all_possible_codon( merged_list) # generating all possible codon self.pu.generate_statistics(corrected_codon_result_file, codon_result_file, all_possible_codon, output_dir) ############# html reporting ############################################################3 workspace = params['workspace_name'] output = self.hu.create_html_report(self.callback_url, output_dir, workspace) #END run_dN_dS_ratio # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError('Method run_dN_dS_ratio return value ' + 'output is not type dict as required.') # return the results return [output] def status(self, ctx): #BEGIN_STATUS returnVal = { 'state': "OK", 'message': "", 'version': self.VERSION, 'git_url': self.GIT_URL, 'git_commit_hash': self.GIT_COMMIT_HASH } #END_STATUS return [returnVal]
class BwaIndexBuilder:
    def __init__(self, scratch_dir, ws_url, callback_url, service_wizard_url, provenance):
        self.scratch_dir = scratch_dir
        self.ws_url = ws_url
        self.ws = Workspace(self.ws_url)
        self.callback_url = callback_url
        self.service_wizard_url = service_wizard_url
        self.bwa = BwaRunner(self.scratch_dir)
        self.provenance = provenance

    def get_index(self, params):
        ''' The key function of this module - get a bwa index for the specified input '''

        # validate the parameters and fetch assembly_info
        validated_params = self._validate_params(params)
        assembly_info = self._get_assembly_info(validated_params['ref'])

        # check the cache (keyed off of assembly_info)
        index_info = self._get_cached_index(assembly_info, validated_params)
        if index_info:
            index_info['from_cache'] = 1
            index_info['pushed_to_cache'] = 0
        else:
            # on a cache miss, build the index
            index_info = self._build_index(assembly_info, validated_params)
            index_info['from_cache'] = 0
            # pushed_to_cache will be set in return from _build_index

        index_info['assembly_ref'] = assembly_info['ref']
        index_info['genome_ref'] = assembly_info['genome_ref']
        return index_info

    def _validate_params(self, params):
        ''' validate parameters; can do some processing here to produce validated params '''
        validated_params = {'ref': None}
        if 'ref' in params and params['ref']:
            validated_params['ref'] = params['ref']
        else:
            raise ValueError('"ref" field indicating either an assembly or genome is required.')

        if 'output_dir' in params:
            validated_params['output_dir'] = params['output_dir']
        else:
            validated_params['output_dir'] = os.path.join(
                self.scratch_dir, 'bwa_index_' + str(int(time.time() * 100)))

        if os.path.exists(validated_params['output_dir']):
            raise ValueError('Output directory name specified (' + validated_params['output_dir'] +
                             ') already exists. Will not overwrite, so aborting.')

        if 'ws_for_cache' in params and params['ws_for_cache']:
            validated_params['ws_for_cache'] = params['ws_for_cache']
        else:
            print('WARNING: any bwa index created will not be cached because the '
                  '"ws_for_cache" field was not set')
            validated_params['ws_for_cache'] = None

        return validated_params

    def _get_assembly_info(self, ref):
        ''' given a ref to an assembly or genome, figure out the assembly and return its info '''
        info = self.ws.get_object_info3({'objects': [{'ref': ref}]})['infos'][0]
        obj_type = info[2]
        if obj_type.startswith('KBaseGenomeAnnotations.Assembly') or \
                obj_type.startswith('KBaseGenomes.ContigSet'):
            return {'info': info, 'ref': ref, 'genome_ref': None}

        if obj_type.startswith('KBaseGenomes.Genome'):
            # we need to get the assembly for this genome
            ga = GenomeAnnotationAPI(self.service_wizard_url)
            assembly_ref = ga.get_assembly({'ref': ref})
            # using the path ensures we can access the assembly even if we don't have direct access
            ref_path = ref + ';' + assembly_ref
            info = self.ws.get_object_info3({'objects': [{'ref': ref_path}]})['infos'][0]
            return {'info': info, 'ref': ref_path, 'genome_ref': ref}

        raise ValueError('Input object was not of type: Assembly, ContigSet or Genome. '
                         'Cannot get bwa Index.')

    def _get_cached_index(self, assembly_info, validated_params):
        try:
            # note: list_referencing_objects does not yet support reference paths, so we need
            # to call with the direct reference. So we won't get a cache hit if you don't have
            # direct access to the assembly object right now (although you can still always
            # build the assembly object). Once this call supports paths, this should be changed
            # to set ref = assembly_info['ref']
            info = assembly_info['info']
            ref = str(info[6]) + '/' + str(info[0]) + '/' + str(info[4])
            objs = self.ws.list_referencing_objects([{'ref': ref}])[0]

            # iterate through each of the objects that reference the assembly
            bwa_indexes = []
            for o in objs:
                if o[2].startswith('KBaseRNASeq.Bowtie2IndexV2'):
                    bwa_indexes.append(o)

            # Nothing refs this assembly, so cache miss
            if len(bwa_indexes) == 0:
                return False

            # if there is more than one hit, get the most recent one
            # (obj_info[3] is the save_date timestamp (eg 2017-05-30T22:56:49+0000),
            # so we can sort on that)
            bwa_indexes.sort(key=lambda x: x[3])
            bwa_index_info = bwa_indexes[-1]
            index_ref = str(bwa_index_info[6]) + '/' + str(bwa_index_info[0]) + \
                '/' + str(bwa_index_info[4])

            # get the object data
            index_obj_data = self.ws.get_objects2(
                {'objects': [{'ref': index_ref}]})['data'][0]['data']

            # download the handle object
            os.makedirs(validated_params['output_dir'])
            dfu = DataFileUtil(self.callback_url)
            dfu.shock_to_file({'file_path': os.path.join(validated_params['output_dir'],
                                                         'bt2_index.tar.gz'),
                               'handle_id': index_obj_data['handle']['hid'],
                               'unpack': 'unpack'})
            print('Cache hit: ')
            pprint(index_obj_data)
            return {'output_dir': validated_params['output_dir'],
                    'index_files_basename': index_obj_data['index_files_basename']}
        except Exception:
            # if we fail in looking up the cached object, don't worry
            print('WARNING: exception encountered when trying to lookup in cache:')
            print(traceback.format_exc())
            print('END WARNING: exception encountered when trying to lookup in cache.')
        return None

    def _put_cached_index(self, assembly_info, index_files_basename, output_dir, ws_for_cache):
        if not ws_for_cache:
            print('WARNING: bwa index cannot be cached because "ws_for_cache" field not set')
            return False

        try:
            dfu = DataFileUtil(self.callback_url)
            result = dfu.file_to_shock({'file_path': output_dir,
                                        'make_handle': 1,
                                        'pack': 'targz'})

            bwa_index = {'handle': result['handle'], 'size': result['size'],
                         'assembly_ref': assembly_info['ref'],
                         'index_files_basename': index_files_basename}

            ws = Workspace(self.ws_url)
            save_params = {'objects': [{'hidden': 1,
                                        'provenance': self.provenance,
                                        'name': os.path.basename(output_dir),
                                        'data': bwa_index,
                                        'type': 'KBaseRNASeq.Bowtie2IndexV2'
                                        }]
                           }
            if ws_for_cache.strip().isdigit():
                save_params['id'] = int(ws_for_cache)
            else:
                save_params['workspace'] = ws_for_cache.strip()
            save_result = ws.save_objects(save_params)
            print('Bowtie2IndexV2 cached to: ')
            pprint(save_result[0])
            return True
        except Exception:
            # if we fail in saving the cached object, don't worry
            print('WARNING: exception encountered when trying to cache the index files:')
            print(traceback.format_exc())
            print('END WARNING: exception encountered when trying to cache the index files')
            return False

    def _build_index(self, assembly_info, validated_params):
        # get the assembly as a fasta file using AssemblyUtil
        au = AssemblyUtil(self.callback_url)
        fasta_info = au.get_assembly_as_fasta({'ref': assembly_info['ref']})

        # make the target destination folder (check again it wasn't created yet)
        if os.path.exists(validated_params['output_dir']):
            raise ValueError('Output directory name specified (' + validated_params['output_dir'] +
                             ') already exists. Will not overwrite, so aborting.')
        os.makedirs(validated_params['output_dir'])

        # configure the command line args and run it
        cli_params = self._build_cli_params(fasta_info['path'], fasta_info['assembly_name'],
                                            validated_params)
        self.bwa.run('index', cli_params)
        for file in glob.glob(r'/kb/module/work/tmp/' + fasta_info['assembly_name'] + '.*'):
            print(file)
            shutil.copy(file, validated_params['output_dir'])

        index_info = {'output_dir': validated_params['output_dir'],
                      'index_files_basename': fasta_info['assembly_name']}

        # cache the result, mark if it worked or not
        cache_success = self._put_cached_index(assembly_info,
                                               fasta_info['assembly_name'],
                                               validated_params['output_dir'],
                                               validated_params['ws_for_cache'])
        if cache_success:
            index_info['pushed_to_cache'] = 1
        else:
            index_info['pushed_to_cache'] = 0

        return index_info

    def _build_cli_params(self, fasta_file_path, index_files_basename, validated_params):
        cli_params = []
        # positional arg: the fasta path; '-p' sets the base name used for the index files
        cli_params.append(fasta_file_path)
        cli_params.append("-p")
        cli_params.append(index_files_basename)
        return cli_params
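# Usage sketch for BwaIndexBuilder (illustrative only; URLs, refs, and the
# scratch path are placeholders for a real SDK environment).
# builder = BwaIndexBuilder(
#     scratch_dir='/kb/module/work/tmp',
#     ws_url='https://kbase.us/services/ws',
#     callback_url=os.environ.get('SDK_CALLBACK_URL', ''),
#     service_wizard_url='https://kbase.us/services/service_wizard',
#     provenance=[],
# )
# index_info = builder.get_index({'ref': '123/4/5', 'ws_for_cache': '123'})
# print(index_info['from_cache'], index_info['output_dir'])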
class VirSorterUtils:
    def __init__(self, config):
        self.scratch = os.path.abspath(config['scratch'])
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.mgu = MetagenomeUtils(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.ws = Workspace(config['workspace-url'], token=config['token'])

    def VirSorter_help(self):
        command = 'wrapper_phage_contigs_sorter_iPlant.pl --help'
        self._run_command(command)

    def get_fasta(self, ref):
        # check type of object, i.e. KBaseGenomeAnnotations.Assembly-3.0
        obj_type = self.ws.get_object_info3({'objects': [{
            'ref': ref
        }]})['infos'][0][2]
        if 'assembly' in obj_type.lower():
            assembly_ref = ref
        elif 'kbasegenomes' in obj_type.lower():
            data = self.ws.get_objects2({
                'objects': [{
                    'ref': ref,
                    'included': ['assembly_ref'],
                    'strict_maps': 1
                }]
            })['data'][0]['data']
            assembly_ref = data['assembly_ref']
        else:
            raise ValueError(
                f"Input reference {ref} is of type {obj_type}. Type KBaseGenomes.Genome or "
                f"KBaseGenomeAnnotations.Assembly required.")
        return self.au.get_assembly_as_fasta({'ref': assembly_ref})['path']

    def run_VirSorter(self, params):
        params['SDK_CALLBACK_URL'] = self.callback_url
        params['KB_AUTH_TOKEN'] = os.environ['KB_AUTH_TOKEN']

        # Get contigs from 'assembly'
        genome_fp = self.get_fasta(params['genomes'])

        command = 'wrapper_phage_contigs_sorter_iPlant.pl --data-dir /data/virsorter-data'

        # Add in first args
        command += f' -f {genome_fp} --db {params["database"]}'

        # Check if additional genomes were submitted
        if params.get('add_genomes'):
            add_genomes_fp = self.get_fasta(params['add_genomes'])
            print(f'Added genomes DETECTED: {add_genomes_fp}')
            command += f' --cp {add_genomes_fp}'

        bool_args = ['virome', 'diamond', 'keep_db', 'no_c']  # keep_db = keep-db
        for bool_arg in bool_args:
            # checkbox values arrive from the narrative UI as 0/1 ints;
            # a value of 1 means the flag gets passed through to the CLI
            if params[bool_arg] == 1:
                if bool_arg == 'keep_db':
                    bool_arg = 'keep-db'
                command += f' --{bool_arg}'

        self._run_command(command)

        # Basically, do everything that's after the tool runs
        report = self._generate_report(params)
        return report

    def _run_command(self, command):
        """
        :param command: shell command to execute
        :return:
        """
        log('Start executing command:\n{}'.format(command))
        pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        output, err = pipe.communicate()
        exitCode = pipe.returncode

        if exitCode == 0:
            log('Executed command:\n{}\n'.format(command) +
                'Exit Code: {}\nOutput:\n{}'.format(exitCode, output))
        else:
            error_msg = 'Error running command:\n{}\n'.format(command)
            error_msg += 'Exit Code: {}\nOutput:\n{}\nError: {}'.format(
                exitCode, output, err)
            raise RuntimeError(error_msg)

    def _parse_summary(self, virsorter_global_fp, affi_contigs_shock_id):
        columns = [
            'Contig_id',
            'Nb genes contigs',
            'Fragment',
            'Nb genes',
            'Category',
            'Nb phage hallmark genes',
            'Phage gene enrichment sig',
            'Non-Caudovirales phage gene enrichment sig',
            'Pfam depletion sig',
            'Uncharacterized enrichment sig',
            'Strand switch depletion sig',
            'Short genes enrichment sig',
        ]
        try:
            with open(virsorter_global_fp, 'r') as vir_fh:
                data = {}
                category = ''
                for line in vir_fh:
                    if line.startswith('## Contig_id'):
                        continue
                    elif line.startswith('## '):
                        # If 'header' lines are consumed by the 1st if, then
                        # remaining '## ' lines are category headers
                        category = line.split('## ')[-1].split(' -')[0]
                    else:
                        values = line.strip().split(',')
                        data[values[0]] = dict(zip(columns[1:], values[1:]))
        except Exception:
            vir_path = os.path.join(os.getcwd(), 'virsorter-out')
            files = os.listdir(vir_path)
            raise RuntimeError(
                f"{virsorter_global_fp} is not a file. existing files {files}."
) df = pd.DataFrame().from_dict(data, orient='index') df.index.name = columns[0] df.reset_index(inplace=True) html = df.to_html(index=False, classes='my_class table-striped" id = "my_id') # Need to file write below direct_html = html_template.substitute( html_table=html, affi_contigs_shock_id=affi_contigs_shock_id) # Find header so it can be copied to footer, as dataframe.to_html doesn't include footer start_header = Literal("<thead>") end_header = Literal("</thead>") text = start_header + SkipTo(end_header) new_text = '' for data, start_pos, end_pos in text.scanString(direct_html): new_text = ''.join(data).replace( ' style="text-align: right;"', '').replace( 'thead>', 'tfoot>\n ') + '\n</tfoot>' # Get start and end positions to insert new text end_tbody = Literal("</tbody>") end_table = Literal("</table>") insertion_pos = end_tbody + SkipTo(end_table) final_html = '' for data, start_pos, end_pos in insertion_pos.scanString(direct_html): final_html = direct_html[:start_pos + 8] + '\n' + new_text + direct_html[ start_pos + 8:] return final_html def get_assembly_contig_ids(self, assembly_ref): """get contig ids from assembly_ref""" contigs = self.ws.get_objects2( {'objects': [{ 'ref': assembly_ref, 'included': ['contigs'] }]})['data'][0]['data']['contigs'] return contigs.keys() def _generate_report(self, params): """ :param params: :return: """ # Get URL self.dfu = dfu(params['SDK_CALLBACK_URL']) # Output directory should be $PWD/virsorter-out - ASSUMES that's the output location virsorter_outdir = os.path.join(os.getcwd(), 'virsorter-out') print( f'VIRSorter output directory contents: {os.listdir(virsorter_outdir)}' ) # Replacing individual download files with BinnedContigs # kb_deseq adds output files, then builds report files and sends all of them to the workspace output_files = [] # Appended list of dicts containing attributes # Collect all the files needed to report to end-user # Get all predicted viral sequences pred_fnas = glob.glob( os.path.join(virsorter_outdir, 'Predicted_viral_sequences/VIRSorter_*.fasta')) pred_gbs = glob.glob( os.path.join(virsorter_outdir, 'Predicted_viral_sequences/VIRSorter_*.gb')) # Summary 'table' glob_signal = os.path.join(virsorter_outdir, 'VIRSorter_global-phage-signal.csv') print('Identified the following predicted viral sequences:\n{}'.format( '\n\t'.join(pred_fnas))) if len(pred_fnas) == 0: print( f"Unable to find predicted viral sequences, here are the directory's content:\n" f"{os.listdir(os.path.join(virsorter_outdir, 'Predicted_viral_sequences'))}" ) if os.path.exists(glob_signal): print(f'Identified the global phage signal: {glob_signal}') lines = -1 # Don't count header with open(glob_signal) as fh: for ln in fh: lines += 1 if lines == 0: print('But it is EMPTY!') else: print( 'Unable to find the global phage signal file. Was there an error during the run?' ) # Append error and out files from VIRSorter err_fp = os.path.join(virsorter_outdir, 'logs/err') # if os.path.exists(err_fp): # output_files.append({ # 'path': os.path.join(virsorter_outdir, 'logs/err'), # 'name': 'VIRSorter_err', # 'label': 'VIRSorter_err', # 'description': 'VIRSorter error log file, generated from the tool itself.' # }) out_fp = os.path.join(virsorter_outdir, 'logs/out') # if os.path.exists(out_fp): # output_files.append({ # 'path': os.path.join(virsorter_outdir, 'logs/out'), # 'name': 'VIRSorter_out', # 'label': 'VIRSorter_out', # 'description': 'VIRSorter output log file, generated from the tool itself.' 
# }) if not (os.path.exists(err_fp) or os.path.exists(out_fp)): print( 'Unable to find err and/or out files in LOG directory, contents:' ) print(os.listdir(os.path.join(virsorter_outdir, 'logs'))) # Make output directory output_dir = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(output_dir) # Deal with nucleotide and protein fasta pred_fna_tgz_fp = os.path.join(output_dir, 'VIRSorter_predicted_viral_fna.tar.gz') with tarfile.open( pred_fna_tgz_fp, 'w:gz') as pred_fna_tgz_fh: # Compress to minimize disk usage for pred_fna in pred_fnas: pred_fna_tgz_fh.add(pred_fna, arcname=os.path.basename(pred_fna)) output_files.append({ 'path': pred_fna_tgz_fp, 'name': os.path.basename(pred_fna_tgz_fp), 'label': os.path.basename(pred_fna_tgz_fp), 'description': 'FASTA-formatted nucleotide sequences of VIRSorter predicted viruses' }) if os.path.exists(pred_fna_tgz_fp): print( f'Generated gzipped version of the predicted viral sequences in FASTA format: ' f'{pred_fna_tgz_fp}') pred_gb_tgz_fp = os.path.join(output_dir, 'VIRSorter_predicted_viral_gb.tar.gz') with tarfile.open(pred_gb_tgz_fp, 'w:gz') as pred_gb_tgz_fh: for pred_gb in pred_gbs: pred_gb_tgz_fh.add(pred_gb, arcname=os.path.basename(pred_gb)) output_files.append({ 'path': pred_gb_tgz_fp, 'name': os.path.basename(pred_gb_tgz_fp), 'label': os.path.basename(pred_gb_tgz_fp), 'description': 'Genbank-formatted sequences of VIRSorter predicted viruses' }) if os.path.exists(pred_gb_tgz_fp): print( f'Generated gzipped version of the predicted viral sequences in Genbank format: ' f'{pred_gb_tgz_fp}') # To create BinnedContig, need to create another directory with each of the "bins" as separate files? binned_contig_output_dir = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(binned_contig_output_dir) # Before creating final HTML output, need to create BinnedContig object so other tools/users can take advantage # of its features, but also to feed more easily into other tools (e.g. 
vConTACT)
        created_objects = []  # Will store the objects that go to the workspace

        # load contig ids from the assembly input
        # assembly_contig_ids = self.get_assembly_contig_ids(self.assembly_ref)
        assembly_contig_ids = self.get_assembly_contig_ids(
            params['genomes'])  # Will fail for Genome

        summary_fp = os.path.join(
            binned_contig_output_dir,
            'VIRSorter.summary')  # Anything that ends in .summary
        with open(summary_fp, 'w') as summary_fh:
            summary_writer = csv.writer(summary_fh,
                                        delimiter='\t',
                                        quoting=csv.QUOTE_MINIMAL)
            summary_writer.writerow(
                ['Bin name', 'Completeness', 'Genome size', 'GC content'])

            for category_fp in pred_fnas:
                # _get_bin_ids from MetaGenomeUtils requires files to follow
                # the header.0xx.fasta convention
                category = os.path.basename(category_fp).split(
                    'cat-')[-1].split('.')[0]
                dest_fn = 'VirSorter.{}.fasta'.format(category.zfill(3))
                dest_fp = os.path.join(output_dir, dest_fn)
                binned_contig_fp = os.path.join(binned_contig_output_dir,
                                                dest_fn)

                genome_size = 0
                gc_content = []  # Need stats for summary file

                # Also need to adjust sequence name so binnedContig object can
                # retrieve sequences
                adjusted_sequences = []
                # 'rU' mode was removed in Python 3.11; plain 'r' already uses
                # universal newlines
                with open(category_fp, 'r') as category_fh:
                    for record in SeqIO.parse(category_fh, 'fasta'):
                        seq = record.seq
                        gc_content.append(SeqUtils.GC(seq))
                        genome_size += len(seq)
                        # This is very dirty, but need to change name to match original contigs
                        record.id = record.id.replace('VIRSorter_', '').replace(
                            '-circular', '').split('-cat_')[0]
                        if 'gene' in record.id:  # Prophage
                            record.id = record.id.split('_gene')[0]
                            record.id = record.id.rsplit('_', 1)[0]
                        # here we make sure that the id's line up with contig ids
                        # in the input assembly object
                        if record.id not in assembly_contig_ids:
                            for assembly_contig_id in assembly_contig_ids:
                                # first check if record.id is a substring of the current
                                # contig id, then check if the current contig id is a
                                # substring of record.id
                                # NOTE: this is not a perfect way of checking and will likely
                                # fail in some circumstances.
                                # A more complete check would be to make sure there is a 1:1
                                # mapping of contig id's in the assembly object as compared to
                                # the binned contig object (the fasta files defined here).
                                if (record.id in assembly_contig_id) or (
                                        assembly_contig_id in record.id):
                                    record.id = assembly_contig_id
                                    break
                        record.description = ''
                        record.name = ''
                        adjusted_sequences.append(record)

                if genome_size != 0:  # skip empty files
                    summary_writer.writerow([
                        dest_fn, '100%', genome_size,
                        (sum(gc_content) / len(gc_content))
                    ])

                print('Copying {} to results directory'.format(
                    os.path.basename(category_fp)))
                # Yes, need both. One is to get file_links in report.
Second is for binnedContigs object shutil.copyfile(category_fp, dest_fp) # Write renamed sequences with open(binned_contig_fp, 'w') as binned_contig_fh: SeqIO.write(adjusted_sequences, binned_contig_fh, 'fasta') result = self.au.save_assembly_from_fasta({ 'file': { 'path': dest_fp }, 'workspace_name': params['workspace_name'], 'assembly_name': 'VirSorter-Category-{}'.format(category) }) created_objects.append({ "ref": result, "description": "KBase Assembly object from VIRSorter" }) # Create BinnedContigs object, but 1st, a little metadata generate_binned_contig_param = { 'file_directory': binned_contig_output_dir, 'assembly_ref': params['genomes'], # params.get('genomes'), self.assembly_ref 'binned_contig_name': params['binned_contig_name'], 'workspace_name': params['workspace_name'] } binned_contig_object_ref = self.mgu.file_to_binned_contigs( generate_binned_contig_param).get('binned_contig_obj_ref') # Add binned contigs reference here, as it was already created above created_objects.append({ "ref": binned_contig_object_ref, "description": "BinnedContigs from VIRSorter" }) # Save VIRSorter_affi-contigs.tab for DRAM-v affi_contigs_fp = os.path.join(virsorter_outdir, 'Metric_files', 'VIRSorter_affi-contigs.tab') affi_contigs_shock_id = self.dfu.file_to_shock( {'file_path': affi_contigs_fp})['shock_id'] # Use global signal (i.e. summary) file and create HTML-formatted version raw_html = self._parse_summary(glob_signal, affi_contigs_shock_id) html_fp = os.path.join(output_dir, 'index.html') with open(html_fp, 'w') as html_fh: html_fh.write(raw_html) report_shock_id = self.dfu.file_to_shock({ 'file_path': output_dir, 'pack': 'zip' })['shock_id'] html_report = [{ 'shock_id': report_shock_id, 'name': os.path.basename(html_fp), 'label': os.path.basename(html_fp), 'description': 'HTML summary report for VIRSorter-predicted viral genomes.' }] report_params = { 'message': 'Here are the results from your VIRSorter run. Above, you\'ll find a report with ' 'all the identified (putative) viral genomes, and below, links to the report as ' 'well as files generated.', 'workspace_name': params['workspace_name'], 'html_links': html_report, 'direct_html_link_index': 0, 'report_object_name': 'VIRSorter_report_{}'.format(str(uuid.uuid4())), 'file_links': output_files, 'objects_created': created_objects, } kbase_report_client = KBaseReport(params['SDK_CALLBACK_URL'], token=params['KB_AUTH_TOKEN']) output = kbase_report_client.create_extended_report(report_params) report_output = { 'report_name': output['name'], 'report_ref': output['ref'], 'result_directory': binned_contig_output_dir, 'binned_contig_obj_ref': binned_contig_object_ref } return report_output def _mkdir_p(self, path): """ :param path: :return: """ if not path: return try: os.makedirs(path) except OSError as exc: if exc.errno == errno.EEXIST and os.path.isdir(path): pass else: raise
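# Params sketch for run_VirSorter (illustrative only). The keys mirror those
# read above; the refs and workspace name are placeholders, and `config` is
# the usual module config dict.
example_virsorter_params = {
    'genomes': '123/4/5',          # assembly (or genome) ref
    'database': 2,                 # value passed through to --db
    'add_genomes': None,           # optional ref passed via --cp
    'virome': 0,
    'diamond': 0,
    'keep_db': 0,
    'no_c': 0,
    'binned_contig_name': 'VIRSorter_bins',
    'workspace_name': 'my_workspace',
}
# report = VirSorterUtils(config).run_VirSorter(example_virsorter_params)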
class VariationMerge:
    '''
    Module Name:
    VariationMerge

    Module Description:
    A KBase module: VariationMerge
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = "https://github.com/kbasecollaborations/VariationMerge.git"
    GIT_COMMIT_HASH = "918495236305bcae5e2ded0be6ed18d71defd678"

    #BEGIN_CLASS_HEADER
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.shared_folder = config['scratch']
        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)
        self.ws_url = config['workspace-url']
        self.vu = VariationUtil(self.callback_url)
        self.mu = MergeVcfUtils()
        #END_CONSTRUCTOR
        pass

    def run_VariationMerge(self, ctx, params):
        """
        :param params: instance of type "inparams" (This example function
           accepts any number of parameters and returns results in a
           KBaseReport) -> structure: parameter "obj_name" of String,
           parameter "workspace_name" of String, parameter "vcflist" of
           list of String
        :returns: instance of type "OutResults" -> structure: parameter
           "output_obj_ref" of String, parameter "report_name" of String,
           parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_VariationMerge
        self.ws = Workspace(url=self.ws_url, token=ctx['token'])
        print(params)
        vcf_flist = []
        assembly_ref_set = set()
        sampleset_ref_set = set()
        genome_set_ref_set = set()

        for i in range(len(params['vcflist'])):
            variation_ref = params['vcflist'][i]
            variation_obj = self.ws.get_objects2(
                {'objects': [{
                    'ref': variation_ref
                }]})['data'][0]
            # a Variation object carries either an assembly_ref or a genome_ref
            if 'assembly_ref' in variation_obj['data']:
                assembly_ref = variation_obj['data']['assembly_ref']
                assembly_ref_set.add(assembly_ref)
            elif 'genome_ref' in variation_obj['data']:
                genome_ref = variation_obj['data']['genome_ref']
                genome_set_ref_set.add(genome_ref)

            print(params['vcflist'][i])
            vcf_filename = "/kb/module/work/tmp/variation" + str(i) + ".vcf.gz"
            vcf_flist.append(vcf_filename)

            inparams = {}
            inparams['variation_ref'] = variation_ref
            inparams['filename'] = vcf_filename
            self.vu.get_variation_as_vcf(inparams)
            os.rename("/kb/module/work/tmp/variation.vcf.gz", vcf_filename)
            self.mu.index_vcf(vcf_filename)

            var_object_ref = params['vcflist'][i]
            data = self.ws.get_objects2({
                'objects': [{
                    "ref": var_object_ref,
                    'included': ['/sample_set_ref']
                }]
            })['data'][0]['data']
            sampleset_ref_set.add(data['sample_set_ref'])

        # Raising exceptions for mixed inputs
        if (len(genome_set_ref_set) == 0 and len(assembly_ref_set) != 1):
            raise Exception(
                "variation objects are from different assembly refs")
        elif (len(sampleset_ref_set) != 1):
            raise Exception(
                "variation objects are from different sample set refs")
        elif (len(assembly_ref_set) == 0 and len(genome_set_ref_set) != 1):
            raise Exception(
                "variation objects are from different genome set refs")

        merged_file = os.path.join(self.shared_folder,
                                   "merged_gatk_variation_jmc2_test.vcf")
        self.mu.merge_vcf(vcf_flist, merged_file)

        save_variation_params = {
            'workspace_name': params['workspace_name'],
            # the checks above guarantee exactly one shared assembly or genome
            'genome_or_assembly_ref': (assembly_ref_set.pop() if assembly_ref_set
                                       else genome_set_ref_set.pop()),
            'sample_set_ref': sampleset_ref_set.pop(),
            'sample_attribute_name':
'sample_attr', 'vcf_staging_file_path': merged_file, 'variation_object_name': params['variation_object_name'] } self.vu.save_variation_from_vcf(save_variation_params) report = KBaseReport(self.callback_url) report_info = report.create({ 'report': { 'objects_created': [], 'text_message': 'success' }, 'workspace_name': params['workspace_name'] }) output = { 'report_name': report_info['name'], 'report_ref': report_info['ref'], } #END run_VariationMerge # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError('Method run_VariationMerge return value ' + 'output is not type dict as required.') # return the results return [output] def status(self, ctx): #BEGIN_STATUS returnVal = { 'state': "OK", 'message': "", 'version': self.VERSION, 'git_url': self.GIT_URL, 'git_commit_hash': self.GIT_COMMIT_HASH } #END_STATUS return [returnVal]
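# Params sketch for run_VariationMerge (illustrative only; the refs are
# placeholders). All listed Variation objects must share one sample set and
# one assembly (or genome), or the validation above raises.
example_merge_params = {
    'workspace_name': 'my_workspace',
    'vcflist': ['123/4/1', '123/5/1'],        # Variation object refs
    'variation_object_name': 'merged_variation',
}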
def run_FamaGenomeProfiling(self, ctx, params): """ Run genome functional profiling module of Fama. :param params: instance of type "FamaGenomeProfilingParams" (Parameters for genome functional profiling. workspace_name - the name of the workspace for input/output genome_refs - references to a genome object ref_dataset - the name of Fama reference dataset output_result_name - the name of the output DomainAnnotation) -> structure: parameter "workspace_name" of String, parameter "genome_ref" of list of String, parameter "ref_dataset" of String, parameter "output_feature_set_name" of String, parameter "output_annotation_name" of String :returns: instance of type "ReportResults" (Output report parameters report_name - the name of the report object report_ref - the reference to the report object) -> structure: parameter "report_name" of String, parameter "report_ref" of String """ # ctx is the context object # return variables are: output #BEGIN run_FamaGenomeProfiling # Import protein sequences from input genome_ref ws_client = Workspace(self.ws_url) input_genome_refs = params['genome_ref'] fama_reference = params['ref_dataset'] input_proteins = {} name2ref = {} for input_genome_ref in input_genome_refs: ret = ws_client.get_objects2( {'objects': [{ 'ref': input_genome_ref }]})['data'][0] obj_data = ret['data'] obj_name = ret['info'][1] obj_type = ret['info'][2].split('.')[1].split('-')[0] if obj_type == 'GenomeSet': print('GenomeSet data', obj_data) genome_refs = [] if 'elements' in obj_data: genome_refs = [ item['ref'] for item in obj_data['elements'].values() ] elif 'items' in obj_data: genome_refs = [item['ref'] for item in obj_data['items']] for sub_obj_ref in genome_refs: ret = ws_client.get_objects2( {'objects': [{ 'ref': sub_obj_ref }]})['data'][0] genome_data = ret['data'] genome_name = ret['info'][1] if genome_name in name2ref: raise ServerError( 'All input genome names must be unique. 
Check ' + genome_name) name2ref[genome_name] = sub_obj_ref proteins = genome_proteins_to_fasta( genome_data, self.shared_folder) input_proteins[genome_name] = {} input_proteins[genome_name]['fwd'] = proteins elif obj_type == 'Genome': if obj_name in name2ref: raise ServerError('All input genome names must be unique') name2ref[obj_name] = input_genome_ref proteins = genome_proteins_to_fasta(obj_data, self.shared_folder) input_proteins[obj_name] = {} input_proteins[obj_name]['fwd'] = proteins else: raise ServerError('Incompatible object: ' + input_genome_ref + ' (' + obj_name + ')') self.log('Input sequence files:', str(input_proteins)) self.log('reference: ', fama_reference) # Run Fama fama_params = { 'input_proteins': input_proteins, 'work_dir': self.shared_folder, 'reference': fama_reference, 'ws_name': params['workspace_name'], 'ws_client': ws_client, 'featureset_name': params['output_feature_set_name'], 'annotation_prefix': params['output_annotation_name'], 'name2ref': name2ref } fama_output = protein_functional_profiling_pipeline(fama_params) objects_created = fama_output['objects_created'] dfu = DataFileUtil(self.callback_url) workspace_id = dfu.ws_name_to_id(params['workspace_name']) object_type = 'KBaseCollections.FeatureSet' save_object_params = { 'id': workspace_id, 'objects': [{ 'type': object_type, 'data': fama_output['feature_set_data'], 'name': params['output_feature_set_name'] }] } try: dfu_oi = dfu.save_objects(save_object_params)[0] except ServerError as dfue: # not really any way to test this block self.log('Logging exception saving feature set') self.log(str(dfue)) raise feature_set_obj_ref = "{}/{}/{}".format(dfu_oi[6], dfu_oi[0], dfu_oi[4]) objects_created.append({ 'ref': feature_set_obj_ref, 'description': 'Filtered genome features' }) self.log('FeatureSet saved to ' + feature_set_obj_ref) # Write HTML output to workspace message = 'Fama protein functional profiling finished successfully' try: dfu_output = dfu.file_to_shock( {'file_path': fama_output['html_report']}) except ServerError as dfue: # not really any way to test this block self.log('Logging exception loading results to shock') self.log(str(dfue)) raise self.log('HTML report saved: ' + str(dfu_output)) html_links = [{ 'shock_id': dfu_output['shock_id'], 'description': 'HTML report for Fama App', 'name': 'fama_report.html', 'label': 'Fama_report' }] for krona_file in fama_output['krona_charts']: try: dfu_output = dfu.file_to_shock({'file_path': krona_file}) html_links.append({ 'shock_id': dfu_output['shock_id'], 'description': 'Krona chart for function taxonomy profile', 'name': fama_output['krona_charts'][krona_file][0], 'label': fama_output['krona_charts'][krona_file][1] }) except ServerError as dfue: # not really any way to test this block self.log('Logging exception loading results to shock') self.log(str(dfue)) raise self.log('Krona chart saved: ' + str(dfu_output)) # Save report report_params = { 'message': message, 'objects_created': objects_created, 'direct_html_link_index': 0, 'html_links': html_links, 'file_links': fama_output['report_files'], 'report_object_name': 'fama_profiling_report_' + str(uuid.uuid4()), 'workspace_name': params['workspace_name'], 'html_window_height': 460 } try: self.log('Call KBaseReport at ' + str(self.callback_url)) report = KBaseReport(self.callback_url) self.log('Ready to save KBase report: ' + str(report_params)) report_info = report.create_extended_report(report_params) except ServerError as kre: # not really any way to test this block self.log('Logging exception saving 
report') self.log(str(kre)) raise report_info['report_params'] = report_params self.log('KBase report saved: ' + str(report_info)) output = { 'report_name': report_info['name'], 'report_ref': report_info['ref'] } #END run_FamaGenomeProfiling # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError('Method run_FamaGenomeProfiling return value ' + 'output is not type dict as required.') # return the results return [output]
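# Sketch: the Fama code above builds object references by indexing into the
# Workspace object_info tuple (wsid at slot 6, objid at slot 0, version at
# slot 4 -- the same 11-slot layout named elsewhere in this codebase). A tiny
# helper like this (hypothetical, not part of the module) makes that
# convention explicit and reusable.
def info_to_ref(obj_info):
    """Build a 'wsid/objid/version' reference from a Workspace object_info tuple.

    Layout: (objid, name, type, save_date, version, saved_by, wsid,
    workspace, chsum, size, meta).
    """
    return "{}/{}/{}".format(obj_info[6], obj_info[0], obj_info[4])

# e.g. feature_set_obj_ref = info_to_ref(dfu.save_objects(save_object_params)[0])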
class kb_ReadSim: ''' Module Name: kb_ReadSim Module Description: A KBase module: kb_ReadSim ''' ######## WARNING FOR GEVENT USERS ####### noqa # Since asynchronous IO can lead to methods - even the same method - # interrupting each other, you must be *very* careful when using global # state. A method could easily clobber the state set by another while # the latter method is running. ######################################### noqa VERSION = "0.0.1" GIT_URL = "https://github.com/kbasecollaborations/kb_ReadSim.git" GIT_COMMIT_HASH = "c9c0185e34d25be57cc6e1c901d8801fbc0f4784" #BEGIN_CLASS_HEADER #END_CLASS_HEADER # config contains contents of config file in a hash or None if it couldn't # be found def __init__(self, config): #BEGIN_CONSTRUCTOR self.callback_url = os.environ['SDK_CALLBACK_URL'] self.shared_folder = config['scratch'] self.du = DownloadUtils(self.callback_url) self.su = SimUtils() self.ru = ReadsUtils(self.callback_url) self.vu = VariationUtil(self.callback_url) self.eu = VcfEvalUtils() self.hu = htmlreportutils() self.ws_url = config['workspace-url'] self.wsc = Workspace(self.ws_url) logging.basicConfig(format='%(created)s %(levelname)s: %(message)s', level=logging.INFO) #END_CONSTRUCTOR pass def run_kb_ReadSim(self, ctx, params): """ This example function accepts any number of parameters and returns results in a KBaseReport :param params: instance of type "Inparams" -> structure: parameter "workspace_name" of String, parameter "input_sample_set" of String, parameter "strain_info" of String, parameter "assembly_or_genome_ref" of String, parameter "base_error_rate" of String, parameter "outer_distance" of String, parameter "standard_deviation" of String, parameter "num_read_pairs" of String, parameter "len_first_read" of String, parameter "len_second_read" of String, parameter "mutation_rate" of String, parameter "frac_indels" of String, parameter "variation_object_name" of String, parameter "output_read_object" of String :returns: instance of type "ReportResults" -> structure: parameter "report_name" of String, parameter "report_ref" of String """ # ctx is the context object # return variables are: output #BEGIN run_kb_ReadSim output_dir = self.shared_folder print(params) self.su.validate_simreads_params(params) genome_or_assembly_ref = params['assembly_or_genome_ref'] obj_type = self.wsc.get_object_info3( {'objects': [{ 'ref': genome_or_assembly_ref }]})['infos'][0][2] if ('KBaseGenomes.Genome' in obj_type): genome_ref = genome_or_assembly_ref subset = self.wsc.get_object_subset([{ 'included': ['/assembly_ref'], 'ref': genome_ref }]) assembly_ref = subset[0]['data']['assembly_ref'] elif ('KBaseGenomeAnnotations.Assembly' in obj_type): assembly_ref = genome_or_assembly_ref else: raise ValueError(obj_type + ' is not the right input for this method. 
' + 'Valid inputs include KBaseGenomes.Genome or ' +
                             'KBaseGenomeAnnotations.Assembly')
        self.du.download_genome(assembly_ref, output_dir)

        ref_genome = os.path.join(self.shared_folder, "ref_genome.fa")
        output_fwd_paired_file_path = os.path.join(self.shared_folder, "read1.fq")
        output_rev_paired_file_path = os.path.join(self.shared_folder, "read2.fq")

        self.eu.check_path_exists(ref_genome)
        self.su.simreads(ref_genome, output_fwd_paired_file_path,
                         output_rev_paired_file_path, params)
        self.eu.check_path_exists(output_fwd_paired_file_path)
        self.eu.check_path_exists(output_rev_paired_file_path)

        retVal = self.ru.upload_reads({
            'wsname': params['workspace_name'],
            'name': params['output_read_object'],
            'sequencing_tech': 'illumina',
            'fwd_file': output_fwd_paired_file_path,
            'rev_file': output_rev_paired_file_path
        })

        logfile = os.path.join(self.shared_folder, "variant.txt")
        self.eu.check_path_exists(logfile)
        vcf_file = self.su.format_vcf(logfile)
        self.eu.check_path_exists(vcf_file)

        save_variation_params = {
            'workspace_name': params['workspace_name'],
            'genome_or_assembly_ref': params['assembly_or_genome_ref'],
            'sample_set_ref': params['input_sample_set'],
            'sample_attribute_name': 'sample_attr',
            'vcf_staging_file_path': vcf_file,
            'variation_object_name': params['variation_object_name']
        }
        self.vu.save_variation_from_vcf(save_variation_params)

        report = KBaseReport(self.callback_url)
        report_info = report.create({
            'report': {
                'objects_created': [],
                'text_message': 'Success'
            },
            'workspace_name': params['workspace_name']
        })
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
        }
        #END run_kb_ReadSim

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_kb_ReadSim return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def run_eval_variantcalling(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of type "Evalparams" -> structure: parameter
           "workspace_name" of String, parameter "sim_varobject_name" of
           String, parameter "calling_varobject_name" of String, parameter
           "output_var_object" of String
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_eval_variantcalling
        print(params)
        self.eu.validate_eval_params(params)

        report_dir = os.path.join(self.shared_folder, str(uuid.uuid4()))
        os.mkdir(report_dir)
        self.ws = Workspace(url=self.ws_url, token=ctx['token'])

        var_object_ref1 = params['varobject_ref1']
        sampleset_ref1 = self.ws.get_objects2({
            'objects': [{
                "ref": var_object_ref1,
                'included': ['/sample_set_ref']
            }]
        })['data'][0]['data']['sample_set_ref']

        var_object_ref2 = params['varobject_ref2']
        sampleset_ref2 = self.ws.get_objects2({
            'objects': [{
                "ref": var_object_ref2,
                'included': ['/sample_set_ref']
            }]
        })['data'][0]['data']['sample_set_ref']

        if sampleset_ref1 != sampleset_ref2:
            raise Exception(
                "Variation objects are from different sample sets\n")

        assembly_ref_set = set()
        genomeset_ref_set = set()

        variation_obj1 = self.ws.get_objects2(
            {'objects': [{
                'ref': var_object_ref1
            }]})['data'][0]
        if 'assembly_ref' in variation_obj1['data']:
            assembly_ref1 = variation_obj1['data']['assembly_ref']
            assembly_ref_set.add(assembly_ref1)
        elif 'genome_ref' in variation_obj1['data']:
            genome_ref1 = variation_obj1['data']['genome_ref']
            genomeset_ref_set.add(genome_ref1)
variation_obj2 = self.ws.get_objects2( {'objects': [{ 'ref': var_object_ref2 }]})['data'][0] if 'assembly_ref' in variation_obj2['data']: assembly_ref2 = variation_obj2['data']['assembly_ref'] assembly_ref_set.add(assembly_ref2) elif 'genome_ref' in variation_obj2['data']: genome_ref2 = variation_obj2['data']['genome_ref'] genomeset_ref_set.add(genome_ref2) assembly_or_genome_ref = None if (not genomeset_ref_set and len(assembly_ref_set) != 1): raise Exception( "variation objects are from different assembly refs") elif (not assembly_ref_set and len(genomeset_ref_set) != 1): raise Exception("variation objects are from different genome refs") simvarfile = os.path.join(report_dir, "simvarinat.vcf.gz") simvarpath = self.du.download_variations(var_object_ref1, simvarfile) os.rename(simvarpath, simvarfile) self.eu.index_vcf(simvarfile) callingvarfile = os.path.join(report_dir, "callingvarinat.vcf.gz") callingvarpath = self.du.download_variations(var_object_ref2, callingvarfile) os.rename(callingvarpath, callingvarfile) self.eu.index_vcf(callingvarfile) eval_results = self.eu.variant_evalation(simvarfile, callingvarfile, report_dir) unique_vcf1 = eval_results['unique1'] self.eu.check_path_exists(unique_vcf1) unique_vcf2 = eval_results['unique2'] self.eu.check_path_exists(unique_vcf2) common_vcf = eval_results['common'] self.eu.check_path_exists(common_vcf) image_path = self.eu.plot_venn_diagram(report_dir, unique_vcf1, unique_vcf2, common_vcf) self.eu.check_path_exists(image_path) ''' if(len(assembly_ref_set) != 0): assembly_or_genome_ref = assembly_ref_set.pop() elif(len(genomeset_ref_set) != 0): assembly_or_genome_ref = genomeset_ref_set.pop() logging.info("Saving Unique1 vcf\n") save_unique_variation_params1 = {'workspace_name': params['workspace_name'], 'genome_or_assembly_ref': assembly_or_genome_ref, 'sample_set_ref': sampleset_ref1, 'sample_attribute_name': 'sample_unique_attr1', 'vcf_staging_file_path': unique_vcf1, 'variation_object_name': params['output_variant_object'] + "_sample1_unique" } self.vu.save_variation_from_vcf(save_unique_variation_params1) logging.info("Saving done\n") logging.info("Saving Unique2 vcf\n") save_unique_variation_params2 = {'workspace_name': params['workspace_name'], 'genome_or_assembly_ref': assembly_or_genome_ref, 'sample_set_ref': sampleset_ref1, 'sample_attribute_name': 'sample_unique_attr2', 'vcf_staging_file_path': unique_vcf2, 'variation_object_name': params['output_variant_object'] + "_sample2_unique" } self.vu.save_variation_from_vcf(save_unique_variation_params2) logging.info("Saving done\n") logging.info("Saving Common vcf\n") save_common_variation_params = {'workspace_name': params['workspace_name'], 'genome_or_assembly_ref': assembly_or_genome_ref, 'sample_set_ref': sampleset_ref1, 'sample_attribute_name': 'sample_common_attr', 'vcf_staging_file_path': common_vcf, 'variation_object_name': params['output_variant_object'] + "_sample1_sample2_common" } self.vu.save_variation_from_vcf(save_common_variation_params) logging.info("Saving done\n") ''' workspace = params['workspace_name'] output = self.hu.create_html_report(self.callback_url, report_dir, workspace) #END run_eval_variantcalling # At some point might do deeper type checking... 
        if not isinstance(output, dict):
            raise ValueError('Method run_eval_variantcalling return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
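# A minimal invocation sketch for run_kb_ReadSim, assuming the usual SDK test
# harness (`impl` is a kb_ReadSim instance, `ctx` carries a valid token).
# Every value below is a placeholder, not a default taken from the module;
# note the spec passes all numeric parameters as strings.
params = {
    'workspace_name': 'my_workspace',
    'input_sample_set': '123/4/5',           # SampleSet ref (placeholder)
    'assembly_or_genome_ref': '123/6/7',     # Genome or Assembly ref
    'base_error_rate': '0.02',
    'outer_distance': '500',
    'standard_deviation': '50',
    'num_read_pairs': '1000000',
    'len_first_read': '100',
    'len_second_read': '100',
    'mutation_rate': '0.001',
    'frac_indels': '0.15',
    'variation_object_name': 'sim_variation',
    'output_read_object': 'sim_reads',
}
output = impl.run_kb_ReadSim(ctx, params)[0]
print(output['report_name'], output['report_ref'])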
def build_report_view_data(host: str, ws_client: Workspace, result: list) -> dict: """ Returns a structure like this: { html: { height: max height string for iframes (default = 500px, unless present in report), set_height: boolean - if True, then apply height to the height style value as well. direct: string (optional) - direct html to plop in the page, iframe_style: string (optional) - styling for direct html iframe, links: [{ url: string, name: string, description: string, handle: ? label: ? }], paths: [ path1, path2, path3, ... ] for all urls in links (just a convenience), link_idx: index of paths to use (this is a little funky, might get cleared up in a later iteration.) (I suspect this'll be here 3 years later. Today's 2/13/2020. Let's see!) file_links: [{ 'URL': 'https://ci.kbase.us/services/shock-api/node/a2625b71-48d5-4ba6-8603-355485508da8', 'description': 'JGI Metagenome Assembly Report', 'handle': 'KBH_253154', 'label': 'assembly_report', 'name': 'assembly_report.zip' }] } objects: [{ 'upa': '...', 'name': 'foo', 'type': '...', 'description': '...' }] summary: '', summary_height: height string for summary panel (default = 500px unless specified in report), report: '' } """ if not result: return {} if not isinstance(result, list): result = [result] if (not result[0] or not isinstance(result[0], dict) or not result[0].get('report_name') or not result[0].get('report_ref')): return {} report_ref = result[0]['report_ref'] report = ws_client.get_objects2({'objects': [{ 'ref': report_ref }]})['data'][0]['data'] """{'direct_html': None, 'direct_html_link_index': None, 'file_links': [], 'html_links': [], 'html_window_height': None, 'objects_created': [{'description': 'Annotated genome', 'ref': '43666/6/1'}], 'summary_window_height': None, 'text_message': 'Genome saved to: wjriehl:narrative_1564507007662/some_genome\nNumber of genes predicted: 3895\nNumber of protein coding genes: 3895\nNumber of genes with non-hypothetical function: 2411\nNumber of genes with EC-number: 1413\nNumber of genes with Seed Subsystem Ontology: 1081\nAverage protein length: 864 aa.\n', 'warnings': []} """ created_objs = [] if report.get('objects_created'): report_objs_created = report['objects_created'] # make list to look up obj types with get_object_info3 info_lookup = [{"ref": o["ref"]} for o in report_objs_created] infos = ws_client.get_object_info3({'objects': info_lookup})['infos'] for idx, info in enumerate(infos): created_objs.append({ 'upa': report_objs_created[idx]['ref'], 'description': report_objs_created[idx].get('description', ''), 'name': info[1], 'type': info[2].split('-')[0].split('.')[-1], 'link': host + '/#dataview/' + report_objs_created[idx]['ref'] }) html_height = report.get("html_window_height") if html_height is None: html_height = 500 html = {"height": f"{html_height}px", "set_height": True} if report.get("direct_html"): if not report.get("direct_html").startswith("<html"): html["set_height"] = False html["direct"] = "data:text/html;charset=utf-8," + quote( report.get("direct_html")) if report.get("html_links"): idx = report.get("direct_html_link_index", 0) if idx is None or idx < 0 or idx >= len(report["html_links"]): idx = 0 html["links"] = report["html_links"] html["paths"] = list() for i, link in enumerate(html["links"]): html["paths"].append(f'/api/v1/{report_ref}/$/{i}/{link["name"]}') html["link_idx"] = idx if report.get("file_links"): html["file_links"] = report["file_links"] summary_height = report.get("summary_window_height") if summary_height is None: summary_height = 500 
html["iframe_style"] = f"max-height: {html['height']}" if html["set_height"]: html["iframe_style"] += f"; height: {html['height']}" else: html["iframe_style"] += "; height: auto" return { "objects": created_objs, "summary": report.get("text_message", ""), "summary_height": f"{summary_height}px", "html": html }
def stage_input(self, input_ref, fasta_file_extension): ''' Stage input based on an input data reference for CheckM input_ref can be a reference to an Assembly, BinnedContigs, or (not yet implemented) a Genome This method creates a directory in the scratch area with the set of Fasta files, names will have the fasta_file_extension parameter tacked on. ex: staged_input = stage_input('124/15/1', 'fna') staged_input {"input_dir": '...'} ''' # config #SERVICE_VER = 'dev' SERVICE_VER = 'release' [OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I, WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I] = range(11) # object_info tuple ws = Workspace(self.ws_url) # 1) generate a folder in scratch to hold the input suffix = str(int(time.time() * 1000)) input_dir = os.path.join(self.scratch, 'bins_' + suffix) all_seq_fasta = os.path.join(self.scratch, 'all_sequences_' + suffix + '.' + fasta_file_extension) if not os.path.exists(input_dir): os.makedirs(input_dir) # 2) based on type, download the files obj_name = self.get_data_obj_name (input_ref) type_name = self.get_data_obj_type (input_ref) # auClient try: auClient = AssemblyUtil(self.callbackURL, token=self.ctx['token'], service_ver=SERVICE_VER) except Exception as e: raise ValueError('Unable to instantiate auClient with callbackURL: '+ self.callbackURL +' ERROR: ' + str(e)) # setAPI_Client try: #setAPI_Client = SetAPI (url=self.callbackURL, token=self.ctx['token']) # for SDK local. local doesn't work for SetAPI setAPI_Client = SetAPI (url=self.serviceWizardURL, token=self.ctx['token']) # for dynamic service except Exception as e: raise ValueError('Unable to instantiate setAPI_Client with serviceWizardURL: '+ self.serviceWizardURL +' ERROR: ' + str(e)) # mguClient try: mguClient = MetagenomeUtils(self.callbackURL, token=self.ctx['token'], service_ver=SERVICE_VER) except Exception as e: raise ValueError('Unable to instantiate mguClient with callbackURL: '+ self.callbackURL +' ERROR: ' + str(e)) # Standard Single Assembly # if type_name in ['KBaseGenomeAnnotations.Assembly', 'KBaseGenomes.ContigSet']: # create file data filename = os.path.join(input_dir, obj_name + '.' + fasta_file_extension) auClient.get_assembly_as_fasta({'ref': input_ref, 'filename': filename}) if not os.path.isfile(filename): raise ValueError('Error generating fasta file from an Assembly or ContigSet with AssemblyUtil') # make sure fasta file isn't empty min_fasta_len = 1 if not self.fasta_seq_len_at_least(filename, min_fasta_len): raise ValueError('Assembly or ContigSet is empty in filename: '+str(filename)) # AssemblySet # elif type_name == 'KBaseSets.AssemblySet': # read assemblySet try: assemblySet_obj = setAPI_Client.get_assembly_set_v1 ({'ref':input_ref, 'include_item_info':1}) except Exception as e: raise ValueError('Unable to get object from workspace: (' + input_ref +')' + str(e)) assembly_refs = [] assembly_names = [] for assembly_item in assemblySet_obj['data']['items']: this_assembly_ref = assembly_item['ref'] # assembly obj info try: this_assembly_info = ws.get_object_info_new ({'objects':[{'ref':this_assembly_ref}]})[0] this_assembly_name = this_assembly_info[NAME_I] except Exception as e: raise ValueError('Unable to get object from workspace: (' + this_assembly_ref +'): ' + str(e)) assembly_refs.append(this_assembly_ref) assembly_names.append(this_assembly_name) # create file data (name for file is what's reported in results) for ass_i,assembly_ref in enumerate(assembly_refs): this_name = assembly_names[ass_i] filename = os.path.join(input_dir, this_name + '.' 
+ fasta_file_extension) auClient.get_assembly_as_fasta({'ref': assembly_ref, 'filename': filename}) if not os.path.isfile(filename): raise ValueError('Error generating fasta file from an Assembly or ContigSet with AssemblyUtil') # make sure fasta file isn't empty min_fasta_len = 1 if not self.fasta_seq_len_at_least(filename, min_fasta_len): raise ValueError('Assembly or ContigSet is empty in filename: '+str(filename)) # Binned Contigs # elif type_name == 'KBaseMetagenomes.BinnedContigs': # download the bins as fasta and set the input folder name bin_file_dir = mguClient.binned_contigs_to_file({'input_ref': input_ref, 'save_to_shock': 0})['bin_file_directory'] os.rename(bin_file_dir, input_dir) # make sure fasta file isn't empty self.set_fasta_file_extensions(input_dir, fasta_file_extension) for (dirpath, dirnames, filenames) in os.walk(input_dir): for fasta_file in filenames: fasta_path = os.path.join (input_dir,fasta_file) min_fasta_len = 1 if not self.fasta_seq_len_at_least(fasta_path, min_fasta_len): raise ValueError('Binned Assembly is empty for fasta_path: '+str(fasta_path)) break # Genome and GenomeSet # elif type_name == 'KBaseGenomes.Genome' or type_name == 'KBaseSearch.GenomeSet': genome_obj_names = [] genome_sci_names = [] genome_assembly_refs = [] if type_name == 'KBaseGenomes.Genome': genomeSet_refs = [input_ref] else: # get genomeSet_refs from GenomeSet object genomeSet_refs = [] try: genomeSet_object = ws.get_objects2({'objects':[{'ref':input_ref}]})['data'][0]['data'] except Exception as e: raise ValueError('Unable to fetch '+str(input_ref)+' object from workspace: ' + str(e)) #to get the full stack trace: traceback.format_exc() # iterate through genomeSet members for genome_id in genomeSet_object['elements'].keys(): if 'ref' not in genomeSet_object['elements'][genome_id] or \ genomeSet_object['elements'][genome_id]['ref'] == None or \ genomeSet_object['elements'][genome_id]['ref'] == '': raise ValueError('genome_ref not found for genome_id: '+str(genome_id)+' in genomeSet: '+str(input_ref)) else: genomeSet_refs.append(genomeSet_object['elements'][genome_id]['ref']) # genome obj data for i,this_input_ref in enumerate(genomeSet_refs): try: objects = ws.get_objects2({'objects':[{'ref':this_input_ref}]})['data'] genome_obj = objects[0]['data'] genome_obj_info = objects[0]['info'] genome_obj_names.append(genome_obj_info[NAME_I]) genome_sci_names.append(genome_obj['scientific_name']) except: raise ValueError ("unable to fetch genome: "+this_input_ref) # Get genome_assembly_ref if ('contigset_ref' not in genome_obj or genome_obj['contigset_ref'] == None) \ and ('assembly_ref' not in genome_obj or genome_obj['assembly_ref'] == None): msg = "Genome "+genome_obj_names[i]+" (ref:"+input_ref+") "+genome_sci_names[i]+" MISSING BOTH contigset_ref AND assembly_ref. Cannot process. Exiting." 
                raise ValueError(msg)
            elif 'assembly_ref' in genome_obj and genome_obj['assembly_ref'] != None:
                msg = "Genome "+genome_obj_names[i]+" (ref:"+input_ref+") "+genome_sci_names[i]+" USING assembly_ref: "+str(genome_obj['assembly_ref'])
                print(msg)
                genome_assembly_refs.append(genome_obj['assembly_ref'])
            elif 'contigset_ref' in genome_obj and genome_obj['contigset_ref'] != None:
                msg = "Genome "+genome_obj_names[i]+" (ref:"+input_ref+") "+genome_sci_names[i]+" USING contigset_ref: "+str(genome_obj['contigset_ref'])
                print(msg)
                genome_assembly_refs.append(genome_obj['contigset_ref'])

        # create file data (name for file is what's reported in results)
        for ass_i,assembly_ref in enumerate(genome_assembly_refs):
            this_name = genome_obj_names[ass_i]
            filename = os.path.join(input_dir, this_name + '.' + fasta_file_extension)
            auClient.get_assembly_as_fasta({'ref': assembly_ref, 'filename': filename})
            if not os.path.isfile(filename):
                raise ValueError('Error generating fasta file from an Assembly or ContigSet with AssemblyUtil')
            # make sure fasta file isn't empty
            min_fasta_len = 1
            if not self.fasta_seq_len_at_least(filename, min_fasta_len):
                raise ValueError('Assembly or ContigSet is empty in filename: '+str(filename))

    # Unknown type slipped through
    #
    else:
        raise ValueError('Cannot stage fasta file input directory from type: ' + type_name)

    # create summary fasta file with all bins
    self.cat_fasta_files(input_dir, fasta_file_extension, all_seq_fasta)

    return {'input_dir': input_dir, 'folder_suffix': suffix, 'all_seq_fasta': all_seq_fasta}
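# stage_input leans on self.fasta_seq_len_at_least() to reject empty FASTA
# files, but that helper is defined elsewhere in the module. A minimal sketch
# of what such a check could look like (an assumption, not the module's
# actual implementation):
def fasta_seq_len_at_least(fasta_path, min_fasta_len=1):
    """True if the FASTA file contains at least min_fasta_len sequence chars."""
    seq_len = 0
    with open(fasta_path) as fh:
        for line in fh:
            line = line.strip()
            if not line or line.startswith('>'):
                continue  # skip headers and blank lines
            seq_len += len(line)
            if seq_len >= min_fasta_len:
                return True
    return False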
class AveExpressionMatrixBuilder: def _validate_calculate_average_expression_matrix_params(self, params): """ _validate_calculate_average_expression_matrix_params: validates params passed to calculate_average_expression_matrix method """ log('start validating calculate_average_expression_matrix params') # check for required parameters for p in ['expression_matrix_ref', 'output_suffix', 'workspace_name']: if p not in params: raise ValueError('"{}" parameter is required, but missing'.format(p)) def _generate_report(self, expression_matrix_ref, workspace_name): """ _generate_report: generate report """ objects_created = [{'ref': expression_matrix_ref, 'description': 'Average ExpressionMatrix'}] report_params = {'message': '', 'workspace_name': workspace_name, 'objects_created': objects_created, # 'html_links': output_html_files, # 'direct_html_link_index': 0, 'html_window_height': 366, 'report_object_name': 'kb_ave_expr_matrix_report_' + str(uuid.uuid4())} kbase_report_client = KBaseReport(self.callback_url, token=self.token) output = kbase_report_client.create_extended_report(report_params) report_output = {'report_name': output['name'], 'report_ref': output['ref']} return report_output def _save_expression_matrix(self, em_data, em_obj_name, workspace_name): """ _save_expression_matrix: saving ExpressionMatrix """ try: log('saving ExpressionMatrix [{}]'.format(em_obj_name)) data_type = 'KBaseFeatureValues.ExpressionMatrix' obj_info = self.dfu.save_objects({'id': self.dfu.ws_name_to_id(workspace_name), 'objects': [{'type': data_type, 'data': em_data, 'name': em_obj_name}]})[0] except Exception as e: log(e) raise Exception('Failed Saving ExpressionMatrix to Workspace') expression_matrix_ref = str(obj_info[6]) + '/' + str(obj_info[0]) + '/' + str(obj_info[4]) return expression_matrix_ref def __init__(self, config): self.ws_url = config["workspace-url"] self.callback_url = config['SDK_CALLBACK_URL'] self.token = config['KB_AUTH_TOKEN'] self.shock_url = config['shock-url'] self.ws = Workspace(self.ws_url, token=self.token) self.dfu = DataFileUtil(self.callback_url) self.scratch = config['scratch'] def calculate_average_expression_matrix(self, params): """ calculate_average_expression_matrix: create an average ExpressionMatrix object from a ExpressionMatrix object required params: expression_matrix_ref: ExpressionMatrix object reference output_suffix: output average ExpressionMatrix name suffix workspace_name: the name of the workspace it gets saved to return: average_expression_matrix_ref: generated average ExpressionMatrix object reference report_name: report name generated by KBaseReport report_ref: report reference generated by KBaseReport """ log('--->\nrunning AveExpressionMatrixBuilder.calculate_average_expression_matrix\n' + 'params:\n{}'.format(json.dumps(params, indent=1))) self._validate_calculate_average_expression_matrix_params(params) expression_matrix_ref = params.get('expression_matrix_ref') expression_matrix = self.ws.get_objects2({'objects': [{'ref': expression_matrix_ref}]})['data'][0] expression_matrix_data = expression_matrix['data'] expression_matrix_info = expression_matrix['info'] condition_map = expression_matrix_data['condition_mapping'] ori_data = expression_matrix_data['data'] ori_col_ids = ori_data['col_ids'] ori_row_ids = ori_data['row_ids'] ori_values = ori_data['values'] labels = list(condition_map.keys()) if set(labels) != set(ori_col_ids): error_msg = 'available labels: {}\n'.format(ori_col_ids) error_msg += 'labels in condition_mapping: {}'.format(labels) raise 
ValueError(error_msg) condition_pos = {} for label, condition in condition_map.items(): if condition not in condition_pos: condition_pos.update({condition: [ori_col_ids.index(label)]}) else: condition_list = condition_pos[condition] condition_list.append(ori_col_ids.index(label)) condition_pos.update({condition: condition_list}) conditions = list(condition_pos.keys()) ave_values = [] for ori_value in ori_values: ave_value = [None] * len(conditions) for condition, poss in condition_pos.items(): ave_pos = conditions.index(condition) sum_value = 0.0 for pos in poss: sum_value += round(float(ori_value[pos]), 3) average = sum_value / len(poss) ave_value[ave_pos] = round(average, 2) ave_values.append(ave_value) average_data = {} average_data.update({'row_ids': ori_row_ids}) average_data.update({'col_ids': conditions}) average_data.update({'values': ave_values}) em_data = {} genome_ref = expression_matrix_data.get('genome_ref') if genome_ref: em_data.update({'genome_ref': genome_ref}) em_data.update({'scale': expression_matrix_data.get('scale')}) em_data.update({'type': expression_matrix_data.get('type')}) em_data.update({'feature_mapping': expression_matrix_data.get('feature_mapping')}) em_data.update({'condition_mapping': expression_matrix_data.get('condition_mapping')}) em_data.update({'data': average_data}) expression_matrix_name = expression_matrix_info[1] ave_expression_matrix_name = expression_matrix_name + params.get('output_suffix') workspace_name = params.get('workspace_name') ave_expression_matrix_ref = self._save_expression_matrix(em_data, ave_expression_matrix_name, workspace_name) returnVal = {'average_expression_matrix_ref': ave_expression_matrix_ref} report_output = self._generate_report(ave_expression_matrix_ref, workspace_name) returnVal.update(report_output) return returnVal
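# The averaging loop in calculate_average_expression_matrix rounds each
# replicate value to 3 decimals before summing, then rounds the per-condition
# mean to 2 decimals. The same idea as a standalone sketch (a hypothetical
# helper that mirrors, rather than replaces, the logic above):
def average_by_condition(col_ids, condition_mapping, values):
    """Collapse replicate columns into one column per condition.

    condition_mapping: {column_label: condition_name}, as in the
    ExpressionMatrix 'condition_mapping' field; values is row-major and
    aligned with col_ids.
    """
    condition_pos = {}
    for label, condition in condition_mapping.items():
        condition_pos.setdefault(condition, []).append(col_ids.index(label))
    conditions = list(condition_pos)
    ave_values = []
    for row in values:
        ave_row = []
        for condition in conditions:
            poss = condition_pos[condition]
            total = sum(round(float(row[pos]), 3) for pos in poss)
            ave_row.append(round(total / len(poss), 2))
        ave_values.append(ave_row)
    return conditions, ave_values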
class DESeqUtil: PREPDE_TOOLKIT_PATH = '/kb/module/lib/kb_deseq/Utils' def _validate_run_deseq2_app_params(self, params): """ _validate_run_deseq2_app_params: validates params passed to run_deseq2_app method """ logging.info('start validating run_deseq2_app params') # check for required parameters for p in ['expressionset_ref', 'workspace_name']: if p not in params: raise ValueError( '"{}" parameter is required, but missing'.format(p)) def _validate_run_deseq2_app_with_condition_set_params(self, params): """ _validate_run_deseq2_app_params: validates params passed to run_deseq2_app method """ logging.info( 'start validating run_deseq2_app_with_condition_set params') # check for required parameters for p in [ 'expressionset_ref', 'workspace_name', 'diff_expression_obj_name', 'conditionset_ref', 'group_factor' ]: if p not in params: raise ValueError( '"{}" parameter is required, but missing'.format(p)) def _xor(self, a, b): return bool(a) != bool(b) def _run_command(self, command): """ _run_command: run command and print result """ logging.info('Start executing command:\n{}'.format(command)) pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True) output = pipe.communicate()[0] exit_code = pipe.returncode if exit_code == 0: logging.info(f'Executed command:\n{command}\n' f'Exit Code: {exit_code}\nOutput:\n{output}') else: error_msg = 'Error running command:\n{}\n'.format(command) error_msg += 'Exit Code: {}\nOutput:\n{}'.format(exit_code, output) raise ValueError(error_msg) def _generate_html_report(self, result_directory, diff_expression_obj_ref, params): """ _generate_html_report: generate html summary report """ logging.info('start generating html report') html_report = list() output_directory = os.path.join(self.scratch, str(uuid.uuid4())) os.makedirs(output_directory, exist_ok=True) result_file_path = os.path.join(output_directory, 'report.html') visualization_content = '' dispersion_plots_name = 'deseq2_dispersion_plot.png' dispersion_plots_display_name = 'Dispersion plot' shutil.copy2(os.path.join(result_directory, dispersion_plots_name), os.path.join(output_directory, dispersion_plots_name)) visualization_content += '<div class="gallery">' visualization_content += '<a target="_blank" href="{}">'.format( dispersion_plots_name) visualization_content += '<img src="{}" '.format(dispersion_plots_name) visualization_content += 'alt="{}" width="600" height="400">'.format( dispersion_plots_display_name) visualization_content += '</a><div class="desc">{}</div></div>'.format( dispersion_plots_display_name) pca_plots_name = 'deseq2_PCA_plot.png' pca_plots_display_name = 'PCA plot' shutil.copy2(os.path.join(result_directory, pca_plots_name), os.path.join(output_directory, pca_plots_name)) visualization_content += '<div class="gallery">' visualization_content += '<a target="_blank" href="{}">'.format( pca_plots_name) visualization_content += '<img src="{}" '.format(pca_plots_name) visualization_content += 'alt="{}" width="600" height="400">'.format( pca_plots_display_name) visualization_content += '</a><div class="desc">{}</div></div>'.format( pca_plots_display_name) diff_expr_set_data = self.ws.get_objects2( {'objects': [{ 'ref': diff_expression_obj_ref }]})['data'][0]['data'] overview_content = '' overview_content += '<br/><table><tr><th>Generated DifferentialExpressionMatrixSet' overview_content += ' Object</th></tr>' overview_content += '<tr><td>{} ({})'.format( params.get('diff_expression_obj_name'), diff_expression_obj_ref) overview_content += '</td></tr></table>' 
overview_content += '<p><br/></p>' overview_content += '<br/><table><tr><th>Generated DifferentialExpressionMatrix' overview_content += ' Object</th><th></th><th></th><th></th></tr>' overview_content += '<tr><th>Differential Expression Matrix Name</th>' overview_content += '<th>Feature Count</th>' overview_content += '</tr>' for item in diff_expr_set_data['items']: diff_expr_ref = item['ref'] diff_expr_object = self.ws.get_objects2( {'objects': [{ 'ref': diff_expr_ref }]})['data'][0] diff_expr_data = diff_expr_object['data'] diff_expr_info = diff_expr_object['info'] diff_expr_name = diff_expr_info[1] number_features = len(diff_expr_data['data']['row_ids']) overview_content += '<tr><td>{} ({})</td>'.format( diff_expr_name, diff_expr_ref) overview_content += '<td>{}</td></tr>'.format(number_features) overview_content += '</table>' with open(result_file_path, 'w') as result_file: with open( os.path.join(os.path.dirname(__file__), 'report_template.html'), 'r') as report_template_file: report_template = report_template_file.read() report_template = report_template.replace( '<p>Overview_Content</p>', overview_content) report_template = report_template.replace( '<p>Visualization_Content</p>', visualization_content) result_file.write(report_template) report_shock_id = self.dfu.file_to_shock({ 'file_path': output_directory, 'pack': 'zip' })['shock_id'] html_report.append({ 'shock_id': report_shock_id, 'name': os.path.basename(result_file_path), 'label': os.path.basename(result_file_path), 'description': 'HTML summary report for DESeq2 App' }) return html_report def _generate_output_file_list(self, result_directory): """ _generate_output_file_list: zip result files and generate file_links for report """ logging.info('start packing result files') output_files = list() output_directory = os.path.join(self.scratch, str(uuid.uuid4())) os.makedirs(output_directory, exist_ok=True) result_file = os.path.join(output_directory, 'DESeq2_result.zip') plot_file = os.path.join(output_directory, 'DESeq2_plot.zip') with zipfile.ZipFile(result_file, 'w', zipfile.ZIP_DEFLATED, allowZip64=True) as zip_file: for root, dirs, files in os.walk(result_directory): for file in files: if not (file.endswith('.zip') or file.endswith('.png') or file.endswith('.DS_Store')): zip_file.write( os.path.join(root, file), os.path.join(os.path.basename(root), file)) output_files.append({ 'path': result_file, 'name': os.path.basename(result_file), 'label': os.path.basename(result_file), 'description': 'File(s) generated by DESeq2 App' }) with zipfile.ZipFile(plot_file, 'w', zipfile.ZIP_DEFLATED, allowZip64=True) as zip_file: for root, dirs, files in os.walk(result_directory): for file in files: if file.endswith('.png'): zip_file.write( os.path.join(root, file), os.path.join(os.path.basename(root), file)) output_files.append({ 'path': plot_file, 'name': os.path.basename(plot_file), 'label': os.path.basename(plot_file), 'description': 'Visualization plots by DESeq2 App' }) return output_files def _generate_report(self, diff_expression_obj_ref, params, result_directory): """ _generate_report: generate summary report """ logging.info('creating report') output_files = self._generate_output_file_list(result_directory) output_html_files = self._generate_html_report( result_directory, diff_expression_obj_ref, params) diff_expr_set_data = self.ws.get_objects2( {'objects': [{ 'ref': diff_expression_obj_ref }]})['data'][0]['data'] items = diff_expr_set_data['items'] description_set = 'DifferentialExpressionMatrixSet generated by DESeq2' 
description_object = 'DifferentialExpressionMatrix generated by DESeq2' objects_created = [] objects_created.append({ 'ref': diff_expression_obj_ref, 'description': description_set }) for item in items: diff_expr_ref = item['ref'] objects_created.append({ 'ref': diff_expr_ref, 'description': description_object }) report_params = { 'message': '', 'workspace_name': params.get('workspace_name'), 'objects_created': objects_created, 'file_links': output_files, 'html_links': output_html_files, 'direct_html_link_index': 0, 'html_window_height': 333, 'report_object_name': 'kb_deseq2_report_' + str(uuid.uuid4()) } kbase_report_client = KBaseReport(self.callback_url) output = kbase_report_client.create_extended_report(report_params) report_output = { 'report_name': output['name'], 'report_ref': output['ref'] } return report_output def _save_count_matrix_file(self, result_directory): """ _save_count_matrix_file: download gtf file for each expression run prepDE.py on them and save resulting count matrix file """ logging.info('generating count matrix file') conditions = [] genome_ref = None items = self.expression_set_data['items'] gtf_directory = os.path.join(self.scratch, str(uuid.uuid4())) os.makedirs(gtf_directory, exist_ok=True) mapping_file = os.path.join(gtf_directory, "mapping.txt") with open(mapping_file, 'w') as input_mapping: for item in items: expression_ref = item['ref'] expression_object = self.ws.get_objects2( {'objects': [{ 'ref': expression_ref }]})['data'][0] expression_data = expression_object['data'] expression_info = expression_object['info'] handle_id = expression_data.get('file').get('hid') expression_name = expression_info[1] conditions.append(expression_data['condition']) genome_ref = expression_data['genome_id'] tmp_gtf_directory = os.path.join(gtf_directory, expression_name) os.makedirs(tmp_gtf_directory, exist_ok=True) self.dfu.shock_to_file({ 'handle_id': handle_id, 'file_path': tmp_gtf_directory, 'unpack': 'unpack' }) input_mapping.write("{}\t{}/transcripts.gtf\n".format( expression_name, tmp_gtf_directory)) self._run_prepDE(result_directory, mapping_file) return ",".join(conditions), genome_ref def _run_prepDE(self, result_directory, input): """ _run_prepDE: run prepDE.py script ref: http://ccb.jhu.edu/software/stringtie/index.shtml?t=manual#deseq """ logging.info('generating matrix of read counts') command = self.PREPDE_TOOLKIT_PATH + '/prepDE.py ' command += '-i {} '.format(input) command += '-g {} '.format( os.path.join(result_directory, 'raw_gene_count_matrix.csv')) command += '-t {} '.format( os.path.join(result_directory, 'transcript_count_matrix.csv')) self._run_command(command) # remove novel genes from results (ideally should compare against expression set) with open(os.path.join(result_directory, 'raw_gene_count_matrix.csv')) as infile, open( os.path.join(result_directory, 'gene_count_matrix.csv'), 'w') as outfile: outfile.writelines([l for l in infile if "STRG." 
not in l]) def _generate_diff_expression_csv(self, result_directory, condition_string, params): """ _generate_diff_expression_csv: get different expression matrix with DESeq2 """ result_files = os.listdir(result_directory) if 'gene_count_matrix.csv' not in result_files: error_msg = 'Missing gene_count_matrix.csv, available files: {}'.format( result_files) raise ValueError(error_msg) pair_string = ",".join( ["_vs_".join(x) for x in params['condition_labels']]) rcmd_list = [ 'Rscript', os.path.join(os.path.dirname(__file__), 'run_DESeq.R') ] rcmd_list.extend( ['--result_directory', '"{}"'.format(result_directory)]) rcmd_list.extend( ['--condition_string', '"{}"'.format(condition_string)]) rcmd_list.extend(['--contrast_pairs', '"{}"'.format(pair_string)]) if params.get('input_type') == 'transcripts': rcmd_list.extend(['--transcripts']) rcmd_str = " ".join(str(x) for x in rcmd_list) self._run_command(rcmd_str) def _save_diff_expression(self, result_directory, params): """ _save_diff_expression: save DifferentialExpression object to workspace """ logging.info( 'start saving KBaseFeatureValues.DifferentialExpressionMatrix object' ) workspace_name = params.get('workspace_name') diff_expression_obj_name = params.get('diff_expression_obj_name') destination_ref = workspace_name + '/' + diff_expression_obj_name diff_expr_files = list() for res_file in os.listdir(result_directory): if 'deseq_results.csv' not in res_file: continue condition_labels = res_file.replace('_deseq_results.csv', '').split('_vs_', 2)[:2] genes_results_filepath = os.path.join(result_directory, res_file) with open(genes_results_filepath, "r") as f: reader = csv.reader(f) columns = next(reader)[1:] columns[columns.index('log2FoldChange')] = 'log2_fold_change' columns[columns.index('pvalue')] = 'p_value' columns[columns.index('padj')] = 'q_value' for line in fileinput.input(genes_results_filepath, inplace=True): if fileinput.isfirstline(): print('gene_id,' + ','.join(columns)) else: print(line) reader = csv.DictReader(open(genes_results_filepath)) diffexpr_filepath = genes_results_filepath.replace( 'deseq_results.csv', 'differential_expression_result.csv') with open(diffexpr_filepath, 'w') as csvfile: fieldnames = [ 'gene_id', 'log2_fold_change', 'p_value', 'q_value' ] writer = csv.DictWriter(csvfile, fieldnames=fieldnames) writer.writeheader() for row in reader: writer.writerow({ 'gene_id': row.get('gene_id'), 'log2_fold_change': row.get('log2_fold_change'), 'p_value': row.get('p_value'), 'q_value': row.get('q_value') }) diff_expr_files.append({ 'condition_mapping': { condition_labels[0]: condition_labels[1] }, 'diffexpr_filepath': diffexpr_filepath }) upload_diff_expr_params = { 'destination_ref': destination_ref, 'diffexpr_data': diff_expr_files, 'tool_used': 'deseq', 'tool_version': '1.16.1', 'genome_ref': params['genome_ref'] } deu_upload_return = self.deu.save_differential_expression_matrix_set( upload_diff_expr_params) diff_expression_obj_ref = deu_upload_return['diffExprMatrixSet_ref'] return diff_expression_obj_ref def _get_condition_labels(self): """ _get_condition_labels: get all possible condition label pairs """ logging.info('getting all possible condition pairs') items = self.expression_set_data.get('items') condition_replicate_name_mapping = collections.OrderedDict() for item in items: expression_ref = item['ref'] expr_object = self.ws.get_objects2( {'objects': [{ 'ref': expression_ref }]})['data'][0] expr_data = expr_object['data'] expr_info = expr_object['info'] expr_name = expr_info[1] expr_condition = 
expr_data['condition'] expr_name_list = condition_replicate_name_mapping.get( expr_condition) if expr_name_list: expr_name_list.append(expr_name) condition_replicate_name_mapping.update( {expr_condition: expr_name_list}) else: condition_replicate_name_mapping.update( {expr_condition: [expr_name]}) condition_labels = list(condition_replicate_name_mapping.keys()) condition_label_pairs = [ list(pair) for pair in itertools.combinations(condition_labels, 2) ] logging.info( 'all possible condition pairs:\n{}'.format(condition_label_pairs)) return condition_label_pairs, condition_labels @staticmethod def _check_input_labels(condition_pairs, available_condition_labels): """ _check_input_labels: check input condition pairs """ checked = True for condition_pair in condition_pairs: first_label = condition_pair['condition_label_1'][0].strip() second_label = condition_pair['condition_label_2'][0].strip() if first_label not in available_condition_labels: error_msg = 'Condition: {} is not available. '.format( first_label) error_msg += 'Available conditions: {}'.format( available_condition_labels) raise ValueError(error_msg) if second_label not in available_condition_labels: error_msg = 'Condition: {} is not available. '.format( second_label) error_msg += 'Available conditions: {}'.format( available_condition_labels) raise ValueError(error_msg) if first_label == second_label: raise ValueError('Input conditions are the same') return checked def _generate_condition_string(self, expression_set_data, conditionset_ref, group_factor): """ _generate_condition_string: generate condition string based on conditionset factors """ condition_strings = [] condition_set_obj = self.dfu.get_objects( {'object_refs': [conditionset_ref]})['data'][0] condition_set_data = condition_set_obj['data'] conditions = condition_set_data.get('conditions') factors = [ factor.get('factor') for factor in condition_set_data.get('factors') ] try: position = factors.index(group_factor) except: error_msg = 'Group Factor {} is not available\n'.format( group_factor) error_msg += 'Available factors {}'.format(factors) raise ValueError(error_msg) for expr in expression_set_data.get('items'): condition_id = expr.get('label') try: condition = conditions[condition_id] except KeyError: error_msg = 'Condition ID [{}] '.format(condition_id) error_msg += 'is not available in ConditionSet object' raise ValueError(error_msg) condition_strings.append(condition[position]) return ",".join(condition_strings) def __init__(self, config): self.ws_url = config["workspace-url"] self.callback_url = config['SDK_CALLBACK_URL'] self.token = config['KB_AUTH_TOKEN'] self.shock_url = config['shock-url'] self.dfu = DataFileUtil(self.callback_url) self.rau = ReadsAlignmentUtils(self.callback_url) self.deu = DifferentialExpressionUtils(self.callback_url, service_ver='dev') self.gsu = GenomeSearchUtil(self.callback_url) self.ws = Workspace(self.ws_url, token=self.token) self.scratch = config['scratch'] def run_deseq2_app_with_condition_set(self, params): """ run_deseq2_app_with_condition_set: run DESeq2 app with ConditionSet (https://www.bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html) required params: expressionset_ref: ExpressionSet object reference diff_expression_obj_name: DifferentialExpressoinMatrixSet object name workspace_name: the name of the workspace it gets saved to conditionset_ref: ConditionSet object reference group_factor: factor in conditionset used for grouping expressions optional params: run_all_combinations: run all paired 
condition combinations condition_labels: conditions for expression set object alpha_cutoff: q value cutoff fold_change_cutoff: fold change cutoff fold_scale_type: one of ["linear", "log2+1", "log10+1"] return: result_directory: folder path that holds all files generated by run_deseq2_app diff_expression_obj_ref: generated RNASeqDifferetialExpression object reference report_name: report name generated by KBaseReport report_ref: report reference generated by KBaseReport """ logging.info( '--->\nrunning DESeqUtil.run_deseq2_app_with_condition_set\n' f'params:\n{json.dumps(params, indent=1)}') self._validate_run_deseq2_app_with_condition_set_params(params) result_directory = os.path.join(self.scratch, str(uuid.uuid4())) os.makedirs(result_directory, exist_ok=True) expressionset_ref = params.get('expressionset_ref') expression_set_obj = self.dfu.get_objects( {'object_refs': [expressionset_ref]})['data'][0] self.expression_set_data = expression_set_obj['data'] # run prepDE.py and save count matrix file condition_ids, params['genome_ref'] = self._save_count_matrix_file( result_directory) conditionset_ref = params.get('conditionset_ref') group_factor = params.get('group_factor') # overwrite condition_string with conditionset factors condition_string = self._generate_condition_string( self.expression_set_data, conditionset_ref, group_factor) condition_labels = list(set(condition_string.split(','))) condition_label_pairs = [ list(pair) for pair in itertools.combinations(condition_labels, 2) ] if condition_label_pairs: params['condition_labels'] = condition_label_pairs else: error_msg = 'Only 1 condition was fetched from ConditionSet for fatcor {}'.format( group_factor) raise ValueError(error_msg) self._generate_diff_expression_csv(result_directory, condition_string, params) diff_expression_obj_ref = self._save_diff_expression( result_directory, params) returnVal = { 'result_directory': result_directory, 'diff_expression_obj_ref': diff_expression_obj_ref } report_output = self._generate_report(diff_expression_obj_ref, params, result_directory) returnVal.update(report_output) return returnVal def run_deseq2_app(self, params): """ run_deseq2_app: run DESeq2 app (https://www.bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html) required params: expressionset_ref: ExpressionSet object reference diff_expression_obj_name: DifferentialExpressoinMatrixSet object name workspace_name: the name of the workspace it gets saved to optional params: run_all_combinations: run all paired condition combinations condition_labels: conditions for expression set object alpha_cutoff: q value cutoff fold_change_cutoff: fold change cutoff fold_scale_type: one of ["linear", "log2+1", "log10+1"] return: result_directory: folder path that holds all files generated by run_deseq2_app diff_expression_obj_ref: generated RNASeqDifferetialExpression object reference report_name: report name generated by KBaseReport report_ref: report reference generated by KBaseReport """ if params.get('conditionset_ref'): return self.run_deseq2_app_with_condition_set(params) logging.info('--->\nrunning DESeqUtil.run_deseq2_app\n' + f'params:\n{json.dumps(params, indent=1)}') self._validate_run_deseq2_app_params(params) result_directory = os.path.join(self.scratch, str(uuid.uuid4())) os.makedirs(result_directory, exist_ok=True) expressionset_ref = params.get('expressionset_ref') expression_set_obj = self.ws.get_objects2( {'objects': [{ 'ref': expressionset_ref }]})['data'][0] self.expression_set_data = 
expression_set_obj['data'] available_condition_label_pairs, available_condition_labels = self._get_condition_labels( ) run_all_combinations = params.get('run_all_combinations') condition_pairs = params.get('condition_pairs') if not self._xor(run_all_combinations, condition_pairs): error_msg = "Invalid input:\nselect 'Run All Paired Condition Combinations' " error_msg += "or provide partial condition pairs. Don't do both" raise ValueError(error_msg) if run_all_combinations: condition_label_pairs = available_condition_label_pairs else: self._check_input_labels(condition_pairs, available_condition_labels) condition_label_pairs = [] for condition_pair in condition_pairs: condition_labels = [ condition_pair.get('condition_label_1')[0].strip(), condition_pair.get('condition_label_2')[0].strip() ] condition_label_pairs.append(condition_labels) params['condition_labels'] = condition_label_pairs # run prepDE.py and save count matrix file condition_string, params['genome_ref'] = self._save_count_matrix_file( result_directory) self._generate_diff_expression_csv(result_directory, condition_string, params) diff_expression_obj_ref = self._save_diff_expression( result_directory, params) returnVal = { 'result_directory': result_directory, 'diff_expression_obj_ref': diff_expression_obj_ref } report_output = self._generate_report(diff_expression_obj_ref, params, result_directory) returnVal.update(report_output) return returnVal
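# run_deseq2_app accepts exactly one of `run_all_combinations` and
# `condition_pairs` (the _xor check) and then expands condition labels into
# pairs with itertools.combinations. The selection logic, condensed into a
# sketch under the same assumptions (labels arrive as single-element lists,
# as in the UI-generated params):
import itertools

def pick_condition_pairs(run_all, condition_pairs, available_labels):
    if bool(run_all) == bool(condition_pairs):
        raise ValueError("select 'Run All Paired Condition Combinations' or "
                         "provide condition pairs, but not both")
    if run_all:
        return [list(pair) for pair in itertools.combinations(available_labels, 2)]
    return [[pair['condition_label_1'][0].strip(),
             pair['condition_label_2'][0].strip()] for pair in condition_pairs]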
class AMAUtils():
    def __init__(self, ws_url, cb_url, token, scratch):
        self.ws = Workspace(ws_url, token=token)
        self.cb_url = cb_url
        self.token = token
        self.scratch = scratch

    def _confirm_ws_type(self, ref):
        """confirm whether 'ref' is of type 'KBaseMetagenomes.AnnotatedMetagenomeAssembly';
        if not, throw error.
        """
        if ref is None:
            raise ValueError(" 'ref' argument must be specified.")
        obj_info = self.ws.get_object_info3({'objects': [{
            'ref': ref
        }]})['infos'][0]
        # check object type is 'KBaseMetagenomes.AnnotatedMetagenomeAssembly'
        obj_type = obj_info[2]
        if 'KBaseMetagenomes.AnnotatedMetagenomeAssembly' not in obj_type:
            raise ValueError(
                f"input ref '{ref}' is of type {obj_type}. function "
                "'get_annotated_metagenome_assembly' requires objects"
                " of type KBaseMetagenomes.AnnotatedMetagenomeAssembly")

    def get_annotated_metagenome_assembly(self, params):
        """
        params:
            ref - workspace reference
            included_fields - list of fields to include, defaults to list below if not specified.
        output:
            genomes - contains the returned data fields from the workspace request.
        """
        ref = params.get('ref', None)
        included_fields = params.get('included_fields', None)
        self._confirm_ws_type(ref)

        get_obj_params = {'ref': ref}
        if included_fields is not None:
            get_obj_params['included'] = included_fields

        data = self.ws.get_objects2({'objects': [get_obj_params]})['data']
        return {'genomes': data}

    def get_annotated_metagenome_assembly_features(self, params):
        """
        params:
            ref - workspace reference for KBaseMetagenomes.AnnotatedMetagenomeAssembly object
        output:
            features - list of features, each representing a dict.
        """
        ref = params['ref']
        self._confirm_ws_type(ref)
        ret = self.ws.get_objects2(
            {"objects": [{
                "ref": ref,
                "included": ["features_handle_ref"]
            }]})['data']
        features_handle_ref = ret[0]['data']['features_handle_ref']
        dfu = DataFileUtil(self.cb_url, token=self.token)
        file_name = 'features.json.gz'
        file_path = os.path.join(self.scratch, file_name)
        shock_ret = dfu.shock_to_file({
            'handle_id': features_handle_ref,
            'file_path': file_path,
            'unpack': "uncompress"
        })
        file_path = shock_ret['file_path']

        with open(file_path) as fd:
            json_features = json.load(fd)

        if params.get('feature_type'):
            accepted_feature_types = [
                "cds", "gene", "mrna", "trna", "rrna", "repeat_region"
            ]
            feat_type = params['feature_type']
            if feat_type.lower() not in accepted_feature_types:
                raise ValueError(
                    f"{feat_type} not an accepted feature type; accepted feature"
                    f" types (in lower case) are {accepted_feature_types}")
            json_features = [
                feature for feature in json_features
                if feature['type'].lower() == feat_type.lower()
            ]

        if params.get('only_ids'):
            json_features = [{
                'id': feature['id']
            } for feature in json_features]

        return {'features': json_features}
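# AMAUtils usage sketch; the URLs, token, and object reference are
# placeholders, and the callback URL is read from the SDK environment as in
# the other modules shown here:
ama = AMAUtils(ws_url='https://kbase.us/services/ws',
               cb_url=os.environ['SDK_CALLBACK_URL'],
               token='<auth token>',
               scratch='/kb/module/work/tmp')
cds_ids = ama.get_annotated_metagenome_assembly_features(
    {'ref': '123/4/5', 'feature_type': 'cds', 'only_ids': True})['features']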
def export_genome_as_gff(self, ctx, params): """ :param params: instance of type "ExportParams" (input and output structure functions for standard downloaders) -> structure: parameter "input_ref" of String :returns: instance of type "ExportOutput" -> structure: parameter "shock_id" of String """ # ctx is the context object # return variables are: output #BEGIN export_genome_as_gff if 'input_ref' not in params: raise ValueError('Cannot run export_genome_as_gff- no "input_ref" ' 'field defined.') # get WS metadata to get ws_name and obj_name ws = Workspace(url=self.cfg.workspaceURL) info = ws.get_objects2({ 'objects': [{ 'ref': params['input_ref'], 'included': ['/assembly_ref', '/contigset_ref', '/id', '/gff_handle_ref'] }] })['data'][0]['data'] # export to file (building from KBase Genome Object) result = self.genome_to_gff(ctx, {'genome_ref': params['input_ref']})[0] # get assembly if 'assembly_ref' in info: assembly_ref = info['assembly_ref'] else: assembly_ref = info['contigset_ref'] print(('Assembly reference = ' + assembly_ref)) print('Downloading assembly') au = AssemblyUtil(self.cfg.callbackURL) assembly_file_path = au.get_assembly_as_fasta( {'ref': params['input_ref'] + ";" + assembly_ref})['path'] # create the output directory and move the files there export_package_dir = os.path.join(self.cfg.sharedFolder, info['id']) os.makedirs(export_package_dir) shutil.move( result['file_path'], os.path.join( export_package_dir, 'KBase_derived_' + os.path.basename(result['file_path']))) shutil.move( assembly_file_path, os.path.join(export_package_dir, os.path.basename(assembly_file_path))) # add cached genome if appropriate exporter = GenomeToGFF(self.cfg) cached = exporter.get_gff_handle(info, export_package_dir) # package it up dfUtil = DataFileUtil(self.cfg.callbackURL) package_details = dfUtil.package_for_download({ 'file_path': export_package_dir, 'ws_refs': [params['input_ref']] }) output = {'shock_id': package_details['shock_id']} #END export_genome_as_gff # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError('Method export_genome_as_gff return value ' + 'output is not type dict as required.') # return the results return [output]
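# export_genome_as_gff usage sketch (SDK test-harness style; `impl` and `ctx`
# come from the test setup and the reference is a placeholder). The returned
# shock_id points at a downloadable zip holding the KBase-derived GFF plus
# the genome's assembly FASTA:
shock_id = impl.export_genome_as_gff(ctx, {'input_ref': '123/4/5'})[0]['shock_id']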
class AMPLICON: ''' Module Name: AMPLICON Module Description: A KBase module: AMPLICON ''' ######## WARNING FOR GEVENT USERS ####### noqa # Since asynchronous IO can lead to methods - even the same method - # interrupting each other, you must be *very* careful when using global # state. A method could easily clobber the state set by another while # the latter method is running. ######################################### noqa VERSION = "0.0.1" GIT_URL = "" GIT_COMMIT_HASH = "" #BEGIN_CLASS_HEADER #END_CLASS_HEADER # config contains contents of config file in a hash or None if it couldn't # be found def __init__(self, config): #BEGIN_CONSTRUCTOR self.callback_url = os.environ['SDK_CALLBACK_URL'] self.shared_folder = config['scratch'] self.ws_url = config['workspace-url'] self.ws_client = Workspace(self.ws_url) logging.basicConfig(format='%(created)s %(levelname)s: %(message)s', level=logging.INFO) #END_CONSTRUCTOR pass def run_AMPLICON(self, ctx, params): # ctx is the context object # return variables are: output #BEGIN run_AMPLICON print('Starting AMPLICON function and validating parameters.') if not params.get('workspace_name'): print('Parameters provided were', str(params)) raise TypeError('Must pass a non-empty `workspace_name` arg.') if not params.get('ref'): print('Parameters provided were', str(params)) raise TypeError('Must pass a non-empty `ref` arg.') ws_name = params['workspace_name'] # get the amplicon data obj = self.ws_client.get_objects2( {'objects': [{ 'ref': params['ref'] }]})['data'][0]['data'] # define file names parse_out_file = os.path.join('work/tmp', 'parse_out.tsv') input_file = parse_out_file output_file = os.path.join('work/tmp', 'output.tsv') # 1. convert data into tsv format parse_input_data(obj, parse_out_file) # 2. run subprocess FAPROTAX run_program(input_file, output_file) # 3. create html tables using output_file output = create_report(self.callback_url, self.shared_folder, ws_name, output_file) #END run_AMPLICON # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError('Method run_AMPLICON return value ' + 'output is not type dict as required.') # return the results return [output] def status(self, ctx): #BEGIN_STATUS returnVal = { 'state': "OK", 'message': "", 'version': self.VERSION, 'git_url': self.GIT_URL, 'git_commit_hash': self.GIT_COMMIT_HASH } #END_STATUS return [returnVal]
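# run_AMPLICON usage sketch (placeholders throughout; `impl` and `ctx` as in
# the SDK test harness). Both parameters are required and validated up front:
output = impl.run_AMPLICON(ctx, {
    'workspace_name': 'my_workspace',
    'ref': '123/4/5',
})[0]
print(output)  # {'report_name': ..., 'report_ref': ...}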
def load_fastas(config, scratch, upa):
    '''
    Resolve a workspace object reference (upa) to one or more FASTA files on
    disk. Returns a dict mapping a file-safe form of each assembly's upa to
    the result of AssemblyUtil.get_assembly_as_fasta (a dict with the file
    path and assembly name).
    '''
    dfu = DataFileUtil(config['callback_url'])
    au = AssemblyUtil(config['callback_url'])
    mgu = MetagenomeUtils(config['callback_url'])
    ws = Workspace(config['workspace-url'])

    obj_data = dfu.get_objects({"object_refs": [upa]})['data'][0]
    # normalize the input ref to absolute wsid/objid/version form
    upa = str(obj_data['info'][6]) + '/' + str(
        obj_data['info'][0]) + '/' + str(obj_data['info'][4])
    obj_type = obj_data['info'][2]

    id_to_assy_info = {}
    if 'KBaseSets.GenomeSet' in obj_type:
        upas = [gsi['ref'] for gsi in obj_data['data']['items']]
    elif 'KBaseSearch.GenomeSet' in obj_type:
        upas = [gse['ref'] for gse in obj_data['data']['elements'].values()]
    elif "KBaseGenomes.Genome" in obj_type:
        upas = [upa]
    elif "KBaseGenomes.ContigSet" in obj_type or "KBaseGenomeAnnotations.Assembly" in obj_type:
        # in this case we use the assembly file util to get the fasta file
        faf = au.get_assembly_as_fasta({
            "ref": upa,
            'filename': upa_to_path(scratch, upa)
        })
        return {file_safe_upa(upa): faf}
    elif "KBaseSets.AssemblySet" in obj_type:
        for item_upa in obj_data['data']['items']:
            faf = au.get_assembly_as_fasta({
                "ref": upa + ';' + item_upa['ref'],
                'filename': upa_to_path(scratch, item_upa['ref'])
            })
            id_to_assy_info[file_safe_upa(item_upa['ref'])] = faf
        return id_to_assy_info
    elif 'KBaseMetagenomes.BinnedContigs' in obj_type:
        return handle_binned_contigs(upa, mgu, scratch)

    for genome_upa in upas:
        # this could be sped up by batching the get_objects call
        # does assy file util not take bulk calls?
        # maybe doesn't matter since Shock doesn't handle bulk calls
        if upa != genome_upa:  # for single genomes, upa and genome_upa will be the same
            genome_upa = upa + ';' + genome_upa
        genome_data = ws.get_objects2({'objects': [{
            "ref": genome_upa
        }]})['data'][0]['data']
        target_upa = genome_data.get('contigset_ref') or genome_data.get(
            'assembly_ref')
        assembly_upa = genome_upa + ';' + target_upa
        faf = au.get_assembly_as_fasta({
            'ref': assembly_upa,
            'filename': upa_to_path(scratch, target_upa)
        })
        id_to_assy_info[file_safe_upa(target_upa)] = faf
    return id_to_assy_info
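
# load_fastas relies on upa_to_path, file_safe_upa, and handle_binned_contigs
# helpers that are not shown in this snippet. A plausible sketch of the two
# path helpers, assuming upas are simply flattened into filesystem-safe names
# (the real module's scheme may differ):

import os


def file_safe_upa(upa):
    # '123/4/5' or '123/4/5;6/7/8' -> '123_4_5' / '123_4_5_6_7_8'
    return upa.replace('/', '_').replace(';', '_')


def upa_to_path(scratch, upa):
    # place the FASTA for an object in the job's scratch directory
    return os.path.join(scratch, file_safe_upa(upa) + '.fa')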
class VariationAnnotation:
    '''
    Module Name:
    VariationAnnotation

    Module Description:
    A KBase module: VariationAnnotation
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = "https://github.com/man4ish/VariationAnnotation.git"
    GIT_COMMIT_HASH = "233ab11cd942b99c960f7b83aaee2b3800685bb4"

    #BEGIN_CLASS_HEADER
    def build_genome_index(self, genome_ref):
        # Downloads the gff and fasta, puts them in the right directory,
        # and returns the genome index name that snpEff.jar can use.
        # TODO: read the genome taxonomy from genome_ref, and/or get the
        # taxonomy/classification from the user, so there is no confusion.
        pass
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)
        self.VU = VariationUtil(self.callback_url)
        self.SU = SnpEffUtils()
        self.DU = DownloadUtils()
        self.HU = htmlreportutils()
        self.config = config
        #self.snpeff=<path_to_snpeff>
        #END_CONSTRUCTOR
        pass

    def annotate_variants(self, ctx, params):
        """
        This method extracts a VCF from a Variation object, runs the SnpEff
        workflow (http://snpeff.sourceforge.net/SnpEff_manual.html), and
        annotates and predicts the effects of genetic variants (such as
        amino acid changes).
        :param params: instance of type "input_params" (variation_ref:
           Reference to Variation object out_variation_name: Name by which
           the output object will be saved) -> structure: parameter
           "variation_ref" of String, parameter "out_variation_name" of
           String
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN annotate_variants
        # Validate the parameters
        self.SU.validate_params(params)

        # TODO: this VCF path is hard-coded for testing and needs to be
        # removed; the real path comes from VariationUtil.get_variation_as_vcf
        # further down.
        vcf_path = "/kb/module/work/variation.vcf.gz"
        print(vcf_path)

        # TODO: think through how to get the genome from the user, because
        # variation_ref may or may not have a genome_ref field filled in, and
        # spec.json may require some work. There is a chance the user provides
        # the wrong genome as input if we don't deal with this properly.
        workspace = params['workspace_name']
        self.ws_url = self.config['workspace-url']
        self.ws = Workspace(self.ws_url, token=ctx['token'])

        # TODO: the file name below is hard-coded and needs to change later.
        filename = "/kb/module/work/variation.vcf"
        output_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        os.mkdir(output_dir)
        shutil.copytree("/kb/module/deps/snp_eff", output_dir + "/snp_eff")

        variation_ref = params['variation_ref']
        variation_obj = self.ws.get_objects2(
            {'objects': [{'ref': variation_ref}]})['data'][0]
        data = self.ws.get_objects2(
            {'objects': [{'ref': variation_ref,
                          'included': ['/sample_set_ref']}]})['data'][0]['data']
        sample_set_ref = data['sample_set_ref']

        assembly_ref = variation_obj['data']['assembly_ref']
        assembly_path = self.DU.get_assembly(assembly_ref, output_dir)

        gff_ref = params['genome_ref']
        gff_path = self.DU.get_gff(gff_ref, output_dir)

        # TODO: temporary fix; find a logical way to remove exons based on
        # coordinates instead of dropping every line containing "exon".
        fix_cmd = 'grep -v "exon" ' + gff_path + ' > /kb/module/work/tmp/output.gff'
        print(fix_cmd)
        os.system(fix_cmd)

        vcf_path = self.VU.get_variation_as_vcf({
            'variation_ref': params['variation_ref'],
            'filename': filename
        })

        new_gff_path = "/kb/module/work/tmp/output.gff"
        genome_index_name = self.SU.build_genome(new_gff_path, assembly_path,
                                                 output_dir)
        annotated_vcf_path = self.SU.annotate_variants(genome_index_name,
                                                       vcf_path['path'],
                                                       params, output_dir)

        save_variation_params = {
            'workspace_name': params['workspace_name'],
            'genome_or_assembly_ref': params['genome_ref'],
            'sample_set_ref': sample_set_ref,
            'sample_attribute_name': 'sample_attr',
            'vcf_staging_file_path': annotated_vcf_path,
            'variation_object_name': params['output_object_name']
        }
        # ref of the newly saved, annotated Variation object
        saved_variation_ref = self.VU.save_variation_from_vcf(
            save_variation_params)['variation_ref']

        created_objects = [{
            "ref": saved_variation_ref,
            "description": "Variation Object"
        }]

        # TODO: add parameters for snpeff; parse them from params and build
        # the snpeff command instead of hard-coding the invocation.
        logging.info('snp_eff outputs: %s',
                     os.listdir(output_dir + "/snp_eff"))

        snp_eff_resultdir = os.path.join(output_dir, "snp_eff_results")
        os.mkdir(snp_eff_resultdir)
        shutil.copyfile(os.path.join(output_dir, "snp_eff/snpEff_genes.txt"),
                        os.path.join(snp_eff_resultdir, "snpEff_genes.txt"))

        logging.info("creating html report ...")
        output = self.HU.create_html_report(self.callback_url,
                                            snp_eff_resultdir, workspace)
        #END annotate_variants

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method annotate_variants return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {'state': "OK",
                     'message': "",
                     'version': self.VERSION,
                     'git_url': self.GIT_URL,
                     'git_commit_hash': self.GIT_COMMIT_HASH}
        #END_STATUS
        return [returnVal]
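
# A sketch of the call as annotate_variants expects it, assuming an
# instantiated module `va` and a valid auth token; every value below is
# illustrative rather than taken from the module:
#
#   params = {
#       'workspace_name': 'my_workspace',
#       'variation_ref': '123/4/5',   # Variation object with assembly_ref
#       'genome_ref': '123/6/7',      # genome whose GFF drives SnpEff
#       'output_object_name': 'annotated_variation',
#   }
#   output = va.annotate_variants({'token': token}, params)[0]
#   # output carries the report name/ref produced by create_html_report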
def process_kbase_objects(host_ref, virus_ref, shared_folder, callback,
                          workspace, token):
    """
    Convert KBase object(s) into usable files for VirMatcher
    :param host_ref: Putative host / microbial genomes with KBase '#/#/#'
        used to describe each object
    :param virus_ref: Viral genomes with KBase '#/#/#' used to describe
        each object
    :param shared_folder: KBase job node's "working" directory, where actual
        files exist
    :param callback: Callback service URL
    :param workspace: Workspace service URL (passed to the Workspace client)
    :param token: Job token
    :return:
    """
    dfu = DataFileUtil(callback, token=token)
    ws = Workspace(workspace, token=token)
    mgu = MetagenomeUtils(callback, token=token)
    au = AssemblyUtil(callback, token=token)

    # Need to determine KBase type in order to know how to properly proceed
    host_type = ws.get_object_info3({'objects': [{
        'ref': host_ref
    }]})['infos'][0][2].split('-')[0]
    virus_type = ws.get_object_info3({'objects': [{
        'ref': virus_ref
    }]})['infos'][0][2].split('-')[0]

    logging.info(f'Potential hosts identified as: {host_type}')
    logging.info(f'Viruses identified as: {virus_type}')

    # Create new directory to house virus and host files
    host_dir = Path(shared_folder) / 'host_files'
    if not host_dir.exists():
        os.mkdir(host_dir)

    host_count = 0

    if host_type == 'KBaseGenomeAnnotations.Assembly':
        # No info about individual genomes, so treat each as organism
        host_fps = au.get_assembly_as_fasta(
            {'ref': host_ref})['path']  # Consists of dict: path + assembly_name
        logging.info(
            f'Identified {host_type}. Each sequence will be treated as a separate organism.'
        )
        records = SeqIO.parse(host_fps, 'fasta')
        for record in records:
            host_count += 1
            tmp_fp = host_dir / f'{record.id}.fasta'  # TODO Illegal filenames?
            SeqIO.write([record], tmp_fp, 'fasta')

    elif host_type == 'KBaseGenomes.Genomes':  # TODO Genomes?!
        # This branch is unfinished: it looks up the genome's assembly ref
        # but does not yet download or split the sequences.
        genome_data = ws.get_objects2({'objects': [{
            'ref': host_ref
        }]})['data'][0]['data']
        genome_data.get('contigset_ref') or genome_data.get('assembly_ref')

    # elif host_type == 'KBaseSets.GenomeSet'

    elif host_type == 'KBaseSets.AssemblySet':
        obj_data = dfu.get_objects({'object_refs': [host_ref]})['data'][0]
        for subobj in obj_data['data']['items']:
            host_fp = au.get_assembly_as_fasta({'ref': subobj['ref']})['path']
            # Ensure extension always = .fasta; note os.path.splitext keeps
            # the leading dot in the extension it returns
            if os.path.splitext(host_fp)[-1] != '.fasta':
                target_fn = os.path.splitext(
                    os.path.basename(host_fp))[0].strip('_') + '.fasta'
            else:
                target_fn = os.path.basename(host_fp).strip('_')
            shutil.copyfile(host_fp, host_dir / target_fn)
            host_count += 1

    elif host_type == 'KBaseMetagenomes.BinnedContigs':  # This is what we want!
        host_kbase_dir = mgu.binned_contigs_to_file({
            'input_ref': host_ref,
            'save_to_shock': 0
        })['bin_file_directory']  # Dict of bin_file_dir and shock_id

        for (dirpath, dirnames, fns) in os.walk(
                host_kbase_dir):  # Dirnames = all folders under dirpath
            for fn in fns:
                # Build the source path from the original name, then
                # normalize the extension to .fasta for the copied file
                src = Path(dirpath) / fn
                if os.path.splitext(fn)[-1] != '.fasta':
                    fn = os.path.splitext(fn)[0] + '.fasta'
                shutil.copy(src, host_dir / fn)
                host_count += 1

    else:
        raise ValueError(f'{host_type} is not supported.')

    logging.info(f'{host_count} potential host genomes were identified.')

    virus_count = 0

    if virus_type == 'KBaseGenomeAnnotations.Assembly':
        virus_fps = au.get_assembly_as_fasta({'ref': virus_ref})['path']
        records = SeqIO.parse(virus_fps, 'fasta')
        virus_count = len(list(records))
    else:
        raise ValueError(f'{virus_type} is not supported.')

    logging.info(f'{virus_count} potential viral genomes were identified.')

    # TODO Do we even need any of this data? We don't care about what the
    # sequences are called
    # host_data = dfu.get_objects({'object_refs': [host_ref]})['data'][0]
    # virus_data = dfu.get_objects({'object_refs': [virus_ref]})['data'][0]

    return host_dir, virus_fps
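
# A minimal call sketch for process_kbase_objects, assuming it runs inside a
# KBase job environment; the refs and URLs below are illustrative only:
#
#   host_dir, virus_fasta = process_kbase_objects(
#       host_ref='123/4/5',
#       virus_ref='123/6/7',
#       shared_folder='/kb/module/work/tmp',
#       callback=os.environ['SDK_CALLBACK_URL'],
#       workspace='https://kbase.us/services/ws',
#       token=token,
#   )
#   # host_dir now holds one .fasta per putative host genome/bin, and
#   # virus_fasta is a single multi-record FASTA of viral sequences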