def get_data_obj_type_by_name(self, input_ref, remove_module=False):
    """Return a {object_name: type_name} mapping for the object at input_ref.

    The type's version suffix (e.g. '-1.0') is always dropped; when
    remove_module is True the module prefix (e.g. 'KBaseGenomes.') is
    dropped as well, leaving just the bare type name.
    """
    # Positions within the Workspace object_info tuple:
    # 1 = object name, 2 = type string (see Workspace API docs).
    NAME_IDX, TYPE_IDX = 1, 2
    obj_info = Workspace(self.ws_url).get_object_info3(
        {'objects': [{'ref': input_ref}]})['infos'][0]
    type_name = obj_info[TYPE_IDX].split('-')[0]
    if remove_module:
        type_name = type_name.split('.')[1]
    return {obj_info[NAME_IDX]: type_name}
def get_data_obj_name(self, input_ref):
    """Return the name of the workspace object referenced by input_ref."""
    # The object name sits at index 1 of the object_info tuple.
    NAME_IDX = 1
    ws_client = Workspace(self.ws_url)
    obj_info = ws_client.get_object_info3(
        {'objects': [{'ref': input_ref}]})['infos'][0]
    return obj_info[NAME_IDX]
def get_data_obj_type(self, input_ref, remove_module=False):
    """Return the type of the object at input_ref, e.g. 'KBaseGenomes.Genome'.

    The version suffix ('-1.0') is always stripped; if remove_module is
    True the module prefix ('KBaseGenomes.') is stripped too.
    """
    TYPE_IDX = 2  # position of the type string in the object_info tuple
    obj_info = Workspace(self.ws_url).get_object_info3(
        {'objects': [{'ref': input_ref}]})['infos'][0]
    full_type = obj_info[TYPE_IDX].split('-')[0]
    return full_type.split('.')[1] if remove_module else full_type
def get_static_info(ws_url: str, token: str, ws_id: int) -> Dict:
    """
    Look up the static narrative info for the given Workspace id.

    That info is stashed in the Workspace metadata, so the metadata is
    fetched, munged into a structure, and returned. If there's no static
    narrative, an empty dict is returned.

    :param ws_url: the URL for the workspace endpoint
    :param token: the user auth token
    :param ws_id: the workspace id of the narrative to fetch info for
    :returns: a dict with keys ws_id, narrative_id, version, url,
        narr_saved (ms since epoch), static_saved (ms since epoch) when a
        static narrative is present; otherwise an empty dict
    :raises ValueError: if ws_id is absent or not numeric
    :raises WorkspaceError: on any ServerError from the Workspace
    """
    if not ws_id or not str(ws_id).isdigit():
        raise ValueError(f"The parameter ws_id must be an integer, not {ws_id}")
    ws_client = Workspace(url=ws_url, token=token)
    try:
        ws_info = ws_client.get_workspace_info({"id": ws_id})
    except ServerError as err:
        raise WorkspaceError(err, ws_id)
    meta = ws_info[8]  # workspace metadata mapping
    # No static narrative recorded -> nothing to report.
    if "static_narrative_ver" not in meta:
        return {}
    info = {
        "ws_id": ws_id,
        "version": int(meta["static_narrative_ver"]),
        "narrative_id": int(meta["narrative"]),
        "url": meta["static_narrative"],
        "static_saved": int(meta["static_narrative_saved"]),
    }
    narrative_ref = f"{ws_id}/{info['narrative_id']}/{info['version']}"
    try:
        obj_info = ws_client.get_object_info3({"objects": [{"ref": narrative_ref}]})
    except ServerError as err:
        raise WorkspaceError(err, ws_id)
    # object_info[3] is the save date; convert to ms since the epoch.
    saved_ts = date_parser.isoparse(obj_info["infos"][0][3]).timestamp()
    info["narr_saved"] = int(saved_ts * 1000)
    return info
def get_object_type(ref, ws_url):
    """
    Fetches and returns the typed object name of ref from the given
    workspace url. If that object doesn't exist, or there's another
    Workspace error, this raises a RuntimeError exception.
    """
    ws_client = Workspace(ws_url)
    result = ws_client.get_object_info3({"objects": [{"ref": ref}]})
    obj_info = result.get("infos", [[]])[0]
    if not obj_info:
        raise RuntimeError("An error occurred while fetching type info from the Workspace. "
                           "No information returned for reference {}".format(ref))
    # Index 2 of the object_info tuple is the type string.
    return obj_info[2]
def get_object_names(ref_list, ws_url):
    """
    From a list of workspace references, returns a mapping from
    ref -> name of the object.

    :param ref_list: list of workspace reference strings
    :param ws_url: URL of the Workspace service
    :returns: dict mapping each input ref to its object name
    """
    ws = Workspace(ws_url)
    info = ws.get_object_info3({"objects": [{"ref": ref} for ref in ref_list]})
    # might be in a data palette, so we can't just use the ref.
    # we already have the refs as passed previously, so use those for
    # mapping, as they're in the same order as what's returned.
    return {ref: obj_info[1] for ref, obj_info in zip(ref_list, info["infos"])}
def extract_dna_sequences(self, token, params):
    """Takes an assembly/contig set ref and one or more locations and
    returns the DNA sequence from the assembly at that location while
    caching the assembly for efficiency"""
    ref = params.get('ref')
    if not ref:
        raise ValueError("'ref', a reference to an assembly must be provided")
    locations = params.get('locations', [])
    ws = Workspace(self.ws_url, token=token)
    # This is also a cheap way to ensure that the object exists and that
    # the user has access.
    obj_type = ws.get_object_info3({'objects': [{'ref': ref}]})['infos'][0][2]
    if obj_type.split('-')[0] not in self.valid_types:
        raise ValueError(f'{obj_type} is not a valid input type for this function')
    assembly_dir = os.path.join(self.cache_dir, ref.replace('/', ':'))
    if not os.path.exists(assembly_dir):
        self._cache_assembly(ws, token, ref, assembly_dir)
    return [_extract_sequence(assembly_dir, loc) for loc in locations]
def check_assembly_cache(self, ref, token):
    # Ensure a gzipped TSV index of the assembly's contigs exists on disk,
    # building it from the Workspace object if needed; returns the object's
    # md5 checksum, which doubles as the cache key.
    ws = Workspace(self.ws_url, token=token)
    info = ws.get_object_info3({"objects": [{"ref": ref}]})['infos'][0]
    # info[8] is the md5 checksum of the object (object_info tuple).
    inner_chsum = info[8]
    index_file = os.path.join(self.assembly_index_dir,
                              inner_chsum + self.ASSEMBLY_SUFFIX + ".tsv.gz")
    if not os.path.isfile(index_file):
        if self.debug:
            print(" Loading WS object...")
            t1 = time.time()
        # info[2] is the object's type string; the two supported assembly
        # types store their contigs in different shapes.
        if 'KBaseGenomeAnnotations.Assembly' in info[2]:
            # Assembly keeps contigs as a dict of id -> contig record.
            included = ["/contigs"]
            assembly_data = ws.get_objects2(
                {'objects': [{'ref': ref, 'included': included}]})['data'][0]['data']
            contigs = list(assembly_data['contigs'].values())
            self.save_assembly_tsv(contigs, inner_chsum)
        elif 'KBaseGenomes.ContigSet' in info[2]:
            # ContigSet keeps a list of contigs; pull only the fields the
            # TSV needs and normalize each into a flat record.
            included = ["/contigs/[*]/id", "/contigs/[*]/length",
                        "/contigs/[*]/md5", "/contigs/[*]/description"]
            cs_data = ws.get_objects2(
                {'objects': [{'ref': ref, 'included': included}]})['data'][0]['data']
            contigs = []
            for c in cs_data['contigs']:
                this_contig_data = {'contig_id': ''}
                if 'id' in c:
                    this_contig_data['contig_id'] = c['id']
                if 'md5' in c:
                    this_contig_data['md5'] = c['md5']
                if 'length' in c:
                    this_contig_data['length'] = c['length']
                if 'description' in c:
                    this_contig_data['description'] = c['description']
                contigs.append(this_contig_data)
            self.save_assembly_tsv(contigs, inner_chsum)
        else:
            raise ValueError('The "ref" is not an Assembly or ContigSet data object. '
                             'It was a ' + info[2])
        if self.debug:
            print(f" (time={time.time() - t1})")
    return inner_chsum
def check_object_cache(self, ref, search_object, info_included, index_dir,
                      object_suffix, debug):
    """Ensure a gzipped TSV index for the given object exists on disk,
    building it from the Workspace if missing.

    :param ref: workspace reference of the object to index
    :param search_object: top-level key of the object to extract (the list
        of records that gets written to the TSV)
    :param info_included: fields of each record to include in the index
    :param index_dir: directory where index files are stored
    :param object_suffix: suffix distinguishing this index type
    :param debug: when truthy, print timing information
    :returns: the object's md5 checksum (used as the cache key)
    """
    ws = Workspace(self.ws_url, token=self.token)
    info = ws.get_object_info3({"objects": [{"ref": ref}]})['infos'][0]
    inner_chsum = info[8]  # md5 checksum from the object_info tuple
    index_file = os.path.join(index_dir,
                              inner_chsum + object_suffix + ".tsv.gz")
    if not os.path.isfile(index_file):
        if debug:
            print(" Loading WS object...")
            t1 = time.time()
        included = self.build_info_included(search_object, info_included)
        # Renamed from `object`, which shadowed the builtin of that name.
        obj_data = ws.get_objects2({'objects': [{'ref': ref,
                                                 'included': included}]})['data'][0]['data']
        self.save_object_tsv(obj_data[search_object], inner_chsum, info_included,
                             index_dir, object_suffix)
        if debug:
            print(" (time=" + str(time.time() - t1) + ")")
    return inner_chsum
def get_upa_name(ws_url, cb_url, upa, is_test):
    '''Return the object name for the given UPA, trying the Workspace
    first and falling back to DataFileUtil. In test mode a fixed name is
    returned without contacting any service.
    '''
    if is_test:
        return "test_object"
    # First attempt: ask the Workspace directly.
    ws = Workspace(ws_url)
    ws_infos = ws.get_object_info3({'objects': [{'ref': upa}]})['infos']
    if ws_infos:
        return ws_infos[0][1]
    # Fallback: fetch the full object via DataFileUtil and read its info.
    dfu = DataFileUtil(cb_url)
    dfu_objs = dfu.get_objects({'object_refs': [upa]})['data']
    if dfu_objs:
        return dfu_objs[0]['info'][1]
    raise ValueError("Could not find name of workspace object with id %s" % upa)
def _make_upa_dict(self, value, param_spec: dict):
    """Collect any UPAs present in a parameter value and map each to its
    Workspace object_info tuple.

    Only 'text' parameters with at least one valid_ws_type are scanned;
    everything else yields an empty mapping.
    """
    found_upas = []
    if param_spec["field_type"] == "text":
        valid_ws_types = param_spec.get("text_options", {}).get("valid_ws_types", [])
        if valid_ws_types and value:
            # A value may be a single UPA or a list of them.
            candidates = value if isinstance(value, list) else [value]
            found_upas = [v for v in candidates if self._is_upa(v)]
    if not found_upas:
        return {}
    ws = Workspace(url=self.ws_url, token=self.token)
    infos = ws.get_object_info3(
        {"objects": [{"ref": u} for u in found_upas]})["infos"]
    return {u: infos[i] for i, u in enumerate(found_upas)}
def fetch_pangenome_summary(
        pangenome_ref: str,
        workspace_url: str,
        token: str) -> dict:
    """
    Construct a summary data object for a single pangenome, used in the
    "simple_summary" method.

    Args:
        pangenome_ref: Workspace reference to the pangenome object
        workspace_url: URL of the Workspace being used in the current env
        token: authorization token for fetching the data
    Returns:
        A python object adhering to the SimpleSummaryResult type in
        PanGenomeAPI.spec
    """
    ws_client = Workspace(workspace_url, token=token)
    # Download the full pangenome workspace dataset
    pangenome = ws_client.get_objects2(
        {'objects': [{'ref': pangenome_ref}]})['data'][0]['data']
    # Fetch the object infos (with metadata) for every member genome
    genome_infos = ws_client.get_object_info3({
        "objects": [{"ref": r} for r in pangenome["genome_refs"]],
        "includeMetadata": 1,
    })["infos"]
    name_mapping = _genome_name_mapping(genome_infos)
    return {
        "pangenome_id": pangenome["id"],
        "genomes_count": len(pangenome["genome_refs"]),
        "genes": _count_genes(pangenome),
        "families": _count_families(pangenome),
        "genomes": _genome_counts(pangenome, genome_infos, name_mapping),
        "shared_family_map": _shared_family_map(pangenome, name_mapping),
        "genome_ref_name_map": name_mapping,
    }
def fetch_fasta_from_genome(genome_ref, ws_url, callback_url):
    """
    Returns an assembly or contigset as FASTA.

    :param genome_ref: reference to a KBaseGenomes.Genome object
    :param ws_url: Workspace service URL
    :param callback_url: callback service URL (passed through to the
        assembly fetcher)
    :raises ValueError: if genome_ref is not a Genome, or if it does not
        reference exactly one Assembly/ContigSet
    """
    if not check_ref_type(genome_ref, ['KBaseGenomes.Genome'], ws_url):
        # Bug fix: the message previously printed a literal "{}" because
        # .format() was never called.
        raise ValueError(
            "The given genome_ref {} is not a KBaseGenomes.Genome type!".format(genome_ref))
    # test if genome references an assembly type
    # do get_objects2 without data. get list of refs
    ws = Workspace(ws_url)
    genome_obj_info = ws.get_objects2({
        'objects': [{'ref': genome_ref}],
        'no_data': 1
    })
    # get the list of genome refs from the returned info.
    # if there are no refs (or something funky with the return), this will
    # be an empty list. This WILL fail if data is an empty list. But it
    # shouldn't be, and we know because we have a real genome reference,
    # or get_objects2 would fail.
    genome_obj_refs = genome_obj_info.get('data', [{}])[0].get('refs', [])

    # see which of those are of an appropriate type (ContigSet or
    # Assembly), if any.
    assembly_ref = list()
    ref_params = [{'ref': genome_ref + ";" + x} for x in genome_obj_refs]
    ref_info = ws.get_object_info3({'objects': ref_params})
    for idx, info in enumerate(ref_info.get('infos')):
        if "KBaseGenomeAnnotations.Assembly" in info[2] \
                or "KBaseGenomes.ContigSet" in info[2]:
            assembly_ref.append(";".join(ref_info.get('paths')[idx]))

    if len(assembly_ref) == 1:
        return fetch_fasta_from_assembly(assembly_ref[0], ws_url, callback_url)
    else:
        # Bug fix: format the ref into the message (was a literal "{}").
        raise ValueError(
            "Multiple assemblies found associated with the given genome ref {}! "
            "Unable to continue.".format(genome_ref))
class VariationUtil:
    '''
    Module Name:
    VariationUtil

    Module Description:
    A KBase module: VariationUtil
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.4"
    GIT_URL = ""
    GIT_COMMIT_HASH = "2a4c2dbc058b702811c967997e7100c834e755d4"

    #BEGIN_CLASS_HEADER
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        # TODO: Make sure we need to define config just once
        # TODO: Change the code tp match this style
        # NOTE(review): self.scratch is assigned twice and
        # self.callback_url duplicates config['SDK_CALLBACK_URL'] — looks
        # like leftover duplication; confirm before cleaning up.
        self.config = config
        self.config['SDK_CALLBACK_URL'] = os.environ['SDK_CALLBACK_URL']
        self.config['KB_AUTH_TOKEN'] = os.environ['KB_AUTH_TOKEN']
        self.scratch = config['scratch']
        self.config['ws_url'] = config['workspace-url']
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.shared_folder = config['scratch']
        self.hr = htmlreportutils()
        self.ws_url = config['workspace-url']
        self.wsc = Workspace(self.ws_url)
        self.dfu = DataFileUtil(self.callback_url)
        self.shock_url = config['shock-url']
        self.sw_url = config['srv-wiz-url']
        pass
        #END_CONSTRUCTOR
        pass

    def save_variation_from_vcf(self, ctx, params):
        """
        Save a variation (and trait?) object to Kbase given a reference
        genome, object output name, Variant Call Format (VCF) file, and
        sample attribute file.

        :param params: instance of type "save_variation_input"
            (## funcdef save_variation_from_vcf ##
            required input params:
            genome_or_assembly_ref: KBaseGenomes.Genome or
            KBaseGenomeAnnotations.Assembly object reference
            *** variation input data ***
            vcf_staging_file_path: path to location data associated with samples
            variation_object_name: output name for KBase variation object
            *** sample input data ***
            sample_attribute_ref: x/y/z reference to kbase sample attribute
            optional params: NA
            output report: report_name, report_ref
            HTML visualization: Manhattan plot
            *** Visualization ***
            plot_maf: generate histogram of minor allele frequencies
            plot_hwe: generate histogram of Hardy-Weinberg Equilibrium
            p-values) -> structure: parameter "workspace_name" of String,
            parameter "genome_or_assembly_ref" of type "obj_ref" (An X/Y/Z
            style reference), parameter "vcf_staging_file_path" of type
            "filepath" (KBase file path to staging files), parameter
            "variation_object_name" of String, parameter
            "sample_attribute_ref" of type "obj_ref" (An X/Y/Z style
            reference)
        :returns: instance of type "save_variation_output" -> structure:
            parameter "variation_ref" of String, parameter "report_name" of
            String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: report
        #BEGIN save_variation_from_vcf
        # Get workspace id
        ws_id = self.dfu.ws_name_to_id(params['workspace_name'])
        genome_ref = None
        assembly_ref = None

        # 1) Find whether the input is a genome or assembly
        #    and get genome_ref and assembly_ref
        genome_or_assembly_ref = params['genome_or_assembly_ref']
        obj_type = self.wsc.get_object_info3(
            {'objects': [{'ref': genome_or_assembly_ref}]})['infos'][0][2]
        if ('KBaseGenomes.Genome' in obj_type):
            genome_ref = genome_or_assembly_ref
            # A Genome carries its assembly as a sub-object reference.
            subset = self.wsc.get_object_subset([{
                'included': ['/assembly_ref'],
                'ref': genome_ref
            }])
            assembly_ref = subset[0]['data']['assembly_ref']
        elif ('KBaseGenomeAnnotations.Assembly' in obj_type):
            assembly_ref = genome_or_assembly_ref
        else:
            raise ValueError(obj_type + ' is not the right input for this method. ' +
                             'Valid input include KBaseGenomes.Genome or ' +
                             'KBaseGenomeAnnotations.Assembly ')

        # 2) Validate VCF, compress, and build VCF index
        logging.info("Validating VCF, Compressing VCF and Indexing VCF")
        VCFUtilsConfig = {"scratch": self.scratch}
        VCFUtilsParams = {
            'vcf_staging_file_path': params['vcf_staging_file_path']
        }
        VCU = VCFUtils(VCFUtilsConfig)
        vcf_compressed, vcf_index, vcf_strain_ids = VCU.validate_compress_and_index_vcf(
            VCFUtilsParams)
        if vcf_index is not None:
            logging.info("vcf compressed :" + str(vcf_compressed))
            logging.info("vcf index :" + str(vcf_index))
            logging.info("vcf strain ids :" + str(vcf_strain_ids))
        else:
            raise ValueError(
                "No result obtained after compression and indexing step")

        # Get strain info
        # TODO: Remove hard coded stuff
        # NOTE(review): params['sample_set_ref'] and
        # params['sample_attribute_name'] are required here but are not
        # listed in the docstring's parameter spec — confirm the spec.
        StrainInfoConfig = self.config
        StrainInfoParams = {
            "ws_id": ws_id,
            "vcf_strain_ids": vcf_strain_ids,
            "sample_set_ref": params["sample_set_ref"],
            "sample_attribute_name": params["sample_attribute_name"]
        }
        si = StrainInfo(StrainInfoConfig)
        sample_attribute_ref, strains = si.sample_strain_info(StrainInfoParams)
        print(sample_attribute_ref)
        print(strains)

        # 3) Create json for variation object. In a following step
        #    genomic_indexes will be added to this json before it is saved
        #    as Variation object
        VCFToVariationConfig = {"ws_url": self.ws_url, "scratch": self.scratch}
        VCFToVariationParams = {
            "vcf_compressed": vcf_compressed,
            "vcf_index": vcf_index,
            "assembly_ref": assembly_ref
        }
        if genome_ref is not None:
            VCFToVariationParams['genome_ref'] = genome_ref
        vtv = VCFToVariation(VCFToVariationConfig)
        variation_object_data = vtv.generate_variation_object_data(
            VCFToVariationParams)

        # Append sample information
        if sample_attribute_ref:
            variation_object_data[
                'sample_attribute_ref'] = sample_attribute_ref
        else:
            raise ValueError(f'sample attribute ref not found')
        if strains:
            variation_object_data['strains'] = strains
        else:
            raise ValueError(f'strains not found')
        if 'sample_set_ref' in params:
            variation_object_data['sample_set_ref'] = params['sample_set_ref']
        else:
            raise ValueError(f'sample_set_ref not found in params')

        # 4) Build the JBrowse visualization artifacts for the VCF.
        JbrowseConfig = {
            "ws_url": self.ws_url,
            "scratch": self.scratch,
            "sw_url": self.sw_url,
            "shock_url": self.shock_url
        }
        JbrowseParams = {
            "vcf_path": vcf_compressed,
            "assembly_ref": assembly_ref,
            "binsize": 10000,
            "vcf_shock_id": variation_object_data['vcf_handle']['id'],
            "vcf_index_shock_id":
            variation_object_data['vcf_index_handle']['id']
        }
        if genome_ref is not None:
            JbrowseParams["genome_ref"] = genome_ref
        jb = JbrowseUtil(JbrowseConfig)
        jbrowse_report = jb.prepare_jbrowse_report(JbrowseParams)

        # 5) Now we have the genomic indices and we have all the information
        #    needed to save the variation object
        # TODO: Take out the genomic_indexes field from the object spec
        # TODO: Take out the vcf_handle stuff not needed
        variation_object_data['genomic_indexes'] = jbrowse_report[
            'genomic_indexes']
        var_obj = self.dfu.save_objects({
            'id': self.dfu.ws_name_to_id(params['workspace_name']),
            'objects': [{
                'type': 'KBaseGwasData.Variations',
                'data': variation_object_data,
                'name': params['variation_object_name']
            }]
        })[0]
        # object_info tuple: [6]=wsid, [0]=objid, [4]=version -> "ws/obj/ver"
        var_obj_ref = str(var_obj[6]) + "/" + str(var_obj[0]) + "/" + str(
            var_obj[4])
        print(var_obj_ref)

        # 5) Build Variation report
        # This is a simple report
        #
        workspace = params['workspace_name']
        created_objects = []
        created_objects.append({
            "ref": var_obj_ref,
            "description": "Variation Object"
        })
        ReportConfig = {
            "ws_url": self.ws_url,
            "scratch": self.scratch,
        }
        ReportParams = {"variation_ref": var_obj_ref}
        vr = VariationReport(ReportConfig)
        htmlreport_dir = vr.create_variation_report(ReportParams)
        report = self.hr.create_html_report(htmlreport_dir, workspace,
                                            created_objects)
        report['variation_ref'] = var_obj_ref
        print(report)
        #END save_variation_from_vcf

        # At some point might do deeper type checking...
        if not isinstance(report, dict):
            raise ValueError('Method save_variation_from_vcf return value ' +
                             'report is not type dict as required.')
        # return the results
        return [report]

    def export_variation_as_vcf(self, ctx, params):
        """
        Export KBase variation object as Variant Call Format (VCF) file
        :param params: instance of type "export_variation_input" (##
            funcdef export_variation_as_vcf ##
            required input params: Variation object reference
            optional params: NA
            output report: Shock id pointing to exported vcf file) ->
            structure: parameter "input_var_ref" of type "obj_ref" (An X/Y/Z
            style reference)
        :returns: instance of type "export_variation_output" -> structure:
            parameter "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_variation_as_vcf
        vtv = VariationToVCF(self.callback_url, self.shared_folder)
        output = vtv.export_as_vcf(params)
        #END export_variation_as_vcf

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method export_variation_as_vcf return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def get_variation_as_vcf(self, ctx, params):
        """
        Given a reference to a variation object, and output name: return a
        Variant Call Format (VCF) file path and name.
        :param params: instance of type "get_variation_input" (## funcdef
            get_variation_as_vcf ##
            required input params: Variation object reference, output file
            name
            optional params: NA
            output report: path to returned vcf, name of variation object) ->
            structure: parameter "variation_ref" of type "obj_ref" (An X/Y/Z
            style reference), parameter "filename" of String
        :returns: instance of type "get_variation_output" -> structure:
            parameter "path" of type "filepath" (KBase file path to staging
            files), parameter "variation_name" of String
        """
        # ctx is the context object
        # return variables are: file
        #BEGIN get_variation_as_vcf
        vtv = VariationToVCF(self.callback_url, self.shared_folder)
        file = vtv.variation_to_vcf(params)
        #END get_variation_as_vcf

        # At some point might do deeper type checking...
        if not isinstance(file, dict):
            raise ValueError('Method get_variation_as_vcf return value ' +
                             'file is not type dict as required.')
        # return the results
        return [file]

    def status(self, ctx):
        # Standard KBase SDK status method: report service health/version.
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
def run_FamaReadProfiling(self, ctx, params):
    """
    Run metagenome functional profiling module of Fama.
    :param params: instance of type "FamaReadProfilingParams" (Parameters
        for metagenome functional profiling. workspace_name - the name of
        the workspace for input/output read_library_refs - references to
        the name of the PE read library or SE read library ref_dataset -
        the name of Fama reference dataset is_paired_end - 1 for
        paired-end library, 0 for single-end library
        output_functional_profile_name - the name of the output functional
        profile output_read_library_ref - the name of the output filtered
        PE or SE read library) -> structure: parameter "workspace_name" of
        String, parameter "read_library_refs" of list of String, parameter
        "ref_dataset" of String, parameter "is_paired_end" of type "bool"
        (A boolean - 0 for false, 1 for true. @range (0, 1)), parameter
        "output_functional_profile_name" of String, parameter
        "output_read_library_name" of String
    :returns: instance of type "ReportResults" (Output report parameters
        report_name - the name of the report object report_ref - the
        reference to the report object) -> structure: parameter
        "report_name" of String, parameter "report_ref" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_FamaReadProfiling
    # Import Read Library and save as two paired-end FASTQ files
    input_refs = params['read_library_refs']
    fama_reference = params['ref_dataset']
    ws_client = Workspace(self.ws_url)
    ru = ReadsUtils(self.callback_url)
    # NOTE(review): this bulk get_object_info3 result is immediately
    # overwritten inside the loop below — it looks like dead code.
    ret = ws_client.get_object_info3(
        {'objects': [{
            'ref': ref
        } for ref in input_refs]})
    name2ref = {}
    input_reads = {}
    for input_ref in input_refs:
        # Per-library lookup: object name keys the downloaded files.
        ret = ws_client.get_object_info3({'objects': [{'ref': input_ref}]})
        obj_name = ret['infos'][0][1]
        name2ref[obj_name] = input_ref
        reads_params = {
            'read_libraries': [input_ref],
            'interleaved': 'false',
            'gzipped': None
        }
        reads = ru.download_reads(reads_params)['files']
        print('Input reads files downloaded:')
        print(reads)
        fwd_reads_file = reads[input_ref]['files']['fwd']
        rev_reads_file = reads[input_ref]['files']['rev']
        print('forward: ' + str(fwd_reads_file))
        print('reverse: ' + str(rev_reads_file))
        input_reads[obj_name] = {}
        input_reads[obj_name]['fwd'] = fwd_reads_file
        input_reads[obj_name]['rev'] = rev_reads_file

    fama_params = {
        'input_reads': input_reads,
        'work_dir': self.shared_folder,
        'reference': fama_reference,
        'is_paired_end': params['is_paired_end'],
        'name2ref': name2ref,
        'ws_name': params['workspace_name'],
        'ws_client': ws_client,
        'output_read_library_name': params['output_read_library_name'],
        'output_functional_profile_name':
        params['output_functional_profile_name'],
        'input_read_refs': params['read_library_refs']
    }

    # Run Fama
    fama_output = functional_profiling_pipeline(fama_params)

    # Write filtered reads to workspace
    # NOTE(review): `reads` and `input_ref` here are leftovers from the
    # LAST iteration of the loop above — sequencing_tech is taken from the
    # last library only. Confirm this is intended for multi-library input.
    reads_params = {
        'fwd_file': fama_output['fwd_reads'],
        'sequencing_tech': reads[input_ref]['sequencing_tech'],
        'single_genome': '0',
        'wsname': params['workspace_name'],
        'name': params['output_read_library_name']
    }
    if 'rev_reads' in fama_output:
        reads_params['rev_file'] = fama_output['rev_reads']
        reads_params['interleaved'] = '0'
    ru_ret = ru.upload_reads(reads_params)
    print('reads_params', reads_params)
    print('ru_ret', ru_ret)
    output_reads_ref = ru_ret['obj_ref']

    # Write HTML output to workspace
    message = 'Fama functional profiling finished successfully'
    dfu = DataFileUtil(self.callback_url)
    try:
        dfu_output = dfu.file_to_shock(
            {'file_path': fama_output['html_report']})
    except ServerError as dfue:
        # not really any way to test this block
        self.log('Logging exception loading results to shock')
        self.log(str(dfue))
        raise
    html_links = [{
        'shock_id': dfu_output['shock_id'],
        'description': 'HTML report for Fama App',
        'name': 'fama_report.html',
        'label': 'Fama_report'
    }]
    for krona_file in fama_output['krona_charts']:
        try:
            dfu_output = dfu.file_to_shock({'file_path': krona_file})
            html_links.append({
                'shock_id': dfu_output['shock_id'],
                'description': 'Krona chart for function taxonomy profile',
                'name': fama_output['krona_charts'][krona_file][0],
                'label': fama_output['krona_charts'][krona_file][1]
            })
        except ServerError as dfue:
            # not really any way to test this block
            self.log('Logging exception loading results to shock')
            self.log(str(dfue))
            raise
        self.log('Krona chart saved: ' + str(dfu_output))

    # Save report
    report_params = {
        'message': message,
        'objects_created': [{
            'ref': output_reads_ref,
            'description': 'Filtered Read Library'
        }, {
            'ref': fama_output['trait_matrix_ref'],
            'description': 'Raw counts matrix'
        }, {
            'ref': fama_output['functional_profile_ref'],
            'description': 'Functional profile'
        }],
        'direct_html_link_index': 0,
        'html_links': html_links,
        'file_links': fama_output['report_files'],
        'report_object_name': 'fama_profiling_report_' + str(uuid.uuid4()),
        'workspace_name': params['workspace_name'],
        'html_window_height': 460
    }
    try:
        report = KBaseReport(self.callback_url)
        report_info = report.create_extended_report(report_params)
    except ServerError as kre:
        # not really any way to test this block
        self.log('Logging exception saving report')
        self.log(str(kre))
        raise
    report_info['report_params'] = report_params
    self.log(str(report_info))
    output = {
        'report_name': report_info['name'],
        'report_ref': report_info['ref']
    }
    #END run_FamaReadProfiling

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method run_FamaReadProfiling return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
class FeatureSetBuilder:
    """Builds KBaseCollections.FeatureSet objects from differential-expression
    results and filters ExpressionMatrix objects by feature-id membership.

    Collaborators (wired in __init__): Workspace (self.ws), DataFileUtil
    (self.dfu), GenomeSearchUtil (self.gsu), KBaseReport (via callback URL).
    """

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path

        No-op for a falsy path; tolerates the directory already existing.
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            # EEXIST on an existing directory is fine; anything else is real.
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _validate_upload_featureset_from_diff_expr_params(self, params):
        """
        _validate_upload_featureset_from_diff_expr_params:
                validates params passed to upload_featureset_from_diff_expr method

        :raises ValueError: if a required key is missing, or if
            'fold_scale_type' is present but not 'logarithm'.
        """
        log('start validating upload_featureset_from_diff_expr params')

        # check for required parameters
        for p in ['diff_expression_ref', 'workspace_name',
                  'p_cutoff', 'q_cutoff', 'fold_change_cutoff']:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

        p = params.get('fold_scale_type')
        if p and p != 'logarithm':
            raise ValueError('"fold_scale_type" parameter must be set to "logarithm", if used')

    @staticmethod
    def validate_params(params, expected, opt_param=set()):
        """Validates that required parameters are present. Warns if unexpected
        parameters appear

        :param params: dict of supplied parameters
        :param expected: iterable of required keys
        :param opt_param: iterable of allowed-but-optional keys
            (mutable default is safe here: it is only read, never mutated)
        :raises ValueError: if any required key is absent
        """
        expected = set(expected)
        opt_param = set(opt_param)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError("Required keys {} not in supplied parameters"
                             .format(", ".join(expected - pkeys)))
        defined_param = expected | opt_param
        for param in params:
            if param not in defined_param:
                logging.warning("Unexpected parameter {} supplied".format(param))

    def _generate_report(self, up_feature_set_ref_list,
                         down_feature_set_ref_list,
                         filtered_expression_matrix_ref_list, workspace_name):
        """
        _generate_report: generate summary report

        Collects every created object ref (up/down FeatureSets and filtered
        ExpressionMatrices) into a KBaseReport extended report.

        :returns: dict with 'report_name' and 'report_ref'
        """
        log('start creating report')

        output_html_files = self._generate_html_report(up_feature_set_ref_list,
                                                       down_feature_set_ref_list)

        objects_created = list()
        for up_feature_set_ref in up_feature_set_ref_list:
            objects_created += [{'ref': up_feature_set_ref,
                                 'description': 'Upper FeatureSet Object'}]
        for down_feature_set_ref in down_feature_set_ref_list:
            objects_created += [{'ref': down_feature_set_ref,
                                 'description': 'Lower FeatureSet Object'}]
        for filtered_expression_matrix_ref in filtered_expression_matrix_ref_list:
            objects_created += [{'ref': filtered_expression_matrix_ref,
                                 'description': 'Filtered ExpressionMatrix Object'}]

        report_params = {'message': '',
                         'workspace_name': workspace_name,
                         'objects_created': objects_created,
                         'html_links': output_html_files,
                         'direct_html_link_index': 0,
                         'html_window_height': 333,
                         'report_object_name': 'kb_FeatureSetUtils_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output

    def _generate_html_report(self, up_feature_set_ref_list, down_feature_set_ref_list):
        """
        _generate_html_report: generate html summary report

        Renders one table row (name, feature count) per FeatureSet into the
        bundled report_template.html placeholders.

        :returns: single-item list describing the written report.html file
        """
        log('start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'report.html')

        uppper_feature_content = ''
        for up_feature_set_ref in up_feature_set_ref_list:
            feature_set_obj = self.ws.get_objects2(
                {'objects': [{'ref': up_feature_set_ref}]})['data'][0]
            feature_set_data = feature_set_obj['data']
            feature_set_info = feature_set_obj['info']

            feature_set_name = feature_set_info[1]

            elements = feature_set_data.get('elements')
            feature_ids = list(elements.keys())

            uppper_feature_content += '<tr><td>{}</td><td>{}</td></tr>'.format(
                feature_set_name, len(feature_ids))

        lower_feature_content = ''
        for down_feature_set_ref in down_feature_set_ref_list:
            feature_set_obj = self.ws.get_objects2(
                {'objects': [{'ref': down_feature_set_ref}]})['data'][0]
            feature_set_data = feature_set_obj['data']
            feature_set_info = feature_set_obj['info']

            feature_set_name = feature_set_info[1]

            elements = feature_set_data.get('elements')
            feature_ids = list(elements.keys())

            lower_feature_content += '<tr><td>{}</td><td>{}</td></tr>'.format(
                feature_set_name, len(feature_ids))

        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(os.path.dirname(__file__), 'report_template.html'),
                      'r') as report_template_file:
                report_template = report_template_file.read()
                # placeholders below must match the template's marker rows
                report_template = report_template.replace(
                    '<tr><td>Upper_FeatureSet</td></tr>', uppper_feature_content)
                report_template = report_template.replace(
                    '<tr><td>Lower_FeatureSet</td></tr>', lower_feature_content)
                result_file.write(report_template)

        html_report.append({'path': result_file_path,
                            'name': os.path.basename(result_file_path),
                            'label': os.path.basename(result_file_path),
                            'description': 'HTML summary report'})

        return html_report

    def _process_diff_expression(self, diff_expression_set_ref,
                                 result_directory, condition_label_pair):
        """
        _process_diff_expression: process differential expression object info

        Writes a gene_results.csv for every set item whose two condition
        labels both fall in condition_label_pair.

        :returns: (csv path, genome ref, matching diff-expression ref)

        NOTE(review): if NO set item matches condition_label_pair, genome_id
        and selected_diff_expression_ref are never assigned and the final
        return raises UnboundLocalError — presumably callers guarantee a
        match; confirm before relying on this.
        """
        log('start processing differential expression object')

        diff_expr_set_data = self.ws.get_objects2(
            {'objects': [{'ref': diff_expression_set_ref}]})['data'][0]['data']

        set_items = diff_expr_set_data['items']

        diff_expr_matrix_file_name = 'gene_results.csv'
        diff_expr_matrix_file = os.path.join(result_directory,
                                             diff_expr_matrix_file_name)

        # header is written once; matching matrices are appended below
        with open(diff_expr_matrix_file, 'w') as csvfile:
            fieldnames = ['gene_id', 'log2_fold_change', 'p_value', 'q_value']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()

        for set_item in set_items:
            diff_expression_ref = set_item['ref']

            diff_expression_data = self.ws.get_objects2(
                {'objects': [{'ref': diff_expression_ref}]})['data'][0]['data']

            label_string = set_item['label']
            label_list = [x.strip() for x in label_string.split(',')]
            condition_1 = label_list[0]
            condition_2 = label_list[1]

            if condition_1 in condition_label_pair and condition_2 in condition_label_pair:
                genome_id = diff_expression_data['genome_ref']
                matrix_data = diff_expression_data['data']
                selected_diff_expression_ref = diff_expression_ref
                with open(diff_expr_matrix_file, 'a') as csvfile:
                    row_ids = matrix_data.get('row_ids')
                    row_values = matrix_data.get('values')
                    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                    for pos, row_id in enumerate(row_ids):
                        row_value = row_values[pos]
                        writer.writerow({'gene_id': row_id,
                                         'log2_fold_change': row_value[0],
                                         'p_value': row_value[1],
                                         'q_value': row_value[2]})

        return diff_expr_matrix_file, genome_id, selected_diff_expression_ref

    def _generate_feature_set(self, feature_ids, genome_id, workspace_name,
                              feature_set_name):
        """
        _generate_feature_set: generate FeatureSet object

        KBaseCollections.FeatureSet type:
        typedef structure {
            string description;
            list<feature_id> element_ordering;
            mapping<feature_id, list<genome_ref>> elements;
        } FeatureSet;

        :returns: saved object reference as "wsid/objid/version"
        """
        log('start saving KBaseCollections.FeatureSet object')

        # workspace_name may already be a numeric workspace id
        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        elements = {feature_id: [genome_id] for feature_id in feature_ids}
        feature_set_data = {'description': 'Generated FeatureSet from DifferentialExpression',
                            'element_ordering': feature_ids,
                            'elements': elements}

        object_type = 'KBaseCollections.FeatureSet'
        save_object_params = {
            'id': workspace_id,
            'objects': [{'type': object_type,
                         'data': feature_set_data,
                         'name': feature_set_name}]}

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        feature_set_obj_ref = "{}/{}/{}".format(dfu_oi[6], dfu_oi[0], dfu_oi[4])

        return feature_set_obj_ref

    def _process_matrix_file(self, diff_expr_matrix_file, comp_p_value,
                             comp_q_value, comp_fold_change_cutoff):
        """
        _process_matrix_file: filter matrix file by given cutoffs

        A row passes when p <= comp_p_value, q <= comp_q_value, and its
        log2 fold change clears +/-comp_fold_change_cutoff (up vs down).
        Rows with 'NA'/'null'/'' in any tested column are skipped.

        :returns: (unique up feature ids, unique down feature ids)
        """
        log('start processing matrix file')

        up_feature_ids = []
        down_feature_ids = []

        # the cutoff is treated as a magnitude; normalize a negative input
        if comp_fold_change_cutoff < 0:
            comp_fold_change_cutoff = -comp_fold_change_cutoff

        with open(diff_expr_matrix_file, 'r') as file:
            reader = csv.DictReader(file)

            for row in reader:
                feature_id = row['gene_id']
                row_p_value = row['p_value']
                row_q_value = row['q_value']
                row_fold_change_cutoff = row['log2_fold_change']

                null_value = {'NA', 'null', ''}
                col_value = {row_p_value, row_q_value, row_fold_change_cutoff}

                if not col_value.intersection(null_value):
                    p_value_condition = float(row_p_value) <= comp_p_value
                    q_value_condition = float(row_q_value) <= comp_q_value

                    up_matches_condition = (p_value_condition and q_value_condition
                                            and (float(row_fold_change_cutoff) >=
                                                 comp_fold_change_cutoff))
                    down_matches_condition = (p_value_condition and q_value_condition
                                              and (float(row_fold_change_cutoff) <=
                                                   -comp_fold_change_cutoff))

                    if up_matches_condition:
                        up_feature_ids.append(feature_id)
                    elif down_matches_condition:
                        down_feature_ids.append(feature_id)

        # dedupe; note set() does not preserve original row order
        return list(set(up_feature_ids)), list(set(down_feature_ids))

    def _filter_expression_matrix(self, expression_matrix_ref, feature_ids,
                                  workspace_name, filtered_expression_matrix_suffix="",
                                  diff_expression_matrix_ref=None,
                                  filtered_expression_matrix_name=None):
        """
        _filter_expression_matrix: generated filtered expression matrix

        Keeps only the rows of the source ExpressionMatrix whose row_id is in
        feature_ids, then saves a new object of the same workspace type.

        :returns: saved object reference as "wsid/objid/version"
        """
        log('start saving ExpressionMatrix object')

        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        expression_matrix_obj = self.dfu.get_objects(
            {'object_refs': [expression_matrix_ref]})['data'][0]

        expression_matrix_info = expression_matrix_obj['info']
        expression_matrix_data = expression_matrix_obj['data']

        expression_matrix_name = expression_matrix_info[1]

        if not filtered_expression_matrix_name:
            # derive a name: replace an "...expression_matrix" tail with the
            # suffix, or just append the suffix
            if re.match('.*_*[Ee]xpression_*[Mm]atrix', expression_matrix_name):
                filtered_expression_matrix_name = re.sub('_*[Ee]xpression_*[Mm]atrix',
                                                         filtered_expression_matrix_suffix,
                                                         expression_matrix_name)
            else:
                filtered_expression_matrix_name = expression_matrix_name + \
                    filtered_expression_matrix_suffix

        filtered_expression_matrix_data = expression_matrix_data.copy()

        data = filtered_expression_matrix_data['data']

        row_ids = data['row_ids']
        values = data['values']
        filtered_data = data.copy()

        filtered_row_ids = list()
        filtered_values = list()
        for pos, row_id in enumerate(row_ids):
            if row_id in feature_ids:
                filtered_row_ids.append(row_id)
                filtered_values.append(values[pos])

        filtered_data['row_ids'] = filtered_row_ids
        filtered_data['values'] = filtered_values
        filtered_expression_matrix_data['data'] = filtered_data

        expression_obj = {'type': expression_matrix_info[2],
                          'data': filtered_expression_matrix_data,
                          'name': filtered_expression_matrix_name}
        # we now save the filtering DEM in a EM field added for this purpose
        if diff_expression_matrix_ref:
            expression_obj['data']['diff_expr_matrix_ref'] = diff_expression_matrix_ref
            expression_obj['extra_provenance_input_refs'] = [diff_expression_matrix_ref]

        save_object_params = {
            'id': workspace_id,
            'objects': [expression_obj]}

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        filtered_expression_matrix_ref = "{}/{}/{}".format(dfu_oi[6], dfu_oi[0], dfu_oi[4])

        return filtered_expression_matrix_ref

    def _xor(self, a, b):
        # logical XOR on truthiness: exactly one of a, b is truthy
        return bool(a) != bool(b)

    def _check_input_labels(self, condition_pairs, available_condition_labels):
        """
        _check_input_labels: check input condition pairs

        Each pair's label_string must split into two distinct labels, both
        present in available_condition_labels.

        :raises IndexError: if a pair has no/partial selected values
        :raises ValueError: for unknown or identical labels
        """
        checked = True
        for condition_pair in condition_pairs:
            try:
                label_string = condition_pair['label_string'][0].strip()
                label_list = [x.strip() for x in label_string.split(',')]
                first_label = label_list[0]
                second_label = label_list[1]
            except IndexError:
                raise IndexError('No selected values for Partial Condition')

            if first_label not in available_condition_labels:
                # NOTE(review): 'availalbe' typo is preserved as-is in the
                # user-facing message
                error_msg = 'Condition: {} is not availalbe. '.format(first_label)
                error_msg += 'Available conditions: {}'.format(available_condition_labels)
                raise ValueError(error_msg)

            if second_label not in available_condition_labels:
                error_msg = 'Condition: {} is not availalbe. '.format(second_label)
                error_msg += 'Available conditions: {}'.format(available_condition_labels)
                raise ValueError(error_msg)

            if first_label == second_label:
                raise ValueError('Input conditions are the same')

        return checked

    def _get_condition_labels(self, diff_expression_set_ref):
        """
        _get_condition_labels: get all possible condition label pairs

        :returns: (list of [label_a, label_b] pairs, set of all labels seen)
        """
        log('getting all possible condition pairs')

        condition_label_pairs = list()
        available_condition_labels = set()
        diff_expression_set_obj = self.ws.get_objects2(
            {'objects': [{'ref': diff_expression_set_ref}]})['data'][0]
        diff_expression_set_data = diff_expression_set_obj['data']
        items = diff_expression_set_data.get('items')
        for item in items:
            label_string = item['label']
            label_list = [x.strip() for x in label_string.split(',')]
            condition_label_pairs.append(label_list)
            available_condition_labels |= set(label_list)

        log('all possible condition pairs:\n{}'.format(condition_label_pairs))

        return condition_label_pairs, available_condition_labels

    def _get_feature_ids(self, genome_ref, ids):
        """
        _get_feature_ids: get feature ids from genome

        Queries GenomeSearchUtil for the subset of `ids` that exist in the
        genome; result is a set for O(1) membership tests by callers.
        """
        genome_features = self.gsu.search({'ref': genome_ref,
                                           'limit': len(ids),
                                           'structured_query': {"$or": [{"feature_id": x}
                                                                        for x in ids]},
                                           'sort_by': [['feature_id', True]]})['features']

        features_ids = set((feature.get('feature_id') for feature in genome_features))

        return features_ids

    def _build_fs_obj(self, params):
        """Assemble a FeatureSet data structure from base sets plus explicit
        feature-id lists, validating each new id against the genome.

        :raises ValueError: if a requested feature id is absent from the genome
        """
        new_feature_set = {
            'description': '',
            'element_ordering': [],
            'elements': {}
        }
        genome_ref = params['genome']
        if params.get('base_feature_sets', []) and None not in params['base_feature_sets']:
            base_feature_sets = self.dfu.get_objects(
                {'object_refs': params['base_feature_sets']})['data']
            for ret in base_feature_sets:
                base_set = ret['data']
                base_set_name = ret['info'][1]

                # merge element ordering/refs without introducing duplicates
                new_feature_set['element_ordering'] += [x for x in base_set['element_ordering']
                                                        if x not in new_feature_set['elements']]
                for element, genome_refs in base_set['elements'].items():
                    if element in new_feature_set['elements']:
                        new_feature_set['elements'][element] += [x for x in genome_refs
                                                                 if x not in new_feature_set[
                                                                     'elements'][element]]
                    else:
                        new_feature_set['elements'][element] = genome_refs
                new_feature_set['description'] += 'From FeatureSet {}: {}\n'.format(
                    base_set_name, base_set.get('description'))
        new_feature_ids = []
        if params.get('feature_ids'):
            if isinstance(params['feature_ids'], str):
                new_feature_ids += params['feature_ids'].split(',')
            else:
                new_feature_ids += params['feature_ids']
        if params.get('feature_ids_custom'):
            new_feature_ids += params['feature_ids_custom'].split(',')
        if new_feature_ids:
            genome_feature_ids = self._get_feature_ids(genome_ref, new_feature_ids)
        for new_feature in new_feature_ids:
            if new_feature not in genome_feature_ids:
                raise ValueError('Feature ID {} does not exist in the supplied genome {}'.format(
                    new_feature, genome_ref))
            if new_feature in new_feature_set['elements']:
                if genome_ref not in new_feature_set['elements'][new_feature]:
                    new_feature_set['elements'][new_feature].append(genome_ref)
            else:
                new_feature_set['elements'][new_feature] = [genome_ref]
                new_feature_set['element_ordering'].append(new_feature)
        if params.get('description'):
            new_feature_set['description'] = params['description']

        return new_feature_set

    def __init__(self, config):
        # service endpoints and clients shared by all public methods
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.ws = Workspace(self.ws_url, token=self.token)
        self.dfu = DataFileUtil(self.callback_url)
        self.gsu = GenomeSearchUtil(self.callback_url)
        self.scratch = config['scratch']

    def upload_featureset_from_diff_expr(self, params):
        """
        upload_featureset_from_diff_expr: create FeatureSet from RNASeqDifferentialExpression
                                          based on given threshold cutoffs

        required params:
        diff_expression_ref: DifferetialExpressionMatrixSet object reference
        expression_matrix_ref: ExpressionMatrix object reference
        p_cutoff: p value cutoff
        q_cutoff: q value cutoff
        fold_scale_type: one of ["linear", "log2+1", "log10+1"]
        fold_change_cutoff: fold change cutoff
        feature_set_suffix: Result FeatureSet object name suffix
        filtered_expression_matrix_suffix: Result ExpressionMatrix object name suffix
        workspace_name: the name of the workspace it gets saved to

        return:
        result_directory: folder path that holds all files generated
        up_feature_set_ref_list: list of generated upper FeatureSet object reference
        down_feature_set_ref_list: list of generated down FeatureSet object reference
        filtered_expression_matrix_ref_list: list of generated filtered ExpressionMatrix object ref
        report_name: report name generated by KBaseReport
        report_ref: report reference generated by KBaseReport
        """
        self._validate_upload_featureset_from_diff_expr_params(params)

        diff_expression_set_ref = params.get('diff_expression_ref')
        diff_expression_set_info = self.ws.get_object_info3(
            {"objects": [{"ref": diff_expression_set_ref}]})['infos'][0]
        diff_expression_set_name = diff_expression_set_info[1]

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)

        (available_condition_label_pairs,
         available_condition_labels) = self._get_condition_labels(diff_expression_set_ref)

        run_all_combinations = params.get('run_all_combinations')
        condition_pairs = params.get('condition_pairs')
        # exactly one of the two selection modes must be chosen
        if not self._xor(run_all_combinations, condition_pairs):
            error_msg = "Invalid input:\nselect 'Run All Paired Condition Combinations' "
            error_msg += "or provide partial condition pairs. Don't do both or neither"
            raise ValueError(error_msg)

        if run_all_combinations:
            condition_label_pairs = available_condition_label_pairs
        else:
            if self._check_input_labels(condition_pairs, available_condition_labels):
                condition_label_pairs = list()
                for condition_pair in condition_pairs:
                    label_string = condition_pair['label_string'][0].strip()
                    condition_labels = [x.strip() for x in label_string.split(',')]
                    condition_label_pairs.append(condition_labels)

        up_feature_set_ref_list = list()
        down_feature_set_ref_list = list()
        filtered_expression_matrix_ref_list = list()

        for condition_label_pair in condition_label_pairs:
            condition_string = '-'.join(reversed(condition_label_pair))
            diff_expr_matrix_file, genome_id, diff_expr_matrix_ref = self._process_diff_expression(
                diff_expression_set_ref, result_directory, condition_label_pair)

            up_feature_ids, down_feature_ids = self._process_matrix_file(
                diff_expr_matrix_file, params.get('p_cutoff'), params.get('q_cutoff'),
                params.get('fold_change_cutoff'))
            filtered_em_name = _sanitize_name(condition_string) + params.get(
                'filtered_expression_matrix_suffix')
            if params.get('expression_matrix_ref'):
                filtered_expression_matrix_ref = self._filter_expression_matrix(
                    params.get('expression_matrix_ref'), up_feature_ids + down_feature_ids,
                    params.get('workspace_name'), "", diff_expr_matrix_ref, filtered_em_name)
                filtered_expression_matrix_ref_list.append(filtered_expression_matrix_ref)

            feature_set_suffix = params.get('feature_set_suffix', "")
            up_feature_set_name = "{}_{}_up{}".format(
                diff_expression_set_name, _sanitize_name(condition_string), feature_set_suffix)
            up_feature_set_ref = self._generate_feature_set(
                up_feature_ids, genome_id, params.get('workspace_name'), up_feature_set_name)
            up_feature_set_ref_list.append(up_feature_set_ref)

            down_feature_set_name = "{}_{}_down{}".format(
                diff_expression_set_name, _sanitize_name(condition_string), feature_set_suffix)
            down_feature_set_ref = self._generate_feature_set(
                down_feature_ids, genome_id, params.get('workspace_name'), down_feature_set_name)
            down_feature_set_ref_list.append(down_feature_set_ref)

        returnVal = {'result_directory': result_directory,
                     'up_feature_set_ref_list': up_feature_set_ref_list,
                     'down_feature_set_ref_list': down_feature_set_ref_list,
                     'filtered_expression_matrix_ref_list': filtered_expression_matrix_ref_list}

        report_output = self._generate_report(up_feature_set_ref_list,
                                              down_feature_set_ref_list,
                                              filtered_expression_matrix_ref_list,
                                              params.get('workspace_name'))
        returnVal.update(report_output)

        return returnVal

    def filter_matrix_with_fs(self, params):
        """Filter an ExpressionMatrix down to the features of a FeatureSet and
        save both the new matrix and a summary report.

        :returns: dict with 'filtered_expression_matrix_ref', 'report_name',
            'report_ref'
        """
        self.validate_params(params, ('feature_set_ref', 'workspace_name',
                                      'expression_matrix_ref',
                                      'filtered_expression_matrix_suffix'))
        ret = self.dfu.get_objects(
            {'object_refs': [params['feature_set_ref']]})['data'][0]
        feature_set = ret['data']
        feature_set_name = ret['info'][1]
        feature_ids = set(feature_set['elements'].keys())
        filtered_matrix_ref = self._filter_expression_matrix(
            params['expression_matrix_ref'], feature_ids, params['workspace_name'],
            params['filtered_expression_matrix_suffix'])

        objects_created = [{'ref': filtered_matrix_ref,
                            'description': 'Filtered ExpressionMatrix Object'}]
        message = "Filtered Expression Matrix based of the {} feature ids present in {}"\
            .format(len(feature_ids), feature_set_name)

        report_params = {'message': message,
                         'workspace_name': params['workspace_name'],
                         'objects_created': objects_created,
                         'report_object_name': 'kb_FeatureSetUtils_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        return {'filtered_expression_matrix_ref': filtered_matrix_ref,
                'report_name': output['name'], 'report_ref': output['ref']}

    def build_feature_set(self, params):
        """Create a FeatureSet from any mix of base sets and feature-id lists,
        save it, and return it with a KBaseReport.

        :raises ValueError: if no feature source parameter is supplied
        :returns: dict with 'feature_set_ref', 'report_name', 'report_ref'
        """
        self.validate_params(params, {'output_feature_set', 'workspace_name', },
                             {'genome', 'feature_ids', 'feature_ids_custom',
                              'base_feature_sets', 'description'})
        feature_sources = ('feature_ids', 'feature_ids_custom', 'base_feature_sets')
        if not any([params.get(x) for x in feature_sources]):
            raise ValueError("You must supply at least one feature source: {}".format(
                ", ".join(feature_sources)))
        workspace_id = self.dfu.ws_name_to_id(params['workspace_name'])

        new_feature_set = self._build_fs_obj(params)
        save_object_params = {
            'id': workspace_id,
            'objects': [{'type': 'KBaseCollections.FeatureSet',
                         'data': new_feature_set,
                         'name': params['output_feature_set']}]}

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        feature_set_obj_ref = '{}/{}/{}'.format(dfu_oi[6], dfu_oi[0], dfu_oi[4])

        objects_created = [{'ref': feature_set_obj_ref,
                            'description': 'Feature Set'}]
        message = 'A new feature set containing {} features was created.'.format(
            len(new_feature_set['elements']))

        report_params = {'message': message,
                         'workspace_name': params['workspace_name'],
                         'objects_created': objects_created,
                         'report_object_name': 'kb_FeatureSetUtils_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        return {'feature_set_ref': feature_set_obj_ref,
                'report_name': output['name'], 'report_ref': output['ref']}
class kb_GATK:
    '''
    Module Name:
    kb_GATK

    Module Description:
    A KBase module: kb_GATK
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = "https://github.com/kbasecollaborations/kb_GATK.git"
    GIT_COMMIT_HASH = "5e6e4bdca9a7749bba0abab081736c56007212ed"

    #BEGIN_CLASS_HEADER
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        # service endpoints and helper clients used by run_kb_GATK
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.shared_folder = config['scratch']
        self.ws_url = config['workspace-url']
        self.wsc = Workspace(self.ws_url)
        self.gu = GATKUtils()
        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)
        self.vu = VariationUtil(self.callback_url)
        self.du = DownloadAlignmentUtils(self.callback_url)
        #END_CONSTRUCTOR
        pass

    def run_kb_GATK(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_kb_GATK
        # download the read alignment as SAM for the GATK pipeline below
        source_ref = params['alignment_ref']
        alignment_out = self.du.downloadreadalignment(source_ref, params,
                                                      self.callback_url)
        sam_file = os.path.join(alignment_out['destination_dir'],
                                "reads_alignment.sam")

        '''
        #Todo Reading sample set and sample strains information
        '''
        '''
        command.extend(["-filter-name", "\"QD_filter\"", "-filter", "\"QD", "<", params['snp_filter']['snp_qd_filter'] + "\""])
        command.extend(["-filter-name", "\"FS_filter\"", "-filter", "\"FS", "<", params['snp_filter']['snp_fs_filter'] + "\""])
        command.extend(["-filter-name", "\"MQ_filter\"", "-filter", "\"MQ", "<", params['snp_filter']['snp_mq_filter'] + "\""])
        command.extend(["-filter-name", "\"SOR_filter\"", "-filter", "\"SOR", "<", params['snp_filter']['snp_sor_filter'] + "\""])
        command.extend(["-filter-name", "\"MQRankSum_filter\"", "-filter", "\"MQRankSum", "<", params['snp_filter']['snp_mqrankSum_filter'] + "\""])
        command.extend(["-filter-name", "\"ReadPosRankSum_filter\"", "-filter", "\"ReadPosRankSum", "<", params['snp_filter']['snp_readposranksum_filter'] + "\""])
        '''
        print(params)
        strain_info = params['strain_info']

        # per-run scratch workspace
        output_dir = os.path.join(self.shared_folder, str(uuid.uuid4()))
        os.mkdir(output_dir)

        # resolve either a Genome (via its assembly_ref) or an Assembly input
        genome_or_assembly_ref = params['assembly_or_genome_ref']
        obj_type = self.wsc.get_object_info3(
            {'objects': [{'ref': genome_or_assembly_ref}]})['infos'][0][2]
        if ('KBaseGenomes.Genome' in obj_type):
            genome_ref = genome_or_assembly_ref
            subset = self.wsc.get_object_subset([{
                'included': ['/assembly_ref'],
                'ref': genome_ref
            }])
            assembly_ref = subset[0]['data']['assembly_ref']
        elif ('KBaseGenomeAnnotations.Assembly' in obj_type):
            assembly_ref = genome_or_assembly_ref
        else:
            raise ValueError(obj_type + ' is not the right input for this method. ' +
                             'Valid input include KBaseGenomes.Genome or ' +
                             'KBaseGenomeAnnotations.Assembly ')

        assembly_file = self.du.download_genome(assembly_ref, output_dir)['path']

        #output_dir = output_dir + "/"

        #Todo: check time for building index file or donwload from cache.
        #Todo: To discuss about cache_id to be used.
        #Todo: In case of copying genome, find the way of finding original genome (ref id)
        #      for getting original cache id.

        # GATK pipeline: index/dictionary prep, dedup, metrics, variant
        # calling, filtering, then two BQSR rounds and a final re-filter
        self.gu.build_genome(assembly_file)
        self.gu.index_assembly(assembly_file)
        self.gu.generate_sequence_dictionary(assembly_file)
        self.gu.duplicate_marking(output_dir, sam_file)
        #self.gu.sort_bam_index(output_dir)
        self.gu.collect_alignment_and_insert_size_metrics(assembly_file, output_dir)
        #self.gu.analyze_covariates(output_dir)

        #Todo: avoid writing intermediate fies to save space and time I/O.
        self.gu.variant_calling(assembly_file, output_dir)
        self.gu.extract_variants(assembly_file, output_dir)
        self.gu.filter_SNPs(assembly_file, "filtered_snps.vcf", output_dir, params)
        self.gu.filter_Indels(assembly_file, "filtered_indels.vcf", output_dir, params)
        self.gu.exclude_filtered_variants(output_dir)
        self.gu.base_quality_score_recalibration(assembly_file, "recal_data.table", output_dir)
        self.gu.apply_BQSR(assembly_file, "recal_data.table", output_dir)
        self.gu.base_quality_score_recalibration(assembly_file, "post_recal_data.table",
                                                 output_dir)
        self.gu.apply_BQSR(assembly_file, "post_recal_data.table", output_dir)
        self.gu.filter_SNPs(assembly_file, "filtered_snps_final.vcf", output_dir, params)
        #Todo: To save indels also using VariationUtils or merge with snps and sort them
        #      with chr & pos and save using variaiotiontuils.
        #Todo: To get an example for saving structural variants(specially CNV) and compare
        #      with standard vcf output.
        self.gu.filter_Indels(assembly_file, "filtered_indels_final.vcf", output_dir, params)

        '''
        os.system("grep '##fileformat' " + output_dir + "/filtered_snps_final.vcf > " + output_dir + "/sample.vcf")
        cmd = "grep -v '##' " + output_dir + "/filtered_snps_final.vcf >> " + output_dir + "/sample.vcf"
        os.system(cmd)    # TODO : need to remove system command after fixing variationUtils.
        '''

        vcf_filepath = self.gu.index_vcf_file(output_dir + "/filtered_snps_final.vcf")
        reheader_vcf_file = self.gu.reheader(vcf_filepath, strain_info)

        #Todo : check existence of final filtered finals snps.
        #Todo : chnage assembly_or_genome_ref to genome_or_assembly_ref
        #Todo: to derive name of sample_attribute_name from sample set ref by
        #      prefixing/suffixing. Attribute mapping should have one sample.

        save_variation_params = {'workspace_name': params['workspace_name'],
                                 'genome_or_assembly_ref': params['assembly_or_genome_ref'],
                                 'sample_set_ref': params['input_sample_set'],
                                 'sample_attribute_name': 'sample_attr',
                                 'vcf_staging_file_path': reheader_vcf_file,
                                 'variation_object_name': params['variation_object_name']
                                 }

        self.vu.save_variation_from_vcf(save_variation_params)

        report = KBaseReport(self.callback_url)
        report_info = report.create({'report': {'objects_created': [],
                                                'text_message': 'Success'},
                                     'workspace_name': params['workspace_name']})
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
        }
        #END run_kb_GATK

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_kb_GATK return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        #BEGIN_STATUS
        # standard KBase SDK status payload
        returnVal = {'state': "OK",
                     'message': "",
                     'version': self.VERSION,
                     'git_url': self.GIT_URL,
                     'git_commit_hash': self.GIT_COMMIT_HASH}
        #END_STATUS
        return [returnVal]
class masurca_utils:
    """
    masurca_utils: defining a system of utils for running masurca

    Builds the MaSuRCA ``config.txt`` (DATA and PARAMETERS sections), fetches
    reads/assembly inputs from the KBase Workspace, runs the masurca binary to
    produce ``assemble.sh``, executes the assembly, and saves/reports results.
    """

    # version/location of the MaSuRCA toolchain inside the SDK container
    MaSuRCA_VERSION = 'MaSuRCA-3.2.9'
    MaSuRCA_BIN = '/kb/module/' + MaSuRCA_VERSION + '/bin/masurca'
    # keys expected in the params dict passed in from the app spec
    PARAM_IN_WS = 'workspace_name'
    PARAM_IN_THREADN = 'num_threads'
    PARAM_IN_READS_LIBS = 'reads_libraries'
    PARAM_IN_JUMP_LIBS = 'jump_libraries'
    PARAM_IN_JF_SIZE = 'jf_size'
    PARAM_IN_CS_NAME = 'output_contigset_name'
    # matches any character that is NOT allowed in a workspace object name
    INVALID_WS_OBJ_NAME_RE = re.compile('[^\\w\\|._-]')

    def __init__(self, prj_dir, config):
        """
        :param prj_dir: working directory for config.txt and MaSuRCA outputs
        :param config: SDK service config dict; must supply 'workspace-url',
            'SDK_CALLBACK_URL' and 'KB_AUTH_TOKEN' ('shock-url' and
            'handle-service-url' are optional).
        """
        self.workspace_url = config['workspace-url']
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        if 'shock-url' in config:
            self.shock_url = config['shock-url']
        if 'handle-service-url' in config:
            self.handle_url = config['handle-service-url']
        # KBase service clients used throughout this class
        self.ws_client = Workspace(self.workspace_url, token=self.token)
        self.ru = ReadsUtils(self.callback_url, token=self.token)
        self.au = AssemblyUtil(self.callback_url, token=self.token)
        self.kbr = KBaseReport(self.callback_url)
        self.kbq = kb_quast(self.callback_url)
        self.proj_dir = prj_dir
        self.prog_runner = Program_Runner(self.MaSuRCA_BIN, self.proj_dir)

    def _has_long_reads(self, params):
        """
        _has_long_reads: check if a long reads input exists in the parameters

        Returns a truthy value when any of pacbio_reads, nanopore_reads or
        other_frg_file is present and non-empty.
        """
        return (params.get('pacbio_reads', None)
                or params.get('nanopore_reads', None)
                or params.get('other_frg_file', None))

    def _get_data_portion(self, pe_reads_data, jp_reads_data=None,
                          pacbio_reads_file='', nanopore_reads_file='',
                          other_frg_file=''):
        """
        _get_data_portion: build the 'DATA...END' portion for the config.txt file

        Emits one line per input library (PE=, JUMP=, PACBIO=/NANOPORE=, OTHER=),
        joined with newlines. Returns '' when there is no input at all.
        """
        data_str = ''
        if pe_reads_data:
            # log('PE reads data details:\n{}'.format(json.dumps(pe_reads_data, indent=1)))
            for pe in pe_reads_data:
                if data_str != '':
                    data_str += '\n'
                data_str += 'PE= ' + pe['pe_prefix'] + ' ' + str(pe['pe_mean']) + ' ' + \
                    str(pe['pe_stdev']) + ' ' + pe['fwd_file']
                if pe.get('rev_file', None):
                    data_str += ' ' + pe['rev_file']

        if jp_reads_data:
            # log('JUMP reads data details:\n{}'.format(json.dumps(jp_reads_data, indent=1)))
            for jp in jp_reads_data:
                if data_str != '':
                    data_str += '\n'
                data_str += 'JUMP= ' + jp['jp_prefix'] + ' ' + str(jp['jp_mean']) + ' ' + \
                    str(jp['jp_stdev']) + ' ' + jp['fwd_file']
                if jp.get('rev_file', None):
                    data_str += ' ' + jp['rev_file']

        # Adding the pacbio_reads
        # Note that pcbio reads must be in a single fasta file!
        # For example:
        # data_str +='\nPACBIO= /pool/genomics/frandsenp/masurca/PacBio/pacbio_reads.fasta'
        # ***if you have both types of reads supply them both as NANOPORE type***
        if pacbio_reads_file != '':
            if data_str != '':
                data_str += '\n'
            if nanopore_reads_file != '':
                data_str += 'NANOPORE=' + pacbio_reads_file
            else:
                data_str += 'PACBIO=' + pacbio_reads_file

        # Adding the nanopore_reads and note that nanopore reads must be in a single fasta file!
        # For example:
        # data_str +='\nNANOPORE= /pool/genomics/frandsenp/masurca/NanoPore/nanopore_reads.fasta'
        if nanopore_reads_file != '':
            if data_str != '':
                data_str += '\n'
            data_str += 'NANOPORE= ' + nanopore_reads_file

        # Adding the other_frg_file inputs if any
        # any OTHER sequence data (454, Sanger, Ion torrent, etc) must be first converted into
        # Celera Assembler compatible .frg file
        # (see http://wgsassembler.sourceforge.com) and supplied as OTHER=file.frg
        if other_frg_file != '':
            if data_str != '':
                data_str += '\n'
            data_str += 'OTHER=' + other_frg_file
        return data_str

    def _get_parameters_portion(self, params):
        """
        build the 'PARAMETERS...END' portion for the config.txt file

        Starts from the defaults suggested by the MaSuRCA example configuration
        and appends one KEY=VALUE line per recognized entry in params.
        """
        # set the default parameters as suggested in the example configuration file
        param_str = (
            "EXTEND_JUMP_READS=0\nUSE_GRID=0\nGRID_QUEUE=all.q\nGRID_BATCH_SIZE" +
            "=300000000\nLHE_COVERAGE=25\nMEGA_READS_ONE_PASS=0")
        if (params.get('graph_kmer_size', None)
                and type(params['graph_kmer_size']) == int):
            if param_str != '':
                param_str += '\n'
            param_str += 'GRAPH_KMER_SIZE=' + str(params['graph_kmer_size'])
        else:
            # fall back to MaSuRCA's automatic k-mer size selection
            if param_str != '':
                param_str += '\n'
            param_str += 'GRAPH_KMER_SIZE=auto'
        if params.get('use_linking_mates', None):
            if param_str != '':
                param_str += '\n'
            # linking mates are only useful when no long-read data is supplied
            if params['use_linking_mates'] == 1 and not self._has_long_reads(params):
                param_str += 'USE_LINKING_MATES=1'
            else:
                param_str += 'USE_LINKING_MATES=0'
        if params.get('limit_jump_coverage', None):
            if param_str != '':
                param_str += '\n'
            param_str += 'LIMIT_JUMP_COVERAGE = ' + str(params['limit_jump_coverage'])
        if params.get('cgwErrorRate', None):
            if param_str != '':
                param_str += '\n'
            param_str += 'CA_PARAMETERS = cgwErrorRate=' + str(params['cgwErrorRate'])
        if params.get(self.PARAM_IN_THREADN, None):
            if param_str != '':
                param_str += '\n'
            param_str += 'NUM_THREADS = ' + str(params[self.PARAM_IN_THREADN])
        if params.get('jf_size', None):
            if param_str != '':
                param_str += '\n'
            param_str += 'JF_SIZE=' + str(params['jf_size'])
        if params.get('kmer_count_threshold', None):
            if param_str != '':
                param_str += '\n'
            param_str += 'KMER_COUNT_THRESHOLD=' + str(params['kmer_count_threshold'])
        if params.get('do_homopolymer_trim', None):
            if param_str != '':
                param_str += '\n'
            if params['do_homopolymer_trim'] == 1:
                param_str += 'DO_HOMOPOLYMER_TRIM=1'
            else:
                param_str += 'DO_HOMOPOLYMER_TRIM=0'
        if params.get('close_gaps', None):
            if param_str != '':
                param_str += '\n'
            if params['close_gaps'] == 1:
                param_str += 'CLOSE_GAPS=1'
            else:
                param_str += 'CLOSE_GAPS=0'
        if params.get('soap_assembly', None):
            if param_str != '':
                param_str += '\n'
            if params['soap_assembly'] == 1:
                param_str += 'SOAP_ASSEMBLY=1'
            else:
                param_str += 'SOAP_ASSEMBLY=0'
        return param_str

    def _replaceSectionText(self, orig_txt, begin_patn, end_patn, repl_txt):
        """
        replace a section of text of orig_txt between lines begin-patn and
        end-patn with repl_text
        examples of parameters:
            begin_patn1 = "DATA\n"
            begin_patn2 = "PARAMETERS\n"
            end_patn1 = "END\nPARAMETERS\n"
            end_patn2 = "END\n"
            repl_txt1 = ('PE= pe 500 50 /kb/module/work/testReads/small.forward.fq' +
                         ' /kb/module/work/testReads/small.reverse.fq\n')
            repl_txt2 = ('GRAPH_KMER_SIZE=auto\nUSE_LINKING_MATES=1\nLIMIT_JUMP_COVERAGE = 60\n' +
                         'CA_PARAMETERS = cgwErrorRate=0.15\nNUM_THREADS= 64\nJF_SIZE=100000000\n
                         DO_HOMOPOLYMER_TRIM=0\n')
        """
        if repl_txt != '':
            # create regular expression pattern
            # NOTE(review): begin_patn/end_patn are interpolated into the regex
            # unescaped — safe for the literal "DATA\n"/"END\n" markers used here.
            repl = re.compile(begin_patn + '.*?' + end_patn, re.DOTALL)
            repl_txt = begin_patn + repl_txt + '\n' + end_patn
            # replace the text between begin_patn and end_patn with repl_txt
            txt_replaced = repl.sub(repl_txt, orig_txt)
            # pprint(txt_replaced)
            return txt_replaced
        else:
            return orig_txt

    def _unique_prefix_check(self, pfix, refs):
        """
        Ensure the two-character prefixes stored under key `pfix` are unique
        across all reads libraries in `refs`; raises ValueError on a duplicate.
        """
        prefix_lookup = {}
        for ref in refs:
            pre = ref[pfix][0:2]
            if pre not in prefix_lookup:
                prefix_lookup[pre] = 1
            else:
                raise ValueError('The first two characters in \'' + ref[pfix] +
                                 '\' has been used.')

    def _get_pereads_info(self, input_params):
        """
        _get_pereads_info--from a list of paired_readsParams structures fetches the
        corresponding reads info with the paired_readsParams[pe_id]
        returns a list of reads data in the following structure:
        reads_data = {
                'fwd_file': path_to_fastq_file,
                'pe_prefix': the two-letter prefix for the reads library,
                'pe_mean': the average reads length for the reads library,
                'pe_stdev': the standard deviation for the reads library,
                'type': reads_type, #('interleaved', 'paired', or 'single'
                'seq_tech': sequencing_tech,
                'reads_ref': KBase object ref for downstream convenience,
                'reads_name': KBase object name for downstream convenience,
                'rev_file': path_to_fastq_file, #only if paired end
        }
        """
        rds_params = copy.deepcopy(input_params)
        wsname = rds_params[self.PARAM_IN_WS]
        rds_refs = []
        rds_data = []
        # reads_libraries grouped params
        if rds_params.get(self.PARAM_IN_READS_LIBS, None):
            pe_reads_libs = rds_params[self.PARAM_IN_READS_LIBS]
            for pe_lib in pe_reads_libs:
                if pe_lib.get('pe_id', None):
                    rds_refs.append(pe_lib['pe_id'])
            rds_data = self._get_kbreads_info(wsname, rds_refs)
            # merge user-supplied prefix/mean/stdev (with defaults) into each
            # downloaded reads record, matching on the workspace ref
            for pe_lib in pe_reads_libs:
                i = 0
                for rds in rds_data:
                    i += 1
                    if 'pe_id' in pe_lib and pe_lib['pe_id'] == rds['reads_ref']:
                        if pe_lib.get('pe_prefix', None):
                            rds['pe_prefix'] = pe_lib['pe_prefix'][0]
                        else:
                            rds['pe_prefix'] = 'p'
                        # suffix with the library's position to keep prefixes distinct
                        rds['pe_prefix'] += str(i)
                        pe_lib['pe_prefix'] = rds['pe_prefix']
                        if pe_lib.get('pe_mean', None) is None:
                            pe_lib['pe_mean'] = 500
                        rds['pe_mean'] = pe_lib['pe_mean']
                        if pe_lib.get('pe_stdev', None) is None:
                            pe_lib['pe_stdev'] = 50
                        rds['pe_stdev'] = pe_lib['pe_stdev']
            self._unique_prefix_check('pe_prefix', pe_reads_libs)
        else:
            raise ValueError("Parameter {} is required.".format(
                self.PARAM_IN_READS_LIBS))
        return rds_data

    def _get_jpreads_info(self, input_params):
        """
        _get_jpreads_info--from a list of jump_readsParams structures fetches the
        corresponding reads info with the paired_readsParams[pe_id]
        returns a list of reads data in the following structure:
        reads_data = {
                'fwd_file': path_to_fastq_file,
                'jp_prefix': the two-letter prefix for the reads library,
                'jp_mean': the average reads length for the reads library,
                'jp_stdev': the standard deviation for the reads library,
                'type': reads_type, #('interleaved', 'paired', or 'single'
                'seq_tech': sequencing_tech,
                'reads_ref': KBase object ref for downstream convenience,
                'reads_name': KBase object name for downstream convenience,
                'rev_file': path_to_fastq_file, #only if paired end
        }
        """
        rds_params = copy.deepcopy(input_params)
        wsname = rds_params[self.PARAM_IN_WS]
        rds_refs = []
        rds_data = []
        # jump_libraries grouped params
        if rds_params.get(self.PARAM_IN_JUMP_LIBS, None):
            jp_reads_libs = rds_params[self.PARAM_IN_JUMP_LIBS]
            for jp_lib in jp_reads_libs:
                if jp_lib.get('jp_id', None):
                    rds_refs.append(jp_lib['jp_id'])
            rds_data = self._get_kbreads_info(wsname, rds_refs)
            # same merging as _get_pereads_info, with jump-library defaults
            for jp_lib in jp_reads_libs:
                i = 0
                for rds in rds_data:
                    i += 1
                    if 'jp_id' in jp_lib and jp_lib['jp_id'] == rds['reads_ref']:
                        if jp_lib.get('jp_prefix', None):
                            rds['jp_prefix'] = jp_lib['jp_prefix'][0]
                        else:
                            rds['jp_prefix'] = 's'
                        rds['jp_prefix'] += str(i)
                        jp_lib['jp_prefix'] = rds['jp_prefix']
                        if jp_lib.get('jp_mean', None) is None:
                            jp_lib['jp_mean'] = 3600
                        rds['jp_mean'] = jp_lib['jp_mean']
                        if jp_lib.get('jp_stdev', None) is None:
                            jp_lib['jp_stdev'] = 200
                        rds['jp_stdev'] = jp_lib['jp_stdev']
            self._unique_prefix_check('jp_prefix', jp_reads_libs)
        # NOTE: unlike _get_pereads_info, jump libraries are optional — no raise
        return rds_data

    def _get_kbreads_info(self, wsname, reads_refs):
        """
        _get_kbreads_info--from a set of given KBase reads refs, fetches the
        corresponding reads info with as deinterleaved fastq files and returns
        a list of reads data in the following structure:
        reads_data = {
                'fwd_file': path_to_fastq_file,
                'type': reads_type, #('interleaved', 'paired', or 'single'
                'seq_tech': sequencing_tech,
                'reads_ref': KBase object ref for downstream convenience,
                'reads_name': KBase object name for downstream convenience,
                'rev_file': path_to_fastq_file, #only if paired end
        }
        """
        obj_ids = []
        for r in reads_refs:
            if r:
                # bare object names are qualified with the workspace name
                obj_ids.append({'ref': r if '/' in r else (wsname + '/' + r)})
        if not obj_ids:
            return []
        ws_info = self.ws_client.get_object_info_new({'objects': obj_ids})
        reads_params = []
        reftoname = {}
        for wsi, oid in zip(ws_info, obj_ids):
            ref = oid['ref']
            reads_params.append(ref)
            obj_name = wsi[1]  # object_info[1] is the object name
            reftoname[ref] = wsi[7] + '/' + obj_name  # [7] is the workspace name
        typeerr = ('Supported types: KBaseFile.SingleEndLibrary ' +
                   'KBaseFile.PairedEndLibrary ' +
                   'KBaseAssembly.SingleEndLibrary ' +
                   'KBaseAssembly.PairedEndLibrary')
        try:
            reads = self.ru.download_reads({
                'read_libraries': reads_params,
                'interleaved': 'false'
            })['files']
        except ServerError as se:
            log('logging stacktrace from dynamic client error')
            log(se.data)
            # re-raise unsupported-type errors with a friendlier message
            if typeerr in se.message:
                prefix = se.message.split('.')[0]
                raise ValueError(
                    prefix + '. Only the types ' +
                    'KBaseAssembly.SingleEndLibrary ' +
                    'KBaseAssembly.PairedEndLibrary ' +
                    'KBaseFile.SingleEndLibrary ' +
                    'and KBaseFile.PairedEndLibrary are supported')
            else:
                raise
        # log('Downloaded reads data from KBase:\n' + pformat(reads))
        reads_data = []
        for ref in reads_refs:
            reads_name = reftoname[ref]
            f = reads[ref]['files']
            seq_tech = reads[ref]['sequencing_tech']
            rds_info = {
                'fwd_file': f['fwd'],
                'reads_ref': ref,
                'type': f['type'],
                'seq_tech': seq_tech,
                'reads_name': reads_name
            }
            if f.get('rev', None) is not None:
                rds_info['rev_file'] = f['rev']
            reads_data.append(rds_info)
        return reads_data

    def _generate_output_file_list(self, out_dir):
        """
        _generate_output_file_list: zip result files and generate file_links for
        report
        """
        log('start packing result files')
        output_files = list()
        output_directory = os.path.join(self.proj_dir, str(uuid.uuid4()))
        mkdir_p(output_directory)
        masurca_output = os.path.join(output_directory, 'masurca_output.zip')
        self._zip_folder(out_dir, masurca_output)
        output_files.append({
            'path': masurca_output,
            'name': os.path.basename(masurca_output),
            'label': os.path.basename(masurca_output),
            'description': 'Output file(s) generated by MaSuRCA'
        })
        return output_files

    def _zip_folder(self, folder_path, output_path):
        """
        _zip_folder: Zip the contents of an entire folder (with that folder
        included in the archive). Empty subfolders could be included in the
        archive as well if the commented portion is used.
        """
        with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED,
                             allowZip64=True) as ziph:
            for root, folders, files in os.walk(folder_path):
                for f in files:
                    absolute_path = os.path.join(root, f)
                    # store entries relative to the zipped folder's basename
                    relative_path = os.path.join(os.path.basename(root), f)
                    # print "Adding {} to archive.".format(absolute_path)
                    ziph.write(absolute_path, relative_path)
        print("{} created successfully.".format(output_path))
        # with zipfile.ZipFile(output_path, "r") as f:
        #     print 'Checking the zipped file......\n'
        #     for info in f.infolist():
        #         print info.filename, info.date_time, info.file_size, info.compress_size

    def _load_stats(self, input_file_name):
        """
        Parse a FASTA file and return a dict mapping contig_id -> sequence
        length (whitespace excluded). Raises if the path is not a file or the
        file contains no contig headers.
        """
        log('Starting conversion of FASTA to KBaseGenomeAnnotations.Assembly')
        log('Building Object.')
        if not os.path.isfile(input_file_name):
            raise Exception('The input file name {0} is not a file!'.format(
                input_file_name))
        with open(input_file_name, 'r') as input_file_handle:
            contig_id = None
            sequence_len = 0
            fasta_dict = dict()
            first_header_found = False
            # Pattern for replacing white space
            pattern = re.compile(r'\s+')
            for current_line in input_file_handle:
                if current_line[0] == '>':
                    # found a header line
                    # Wrap up previous fasta sequence
                    if not first_header_found:
                        first_header_found = True
                    else:
                        fasta_dict[contig_id] = sequence_len
                        sequence_len = 0
                    fasta_header = current_line.replace('>', '').strip()
                    try:
                        # contig id is the first whitespace-delimited token
                        contig_id = fasta_header.strip().split(' ', 1)[0]
                    except (IndexError, KeyError, ValueError):
                        contig_id = fasta_header.strip()
                else:
                    sequence_len += len(re.sub(pattern, '', current_line))
            # wrap up last fasta sequence
            if not first_header_found:
                raise Exception("There are no contigs in this file")
            else:
                fasta_dict[contig_id] = sequence_len
        return fasta_dict

    def _check_reference(self, ref):
        """
        Tests the given ref string to make sure it conforms to the expected
        object reference format. Returns True if it passes, False otherwise.
        """
        obj_ref_regex = re.compile(
            "^(?P<wsid>\d+)\/(?P<objid>\d+)(\/(?P<ver>\d+))?$")
        # a ref may be a ';'-separated reference path; every step must match
        ref_path = ref.strip().split(";")
        for step in ref_path:
            if not obj_ref_regex.match(step):
                return False
        return True

    def _check_ref_type(self, ref, allowed_types):
        """
        Validates the object type of ref against the list of allowed types.
        If it passes, this returns True, otherwise False.
        Really, all this does is verify that at least one of the strings in
        allowed_types is a substring of the ref object type name.
        Ex1:
        ref = "KBaseGenomes.Genome-4.0"
        allowed_types = ["assembly", "KBaseFile.Assembly"]
        returns False
        Ex2:
        ref = "KBaseGenomes.Genome-4.0"
        allowed_types = ["assembly", "genome"]
        returns True
        """
        obj_type = self._get_object_type(ref).lower()
        for t in allowed_types:
            if t.lower() in obj_type:
                return True
        return False

    def _get_object_type(self, ref):
        """
        Fetches and returns the typed object name of ref from the given
        workspace url. If that object doesn't exist, or there's another
        Workspace error, this raises a RuntimeError exception.
        """
        info = self.ws_client.get_object_info3({'objects': [{'ref': ref}]})
        obj_info = info.get('infos', [[]])[0]
        if len(obj_info) == 0:
            raise RuntimeError(
                "An error occurred while fetching type info from the Workspace. "
                "No information returned for reference {}".format(ref))
        return obj_info[2]  # object_info[2] is the type string

    def _get_fasta_from_assembly(self, assembly_ref):
        """
        From an assembly or contigset, this uses a data file to build a FASTA
        file and return the path to it.
        """
        allowed_types = [
            'KBaseFile.Assembly', 'KBaseGenomeAnnotations.Assembly',
            'KBaseGenomes.ContigSet'
        ]
        if not self._check_ref_type(assembly_ref, allowed_types):
            raise ValueError(
                "The reference {} cannot be used to fetch a FASTA file".format(
                    assembly_ref))
        au = AssemblyUtil(self.callback_url)
        return au.get_assembly_as_fasta({'ref': assembly_ref})

    def generate_report(self, contig_file_name, params, out_dir, wsname):
        """
        generate_report: reporting results

        Computes contig statistics from the assembled FASTA, runs QUAST, zips
        the output folder, and creates an extended KBase report.
        Returns (report_name, report_ref).
        """
        log('Generating and saving report')
        contig_file_with_path = os.path.join(out_dir, contig_file_name)
        fasta_stats = self._load_stats(contig_file_with_path)
        lengths = [fasta_stats[contig_id] for contig_id in fasta_stats]

        assembly_ref = params[self.PARAM_IN_WS] + '/' + params[
            self.PARAM_IN_CS_NAME]

        report_text = ''
        report_text += 'MaSuRCA results saved to: ' + wsname + '/' + out_dir + '\n'
        report_text += 'Assembly saved to: ' + assembly_ref + '\n'
        report_text += 'Assembled into ' + str(len(lengths)) + ' contigs.\n'
        report_text += 'Avg Length: ' + str(
            sum(lengths) / float(len(lengths))) + ' bp.\n'

        # compute a simple contig length distribution
        bins = 10
        counts, edges = np.histogram(lengths, bins)
        report_text += 'Contig Length Distribution (# of contigs -- min to max ' + 'basepairs):\n'
        for c in range(bins):
            report_text += (' ' + str(counts[c]) + '\t--\t' + str(edges[c]) +
                            ' to ' + str(edges[c + 1]) + ' bp\n')
        print('Running QUAST')
        quastret = self.kbq.run_QUAST({
            'files': [{
                'path': contig_file_with_path,
                'label': params[self.PARAM_IN_CS_NAME]
            }]
        })
        output_files = self._generate_output_file_list(out_dir)
        print('Saving report')
        report_output = self.kbr.create_extended_report({
            'message': report_text,
            'objects_created': [{
                'ref': assembly_ref,
                'description': 'Assembled contigs'
            }],
            'direct_html_link_index': 0,
            'file_links': output_files,
            'html_links': [{
                'shock_id': quastret['shock_id'],
                'name': 'report.html',
                'label': 'QUAST report'
            }],
            'report_object_name': 'kb_masurca_report_' + str(uuid.uuid4()),
            'workspace_name': params[self.PARAM_IN_WS]
        })
        report_name = report_output['name']
        report_ref = report_output['ref']
        return report_name, report_ref

    def validate_params(self, params):
        """
        validate_params: checks params passed to run_masurca_app method and set
        default values

        Raises ValueError on a missing/invalid mandatory parameter; mutates and
        returns params (dna_source-derived defaults, create_report default).
        """
        # log('Start validating run_masurca_app parameters:\n{}'.format(
        # json.dumps(params, indent=1)))

        # check for mandatory parameters
        if params.get(self.PARAM_IN_WS, None) is None:
            raise ValueError(self.PARAM_IN_WS + ' parameter is mandatory')
        if self.PARAM_IN_THREADN not in params:
            raise ValueError(self.PARAM_IN_THREADN + ' parameter is mandatory')
        if params.get(self.PARAM_IN_JF_SIZE, None) is None:
            raise ValueError(self.PARAM_IN_JF_SIZE + ' parameter is mandatory')
        if params.get(self.PARAM_IN_READS_LIBS, None) is None:
            raise ValueError(self.PARAM_IN_READS_LIBS +
                             ' parameter is mandatory')
        if type(params[self.PARAM_IN_READS_LIBS]) != list:
            raise ValueError(self.PARAM_IN_READS_LIBS + ' must be a list')
        if params.get(self.PARAM_IN_CS_NAME, None) is None:
            raise ValueError('Parameter {} is mandatory!'.format(
                self.PARAM_IN_CS_NAME))
        if self.INVALID_WS_OBJ_NAME_RE.search(params[self.PARAM_IN_CS_NAME]):
            raise ValueError('Invalid workspace object name: {}.'.format(
                params[self.PARAM_IN_CS_NAME]))

        # bacteria get tighter jump coverage and a looser scaffolder error rate
        if 'dna_source' in params:
            dna_src = params.get('dna_source')
            if dna_src == 'bacteria':
                params['limit_jump_coverage'] = 60
                params['cgwErrorRate'] = 0.25
            else:
                params['limit_jump_coverage'] = 300
                params['cgwErrorRate'] = 0.15
        if params.get('create_report', None) is None:
            params['create_report'] = 0
        return params

    def construct_masurca_assembler_cfg(self, params):
        """
        Assemble the full MaSuRCA config.txt from the input params: gathers all
        reads inputs, then rewrites the DATA and PARAMETERS sections of the
        bundled config_template.txt. Returns the config file path, or '' when
        there is nothing to run (no reads/parameters) or on an IO error.
        """
        # STEP 1: get the working folder housing the config.txt file and the masurca results
        wsname = params[self.PARAM_IN_WS]
        config_file_path = os.path.join(self.proj_dir, 'config.txt')

        # STEP 2.1: retrieve the reads data from input parameter
        pe_reads_data = self._get_pereads_info(params)
        jp_reads_data = []
        if params.get(self.PARAM_IN_JUMP_LIBS, None):
            jp_reads_data = self._get_jpreads_info(params)
            if 'jp_mean' not in params or type(params['jp_mean']) != int:
                params['jp_mean'] = 3600
            if 'jp_stdev' not in params or type(params['jp_stdev']) != int:
                params['jp_stdev'] = 200

        # STEP 2.2: PACBIO reads must be in a single FASTA file and supplied as PACBIO=reads.fa;
        assbl_types = [
            'KBaseFile.Assembly', 'KBaseGenomeAnnotations.Assembly',
            'KBaseGenomes.ContigSet'
        ]
        reads_types = [
            'KBaseAssembly.SingleEndLibrary', 'KBaseFile.SingleEndLibrary',
            'KBaseAssembly.PairedEndLibrary', 'KBaseFile.PairedEndLibrary'
        ]
        pb_reads_file = ''
        if params.get('pacbio_reads', None):
            pb_ref = params['pacbio_reads']
            # the ref may point at either an assembly object or a reads library
            if self._check_ref_type(pb_ref, assbl_types):
                pb_reads_file = (self._get_fasta_from_assembly(pb_ref)).get(
                    'path', '')
            else:
                if self._check_ref_type(pb_ref, reads_types):
                    pb_rd = self._get_kbreads_info(wsname, [pb_ref])
                    pb_reads_file = pb_rd[0]['fwd_file']
                    if pb_rd[0].get('rev_file', None):
                        pb_reads_file += ' ' + pb_rd[0]['rev_file']

        # STEP 2.3: NANOPORE reads must be in a single FASTA/FASTQ file and supplied
        # as NANOPORE=reads.fa
        np_reads_file = ''
        if params.get('nanopore_reads', None):
            np_ref = params['nanopore_reads']
            if self._check_ref_type(np_ref, assbl_types):
                np_reads_file = (self._get_fasta_from_assembly(np_ref)).get(
                    'path', '')
            else:
                if self._check_ref_type(np_ref, reads_types):
                    np_rd = self._get_kbreads_info(wsname, [np_ref])
                    np_reads_file = np_rd[0]['fwd_file']
                    if np_rd[0].get('rev_file', None):
                        np_reads_file += ' ' + np_rd[0]['rev_file']

        # STEP 2.4: any OTHER sequence data (454, Sanger, Ion torrent, etc) must be first
        # converted into Celera Assembler compatible .frg files
        # (see http://wgsassembler.sourceforge.com) and supplied as OTHER=file.frg
        other_frg = ''
        if params.get('other_frg_file', None):
            other_frg = params['other_frg_file']

        # STEP 3: construct and save the config.txt file for running masurca
        try:
            # STEP 3.1: replace the 'DATA...END' portion of the config_template.txt file
            data_str = self._get_data_portion(pe_reads_data, jp_reads_data,
                                              pb_reads_file, np_reads_file,
                                              other_frg)
            if data_str == '':  # no reads libraries are specified, no further actions
                return ''
            config_template = ''
            with codecs.open(os.path.join(os.path.dirname(__file__),
                                          'config_template.txt'),
                             mode='r',
                             encoding='utf-8') as config_template_file:
                config_template = config_template_file.read()
            begin_patn1 = "DATA\n"
            end_patn1 = "END\nPARAMETERS\n"
            config_with_data = self._replaceSectionText(
                config_template, begin_patn1, end_patn1, data_str)
            # log("\n***After DATA section replacement:\n{}\nSaved at {}".format(
            # config_with_data.encode('utf-8').decode('utf-8'), config_file_path))
            with codecs.open(config_file_path, mode='w',
                             encoding='utf-8') as config_file:
                config_file.write(config_with_data)

            # STEP 3.2: replace the 'PARAMETERS...END' portion of the config_file file saved above
            param_str = self._get_parameters_portion(params)
            if param_str == '':  # no parameters are specified, no further actions
                return ''
            previous_config = ''
            with codecs.open(config_file_path, mode='r',
                             encoding='utf-8') as previous_config_file:
                previous_config = previous_config_file.read()
            begin_patn2 = "PARAMETERS\n"
            end_patn2 = "END\n"
            final_config = self._replaceSectionText(previous_config,
                                                    begin_patn2, end_patn2,
                                                    param_str)
            log("\n***Configuration file content:\n{}\nSaved at {}".format(
                final_config.encode('utf-8').decode('utf-8'),
                config_file_path))
            with codecs.open(config_file_path, mode='w',
                             encoding='utf-8') as config_file:
                config_file.write(final_config)
        except IOError as ioerr:
            log('Creation of the config.txt file raised error:\n')
            pprint(ioerr)
            return ''
        else:
            return config_file_path

    def generate_assemble_script(self, config_file):
        """
        Run the masurca binary on the given config file to generate
        assemble.sh; returns its path, or '' when the config file is missing.
        Raises ValueError when masurca itself fails.
        """
        if os.path.isfile(config_file):
            f_dir, f_nm = os.path.split(config_file)
            m_cmd = [self.MaSuRCA_BIN]
            m_cmd.append(config_file)
            try:
                self.prog_runner.run(m_cmd, f_dir)
                assemble_file = os.path.join(f_dir, 'assemble.sh')
                log('Created the assemble.sh file at {}.\n'.format(
                    assemble_file))
                return assemble_file
            except ValueError as ve:
                log('Error generating assemble.sh file: \n{}'.format(ve))
                raise ValueError('Failed to generate assemble.sh file!')
        else:
            log("The config file {} is not found.\n".format(config_file))
            log('NO assemble.sh file created.\n')
        return ''

    def run_assemble(self, asmbl_file):
        """
        Execute the generated assemble.sh with bash; returns the process exit
        code (1 when the script is missing or the run raises ValueError).
        """
        exit_code = 1
        if os.path.isfile(asmbl_file):
            log("The assemble.sh file exists at {}\n".format(asmbl_file))
            f_dir, f_nm = os.path.split(asmbl_file)
            a_cmd = ['/bin/bash']
            a_cmd.append(asmbl_file)
            log("The working directory is {}\n".format(f_dir))
            log("The assembling command is {}\n".format(' '.join(a_cmd)))
            try:
                exit_code = self.prog_runner.run(a_cmd, f_dir)
            except ValueError as ve:
                log('Error running assemble: \n{}'.format(ve))
        else:
            log("The assemble.sh file {} is not found.".format(asmbl_file))
        return exit_code

    def save_assembly(self, contig_fa, wsname, a_name):
        """
        Upload the assembled contig FASTA to the workspace as an Assembly
        object named a_name; logs and does nothing if the file is missing.
        """
        if os.path.isfile(contig_fa):
            log('Uploading FASTA file to Assembly...')
            self.au.save_assembly_from_fasta({
                'file': {
                    'path': contig_fa
                },
                'workspace_name': wsname,
                'assembly_name': a_name
            })
        else:
            log("The contig file {} is not found.".format(contig_fa))
class AMAUtils():
    """Utilities for retrieving KBaseMetagenomes.AnnotatedMetagenomeAssembly
    objects and their (shock-stored) feature data from the KBase Workspace."""

    def __init__(self, ws_url, cb_url, token, scratch):
        """
        :param ws_url: Workspace service endpoint URL
        :param cb_url: SDK callback server URL (used to reach DataFileUtil)
        :param token: KBase auth token
        :param scratch: writable scratch directory for downloaded files
        """
        self.ws = Workspace(ws_url, token=token)
        self.cb_url = cb_url
        self.token = token
        self.scratch = scratch

    def _confirm_ws_type(self, ref):
        """confirm whether 'ref' is of type
        'KBaseMetagenomes.AnnotatedMetagenomeAssembly'; if not, throw error.

        :param ref: workspace object reference
        :raises ValueError: if ref is None or points at any other object type
        """
        if ref is None:
            raise ValueError(" 'ref' argument must be specified.")
        obj_info = self.ws.get_object_info3({'objects': [{
            'ref': ref
        }]})['infos'][0]
        # object_info[2] is the type string, e.g.
        # 'KBaseMetagenomes.AnnotatedMetagenomeAssembly-1.0'
        obj_type = obj_info[2]
        if 'KBaseMetagenomes.AnnotatedMetagenomeAssembly' not in obj_type:
            # BUGFIX: message previously said "KBaseMetagenome." (missing 's'),
            # contradicting the type actually required by the check above.
            raise ValueError(
                f"input ref '{ref}' is of type {obj_type}. function "
                "'get_annotated_metagenome_assembly' requires objects"
                " of type KBaseMetagenomes.AnnotatedMetagenomeAssembly")

    def get_annotated_metagenome_assembly(self, params):
        """
        Fetch one AnnotatedMetagenomeAssembly object from the workspace.

        params:
            ref - workspace reference
            included_fields - list of fields to include; when omitted the
                whole object is fetched.
        output:
            genomes - the returned data fields from the workspace request.
        """
        ref = params.get('ref', None)
        included_fields = params.get('included_fields', None)
        self._confirm_ws_type(ref)
        get_obj_params = {'ref': ref}
        if included_fields is not None:
            get_obj_params['included'] = included_fields
        data = self.ws.get_objects2({'objects': [get_obj_params]})['data']
        return {'genomes': data}

    def get_annotated_metagenome_assembly_features(self, params):
        """
        Download and return the feature list of an AnnotatedMetagenomeAssembly.

        params:
            ref - workspace reference for the
                KBaseMetagenomes.AnnotatedMetagenomeAssembly object
            feature_type - (optional) keep only features of this type; must be
                one of cds, gene, mrna, trna, rrna, repeat_region
            only_ids - (optional) when truthy, return only feature ids
        output:
            features - list of features, each representing a dict.
        :raises ValueError: on a bad ref type or unknown feature_type
        """
        ref = params['ref']
        self._confirm_ws_type(ref)
        # only pull the handle to the shock-stored features blob, not the
        # (potentially huge) object itself
        ret = self.ws.get_objects2(
            {"objects": [{
                "ref": ref,
                "included": ["features_handle_ref"]
            }]})['data']
        features_handle_ref = ret[0]['data']['features_handle_ref']
        dfu = DataFileUtil(self.cb_url, token=self.token)
        file_name = 'features.json.gz'
        file_path = os.path.join(self.scratch, file_name)
        shock_ret = dfu.shock_to_file({
            'handle_id': features_handle_ref,
            'file_path': file_path,
            'unpack': "uncompress"
        })
        file_path = shock_ret['file_path']
        with open(file_path) as fd:
            json_features = json.load(fd)

        if params.get('feature_type'):
            accepted_feature_types = [
                "cds", "gene", "mrna", "trna", "rrna", "repeat_region"
            ]
            feat_type = params['feature_type']
            if feat_type.lower() not in accepted_feature_types:
                # BUGFIX: the second string was not an f-string, so the
                # message printed the literal text "{accepted_feature_types}"
                raise ValueError(
                    f"{feat_type} not an accepted feature type; accepted feature"
                    f" types (in lower case) are {accepted_feature_types}")
            json_features = [
                feature for feature in json_features
                if feature['type'].lower() == feat_type.lower()
            ]

        if params.get('only_ids'):
            json_features = [{
                'id': feature['id']
            } for feature in json_features]

        return {'features': json_features}
    def save_variation_from_vcf(self, ctx, params):
        """
        Save a variation (and trait?) object to Kbase given a reference genome,
        object output name, Variant Call Format (VCF) file, and sample attribute file.
        :param params: instance of type "save_variation_input" (## funcdef
           save_variation_from_vcf ## required input params:
           genome_or_assembly_ref: KBaseGenomes.Genome or
           KBaseGenomeAnnotations.Assembly object reference *** variation input
           data *** vcf_staging_file_path: path to location data associated
           with samples variation_object_name: output name for KBase variation
           object *** sample input data *** sample_attribute_ref: x/y/z
           reference to kbase sample attribute optional params: NA output
           report: report_name report_ref HTML visualization: Manhattan plot
           *** Visualization *** plot_maf: generate histogram of minor allele
           frequencies plot_hwe: generate histogram of Hardy-Weinberg
           Equilibrium p-values) -> structure: parameter "workspace_name" of
           String, parameter "genome_or_assembly_ref" of type "obj_ref" (An
           X/Y/Z style reference), parameter "vcf_staging_file_path" of type
           "filepath" (KBase file path to staging files), parameter
           "variation_object_name" of String, parameter "sample_attribute_ref"
           of type "obj_ref" (An X/Y/Z style reference)
        :returns: instance of type "save_variation_output" -> structure:
           parameter "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: report
        #BEGIN save_variation_from_vcf
        genome_or_assembly_ref = params['genome_or_assembly_ref']
        ws_url = self.config['workspace-url']
        wsc = Workspace(ws_url)
        # object_info[2] is the type string of the referenced object
        obj_type = wsc.get_object_info3(
            {'objects': [{
                'ref': genome_or_assembly_ref
            }]})['infos'][0][2]
        # route the single input ref to the parameter key VCFToVariation expects
        if ('KBaseGenomes.Genome' in obj_type):
            params['genome_ref'] = genome_or_assembly_ref
        elif ('KBaseGenomeAnnotations.Assembly' in obj_type):
            params['assembly_ref'] = genome_or_assembly_ref
        else:
            raise ValueError(
                obj_type +
                ' is not the right input for this method. Valid input include KBaseGenomes.Genome or KBaseGenomeAnnotations.Assembly '
            )

        # import the VCF; var_obj[0] is the saved object's object_info tuple
        # (indices: 0=obj_id, 1=name, 4=version, 6=ws_id), var_obj[1] carries
        # import statistics
        vtv = VCFToVariation(self.config, self.shared_folder,
                             self.callback_url)
        var_obj = vtv.import_vcf(params)
        # build the canonical wsid/objid/ver reference for the new object
        var_obj_ref = str(var_obj[0][6]) + "/" + str(
            var_obj[0][0]) + "/" + str(var_obj[0][4])

        upload_message = "Variation object created."
        upload_message += "\nObject #" + str(var_obj[0][0])
        upload_message += "\nObject name: " + str(var_obj[0][1])
        upload_message += "\nGenotypes in variation: " + str(
            var_obj[1]['numgenotypes'])
        upload_message += "\nVariants in VCF file: " + str(
            var_obj[1]['numvariants'])

        report_obj = {
            'objects_created': [{
                'ref': var_obj_ref,
                'description': 'Variation object from VCF file.'
            }],
            'text_message': upload_message
        }
        report_client = KBaseReport(self.callback_url)
        report_create = report_client.create({
            'report': report_obj,
            'workspace_name': params['workspace_name']
        })
        report = {
            "report_name": report_create['name'],
            "report_ref": report_create['ref'],
            "workspace_name": params["workspace_name"]
        }
        #END save_variation_from_vcf

        # At some point might do deeper type checking...
        if not isinstance(report, dict):
            raise ValueError('Method save_variation_from_vcf return value ' +
                             'report is not type dict as required.')
        # return the results
        return [report]
class StringTieUtil:
    """
    Helper class that drives StringTie runs for the kb_stringtie KBase app:
    it fetches alignment/genome objects from the Workspace, generates
    reference annotation (GTF) files, shells out to the bundled StringTie /
    gffread / gffcompare binaries, and saves Expression / ExpressionSet
    objects back to the Workspace.
    """

    # Absolute paths of the command-line tools bundled in the SDK container.
    STRINGTIE_TOOLKIT_PATH = "/kb/deployment/bin/StringTie"
    GFFREAD_TOOLKIT_PATH = "/kb/deployment/bin/gffread"
    GFFCOMPARE_TOOLKIT_PATH = "/kb/deployment/bin/gffcompare"

    # Maps app parameter names to stringtie command-line flags.
    OPTIONS_MAP = {
        "output_transcripts": "-o",
        "gene_abundances_file": "-A",
        "num_threads": "-p",
        "fr_firststrand": "--rf",
        "fr_secondstrand": "--fr",
        "cov_refs_file": "-C",
        "junction_base": "-a",
        "junction_coverage": "-j",
        "disable_trimming": "-t",
        "min_locus_gap_sep_value": "-g",
        "ballgown_mode": "-B",
        "skip_reads_with_no_ref": "-e",
        "maximum_fraction": "-M",
        "label": "-l",
        "gtf_file": "-G",
        "min_length": "-m",
        "min_read_coverage": "-c",
        "min_isoform_abundance": "-f",
    }

    # Parameters whose flags take no value: when truthy, the flag is emitted
    # with a blank placeholder value (see _generate_command).
    BOOLEAN_OPTIONS = [
        "disable_trimming", "ballgown_mode", "skip_reads_with_no_ref"
    ]

    def _validate_run_stringtie_params(self, params):
        """
        _validate_run_stringtie_params: validates params passed to run_stringtie method

        Raises ValueError if any required key is absent (presence only; the
        values themselves are not validated here).
        """
        log("start validating run_stringtie params")

        # check for required parameters
        for p in [
                "alignment_object_ref",
                "workspace_name",
                "expression_suffix",
                "expression_set_suffix",
        ]:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path

        Equivalent to `mkdir -p`: silently succeeds if the directory already
        exists, re-raises any other OSError.
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _generate_command(self, params):
        """
        _generate_command: generate stringtie command

        Builds the full shell command string from OPTIONS_MAP; boolean flags
        get a single-space value so the flag is emitted without an argument.
        The input SAM/BAM file (params['input_file']) goes last.
        """
        command = self.STRINGTIE_TOOLKIT_PATH + "/stringtie "

        for key, option in self.OPTIONS_MAP.items():
            option_value = params.get(key)
            if key in self.BOOLEAN_OPTIONS and option_value:
                option_value = " "
            if option_value:
                command += "{} {} ".format(option, option_value)

        command += "{} ".format(params.get("input_file"))

        log("generated stringtie command: {}".format(command))

        return command

    def _run_command(self, command):
        """
        _run_command: run command and print result

        Raises ValueError (with captured stdout) on a non-zero exit code.
        NOTE(review): shell=True with a string command — safe only as long as
        every interpolated value (file paths, labels) is trusted; consider a
        list argv with shell=False if user-controlled values can reach here.
        """
        log("start executing command:\n{}".format(command))

        pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        output = pipe.communicate()[0]
        exitCode = pipe.returncode

        if exitCode == 0:
            log("Executed command:\n{}\n".format(command) +
                "Exit Code: {}\nOutput:\n{}".format(exitCode, output))
        else:
            error_msg = "Error running command:\n{}\n".format(command)
            error_msg += "Exit Code: {}\nOutput:\n{}".format(exitCode, output)
            raise ValueError(error_msg)

    def _run_gffread(self, gff_path, gtf_path):
        """
        _run_gffread: run gffread script, converting a GFF file to GTF

        ref: http://ccb.jhu.edu/software/stringtie/gff.shtml
        """
        log("converting gff to gtf")

        command = self.GFFREAD_TOOLKIT_PATH + "/gffread "
        command += "-E {0} -T -o {1}".format(gff_path, gtf_path)

        self._run_command(command)

    def _run_gffcompare(self, gff_path, gtf_path):
        """
        _run_gffcompare: run gffcompare script

        Compares/annotates gtf_path against the reference gff_path; output
        files are written with the "gffcmp" prefix next to gtf_path.
        NOTE(review): the log message below says "converting gff to gtf" but
        this method runs gffcompare, not a conversion.
        ref: http://ccb.jhu.edu/software/stringtie/gff.shtml
        """
        log("converting gff to gtf")

        output = os.path.dirname(gtf_path) + "/gffcmp"
        command = self.GFFCOMPARE_TOOLKIT_PATH + "/gffcompare "
        command += "-r {} -G -o {} {}".format(gff_path, output, gtf_path)

        self._run_command(command)

    def _get_input_file(self, alignment_ref):
        """
        _get_input_file: get input SAM/BAM file from Alignment object

        Prefers a *_sorted.bam in the downloaded directory, falling back to
        any .bam not ending in "sorted"; raises ValueError if neither exists.
        """
        log("getting bam file from alignment")

        bam_file_dir = self.rau.download_alignment(
            {"source_ref": alignment_ref})["destination_dir"]

        files = os.listdir(bam_file_dir)
        bam_file_list = [
            file for file in files if re.match(r".*\_sorted\.bam", file)
        ]
        if not bam_file_list:
            bam_file_list = [
                file for file in files
                if re.match(r".*(?<!sorted)\.bam", file)
            ]

        if not bam_file_list:
            raise ValueError("Cannot find .bam file from alignment {}".format(
                alignment_ref))

        bam_file_name = bam_file_list[0]

        bam_file = os.path.join(bam_file_dir, bam_file_name)

        return bam_file

    def _get_gtf_file(self, alignment_ref, result_directory):
        """
        _get_gtf_file: get the reference annotation file (in GTF format)

        Fetches the genome referenced by the alignment, generates a GTF from
        it, then writes a copy in which every record carrying a gene_id but
        no gene_name gets gene_name set to the gene_id (StringTie reports
        gene names from this attribute).
        """
        alignment_data = self.ws.get_objects2(
            {"objects": [{
                "ref": alignment_ref
            }]})["data"][0]["data"]

        genome_ref = alignment_data.get("genome_id")
        # annotation_file = self._create_gtf_file(genome_ref, result_directory)
        annotation_file = self._create_gtf_annotation_from_genome(
            genome_ref, result_directory)

        gene_name_annotation_file = (annotation_file.split(".gtf")[0] +
                                     "_append_name.gtf")

        with open(gene_name_annotation_file, "w") as output_file:
            with open(annotation_file, "r") as input_file:
                for line in input_file:
                    if ('gene_id "' in line) and ('gene_name "' not in line):
                        # copy gene_id into a gene_name attribute
                        line = line.replace("\n", "")
                        gene_id = line.split('gene_id "')[1].split('"')[0]
                        line += ' gene_name "{}";\n'.format(gene_id)
                        output_file.write(line)
                    else:
                        output_file.write(line)

        return gene_name_annotation_file

    def _create_gtf_annotation_from_genome(self, genome_ref, result_directory):
        """
        Create reference annotation file from genome

        Resolves the genome's assembly (contigset_ref or assembly_ref),
        copies the assembly FASTA into result_directory, exports the genome
        as GFF and converts it to GTF via gffread when needed. Returns the
        GTF path; wraps any failure in a ValueError with the traceback.
        """
        # Only need the assembly reference fields, not the whole genome.
        ref = self.ws.get_object_subset([{
            "ref": genome_ref,
            "included": ["contigset_ref", "assembly_ref"]
        }])
        contig_id = None
        if "contigset_ref" in ref[0]["data"]:
            contig_id = ref[0]["data"]["contigset_ref"]
        elif "assembly_ref" in ref[0]["data"]:
            contig_id = ref[0]["data"]["assembly_ref"]
        if contig_id is None:
            raise ValueError(
                "Genome at {0} does not have reference to the assembly object".
                format(genome_ref))
        print(contig_id)
        log("Generating GFF file from Genome")
        try:
            # genome_ref;contig_id is a reference-path lookup through the genome
            ret = self.au.get_assembly_as_fasta(
                {"ref": genome_ref + ";" + contig_id})
            fa_output_file = ret["path"]

            if os.path.dirname(fa_output_file) != result_directory:
                shutil.copy(fa_output_file, result_directory)

            # get the GFF
            ret = self.gfu.genome_to_gff({
                "genome_ref": genome_ref,
                "target_dir": result_directory
            })
            genome_gff_file = ret["file_path"]
            gtf_ext = ".gtf"

            if not genome_gff_file.endswith(gtf_ext):
                gtf_path = os.path.splitext(genome_gff_file)[0] + ".gtf"
                self._run_gffread(genome_gff_file, gtf_path)
            else:
                gtf_path = genome_gff_file

            log("gtf file : " + gtf_path)
        except Exception:
            raise ValueError(
                "Generating GTF file from Genome Annotation object Failed : {}"
                .format("".join(traceback.format_exc())))

        return gtf_path

    def _create_gtf_file(self, genome_ref, result_directory):
        """
        _create_gtf_file: create reference annotation file from genome

        Direct GTF export via GenomeFileUtil (currently unused; see the
        commented-out call in _get_gtf_file).
        """
        log("start generating reference annotation file")

        genome_gtf_file = self.gfu.genome_to_gff({
            "genome_ref": genome_ref,
            "target_dir": result_directory,
            "is_gtf": True
        })["file_path"]

        return genome_gtf_file

    def _save_expression(
        self,
        result_directory,
        alignment_ref,
        workspace_name,
        expression_suffix,
        genome_ref="",
        transcripts=0,
    ):
        """
        _save_expression: save Expression object to workspace

        The new object's name is derived from the alignment name, replacing a
        trailing "_alignment"/"Alignment" with expression_suffix (or simply
        appending the suffix). Returns the new object reference.
        """
        log("start saving Expression object")

        alignment_data_object = self.ws.get_objects2(
            {"objects": [{
                "ref": alignment_ref
            }]})["data"][0]

        alignment_name = alignment_data_object["info"][1]
        if re.match(".*_*[Aa]lignment", alignment_name):
            expression_obj_name = re.sub("_*[Aa]lignment", expression_suffix,
                                         alignment_name)
        else:
            expression_obj_name = alignment_name + expression_suffix

        destination_ref = workspace_name + "/" + expression_obj_name
        upload_expression_params = {
            "destination_ref": destination_ref,
            "source_dir": result_directory,
            "alignment_ref": alignment_ref,
            "tool_used": "StringTie",
            "tool_version": "1.3.3",
            "genome_ref": genome_ref,
            "transcripts": transcripts,
        }

        expression_ref = self.eu.upload_expression(
            upload_expression_params)["obj_ref"]

        return expression_ref

    def _save_expression_set(
        self,
        alignment_expression_map,
        alignment_set_ref,
        workspace_name,
        expression_set_suffix,
        genome_ref=None,
    ):
        """
        _save_expression_set: save ExpressionSet object to workspace

        Collects each per-alignment expression ref/label pair into a set
        object named after the source alignment set (suffix-substituted, as
        in _save_expression). Returns the set reference.
        """
        log("start saving ExpressionSet object")

        items = []
        for alignment_expression in alignment_expression_map:
            items.append({
                "ref": alignment_expression.get("expression_obj_ref"),
                "label": alignment_expression.get("alignment_label"),
            })

        expression_set_data = {
            "description": "ExpressionSet using StringTie",
            "items": items,
        }

        alignment_set_data_object = self.ws.get_objects2(
            {"objects": [{
                "ref": alignment_set_ref
            }]})["data"][0]

        alignment_set_name = alignment_set_data_object["info"][1]
        if re.match(".*_*[Aa]lignment_*[Ss]et", alignment_set_name):
            expression_set_name = re.sub("_*[Aa]lignment_*[Ss]et",
                                         expression_set_suffix,
                                         alignment_set_name)
        else:
            expression_set_name = alignment_set_name + expression_set_suffix

        expression_set_save_params = {
            "data": expression_set_data,
            "workspace": workspace_name,
            "genome_ref": genome_ref,
            "output_object_name": expression_set_name,
        }

        save_result = self.set_client.save_expression_set_v1(
            expression_set_save_params)
        expression_set_ref = save_result["set_ref"]

        return expression_set_ref

    def _save_expression_matrix(self, expressionset_ref, workspace_name):
        """
        _save_expression_matrix: save FPKM and TPM ExpressionMatrix

        Returns the dict from ExpressionUtils, expected to contain
        exprMatrix_FPKM_ref and exprMatrix_TPM_ref (assumed from callers'
        usage — TODO confirm against ExpressionUtils).
        """
        log("start saving ExpressionMatrix object")

        expression_set_name = self.ws.get_object_info(
            [{
                "ref": expressionset_ref
            }], includeMetadata=None)[0][1]

        output_obj_name_prefix = re.sub("_*[Ee]xpression_*[Ss]et", "",
                                        expression_set_name)

        upload_expression_matrix_params = {
            "expressionset_ref": expressionset_ref,
            "output_obj_name": output_obj_name_prefix,
            "workspace_name": workspace_name,
        }

        expression_matrix_refs = self.eu.get_expressionMatrix(
            upload_expression_matrix_params)

        return expression_matrix_refs
    def _generate_output_file_list(self, result_directory):
        """
        _generate_output_file_list: zip result files and generate file_links for report

        Zips everything under result_directory (skipping .DS_Store) into a
        fresh scratch directory; if a merge_result subdirectory exists, the
        merged GTF is also attached as its own file link.
        """
        log("start packing result files")
        output_files = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file = os.path.join(output_directory, "stringtie_result.zip")

        with zipfile.ZipFile(result_file,
                             "w",
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as zip_file:
            for root, dirs, files in os.walk(result_directory):
                for file in files:
                    if not file.endswith(".DS_Store"):
                        zip_file.write(
                            os.path.join(root, file),
                            os.path.join(os.path.basename(root), file),
                        )

        output_files.append({
            "path": result_file,
            "name": os.path.basename(result_file),
            "label": os.path.basename(result_file),
            "description": "File(s) generated by StringTie App",
        })
        result_dirs = os.listdir(result_directory)
        if "merge_result" in result_dirs:
            merge_file = os.path.join(result_directory, "merge_result",
                                      "stringtie_merge.gtf")
            output_files.append({
                "path": merge_file,
                "name": os.path.basename(merge_file),
                "label": os.path.basename(merge_file),
                "description": "merge file generated by StringTie App",
            })

        return output_files

    def _generate_merge_html_report(self, result_directory):
        """
        _generate_merge_html_report: generate html summary report

        Produces a report.html listing every per-alignment result file by
        substituting an HTML table into report_template.html.
        """
        log("start generating merge html report")
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, "report.html")
        result_dirs = os.listdir(result_directory)

        Overview_Content = ""
        Overview_Content += "<br/><table><tr><th>Generated Files</th>"
        Overview_Content += "<th></th></tr>"
        Overview_Content += "<tr><th>Directory</th><th>File Name</th></tr>"
        for result_dir in result_dirs:
            result_files = os.listdir(
                os.path.join(result_directory, result_dir))
            result_files.sort()
            first_file = True
            for file_name in result_files:
                if first_file:
                    # only the first row of each directory shows its name
                    Overview_Content += "<tr><td>{}</td>".format(result_dir)
                    Overview_Content += "<td>{}</td></tr>".format(file_name)
                    first_file = False
                else:
                    Overview_Content += "<tr><td>{}</td>".format("")
                    Overview_Content += "<td>{}</td></tr>".format(file_name)
        Overview_Content += "</table>"

        with open(result_file_path, "w") as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__),
                                 "report_template.html"),
                    "r") as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    "<p>Overview_Content</p>", Overview_Content)
                result_file.write(report_template)

        html_report.append({
            "path": result_file_path,
            "name": os.path.basename(result_file_path),
            "label": os.path.basename(result_file_path),
            "description": "HTML summary report for StringTie App",
        })
        return html_report

    def _generate_html_report(self, result_directory, obj_ref):
        """
        _generate_html_report: generate html summary report

        Renders a table of the generated Expression (single) or
        ExpressionSet (and its member Expressions) into the report template.
        NOTE(review): the regex literals here are non-raw strings using \\d
        and an unescaped "." — prefer r"...-\\d+\\.\\d+" (non-raw \\d is
        deprecated in modern Python).
        """
        log("start generating html report")
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, "report.html")

        expression_object = self.ws.get_objects2(
            {"objects": [{
                "ref": obj_ref
            }]})["data"][0]
        expression_info = expression_object["info"]
        expression_data = expression_object["data"]

        expression_object_type = expression_info[2]
        Overview_Content = ""
        if re.match("KBaseRNASeq.RNASeqExpression-\d.\d",
                    expression_object_type):
            Overview_Content += "<br/><table><tr><th>Generated Expression Object</th>"
            Overview_Content += "<th></th></tr>"
            Overview_Content += "<tr><th>Expression Name</th><th>Condition</th></tr>"
            Overview_Content += "<tr><td>{} ({})</td>".format(
                expression_info[1], obj_ref)
            Overview_Content += "<td>{}</td></tr>".format(
                expression_data["condition"])
            Overview_Content += "</table>"
        elif re.match("KBaseSets.ExpressionSet-\d.\d", expression_object_type):
            Overview_Content += (
                "<br/><table><tr><th>Generated ExpressionSet Object</th></tr>")
            Overview_Content += "<tr><td>{} ({})".format(
                expression_info[1], obj_ref)
            Overview_Content += "</td></tr></table>"
            Overview_Content += "<p><br/></p>"
            Overview_Content += "<table><tr><th>Generated Expression Objects</th>"
            Overview_Content += "<th></th></tr>"
            Overview_Content += "<tr><th>Expression Name</th><th>Condition</th></tr>"
            for item in expression_data["items"]:
                # one Workspace fetch per set member to read its condition
                item_expression_object = self.ws.get_objects2(
                    {"objects": [{
                        "ref": item["ref"]
                    }]})["data"][0]
                item_expression_info = item_expression_object["info"]
                item_expression_data = item_expression_object["data"]
                expression_name = item_expression_info[1]
                Overview_Content += "<tr><td>{} ({})</td>".format(
                    expression_name, item["ref"])
                Overview_Content += "<td>{}</td>".format(
                    item_expression_data["condition"])
                Overview_Content += "</tr>"
            Overview_Content += "</table>"

        with open(result_file_path, "w") as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__),
                                 "report_template.html"),
                    "r") as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    "<p>Overview_Content</p>", Overview_Content)
                result_file.write(report_template)

        html_report.append({
            "path": result_file_path,
            "name": os.path.basename(result_file_path),
            "label": os.path.basename(result_file_path),
            "description": "HTML summary report for StringTie App",
        })
        return html_report

    def _generate_merge_report(self, workspace_name, result_directory):
        """
        _generate_merge_report: generate summary report

        Creates a KBaseReport for a merge-only run (file links + merge HTML).
        Returns {"report_name", "report_ref"}.
        """
        log("creating merge report")

        output_files = self._generate_output_file_list(result_directory)
        output_html_files = self._generate_merge_html_report(result_directory)

        report_params = {
            "message": "",
            "workspace_name": workspace_name,
            "file_links": output_files,
            "html_links": output_html_files,
            "direct_html_link_index": 0,
            "html_window_height": 366,
            "report_object_name": "kb_stringtie_report_" + str(uuid.uuid4()),
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            "report_name": output["name"],
            "report_ref": output["ref"]
        }

        return report_output

    def _generate_report(
        self,
        obj_ref,
        workspace_name,
        result_directory,
        exprMatrix_FPKM_ref=None,
        exprMatrix_TPM_ref=None,
        genome_ref=None,
    ):
        """
        _generate_report: generate summary report

        Builds the objects_created list according to whether obj_ref is a
        single Expression or an ExpressionSet (plus its members and the two
        expression matrices), then creates a KBaseReport. Returns
        {"report_name", "report_ref"}.
        """
        log("creating report")

        output_files = self._generate_output_file_list(result_directory)
        output_html_files = self._generate_html_report(result_directory,
                                                       obj_ref)

        expression_object = self.ws.get_objects2(
            {"objects": [{
                "ref": obj_ref
            }]})["data"][0]
        expression_info = expression_object["info"]
        expression_data = expression_object["data"]

        objects_created = []
        expression_object_type = expression_info[2]
        if re.match("KBaseRNASeq.RNASeqExpression-\d+.\d+",
                    expression_object_type):
            objects_created.append({
                "ref": obj_ref,
                "description": "Expression generated by StringTie"
            })
        elif re.match("KBaseSets.ExpressionSet-\d+.\d+",
                      expression_object_type):
            objects_created.append({
                "ref": obj_ref,
                "description": "ExpressionSet generated by StringTie"
            })
            items = expression_data["items"]
            for item in items:
                objects_created.append({
                    "ref": item["ref"],
                    "description": "Expression generated by StringTie",
                })
            objects_created.append({
                "ref": exprMatrix_FPKM_ref,
                "description": "FPKM ExpressionMatrix generated by StringTie",
            })
            objects_created.append({
                "ref": exprMatrix_TPM_ref,
                "description": "TPM ExpressionMatrix generated by StringTie",
            })
        if genome_ref:
            objects_created.append({
                "ref": genome_ref,
                "description": "Genome containing novel transcripts generated "
                               "by StringTie",
            })

        report_params = {
            "message": "",
            "workspace_name": workspace_name,
            "file_links": output_files,
            "objects_created": objects_created,
            "html_links": output_html_files,
            "direct_html_link_index": 0,
            "html_window_height": 366,
            "report_object_name": "kb_stringtie_report_" + str(uuid.uuid4()),
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            "report_name": output["name"],
            "report_ref": output["ref"]
        }

        return report_output

    def _process_alignment_object(self, params):
        """
        _process_alignment_object: process KBaseRNASeq.RNASeqAlignment type input object

        Downloads the BAM, resolves/creates the reference GTF, runs the
        stringtie command in a per-alignment result directory, and (unless
        generate_ws_object is explicitly falsy) saves an Expression object.
        NOTE(review): open(...).read() below leaks a file handle — wrap in a
        `with` block.
        """
        log("start processing RNASeqAlignment object\n")
        log("params:\n{}".format(json.dumps(params, indent=1)))
        alignment_ref = params.get("alignment_ref")

        alignment_set_object = self.ws.get_objects2(
            {"objects": [{
                "ref": alignment_ref
            }]})["data"][0]

        alignment_info = alignment_set_object["info"]
        alignment_data = alignment_set_object["data"]

        alignment_name = alignment_info[1]
        alignment_label = alignment_data["condition"]

        result_directory = os.path.join(
            self.scratch, alignment_name + "_" + str(uuid.uuid4()))
        self._mkdir_p(result_directory)

        # input files
        if not params.get("gtf_file"):
            params["gtf_file"] = self._get_gtf_file(alignment_ref,
                                                    result_directory)
            if params.get("label"):
                # the transcript prefix must not collide with existing IDs
                if params["label"] in open(params["gtf_file"]).read():
                    raise ValueError(
                        "Provided prefix for transcripts matches an existing "
                        "feature ID. Please select a different label for "
                        "transcripts.")
        else:
            shutil.copy(params.get("gtf_file"), result_directory)
        params["input_file"] = self._get_input_file(alignment_ref)
        log("using {} as reference annotation file.".format(
            params.get("gtf_file")))

        # output files
        self.output_transcripts = "transcripts.gtf"
        params["output_transcripts"] = os.path.join(result_directory,
                                                    self.output_transcripts)

        self.gene_abundances_file = "genes.fpkm_tracking"
        params["gene_abundances_file"] = os.path.join(
            result_directory, self.gene_abundances_file)

        command = self._generate_command(params)
        self._run_command(command)

        if params.get("exchange_gene_ids"):
            exchange_gene_ids(result_directory)

        if "generate_ws_object" in params and not params.get(
                "generate_ws_object"):
            log("skip generating expression object")
            expression_obj_ref = ""
        else:
            expression_obj_ref = self._save_expression(
                result_directory,
                alignment_ref,
                params.get("workspace_name"),
                params["expression_suffix"],
                params.get("genome_ref"),
                params.get("novel_isoforms", 0),
            )

        returnVal = {
            "result_directory": result_directory,
            "expression_obj_ref": expression_obj_ref,
            "alignment_ref": alignment_ref,
            "annotation_file": params["gtf_file"],
            "alignment_label": alignment_label,
        }

        return returnVal

    def _process_alignment_set_object(self, params):
        """
        _process_alignment_set_object: process KBaseRNASeq.RNASeqAlignmentSet type input object

        Fans _process_alignment_object out over a process pool (one task per
        set member), collects the per-alignment result directories into one
        combined directory, and optionally saves an ExpressionSet plus
        FPKM/TPM matrices.
        NOTE(review): the worker wrapper uses a bare `except:` — it is a
        deliberate catch-all so failures survive the pool boundary, but
        `except Exception:` would avoid trapping KeyboardInterrupt/SystemExit.
        """
        log("start processing AlignmentSet object\nparams:\n{}".format(
            json.dumps(params, indent=1)))

        alignment_set_ref = params.get("alignment_set_ref")

        alignment_set = self.set_client.get_reads_alignment_set_v1({
            "ref": alignment_set_ref,
            "include_item_info": 0,
            "include_set_item_ref_paths": 1,
        })

        # pull down the genome once so as to avoid duplicate effort
        if not params.get("gtf_file"):
            alignment_ref = alignment_set["data"]["items"][0]["ref_path"]
            params["gtf_file"] = self._get_gtf_file(alignment_ref,
                                                    self.scratch)
            if params.get("label"):
                if params["label"] in open(params["gtf_file"]).read():
                    raise ValueError(
                        "Provided prefix for transcripts matches an existing "
                        "feature ID. Please select a different label for "
                        "transcripts.")

        def wrapped_process_alignment_object(params):
            # runs in a pool worker: convert any failure into a sentinel dict
            # so the parent can re-raise with context after pool.map returns
            try:
                returnVal = self._process_alignment_object(params)
            except:
                log("caught exception in worker")
                exctype, value = sys.exc_info()[:2]
                returnVal = {"exception": "{}: {}".format(exctype, value)}
            return returnVal

        mul_processor_params = []
        for alignment in alignment_set["data"]["items"]:
            alignment_ref = alignment["ref_path"]
            alignment_upload_params = params.copy()
            alignment_upload_params["alignment_ref"] = alignment_ref
            mul_processor_params.append(alignment_upload_params)

        cpus = min(params.get("num_threads"), multiprocessing.cpu_count())
        pool = Pool(ncpus=cpus)
        log("running _process_alignment_object with {} cpus".format(cpus))
        alignment_expression_map = pool.map(wrapped_process_alignment_object,
                                            mul_processor_params)

        for proc_alignment_return in alignment_expression_map:
            if "exception" in proc_alignment_return:
                error_msg = "Caught exception in worker\n"
                error_msg += "Exception: {}".format(
                    proc_alignment_return["exception"])
                raise ValueError(error_msg)

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)

        for proc_alignment_return in alignment_expression_map:
            alignment_ref = proc_alignment_return.get("alignment_ref")
            alignment_info = self.ws.get_object_info3(
                {"objects": [{
                    "ref": alignment_ref
                }]})
            alignment_name = alignment_info["infos"][0][1]
            self._run_command("cp -R {} {}".format(
                proc_alignment_return.get("result_directory"),
                os.path.join(result_directory, alignment_name),
            ))

        if "generate_ws_object" in params and not params.get(
                "generate_ws_object"):
            log("skip generating expression set object")
            expression_obj_ref = ""
            expression_matrix_refs = {}
        else:
            expression_obj_ref = self._save_expression_set(
                alignment_expression_map,
                alignment_set_ref,
                params.get("workspace_name"),
                params["expression_set_suffix"],
                params.get("genome_ref"),
            )
            expression_matrix_refs = self._save_expression_matrix(
                expression_obj_ref, params.get("workspace_name"))

        annotation_file_name = os.path.basename(
            alignment_expression_map[0]["annotation_file"])
        annotation_file_path = os.path.join(result_directory,
                                            os.listdir(result_directory)[0],
                                            annotation_file_name)

        returnVal = {
            "result_directory": result_directory,
            "expression_obj_ref": expression_obj_ref,
            "annotation_file": annotation_file_path,
            "exprMatrix_FPKM_ref":
            expression_matrix_refs.get("exprMatrix_FPKM_ref"),
            "exprMatrix_TPM_ref":
            expression_matrix_refs.get("exprMatrix_TPM_ref"),
        }

        return returnVal

    def _run_merge_option(self, result_directory, params, annotation_file):
        """
        Run `stringtie --merge` over every per-alignment transcripts.gtf in
        result_directory, writing merge_result/stringtie_merge.gtf.
        Options that are meaningless in merge mode are stripped first.
        """
        log("start running stringtie merge")

        result_dirs = os.listdir(result_directory)

        merge_directory = os.path.join(result_directory, "merge_result")
        self._mkdir_p(merge_directory)

        option_params = params.copy()
        # these flags are not valid/meaningful for `stringtie --merge`
        option_params.pop("num_threads", None)
        option_params.pop("ballgown_mode", None)
        option_params.pop("skip_reads_with_no_ref", None)
        option_params.pop("junction_coverage", None)
        option_params.pop("junction_base", None)
        option_params.pop("min_read_coverage", None)
        option_params.pop("min_locus_gap_sep_value", None)

        output_merge = "stringtie_merge.gtf"
        option_params["output_transcripts"] = os.path.join(
            merge_directory, output_merge)

        command = self.STRINGTIE_TOOLKIT_PATH + "/stringtie "
        command += "--merge "
        command += "-G {} ".format(annotation_file)

        for key, option in self.OPTIONS_MAP.items():
            option_value = option_params.get(key)
            if key in self.BOOLEAN_OPTIONS and option_value:
                option_value = " "
            if option_value:
                command += "{} {} ".format(option, option_value)

        for result_dir in result_dirs:
            gtf_file = os.path.join(result_directory, result_dir,
                                    "transcripts.gtf")
            command += "{} ".format(gtf_file)

        self._run_command(command)

    def _get_genome_ref(self, alignment_set_ref):
        """Get a genome ref from an alignment set

        NOTE(review): the `return` sits inside the loop, so only the first
        alignment's genome_id is ever consulted — presumably all members
        share one genome; confirm before relying on this for mixed sets.
        """
        alignment_set_data = self.dfu.get_objects(
            {"object_refs": [alignment_set_ref]})["data"][0]["data"]

        for alignment in alignment_set_data["items"]:
            alignment_data = self.dfu.get_objects(
                {"object_refs": [alignment["ref"]]})["data"][0]["data"]
            return alignment_data["genome_id"]

    def _save_genome_with_novel_isoforms(self,
                                         workspace,
                                         genome_ref,
                                         gff_file,
                                         new_genome_name=None):
        """Save a new Genome object built from the original genome's assembly
        FASTA plus a GFF containing the novel isoforms; returns its ref."""
        log("Saving genome with novel isoforms")
        genome_data = self.dfu.get_objects({"object_refs":
                                            [genome_ref]})["data"][0]["data"]
        if "assembly_ref" in genome_data:
            assembly_ref = genome_data["assembly_ref"]
        elif "contigset_ref" in genome_data:
            assembly_ref = genome_data["contigset_ref"]
        else:
            raise ValueError("Genome missing assembly")
        fasta_file = self.au.get_assembly_as_fasta({"ref":
                                                    assembly_ref})["path"]
        if not new_genome_name:
            new_genome_name = genome_data["id"] + "_stringtie"
        ret = self.gfu.fasta_gff_to_genome({
            "workspace_name": workspace,
            "genome_name": new_genome_name,
            "fasta_file": {
                "path": fasta_file
            },
            "gff_file": {
                "path": gff_file
            },
            "source": "StringTie",
        })
        return ret["genome_ref"]

    def _novel_isoform_mode(self, alignment_object_ref, params):
        """This is a three step process: First, run StringTie on all the
        alignments individually which will produce novel transcripts. Next,
        merge the resulting transcripts together. Finally, rerun StringTie
        with the merged GTF file as the reference genome.
        """
        log("running Stringtie the 1st time")
        # pass 1: discovery mode — no ballgown, keep novel reads, no ws objects
        params.update({
            "ballgown_mode": 0,
            "skip_reads_with_no_ref": 0,
            "generate_ws_object": False,
            "exchange_gene_ids": 1,
        })
        returnVal = self._process_alignment_set_object(params)
        first_run_result_dir = returnVal.get("result_directory")
        annotation_file = returnVal["annotation_file"]

        log("running StringTie merge")
        self._run_merge_option(first_run_result_dir, params, annotation_file)
        merge_file = os.path.join(first_run_result_dir, "merge_result",
                                  "stringtie_merge.gtf")

        # annotate the merged transcripts against the original genome's GFF
        old_genome_ref = self._get_genome_ref(alignment_object_ref)
        ret = self.gfu.genome_to_gff({
            "genome_ref": old_genome_ref,
            "target_dir": first_run_result_dir
        })
        self._run_gffcompare(ret["file_path"], merge_file)
        comp_file = os.path.join(first_run_result_dir, "merge_result",
                                 "gffcmp.annotated.gtf")
        upload_file = _make_gff(comp_file, ret["file_path"],
                                params.get("label", "MSTRG."))
        params["genome_ref"] = self._save_genome_with_novel_isoforms(
            params["workspace_name"],
            old_genome_ref,
            upload_file,
            params.get("novel_isoforms", {}).get("stringtie_genome_name"),
        )
        _update_merge_file(merge_file)

        log("running StringTie the 3rd time with merged gtf")
        # pass 2 (final quantification): merged GTF as reference, ballgown on
        params.update({
            "gtf_file": merge_file,
            "generate_ws_object": True,
            "exchange_gene_ids": 0,
            "ballgown_mode": 1,
            "skip_reads_with_no_ref": 1,
        })
        returnVal = self._process_alignment_set_object(params)
        shutil.move(
            os.path.join(first_run_result_dir, "merge_result"),
            returnVal.get("result_directory"),
        )

        report_output = self._generate_report(
            returnVal.get("expression_obj_ref"),
            params.get("workspace_name"),
            returnVal.get("result_directory"),
            returnVal.get("exprMatrix_FPKM_ref"),
            returnVal.get("exprMatrix_TPM_ref"),
            params["genome_ref"],
        )
        return report_output, returnVal

    def __init__(self, config):
        # service endpoints and credentials from the SDK-provided config
        self.ws_url = config["workspace-url"]
        self.callback_url = config["SDK_CALLBACK_URL"]
        self.token = config["KB_AUTH_TOKEN"]
        self.shock_url = config["shock-url"]
        self.srv_wiz_url = config["srv-wiz-url"]
        self.scratch = config["scratch"]
        # clients for the KBase services used throughout this class
        self.dfu = DataFileUtil(self.callback_url)
        self.gfu = GenomeFileUtil(self.callback_url)
        self.rau = ReadsAlignmentUtils(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.eu = ExpressionUtils(self.callback_url)
        self.ws = Workspace(self.ws_url, token=self.token)
        self.set_client = SetAPI(self.srv_wiz_url, service_ver="dev")

    def run_stringtie_app(self, params):
        """
        run_stringtie_app: run StringTie app
        (http://ccb.jhu.edu/software/stringtie/index.shtml?t=manual)

        required params:
        alignment_object_ref: Alignment or AlignmentSet object reference
        workspace_name: the name of the workspace it gets saved to
        expression_set_suffix: suffix append to expression set object name
        expression_suffix: suffix append to expression object name
        mode: one of ['normal', 'merge', 'novel_isoform']

        optional params:
        num_threads: number of processing threads
        junction_base: junctions that don't have spliced reads
        junction_coverage: junction coverage
        disable_trimming: disables trimming at the ends of the assembled transcripts
        min_locus_gap_sep_value: minimum locus gap separation value
        ballgown_mode: enables the output of Ballgown input table files
        skip_reads_with_no_ref: reads with no reference will be skipped
        maximum_fraction: maximum fraction of muliple-location-mapped reads
        label: prefix for the name of the output transcripts
        min_length: minimum length allowed for the predicted transcripts
        min_read_coverage: minimum input transcript coverage
        min_isoform_abundance: minimum isoform abundance

        return:
        result_directory: folder path that holds all files generated by run_stringtie_app
        expression_obj_ref: generated Expression/ExpressionSet object reference
        report_name: report name generated by KBaseReport
        report_ref: report reference generated by KBaseReport
        """
        log("--->\nrunning StringTieUtil.run_stringtie\n" +
            "params:\n{}".format(json.dumps(params, indent=1)))

        self._validate_run_stringtie_params(params)
        if (isinstance(params.get("novel_isoforms"), dict)
                and "transcript_label" in params["novel_isoforms"]):
            params["label"] = params["novel_isoforms"]["transcript_label"]

        alignment_object_ref = params.get("alignment_object_ref")
        alignment_object_info = self.ws.get_object_info3(
            {"objects": [{
                "ref": alignment_object_ref
            }]})["infos"][0]
        alignment_object_type = alignment_object_info[2]

        # dispatch on the input object's type string
        if re.match("KBaseRNASeq.RNASeqAlignment-\d.\d",
                    alignment_object_type):
            params.update({"alignment_ref": alignment_object_ref})
            returnVal = self._process_alignment_object(params)
            report_output = self._generate_report(
                returnVal.get("expression_obj_ref"),
                params.get("workspace_name"),
                returnVal.get("result_directory"),
            )
            returnVal.update(report_output)
        elif re.match("KBaseRNASeq.RNASeqAlignmentSet-\d.\d",
                      alignment_object_type) or re.match(
                          "KBaseSets.ReadsAlignmentSet-\d.\d",
                          alignment_object_type):
            params.update({"alignment_set_ref": alignment_object_ref})
            if params.get("novel_isoforms"):
                report_output, returnVal = self._novel_isoform_mode(
                    alignment_object_ref, params)
            else:
                params.update({
                    "ballgown_mode": 1,
                    "skip_reads_with_no_ref": 1,
                    "exchange_gene_ids": 0,
                })
                returnVal = self._process_alignment_set_object(params)
                report_output = self._generate_report(
                    returnVal.get("expression_obj_ref"),
                    params.get("workspace_name"),
                    returnVal.get("result_directory"),
                    returnVal.get("exprMatrix_FPKM_ref"),
                    returnVal.get("exprMatrix_TPM_ref"),
                )
            returnVal.update(report_output)
        else:
            error_msg = "Invalid input object type\nObject info:\n{}".format(
                alignment_object_info)
            raise ValueError(error_msg)

        return returnVal
class QualiMapRunner:
    """Wraps the QualiMap CLI (bamqc / multi-bamqc) for KBase alignment objects.

    Downloads BAM files via ReadsAlignmentUtils, runs QualiMap under a
    SIGALRM timeout, packages the HTML output to Shock, and optionally
    builds a KBaseReport.
    """

    # Path of the QualiMap launcher baked into the module's docker image.
    QUALIMAP_PATH = '/kb/module/qualimap-bin/qualimap'
    # JVM heap given to every QualiMap run.
    JAVA_MEM_DEFAULT_SIZE = '16G'
    LARGE_BAM_FILE_SIZE = 20 * 1024 * 1024 * 1024  # 20 GB
    TIMEOUT = 72 * 60 * 60  # 72 hours, enforced via SIGALRM in run_app

    def _get_file_size(self, file_path):
        """Return the size of file_path in bytes (also logs it)."""
        file_size = os.path.getsize(file_path)
        print('File size: {} -- {}'.format(file_size, file_path))
        return file_size

    def _large_file(self, file_path):
        """Return how many multiples of LARGE_BAM_FILE_SIZE the input spans.

        For a '.txt' multi-sample config, sums the sizes of the BAM paths in
        column 2 of each tab-separated line; otherwise sizes the file itself.
        Returns 0 for inputs under the threshold (used as a window-size
        multiplier by the callers).
        """
        filename, file_extension = os.path.splitext(file_path)
        multiplier = 0
        if file_extension == '.txt':
            total_file_size = 0
            with open(file_path, 'r') as f:
                for line in f:
                    # assumes every line has at least two tab-separated
                    # columns with the BAM path second — TODO confirm
                    bam_file_path = line.split('\t')[1]
                    total_file_size += self._get_file_size(bam_file_path)
            print('Total file size: {}'.format(total_file_size))
            multiplier = int(total_file_size) // int(self.LARGE_BAM_FILE_SIZE)
        else:
            multiplier = int(self._get_file_size(file_path)) // int(
                self.LARGE_BAM_FILE_SIZE)
        print('setting number of windows multiplier to: {}'.format(multiplier))
        return multiplier

    def _timeout_handler(self, signum, frame):
        """SIGALRM handler: abort a QualiMap run that exceeded TIMEOUT."""
        print('Signal handler called with signal', signum)
        raise ValueError('QualiMap takes too long')

    def __init__(self, scratch_dir, callback_url, workspace_url, srv_wiz_url):
        # scratch_dir: writable working directory shared with other tools
        self.scratch_dir = scratch_dir
        self.rau = ReadsAlignmentUtils(callback_url)
        self.kbr = KBaseReport(callback_url)
        self.dfu = DataFileUtil(callback_url)
        self.gfu = GenomeFileUtil(callback_url)
        self.set_api = SetAPI(srv_wiz_url)
        self.ws = Workspace(workspace_url)
        # whitelist checked by run_cli_command
        self.valid_commands = ['bamqc', 'multi-bamqc']

    def run_app(self, params):
        """Entry point: validate params, dispatch single vs. multi-sample QC.

        On any failure (including timeout) an empty placeholder HTML report is
        packaged instead, so the app still returns a result structure.
        """
        self.validate_params(params)
        print('Validated Params = ')
        pprint(params)
        run_info = self.get_run_info(params)
        if run_info.get('mode') not in ['single', 'multi']:
            raise ValueError(
                'Error in fetching the type to determine run settings.')
        run_error = False
        try:
            # Hard wall-clock limit; _timeout_handler raises into this try.
            signal.signal(signal.SIGALRM, self._timeout_handler)
            signal.alarm(self.TIMEOUT)
            if run_info['mode'] == 'single':
                result = self.run_bamqc(params['input_ref'],
                                        run_info['input_info'])
            elif run_info['mode'] == 'multi':
                result = self.run_multi_sample_qc(params['input_ref'],
                                                  run_info['input_info'])
            signal.alarm(0)  # cancel the pending alarm on success
        except Exception:
            # Deliberate best-effort fallback: ship an empty HTML report and
            # log the traceback rather than failing the whole app.
            run_error = True

            workdir = os.path.join(self.scratch_dir,
                                   'qualimap_' + str(int(time.time() * 10000)))
            os.makedirs(workdir)

            with open(os.path.join(workdir, 'qualimapReport.html'),
                      'w') as report:
                report.write('<html><body><p></p></body></html>')

            package_info = self.package_output_folder(
                workdir, 'QualiMap_report',
                'EMPTY HTML report directory for QualiMap BAM QC',
                'qualimapReport.html')

            result = {
                'qc_result_folder_path': workdir,
                'qc_result_zip_info': package_info,
                'shock_id': None
            }
            error_msg = 'Running QualiMap returned an error:\n{}\n'.format(
                traceback.format_exc())
            error_msg += 'Generating simple report instead\n'
            print(error_msg)

        if params['create_report']:
            result = self.create_report(result, params['output_workspace'],
                                        run_error, params['input_ref'])

        return result

    def create_report(self,
                      result,
                      output_workspace,
                      run_error=None,
                      input_ref=None):
        """Build a KBaseReport for the run and add its name/ref to result.

        When run_error is truthy, an 'error' report is created that only
        lists the input object(s); otherwise the zipped QualiMap HTML is
        attached as the report's direct HTML link.
        """
        if run_error:
            objects_created = []
            info = self.get_obj_info(input_ref)
            obj_type = self.get_type_from_obj_info(info)
            if obj_type in ['KBaseRNASeq.RNASeqAlignment']:
                objects_created.append({
                    'ref': input_ref,
                    'description': 'Alignment'
                })

            if obj_type in [
                    'KBaseRNASeq.RNASeqAlignmentSet',
                    'KBaseSets.ReadsAlignmentSet'
            ]:
                objects_created.append({
                    'ref': input_ref,
                    'description': 'AlignmentSet'
                })
                reads_alignment_info = self.get_alignments_from_set(input_ref)
                for alignment in reads_alignment_info:
                    alignment_ref = alignment.get('ref')
                    objects_created.append({
                        'ref': alignment_ref,
                        'description': 'Alignment'
                    })

            report_info = self.kbr.create_extended_report({
                'message': ' ',
                'objects_created': objects_created,
                'report_object_name': 'qualimap_report' + str(uuid.uuid4()),
                'workspace_name': output_workspace
            })
            result['report_name'] = report_info['name']
            result['report_ref'] = report_info['ref']
            return result

        qc_result_zip_info = result['qc_result_zip_info']
        report_info = self.kbr.create_extended_report({
            'message': '',
            'objects_created': [],
            'direct_html_link_index': 0,
            'html_links': [{
                'shock_id': qc_result_zip_info['shock_id'],
                'name': qc_result_zip_info['index_html_file_name'],
                'label': qc_result_zip_info['name']
            }],
            'report_object_name': 'qualimap_report' + str(uuid.uuid4()),
            'workspace_name': output_workspace
        })
        result['report_name'] = report_info['name']
        result['report_ref'] = report_info['ref']
        return result

    def get_gtf_file(self, input_ref, set_op=False):
        """Export the GTF for the genome linked to an alignment (or to the
        first alignment of a set when set_op is True).

        Raises ValueError if the alignment has no 'genome_id' field.
        """
        print('Start fetching GFF file from genome')

        if set_op:
            # Use the first item of the set as the representative alignment.
            set_data = self.set_api.get_reads_alignment_set_v1({
                'ref': input_ref,
                'include_item_info': 1
            })
            input_ref = set_data['data']['items'][0]['ref']
        obj_data = self.dfu.get_objects({"object_refs":
                                         [input_ref]})['data'][0]['data']

        genome_ref = obj_data.get('genome_id')

        if not genome_ref:
            raise ValueError(
                'Alignment is not associated with a Genome object')

        result_directory = os.path.join(self.scratch_dir, str(uuid.uuid4()))
        os.makedirs(result_directory)

        genome_gtf_file = self.gfu.genome_to_gff({
            'genome_ref': genome_ref,
            'is_gtf': True,
            'target_dir': result_directory
        })['file_path']

        return genome_gtf_file

    def run_bamqc(self, input_ref, input_info):
        """Run 'qualimap bamqc' on a single alignment; return folder + zip info."""
        # download the input and setup a working dir
        alignment_info = self.rau.download_alignment({'source_ref': input_ref})
        bam_file_path = self.find_my_bam_file(
            alignment_info['destination_dir'])
        try:
            gtf_file = self.get_gtf_file(input_ref)
        except:
            # NOTE(review): bare except silently drops any failure (including
            # KeyboardInterrupt); QC then simply runs without a GTF.
            gtf_file = ''

        workdir = os.path.join(self.scratch_dir,
                               'qualimap_' + str(int(time.time() * 10000)))

        options = [
            '-bam', bam_file_path, '-c', '-outdir', workdir, '-outformat',
            'html'
        ]

        if gtf_file:
            options += ['-gff', gtf_file]

        options.append('--java-mem-size={}'.format(
            self.JAVA_MEM_DEFAULT_SIZE))  # always use large mem
        multiplier = self._large_file(bam_file_path)
        if multiplier:
            window_size = multiplier * 400
            print(f'using larger window size: {window_size} and Java memory: '
                  f'{self.JAVA_MEM_DEFAULT_SIZE}')
            # NOTE(review): this appends '-nw <n>' as ONE argv element with an
            # embedded space; under shell=False QualiMap may not parse it as a
            # flag + value pair — verify ['-nw', str(window_size)] isn't needed.
            options.append(
                '-nw {}'.format(window_size))  # increase size of windows

        self.run_cli_command('bamqc', options)

        package_info = self.package_output_folder(
            workdir, 'QualiMap_report',
            'HTML report directory for QualiMap BAM QC', 'qualimapReport.html')

        return {
            'qc_result_folder_path': workdir,
            'qc_result_zip_info': package_info
        }

    def run_multi_sample_qc(self, input_ref, input_info):
        """Run 'qualimap multi-bamqc' over every alignment in a set."""
        # download the input and setup a working dir
        reads_alignment_info = self.get_alignments_from_set(input_ref)
        try:
            gtf_file = self.get_gtf_file(input_ref, set_op=True)
        except:
            # NOTE(review): bare except — see run_bamqc.
            gtf_file = ''
        suffix = 'qualimap_' + str(int(time.time() * 10000))
        workdir = os.path.join(self.scratch_dir, suffix)
        os.makedirs(workdir)

        input_file_path = self.create_multi_qualimap_cfg(
            reads_alignment_info, workdir)

        options = [
            '-d', input_file_path, '-r', '-c', '-outdir', workdir,
            '-outformat', 'html'
        ]

        if gtf_file:
            options += ['-gff', gtf_file]

        multiplier = self._large_file(input_file_path)
        if multiplier:
            window_size = multiplier * 400
            print(f'using larger window size: {window_size} and Java memory: '
                  f'{self.JAVA_MEM_DEFAULT_SIZE}')
            # NOTE(review): single argv element with embedded space — see run_bamqc.
            options.append(f'-nw {window_size}')  # increase size of windows

        options.append(f'--java-mem-size={self.JAVA_MEM_DEFAULT_SIZE}')

        self.run_cli_command('multi-bamqc', options)

        package_info = self.package_output_folder(
            workdir, 'QualiMap_report',
            'HTML report directory for QualiMap Multi-sample BAM QC',
            'multisampleBamQcReport.html')

        return {
            'qc_result_folder_path': workdir,
            'qc_result_zip_info': package_info
        }

    def get_alignments_from_set(self, alignment_set_ref):
        """Download every alignment in a set; return a list of dicts with
        bam_file_path / ref / label / info for each member."""
        set_data = self.set_api.get_reads_alignment_set_v1({
            'ref': alignment_set_ref,
            'include_item_info': 1
        })
        items = set_data['data']['items']

        reads_alignment_data = []

        for alignment in items:
            alignment_info = self.rau.download_alignment(
                {'source_ref': alignment['ref']})
            bam_file_path = self.find_my_bam_file(
                alignment_info['destination_dir'])
            label = None
            if 'label' in alignment:
                label = alignment['label']
            reads_alignment_data.append({
                'bam_file_path': bam_file_path,
                'ref': alignment['ref'],
                'label': label,
                'info': alignment['info']
            })
        return reads_alignment_data

    def create_multi_qualimap_cfg(self, reads_alignment_info, workdir):
        """Write the tab-separated multi-bamqc config file (name, BAM path,
        and optionally a label per line); return its path.

        Duplicate object names are de-duplicated with a _N suffix.
        """
        # Group by labels if there is at least one defined
        use_labels = False
        for alignment in reads_alignment_info:
            if alignment['label']:
                use_labels = True
                break

        # write the file
        input_file_path = os.path.join(workdir, 'multi_input.txt')
        # NOTE(review): not closed on exception — a 'with' block would be safer.
        input_file = open(input_file_path, 'w')
        name_lookup = {}
        for alignment in reads_alignment_info:
            name = alignment['info'][1]
            if name in name_lookup:
                name_lookup[name] += 1
                name = name + '_' + str(name_lookup[name])
            else:
                name_lookup[name] = 1

            input_file.write(name + '\t' + alignment['bam_file_path'])
            if use_labels:
                if alignment['label']:
                    input_file.write('\t' + alignment['label'])
                else:
                    input_file.write('\tunlabeled')
            input_file.write('\n')
        input_file.close()
        return input_file_path

    def get_run_info(self, params):
        """Classify input_ref as 'single' (alignment) or 'multi' (set)."""
        info = self.get_obj_info(params['input_ref'])
        obj_type = self.get_type_from_obj_info(info)
        if obj_type in ['KBaseRNASeq.RNASeqAlignment']:
            return {'mode': 'single', 'input_info': info}
        if obj_type in [
                'KBaseRNASeq.RNASeqAlignmentSet', 'KBaseSets.ReadsAlignmentSet'
        ]:
            return {'mode': 'multi', 'input_info': info}
        raise ValueError('Object type of input_ref is not valid, was: ' +
                         str(obj_type))

    def validate_params(self, params):
        """Require input_ref; normalize create_report to a bool and require
        output_workspace whenever a report was requested."""
        if 'input_ref' not in params:
            raise ValueError(
                'required parameter field "input_ref" was not set')

        create_report = False
        if 'create_report' in params:
            if int(params['create_report']) == 1:
                if 'output_workspace' not in params:
                    raise ValueError(
                        'If "create_report" was set, then "output_workspace" is required'
                    )
                if not params['output_workspace']:
                    raise ValueError(
                        'If "create_report" was set, then "output_workspace" is required'
                    )
                create_report = True
        params['create_report'] = create_report

    def run_cli_command(self, command, options, cwd=None):
        """Run a whitelisted QualiMap subcommand via subprocess (shell=False).

        Raises ValueError on an unknown command or a non-zero exit code.
        """
        if command not in self.valid_commands:
            raise ValueError('Invalid QualiMap command: ' + str(command))

        command = [self.QUALIMAP_PATH, command] + options
        print('Running: ' + ' '.join(command))

        if not cwd:
            cwd = self.scratch_dir

        p = subprocess.Popen(command, cwd=cwd, shell=False)
        exitCode = p.wait()

        if exitCode == 0:
            print('Success, exit code was: ' + str(exitCode))
        else:
            raise ValueError('Error running command: ' + ' '.join(command) +
                             '\n' + 'Exit Code: ' + str(exitCode))

    def find_my_bam_file(self, dirpath):
        """Return the path of the single *.bam file in dirpath.

        Raises ValueError when zero or more than one BAM file is found.
        """
        bam_path = None
        for f in os.listdir(dirpath):
            fullpath = os.path.join(dirpath, f)
            if os.path.isfile(fullpath) and f.lower().endswith('.bam'):
                if bam_path is not None:
                    raise ValueError(
                        'Error! Too many BAM files were downloaded for this alignment!'
                    )
                bam_path = fullpath
        if bam_path is None:
            raise ValueError(
                'Error! No BAM files were downloaded for this alignment!')
        return bam_path

    def package_output_folder(self, folder_path, zip_file_name,
                              zip_file_description, index_html_file):
        """ Simple utility for packaging a folder and saving to shock """
        output = self.dfu.file_to_shock({
            'file_path': folder_path,
            'make_handle': 0,
            'pack': 'zip'
        })
        return {
            'shock_id': output['shock_id'],
            'name': zip_file_name,
            'description': zip_file_description,
            'index_html_file_name': index_html_file
        }

    def get_type_from_obj_info(self, info):
        """Strip the version suffix from an object_info type string."""
        return info[2].split('-')[0]

    def get_obj_info(self, ref):
        """Fetch the Workspace object_info tuple for ref."""
        return self.ws.get_object_info3({'objects': [{
            'ref': ref
        }]})['infos'][0]
def build_report_view_data(host: str, ws_client: Workspace, result: list) -> dict:
    """
    Fetch a KBaseReport object and reshape it for the report viewer.

    ``result`` is the raw job result (a list whose first element must be a
    dict carrying ``report_name`` and ``report_ref``); anything else yields {}.

    Returned structure:
    {
        html: {
            height: iframe max-height string (report value or "500px"),
            set_height: bool — whether height is also forced on the iframe,
            direct: (optional) data-URI of the report's direct HTML,
            iframe_style: CSS applied to the iframe,
            links: raw html_links entries from the report,
            paths: server path for each html link,
            link_idx: which link to show first,
            file_links: (optional) downloadable file links
        },
        objects: [{upa, name, type, description, link}, ...],
        summary: text_message from the report,
        summary_height: summary panel height string (default "500px"),
    }
    """
    if not result:
        return {}
    if not isinstance(result, list):
        result = [result]
    first = result[0]
    has_report = (isinstance(first, dict) and bool(first) and
                  first.get('report_name') and first.get('report_ref'))
    if not has_report:
        return {}

    report_ref = first['report_ref']
    report = ws_client.get_objects2({'objects': [{
        'ref': report_ref
    }]})['data'][0]['data']
    # A report object looks roughly like:
    #   {'direct_html': ..., 'direct_html_link_index': ..., 'file_links': [],
    #    'html_links': [], 'html_window_height': ...,
    #    'objects_created': [{'description': ..., 'ref': ...}],
    #    'summary_window_height': ..., 'text_message': ..., 'warnings': []}

    # Resolve each created object's name/type and build a dataview link.
    created_objs = []
    objs_created = report.get('objects_created')
    if objs_created:
        ref_specs = [{"ref": o["ref"]} for o in objs_created]
        infos = ws_client.get_object_info3({'objects': ref_specs})['infos']
        created_objs = [{
            'upa': obj['ref'],
            'description': obj.get('description', ''),
            'name': info[1],
            'type': info[2].split('-')[0].split('.')[-1],
            'link': host + '/#dataview/' + obj['ref']
        } for obj, info in zip(objs_created, infos)]

    html_height = report.get("html_window_height")
    if html_height is None:
        html_height = 500
    html = {"height": f"{html_height}px", "set_height": True}

    direct_html = report.get("direct_html")
    if direct_html:
        # A non-document fragment is auto-sized rather than pinned to height.
        if not direct_html.startswith("<html"):
            html["set_height"] = False
        html["direct"] = "data:text/html;charset=utf-8," + quote(direct_html)

    html_links = report.get("html_links")
    if html_links:
        chosen_idx = report.get("direct_html_link_index", 0)
        # Clamp a missing/out-of-range index to the first link.
        if chosen_idx is None or chosen_idx < 0 or chosen_idx >= len(html_links):
            chosen_idx = 0
        html["links"] = html_links
        html["paths"] = [
            f'/api/v1/{report_ref}/$/{pos}/{link["name"]}'
            for pos, link in enumerate(html_links)
        ]
        html["link_idx"] = chosen_idx

    if report.get("file_links"):
        html["file_links"] = report["file_links"]

    summary_height = report.get("summary_window_height")
    if summary_height is None:
        summary_height = 500

    iframe_style = f"max-height: {html['height']}"
    if html["set_height"]:
        iframe_style += f"; height: {html['height']}"
    else:
        iframe_style += "; height: auto"
    html["iframe_style"] = iframe_style

    return {
        "objects": created_objs,
        "summary": report.get("text_message", ""),
        "summary_height": f"{summary_height}px",
        "html": html
    }
class FastaToAssembly:
    """Imports a FASTA file (local path, Shock node, or FTP URL) as a
    KBaseGenomeAnnotations.Assembly object: validates, optionally filters by
    contig length, computes per-contig stats, and saves to the Workspace."""

    def __init__(self, callback_url, scratch, ws_url):
        # scratch: writable working directory for staging/intermediate files
        self.scratch = scratch
        self.dfu = DataFileUtil(callback_url)
        self.ws = Workspace(ws_url)

        # Note added X due to kb|g.1886.fasta
        # characters accepted in contig sequences (IUPAC nucleotide codes + X)
        self.valid_chars = "-ACGTUWSMKRYBDHVNX"
        # residues that suggest a protein FASTA was supplied by mistake
        self.amino_acid_specific_characters = "PLIFQE"

    def import_fasta(self, ctx, params):
        """Main pipeline: validate params, stage the FASTA, optionally filter
        short contigs, parse stats, upload to Shock, and save the Assembly.

        Returns the object_info of the saved Assembly.
        """
        print('validating parameters')
        self.validate_params(params)

        print('staging input files')
        fasta_file_path = self.stage_input(params)

        if 'min_contig_length' in params:
            min_contig_length = int(params['min_contig_length'])
            print(f'filtering FASTA file by contig length (min len={min_contig_length} bp)')
            fasta_file_path = self.filter_contigs_by_length(fasta_file_path, min_contig_length)

        print(f'parsing FASTA file: {fasta_file_path}')
        assembly_data = self.parse_fasta(fasta_file_path, params)
        print(f' - parsed {assembly_data["num_contigs"]} contigs,{assembly_data["dna_size"]} bp')

        print('saving assembly to KBase')

        # save file to shock and build handle
        fasta_file_handle_info = self.save_fasta_file_to_shock(fasta_file_path)
        # construct the output object
        assembly_object_to_save = self.build_assembly_object(assembly_data,
                                                             fasta_file_handle_info,
                                                             params)
        # NOTE(review): debug dump — file handle from open() is never closed,
        # and 'example.json' is left in scratch on every run.
        json.dump(assembly_object_to_save, open(self.scratch+"/example.json", 'w'))

        # save to WS and return
        if 'workspace_id' in params:
            workspace_id = int(params['workspace_id'])
        else:
            workspace_id = self.dfu.ws_name_to_id(params['workspace_name'])
        assembly_info = self.save_assembly_object(workspace_id,
                                                  params['assembly_name'],
                                                  assembly_object_to_save)

        return assembly_info

    def build_assembly_object(self, assembly_data, fasta_file_handle_info, params):
        """ construct the WS object data to save based on the parsed info and params """
        assembly_data['assembly_id'] = params['assembly_name']
        assembly_data['fasta_handle_ref'] = fasta_file_handle_info['handle']['hid']
        # NOTE(review): self-assignment — presumably a leftover; has no effect.
        fasta_file_handle_info['handle'] = fasta_file_handle_info['handle']
        assembly_data['fasta_handle_info'] = fasta_file_handle_info

        assembly_data['type'] = 'Unknown'
        if 'type' in params:
            assembly_data['type'] = params['type']

        if 'taxon_ref' in params:
            # resolve the taxon object to a full wsid/objid/ver reference
            info = self.ws.get_object_info3({'objects':[{'ref': params['taxon_ref']}]})['infos'][0]
            assembly_data['taxon_ref'] = f'{info[6]}/{info[0]}/{info[4]}'

        if 'external_source' in params:
            assembly_data['external_source'] = params['external_source']

        if 'external_source_id' in params:
            assembly_data['external_source_id'] = params['external_source_id']

        if 'external_source_origination_date' in params:
            assembly_data['external_source_origination_date'] = params['external_source_origination_date']

        return sort_dict(assembly_data)

    def parse_fasta(self, fasta_file_path, params):
        """ Do the actual work of inspecting each contig: lengths, base counts,
        GC content, N counts, MD5s, and optional per-contig metadata from
        params['contig_info']. Raises ValueError on invalid characters or
        duplicate contig ids. """

        # variables to store running counts of things
        total_length = 0
        base_counts = {'A': 0, 'G': 0, 'C': 0, 'T': 0}
        md5_list = []

        # map from contig_id to contig_info
        all_contig_data = {}
        extra_contig_info = {}
        if 'contig_info' in params:
            extra_contig_info = params['contig_info']

        for record in SeqIO.parse(fasta_file_path, "fasta"):
            # SeqRecord(seq=Seq('TTAT...', SingleLetterAlphabet()),
            #           id='gi|113968346|ref|NC_008321.1|',
            #           name='gi|113968346|ref|NC_008321.1|',
            #           description='gi|113968346|ref|NC_008321.1| Shewanella sp. MR-4 chromosome, complete genome',
            #           dbxrefs=[])

            sequence = str(record.seq).upper()
            contig_info = {
                'contig_id': record.id,
                'name': record.id,
                'description': record.description[len(record.id):].strip(),
                'length': len(record.seq)
            }

            # 1) compute sequence character statistics running total
            total_length += contig_info['length']
            sequence_count_table = dict(Counter(sequence))
            for character in sequence_count_table:
                if character in base_counts:
                    base_counts[character] = base_counts[character] + sequence_count_table[character]
                else:
                    base_counts[character] = sequence_count_table[character]
                if character not in self.valid_chars:
                    if character in self.amino_acid_specific_characters:
                        raise ValueError('This FASTA file may have amino acids in it instead '
                                         'of the required nucleotides.')
                    raise ValueError(f"This FASTA file has non nucleic acid characters: "
                                     f"{character}")

            # 2) record number of 'N' characters (only set if there are some)
            Ncount = 0
            if 'N' in sequence_count_table:
                Ncount = sequence_count_table['N']
            contig_info['Ncount'] = Ncount

            # 2b) record if the contig is circular
            if record.id in extra_contig_info:
                if 'is_circ' in extra_contig_info[record.id]:
                    contig_info['is_circ'] = int(extra_contig_info[record.id]['is_circ'])
                if 'description' in extra_contig_info[record.id]:
                    contig_info['description'] = str(extra_contig_info[record.id]['description'])

            # 3) record md5 checksum
            contig_md5 = md5(sequence.encode()).hexdigest()
            contig_info['md5'] = contig_md5
            md5_list.append(contig_md5)

            # 4) record the all important GC to ~3 significant digits
            GC_count = 0
            for base in ['G', 'C']:
                if base in sequence_count_table:
                    GC_count += sequence_count_table[base]
            contig_info['gc_content'] = round(float(GC_count) / float(contig_info['length']), 5)

            # 5) add to contig list
            if contig_info['contig_id'] in all_contig_data:
                # NOTE(review): missing space before 'appears' in this message.
                raise ValueError('The FASTA header key ' + contig_info['contig_id'] +
                                 'appears more than once in the file')

            all_contig_data[contig_info['contig_id']] = contig_info

        # Aggregate stats for the data
        total_gc_content = None
        if total_length > 0:
            total_gc_content = round(float(base_counts['G'] + base_counts['C']) / float(total_length), 5)
        assembly_data = {
            'md5': md5(",".join(sorted(md5_list)).encode()).hexdigest(),
            'base_counts': base_counts,
            'dna_size': total_length,
            'gc_content': total_gc_content,
            'contigs': all_contig_data,
            'num_contigs': len(all_contig_data)
        }
        return assembly_data

    @staticmethod
    def fasta_filter_contigs_generator(fasta_record_iter, min_contig_length):
        """ generates SeqRecords iterator for writing from a legacy contigset object
        (yields only records of length >= min_contig_length) """
        rows = 0
        rows_added = 0
        for record in fasta_record_iter:
            rows += 1
            if len(record.seq) >= min_contig_length:
                rows_added += 1
                yield record
        print(f' - filtered out {rows - rows_added} of {rows} contigs that were shorter '
              f'than {(min_contig_length)} bp.')

    def filter_contigs_by_length(self, fasta_file_path, min_contig_length):
        """ removes all contigs less than the min_contig_length provided;
        writes the result to a new '.filtered.fa' file and returns its path """
        filtered_fasta_file_path = fasta_file_path + '.filtered.fa'

        fasta_record_iter = SeqIO.parse(fasta_file_path, 'fasta')
        SeqIO.write(self.fasta_filter_contigs_generator(fasta_record_iter, min_contig_length),
                    filtered_fasta_file_path, 'fasta')

        return filtered_fasta_file_path

    def save_assembly_object(self, workspace_id, assembly_name, obj_data):
        """Save obj_data as a KBaseGenomeAnnotations.Assembly; returns object_info.

        Raises ValueError when there are no contigs (all filtered out)."""
        print('Saving Assembly to Workspace')
        sys.stdout.flush()
        if len(obj_data["contigs"]) == 0:
            raise ValueError('There are no contigs to save, thus there is no valid assembly.')
        obj_info = self.dfu.save_objects({'id': workspace_id,
                                          'objects': [{'type': 'KBaseGenomeAnnotations.Assembly',
                                                       'data': obj_data,
                                                       'name': assembly_name
                                                       }]
                                          })[0]
        return obj_info

    def save_fasta_file_to_shock(self, fasta_file_path):
        """ Given the path to the file, upload to shock and return Handle information
            returns:
                typedef structure {
                    string shock_id;
                    Handle handle;
                    string node_file_name;
                    string size;
                } FileToShockOutput;
        """
        print(f'Uploading FASTA file ({fasta_file_path}) to SHOCK')
        sys.stdout.flush()
        return self.dfu.file_to_shock({'file_path': fasta_file_path, 'make_handle': 1})

    def stage_input(self, params):
        """ Setup the input_directory by fetching the files and returning the path to the file.

        Exactly one of params['file'] (local path), params['shock_id'], or
        params['ftp_url'] supplies the FASTA; compressed inputs are unpacked.
        """
        file_path = None
        if 'file' in params:
            if not os.path.isfile(params['file']['path']):
                raise ValueError('KBase Assembly Utils tried to save an assembly, but the calling application specified a file ('+params['file']['path']+') that is missing. Please check the application logs for details.')
            file_path = os.path.abspath(params['file']['path'])
        elif 'shock_id' in params:
            print(f'Downloading file from SHOCK node: {params["shock_id"]}')
            sys.stdout.flush()
            input_directory = os.path.join(self.scratch, 'assembly-upload-staging-' + str(uuid.uuid4()))
            os.makedirs(input_directory)
            file_name = self.dfu.shock_to_file({'file_path': input_directory,
                                                'shock_id': params['shock_id']
                                                })['node_file_name']
            file_path = os.path.join(input_directory, file_name)
        elif 'ftp_url' in params:
            print(f'Downloading file from: {params["ftp_url"]}')
            sys.stdout.flush()
            file_path = self.dfu.download_web_file({'file_url': params['ftp_url'],
                                                    'download_type': 'FTP'
                                                    })['copy_file_path']

        # extract the file if it is compressed
        if file_path is not None:
            unpacked_file = self.dfu.unpack_file({'file_path': file_path})
            return unpacked_file['file_path']

        raise ValueError('No valid FASTA could be extracted based on the input parameters')

    @staticmethod
    def validate_params(params):
        """Require workspace_name/assembly_name and exactly one input source
        ('file' with a 'path', 'shock_id', or 'ftp_url')."""
        for key in ('workspace_name', 'assembly_name'):
            if key not in params:
                raise ValueError('required "' + key + '" field was not defined')

        # one and only one of either 'file', 'shock_id', or ftp_url is required
        input_count = 0
        for key in ('file', 'shock_id', 'ftp_url'):
            if key in params and params[key] is not None:
                input_count = input_count + 1
                if key == 'file':
                    if not isinstance(params[key], dict) or 'path' not in params[key]:
                        raise ValueError('when specifying a FASTA file input, "path" field was not defined in "file"')

        if input_count == 0:
            raise ValueError('required FASTA file as input, set as either "file", "shock_id", or "ftp_url"')

        if input_count > 1:
            raise ValueError('required exactly one FASTA file as input source, you set more than one of ' +
                             'these fields: "file", "shock_id", or "ftp_url"')
class VirSorterUtils: def __init__(self, config): self.scratch = os.path.abspath(config['scratch']) self.callback_url = os.environ['SDK_CALLBACK_URL'] self.mgu = MetagenomeUtils(self.callback_url) self.au = AssemblyUtil(self.callback_url) self.ws = Workspace(config['workspace-url'], token=config['token']) def VirSorter_help(self): command = 'wrapper_phage_contigs_sorter_iPlant.pl --help' self._run_command(command) def get_fasta(self, ref): # check type of object, i.e KBaseGenomeAnnotations.Assembly-3.0 obj_type = self.ws.get_object_info3({'objects': [{ 'ref': ref }]})['infos'][0][2] if 'assembly' in obj_type.lower(): genome_ref = ref elif 'kbasegenomes' in obj_type.lower(): data = self.ws.get_objects2({ 'objects': [{ 'ref': ref, 'included': ['assembly_ref'], 'strict_maps': 1 }] })['data'][0]['data'] genome_ref = data['assembly_ref'] else: raise ValueError( f"Input reference {ref} is of type {obj_type}. Type KBaseGenomes.Genome or " f"KBaseGenomeAnnotations.Assembly required.") return self.au.get_assembly_as_fasta({'ref': genome_ref})['path'] def run_VirSorter(self, params): params['SDK_CALLBACK_URL'] = self.callback_url params['KB_AUTH_TOKEN'] = os.environ['KB_AUTH_TOKEN'] # Get contigs from 'assembly' genome_fp = self.get_fasta(params['genomes']) command = 'wrapper_phage_contigs_sorter_iPlant.pl --data-dir /data/virsorter-data' # Add in first args command += f' -f {genome_fp} --db {params["database"]}' # Check if additional genomes were submitted if params.get('add_genomes'): add_genomes_fp = self.get_fasta(params['add_genomes']) print(f'Added genomes DETECTED: {add_genomes_fp}') command += f' --cp {add_genomes_fp}' bool_args = ['virome', 'diamond', 'keep_db', 'no_c'] # keep_db = keep-db for bool_arg in bool_args: if params[ bool_arg] == 1: # 0 is true and therefore run... 
though for some reason it's reversed on json if bool_arg == 'keep_db': bool_arg = 'keep-db' command += f' --{bool_arg}' self._run_command(command) report = self._generate_report( params) # Basically, do everything that's after the tool runs return report def _run_command(self, command): """ :param command: :return: """ log('Start executing command:\n{}'.format(command)) pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True) output, err = pipe.communicate() exitCode = pipe.returncode if exitCode == 0: log('Executed command:\n{}\n'.format(command) + 'Exit Code: {}\nOutput:\n{}'.format(exitCode, output)) else: error_msg = 'Error running command:\n{}\n'.format(command) error_msg += 'Exit Code: {}\nOutput:\n{}\nError: {}'.format( exitCode, output, err) raise RuntimeError(error_msg) def _parse_summary(self, virsorter_global_fp, affi_contigs_shock_id): columns = [ 'Contig_id', 'Nb genes contigs', 'Fragment', 'Nb genes', 'Category', 'Nb phage hallmark genes', 'Phage gene enrichment sig', 'Non-Caudovirales phage gene enrichment sig', 'Pfam depletion sig', 'Uncharacterized enrichment sig', 'Strand switch depletion sig', 'Short genes enrichment sig', ] try: with open(virsorter_global_fp, 'r') as vir_fh: data = {} category = '' for line in vir_fh: if line.startswith('## Contig_id'): continue elif line.startswith( '## ' ): # If 'header' lines are consumed by 1st if, then remaining should be good category = line.split('## ')[-1].split(' -')[0] else: values = line.strip().split(',') data[values[0]] = dict(zip(columns[1:], values[1:])) except: vir_path = os.path.join(os.getcwd(), 'virsorter-out') files = os.listdir(vir_path) raise RuntimeError( f"{virsorter_global_fp} is not a file. existing files {files}." 
) df = pd.DataFrame().from_dict(data, orient='index') df.index.name = columns[0] df.reset_index(inplace=True) html = df.to_html(index=False, classes='my_class table-striped" id = "my_id') # Need to file write below direct_html = html_template.substitute( html_table=html, affi_contigs_shock_id=affi_contigs_shock_id) # Find header so it can be copied to footer, as dataframe.to_html doesn't include footer start_header = Literal("<thead>") end_header = Literal("</thead>") text = start_header + SkipTo(end_header) new_text = '' for data, start_pos, end_pos in text.scanString(direct_html): new_text = ''.join(data).replace( ' style="text-align: right;"', '').replace( 'thead>', 'tfoot>\n ') + '\n</tfoot>' # Get start and end positions to insert new text end_tbody = Literal("</tbody>") end_table = Literal("</table>") insertion_pos = end_tbody + SkipTo(end_table) final_html = '' for data, start_pos, end_pos in insertion_pos.scanString(direct_html): final_html = direct_html[:start_pos + 8] + '\n' + new_text + direct_html[ start_pos + 8:] return final_html def get_assembly_contig_ids(self, assembly_ref): """get contig ids from assembly_ref""" contigs = self.ws.get_objects2( {'objects': [{ 'ref': assembly_ref, 'included': ['contigs'] }]})['data'][0]['data']['contigs'] return contigs.keys() def _generate_report(self, params): """ :param params: :return: """ # Get URL self.dfu = dfu(params['SDK_CALLBACK_URL']) # Output directory should be $PWD/virsorter-out - ASSUMES that's the output location virsorter_outdir = os.path.join(os.getcwd(), 'virsorter-out') print( f'VIRSorter output directory contents: {os.listdir(virsorter_outdir)}' ) # Replacing individual download files with BinnedContigs # kb_deseq adds output files, then builds report files and sends all of them to the workspace output_files = [] # Appended list of dicts containing attributes # Collect all the files needed to report to end-user # Get all predicted viral sequences pred_fnas = glob.glob( 
            os.path.join(virsorter_outdir, 'Predicted_viral_sequences/VIRSorter_*.fasta'))
        pred_gbs = glob.glob(
            os.path.join(virsorter_outdir, 'Predicted_viral_sequences/VIRSorter_*.gb'))
        # Summary 'table'
        glob_signal = os.path.join(virsorter_outdir, 'VIRSorter_global-phage-signal.csv')
        print('Identified the following predicted viral sequences:\n{}'.format(
            '\n\t'.join(pred_fnas)))
        if len(pred_fnas) == 0:
            print(
                f"Unable to find predicted viral sequences, here are the directory's content:\n"
                f"{os.listdir(os.path.join(virsorter_outdir, 'Predicted_viral_sequences'))}"
            )
        if os.path.exists(glob_signal):
            print(f'Identified the global phage signal: {glob_signal}')
            # Count data lines in the summary; start at -1 so the header is excluded.
            lines = -1  # Don't count header
            with open(glob_signal) as fh:
                for ln in fh:
                    lines += 1
            if lines == 0:
                print('But it is EMPTY!')
        else:
            print(
                'Unable to find the global phage signal file. Was there an error during the run?'
            )
        # Append error and out files from VIRSorter
        err_fp = os.path.join(virsorter_outdir, 'logs/err')
        # if os.path.exists(err_fp):
        #     output_files.append({
        #         'path': os.path.join(virsorter_outdir, 'logs/err'),
        #         'name': 'VIRSorter_err',
        #         'label': 'VIRSorter_err',
        #         'description': 'VIRSorter error log file, generated from the tool itself.'
        #     })
        out_fp = os.path.join(virsorter_outdir, 'logs/out')
        # if os.path.exists(out_fp):
        #     output_files.append({
        #         'path': os.path.join(virsorter_outdir, 'logs/out'),
        #         'name': 'VIRSorter_out',
        #         'label': 'VIRSorter_out',
        #         'description': 'VIRSorter output log file, generated from the tool itself.'
        #     })
        if not (os.path.exists(err_fp) or os.path.exists(out_fp)):
            print(
                'Unable to find err and/or out files in LOG directory, contents:'
            )
            print(os.listdir(os.path.join(virsorter_outdir, 'logs')))

        # Make output directory
        output_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_dir)

        # Deal with nucleotide and protein fasta
        pred_fna_tgz_fp = os.path.join(output_dir, 'VIRSorter_predicted_viral_fna.tar.gz')
        with tarfile.open(
                pred_fna_tgz_fp,
                'w:gz') as pred_fna_tgz_fh:  # Compress to minimize disk usage
            for pred_fna in pred_fnas:
                pred_fna_tgz_fh.add(pred_fna, arcname=os.path.basename(pred_fna))
        output_files.append({
            'path': pred_fna_tgz_fp,
            'name': os.path.basename(pred_fna_tgz_fp),
            'label': os.path.basename(pred_fna_tgz_fp),
            'description': 'FASTA-formatted nucleotide sequences of VIRSorter predicted viruses'
        })
        if os.path.exists(pred_fna_tgz_fp):
            print(
                f'Generated gzipped version of the predicted viral sequences in FASTA format: '
                f'{pred_fna_tgz_fp}')

        pred_gb_tgz_fp = os.path.join(output_dir, 'VIRSorter_predicted_viral_gb.tar.gz')
        with tarfile.open(pred_gb_tgz_fp, 'w:gz') as pred_gb_tgz_fh:
            for pred_gb in pred_gbs:
                pred_gb_tgz_fh.add(pred_gb, arcname=os.path.basename(pred_gb))
        output_files.append({
            'path': pred_gb_tgz_fp,
            'name': os.path.basename(pred_gb_tgz_fp),
            'label': os.path.basename(pred_gb_tgz_fp),
            'description': 'Genbank-formatted sequences of VIRSorter predicted viruses'
        })
        if os.path.exists(pred_gb_tgz_fp):
            print(
                f'Generated gzipped version of the predicted viral sequences in Genbank format: '
                f'{pred_gb_tgz_fp}')

        # To create BinnedContig, need to create another directory with each of the "bins" as separate files?
        binned_contig_output_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(binned_contig_output_dir)

        # Before creating final HTML output, need to create BinnedContig object so other tools/users can take advantage
        # of its features, but also to feed more easily into other tools (e.g. vConTACT)
        created_objects = []  # Will store the objects that go to the workspace

        # load contig ids from the assembly input
        # assembly_contig_ids = self.get_assembly_contig_ids(self.assembly_ref)
        assembly_contig_ids = self.get_assembly_contig_ids(
            params['genomes'])  # Will fail for Genome

        summary_fp = os.path.join(
            binned_contig_output_dir, 'VIRSorter.summary')  # Anything that ends in .summary
        with open(summary_fp, 'w') as summary_fh:
            summary_writer = csv.writer(summary_fh, delimiter='\t',
                                        quoting=csv.QUOTE_MINIMAL)
            summary_writer.writerow(
                ['Bin name', 'Completeness', 'Genome size', 'GC content'])

            for category_fp in pred_fnas:
                # _get_bin_ids from MetaGenomeUtils requires files to follow the header.0xx.fasta convention
                category = os.path.basename(category_fp).split(
                    'cat-')[-1].split('.')[0]
                dest_fn = 'VirSorter.{}.fasta'.format(category.zfill(3))
                dest_fp = os.path.join(output_dir, dest_fn)
                binned_contig_fp = os.path.join(binned_contig_output_dir, dest_fn)

                genome_size = 0
                gc_content = []  # Need stats for summary file

                # Also need to adjust sequence name so binnedContig object can retrieve sequences
                adjusted_sequences = []
                # NOTE(review): mode 'rU' is deprecated in Python 3 — behaves as 'r' here.
                with open(category_fp, 'rU') as category_fh:
                    for record in SeqIO.parse(category_fh, 'fasta'):
                        seq = record.seq
                        gc_content.append(SeqUtils.GC(seq))
                        genome_size += len(seq)
                        # This is very dirty, but need to change name to match original contigs
                        record.id = record.id.replace('VIRSorter_', '').replace(
                            '-circular', '').split('-cat_')[0]
                        if 'gene' in record.id:  # Prophage
                            record.id = record.id.split('_gene')[0]
                            record.id = record.id.rsplit('_', 1)[0]
                        # here we make sure that the id's line up with contig ids in the input assembly object
                        if record.id not in assembly_contig_ids:
                            for assembly_contig_id in assembly_contig_ids:
                                # first check if record.id is substring of current contig id,
                                # then check if current contig id is substring of record.id
                                # NOTE: this is not a perfect way of checking and will likely
                                # fail in some circumstances.
                                # A more complete check would be to make sure there is a 1:1
                                # mapping of contig id's in the assembly object as compared to
                                # the binned contig object (the fasta files defined here).
                                if (record.id in assembly_contig_id) or (
                                        assembly_contig_id in record.id):
                                    record.id = assembly_contig_id
                                    break
                        record.description = ''
                        record.name = ''
                        adjusted_sequences.append(record)

                if genome_size != 0:  # Empty file
                    summary_writer.writerow([
                        dest_fn, '100%', genome_size,
                        (sum(gc_content) / len(gc_content))
                    ])

                    print('Copying {} to results directory'.format(
                        os.path.basename(category_fp)))

                    # Yes, need both. One is to get file_links in report. Second is for binnedContigs object
                    shutil.copyfile(category_fp, dest_fp)

                    # Write renamed sequences
                    with open(binned_contig_fp, 'w') as binned_contig_fh:
                        SeqIO.write(adjusted_sequences, binned_contig_fh, 'fasta')

                    result = self.au.save_assembly_from_fasta({
                        'file': {
                            'path': dest_fp
                        },
                        'workspace_name': params['workspace_name'],
                        'assembly_name': 'VirSorter-Category-{}'.format(category)
                    })

                    created_objects.append({
                        "ref": result,
                        "description": "KBase Assembly object from VIRSorter"
                    })

        # Create BinnedContigs object, but 1st, a little metadata
        generate_binned_contig_param = {
            'file_directory': binned_contig_output_dir,
            'assembly_ref': params['genomes'],  # params.get('genomes'), self.assembly_ref
            'binned_contig_name': params['binned_contig_name'],
            'workspace_name': params['workspace_name']
        }
        binned_contig_object_ref = self.mgu.file_to_binned_contigs(
            generate_binned_contig_param).get('binned_contig_obj_ref')

        # Add binned contigs reference here, as it was already created above
        created_objects.append({
            "ref": binned_contig_object_ref,
            "description": "BinnedContigs from VIRSorter"
        })

        # Save VIRSorter_affi-contigs.tab for DRAM-v
        affi_contigs_fp = os.path.join(virsorter_outdir, 'Metric_files',
                                       'VIRSorter_affi-contigs.tab')
        affi_contigs_shock_id = self.dfu.file_to_shock(
            {'file_path': affi_contigs_fp})['shock_id']

        # Use global signal (i.e. summary) file and create HTML-formatted version
        raw_html = self._parse_summary(glob_signal, affi_contigs_shock_id)

        html_fp = os.path.join(output_dir, 'index.html')

        with open(html_fp, 'w') as html_fh:
            html_fh.write(raw_html)

        report_shock_id = self.dfu.file_to_shock({
            'file_path': output_dir,
            'pack': 'zip'
        })['shock_id']

        html_report = [{
            'shock_id': report_shock_id,
            'name': os.path.basename(html_fp),
            'label': os.path.basename(html_fp),
            'description': 'HTML summary report for VIRSorter-predicted viral genomes.'
        }]

        report_params = {
            'message': 'Here are the results from your VIRSorter run. Above, you\'ll find a report with '
                       'all the identified (putative) viral genomes, and below, links to the report as '
                       'well as files generated.',
            'workspace_name': params['workspace_name'],
            'html_links': html_report,
            'direct_html_link_index': 0,
            'report_object_name': 'VIRSorter_report_{}'.format(str(uuid.uuid4())),
            'file_links': output_files,
            'objects_created': created_objects,
        }

        kbase_report_client = KBaseReport(params['SDK_CALLBACK_URL'],
                                          token=params['KB_AUTH_TOKEN'])
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref'],
            'result_directory': binned_contig_output_dir,
            'binned_contig_obj_ref': binned_contig_object_ref
        }

        return report_output

    def _mkdir_p(self, path):
        """Create *path* (including parents); silently succeed if it already exists.

        :param path: directory path to create; a falsy path is a no-op
        :return: None
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise
def process_kbase_objects(host_ref, virus_ref, shared_folder, callback, workspace, token):
    """
    Convert KBase object(s) into usable files for VirMatcher

    :param host_ref: Putative host / microbial genomes with KBase '#/#/#' used to describe each object
    :param virus_ref: Viral genomes with KBase '#/#/#' used to describe each object
    :param shared_folder: KBase job node's "working" directory, where actual files exist
    :param callback: Callback URL used to build the KBase utility clients
    :param workspace: Workspace name
    :param token: Job token
    :return: (host_dir, virus_fps) — directory of per-organism host FASTA files,
        and the path of the downloaded viral FASTA file
    :raises ValueError: if the host or virus object type is not supported
    """
    dfu = DataFileUtil(callback, token=token)
    # NOTE(review): Workspace() is given the *workspace name* positionally, where
    # other code in this file passes a URL — confirm what the caller supplies.
    ws = Workspace(workspace, token=token)
    mgu = MetagenomeUtils(callback, token=token)
    au = AssemblyUtil(callback, token=token)

    # Need to determine KBase type in order to know how to properly proceed
    host_type = ws.get_object_info3(
        {'objects': [{'ref': host_ref}]})['infos'][0][2].split('-')[0]
    virus_type = ws.get_object_info3(
        {'objects': [{'ref': virus_ref}]})['infos'][0][2].split('-')[0]

    logging.info(f'Potential hosts identified as: {host_type}')
    logging.info(f'Viruses identified as: {virus_type}')

    # Create new directory to house virus and host files
    host_dir = Path(shared_folder) / 'host_files'
    if not host_dir.exists():
        os.mkdir(host_dir)

    host_count = 0

    if host_type == 'KBaseGenomeAnnotations.Assembly':
        # No info about individual genomes, so treat each sequence as an organism
        host_fps = au.get_assembly_as_fasta(
            {'ref': host_ref})['path']  # Consists of dict: path + assembly_name
        logging.info(
            f'Identified {host_type}. Each sequence will be treated as a separate organism.'
        )
        records = SeqIO.parse(host_fps, 'fasta')
        for record in records:
            host_count += 1
            tmp_fp = host_dir / f'{record.id}.fasta'  # TODO Illegal filenames?
            SeqIO.write([record], tmp_fp, 'fasta')

    # Fixed type string: the workspace type is 'KBaseGenomes.Genome' (no trailing
    # 's'), as used elsewhere in this file; the old 'KBaseGenomes.Genomes' never matched.
    elif host_type == 'KBaseGenomes.Genome':
        # Stub: the assembly behind the genome can be located, but downloading
        # its sequences is not implemented yet.
        genome_data = ws.get_objects2(
            {'objects': [{'ref': host_ref}]})['data'][0]['data']
        genome_data.get('contigset_ref') or genome_data.get('assembly_ref')
        raise ValueError(f'{host_type} is not yet supported as a host input.')

    # elif host_type == 'KBaseSets.GenomeSet'

    elif host_type == 'KBaseSets.AssemblySet':
        obj_data = dfu.get_objects({'object_refs': [host_ref]})['data'][0]
        for subobj in obj_data['data']['items']:
            host_fp = au.get_assembly_as_fasta({'ref': subobj['ref']})['path']
            # os.path.splitext keeps the leading dot, so compare against '.fasta'
            # (the old comparison to 'fasta' was always true).
            if os.path.splitext(host_fp)[-1] != '.fasta':  # Ensure extension always = fasta
                target_fn = os.path.splitext(
                    os.path.basename(host_fp))[0].strip('_') + '.fasta'
            else:
                target_fn = os.path.basename(host_fp).strip('_')
            shutil.copyfile(host_fp, host_dir / target_fn)
            host_count += 1

    elif host_type == 'KBaseMetagenomes.BinnedContigs':  # This is what we want!
        host_kbase_dir = mgu.binned_contigs_to_file({
            'input_ref': host_ref,
            'save_to_shock': 0
        })['bin_file_directory']  # Dict of bin_file_dir and shock_id
        for (dirpath, dirnames, fns) in os.walk(
                host_kbase_dir):  # Dirnames = all folders under dirpath
            for fn in fns:
                # Resolve the source path BEFORE renaming; the old code renamed
                # fn first and then copied a (possibly nonexistent) source path.
                src_fp = Path(dirpath) / fn
                if os.path.splitext(fn)[-1] != '.fasta':
                    fn = os.path.splitext(fn)[0] + '.fasta'
                shutil.copy(src_fp, host_dir / fn)
                host_count += 1
    else:
        raise ValueError(f'{host_type} is not supported.')

    logging.info(f'{host_count} potential host genomes were identified.')

    virus_count = 0
    if virus_type == 'KBaseGenomeAnnotations.Assembly':
        virus_fps = au.get_assembly_as_fasta({'ref': virus_ref})['path']
        records = SeqIO.parse(virus_fps, 'fasta')
        virus_count = len(list(records))
    else:
        raise ValueError(f'{virus_type} is not supported.')

    logging.info(f'{virus_count} potential viral genomes were identified.')

    return host_dir, virus_fps
class FeatureSetDownload:
    """Exports a KBase FeatureSet object as a TSV file (one row per feature)."""

    def __init__(self, config):
        self.cfg = config
        self.scratch = config['scratch']
        self.gsu = GenomeSearchUtil(os.environ['SDK_CALLBACK_URL'])
        self.dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        self.ws = Workspace(config["workspace-url"])

    @staticmethod
    def validate_params(params, expected={"workspace_name", "featureset_name"}):
        """Raise ValueError if any key in *expected* is missing from *params*."""
        expected = set(expected)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError(
                "Required keys {} not in supplied parameters".format(
                    ", ".join(expected - pkeys)))

    def to_tsv(self, params):
        """Write the FeatureSet referenced by params['featureset_ref'] to a TSV.

        :returns: (featureset name, {'file_path': path of the written TSV})
        """
        working_dir = os.path.join(self.scratch,
                                   'featureset-download-' + str(uuid.uuid4()))
        os.makedirs(working_dir)
        header = ['Feature Id', 'Aliases', 'Genome', 'Type', 'Function']
        fs_name, fs_dicts = self.make_featureset_dict(params['featureset_ref'])
        files = {'file_path': "{}/{}.tsv".format(working_dir, fs_name)}
        # Fix: the file handle was previously opened inline and never closed;
        # use a context manager so the TSV is flushed/closed before it is used.
        with open(files['file_path'], 'w') as tsv_fh:
            writer = csv.DictWriter(tsv_fh, header, delimiter='\t',
                                    lineterminator='\n')
            writer.writeheader()
            for feat in fs_dicts:
                writer.writerow(feat)
        return fs_name, files

    def make_featureset_dict(self, fs_ref):
        """Fetch a FeatureSet and resolve its features via GenomeSearchUtil.

        :param fs_ref: workspace reference of the FeatureSet object
        :returns: (featureset name, list of per-feature dicts keyed by TSV column)
        """
        features = []
        ret = self.dfu.get_objects({'object_refs': [fs_ref]})['data'][0]
        feat_set = ret['data']
        fs_name = ret['info'][1]
        # 'elements' maps feature_id -> [genome_ref, ...]; group features per genome
        feat_by_genome = defaultdict(list)
        for k, v in feat_set['elements'].items():
            feat_by_genome[v[0]].append(k)

        for genome, fids in feat_by_genome.items():
            genome_name = self.ws.get_object_info3(
                {'objects': [{'ref': genome}]})['infos'][0][1]
            res = self.gsu.search({
                'ref': genome,
                'structured_query': {'feature_id': fids},
                'sort_by': [['contig_id', 1]],
                'start': 0,
                'limit': len(fids)
            })
            for feat in res['features']:
                features.append({
                    'Feature Id': feat['feature_id'],
                    'Aliases': ", ".join(sorted(feat['aliases'].keys())),
                    'Genome': "{} ({})".format(genome_name, genome),
                    'Type': feat['feature_type'],
                    'Function': feat['function']
                })
        return fs_name, features

    def export(self, files, name, params):
        """Package the given files plus the FeatureSet object for download.

        :param files: iterable of file paths, or a dict whose *values* are paths
            (e.g. the {'file_path': ...} dict returned by to_tsv)
        :returns: {'shock_id': ...} of the packaged download
        """
        export_package_dir = os.path.join(self.scratch, name + str(uuid.uuid4()))
        os.makedirs(export_package_dir)
        # Fix: iterating a dict yields its *keys* ('file_path'), not the paths;
        # unwrap values when a dict is passed so the actual files are moved.
        paths = files.values() if isinstance(files, dict) else files
        for file in paths:
            shutil.move(
                file, os.path.join(export_package_dir, os.path.basename(file)))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [params['featureset_ref']]
        })

        return {'shock_id': package_details['shock_id']}
class kb_ReadSim:
    '''
    Module Name:
    kb_ReadSim

    Module Description:
    A KBase module: kb_ReadSim
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = "https://github.com/kbasecollaborations/kb_ReadSim.git"
    GIT_COMMIT_HASH = "c9c0185e34d25be57cc6e1c901d8801fbc0f4784"

    #BEGIN_CLASS_HEADER
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.shared_folder = config['scratch']
        # Service/utility clients shared by the run_* methods below.
        self.du = DownloadUtils(self.callback_url)
        self.su = SimUtils()
        self.ru = ReadsUtils(self.callback_url)
        self.vu = VariationUtil(self.callback_url)
        self.eu = VcfEvalUtils()
        self.hu = htmlreportutils()
        self.ws_url = config['workspace-url']
        self.wsc = Workspace(self.ws_url)
        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)
        #END_CONSTRUCTOR
        pass

    def run_kb_ReadSim(self, ctx, params):
        """
        Simulate paired-end reads from an Assembly (or a Genome's assembly),
        upload them as a reads object, and save the introduced variants as a
        Variation object; results are returned in a KBaseReport.
        :param params: instance of type "Inparams" -> structure: parameter
           "workspace_name" of String, parameter "input_sample_set" of
           String, parameter "strain_info" of String, parameter
           "assembly_or_genome_ref" of String, parameter "base_error_rate"
           of String, parameter "outer_distance" of String, parameter
           "standard_deviation" of String, parameter "num_read_pairs" of
           String, parameter "len_first_read" of String, parameter
           "len_second_read" of String, parameter "mutation_rate" of String,
           parameter "frac_indels" of String, parameter
           "variation_object_name" of String, parameter "output_read_object"
           of String
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_kb_ReadSim
        output_dir = self.shared_folder
        print(params)
        self.su.validate_simreads_params(params)

        # Resolve the input to an Assembly ref: a Genome points at its
        # assembly via /assembly_ref; an Assembly is used as-is.
        genome_or_assembly_ref = params['assembly_or_genome_ref']
        obj_type = self.wsc.get_object_info3(
            {'objects': [{'ref': genome_or_assembly_ref}]})['infos'][0][2]
        if ('KBaseGenomes.Genome' in obj_type):
            genome_ref = genome_or_assembly_ref
            subset = self.wsc.get_object_subset([{
                'included': ['/assembly_ref'],
                'ref': genome_ref
            }])
            assembly_ref = subset[0]['data']['assembly_ref']
        elif ('KBaseGenomeAnnotations.Assembly' in obj_type):
            assembly_ref = genome_or_assembly_ref
        else:
            raise ValueError(obj_type +
                             ' is not the right input for this method. ' +
                             'Valid input include KBaseGenomes.Genome or ' +
                             'KBaseGenomeAnnotations.Assembly ')

        self.du.download_genome(assembly_ref, output_dir)

        # NOTE(review): "raed1/raed2" look like typos for "read1/read2", but
        # the names are used consistently below, so they are left unchanged.
        ref_genome = os.path.join(self.shared_folder, "ref_genome.fa")
        output_fwd_paired_file_path = os.path.join(self.shared_folder,
                                                   "raed1.fq")
        output_rev_paired_file_path = os.path.join(self.shared_folder,
                                                   "raed2.fq")

        self.eu.check_path_exists(ref_genome)

        # Run the read simulator, then sanity-check that both mates were written.
        self.su.simreads(ref_genome, output_fwd_paired_file_path,
                         output_rev_paired_file_path, params)
        self.eu.check_path_exists(output_fwd_paired_file_path)
        self.eu.check_path_exists(output_rev_paired_file_path)

        retVal = self.ru.upload_reads({
            'wsname': params['workspace_name'],
            'name': params['output_read_object'],
            'sequencing_tech': 'illumina',
            'fwd_file': output_fwd_paired_file_path,
            'rev_file': output_rev_paired_file_path
        })

        # The simulator logs the introduced variants to variant.txt; convert
        # that log to VCF and save it as a Variation object.
        logfile = os.path.join(self.shared_folder, "variant.txt")
        self.eu.check_path_exists(logfile)

        vcf_file = self.su.format_vcf(logfile)
        self.eu.check_path_exists(vcf_file)

        save_variation_params = {
            'workspace_name': params['workspace_name'],
            'genome_or_assembly_ref': params['assembly_or_genome_ref'],
            'sample_set_ref': params['input_sample_set'],
            'sample_attribute_name': 'sample_attr',
            'vcf_staging_file_path': vcf_file,
            'variation_object_name': params['variation_object_name']
        }
        self.vu.save_variation_from_vcf(save_variation_params)

        report = KBaseReport(self.callback_url)
        report_info = report.create({
            'report': {
                'objects_created': [],
                'text_message': 'Success'
            },
            'workspace_name': params['workspace_name']
        })
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
        }
        #END run_kb_ReadSim

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_kb_ReadSim return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def run_eval_variantcalling(self, ctx, params):
        """
        Compare two Variation objects (e.g. simulated vs. called variants):
        both must share a sample set and an assembly/genome; produces unique
        and common VCFs, a Venn diagram, and an HTML report.
        :param params: instance of type "Evalparams" -> structure: parameter
           "workspace_name" of String, parameter "sim_varobject_name" of
           String, parameter "calling_varobject_name" of String, parameter
           "output_var_object" of String
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_eval_variantcalling

        print(params)
        self.eu.validate_eval_params(params)

        report_dir = os.path.join(self.shared_folder, str(uuid.uuid4()))
        os.mkdir(report_dir)

        self.ws = Workspace(url=self.ws_url, token=ctx['token'])

        # Both Variation objects must come from the same sample set.
        var_object_ref1 = params['varobject_ref1']
        sampleset_ref1 = self.ws.get_objects2({
            'objects': [{
                "ref": var_object_ref1,
                'included': ['/sample_set_ref']
            }]
        })['data'][0]['data']['sample_set_ref']

        var_object_ref2 = params['varobject_ref2']
        sampleset_ref2 = self.ws.get_objects2({
            'objects': [{
                "ref": var_object_ref2,
                'included': ['/sample_set_ref']
            }]
        })['data'][0]['data']['sample_set_ref']

        if (sampleset_ref1 != sampleset_ref2):
            raise Exception(
                "Variation objects are from different sample set\n")

        # Collect each object's assembly_ref (or genome_ref) so we can verify
        # both variation objects were called against the same reference.
        assembly_ref_set = set()
        genomeset_ref_set = set()

        variation_obj1 = self.ws.get_objects2(
            {'objects': [{'ref': var_object_ref1}]})['data'][0]

        if 'assembly_ref' in variation_obj1['data']:
            assembly_ref1 = variation_obj1['data']['assembly_ref']
            assembly_ref_set.add(assembly_ref1)
        elif 'genome_ref' in variation_obj1['data']:
            genome_ref1 = variation_obj1['data']['genome_ref']
            genomeset_ref_set.add(genome_ref1)

        variation_obj2 = self.ws.get_objects2(
            {'objects': [{'ref': var_object_ref2}]})['data'][0]
        if 'assembly_ref' in variation_obj2['data']:
            assembly_ref2 = variation_obj2['data']['assembly_ref']
            assembly_ref_set.add(assembly_ref2)
        elif 'genome_ref' in variation_obj2['data']:
            genome_ref2 = variation_obj2['data']['genome_ref']
            genomeset_ref_set.add(genome_ref2)

        # Currently only used by the commented-out save logic below.
        assembly_or_genome_ref = None

        if (not genomeset_ref_set and len(assembly_ref_set) != 1):
            raise Exception(
                "variation objects are from different assembly refs")
        elif (not assembly_ref_set and len(genomeset_ref_set) != 1):
            raise Exception("variation objects are from different genome refs")

        # Download both VCFs, index them, then run the comparison.
        simvarfile = os.path.join(report_dir, "simvarinat.vcf.gz")
        simvarpath = self.du.download_variations(var_object_ref1, simvarfile)

        os.rename(simvarpath, simvarfile)
        self.eu.index_vcf(simvarfile)

        callingvarfile = os.path.join(report_dir, "callingvarinat.vcf.gz")
        callingvarpath = self.du.download_variations(var_object_ref2,
                                                     callingvarfile)

        os.rename(callingvarpath, callingvarfile)
        self.eu.index_vcf(callingvarfile)

        eval_results = self.eu.variant_evalation(simvarfile, callingvarfile,
                                                 report_dir)

        unique_vcf1 = eval_results['unique1']
        self.eu.check_path_exists(unique_vcf1)

        unique_vcf2 = eval_results['unique2']
        self.eu.check_path_exists(unique_vcf2)

        common_vcf = eval_results['common']
        self.eu.check_path_exists(common_vcf)

        image_path = self.eu.plot_venn_diagram(report_dir, unique_vcf1,
                                               unique_vcf2, common_vcf)
        self.eu.check_path_exists(image_path)

        '''
        if(len(assembly_ref_set) != 0):
            assembly_or_genome_ref = assembly_ref_set.pop()
        elif(len(genomeset_ref_set) != 0):
            assembly_or_genome_ref = genomeset_ref_set.pop()

        logging.info("Saving Unique1 vcf\n")
        save_unique_variation_params1 = {'workspace_name': params['workspace_name'],
                                         'genome_or_assembly_ref': assembly_or_genome_ref,
                                         'sample_set_ref': sampleset_ref1,
                                         'sample_attribute_name': 'sample_unique_attr1',
                                         'vcf_staging_file_path': unique_vcf1,
                                         'variation_object_name': params['output_variant_object'] + "_sample1_unique"
                                         }
        self.vu.save_variation_from_vcf(save_unique_variation_params1)
        logging.info("Saving done\n")

        logging.info("Saving Unique2 vcf\n")
        save_unique_variation_params2 = {'workspace_name': params['workspace_name'],
                                         'genome_or_assembly_ref': assembly_or_genome_ref,
                                         'sample_set_ref': sampleset_ref1,
                                         'sample_attribute_name': 'sample_unique_attr2',
                                         'vcf_staging_file_path': unique_vcf2,
                                         'variation_object_name': params['output_variant_object'] + "_sample2_unique"
                                         }
        self.vu.save_variation_from_vcf(save_unique_variation_params2)
        logging.info("Saving done\n")

        logging.info("Saving Common vcf\n")
        save_common_variation_params = {'workspace_name': params['workspace_name'],
                                        'genome_or_assembly_ref': assembly_or_genome_ref,
                                        'sample_set_ref': sampleset_ref1,
                                        'sample_attribute_name': 'sample_common_attr',
                                        'vcf_staging_file_path': common_vcf,
                                        'variation_object_name': params['output_variant_object'] + "_sample1_sample2_common"
                                        }
        self.vu.save_variation_from_vcf(save_common_variation_params)
        logging.info("Saving done\n")
        '''

        workspace = params['workspace_name']
        output = self.hu.create_html_report(self.callback_url, report_dir,
                                            workspace)
        #END run_eval_variantcalling

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_eval_variantcalling return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
class BwaIndexBuilder:
    """Builds a bwa index for an Assembly/ContigSet/Genome, with a workspace-backed cache.

    The index tarball is cached as a hidden workspace object that references the
    assembly, so later runs with the same assembly can skip the index build.
    """

    def __init__(self, scratch_dir, ws_url, callback_url, service_wizard_url, provenance):
        self.scratch_dir = scratch_dir
        self.ws_url = ws_url
        self.ws = Workspace(self.ws_url)
        self.callback_url = callback_url
        self.service_wizard_url = service_wizard_url
        self.bwa = BwaRunner(self.scratch_dir)
        self.provenance = provenance

    def get_index(self, params):
        '''
        The key function of this module- get a bwa index for the specified input

        :param params: dict with required 'ref' (assembly or genome ref) and
            optional 'output_dir' and 'ws_for_cache'
        :returns: dict with output_dir, index_files_basename, from_cache,
            pushed_to_cache, assembly_ref, genome_ref
        '''
        # validate the parameters and fetch assembly_info
        validated_params = self._validate_params(params)
        assembly_info = self._get_assembly_info(validated_params['ref'])

        # check the cache (keyed off of assembly_info)
        index_info = self._get_cached_index(assembly_info, validated_params)
        if index_info:
            index_info['from_cache'] = 1
            index_info['pushed_to_cache'] = 0
        else:
            # on a cache miss, build the index
            index_info = self._build_index(assembly_info, validated_params)
            index_info['from_cache'] = 0
            # pushed_to_cache will be set in return from _build_index

        index_info['assembly_ref'] = assembly_info['ref']
        index_info['genome_ref'] = assembly_info['genome_ref']

        return index_info

    def _validate_params(self, params):
        ''' validate parameters; can do some processing here to produce validated params '''
        # params['ref'] = params['assembly_or_genome_ref']
        validated_params = {'ref': None}
        if 'ref' in params and params['ref']:
            validated_params['ref'] = params['ref']
        else:
            raise ValueError('"ref" field indicating either an assembly or genome is required.')

        if 'output_dir' in params:
            validated_params['output_dir'] = params['output_dir']
        else:
            validated_params['output_dir'] = os.path.join(
                self.scratch_dir, 'bwa_index_' + str(int(time.time() * 100)))

        if os.path.exists(validated_params['output_dir']):
            # Fix: the original did `raise ('...')`, which raises a plain string —
            # a TypeError in Python 3 (exceptions must derive from BaseException).
            raise ValueError('Output directory name specified (' +
                             validated_params['output_dir'] +
                             ') already exists. Will not overwrite, so aborting.')

        if 'ws_for_cache' in params and params['ws_for_cache']:
            validated_params['ws_for_cache'] = params['ws_for_cache']
        else:
            print('WARNING: bwa index if created will not be cached because "ws_for_cache" field not set')
            validated_params['ws_for_cache'] = None

        return validated_params

    def _get_assembly_info(self, ref):
        ''' given a ref to an assembly or genome, figure out the assembly and return its info '''
        info = self.ws.get_object_info3({'objects': [{'ref': ref}]})['infos'][0]
        obj_type = info[2]
        if obj_type.startswith('KBaseGenomeAnnotations.Assembly') or obj_type.startswith('KBaseGenomes.ContigSet'):
            return {'info': info, 'ref': ref, 'genome_ref': None}

        if obj_type.startswith('KBaseGenomes.Genome'):
            # we need to get the assembly for this genome
            ga = GenomeAnnotationAPI(self.service_wizard_url)
            assembly_ref = ga.get_assembly({'ref': ref})
            # using the path ensures we can access the assembly even if we don't have direct access
            ref_path = ref + ';' + assembly_ref
            info = self.ws.get_object_info3({'objects': [{'ref': ref_path}]})['infos'][0]
            return {'info': info, 'ref': ref_path, 'genome_ref': ref}

        raise ValueError('Input object was not of type: Assembly, ContigSet or Genome. Cannot get bwa Index.')

    def _get_cached_index(self, assembly_info, validated_params):
        """Look for a cached index object referencing the assembly.

        :returns: index info dict on a hit; False on a miss; None on lookup error.
        """
        try:
            # note: list_reference_objects does not yet support reference paths, so we need to call
            # with the direct reference. So we won't get a cache hit if you don't have direct access
            # to the assembly object right now (although you can still always build the assembly object)
            # Once this call supports paths, this should be changed to set ref = assembly_info['ref']
            info = assembly_info['info']
            ref = str(info[6]) + '/' + str(info[0]) + '/' + str(info[4])
            objs = self.ws.list_referencing_objects([{'ref': ref}])[0]

            # iterate through each of the objects that reference the assembly
            # NOTE(review): the cache type is Bowtie2IndexV2 even though this module
            # builds bwa indexes — presumably deliberate reuse of an existing type;
            # the 'bt2_index.tar.gz' filename below matches it.
            bwa_indexes = []
            for o in objs:
                if o[2].startswith('KBaseRNASeq.Bowtie2IndexV2'):
                    bwa_indexes.append(o)

            # Nothing refs this assembly, so cache miss
            if len(bwa_indexes) == 0:
                return False

            # if there is more than one hit, get the most recent one
            # (obj_info[3] is the save_date timestamp (eg 2017-05-30T22:56:49+0000), so we can sort on that)
            bwa_indexes.sort(key=lambda x: x[3])
            bwa_index_info = bwa_indexes[-1]
            index_ref = (str(bwa_index_info[6]) + '/' + str(bwa_index_info[0]) +
                         '/' + str(bwa_index_info[4]))

            # get the object data
            index_obj_data = self.ws.get_objects2({'objects': [{'ref': index_ref}]})['data'][0]['data']

            # download the handle object
            os.makedirs(validated_params['output_dir'])

            dfu = DataFileUtil(self.callback_url)
            dfu.shock_to_file({'file_path': os.path.join(validated_params['output_dir'], 'bt2_index.tar.gz'),
                               'handle_id': index_obj_data['handle']['hid'],
                               'unpack': 'unpack'})

            print('Cache hit: ')
            pprint(index_obj_data)
            return {'output_dir': validated_params['output_dir'],
                    'index_files_basename': index_obj_data['index_files_basename']}
        except Exception:
            # if we fail in saving the cached object, don't worry
            print('WARNING: exception encountered when trying to lookup in cache:')
            print(traceback.format_exc())
            print('END WARNING: exception encountered when trying to lookup in cache.')

        return None

    def _put_cached_index(self, assembly_info, index_files_basename, output_dir, ws_for_cache):
        """Save the built index tarball to the workspace cache; best-effort."""
        if not ws_for_cache:
            print('WARNING: bwa index cannot be cached because "ws_for_cache" field not set')
            return False

        try:
            dfu = DataFileUtil(self.callback_url)
            result = dfu.file_to_shock({'file_path': output_dir,
                                        'make_handle': 1,
                                        'pack': 'targz'})

            bwa_index = {'handle': result['handle'], 'size': result['size'],
                         'assembly_ref': assembly_info['ref'],
                         'index_files_basename': index_files_basename}

            ws = Workspace(self.ws_url)
            save_params = {'objects': [{'hidden': 1,
                                        'provenance': self.provenance,
                                        'name': os.path.basename(output_dir),
                                        'data': bwa_index,
                                        'type': 'KBaseRNASeq.Bowtie2IndexV2'
                                        }]
                           }
            # ws_for_cache may be a numeric workspace id or a workspace name
            if ws_for_cache.strip().isdigit():
                save_params['id'] = int(ws_for_cache)
            else:
                save_params['workspace'] = ws_for_cache.strip()
            save_result = ws.save_objects(save_params)
            print('Bowtie2IndexV2 cached to: ')
            pprint(save_result[0])
            return True

        except Exception:
            # if we fail in saving the cached object, don't worry
            print('WARNING: exception encountered when trying to cache the index files:')
            print(traceback.format_exc())
            print('END WARNING: exception encountered when trying to cache the index files')

        return False

    def _build_index(self, assembly_info, validated_params):
        """Download the assembly FASTA, run 'bwa index', and push to the cache."""
        # get the assembly as a fasta file using AssemblyUtil
        au = AssemblyUtil(self.callback_url)
        fasta_info = au.get_assembly_as_fasta({'ref': assembly_info['ref']})

        # make the target destination folder (check again it wasn't created yet)
        if os.path.exists(validated_params['output_dir']):
            # Fix: was `raise ('...')` — raising a plain string is a TypeError in
            # Python 3; raise a proper exception instead.
            raise ValueError('Output directory name specified (' +
                             validated_params['output_dir'] +
                             ') already exists. Will not overwrite, so aborting.')
        os.makedirs(validated_params['output_dir'])

        # configure the command line args and run it
        cli_params = self._build_cli_params(fasta_info['path'],
                                            fasta_info['assembly_name'],
                                            validated_params)
        self.bwa.run('index', cli_params)
        # bwa writes the index files next to the fasta in the work dir; collect them
        for file in glob.glob(r'/kb/module/work/tmp/' + fasta_info['assembly_name'] + '.*'):
            print(file)
            shutil.copy(file, validated_params['output_dir'])

        index_info = {'output_dir': validated_params['output_dir'],
                      'index_files_basename': fasta_info['assembly_name']}

        # cache the result, mark if it worked or not
        cache_success = self._put_cached_index(assembly_info,
                                               fasta_info['assembly_name'],
                                               validated_params['output_dir'],
                                               validated_params['ws_for_cache'])
        if cache_success:
            index_info['pushed_to_cache'] = 1
        else:
            index_info['pushed_to_cache'] = 0

        return index_info

    def _build_cli_params(self, fasta_file_path, index_files_basename, validated_params):
        """Assemble the 'bwa index' argument list."""
        cli_params = []
        # positional args: first the fasta path, then the base name used for the index files
        cli_params.append(fasta_file_path)
        cli_params.append("-p")
        cli_params.append(index_files_basename)
        return cli_params
def run_generate_metadata_report(self, ctx, params): """ This example function accepts any number of parameters and returns results in a KBaseReport :param params: instance of mapping from String to unspecified object :returns: instance of type "ReportResults" -> structure: parameter "report_name" of String, parameter "report_ref" of String """ # ctx is the context object # return variables are: output #BEGIN run_generate_metadata_report object_type = params['object_type'] workspace_name = params['workspace_name'] ws = Workspace(self.ws_url) print(params) objects_in_workspace = ws.list_objects({ 'workspaces': [workspace_name], 'type': object_type }) object_names = sorted([j[1] for j in objects_in_workspace]) d = dict() if (object_type == 'KBaseRNASeq.RNASeqAlignment'): for object_name in object_names: alignment_stats = ws.get_objects2({ 'objects': [{ 'workspace': workspace_name, 'name': object_name }] })['data'][0]['data']['alignment_stats'] metadata_keys = alignment_stats.keys() object_pd = pd.Series(alignment_stats, index=metadata_keys) d[object_name] = object_pd else: for object_name in object_names: obj_meta_data = ws.get_object_info3( { 'objects': [{ 'workspace': workspace_name, 'name': object_name }], 'includeMetadata': 1 }, ) metadata = obj_meta_data.get('infos')[0][10] metadata_keys = metadata.keys() object_pd = pd.Series(metadata, index=metadata_keys) d[object_name] = object_pd df = pd.DataFrame(d) htmlDir = os.path.join(self.shared_folder, str(uuid.uuid4())) self._mkdir_p(htmlDir) report_file_path = os.path.join(htmlDir, "index.html") #df.to_html(report_file_path) self.write_pd_html(df.T, report_file_path) try: html_upload_ret = self.dfu.file_to_shock({ 'file_path': htmlDir, 'make_handle': 0, 'pack': 'zip' }) except Exception: raise ValueError('Error uploading HTML file: ' + str(htmlDir) + ' to shock') reportname = 'generate_metadata_report_' + str(uuid.uuid4()) reportobj = { 'message': '', 'direct_html': None, 'direct_html_link_index': 0, 'file_links': 
[], 'html_links': [], 'html_window_height': 500, 'workspace_name': params['workspace_name'], 'report_object_name': reportname } # attach to report obj reportobj['direct_html'] = '' reportobj['direct_html_link_index'] = 0 reportobj['html_links'] = [{ 'shock_id': html_upload_ret['shock_id'], 'name': 'index.html', 'label': 'index.html' }] report = KBaseReport(self.callback_url, token=ctx['token']) report_info = report.create_extended_report(reportobj) output = { 'report_name': report_info['name'], 'report_ref': report_info['ref'] } print(output) #END run_generate_metadata_report # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError( 'Method run_generate_metadata_report return value ' + 'output is not type dict as required.') # return the results return [output]
class FeatureSetBuilder:
    """
    Builds KBaseCollections.FeatureSet objects, either from a
    DifferentialExpressionMatrixSet (up/down regulated gene sets filtered by
    p/q/fold-change cutoffs) or from explicit feature id lists, and can filter
    an ExpressionMatrix down to a FeatureSet's features. Talks to the KBase
    Workspace, DataFileUtil, GenomeSearchUtil and KBaseReport services.
    """

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            # tolerate an already-existing directory; re-raise anything else
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _validate_upload_featureset_from_diff_expr_params(self, params):
        """
        _validate_upload_featureset_from_diff_expr_params:
                validates params passed to upload_featureset_from_diff_expr method

        :raises ValueError: if a required key is missing or fold_scale_type
                            is present but not 'logarithm'
        """
        log('start validating upload_featureset_from_diff_expr params')

        # check for required parameters
        for p in ['diff_expression_ref', 'workspace_name',
                  'p_cutoff', 'q_cutoff', 'fold_change_cutoff']:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

        p = params.get('fold_scale_type')
        if p and p != 'logarithm':
            raise ValueError('"fold_scale_type" parameter must be set to "logarithm", if used')

    @staticmethod
    def validate_params(params, expected, opt_param=set()):
        """Validates that required parameters are present. Warns if unexpected
        parameters appear

        :param params: the supplied parameter dict
        :param expected: iterable of required key names
        :param opt_param: iterable of allowed optional key names
        :raises ValueError: if any required key is missing
        """
        # NOTE(review): mutable default argument (opt_param=set()) — harmless
        # here because it is only read, but a None sentinel would be safer.
        expected = set(expected)
        opt_param = set(opt_param)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError("Required keys {} not in supplied parameters"
                             .format(", ".join(expected - pkeys)))
        defined_param = expected | opt_param
        for param in params:
            if param not in defined_param:
                logging.warning("Unexpected parameter {} supplied".format(param))

    def _generate_report(self, up_feature_set_ref_list, down_feature_set_ref_list,
                         filtered_expression_matrix_ref_list, workspace_name):
        """
        _generate_report: generate summary report

        Collects all created object refs, builds the HTML summary and saves an
        extended KBaseReport. Returns {'report_name': ..., 'report_ref': ...}.
        """
        log('start creating report')

        output_html_files = self._generate_html_report(up_feature_set_ref_list,
                                                       down_feature_set_ref_list)

        objects_created = list()
        for up_feature_set_ref in up_feature_set_ref_list:
            objects_created += [{'ref': up_feature_set_ref,
                                 'description': 'Upper FeatureSet Object'}]
        for down_feature_set_ref in down_feature_set_ref_list:
            objects_created += [{'ref': down_feature_set_ref,
                                 'description': 'Lower FeatureSet Object'}]
        for filtered_expression_matrix_ref in filtered_expression_matrix_ref_list:
            objects_created += [{'ref': filtered_expression_matrix_ref,
                                 'description': 'Filtered ExpressionMatrix Object'}]

        report_params = {'message': '',
                         'workspace_name': workspace_name,
                         'objects_created': objects_created,
                         'html_links': output_html_files,
                         'direct_html_link_index': 0,
                         'html_window_height': 333,
                         'report_object_name': 'kb_FeatureSetUtils_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output

    def _generate_html_report(self, up_feature_set_ref_list, down_feature_set_ref_list):
        """
        _generate_html_report: generate html summary report

        Fills report_template.html (shipped next to this module) with one
        table row per up/down FeatureSet, listing its name and feature count.
        Returns a one-element list describing the written HTML file.
        """
        log('start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'report.html')

        uppper_feature_content = ''
        for up_feature_set_ref in up_feature_set_ref_list:
            feature_set_obj = self.ws.get_objects2({'objects':
                                                    [{'ref': up_feature_set_ref}]})['data'][0]
            feature_set_data = feature_set_obj['data']
            feature_set_info = feature_set_obj['info']

            feature_set_name = feature_set_info[1]

            elements = feature_set_data.get('elements')
            feature_ids = list(elements.keys())

            uppper_feature_content += '<tr><td>{}</td><td>{}</td></tr>'.format(
                feature_set_name, len(feature_ids))

        lower_feature_content = ''
        for down_feature_set_ref in down_feature_set_ref_list:
            feature_set_obj = self.ws.get_objects2({'objects':
                                                    [{'ref': down_feature_set_ref}]})['data'][0]
            feature_set_data = feature_set_obj['data']
            feature_set_info = feature_set_obj['info']

            feature_set_name = feature_set_info[1]

            elements = feature_set_data.get('elements')
            feature_ids = list(elements.keys())

            lower_feature_content += '<tr><td>{}</td><td>{}</td></tr>'.format(
                feature_set_name, len(feature_ids))

        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(os.path.dirname(__file__), 'report_template.html'),
                      'r') as report_template_file:
                report_template = report_template_file.read()
                # the template contains these placeholder rows verbatim
                report_template = report_template.replace('<tr><td>Upper_FeatureSet</td></tr>',
                                                          uppper_feature_content)
                report_template = report_template.replace('<tr><td>Lower_FeatureSet</td></tr>',
                                                          lower_feature_content)
                result_file.write(report_template)

        html_report.append({'path': result_file_path,
                            'name': os.path.basename(result_file_path),
                            'label': os.path.basename(result_file_path),
                            'description': 'HTML summary report'})

        return html_report

    def _process_diff_expression(self, diff_expression_set_ref, result_directory,
                                 condition_label_pair):
        """
        _process_diff_expression: process differential expression object info

        Writes gene_results.csv (gene_id, log2_fold_change, p_value, q_value)
        for the set item whose label matches condition_label_pair, and returns
        (csv path, genome ref, matching DifferentialExpressionMatrix ref).
        """
        # NOTE(review): genome_id / selected_diff_expression_ref are only
        # bound inside the matching-condition branch; if no set item matches
        # condition_label_pair the final return raises UnboundLocalError —
        # presumably callers always pass a pair taken from this set. Verify.
        log('start processing differential expression object')

        diff_expr_set_data = self.ws.get_objects2({'objects':
                                                   [{'ref': diff_expression_set_ref}]
                                                   })['data'][0]['data']

        set_items = diff_expr_set_data['items']

        diff_expr_matrix_file_name = 'gene_results.csv'
        diff_expr_matrix_file = os.path.join(result_directory, diff_expr_matrix_file_name)

        # write the header first; rows are appended per matching set item
        with open(diff_expr_matrix_file, 'w') as csvfile:
            fieldnames = ['gene_id', 'log2_fold_change', 'p_value', 'q_value']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()

        for set_item in set_items:
            diff_expression_ref = set_item['ref']

            diff_expression_data = self.ws.get_objects2({'objects':
                                                         [{'ref': diff_expression_ref}]
                                                         })['data'][0]['data']

            # item label is a comma-separated condition pair, e.g. "a,b"
            label_string = set_item['label']
            label_list = [x.strip() for x in label_string.split(',')]
            condition_1 = label_list[0]
            condition_2 = label_list[1]

            if condition_1 in condition_label_pair and condition_2 in condition_label_pair:
                genome_id = diff_expression_data['genome_ref']
                matrix_data = diff_expression_data['data']
                selected_diff_expression_ref = diff_expression_ref

                with open(diff_expr_matrix_file, 'a') as csvfile:
                    row_ids = matrix_data.get('row_ids')
                    row_values = matrix_data.get('values')
                    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

                    for pos, row_id in enumerate(row_ids):
                        row_value = row_values[pos]
                        writer.writerow({'gene_id': row_id,
                                         'log2_fold_change': row_value[0],
                                         'p_value': row_value[1],
                                         'q_value': row_value[2]})

        return diff_expr_matrix_file, genome_id, selected_diff_expression_ref

    def _generate_feature_set(self, feature_ids, genome_id, workspace_name, feature_set_name):
        """
        _generate_feature_set: generate FeatureSet object

        KBaseCollections.FeatureSet type:
        typedef structure {
            string description;
            list<feature_id> element_ordering;
            mapping<feature_id, list<genome_ref>> elements;
        } FeatureSet;

        Saves the FeatureSet via DataFileUtil and returns its ws ref
        ("wsid/objid/version").
        """
        log('start saving KBaseCollections.FeatureSet object')

        # workspace_name may already be a numeric workspace id
        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        elements = {feature_id: [genome_id] for feature_id in feature_ids}
        feature_set_data = {'description': 'Generated FeatureSet from DifferentialExpression',
                            'element_ordering': feature_ids,
                            'elements': elements}

        object_type = 'KBaseCollections.FeatureSet'
        save_object_params = {
            'id': workspace_id,
            'objects': [{'type': object_type,
                         'data': feature_set_data,
                         'name': feature_set_name}]}

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        # object_info tuple: [6]=wsid, [0]=objid, [4]=version
        feature_set_obj_ref = "{}/{}/{}".format(dfu_oi[6], dfu_oi[0], dfu_oi[4])

        return feature_set_obj_ref

    def _process_matrix_file(self, diff_expr_matrix_file, comp_p_value, comp_q_value,
                             comp_fold_change_cutoff):
        """
        _process_matrix_file: filter matrix file by given cutoffs

        Returns (up_feature_ids, down_feature_ids): genes passing both p and q
        cutoffs with fold change >= cutoff (up) or <= -cutoff (down).
        Rows containing 'NA', 'null' or empty values are skipped.
        """
        log('start processing matrix file')

        up_feature_ids = []
        down_feature_ids = []

        # treat the cutoff symmetrically regardless of its sign
        if comp_fold_change_cutoff < 0:
            comp_fold_change_cutoff = -comp_fold_change_cutoff

        with open(diff_expr_matrix_file, 'r') as file:
            reader = csv.DictReader(file)

            for row in reader:
                feature_id = row['gene_id']
                row_p_value = row['p_value']
                row_q_value = row['q_value']
                row_fold_change_cutoff = row['log2_fold_change']

                null_value = {'NA', 'null', ''}
                col_value = {row_p_value, row_q_value, row_fold_change_cutoff}

                if not col_value.intersection(null_value):
                    p_value_condition = float(row_p_value) <= comp_p_value
                    q_value_condition = float(row_q_value) <= comp_q_value

                    up_matches_condition = (p_value_condition and q_value_condition
                                            and (float(row_fold_change_cutoff) >=
                                                 comp_fold_change_cutoff))

                    down_matches_condition = (p_value_condition and q_value_condition
                                              and (float(row_fold_change_cutoff) <=
                                                   -comp_fold_change_cutoff))

                    if up_matches_condition:
                        up_feature_ids.append(feature_id)
                    elif down_matches_condition:
                        down_feature_ids.append(feature_id)

        return list(set(up_feature_ids)), list(set(down_feature_ids))

    def _filter_expression_matrix(self, expression_matrix_ref, feature_ids,
                                  workspace_name, filtered_expression_matrix_suffix="",
                                  diff_expression_matrix_ref=None,
                                  filtered_expression_matrix_name=None):
        """
        _filter_expression_matrix: generated filtered expression matrix

        Keeps only rows whose row_id is in feature_ids, saves the result as a
        new object of the same type and returns its ws ref. If no explicit
        name is given one is derived from the source matrix name + suffix.
        """
        log('start saving ExpressionMatrix object')

        # workspace_name may already be a numeric workspace id
        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        expression_matrix_obj = self.dfu.get_objects({'object_refs':
                                                      [expression_matrix_ref]})['data'][0]

        expression_matrix_info = expression_matrix_obj['info']
        expression_matrix_data = expression_matrix_obj['data']
        expression_matrix_name = expression_matrix_info[1]

        if not filtered_expression_matrix_name:
            # replace a trailing "expression_matrix"-like token with the
            # suffix, else just append the suffix
            if re.match('.*_*[Ee]xpression_*[Mm]atrix', expression_matrix_name):
                filtered_expression_matrix_name = re.sub('_*[Ee]xpression_*[Mm]atrix',
                                                         filtered_expression_matrix_suffix,
                                                         expression_matrix_name)
            else:
                filtered_expression_matrix_name = expression_matrix_name + \
                    filtered_expression_matrix_suffix

        filtered_expression_matrix_data = expression_matrix_data.copy()

        data = filtered_expression_matrix_data['data']

        row_ids = data['row_ids']
        values = data['values']

        filtered_data = data.copy()

        filtered_row_ids = list()
        filtered_values = list()
        for pos, row_id in enumerate(row_ids):
            if row_id in feature_ids:
                filtered_row_ids.append(row_id)
                filtered_values.append(values[pos])

        filtered_data['row_ids'] = filtered_row_ids
        filtered_data['values'] = filtered_values

        filtered_expression_matrix_data['data'] = filtered_data

        expression_obj = {'type': expression_matrix_info[2],
                          'data': filtered_expression_matrix_data,
                          'name': filtered_expression_matrix_name}
        # we now save the filtering DEM in a EM field added for this purpose
        if diff_expression_matrix_ref:
            expression_obj['data']['diff_expr_matrix_ref'] = diff_expression_matrix_ref
            expression_obj['extra_provenance_input_refs'] = [diff_expression_matrix_ref]

        save_object_params = {
            'id': workspace_id,
            'objects': [expression_obj]}

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        filtered_expression_matrix_ref = "{}/{}/{}".format(dfu_oi[6], dfu_oi[0], dfu_oi[4])

        return filtered_expression_matrix_ref

    def _xor(self, a, b):
        # logical exclusive-or on truthiness: exactly one of a, b is truthy
        return bool(a) != bool(b)

    def _check_input_labels(self, condition_pairs, available_condition_labels):
        """
        _check_input_labels: check input condition pairs

        Each pair's 'label_string'[0] is a comma-separated "first,second"
        string; both labels must be in available_condition_labels and must
        differ. Returns True, or raises ValueError.
        """
        # NOTE(review): 'availalbe' below is a typo in a runtime error
        # message; left untouched here since this edit is comments-only.
        checked = True
        for condition_pair in condition_pairs:
            label_string = condition_pair['label_string'][0].strip()
            label_list = [x.strip() for x in label_string.split(',')]
            first_label = label_list[0]
            second_label = label_list[1]

            if first_label not in available_condition_labels:
                error_msg = 'Condition: {} is not availalbe. '.format(first_label)
                error_msg += 'Available conditions: {}'.format(available_condition_labels)
                raise ValueError(error_msg)

            if second_label not in available_condition_labels:
                error_msg = 'Condition: {} is not availalbe. '.format(second_label)
                error_msg += 'Available conditions: {}'.format(available_condition_labels)
                raise ValueError(error_msg)

            if first_label == second_label:
                raise ValueError('Input conditions are the same')

        return checked

    def _get_condition_labels(self, diff_expression_set_ref):
        """
        _get_condition_labels: get all possible condition label pairs

        Returns (condition_label_pairs, available_condition_labels) read from
        the DifferentialExpressionMatrixSet items' comma-separated labels.
        """
        log('getting all possible condition pairs')

        condition_label_pairs = list()
        available_condition_labels = set()
        diff_expression_set_obj = self.ws.get_objects2({'objects':
                                                        [{'ref': diff_expression_set_ref}]
                                                        })['data'][0]
        diff_expression_set_data = diff_expression_set_obj['data']
        items = diff_expression_set_data.get('items')
        for item in items:
            label_string = item['label']
            label_list = [x.strip() for x in label_string.split(',')]
            condition_label_pairs.append(label_list)
            available_condition_labels |= set(label_list)

        log('all possible condition pairs:\n{}'.format(condition_label_pairs))

        return condition_label_pairs, available_condition_labels

    def _get_feature_ids(self, genome_ref, ids):
        """
        _get_feature_ids: get feature ids from genome

        Queries GenomeSearchUtil for the given ids and returns the set of
        feature_ids actually present in the genome.
        """
        genome_features = self.gsu.search({'ref': genome_ref,
                                           'limit': len(ids),
                                           'structured_query': {"$or": [{"feature_id": x}
                                                                        for x in ids]},
                                           'sort_by': [['feature_id', True]]})['features']

        features_ids = set((feature.get('feature_id') for feature in genome_features))

        return features_ids

    def _build_fs_obj(self, params):
        """
        Assemble a KBaseCollections.FeatureSet dict from base FeatureSets
        and/or explicit feature id lists in params.

        :param params: expects 'genome'; optionally 'base_feature_sets',
            'feature_ids' (str or list), 'feature_ids_custom', 'description'
        :raises ValueError: if a requested feature id is absent from the genome
        """
        new_feature_set = {
            'description': '',
            'element_ordering': [],
            'elements': {}
        }
        genome_ref = params['genome']

        # merge in any base FeatureSets first, preserving element order and
        # de-duplicating both elements and their genome refs
        if params.get('base_feature_sets', []) and None not in params['base_feature_sets']:
            base_feature_sets = self.dfu.get_objects(
                {'object_refs': params['base_feature_sets']}
            )['data']
            for ret in base_feature_sets:
                base_set = ret['data']
                base_set_name = ret['info'][1]

                new_feature_set['element_ordering'] += [x for x in base_set['element_ordering']
                                                        if x not in new_feature_set['elements']]
                for element, genome_refs in base_set['elements'].items():
                    if element in new_feature_set['elements']:
                        new_feature_set['elements'][element] += [x for x in genome_refs
                                                                 if x not in
                                                                 new_feature_set['elements'][
                                                                     element]]
                    else:
                        new_feature_set['elements'][element] = genome_refs
                new_feature_set['description'] += 'From FeatureSet {}: {}\n'.format(
                    base_set_name, base_set.get('description'))

        # then add explicitly supplied feature ids, validated against the genome
        new_feature_ids = []
        if params.get('feature_ids'):
            if isinstance(params['feature_ids'], str):
                new_feature_ids += params['feature_ids'].split(',')
            else:
                new_feature_ids += params['feature_ids']
        if params.get('feature_ids_custom'):
            new_feature_ids += params['feature_ids_custom'].split(',')
        if new_feature_ids:
            genome_feature_ids = self._get_feature_ids(genome_ref, new_feature_ids)
        for new_feature in new_feature_ids:
            if new_feature not in genome_feature_ids:
                raise ValueError('Feature ID {} does not exist in the supplied genome {}'.format(
                    new_feature, genome_ref))
            if new_feature in new_feature_set['elements']:
                if genome_ref not in new_feature_set['elements'][new_feature]:
                    new_feature_set['elements'][new_feature].append(genome_ref)
            else:
                new_feature_set['elements'][new_feature] = [genome_ref]
                new_feature_set['element_ordering'].append(new_feature)

        # an explicit description overrides the accumulated provenance text
        if params.get('description'):
            new_feature_set['description'] = params['description']

        return new_feature_set

    def __init__(self, config):
        # service endpoints and credentials come from the SDK config
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.ws = Workspace(self.ws_url, token=self.token)
        self.dfu = DataFileUtil(self.callback_url)
        self.gsu = GenomeSearchUtil(self.callback_url)
        self.scratch = config['scratch']

    def upload_featureset_from_diff_expr(self, params):
        """
        upload_featureset_from_diff_expr: create FeatureSet from
        RNASeqDifferentialExpression based on given threshold cutoffs

        required params:
        diff_expression_ref: DifferentialExpressionMatrixSet object reference
        expression_matrix_ref: ExpressionMatrix object reference
        p_cutoff: p value cutoff
        q_cutoff: q value cutoff
        fold_scale_type: one of ["linear", "log2+1", "log10+1"]
        fold_change_cutoff: fold change cutoff
        feature_set_suffix: Result FeatureSet object name suffix
        filtered_expression_matrix_suffix: Result ExpressionMatrix object name suffix
        workspace_name: the name of the workspace it gets saved to

        return:
        result_directory: folder path that holds all files generated
        up_feature_set_ref_list: list of generated upper FeatureSet object reference
        down_feature_set_ref_list: list of generated down FeatureSet object reference
        filtered_expression_matrix_ref_list: list of generated filtered
                                             ExpressionMatrix object ref
        report_name: report name generated by KBaseReport
        report_ref: report reference generated by KBaseReport
        """
        self._validate_upload_featureset_from_diff_expr_params(params)

        diff_expression_set_ref = params.get('diff_expression_ref')
        diff_expression_set_info = self.ws.get_object_info3({"objects":
                                                             [{"ref": diff_expression_set_ref}]}
                                                            )['infos'][0]
        diff_expression_set_name = diff_expression_set_info[1]

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)

        (available_condition_label_pairs,
         available_condition_labels) = self._get_condition_labels(diff_expression_set_ref)

        # exactly one of run_all_combinations / condition_pairs must be given
        run_all_combinations = params.get('run_all_combinations')
        condition_pairs = params.get('condition_pairs')

        if not self._xor(run_all_combinations, condition_pairs):
            error_msg = "Invalid input:\nselect 'Run All Paired Condition Combinations' "
            error_msg += "or provide partial condition pairs. Don't do both or neither"
            raise ValueError(error_msg)

        if run_all_combinations:
            condition_label_pairs = available_condition_label_pairs
        else:
            if self._check_input_labels(condition_pairs, available_condition_labels):
                condition_label_pairs = list()
                for condition_pair in condition_pairs:
                    label_string = condition_pair['label_string'][0].strip()
                    condition_labels = [x.strip() for x in label_string.split(',')]
                    condition_label_pairs.append(condition_labels)

        up_feature_set_ref_list = list()
        down_feature_set_ref_list = list()
        filtered_expression_matrix_ref_list = list()

        for condition_label_pair in condition_label_pairs:
            condition_string = '-'.join(reversed(condition_label_pair))
            diff_expr_matrix_file, genome_id, diff_expr_matrix_ref = self._process_diff_expression(
                diff_expression_set_ref, result_directory, condition_label_pair)
            up_feature_ids, down_feature_ids = self._process_matrix_file(
                diff_expr_matrix_file, params.get('p_cutoff'), params.get('q_cutoff'),
                params.get('fold_change_cutoff'))
            filtered_em_name = _sanitize_name(condition_string) + params.get(
                'filtered_expression_matrix_suffix')
            if params.get('expression_matrix_ref'):
                filtered_expression_matrix_ref = self._filter_expression_matrix(
                    params.get('expression_matrix_ref'), up_feature_ids + down_feature_ids,
                    params.get('workspace_name'), "", diff_expr_matrix_ref, filtered_em_name)
                filtered_expression_matrix_ref_list.append(filtered_expression_matrix_ref)

            feature_set_suffix = params.get('feature_set_suffix', "")
            up_feature_set_name = "{}_{}_up{}".format(
                diff_expression_set_name, _sanitize_name(condition_string), feature_set_suffix)
            up_feature_set_ref = self._generate_feature_set(up_feature_ids,
                                                            genome_id,
                                                            params.get('workspace_name'),
                                                            up_feature_set_name)
            up_feature_set_ref_list.append(up_feature_set_ref)

            down_feature_set_name = "{}_{}_down{}".format(
                diff_expression_set_name, _sanitize_name(condition_string), feature_set_suffix)
            down_feature_set_ref = self._generate_feature_set(down_feature_ids,
                                                              genome_id,
                                                              params.get('workspace_name'),
                                                              down_feature_set_name)
            down_feature_set_ref_list.append(down_feature_set_ref)

        returnVal = {'result_directory': result_directory,
                     'up_feature_set_ref_list': up_feature_set_ref_list,
                     'down_feature_set_ref_list': down_feature_set_ref_list,
                     'filtered_expression_matrix_ref_list': filtered_expression_matrix_ref_list}

        report_output = self._generate_report(up_feature_set_ref_list,
                                              down_feature_set_ref_list,
                                              filtered_expression_matrix_ref_list,
                                              params.get('workspace_name'))
        returnVal.update(report_output)

        return returnVal

    def filter_matrix_with_fs(self, params):
        """
        Filter an ExpressionMatrix down to the features of a FeatureSet and
        save the result, returning the new ref plus a KBaseReport
        name/reference.
        """
        self.validate_params(params, ('feature_set_ref', 'workspace_name',
                                      'expression_matrix_ref',
                                      'filtered_expression_matrix_suffix'))
        ret = self.dfu.get_objects(
            {'object_refs': [params['feature_set_ref']]}
        )['data'][0]
        feature_set = ret['data']
        feature_set_name = ret['info'][1]
        feature_ids = set(feature_set['elements'].keys())
        filtered_matrix_ref = self._filter_expression_matrix(
            params['expression_matrix_ref'], feature_ids,
            params['workspace_name'], params['filtered_expression_matrix_suffix'])

        objects_created = [{'ref': filtered_matrix_ref,
                            'description': 'Filtered ExpressionMatrix Object'}]
        message = "Filtered Expression Matrix based of the {} feature ids present in {}"\
            .format(len(feature_ids), feature_set_name)

        report_params = {'message': message,
                         'workspace_name': params['workspace_name'],
                         'objects_created': objects_created,
                         'report_object_name': 'kb_FeatureSetUtils_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        return {'filtered_expression_matrix_ref': filtered_matrix_ref,
                'report_name': output['name'], 'report_ref': output['ref']}

    def build_feature_set(self, params):
        """
        Build and save a FeatureSet from base FeatureSets and/or explicit
        feature id lists; returns the new ref plus a KBaseReport
        name/reference.

        :raises ValueError: if no feature source parameter is supplied
        """
        self.validate_params(params, {'output_feature_set', 'workspace_name', },
                             {'genome', 'feature_ids', 'feature_ids_custom',
                              'base_feature_sets', 'description'})
        feature_sources = ('feature_ids', 'feature_ids_custom', 'base_feature_sets')
        if not any([params.get(x) for x in feature_sources]):
            raise ValueError("You must supply at least one feature source: {}".format(
                ", ".join(feature_sources)))
        workspace_id = self.dfu.ws_name_to_id(params['workspace_name'])

        new_feature_set = self._build_fs_obj(params)
        save_object_params = {
            'id': workspace_id,
            'objects': [{'type': 'KBaseCollections.FeatureSet',
                         'data': new_feature_set,
                         'name': params['output_feature_set']}]}
        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        feature_set_obj_ref = '{}/{}/{}'.format(dfu_oi[6], dfu_oi[0], dfu_oi[4])

        objects_created = [{'ref': feature_set_obj_ref,
                            'description': 'Feature Set'}]
        message = 'A new feature set containing {} features was created.'.format(
            len(new_feature_set['elements']))

        report_params = {'message': message,
                         'workspace_name': params['workspace_name'],
                         'objects_created': objects_created,
                         'report_object_name': 'kb_FeatureSetUtils_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        return {'feature_set_ref': feature_set_obj_ref,
                'report_name': output['name'], 'report_ref': output['ref']}
class BwaAligner: def __init__(self, scratch_dir, workspace_url, callback_url, srv_wiz_url, provenance): self.scratch_dir = scratch_dir self.workspace_url = workspace_url self.callback_url = callback_url self.srv_wiz_url = srv_wiz_url self.provenance = provenance # from the provenance, extract out the version to run by exact hash if possible self.my_version = 'release' if len(provenance) > 0: if 'subactions' in provenance[0]: self.my_version = self.get_version_from_subactions( 'kb_Bwa', provenance[0]['subactions']) print('Running kb_Bwa version = ' + self.my_version) self.ws = Workspace(self.workspace_url) self.bwa = BwaRunner(self.scratch_dir) self.parallel_runner = KBParallel(self.callback_url) self.qualimap = kb_QualiMap(self.callback_url) def get_version_from_subactions(self, module_name, subactions): # go through each sub action looking for if not subactions: return 'release' # default to release if we can't find anything for sa in subactions: if 'name' in sa: if sa['name'] == module_name: # local-docker-image implies that we are running in kb-test, so return 'dev' if sa['commit'] == 'local-docker-image': return 'dev' # to check that it is a valid hash, make sure it is the right # length and made up of valid hash characters if re.match('[a-fA-F0-9]{40}$', sa['commit']): return sa['commit'] # again, default to setting this to release return 'release' def align(self, params): validated_params = self.validate_params(params) input_info = self.determine_input_info(validated_params) # input info provides information on the input and tells us if we should # run as a single_library or as a set: # input_info = {'run_mode': '', 'info': [..], 'ref': '55/1/2'} assembly_or_genome_ref = validated_params['assembly_or_genome_ref'] if input_info['run_mode'] == 'single_library': if 'output_alignment_name' not in validated_params: suffix = '_alignment' if 'output_alignment_suffix' in validated_params: suffix = validated_params['output_alignment_suffix'] validated_params[ 
'output_alignment_name'] = input_info['info'][1] + suffix single_lib_result = self.single_reads_lib_run( input_info, assembly_or_genome_ref, validated_params, create_report=validated_params['create_report']) return single_lib_result if input_info['run_mode'] == 'sample_set': reads = self.fetch_reads_refs_from_sampleset( input_info['ref'], input_info['info'], validated_params) self.build_bwa_index(assembly_or_genome_ref, validated_params['output_workspace']) print('Running on set of reads=') pprint(reads) tasks = [] for r in reads: tasks.append( self.build_single_execution_task( r['ref'], params, r['alignment_output_name'], r['condition'])) batch_run_params = { 'tasks': tasks, 'runner': 'parallel', 'max_retries': 2 } if validated_params['concurrent_local_tasks'] is not None: batch_run_params['concurrent_local_tasks'] = validated_params[ 'concurrent_local_tasks'] if validated_params['concurrent_njsw_tasks'] is not None: batch_run_params['concurrent_njsw_tasks'] = validated_params[ 'concurrent_njsw_tasks'] results = self.parallel_runner.run_batch(batch_run_params) print('Batch run results=') pprint(results) batch_result = self.process_batch_result(results, validated_params, reads, input_info['info']) return batch_result raise ('Improper run mode') def build_single_execution_task(self, reads_lib_ref, params, output_name, condition): task_params = copy.deepcopy(params) task_params['input_ref'] = reads_lib_ref task_params['output_alignment_name'] = output_name task_params['create_report'] = 0 task_params['condition_label'] = condition return { 'module_name': 'kb_Bwa', 'function_name': 'align_reads_to_assembly_app', 'version': self.my_version, 'parameters': task_params } def single_reads_lib_run(self, read_lib_info, assembly_or_genome_ref, validated_params, create_report=False, bwa_index_info=None): ''' run on one reads ''' # download reads and prepare any bwa index files input_configuration = self.prepare_single_run( read_lib_info, assembly_or_genome_ref, bwa_index_info, 
validated_params['output_workspace']) # run the actual program run_output_info = self.run_bwa_align_cli(input_configuration, validated_params) # process the result and save the output upload_results = self.save_read_alignment_output( run_output_info, input_configuration, validated_params) run_output_info['upload_results'] = upload_results report_info = None if create_report: report_info = self.create_report_for_single_run( run_output_info, input_configuration, validated_params) self.clean(run_output_info) return {'output_info': run_output_info, 'report_info': report_info} def build_bwa_index(self, assembly_or_genome_ref, ws_for_cache): bwaIndexBuilder = BwaIndexBuilder(self.scratch_dir, self.workspace_url, self.callback_url, self.srv_wiz_url, self.provenance) return bwaIndexBuilder.get_index({ 'ref': assembly_or_genome_ref, 'ws_for_cache': ws_for_cache }) def prepare_single_run(self, input_info, assembly_or_genome_ref, bwa_index_info, ws_for_cache): ''' Given a reads ref and an assembly, setup the bwa index ''' # first setup the bwa index of the assembly input_configuration = {'bwa_index_info': bwa_index_info} if not bwa_index_info: bwaIndexBuilder = BwaIndexBuilder(self.scratch_dir, self.workspace_url, self.callback_url, self.srv_wiz_url, self.provenance) index_result = bwaIndexBuilder.get_index({ 'ref': assembly_or_genome_ref, 'ws_for_cache': ws_for_cache }) input_configuration['bwa_index_info'] = index_result # next download the reads read_lib_ref = input_info['ref'] read_lib_info = input_info['info'] reads_params = { 'read_libraries': [read_lib_ref], 'interleaved': 'false', 'gzipped': None } ru = ReadsUtils(self.callback_url) reads = ru.download_reads(reads_params)['files'] input_configuration['reads_lib_type'] = self.get_type_from_obj_info( read_lib_info).split('.')[1] input_configuration['reads_files'] = reads[read_lib_ref] input_configuration['reads_lib_ref'] = read_lib_ref return input_configuration def run_bwa_align_cli(self, input_configuration, 
validated_params): # pprint('======== input_configuration =====') # pprint(input_configuration) options = [] run_output_info = {} # set the bwa index location bt2_index_dir = input_configuration['bwa_index_info']['output_dir'] bt2_index_basename = input_configuration['bwa_index_info'][ 'index_files_basename'] #options.extend(['-x', bt2_index_basename]) reference = os.path.join(bt2_index_dir, bt2_index_basename) options_r = [] options_l = [] options.append(reference) options_r.append(reference) options_l.append(reference) output_dir = os.path.join( self.scratch_dir, 'bwa_alignment_output_' + str(int(time.time() * 10000))) output_sam_file = os.path.join(output_dir, 'reads_alignment.sam') os.makedirs(output_dir) # set the input reads sam_parameter = '' if input_configuration['reads_lib_type'] == 'SingleEndLibrary': options.extend( ['-0', input_configuration['reads_files']['files']['fwd']]) run_output_info['library_type'] = 'single_end' output_sai_file = os.path.join(output_dir, bt2_index_basename) + ".sai" options.extend(["-f", output_sai_file]) self.bwa.run('aln', options, cwd=bt2_index_dir) sam_parameter = 'samse' options2 = [] options2.append(reference) options2.append(output_sai_file) options2.append(input_configuration['reads_files']['files']['fwd']) options2.extend(["-f", output_sam_file]) self.bwa.run(sam_parameter, options2, cwd=bt2_index_dir) elif input_configuration['reads_lib_type'] == 'PairedEndLibrary': options_l.extend( ['-1', input_configuration['reads_files']['files']['fwd']]) output_l_sai_file = os.path.join(output_dir, bt2_index_basename) + "_l.sai" options_l.extend(["-f", output_l_sai_file]) self.bwa.run('aln', options_l, cwd=bt2_index_dir) options_r.extend( ['-2', input_configuration['reads_files']['files']['rev']]) output_r_sai_file = os.path.join(output_dir, bt2_index_basename) + "_r.sai" options_r.extend(["-f", output_r_sai_file]) self.bwa.run('aln', options_r, cwd=bt2_index_dir) sam_parameter = 'sampe' options2 = [] options2.append(reference) 
options2.append(output_r_sai_file) options2.append(output_l_sai_file) options2.append(input_configuration['reads_files']['files']['rev']) options2.append(input_configuration['reads_files']['files']['fwd']) options2.extend(["-f", output_sam_file]) self.bwa.run(sam_parameter, options2, cwd=bt2_index_dir) run_output_info['library_type'] = 'paired_end' ''' align = bash('bwa aln -I -t 8 reference.fa reads.txt > out.sai') sam = bash('bwa samse reference.fa out.sai reads.txt > out.sam') ''' # setup the output file name # options.extend(['-S', output_sam_file]) run_output_info['output_sam_file'] = output_sam_file run_output_info['output_dir'] = output_dir return run_output_info def save_read_alignment_output(self, run_output_info, input_configuration, validated_params): rau = ReadsAlignmentUtils(self.callback_url) destination_ref = validated_params[ 'output_workspace'] + '/' + validated_params[ 'output_alignment_name'] condition = 'unknown' if 'condition_label' in validated_params: condition = validated_params['condition_label'] upload_params = { 'file_path': run_output_info['output_sam_file'], 'destination_ref': destination_ref, 'read_library_ref': input_configuration['reads_lib_ref'], 'assembly_or_genome_ref': validated_params['assembly_or_genome_ref'], 'condition': condition } upload_results = rau.upload_alignment(upload_params) return upload_results def clean(self, run_output_info): ''' Not really necessary on a single run, but if we are running multiple local subjobs, we should clean up files that have already been saved back up to kbase ''' pass def create_report_for_single_run(self, run_output_info, input_configuration, validated_params): # first run qualimap qualimap_report = self.qualimap.run_bamqc( {'input_ref': run_output_info['upload_results']['obj_ref']}) qc_result_zip_info = qualimap_report['qc_result_zip_info'] # create report report_text = 'Ran on a single reads library.\n\n' alignment_info = self.get_obj_info( run_output_info['upload_results']['obj_ref']) 
report_text = 'Created ReadsAlignment: ' + str( alignment_info[1]) + '\n' report_text = ' ' + run_output_info[ 'upload_results']['obj_ref'] + '\n' kbr = KBaseReport(self.callback_url) report_info = kbr.create_extended_report({ 'message': report_text, 'objects_created': [{ 'ref': run_output_info['upload_results']['obj_ref'], 'description': 'ReadsAlignment' }], 'report_object_name': 'kb_Bwa_' + str(uuid.uuid4()), 'direct_html_link_index': 0, 'html_links': [{ 'shock_id': qc_result_zip_info['shock_id'], 'name': qc_result_zip_info['index_html_file_name'], 'label': qc_result_zip_info['name'] }], 'workspace_name': validated_params['output_workspace'] }) return { 'report_name': report_info['name'], 'report_ref': report_info['ref'] } def process_batch_result(self, batch_result, validated_params, reads, input_set_info): n_jobs = len(batch_result['results']) n_success = 0 n_error = 0 ran_locally = 0 ran_njsw = 0 # reads alignment set items items = [] objects_created = [] for k in range(0, len(batch_result['results'])): job = batch_result['results'][k] result_package = job['result_package'] if job['is_error']: n_error += 1 else: n_success += 1 print(result_package['result']) print(result_package['result'][0]) print(result_package['result'][0]['output_info']) output_info = result_package['result'][0]['output_info'] ra_ref = output_info['upload_results']['obj_ref'] # Note: could add a label to the alignment here? 
items.append({'ref': ra_ref, 'label': reads[k]['condition']}) objects_created.append({'ref': ra_ref}) if result_package['run_context']['location'] == 'local': ran_locally += 1 if result_package['run_context']['location'] == 'njsw': ran_njsw += 1 # Save the alignment set alignment_set_data = {'description': '', 'items': items} alignment_set_save_params = { 'data': alignment_set_data, 'workspace': validated_params['output_workspace'], 'output_object_name': str(input_set_info[1]) + validated_params['output_obj_name_suffix'] } set_api = SetAPI(self.srv_wiz_url) save_result = set_api.save_reads_alignment_set_v1( alignment_set_save_params) print('Saved ReadsAlignment=') pprint(save_result) objects_created.append({ 'ref': save_result['set_ref'], 'description': 'Set of all reads alignments generated' }) set_name = save_result['set_info'][1] # run qualimap qualimap_report = self.qualimap.run_bamqc( {'input_ref': save_result['set_ref']}) qc_result_zip_info = qualimap_report['qc_result_zip_info'] # create the report report_text = 'Ran on SampleSet or ReadsSet.\n\n' report_text = 'Created ReadsAlignmentSet: ' + str(set_name) + '\n\n' report_text += 'Total ReadsLibraries = ' + str(n_jobs) + '\n' report_text += ' Successful runs = ' + str(n_success) + '\n' report_text += ' Failed runs = ' + str(n_error) + '\n' report_text += ' Ran on main node = ' + str(ran_locally) + '\n' report_text += ' Ran on remote worker = ' + str(ran_njsw) + '\n\n' print('Report text=') print(report_text) kbr = KBaseReport(self.callback_url) report_info = kbr.create_extended_report({ 'message': report_text, 'objects_created': objects_created, 'report_object_name': 'kb_Bwa_' + str(uuid.uuid4()), 'direct_html_link_index': 0, 'html_links': [{ 'shock_id': qc_result_zip_info['shock_id'], 'name': qc_result_zip_info['index_html_file_name'], 'label': qc_result_zip_info['name'] }], 'workspace_name': validated_params['output_workspace'] }) result = { 'report_info': { 'report_name': report_info['name'], 
'report_ref': report_info['ref'] } } result['batch_output_info'] = batch_result return result def validate_params(self, params): validated_params = {} required_string_fields = [ 'input_ref', 'assembly_or_genome_ref', 'output_obj_name_suffix', 'output_workspace' ] for field in required_string_fields: if field in params and params[field]: validated_params[field] = params[field] else: raise ValueError('"' + field + '" field required to run bwa aligner app') optional_fields = [ 'quality_score', 'alignment_type', 'preset_options', 'trim5', 'trim3', 'condition_label', 'np', 'minins', 'maxins', 'output_alignment_suffix', 'output_alignment_name' ] for field in optional_fields: if field in params: if params[field] is not None: validated_params[field] = params[field] validated_params['create_report'] = True if 'create_report' in params and params['create_report'] is not None: if int(params['create_report']) == 1: validated_params['create_report'] = True elif int(params['create_report']) == 0: validated_params['create_report'] = False else: raise ValueError( '"create_report" field, if present, should be set to a boolean value: 0 or 1' ) validated_params['concurrent_local_tasks'] = None validated_params['concurrent_njsw_tasks'] = None if 'concurrent_local_tasks' in params and params[ 'concurrent_local_tasks'] is not None: validated_params['concurrent_local_tasks'] = int( params['concurrent_local_tasks']) if 'concurrent_njsw_tasks' in params and params[ 'concurrent_njsw_tasks'] is not None: validated_params['concurrent_njsw_tasks'] = int( params['concurrent_njsw_tasks']) return validated_params def fetch_reads_refs_from_sampleset(self, ref, info, validated_params): """ Note: adapted from kbaseapps/kb_hisat2 - file_util.py From the given object ref, return a list of all reads objects that are a part of that object. E.g., if ref is a ReadsSet, return a list of all PairedEndLibrary or SingleEndLibrary refs that are a member of that ReadsSet. 
This is returned as a list of dictionaries as follows: { "ref": reads object reference, "condition": condition string associated with that reads object } The only one required is "ref", all other keys may or may not be present, based on the reads object or object type in initial ref variable. E.g. a RNASeqSampleSet might have condition info for each reads object, but a single PairedEndLibrary may not have that info. If ref is already a Reads library, just returns a list with ref as a single element. """ obj_type = self.get_type_from_obj_info(info) refs = list() refs_for_ws_info = list() if "KBaseSets.ReadsSet" in obj_type or "KBaseRNASeq.RNASeqSampleSet" in obj_type: print("Looking up reads references in ReadsSet object") set_api = SetAPI(self.srv_wiz_url) reads_set = set_api.get_reads_set_v1({ 'ref': ref, 'include_item_info': 0, 'include_set_item_ref_paths': 1 }) for reads in reads_set["data"]["items"]: refs.append({ 'ref': reads['ref_path'], 'condition': reads['label'] }) refs_for_ws_info.append({'ref': reads['ref_path']}) else: raise ValueError("Unable to fetch reads reference from object {} " "which is a {}".format(ref, obj_type)) # get object info so we can name things properly infos = self.ws.get_object_info3({'objects': refs_for_ws_info})['infos'] name_ext = '_alignment' if 'output_alignment_suffix' in validated_params \ and validated_params['output_alignment_suffix'] is not None: ext = validated_params['output_alignment_suffix'].replace(' ', '') if ext: name_ext = ext unique_name_lookup = {} for k in range(0, len(refs)): refs[k]['info'] = infos[k] name = infos[k][1] if name not in unique_name_lookup: unique_name_lookup[name] = 1 else: unique_name_lookup[name] += 1 name = name + '_' + str(unique_name_lookup[name]) name = name + name_ext refs[k]['alignment_output_name'] = name return refs def determine_input_info(self, validated_params): ''' get info on the input_ref object and determine if we run once or run on a set ''' info = 
self.get_obj_info(validated_params['input_ref']) obj_type = self.get_type_from_obj_info(info) if obj_type in [ 'KBaseAssembly.PairedEndLibrary', 'KBaseAssembly.SingleEndLibrary', 'KBaseFile.PairedEndLibrary', 'KBaseFile.SingleEndLibrary' ]: return { 'run_mode': 'single_library', 'info': info, 'ref': validated_params['input_ref'] } if obj_type == 'KBaseRNASeq.RNASeqSampleSet': return { 'run_mode': 'sample_set', 'info': info, 'ref': validated_params['input_ref'] } if obj_type == 'KBaseSets.ReadsSet': return { 'run_mode': 'sample_set', 'info': info, 'ref': validated_params['input_ref'] } raise ValueError('Object type of input_ref is not valid, was: ' + str(obj_type)) def get_type_from_obj_info(self, info): return info[2].split('-')[0] def get_obj_info(self, ref): return self.ws.get_object_info3({'objects': [{ 'ref': ref }]})['infos'][0]
class GFFUtils2:
    """Annotate GWAS association results with gene information.

    Builds a coordinate-sorted, bgzipped GFF of GENE features from a genome's
    search index, tabix-indexes it, and uses it to attach GENEID/NEIGHBORGENE/
    FUNCTION columns to GWAS variants. Can also save the resulting gene list
    as a KBaseCollections.FeatureSet.
    """

    def __init__(self, config):
        self.callback_url = config['callback_url']
        self.shared_folder = config['scratch']
        self.ws_url = config['workspace-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.gsu = GenomeSearchUtil(self.callback_url)
        self.wsc = Workspace(self.ws_url)

    def _prep_gff(self, gff_file):
        """Sort a GFF (headers first, then by contig and start), bgzip it,
        and return the path to the resulting .gz file in self.genome_dir."""
        outfile = os.path.join(self.genome_dir, 'out.gff')
        # keep '#' header lines first, coordinate-sort the feature lines
        sortcmd = f'(grep ^"#" {gff_file}; grep -v ^"#" {gff_file} | sort -k1,1 -k4,4n)'
        with open(outfile, 'w') as o:
            p = subprocess.Popen(sortcmd, shell=True, stdout=o)
            out, err = p.communicate()
        # bgzip (not plain gzip) so tabix can index the result
        bgzip = subprocess.Popen(['bgzip', 'out.gff'], cwd=self.genome_dir)
        out2, err2 = bgzip.communicate()
        outfile += '.gz'
        return outfile

    def _construct_gff_from_json(self, json, gff_file_path, contig_base_lengths):
        """Write a minimal GFF of GENE features from genome-search JSON.

        Column 8 (normally 'phase') is repurposed to hold the feature's
        genome-wide position (cumulative contig base offset + start).

        :param json: list of feature dicts from GenomeSearchUtil.search
        :param gff_file_path: output path for the constructed GFF
        :param contig_base_lengths: dict of contig id -> cumulative base offset
        :raises KeyError: if a contig id cannot be matched even after trying
                          capitalized / 'Chr' / 'Chr0' prefixed variants
        :returns: gff_file_path on success
        """
        with open(gff_file_path, 'w') as f:
            for feature in json:
                if feature['feature_type'].strip().upper() == 'GENE':
                    end = int(feature['location'][0]['start']) + int(feature['location'][0]['length'])

                    metainfo = "ID=" + feature['feature_id']
                    if feature['function']:
                        metainfo += ';FUNCTION=' + feature['function']

                    contig_id = str(feature['location'][0]['contig_id'])
                    start = int(feature['location'][0]['start'])

                    # TODO: Fix Plink reassignment of Chr prefixes
                    # Try the contig id as-is, then common prefix variants.
                    try:
                        global_pos = int(contig_base_lengths[contig_id]) + start
                    except KeyError:
                        try:
                            global_pos = int(contig_base_lengths[contig_id.capitalize()]) + start
                        except KeyError:
                            try:
                                global_pos = int(contig_base_lengths['Chr' + str(contig_id)]) + start
                            except KeyError:
                                try:
                                    global_pos = int(contig_base_lengths['Chr0' + str(contig_id)]) + start
                                except KeyError as e:
                                    # BUGFIX: 'e' was previously unbound here (no
                                    # 'as e' anywhere), so this raise crashed with
                                    # NameError instead of the intended KeyError.
                                    pp(contig_base_lengths)
                                    pp(contig_id)
                                    raise KeyError(e)

                    """
                    Remove ontology for now
                        if feature['ontology_terms']:
                            metainfo += ';ONTOLOGY('
                            for k, v in feature['ontology_terms'].items():
                                metainfo += str(k) + ',' + str(v) + ':'
                            metainfo = metainfo[:-1]  # remove trailing ;
                            metainfo += ')'
                    """

                    constructed_gff_line = str(feature['location'][0]['contig_id']) + '\t' + \
                        'KBase\tgene\t' + \
                        str(feature['location'][0]['start']) + '\t' + \
                        str(end) + '\t.\t' + \
                        str(feature['location'][0]['strand']) + '\t' + \
                        str(global_pos) + '\t' + \
                        str(metainfo) + '\n'
                    f.write(constructed_gff_line)

        if os.path.exists(gff_file_path):
            return gff_file_path
        else:
            raise FileNotFoundError('Unable to create GFF file from genome JSON.')

    def _process_tabix_results(self, queryresult):
        """Parse a tabix GFF row's attribute column (index 8) into
        [GENEID, NEIGHBORGENE, FUNCTION] (missing fields become 'NA')."""
        queryinfo = queryresult[8].split(';')
        if len(queryinfo) >= 2:
            # queryinfo[0] is 'ID=...', queryinfo[1] is 'FUNCTION=...'
            extension = [clean_tsv_data(queryinfo[0][3:]), "NA", clean_tsv_data(queryinfo[1][9:])]
        # BUGFIX: was 'len(queryinfo) is 1' — identity comparison with an int
        # literal is implementation-dependent; use equality.
        elif len(queryinfo) == 1:
            extension = [clean_tsv_data(queryinfo[0][3:]), "NA", "NA"]
        else:
            extension = ['NA', 'NA', 'NA']
        return extension

    def find_gene_info(self, row):
        """Look up the gene overlapping a GWAS variant row (CHR/POS).

        Tries the chromosome id as-is, then 'chr'/'chr0' prefixed; if no gene
        overlaps the position, searches +/-500 bp for a neighboring gene.

        :param row: pandas row with 'CHR' and 'POS'
        :returns: pandas Series with GENEID, NEIGHBORGENE, FUNCTION
        """
        tb = tabix_query(self.sorted_gff, row["CHR"], int(row["POS"]), int(row["POS"]))
        tbresult = next(tb, None)
        if tbresult is None:
            tb2 = tabix_query(self.sorted_gff, 'chr' + row["CHR"], int(row["POS"]), int(row["POS"]))
            tbresult2 = next(tb2, None)
            if tbresult2 is None:
                tb3 = tabix_query(self.sorted_gff, 'chr0' + row["CHR"], int(row["POS"]), int(row["POS"]))
                tbresult3 = next(tb3, None)
                if tbresult3 is None:
                    # no direct hit: scan a 1 kb window around the position
                    if int(row["POS"]) < 500:
                        nstart = 0
                    else:
                        nstart = int(row["POS"]) - 500
                    neigh_tb = tabix_query(self.sorted_gff, row["CHR"], nstart, int(row["POS"]) + 500)
                    neigh_result = next(neigh_tb, None)
                    if neigh_result is None:
                        return pd.Series(['NA', 'NA', 'NA'],
                                         index=['GENEID', 'NEIGHBORGENE', 'FUNCTION'])
                    else:
                        # neighbor hit: report the gene as NEIGHBORGENE, not GENEID
                        nq = self._process_tabix_results(neigh_result)
                        return pd.Series([nq[1], nq[0], nq[2]],
                                         index=['GENEID', 'NEIGHBORGENE', 'FUNCTION'])
                else:
                    q3 = self._process_tabix_results(tbresult3)
                    return pd.Series(q3, index=['GENEID', 'NEIGHBORGENE', 'FUNCTION'])
            else:
                q2 = self._process_tabix_results(tbresult2)
                return pd.Series(q2, index=['GENEID', 'NEIGHBORGENE', 'FUNCTION'])
        else:
            q = self._process_tabix_results(tbresult)
            return pd.Series(q, index=['GENEID', 'NEIGHBORGENE', 'FUNCTION'])

    def get_gwas_result_file(self, association_ref, association_name, p_value):
        """Dump association results above a significance cutoff to a TSV.

        Variants with p > p_value are skipped. Writes columns CHR, SNP, POS, P,
        BP (BP duplicates POS) to <genome_dir>/<association_name>.

        :returns: path of the written file
        """
        # association_obj = self.dfu.get_objects({'object_refs': [association_ref]})['data'][0]['data']['data']
        association_obj = self.dfu.get_objects({'object_refs': [association_ref]})['data'][0]
        association_results = \
            association_obj['data']["association_details"][0]["association_results"]
        result = "CHR\tSNP\tPOS\tP\tBP\n"
        for variation in association_results:
            if (float(variation[3]) > float(p_value)):
                continue
            result += str(variation[0]) + "\t"
            result += str(variation[1]) + "\t"
            result += str(variation[2]) + "\t"
            result += str(variation[3]) + "\t"
            result += str(variation[2]) + "\n"
        filepath = os.path.join(self.genome_dir, association_name)
        with open(filepath, "w") as file1:
            file1.write(result)
        return filepath

    def build_featureset(self, filepath, genome_ref, description, workspace_name,
                         association_name, prefix):
        """Save the annotated GWAS genes as a KBaseCollections.FeatureSet.

        Reads GENEID (col 5) and NEIGHBORGENE (col 6) from the annotated TSV,
        skipping header tokens and 'NA', preserving first-seen order.

        :returns: workspace reference 'wsid/objid/version' of the saved FeatureSet
        """
        gene_ids = dict()
        element_ordering = list()
        elements = dict()
        skip_words = ["GENEID", "NEIGHBORGENE", "NA"]
        with open(filepath, 'r') as reader:
            for line in reader:
                fields = line.split("\t")
                condition1 = fields[5] not in skip_words
                condition2 = fields[5] not in elements
                condition3 = fields[6] not in skip_words
                condition4 = fields[6] not in elements
                if condition1 and condition2:
                    element_ordering.append(fields[5])
                    elements[fields[5]] = [genome_ref]
                if condition3 and condition4:
                    element_ordering.append(fields[6])
                    elements[fields[6]] = [genome_ref]
        featureset = dict()
        featureset['description'] = description
        featureset['element_ordering'] = element_ordering
        featureset['elements'] = elements
        ws_id = self.dfu.ws_name_to_id(workspace_name)
        featureset_obj_name = prefix + str(association_name)

        save_info = self.dfu.save_objects({'id': ws_id,
                                           'objects': [{'type': 'KBaseCollections.FeatureSet',
                                                        'data': featureset,
                                                        'name': featureset_obj_name}]})[0]
        obj_ref = "{0}/{1}/{2}".format(save_info[6], save_info[0], save_info[4])
        return obj_ref

    def annotate_GWAS_results(self, genome_ref, association_ref, workspace_name,
                              prefix, p_value):
        """End-to-end: annotate an association object's variants with gene info
        and save the gene list as a FeatureSet.

        :returns: workspace reference of the saved FeatureSet
        """
        # TODO: Pass the constructed GFF path into _prep_gff instead of hardcoding it
        # TODO: Remove hard-coded paths and create a new directory for each test function
        self.genome_dir_name = "_".join(genome_ref.split("/"))
        self.genome_dir = os.path.join(self.shared_folder, self.genome_dir_name)
        if not os.path.isdir(self.genome_dir):
            os.mkdir(self.genome_dir)
        sorted_gff_path = os.path.join(self.genome_dir, 'out.gff.gz')
        self.sorted_gff = sorted_gff_path

        # build (and tabix-index) the sorted GFF only once per genome dir
        if not os.path.exists(sorted_gff_path):
            feature_num = self.gsu.search({'ref': genome_ref})['num_found']
            # get genome features for gff construction
            genome_features = self.gsu.search({
                'ref': genome_ref,
                'limit': feature_num,
                # 'sort_by': [['feature_id', True]]
            })['features']
            assembly_ref = self.wsc.get_object_subset([{
                'included': ['/assembly_ref'],
                'ref': genome_ref}])[0]['data']['assembly_ref']
            # get assembly contigs for base length calculations
            assembly_contigs = self.wsc.get_object_subset([{
                'included': ['/contigs'],
                'ref': assembly_ref}])[0]['data']['contigs']
            # cumulative offset of each contig within the whole genome
            contig_ids = list(assembly_contigs.keys())
            contig_ids.sort()
            contig_base_lengths = {}
            prev_length = 0
            for contig in contig_ids:
                contig_base_lengths[contig] = prev_length
                prev_length += assembly_contigs[contig]['length']
            gff_file = os.path.join(self.genome_dir, 'constructed.gff')
            constructed_gff = self._construct_gff_from_json(genome_features, gff_file,
                                                            contig_base_lengths)
            self.sorted_gff = self._prep_gff(constructed_gff)
            tabix_index(self.sorted_gff)

        obj_info = self.wsc.get_object_info3({"objects": [{"ref": association_ref}]})
        association_name = obj_info["infos"][0][1]
        gwas_results_file = self.get_gwas_result_file(association_ref, association_name,
                                                      p_value)
        gwas_results = pd.read_csv(gwas_results_file, sep='\t')
        gwas_results[['GENEID', 'NEIGHBORGENE', 'FUNCTION']] = \
            gwas_results.apply(self.find_gene_info, axis=1)

        new_results_path = os.path.abspath(os.path.join(gwas_results_file, '..'))
        fname = 'final_' + association_name
        new_results_path = os.path.join(new_results_path, fname)
        gwas_results.to_csv(path_or_buf=new_results_path, sep='\t', index=False)

        description = "Genelist for GWAS results of trait " + association_name
        featureset_obj = self.build_featureset(new_results_path, genome_ref, description,
                                               workspace_name, association_name, prefix)
        return featureset_obj
class FeatureSetDownload:
    """Export a KBaseCollections.FeatureSet as a TSV file.

    Resolves each feature against its genome via GenomeSearchUtil and writes
    one row per feature (id, aliases, genome, type, function); can package
    the result for download via DataFileUtil.
    """

    def __init__(self, config):
        self.cfg = config
        self.scratch = config['scratch']
        self.gsu = GenomeSearchUtil(os.environ['SDK_CALLBACK_URL'])
        self.dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        self.ws = Workspace(config["workspace-url"])

    @staticmethod
    def validate_params(params, expected=("workspace_name", "featureset_name")):
        """Raise ValueError naming any expected keys missing from params.

        BUGFIX: the default was a mutable set literal; a tuple default is the
        safe idiom and behaves identically (it is converted to a set below).
        """
        expected = set(expected)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError("Required keys {} not in supplied parameters"
                             .format(", ".join(expected - pkeys)))

    def to_tsv(self, params):
        """Write the FeatureSet referenced by params['featureset_ref'] to a TSV.

        :returns: (featureset name, {'file_path': path of the written TSV})
        """
        working_dir = os.path.join(self.scratch,
                                   'featureset-download-' + str(uuid.uuid4()))
        os.makedirs(working_dir)
        header = ['Feature Id', 'Aliases', 'Genome', 'Type', 'Function']
        fs_name, fs_dicts = self.make_featureset_dict(params['featureset_ref'])
        files = {'file_path': "{}/{}.tsv".format(working_dir, fs_name)}
        # BUGFIX: the output file was previously opened without ever being
        # closed; use a context manager so the handle is released and flushed.
        with open(files['file_path'], 'w') as outfile:
            writer = csv.DictWriter(outfile, header, delimiter='\t',
                                    lineterminator='\n')
            writer.writeheader()
            for feat in fs_dicts:
                writer.writerow(feat)
        return fs_name, files

    def make_featureset_dict(self, fs_ref):
        """Fetch a FeatureSet and resolve its features per genome.

        :param fs_ref: workspace reference of the FeatureSet
        :returns: (featureset name, list of per-feature row dicts keyed by the
                   TSV header names)
        """
        features = []
        ret = self.dfu.get_objects({'object_refs': [fs_ref]})['data'][0]
        feat_set = ret['data']
        fs_name = ret['info'][1]

        # group feature ids by the genome they belong to (elements maps
        # feature_id -> [genome_ref, ...]; only the first ref is used)
        feat_by_genome = defaultdict(list)
        for k, v in feat_set['elements'].items():
            feat_by_genome[v[0]].append(k)

        for genome, fids in feat_by_genome.items():
            genome_name = self.ws.get_object_info3(
                {'objects': [{'ref': genome}]})['infos'][0][1]
            res = self.gsu.search({'ref': genome,
                                   'structured_query': {'feature_id': fids},
                                   'sort_by': [['contig_id', 1]],
                                   'start': 0,
                                   'limit': len(fids)})

            for feat in res['features']:
                features.append({'Feature Id': feat['feature_id'],
                                 'Aliases': ", ".join(sorted(feat['aliases'].keys())),
                                 'Genome': "{} ({})".format(genome_name, genome),
                                 'Type': feat['feature_type'],
                                 'Function': feat['function']})
        return fs_name, features

    def export(self, files, name, params):
        """Package the given files plus the FeatureSet object for download.

        :param files: iterable of file paths to include in the package
        :returns: {'shock_id': id of the packaged download blob}
        """
        export_package_dir = os.path.join(self.scratch, name + str(uuid.uuid4()))
        os.makedirs(export_package_dir)
        # NOTE(review): if a caller passes the dict returned by to_tsv, this
        # loop iterates its *keys*, not the paths — confirm callers pass a
        # list of paths (or files.values()).
        for file in files:
            shutil.move(file, os.path.join(export_package_dir,
                                           os.path.basename(file)))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [params['featureset_ref']]
        })

        return {'shock_id': package_details['shock_id']}