def get_assembly(self, target_dir, assembly_upa):
    """
    Download an assembly as a FASTA file under ``target_dir``.

    The file name is the UPA with every '/' replaced by '_'.

    :param target_dir: directory the FASTA file path is built under
    :param assembly_upa: workspace reference (UPA) of the assembly
    :returns: the file path that was passed to AssemblyUtil as 'filename'
    :raises AssemblyUtilError: re-raised unchanged after its text is printed
    """
    client = AssemblyUtil(self.callbackURL)
    flattened_upa = '_'.join(assembly_upa.split('/'))
    fasta_path = os.path.join(target_dir, flattened_upa)
    request = {'ref': assembly_upa, 'filename': fasta_path}
    try:
        client.get_assembly_as_fasta(request)
    except AssemblyUtilError as err:
        # Surface the service error in the job log, then propagate it.
        print(str(err))
        raise
    return fasta_path
def get_fasta_file(self, genome_ref):
    """
    Fetch the FASTA file for the Assembly/ContigSet referenced by a genome.

    :param genome_ref: workspace reference of the genome object
    :returns: the 'path' entry of the AssemblyUtil.get_assembly_as_fasta result
    :raises ValueError: if the genome references no
        KBaseGenomeAnnotations.Assembly or KBaseGenomes.ContigSet object
    """
    ws = Workspace(self.ws_url)

    # Ask for the object without its data; only the list of refs is needed.
    genome_obj_info = ws.get_objects2({
        'objects': [{'ref': genome_ref}],
        'no_data': 1
    })

    # If there are no refs (or something odd in the return) this is an empty
    # list.  Indexing [0] still fails when 'data' itself is an empty list, but
    # get_objects2 is expected to have failed already in that case.
    genome_obj_refs = genome_obj_info.get('data', [{}])[0].get('refs', [])

    # Keep only the refs of an appropriate type (ContigSet or Assembly).
    assembly_ref = []
    if genome_obj_refs:
        ref_params = [{'ref': x} for x in genome_obj_refs]
        ref_info = ws.get_object_info3({'objects': ref_params})
        for idx, info in enumerate(ref_info.get('infos')):
            obj_type = info[2]
            if ("KBaseGenomeAnnotations.Assembly" in obj_type
                    or "KBaseGenomes.ContigSet" in obj_type):
                # Join the reference path so the object is addressed
                # through the genome that references it.
                assembly_ref.append(";".join(ref_info.get('paths')[idx]))

    if not assembly_ref:
        # Previously this surfaced as a bare IndexError on assembly_ref[0].
        raise ValueError(
            'No Assembly or ContigSet found as a reference of genome {}'.format(
                genome_ref))

    # Now just get the file.
    au = AssemblyUtil(self.callback_url)
    fasta_file = au.get_assembly_as_fasta({'ref': assembly_ref[0]})
    return fasta_file["path"]
def _build_index(self, assembly_info, validated_params):
    """
    Build a bowtie2 index for an assembly and try to cache the result.

    :param assembly_info: dict; its 'ref' entry is the assembly reference
    :param validated_params: dict; 'output_dir' and 'ws_for_cache' are read
        here, and the whole dict is passed on to _build_cli_params
    :returns: dict with 'output_dir', 'index_files_basename' and
        'pushed_to_cache' (1 if caching succeeded, else 0)
    :raises ValueError: if the output directory already exists
    """
    # get the assembly as a fasta file using AssemblyUtil
    au = AssemblyUtil(self.callback_url)
    fasta_info = au.get_assembly_as_fasta({'ref': assembly_info['ref']})

    # make the target destination folder (check again it wasn't created yet)
    output_dir = validated_params['output_dir']
    if os.path.exists(output_dir):
        # The original raised a plain string, which is itself a TypeError and
        # hid this message from the caller.
        raise ValueError('Output directory name specified (' + output_dir +
                         ') already exists. Will not overwrite, so aborting.')
    os.makedirs(output_dir)

    # configure the command line args and run it
    cli_params = self._build_cli_params(fasta_info['path'],
                                        fasta_info['assembly_name'],
                                        validated_params)
    self.bowtie2.run('bowtie2-build', cli_params)
    index_info = {'output_dir': output_dir,
                  'index_files_basename': fasta_info['assembly_name']}

    # cache the result, mark if it worked or not
    cache_success = self._put_cached_index(assembly_info,
                                           fasta_info['assembly_name'],
                                           output_dir,
                                           validated_params['ws_for_cache'])
    index_info['pushed_to_cache'] = 1 if cache_success else 0

    return index_info
def _stage_assembly_files(self, object_list):
    """
    _stage_assembly_files: download the fasta files to the scratch area,
    upper-casing every sequence line.

    :param object_list: iterable of assembly references (UPAs)
    :returns: list of staged file names (relative to the working directory
        the shell command runs in)
    :raises AssemblyUtilError: re-raised after printing the error text
    """
    # Local import keeps this block self-contained; pipes.quote is the
    # Python 2 spelling of shlex.quote.
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote

    log('Processing assembly object list: {}'.format(object_list))

    # Sourmash uses the sequence filename as the default label for the
    # signatures, and this includes the complete file path.  Keeping the
    # sequence file name as close to the desired label as possible is the
    # reason not to place each file under a 'fasta' directory or include the
    # '.fa' file extension.
    auc = AssemblyUtil(self.callbackURL)
    staged_file_list = []
    suffix = '.fa'
    for assembly_upa in object_list:
        try:
            file_ = auc.get_assembly_as_fasta({'ref': assembly_upa})['path']
        except AssemblyUtilError as assembly_error:
            print(str(assembly_error))
            raise

        # Strip only a trailing '.fa'; str.replace would also mangle names
        # that merely contain '.fa' (e.g. 'x.fasta' -> 'xsta').
        filename = os.path.basename(file_)
        if filename.endswith(suffix):
            filename = filename[:-len(suffix)]

        # Header lines (containing '>') are copied as-is, all other lines are
        # upper-cased.  Paths are shell-quoted; for ordinary path characters
        # quote() returns its input unchanged.
        to_upper_command = (
            "awk '{ if ($0 !~ />/) {print toupper($0)} else {print $0} }' "
            + quote(file_) + ' > tmp.fa ' + '&& mv tmp.fa ' + quote(filename))
        self._run_command(to_upper_command)
        staged_file_list.append(filename)

    log('Created file list: {}'.format(staged_file_list))
    return staged_file_list
def run_mash_sketch(self, ctx, params):
    """
    Generate a sketch file from a fasta/fastq file.

    :param params: instance of type "MashSketchParams". Pass in **one of**
       input_path, assembly_ref, or reads_ref:
         * input_path - string - local file path to an input fasta/fastq
         * assembly_ref - string - workspace reference to an Assembly type
         * reads_ref - string - workspace reference to a Reads type
       Optionally:
         * paired_ends - boolean - whether you are passing in paired ends
       When several are present, reads_ref takes precedence over
       assembly_ref, which takes precedence over input_path.
    :returns: instance of type "MashSketchResults" -> structure: parameter
       "sketch_path" of String (local scratch path of the generated sketch
       file; per the service spec it has the extension '.msh')
    """
    # ctx is the context object
    # return variables are: results
    #BEGIN run_mash_sketch
    source_keys = ('reads_ref', 'assembly_ref', 'input_path')
    if not any(key in params for key in source_keys):
        raise ValueError(
            'Invalid params; must provide one of `reads_ref`, `assembly_ref`, or `input_path`.'
        )

    if 'reads_ref' in params:
        reads_ref = params['reads_ref']
        reads_client = ReadsUtils(self.callbackURL)
        download = reads_client.download_reads({
            'read_libraries': [reads_ref],
            'interleaved': 'true'
        })
        input_path = download['files'][reads_ref]['files']['fwd']
    elif 'assembly_ref' in params:
        assembly_client = AssemblyUtil(self.callbackURL)
        fasta = assembly_client.get_assembly_as_fasta(
            {'ref': params['assembly_ref']})
        input_path = fasta['path']
    else:
        input_path = params['input_path']

    sketcher = MashUtils(self.config, self.auth_token)
    sketch_path = sketcher.mash_sketch(
        input_path, paired_ends=params.get('paired_ends'))
    results = {'sketch_path': sketch_path}
    #END run_mash_sketch

    # At some point might do deeper type checking...
    if not isinstance(results, dict):
        raise ValueError('Method run_mash_sketch return value ' +
                         'results is not type dict as required.')
    # return the results
    return [results]
def fetch_fasta_from_assembly(assembly_ref, ws_url, callback_url):
    """
    Build a FASTA file from an assembly or contigset via AssemblyUtil.

    :param assembly_ref: workspace reference of the Assembly/ContigSet
    :param ws_url: workspace URL, passed to check_ref_type
    :param callback_url: callback server URL for AssemblyUtil
    :returns: the result of AssemblyUtil.get_assembly_as_fasta, unchanged
        (not a bare path string; presumably a dict with a 'path' entry,
        confirm against AssemblyUtil)
    :raises ValueError: if check_ref_type rejects the reference
    """
    fasta_capable_types = ['KBaseFile.Assembly',
                           'KBaseGenomeAnnotations.Assembly',
                           'KBaseGenomes.ContigSet']
    if check_ref_type(assembly_ref, fasta_capable_types, ws_url):
        client = AssemblyUtil(callback_url)
        return client.get_assembly_as_fasta({'ref': assembly_ref})
    raise ValueError("The reference {} cannot be used to fetch a FASTA file".format(
        assembly_ref))
def get_fasta_from_genome(logger, ws_client, urls, genome_id):
    """
    Generate a FASTA file for the contig set referenced by a genome.

    :param logger: logger used for progress messages
    :param ws_client: workspace client providing get_object_subset
    :param urls: dict; its 'callback_url' entry is given to AssemblyUtil
    :param genome_id: workspace reference of the genome
    :returns: the base name (not the full path) of the generated FASTA file
    :raises Exception: if the FASTA file cannot be created; the message names
        the genome and includes the underlying error text
    """
    ref = ws_client.get_object_subset(
        [{'ref': genome_id, 'included': ['contigset_ref']}])
    contig_id = ref[0]['data']['contigset_ref']
    logger.info("Generating FASTA from Genome")
    try:
        # get the FASTA
        assembly = AssemblyUtil(urls['callback_url'])
        ret = assembly.get_assembly_as_fasta({'ref': contig_id})
        return os.path.basename(ret['path'])
    except Exception as e:
        # `except Exception, e` was Python-2-only syntax, and the descriptive
        # raise after the try block could never be reached; fold its message
        # into the exception that is actually raised.
        raise Exception(
            "Unable to Create FASTA file from Genome : {0} ({1})".format(
                genome_id, e))
def test_genbank_to_genome(self, download_staging_file, update_staging_service):
    """
    Import a FASTA file from staging as an Assembly and verify the result.

    Checks that the import returns object/report references, that the FASTA
    downloaded back through AssemblyUtil matches the source file exactly, and
    that the saved object's dna_size and base_counts have the expected values.

    The two extra parameters are accepted but not used in the body
    (presumably injected by mock.patch decorators outside this block;
    confirm).
    """
    fasta_file = 'small_fasta.fna'
    ws_obj_name = 'MyAssembly'

    params = {
        'staging_file_subdir_path': fasta_file,
        'workspace_name': self.getWsName(),
        'assembly_name': ws_obj_name
    }

    ref = self.getImpl().import_fasta_as_assembly_from_staging(
        self.getContext(), params)

    for key in ('obj_ref', 'report_ref', 'report_name'):
        self.assertIn(key, ref[0])

    fasta_file_path = os.path.join('/kb/module/work/tmp', fasta_file)
    assemblyUtil = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
    fasta_assembly = assemblyUtil.get_assembly_as_fasta(
        {'ref': self.getWsName() + "/{}".format(ws_obj_name)})

    with open(fasta_file_path, 'r') as f:
        expected_data = f.read()
    with open(fasta_assembly['path'], 'r') as f:
        actual_data = f.read()
    self.assertEqual(actual_data, expected_data)

    get_objects_params = {
        'object_refs': [ref[0].get('obj_ref')],
        'ignore_errors': False
    }
    object_data = self.dfu.get_objects(get_objects_params)
    assembly_data = object_data.get('data')[0].get('data')
    base_count = assembly_data.get('base_counts')
    dna_size = assembly_data.get('dna_size')

    self.assertEqual(dna_size, 2520)

    expected_base_count = {'A': 700, 'C': 558, 'T': 671, 'G': 591}
    # The original asserted "is a subset" in both directions using the
    # deprecated assertDictContainsSubset; that is exactly dict equality.
    self.assertEqual(base_count, expected_base_count)
def ScanGenomeForMotifs(self, ctx, params):
    """
    Download the assembly of a genome as FASTA in preparation for a motif scan.

    :param params: instance of type "ScanGenomeIn" -> structure: parameter
       "genome_ref" of String, parameter "ws_name" of String, parameter
       "motifset_ref" of String
    :returns: instance of type "ScanGenomeOut" -> structure: (currently an
       empty dict; the scan itself is not implemented yet)
    """
    # ctx is the context object
    # return variables are: out
    #BEGIN ScanGenomeForMotifs
    # NOTE(review): the workspace URL is hard-coded to the appdev
    # environment; it should probably come from the service configuration.
    ws = Workspace('https://appdev.kbase.us/services/ws')
    # The documented parameter is 'ws_name'; the original read
    # 'workspace_name' here and 'ws_name' below, so one of the two lookups
    # always raised KeyError.
    ws_name = params['ws_name']
    subset = ws.get_object_subset([{
        'included': ['/assembly_ref'],
        'ref': params['genome_ref']
    }])
    aref = subset[0]['data']['assembly_ref']
    assembly_ref = {'ref': aref}

    print('Downloading Assembly data as a Fasta file.')
    assemblyUtil = AssemblyUtil(self.callback_url)
    fasta_file = assemblyUtil.get_assembly_as_fasta(assembly_ref)

    # Parameters for the (not yet implemented) scan step.
    scanFastaParams = {
        'fasta_path': fasta_file['path'],
        'motifset_ref': params['motifset_ref'],
        'ws_name': ws_name
    }
    # TODO: build the mast command from scanFastaParams['fasta_path'].
    # The FASTA itself is too big to be used for building the report.

    # 'out' was never assigned, so the type check below raised NameError.
    # ScanGenomeOut is documented as an empty structure.
    out = {}
    #END ScanGenomeForMotifs

    # At some point might do deeper type checking...
    if not isinstance(out, dict):
        raise ValueError('Method ScanGenomeForMotifs return value ' +
                         'out is not type dict as required.')
    # return the results
    return [out]
def stage_assembly_files(self, object_list):
    """
    stage_assembly_files: download each assembly as a fasta file.

    :param object_list: iterable of assembly references (UPAs)
    :returns: list with the 'path' entry of every get_assembly_as_fasta
        result, in input order
    :raises AssemblyUtilError: re-raised after printing the error text
    """
    log('Processing assembly object list: {}'.format(object_list))

    client = AssemblyUtil(self.callbackURL)
    try:
        fasta_paths = [
            client.get_assembly_as_fasta({'ref': upa})['path']
            for upa in object_list
        ]
    except AssemblyUtilError as err:
        # Report the first failing download, then propagate.
        print(str(err))
        raise

    log('Created file list: {}'.format(fasta_paths))
    return fasta_paths
def download_fasta(refs, cb_url):
    """
    Download a FASTA file for every given Genome or Assembly reference.

    Args:
      refs - list of workspace references in the form
             'workspace_id/object_id/obj_version'
      cb_url - callback server URL
    Returns a list with the path of each downloaded fasta file, in the
    order of ``refs``.
    Raises TypeError for an object that is neither a KBaseGenomes.Genome nor
    a KBaseGenomeAnnotations.Assembly.
    """
    data_client = DataFileUtil(cb_url)
    fasta_client = AssemblyUtil(cb_url)
    fetched = data_client.get_objects({'object_refs': refs})

    def _assembly_ref_for(obj, ref):
        # Genomes are resolved to their assembly; assemblies are used as-is.
        type_name = obj['info'][2]
        if 'KBaseGenomes.Genome' in type_name:
            return get_assembly_ref_from_genome(ref, obj)
        if 'KBaseGenomeAnnotations.Assembly' in type_name:
            return ref
        raise TypeError('Invalid type ' + type_name + '. Must be an Assembly or Genome.')

    return [
        fasta_client.get_assembly_as_fasta(
            {'ref': _assembly_ref_for(obj, ref)})['path']
        for obj, ref in zip(fetched['data'], refs)
    ]
def get_genome_data_files(self, genome_ref):
    """
    Produce the FASTA and GFF files for a genome.

    :param genome_ref: workspace reference of the genome
    :returns: dict with keys "assembly" (FASTA file path) and "gff" (GFF
        file path)
    :raises ValueError: if the genome references zero or more than one
        Assembly/ContigSet
    :raises IOError: if either utility result lacks its path key
    """
    print('Fetching assembly or contig information from genome...')
    candidate_refs = self._get_assembly_ref(genome_ref)
    ref_count = len(candidate_refs)
    if ref_count == 0:
        raise ValueError(
            'There was no Assembly or ContigSet found as a reference to this genome. Unable to build browser data.'
        )
    if ref_count > 1:
        raise ValueError(
            'This genome, {}, appears to reference {} Assemblies or ContigSets, with these object references: {}'
            .format(genome_ref, ref_count, candidate_refs))
    print('Done! Found valid assembly data.')

    print('Converting sequence data to FASTA file...')
    fasta_result = AssemblyUtil(self.callback_url).get_assembly_as_fasta(
        {'ref': candidate_refs[0]})
    print('Done! FASTA file created: {}'.format(fasta_result))
    if "path" not in fasta_result:
        raise IOError(
            'FASTA file was not apparently generated from the given genome fasta_file. fasta_file object missing key "path": {}'
            .format(fasta_result))
    assembly_path = fasta_result.get('path', None)

    print('Converting genome annotation data to gff file...')
    gff_result = GenomeFileUtil(self.callback_url).genome_to_gff(
        {'genome_ref': genome_ref})
    print('Done! GFF file created: {}'.format(gff_result))
    if "file_path" not in gff_result:
        raise IOError(
            'GFF file was not apparently generated from the given genome. gff_file object missing key "file_path": {}'
            .format(gff_result))
    gff_path = gff_result.get('file_path', None)

    return {"assembly": assembly_path, "gff": gff_path}
def annotate_contigs(self, ctx, params):
    """
    Annotate an Assembly with Prokka and save the result as a Genome.

    :param params: instance of type "AnnotateContigsParams".
       Required parameters:
         assembly_ref - reference to Assembly object,
         output_workspace - output workspace name,
         output_genome_name - output object name.
       Optional parameters (correspond to PROKKA command line arguments):
         scientific_name  Genome scientific name (default 'Unknown')
         kingdom          Annotation mode: Archaea|Bacteria|Mitochondria|Viruses
                          (default 'Bacteria')
         genus            Genus name (triggers --usegenus)
         gcode            Genetic code / Translation table; 0 is passed to
                          Prokka when not given
         metagenome       Improve gene predictions for highly fragmented
                          genomes (boolean, default OFF)
         rawproduct       Do not clean up /product annotation (boolean)
         fast             Fast mode - skip CDS /product searching (boolean)
         mincontiglen     Minimum contig size
         evalue           Similarity e-value cut-off
         rfam             Enable searching for ncRNAs with Infernal+Rfam
                          (boolean, SLOW)
         norrna           Don't run rRNA search (boolean)
         notrna           Don't run tRNA search (boolean)
       Boolean flags are only applied when the value equals 1.
    :returns: instance of type "AnnotateContigsOutput" -> structure:
       parameter "output_genome_ref" of type "genome_ref", parameter
       "report_name" of String, parameter "report_ref" of String
    :raises ValueError: if the assembly has 30000 or more contigs, if the
       unsupported 'gram' parameter is given, or if Prokka produced no GFF
    """
    # ctx is the context object
    # return variables are: returnVal
    #BEGIN annotate_contigs
    print("Input parameters: " + pformat(params))
    assembly_ref = self._get_input_value(params, 'assembly_ref')
    output_genome_name = self._get_input_value(params, 'output_genome_name')
    output_workspace = self._get_input_value(params, 'output_workspace')
    ws_client = workspaceService(self.ws_url, token=ctx['token'])

    # Index the SEED subsystem ontology terms by the EC number embedded at
    # the end of their name, e.g. "... (EC 1.2.3.4)".
    sso_ret = ws_client.get_objects([{
        'ref': "KBaseOntology/seed_subsystem_ontology"
    }])[0]
    sso = sso_ret['data']
    ec_to_sso = {}
    for sso_id in sso['term_hash']:
        term = sso['term_hash'][sso_id]
        sso_name = term['name']
        if "(EC " in sso_name and sso_name.endswith(")"):
            ec = sso_name[sso_name.index("(EC ") + 4:-1].strip()
            ec_to_sso.setdefault(ec, []).append(term)
    print("EC found in SSO: " + str(len(ec_to_sso)))
    sso_info = sso_ret['info']
    sso_ref = str(sso_info[6]) + '/' + str(sso_info[0]) + '/' + str(
        sso_info[4])
    with open('/kb/module/work/seed_so.json', 'w') as outfile:
        json.dump(sso, outfile, sort_keys=True, indent=4)

    # Assembly statistics come from the object's metadata.
    assembly_info = ws_client.get_object_info_new({
        'objects': [{
            'ref': assembly_ref
        }],
        'includeMetadata': 1
    })[0]
    assembly_meta = assembly_info[10]
    gc_content = float(assembly_meta.get("GC content"))
    dna_size = int(assembly_meta.get("Size"))
    if 'N Contigs' in assembly_meta:
        n_contigs = int(assembly_meta.get("N Contigs"))
    else:
        # Fall back to counting the contigs in the object itself.
        contig = ws_client.get_objects([{'ref': assembly_ref}])[0]
        n_contigs = len(contig['data']['contigs'])
    if n_contigs >= 30000:
        print("Hmmm.  There are over 30,000 contigs in this Assembly. ")
        print(
            "It looks like you are trying to run Prokka on a metagenome or non-prokayritoc data set."
        )
        print(
            "If this is a metagenome data set we recommend using an App like MaxBin to first bin the contigs into genome-like bins."
        )
        print(
            "These bins can then be individually annotated as a single genome using Prokka."
        )
        print(
            "If this data comes from a Eukaryotic sample, KBase does not currently have an annotation app designed for Eukaryotes."
        )
        print(
            "Alternatively, you can try reducing the number of contigs using a filter app."
        )
        raise ValueError(
            'Too many contigs for Prokka.  See logs for details and suggestios'
        )

    au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'], token=ctx['token'])
    orig_fasta_file = au.get_assembly_as_fasta({'ref': assembly_ref})['path']

    # Prokka is given short synthetic contig ids; remember the mapping so the
    # original ids can be restored in feature locations.
    renamed_fasta_file = orig_fasta_file + "_renamed.fna"
    records = []
    new_ids_to_old = {}
    for contig_counter, record in enumerate(
            SeqIO.parse(orig_fasta_file, "fasta"), 1):
        old_id = record.id
        new_id = "contig_" + str(contig_counter)
        records.append(
            SeqRecord(record.seq, id=new_id,
                      description="(" + old_id + ")"))
        new_ids_to_old[new_id] = old_id
    SeqIO.write(records, renamed_fasta_file, "fasta")

    output_dir = "/kb/module/work/tmp/temp_" + str(uuid.uuid4())
    scientific_name = params.get('scientific_name', 'Unknown')
    # --kingdom [X]  Annotation mode: Archaea|Bacteria|Mitochondria|Viruses (default 'Bacteria')
    kingdom = str(params.get('kingdom', "Bacteria"))
    prokka_cmd_list = [
        "perl", "/kb/prokka/bin/prokka", "--outdir", output_dir, "--prefix",
        "mygenome", "--kingdom", kingdom, "--cpus", '1'
    ]
    # --genus [X]  Genus name (triggers to use --usegenus)
    if params.get('genus'):
        prokka_cmd_list.extend(
            ['--genus', str(params['genus']), '--usegenus'])
    # --gcode [N]  Genetic code / Translation table (set if --kingdom is set) (default '0')
    gcode = params.get('gcode') or 0
    prokka_cmd_list.extend(['--gcode', str(gcode)])
    # --gram [X]  Gram: -/neg +/pos (default '')
    if params.get('gram'):
        raise ValueError(
            "gram parameter is not supported in current Prokka installation"
        )
    # Boolean switches (all default OFF); only a value equal to 1 enables one.
    for flag in ('metagenome', 'rawproduct', 'fast'):
        if flag in params and params[flag] == 1:
            prokka_cmd_list.append("--" + flag)
    # --mincontiglen [N]  Minimum contig size [NCBI needs 200] (default '1')
    if params.get('mincontiglen'):
        prokka_cmd_list.extend(
            ['--mincontiglen', str(params['mincontiglen'])])
    # --evalue [n.n]  Similarity e-value cut-off (default '1e-06')
    if params.get('evalue'):
        prokka_cmd_list.extend(['--evalue', str(params['evalue'])])
    for flag in ('rfam', 'norrna', 'notrna'):
        if flag in params and params[flag] == 1:
            prokka_cmd_list.append("--" + flag)
    prokka_cmd_list.append(renamed_fasta_file)
    print("Prokka command line: " + str(prokka_cmd_list))

    exit_code = subprocess.Popen(prokka_cmd_list, cwd=self.scratch).wait()
    if exit_code != 0:
        # The exit status used to be dropped silently; log it so a later
        # failure on missing output files can be traced back to Prokka.
        print("Prokka exited with non-zero status: " + str(exit_code))

    faa_file = output_dir + "/mygenome.faa"
    cds_to_prot = {}
    for record in SeqIO.parse(faa_file, "fasta"):
        cds_to_prot[record.id] = str(record.seq)
    ffn_file = output_dir + "/mygenome.ffn"
    cds_to_dna = {}
    for record in SeqIO.parse(ffn_file, "fasta"):
        cds_to_dna[record.id] = str(record.seq)
    gff_file = output_dir + "/mygenome.gff"
    if not os.path.isfile(gff_file):
        raise ValueError("PROKKA output GFF file is not found")

    cdss = []
    mrnas = []
    features = []
    non_hypothetical = 0
    genes_with_ec = 0
    genes_with_sso = 0
    prot_lengths = []
    with open(gff_file, "r") as f1:
        for rec in GFF.parse(f1):
            contig_id = new_ids_to_old[str(rec.id)]
            for ft in rec.features:
                # Convert the 0-based half-open location to a 1-based
                # [contig, start, strand, length] entry; for '-' strand
                # features the start is the highest coordinate.
                loc = ft.location
                min_pos = int(loc.start) + 1
                max_pos = int(loc.end)
                strand = '+' if loc.strand == 1 else '-'
                flen = max_pos - min_pos + 1
                start = min_pos if strand == '+' else max_pos
                location = [[contig_id, start, strand, flen]]
                qualifiers = ft.qualifiers
                generated_id = self._get_qualifier_value(
                    qualifiers.get('ID'))
                if not generated_id:
                    # Skipping feature with no ID (mostly repeat regions)
                    continue
                dna = cds_to_dna.get(generated_id)
                if not dna:
                    # Skipping feature with no DNA (mostly repeat regions)
                    continue
                name = self._get_qualifier_value(qualifiers.get('Name'))
                ec = self._get_qualifier_value(qualifiers.get('eC_number'))
                gene = self._get_qualifier_value(qualifiers.get('gene'))
                product = self._get_qualifier_value(
                    qualifiers.get('product'))
                fid = name if name else generated_id
                aliases = []
                if gene:
                    aliases.append(gene)
                if ec:
                    aliases.append(ec)
                    genes_with_ec += 1
                # hashlib needs bytes; on Python 3 `dna` is text.
                dna_bytes = dna if isinstance(dna, bytes) else dna.encode('utf-8')
                md5 = hashlib.md5(dna_bytes).hexdigest()
                feature = {
                    'id': fid,
                    'location': location,
                    'type': 'gene',
                    'aliases': aliases,
                    'md5': md5,
                    'dna_sequence': dna,
                    'dna_sequence_length': len(dna),
                }
                if product:
                    feature['function'] = product
                    if product != "hypothetical protein":
                        non_hypothetical += 1
                if ec and ec in ec_to_sso:
                    sso_terms = {}
                    for sso_item in ec_to_sso[ec]:
                        sso_terms[sso_item['id']] = {
                            'id': sso_item['id'],
                            'evidence': [],
                            'term_name': sso_item['name'],
                            'ontology_ref': sso_ref,
                            'term_lineage': []
                        }
                    feature['ontology_terms'] = {'SSO': sso_terms}
                    genes_with_sso += 1
                cds = None
                mrna = None
                prot = cds_to_prot.get(generated_id)
                if prot:
                    # Protein-coding gene: add matching CDS and mRNA entries.
                    cds_id = fid + "_CDS"
                    mrna_id = fid + "_mRNA"
                    prot_len = len(prot)
                    prot_lengths.append(prot_len)
                    feature['protein_translation'] = prot
                    feature['protein_translation_length'] = prot_len
                    feature['cdss'] = [cds_id]
                    feature['mrnas'] = [mrna_id]
                    cds = {
                        'id': cds_id,
                        'location': location,
                        'md5': md5,
                        'parent_gene': fid,
                        'parent_mrna': mrna_id,
                        'function': (product if product else ''),
                        'ontology_terms': {},
                        'protein_translation': prot,
                        'protein_translation_length': prot_len,
                        'aliases': aliases
                    }
                    mrna = {
                        'id': mrna_id,
                        'location': location,
                        'md5': md5,
                        'parent_gene': fid,
                        'cds': cds_id
                    }
                features.append(feature)
                if cds:
                    cdss.append(cds)
                if mrna:
                    mrnas.append(mrna)

    genome = {
        'id': 'Unknown',
        'features': features,
        'scientific_name': scientific_name,
        'domain': kingdom,
        'genetic_code': gcode,
        'assembly_ref': assembly_ref,
        'cdss': cdss,
        'mrnas': mrnas,
        'source': 'PROKKA annotation pipeline',
        'gc_content': gc_content,
        'dna_size': dna_size,
        'reference_annotation': 0
    }
    prov = ctx.provenance()
    ga = GenomeAnnotationAPI(self.sw_url, token=ctx['token'])
    info = ga.save_one_genome_v1({
        'workspace': output_workspace,
        'name': output_genome_name,
        'data': genome,
        'provenance': prov
    })['info']
    genome_ref = str(info[6]) + '/' + str(info[0]) + '/' + str(info[4])

    # Prepare report.  Guard the average against an annotation run that
    # predicted no proteins (previously a ZeroDivisionError).
    if prot_lengths:
        avg_prot_len = int(sum(prot_lengths) / float(len(prot_lengths)))
    else:
        avg_prot_len = 0
    report = ''.join([
        'Genome saved to: ' + output_workspace + '/' + output_genome_name + '\n',
        'Number of genes predicted: ' + str(len(features)) + '\n',
        'Number of protein coding genes: ' + str(len(prot_lengths)) + '\n',
        'Number of genes with non-hypothetical function: ' + str(
            non_hypothetical) + '\n',
        'Number of genes with EC-number: ' + str(genes_with_ec) + '\n',
        'Number of genes with Seed Subsystem Ontology: ' + str(
            genes_with_sso) + '\n',
        'Average protein length: ' + str(avg_prot_len) + ' aa.\n',
    ])
    kbr = KBaseReport(os.environ['SDK_CALLBACK_URL'], token=ctx['token'])
    report_info = kbr.create_extended_report({
        'message': report,
        'objects_created': [{
            'ref': genome_ref,
            'description': 'Annotated genome'
        }],
        'report_object_name': 'kb_prokka_report_' + str(uuid.uuid4()),
        'workspace_name': output_workspace
    })

    returnVal = {
        'output_genome_ref': genome_ref,
        'report_name': report_info['name'],
        'report_ref': report_info['ref']
    }
    #END annotate_contigs

    # At some point might do deeper type checking...
    if not isinstance(returnVal, dict):
        raise ValueError('Method annotate_contigs return value ' +
                         'returnVal is not type dict as required.')
    # return the results
    return [returnVal]
def do_assembly(self, assemblyRef, wsName):
    """
    Run the VirSorter wrapper on an assembly and save a text report.

    The assembly is downloaded as FASTA, moved into a fresh working directory
    under /kb/module/work/tmp, processed by
    wrapper_phage_contigs_sorter_iPlant.pl, and the resulting
    VIRSorter_global-phage-signal.csv is embedded in a KBase report.

    :param assemblyRef: workspace reference of the assembly
    :param wsName: name of the workspace the report is saved to
    :returns: tuple (report name, report reference)
    """
    print("SDK_CALLBACK_URL " + os.environ['SDK_CALLBACK_URL'])
    au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
    input_fasta_file = au.get_assembly_as_fasta({'ref': assemblyRef})

    # Working directory layout: <newtmp>/input/<fasta file>
    newtmp = "/kb/module/work/tmp/tmp_" + self.create_random_string()
    os.mkdir(newtmp)
    os.mkdir(newtmp + "/input")
    newfasta = newtmp + "/input/" + os.path.basename(input_fasta_file['path'])
    print("newfasta " + newfasta)
    os.rename(input_fasta_file['path'], newfasta)

    # Pass an argument list and no shell, so the paths are never
    # re-interpreted by a shell.  cmdstring is kept for the report and is
    # the same text the original concatenation produced.
    args = ["wrapper_phage_contigs_sorter_iPlant.pl", "--db", "2",
            "--fna", newfasta, "--wdir", newtmp]
    print(str(args))
    cmdstring = " ".join(args)

    print("Executing")
    # universal_newlines=True yields text output on both Python 2 and 3, so
    # the string concatenations below are valid.
    cmdProcess = subprocess.Popen(args,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE,
                                  universal_newlines=True)
    stdout, stderr = cmdProcess.communicate()
    print("Done " + str(cmdProcess))
    print(" stdout: " + stdout)
    print(" stderr: " + stderr)

    print('Saving report')
    kbr = KBaseReport(self.callback_url, service_ver='dev')
    report = "cmdstring: " + str(cmdstring) + " stdout: " + str(stdout) + " stderr: " + str(stderr)

    # Flatten the CSV to one line, then turn commas into tabs and start a
    # new line before each '##' section header.
    virout = newtmp + "/" + "VIRSorter_global-phage-signal.csv"
    with open(virout, 'r') as myfile:
        data = myfile.read().replace('\n', '')
    print("wsName " + str(wsName))

    data = data.replace(",", "\t")
    data = data.replace("##", "\n##")
    report = report + "\n\n***** VirSorter output *****\n" + data

    report_data = {'message': report,
                   'objects_created': None,
                   'direct_html_link_index': None,
                   'html_links': None,
                   'report_object_name': 'kb_virsorter_' + str(uuid.uuid4()),
                   'workspace_name': wsName
                   }
    print("report_data")
    print(str(report_data))
    report_info = kbr.create_extended_report(report_data)

    reportName = report_info['name']
    reportRef = report_info['ref']
    return reportName, reportRef
def filter_contigs(self, ctx, params):
    """
    Filter an assembly's contigs by minimum length and save the result.

    :param params: instance of type "ContigFilterParams" (Input parameters)
       -> structure: parameter "assembly_ref" of String, parameter
       "min_length" of Long, parameter "workspace_name" of String
    :returns: instance of type "ContigFilterResults" (Output results) ->
       structure: parameter "report_name" of String, parameter "report_ref"
       of String, parameter "filtered_assembly_ref" of String, parameter
       "n_total" of Long, parameter "n_remaining" of Long
    :raises ValueError: if a required parameter is missing or invalid
    """
    # ctx is the context object
    # return variables are: returnVal
    #BEGIN filter_contigs
    # `basestring` only exists on Python 2; fall back to `str` elsewhere.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    for name in ['min_length', 'assembly_ref', 'workspace_name']:
        if name not in params:
            raise ValueError('Parameter "' + name + '" is required but missing')
    if not isinstance(params['min_length'], int) or (params['min_length'] < 0):
        raise ValueError('Min length must be a non-negative integer')
    if not isinstance(params['assembly_ref'], string_types) or not len(params['assembly_ref']):
        raise ValueError('Pass in a valid assembly reference string')

    ws_name = params['workspace_name']
    min_length = params['min_length']
    assembly_util = AssemblyUtil(self.callback_url)
    fasta = assembly_util.get_assembly_as_fasta({'ref': params['assembly_ref']})

    # Keep every contig whose length is at least min_length, counting all
    # contigs seen along the way.
    good_contigs = []
    n_total = 0
    for record in SeqIO.parse(fasta['path'], 'fasta'):
        n_total += 1
        if len(record.seq) >= min_length:
            good_contigs.append(record)
    n_remaining = len(good_contigs)

    # Create a file to hold the filtered data
    filtered_path = os.path.join(self.scratch, 'filtered.fasta')
    SeqIO.write(good_contigs, filtered_path, 'fasta')

    # Upload the filtered data to the workspace
    new_ref = assembly_util.save_assembly_from_fasta({
        'file': {'path': filtered_path},
        'workspace_name': ws_name,
        'assembly_name': fasta['assembly_name']
    })

    # Create an output summary message for the report
    text_message = "".join([
        'Filtered assembly to ',
        str(n_remaining),
        ' contigs out of ',
        str(n_total)
    ])

    # Data for creating the report, referencing the assembly we uploaded
    html_dir = os.path.join(self.scratch, 'html')
    html_index_path = os.path.join(html_dir, 'index.html')
    file_path = os.path.join(self.scratch, 'myfile.txt')
    with open(file_path, 'w') as f:
        f.write('hello world')
    # A second run in the same scratch area must not fail on the directory.
    if not os.path.isdir(html_dir):
        os.mkdir(html_dir)
    with open(html_index_path, 'w') as f:
        f.write('<p><b>hello world</b></p>')
    print('xyz1', os.listdir(html_dir))
    print('xyz2', os.listdir(self.scratch))

    html_links = [{
        'path': html_index_path,
        'name': 'main.html',
        'description': 'Sample description'
    }]
    file_links = [{
        'path': file_path,
        'name': 'file.txt',
        'description': 'Sample file description'
    }] + html_links

    # Extended report
    report_data = {
        'objects_created': [{'ref': new_ref, 'description': 'Filtered contigs'}],
        'html_links': html_links,
        'file_links': file_links,
        'warnings': ['warning 1', 'warning 2'],
        'report_object_name': 'my_report',
        'direct_html': '<p>Hello</p>',
        'message': text_message,
        'workspace_name': ws_name,
        'direct_html_link_index': 0,
        'html_window_height': 800,
        'summary_window_height': 800
    }

    # Initialize the report
    kbase_report = KBaseReport(self.callback_url)
    report = kbase_report.create_extended_report(report_data)

    # Return the report reference and name in our results
    returnVal = {
        'report_ref': report['ref'],
        'report_name': report['name'],
        'n_total': n_total,
        'n_remaining': n_remaining,
        'filtered_assembly_ref': new_ref
    }
    #END filter_contigs

    # At some point might do deeper type checking...
    if not isinstance(returnVal, dict):
        raise ValueError('Method filter_contigs return value ' +
                         'returnVal is not type dict as required.')
    # return the results
    return [returnVal]
def get_promoter_for_gene(self, ctx, params):
    """
    :param params: instance of type "get_promoter_for_gene_input" (Genome
       is a KBase genome Featureset is a KBase featureset Promoter_length
       is the length of promoter requested for all genes) -> structure:
       parameter "genome_ref" of String, parameter "featureSet_ref" of
       String, parameter "promoter_length" of Long
    :returns: instance of type "get_promoter_for_gene_output_params" ->
       structure: parameter "report_name" of String, parameter
       "report_ref" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN get_promoter_for_gene
    # Keys actually read from params by this implementation:
    #   'Genome', 'featureSet', 'promoter_length', 'workspace_name'.
    # ctx['token'] is used to authenticate the report client.
    #
    # For every feature id in the FeatureSet the first location of the
    # matching genome feature is looked up, the flanking sequence of
    # length promoter_length is cut out of the assembly, and all promoters
    # are written as FASTA text into an HTML report.
    dfu = DataFileUtil(self.callback_url)
    objectRefs = {'object_refs': [params['Genome'], params['featureSet']]}
    objects = dfu.get_objects(objectRefs)
    genome = objects['data'][0]['data']
    featureSet = objects['data'][1]['data']
    assembly_ref = {'ref': genome['assembly_ref']}
    promoter_length = int(params['promoter_length'])

    # Debug dumps of the downloaded objects into the module work area.
    with open('/kb/module/work/genome.json', 'w') as f:
        json.dump(genome, f)
    with open('/kb/module/work/featureSet.json', 'w') as f:
        json.dump(featureSet, f)

    print('Downloading Assembly data as a Fasta file.')
    assemblyUtil = AssemblyUtil(self.callback_url)
    fasta_file = assemblyUtil.get_assembly_as_fasta(assembly_ref)

    # Parse the FASTA a single time (the file used to be re-read once per
    # feature) and keep plain strings so the report text stays a str.
    contigs = {}
    for record in SeqIO.parse(fasta_file['path'], 'fasta'):
        contigs[record.id] = str(record.seq)

    # First location of every requested feature, keyed by feature id.
    # Only requested features are indexed, so unrelated genome features
    # without a location cannot break the lookup.
    wanted = featureSet['elements']
    locations = {}
    for gene in genome['features']:
        gene_id = gene['id']
        if gene_id in wanted and gene_id not in locations:
            locations[gene_id] = gene['location'][0]

    # Bases outside ACGT (e.g. N) are reported as N instead of raising.
    complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
    fasta_entries = []
    for feature in featureSet['elements']:
        location = locations.get(feature)
        if location is None:
            # Previously this reused the location of the prior feature
            # (or raised NameError for the very first one).
            print('Feature not found in genome, skipping: ' + str(feature))
            continue
        contig_id, position, strand = location[0], location[1], location[2]
        sequence = contigs.get(contig_id)
        if sequence is None:
            print('Contig not found in assembly, skipping: ' + str(contig_id))
            continue

        if strand == '+':
            #might need to offset by 1?
            end = max(position, 0)
            # Clamp the start (not the end): a negative slice start would
            # wrap around to the far end of the contig.
            start = max(end - promoter_length, 0)
            promoter = sequence[start:end].upper()
        elif strand == '-':
            start = position
            # Slice ends are exclusive, so the upper bound is len(sequence).
            end = min(start + promoter_length, len(sequence))
            promoter = ''.join(
                complement.get(base, 'N')
                for base in reversed(sequence[start:end].upper()))
        else:
            print('Error on orientation')
            continue
        fasta_entries.append('>' + feature + '\n' + promoter + '\n')
    prom = ''.join(fasta_entries)

    timestamp = int(
        (datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds()
        * 1000)
    html_output_dir = os.path.join(self.shared_folder,
                                   'output_html.' + str(timestamp))
    if not os.path.exists(html_output_dir):
        os.makedirs(html_output_dir)
    html_file = 'promoter.html'
    output_html_file_path = os.path.join(html_output_dir, html_file)

    html_report_lines = "<html><body>"
    html_report_lines += "<pre>" + prom + "</pre>"
    html_report_lines += "</body></html>"

    # No buffering argument: unbuffered text mode is rejected by Python 3,
    # and the with-block flushes and closes the file anyway.
    with open(output_html_file_path, 'w') as html_handle:
        html_handle.write(html_report_lines)

    try:
        html_upload_ret = dfu.file_to_shock({
            'file_path': html_output_dir,
            'make_handle': 0,
            'pack': 'zip'
        })
    except Exception as upload_error:
        # Same exception type as before, but keep the underlying cause.
        raise ValueError('error uploading HTML file to shock: ' +
                         str(upload_error))

    reportName = 'identify_promoter_report_' + str(uuid.uuid4())
    reportObj = {
        'objects_created': [],
        'message': '',
        'direct_html': '',
        # NOTE(review): 'direct_html_index' is kept because the original
        # sent it; 'direct_html_link_index' is the key set alongside
        # html_links - confirm whether the former can be dropped.
        'direct_html_index': 0,
        'direct_html_link_index': 0,
        'file_links': [],
        'html_links': [{
            'shock_id': html_upload_ret['shock_id'],
            'name': html_file,
            'label': 'View'
        }],
        'html_window_height': 220,
        'workspace_name': params['workspace_name'],
        'report_object_name': reportName
    }

    report = KBaseReport(self.callback_url, token=ctx['token'])
    report_info = report.create_extended_report(reportObj)
    output = {
        'report_name': report_info['name'],
        'report_ref': report_info['ref']
    }
    #END get_promoter_for_gene

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method get_promoter_for_gene return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def filter_contigs(self, ctx, params):
    """
    Keep only the contigs of an Assembly that are at least min_length long,
    save them as a new Assembly and create a report about the result.

    :param params: instance of type "FilterContigsParams" -> structure:
       parameter "assembly_input_ref" of type "assembly_ref" (a string
       holding a KBase ID reference to an Assembly data object),
       parameter "workspace_name" of String, parameter "min_length" of
       Long
    :returns: one-element list holding an instance of type
       "FilterContigsResults" -> structure: parameter "report_name" of
       String, parameter "report_ref" of String, parameter
       "assembly_output" of type "assembly_ref", parameter
       "n_initial_contigs" of Long, parameter "n_contigs_removed" of
       Long, parameter "n_contigs_remaining" of Long
    :raises ValueError: when a required parameter is missing, or
       min_length is not an integer or is negative
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN filter_contigs

    # Anything printed here ends up in the App log.
    print('Starting Filter Contigs function. Params=')
    pprint(params)

    # Step 1 - Validate the parameters. Narrative Apps pre-validate their
    # inputs, but direct callers (other SDK modules, advanced users) do
    # not, so every required key is checked here with a readable message.
    print('Validating parameters.')
    for required_key in ('workspace_name', 'assembly_input_ref',
                         'min_length'):
        if required_key not in params:
            raise ValueError('Parameter {} is not set in input arguments'
                             .format(required_key))
    workspace_name = params['workspace_name']
    assembly_input_ref = params['assembly_input_ref']
    raw_min_length = params['min_length']
    try:
        min_length = int(raw_min_length)
    except ValueError:
        raise ValueError(
            'Cannot parse integer from min_length parameter (' +
            str(raw_min_length) + ')')
    if min_length < 0:
        raise ValueError('min_length parameter cannot be negative (' +
                         str(min_length) + ')')

    # Step 2 - Fetch the input Assembly as a FASTA file; the returned
    # structure carries the path of the file that was written.
    print('Downloading Assembly data as a Fasta file.')
    assemblyUtil = AssemblyUtil(self.callback_url)
    download_request = {'ref': assembly_input_ref}
    fasta_file = assemblyUtil.get_assembly_as_fasta(download_request)

    # Step 3 - Walk the records with BioPython, keeping the long-enough
    # ones, then write the survivors to a new FASTA file.
    kept_records = []
    n_total = 0
    for contig in SeqIO.parse(fasta_file['path'], 'fasta'):
        n_total += 1
        if len(contig.seq) < min_length:
            continue
        kept_records.append(contig)
    n_remaining = len(kept_records)

    summary = ('Filtered Assembly to ' + str(n_remaining) +
               ' contigs out of ' + str(n_total))
    print(summary)
    filtered_fasta_file = os.path.join(self.shared_folder,
                                       'filtered.fasta')
    SeqIO.write(kept_records, filtered_fasta_file, 'fasta')

    # Step 4 - Upload the filtered FASTA as a new Assembly object.
    print('Uploading filtered Assembly data.')
    upload_request = {
        'file': {'path': filtered_fasta_file},
        'workspace_name': workspace_name,
        'assembly_name': fasta_file['assembly_name']
    }
    new_assembly = assemblyUtil.save_assembly_from_fasta(upload_request)

    # Step 5 - Create the report that references the new Assembly.
    created_objects = [{
        'ref': new_assembly,
        'description': 'Filtered contigs'
    }]
    reportObj = {
        'objects_created': created_objects,
        'text_message': summary
    }
    report_client = KBaseReport(self.callback_url)
    report_info = report_client.create({
        'report': reportObj,
        'workspace_name': workspace_name
    })

    # Step 6 - Assemble the result structure handed back to the caller.
    output = {
        'report_name': report_info['name'],
        'report_ref': report_info['ref'],
        'assembly_output': new_assembly,
        'n_initial_contigs': n_total,
        'n_contigs_removed': n_total - n_remaining,
        'n_contigs_remaining': n_remaining
    }
    print('returning:' + pformat(output))

    #END filter_contigs

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method filter_contigs return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def stage_input(self, input_ref, fasta_file_extension):
    '''
    Stage the FASTA input for CheckM from a workspace object reference.

    input_ref may point to an Assembly / ContigSet or to a BinnedContigs
    object (a Genome is recognised but not implemented yet). A fresh
    directory is created in the scratch area and filled with FASTA files
    whose names end in fasta_file_extension; a combined FASTA of all
    staged files is then produced via self.cat_fasta_files.

    ex:
    staged_input = stage_input('124/15/1', 'fna')

    Returns a dict with the keys 'input_dir', 'folder_suffix' and
    'all_seq_fasta'. Raises ValueError for unsupported object types or
    when the Assembly FASTA file was not produced.
    '''
    # A millisecond timestamp keeps the staging paths distinct per call.
    suffix = str(int(time.time() * 1000))
    scratch_dir = self.scratch
    input_dir = os.path.join(scratch_dir, 'bins_' + suffix)
    combined_name = 'all_sequences_' + suffix + '.' + fasta_file_extension
    all_seq_fasta = os.path.join(scratch_dir, combined_name)

    # Look up the object info tuple to learn the name and type.
    # Positions in the tuple: 0 objid, 1 name, 2 type, 3 save_date,
    # 4 version, 5 saved_by, 6 wsid, 7 workspace, 8 chsum, 9 size,
    # 10 meta.
    ws = Workspace(self.ws_url)
    info_request = {'objects': [{'ref': input_ref}]}
    input_info = ws.get_object_info3(info_request)['infos'][0]
    obj_name = input_info[1]
    # Drop the '-<version>' part of the type string.
    type_name = input_info[2].split('-')[0]

    assembly_types = ('KBaseGenomeAnnotations.Assembly',
                      'KBaseGenomes.ContigSet')
    binned_type = 'KBaseMetagenomes.BinnedContigs'

    # Reject everything that cannot be staged before touching the disk.
    if type_name == 'KBaseGenomes.Genome':
        raise ValueError('Cannot yet stage fasta file input directory from KBaseGenomes.Genome')
    if type_name not in assembly_types and type_name != binned_type:
        raise ValueError('Cannot stage fasta file input directory from type: ' + type_name)

    if type_name in assembly_types:
        # Single FASTA named after the object, written into a new folder.
        assembly_util = AssemblyUtil(self.callback_url)
        os.makedirs(input_dir)
        filename = os.path.join(input_dir, obj_name + '.' + fasta_file_extension)
        assembly_util.get_assembly_as_fasta({'ref': input_ref, 'filename': filename})
        if not os.path.isfile(filename):
            raise ValueError('Error generating fasta file from an Assembly or ContigSet with AssemblyUtil')
    else:
        # BinnedContigs: download the bins as FASTA, move the resulting
        # directory into place and normalise the file extensions.
        metagenome_util = MetagenomeUtils(self.callback_url)
        download = metagenome_util.binned_contigs_to_file({'input_ref': input_ref, 'save_to_shock': 0})
        bin_file_dir = download['bin_file_directory']
        os.rename(bin_file_dir, input_dir)
        self.set_fasta_file_extensions(input_dir, fasta_file_extension)

    # create summary fasta file with all bins
    self.cat_fasta_files(input_dir, fasta_file_extension, all_seq_fasta)

    staged = {
        'input_dir': input_dir,
        'folder_suffix': suffix,
        'all_seq_fasta': all_seq_fasta
    }
    return staged
class CufflinksUtils: CUFFLINKS_TOOLKIT_PATH = '/opt/cufflinks/' GFFREAD_TOOLKIT_PATH = '/opt/cufflinks/' def __init__(self, config): """ :param config: :param logger: :param directory: Working directory :param urls: Service urls """ # BEGIN_CONSTRUCTOR self.ws_url = config["workspace-url"] self.ws_url = config["workspace-url"] self.callback_url = config['SDK_CALLBACK_URL'] self.srv_wiz_url = config['srv-wiz-url'] self.token = config['KB_AUTH_TOKEN'] self.shock_url = config['shock-url'] self.dfu = DataFileUtil(self.callback_url) self.gfu = GenomeFileUtil(self.callback_url) self.au = AssemblyUtil(self.callback_url) self.rau = ReadsAlignmentUtils(self.callback_url) self.set_api = SetAPI(self.srv_wiz_url, service_ver='dev') self.eu = ExpressionUtils(self.callback_url) self.ws = Workspace(self.ws_url, token=self.token) self.scratch = os.path.join(config['scratch'], str(uuid.uuid4())) self._mkdir_p(self.scratch) self.tool_used = "Cufflinks" self.tool_version = os.environ['VERSION'] # END_CONSTRUCTOR pass def parse_FPKMtracking_calc_TPM(self, filename): """ Generates TPM from FPKM :return: """ fpkm_dict = {} tpm_dict = {} gene_col = 0 fpkm_col = 9 sum_fpkm = 0.0 with open(filename) as f: next(f) for line in f: larr = line.split("\t") gene_id = larr[gene_col] if gene_id != "": fpkm = float(larr[fpkm_col]) sum_fpkm = sum_fpkm + fpkm fpkm_dict[gene_id] = math.log(fpkm + 1, 2) tpm_dict[gene_id] = fpkm if sum_fpkm == 0.0: log("Warning: Unable to calculate TPM values as sum of FPKM values is 0" ) else: for g in tpm_dict: tpm_dict[g] = math.log((tpm_dict[g] / sum_fpkm) * 1e6 + 1, 2) return fpkm_dict, tpm_dict def _mkdir_p(self, path): """ _mkdir_p: make directory for given path """ if not path: return try: os.makedirs(path) except OSError as exc: if exc.errno == errno.EEXIST and os.path.isdir(path): pass else: raise def _validate_run_cufflinks_params(self, params): """ _validate_run_cufflinks_params: Raises an exception if params are invalid """ log('Start validating 
run_cufflinks params') # check for required parameters for p in ['alignment_object_ref', 'workspace_name', 'genome_ref']: if p not in params: raise ValueError( '"{}" parameter is required, but missing'.format(p)) def _run_command(self, command): """ _run_command: run command and print result """ log('Start executing command:\n{}'.format(command)) pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True) output = pipe.communicate()[0] exitCode = pipe.returncode if (exitCode == 0): log('Executed command:\n{}\n'.format(command) + 'Exit Code: {}\nOutput:\n{}'.format(exitCode, output)) else: error_msg = 'Error running command:\n{}\n'.format(command) error_msg += 'Exit Code: {}\nOutput:\n{}'.format(exitCode, output) raise ValueError(error_msg) def _run_gffread(self, gff_path, gtf_path): """ _run_gffread: run gffread script ref: http://cole-trapnell-lab.github.io/cufflinks/file_formats/#the-gffread-utility """ log('converting gff to gtf') command = self.GFFREAD_TOOLKIT_PATH + '/gffread ' command += "-E {0} -T -o {1}".format(gff_path, gtf_path) self._run_command(command) def _create_gtf_annotation_from_genome(self, genome_ref): """ Create reference annotation file from genome """ ref = self.ws.get_object_subset([{ 'ref': genome_ref, 'included': ['contigset_ref', 'assembly_ref'] }]) if 'contigset_ref' in ref[0]['data']: contig_id = ref[0]['data']['contigset_ref'] elif 'assembly_ref' in ref[0]['data']: contig_id = ref[0]['data']['assembly_ref'] if contig_id is None: raise ValueError( "Genome at {0} does not have reference to the assembly object". 
format(genome_ref)) print(contig_id) log("Generating GFF file from Genome") try: ret = self.au.get_assembly_as_fasta({'ref': contig_id}) output_file = ret['path'] mapping_filename = c_mapping.create_sanitized_contig_ids( output_file) os.remove(output_file) # get the GFF ret = self.gfu.genome_to_gff({'genome_ref': genome_ref}) genome_gff_file = ret['file_path'] c_mapping.replace_gff_contig_ids(genome_gff_file, mapping_filename, to_modified=True) gtf_ext = ".gtf" if not genome_gff_file.endswith(gtf_ext): gtf_path = os.path.splitext(genome_gff_file)[0] + '.gtf' self._run_gffread(genome_gff_file, gtf_path) else: gtf_path = genome_gff_file log("gtf file : " + gtf_path) except Exception: raise ValueError( "Generating GTF file from Genome Annotation object Failed : {}" .format("".join(traceback.format_exc()))) return gtf_path def _get_gtf_file(self, alignment_ref): """ _get_gtf_file: get the reference annotation file (in GTF or GFF3 format) """ result_directory = self.scratch alignment_data = self.ws.get_objects2( {'objects': [{ 'ref': alignment_ref }]})['data'][0]['data'] genome_ref = alignment_data.get('genome_id') # genome_name = self.ws.get_object_info([{"ref": genome_ref}], includeMetadata=None)[0][1] # ws_gtf = genome_name+"_GTF_Annotation" genome_data = self.ws.get_objects2({'objects': [{ 'ref': genome_ref }]})['data'][0]['data'] gff_handle_ref = genome_data.get('gff_handle_ref') if gff_handle_ref: log('getting reference annotation file from genome') annotation_file = self.dfu.shock_to_file({ 'handle_id': gff_handle_ref, 'file_path': result_directory, 'unpack': 'unpack' })['file_path'] else: annotation_file = self._create_gtf_annotation_from_genome( genome_ref) return annotation_file def _get_gtf_file_from_genome_ref(self, genome_ref): """ _get_gtf_file: get the reference annotation file (in GTF or GFF3 format) """ result_directory = self.scratch genome_data = self.ws.get_objects2({'objects': [{ 'ref': genome_ref }]})['data'][0]['data'] gff_handle_ref = 
genome_data.get('gff_handle_ref') if gff_handle_ref: log('getting reference annotation file from genome') annotation_file = self.dfu.shock_to_file({ 'handle_id': gff_handle_ref, 'file_path': result_directory, 'unpack': 'unpack' })['file_path'] else: annotation_file = self._create_gtf_annotation_from_genome( genome_ref) return annotation_file def _get_input_file(self, alignment_ref): """ _get_input_file: get input BAM file from Alignment object """ bam_file_dir = self.rau.download_alignment( {'source_ref': alignment_ref})['destination_dir'] files = os.listdir(bam_file_dir) bam_file_list = [ file for file in files if re.match(r'.*\_sorted\.bam', file) ] if not bam_file_list: bam_file_list = [ file for file in files if re.match(r'.*(?<!sorted)\.bam', file) ] if not bam_file_list: raise ValueError('Cannot find .bam file from alignment {}'.format( alignment_ref)) bam_file_name = bam_file_list[0] bam_file = os.path.join(bam_file_dir, bam_file_name) return bam_file def _generate_command(self, params): """ _generate_command: generate cufflinks command """ cufflinks_command = '/opt/cufflinks/cufflinks' cufflinks_command += (' -q --no-update-check -p ' + str(params.get('num_threads', 1))) if 'max_intron_length' in params and params[ 'max_intron_length'] is not None: cufflinks_command += (' --max-intron-length ' + str(params['max_intron_length'])) if 'min_intron_length' in params and params[ 'min_intron_length'] is not None: cufflinks_command += (' --min-intron-length ' + str(params['min_intron_length'])) if 'overhang_tolerance' in params and params[ 'overhang_tolerance'] is not None: cufflinks_command += (' --overhang-tolerance ' + str(params['overhang_tolerance'])) cufflinks_command += " -o {0} -G {1} {2}".format( params['result_directory'], params['gtf_file'], params['input_file']) log('Generated cufflinks command: {}'.format(cufflinks_command)) return cufflinks_command def _process_rnaseq_alignment_object(self, params): """ _process_alignment_object: process 
KBaseRNASeq.RNASeqAlignment type input object """ log('start processing RNASeqAlignment object\nparams:\n{}'.format( json.dumps(params, indent=1))) alignment_ref = params.get('alignment_ref') result_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(result_directory) params['result_directory'] = str(result_directory) # input files params['input_file'] = self._get_input_file(alignment_ref) if not params.get('gtf_file'): params['gtf_file'] = self._get_gtf_file(alignment_ref) if '/' not in params['genome_ref']: params['genome_ref'] = params['workspace_name'] + '/' + params[ 'genome_ref'] command = self._generate_command(params) self._run_command(command) expression_obj_ref = self._save_rnaseq_expression( result_directory, alignment_ref, params.get('workspace_name'), params.get('genome_ref'), params['gtf_file'], params['expression_suffix']) returnVal = { 'result_directory': result_directory, 'expression_obj_ref': expression_obj_ref, 'alignment_ref': alignment_ref } expression_name = self.ws.get_object_info([{ "ref": expression_obj_ref }], includeMetadata=None)[0][1] widget_params = { "output": expression_name, "workspace": params.get('workspace_name') } returnVal.update(widget_params) return returnVal def _process_kbasesets_alignment_object(self, params): """ _process_alignment_object: process KBaseRNASeq.RNASeqAlignment type input object """ log('start processing KBaseSets object\nparams:\n{}'.format( json.dumps(params, indent=1))) alignment_ref = params.get('alignment_ref') result_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(result_directory) params['result_directory'] = str(result_directory) # input files params['input_file'] = self._get_input_file(alignment_ref) if not params.get('gtf_file'): params['gtf_file'] = self._get_gtf_file(alignment_ref) command = self._generate_command(params) self._run_command(command) expression_obj_ref = self._save_kbasesets_expression( result_directory, alignment_ref, 
params.get('workspace_name'), params.get('genome_ref'), params.get('gtf_file'), params.get('expression_suffix')) returnVal = { 'result_directory': result_directory, 'expression_obj_ref': expression_obj_ref, 'alignment_ref': alignment_ref } expression_name = self.ws.get_object_info([{ "ref": expression_obj_ref }], includeMetadata=None)[0][1] widget_params = { "output": expression_name, "workspace": params.get('workspace_name') } returnVal.update(widget_params) return returnVal def _generate_html_report(self, result_directory, obj_ref): """ _generate_html_report: generate html summary report """ log('Start generating html report') html_report = list() output_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(output_directory) result_file_path = os.path.join(output_directory, 'report.html') expression_object = self.ws.get_objects2( {'objects': [{ 'ref': obj_ref }]})['data'][0] expression_object_type = expression_object.get('info')[2] Overview_Content = '' if re.match('KBaseRNASeq.RNASeqExpression-\d.\d', expression_object_type): Overview_Content += '<p>Generated Expression Object:</p><p>{}</p>'.format( expression_object.get('info')[1]) elif re.match('KBaseRNASeq.RNASeqExpressionSet-\d.\d', expression_object_type): Overview_Content += '<p>Generated Expression Set Object:</p><p>{}</p>'.format( expression_object.get('info')[1]) Overview_Content += '<br><p>Generated Expression Object:</p>' for expression_ref in expression_object['data'][ 'sample_expression_ids']: expression_name = self.ws.get_object_info( [{ "ref": expression_ref }], includeMetadata=None)[0][1] Overview_Content += '<p>{}</p>'.format(expression_name) elif re.match('KBaseSets.ExpressionSet-\d.\d', expression_object_type): pprint(expression_object) Overview_Content += '<p>Generated Expression Set Object:</p><p>{}</p>'.format( expression_object.get('info')[1]) Overview_Content += '<br><p>Generated Expression Object:</p>' for expression_ref in expression_object['data']['items']: 
expression_name = self.ws.get_object_info( [{ "ref": expression_ref['ref'] }], includeMetadata=None)[0][1] condition = expression_ref['label'] Overview_Content += '<p>condition:{0}; expression_name: {1}</p>'.format( condition, expression_name) with open(result_file_path, 'w') as result_file: with open( os.path.join(os.path.dirname(__file__), 'report_template.html'), 'r') as report_template_file: report_template = report_template_file.read() report_template = report_template.replace( '<p>Overview_Content</p>', Overview_Content) result_file.write(report_template) html_report.append({ 'path': result_file_path, 'name': os.path.basename(result_file_path), 'label': os.path.basename(result_file_path), 'description': 'HTML summary report for Cufflinks App' }) return html_report def _save_rnaseq_expression(self, result_directory, alignment_ref, workspace_name, genome_ref, gtf_file, expression_suffix): """ _save_rnaseq_expression: save Expression object to workspace """ log('start saving Expression object') alignment_object_name = self.ws.get_object_info( [{ "ref": alignment_ref }], includeMetadata=None)[0][1] # set expression name if re.match('.*_[Aa]lignment$', alignment_object_name): expression_name = re.sub('_[Aa]lignment$', expression_suffix, alignment_object_name) else: # assume user specified suffix expression_name = alignment_object_name + expression_suffix expression_ref = self.eu.upload_expression({ 'destination_ref': workspace_name + '/' + expression_name, 'source_dir': result_directory, 'alignment_ref': alignment_ref, 'tool_used': self.tool_used, 'tool_version': self.tool_version })['obj_ref'] return expression_ref def _save_kbasesets_expression(self, result_directory, alignment_ref, workspace_name, genome_ref, gtf_file, expression_suffix): """ _save_kbasesets_expression: save Expression object to workspace using ExpressionUtils and SetAPI """ log('start saving Expression object') alignment_info = self.ws.get_object_info3( {'objects': [{ "ref": alignment_ref }]}) 
alignment_object_name = alignment_info['infos'][0][1] # set expression name if re.match('.*_[Aa]lignment$', alignment_object_name): expression_name = re.sub('_[Aa]lignment$', expression_suffix, alignment_object_name) else: # assume user specified suffix expression_name = alignment_object_name + expression_suffix expression_ref = self.eu.upload_expression({ 'destination_ref': workspace_name + '/' + expression_name, 'source_dir': result_directory, 'alignment_ref': alignment_ref, 'tool_used': self.tool_used, 'tool_version': self.tool_version })['obj_ref'] return expression_ref def _save_rnaseq_expression_set(self, alignment_expression_map, alignment_set_ref, workspace_name, expression_set_name): """ _save_rnaseq_expression_set: save ExpressionSet object to workspace """ log('start saving ExpressionSet object') if isinstance(workspace_name, int) or workspace_name.isdigit(): workspace_id = workspace_name else: workspace_id = self.dfu.ws_name_to_id(workspace_name) expression_set_data = self._generate_expression_set_data( alignment_expression_map, alignment_set_ref, expression_set_name) object_type = 'KBaseRNASeq.RNASeqExpressionSet' save_object_params = { 'id': workspace_id, 'objects': [{ 'type': object_type, 'data': expression_set_data, 'name': expression_set_name }] } dfu_oi = self.dfu.save_objects(save_object_params)[0] expression_set_ref = str(dfu_oi[6]) + '/' + str(dfu_oi[0]) + '/' + str( dfu_oi[4]) return expression_set_ref def _save_kbasesets_expression_set(self, alignment_expression_map, alignment_set_ref, workspace_name, expression_set_name): """ _save_kbasesets_expression_set: save ExpressionSet object to workspace """ log('start saving ExpressionSet object') if isinstance(workspace_name, int) or workspace_name.isdigit(): workspace_id = workspace_name else: workspace_id = self.dfu.ws_name_to_id(workspace_name) expression_set_data = self._generate_expression_set_data( alignment_expression_map, alignment_set_ref, expression_set_name) object_type = 
'KBaseRNASeq.RNASeqExpressionSet' save_object_params = { 'id': workspace_id, 'objects': [{ 'type': object_type, 'data': expression_set_data, 'name': expression_set_name }] } dfu_oi = self.dfu.save_objects(save_object_params)[0] expression_set_ref = str(dfu_oi[6]) + '/' + str(dfu_oi[0]) + '/' + str( dfu_oi[4]) return expression_set_ref def _generate_report(self, obj_ref, workspace_name, result_directory, exprMatrix_FPKM_ref=None, exprMatrix_TPM_ref=None): """ _generate_report: generate summary report """ log('creating report') output_files = self._generate_output_file_list(result_directory) output_html_files = self._generate_html_report(result_directory, obj_ref) expression_object = self.ws.get_objects2( {'objects': [{ 'ref': obj_ref }]})['data'][0] expression_info = expression_object['info'] expression_data = expression_object['data'] expression_object_type = expression_info[2] if re.match('KBaseRNASeq.RNASeqExpression-\d+.\d+', expression_object_type): objects_created = [{ 'ref': obj_ref, 'description': 'Expression generated by Cufflinks' }] elif re.match('KBaseRNASeq.RNASeqExpressionSet-\d+.\d+', expression_object_type): objects_created = [{ 'ref': obj_ref, 'description': 'Expression generated by Cufflinks' }] elif re.match('KBaseSets.ExpressionSet-\d+.\d+', expression_object_type): objects_created = [{ 'ref': obj_ref, 'description': 'ExpressionSet generated by Cufflinks' }] items = expression_data['items'] for item in items: objects_created.append({ 'ref': item['ref'], 'description': 'Expression generated by Cufflinks' }) objects_created.append({ 'ref': exprMatrix_FPKM_ref, 'description': 'FPKM ExpressionMatrix generated by Cufflinks' }) objects_created.append({ 'ref': exprMatrix_TPM_ref, 'description': 'TPM ExpressionMatrix generated by Cufflinks' }) report_params = { 'message': '', 'workspace_name': workspace_name, 'file_links': output_files, 'objects_created': objects_created, 'html_links': output_html_files, 'direct_html_link_index': 0, 'html_window_height': 
366, 'report_object_name': 'kb_cufflinks_report_' + str(uuid.uuid4()) } kbase_report_client = KBaseReport(self.callback_url, token=self.token) output = kbase_report_client.create_extended_report(report_params) report_output = { 'report_name': output['name'], 'report_ref': output['ref'] } return report_output def _parse_FPKMtracking(self, filename, metric): result = {} pos1 = 0 if metric == 'FPKM': pos2 = 7 if metric == 'TPM': pos2 = 8 with open(filename) as f: next(f) for line in f: larr = line.split("\t") if larr[pos1] != "": try: result[larr[pos1]] = math.log(float(larr[pos2]) + 1, 2) except ValueError: result[larr[pos1]] = math.log(1, 2) return result def _generate_output_file_list(self, result_directory): """ _generate_output_file_list: zip result files and generate file_links for report """ log('Start packing result files') output_files = list() output_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(output_directory) result_file = os.path.join(output_directory, 'cufflinks_result.zip') with zipfile.ZipFile(result_file, 'w', zipfile.ZIP_DEFLATED, allowZip64=True) as zip_file: for root, dirs, files in os.walk(result_directory): for file in files: if not (file.endswith('.DS_Store')): zip_file.write( os.path.join(root, file), os.path.join(os.path.basename(root), file)) output_files.append({ 'path': result_file, 'name': os.path.basename(result_file), 'label': os.path.basename(result_file), 'description': 'File(s) generated by Cufflinks App' }) return output_files def _generate_expression_data(self, result_directory, alignment_ref, gtf_file, workspace_name, expression_suffix): """ _generate_expression_data: generate Expression object with cufflinks output files """ alignment_data_object = self.ws.get_objects2( {'objects': [{ 'ref': alignment_ref }]})['data'][0] # set expression name alignment_object_name = alignment_data_object['info'][1] if re.match('.*_[Aa]lignment$', alignment_object_name): expression_name = re.sub('_[Aa]lignment$', 
expression_suffix, alignment_object_name) else: # assume user specified suffix expression_name = alignment_object_name + expression_suffix expression_data = { 'id': expression_name, 'type': 'RNA-Seq', 'numerical_interpretation': 'FPKM', 'processing_comments': 'log2 Normalized', 'tool_used': self.tool_used, 'tool_version': self.tool_version } alignment_data = alignment_data_object['data'] condition = alignment_data.get('condition') expression_data.update({'condition': condition}) genome_id = alignment_data.get('genome_id') expression_data.update({'genome_id': genome_id}) read_sample_id = alignment_data.get('read_sample_id') expression_data.update( {'mapped_rnaseq_alignment': { read_sample_id: alignment_ref }}) exp_dict, tpm_exp_dict = self.parse_FPKMtracking_calc_TPM( os.path.join(result_directory, 'genes.fpkm_tracking')) expression_data.update({'expression_levels': exp_dict}) expression_data.update({'tpm_expression_levels': tpm_exp_dict}) handle = self.dfu.file_to_shock({ 'file_path': result_directory, 'pack': 'zip', 'make_handle': True })['handle'] expression_data.update({'file': handle}) return expression_data def _generate_expression_set_data(self, alignment_expression_map, alignment_set_ref, expression_set_name): """ _generate_expression_set_data: generate ExpressionSet object with cufflinks output files """ alignment_set_data_object = self.ws.get_objects2( {'objects': [{ 'ref': alignment_set_ref }]})['data'][0] alignment_set_data = alignment_set_data_object['data'] expression_set_data = { 'tool_used': self.tool_used, 'tool_version': self.tool_version, 'id': expression_set_name, 'alignmentSet_id': alignment_set_ref, 'genome_id': alignment_set_data.get('genome_id'), 'sampleset_id': alignment_set_data.get('sampleset_id') } sample_expression_ids = [] mapped_expression_objects = [] mapped_expression_ids = [] for alignment_expression in alignment_expression_map: alignment_ref = alignment_expression.get('alignment_ref') expression_ref = 
alignment_expression.get('expression_obj_ref') sample_expression_ids.append(expression_ref) mapped_expression_ids.append({alignment_ref: expression_ref}) alignment_name = self.ws.get_object_info( [{ "ref": alignment_ref }], includeMetadata=None)[0][1] expression_name = self.ws.get_object_info( [{ "ref": expression_ref }], includeMetadata=None)[0][1] mapped_expression_objects.append({alignment_name: expression_name}) expression_set_data['sample_expression_ids'] = sample_expression_ids expression_set_data[ 'mapped_expression_objects'] = mapped_expression_objects expression_set_data['mapped_expression_ids'] = mapped_expression_ids return expression_set_data def _process_alignment_set_object(self, params, alignment_object_type): """ _process_alignment_set_object: process KBaseRNASeq.RNASeqAlignmentSet type input object and KBaseSets.ReadsAlignmentSet type object """ log('start processing KBaseRNASeq.RNASeqAlignmentSet object or KBaseSets.ReadsAlignmentSet object' '\nparams:\n{}'.format(json.dumps(params, indent=1))) alignment_set_ref = params.get('alignment_set_ref') if re.match('^KBaseRNASeq.RNASeqAlignmentSet-\d*', alignment_object_type): params['gtf_file'] = self._get_gtf_file(alignment_set_ref) else: if not '/' in params['genome_ref']: params['genome_ref'] = params['workspace_name'] + '/' + params[ 'genome_ref'] params['gtf_file'] = self._get_gtf_file_from_genome_ref( params['genome_ref']) alignment_set = self.set_api.get_reads_alignment_set_v1({ 'ref': alignment_set_ref, 'include_item_info': 0, 'include_set_item_ref_paths': 1 }) mul_processor_params = [] for alignment in alignment_set["data"]["items"]: alignment_ref = alignment['ref_path'] alignment_upload_params = params.copy() alignment_upload_params['alignment_ref'] = alignment_ref mul_processor_params.append(alignment_upload_params) # use the following when you want to run the cmd sequentially # self._process_kbasesets_alignment_object(mul_processor_params[0]) cpus = min(params.get('num_threads'), 
multiprocessing.cpu_count()) pool = Pool(ncpus=cpus) log('running _process_alignment_object with {} cpus'.format(cpus)) alignment_expression_map = pool.map( self._process_kbasesets_alignment_object, mul_processor_params) result_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(result_directory) expression_items = list() for proc_alignment_return in alignment_expression_map: expression_obj_ref = proc_alignment_return.get( 'expression_obj_ref') alignment_ref = proc_alignment_return.get('alignment_ref') alignment_info = self.ws.get_object_info3({ 'objects': [{ "ref": alignment_ref }], 'includeMetadata': 1 }) condition = alignment_info['infos'][0][10]['condition'] expression_items.append({ "ref": expression_obj_ref, "label": condition, }) expression_name = self.ws.get_object_info( [{ "ref": expression_obj_ref }], includeMetadata=None)[0][1] self._run_command('cp -R {} {}'.format( proc_alignment_return.get('result_directory'), os.path.join(result_directory, expression_name))) expression_set = { "description": "generated by kb_cufflinks", "items": expression_items } expression_set_info = self.set_api.save_expression_set_v1({ "workspace": params['workspace_name'], "output_object_name": params['expression_set_name'], "data": expression_set }) returnVal = { 'result_directory': result_directory, 'expression_obj_ref': expression_set_info['set_ref'] } widget_params = { "output": params.get('expression_set_name'), "workspace": params.get('workspace_name') } returnVal.update(widget_params) return returnVal def _generate_output_object_name(self, params, alignment_object_type, alignment_object_name): """ Generates the output object name based on input object type and name and stores it in params with key equal to 'expression' or 'expression_set' based on whether the input object is an alignment or alignment_set. 
:param params: module input params :param alignment_object_type: input alignment object type :param alignment_object_name: input alignment object name :param alignment_object_data: input alignment object data """ expression_set_suffix = params['expression_set_suffix'] expression_suffix = params['expression_suffix'] if re.match('^KBaseRNASeq.RNASeqAlignment-\d*', alignment_object_type): if re.match('.*_[Aa]lignment$', alignment_object_name): params['expression_name'] = re.sub('_[Aa]lignment$', expression_suffix, alignment_object_name) else: # assume user specified suffix params[ 'expression_name'] = alignment_object_name + expression_suffix if re.match('^KBaseRNASeq.RNASeqAlignmentSet-\d*', alignment_object_type): if re.match('.*_[Aa]lignment_[Ss]et$', alignment_object_name): # set expression set name params['expression_set_name'] = re.sub('_[Aa]lignment_[Ss]et$', expression_set_suffix, alignment_object_name) else: # assume user specified suffix params[ 'expression_set_name'] = alignment_object_name + expression_set_suffix if re.match('^KBaseSets.ReadsAlignmentSet-\d*', alignment_object_type): if re.match('.*_[Aa]lignment_[Ss]et$', alignment_object_name): # set expression set name params['expression_set_name'] = re.sub('_[Aa]lignment_[Ss]et$', expression_set_suffix, alignment_object_name) else: # assume user specified suffix params[ 'expression_set_name'] = alignment_object_name + expression_set_suffix def _save_expression_matrix(self, expressionset_ref, workspace_name): """ _save_expression_matrix: save FPKM and TPM ExpressionMatrix """ log('start saving ExpressionMatrix object') expression_set_name = self.ws.get_object_info( [{ "ref": expressionset_ref }], includeMetadata=None)[0][1] output_obj_name_prefix = re.sub('_*[Ee]xpression_*[Ss]et', '', expression_set_name) upload_expression_matrix_params = { 'expressionset_ref': expressionset_ref, 'output_obj_name': output_obj_name_prefix, 'workspace_name': workspace_name } expression_matrix_refs = 
self.eu.get_expressionMatrix( upload_expression_matrix_params) return expression_matrix_refs def run_cufflinks_app(self, params): log('--->\nrunning CufflinksUtil.run_cufflinks_app\n' + 'params:\n{}'.format(json.dumps(params, indent=1))) self._validate_run_cufflinks_params(params) alignment_object_ref = params.get('alignment_object_ref') alignment_object_info = self.ws.get_object_info3( {"objects": [{ "ref": alignment_object_ref }]})['infos'][0] alignment_object_type = alignment_object_info[2] alignment_object_name = alignment_object_info[1] # get output object name self._generate_output_object_name(params, alignment_object_type, alignment_object_name) log('--->\nalignment object type: \n' + '{}'.format(alignment_object_type)) if re.match('^KBaseRNASeq.RNASeqAlignment-\d*', alignment_object_type): params.update({'alignment_ref': alignment_object_ref}) returnVal = self._process_rnaseq_alignment_object(params) report_output = self._generate_report( returnVal.get('expression_obj_ref'), params.get('workspace_name'), returnVal.get('result_directory')) returnVal.update(report_output) elif re.match('^KBaseRNASeq.RNASeqAlignmentSet-\d*', alignment_object_type) or \ re.match('^KBaseSets.ReadsAlignmentSet-\d*', alignment_object_type): params.update({'alignment_set_ref': alignment_object_ref}) returnVal = self._process_alignment_set_object( params, alignment_object_type) expression_matrix_refs = self._save_expression_matrix( returnVal['expression_obj_ref'], params.get('workspace_name')) returnVal.update(expression_matrix_refs) report_output = self._generate_report( returnVal['expression_obj_ref'], params.get('workspace_name'), returnVal['result_directory'], expression_matrix_refs['exprMatrix_FPKM_ref'], expression_matrix_refs['exprMatrix_TPM_ref']) returnVal.update(report_output) else: raise ValueError( 'None RNASeqAlignment type\nObject info:\n{}'.format( alignment_object_info)) return returnVal
def filter_contigs(self, ctx, params):
    """
    Main method

    Downloads the assembly as FASTA, keeps the contigs whose length is at
    least ``min_length``, uploads the filtered assembly, creates a report and
    finally runs a local BBTools RQCFilter pass on the downloaded FASTA.

    :param params: instance of type "ContigFilterParams" (Input parameter
       types) -> structure: parameter "workspace_name" of String,
       parameter "assembly_ref" of String, parameter "min_length" of Long
    :returns: instance of type "ContigFilterResults" (Output result types)
       -> structure: parameter "report_name" of String, parameter
       "report_ref" of String, parameter "filtered_assembly_ref" of
       String, parameter "n_total" of Long, parameter "n_remaining" of Long
    :raises ValueError: if a required parameter is missing or invalid
    """
    # ctx is the context object
    # return variables are: returnVal
    #BEGIN filter_contigs
    for name in ['min_length', 'assembly_ref', 'workspace_name']:
        if name not in params:
            raise ValueError('Parameter "' + name + '" is required but missing')
    if not isinstance(params['min_length'], int) or (params['min_length'] < 0):
        raise ValueError('Min length must be a non-negative integer')
    try:
        text_types = basestring  # Python 2: covers both str and unicode
    except NameError:
        text_types = str  # Python 3
    if not isinstance(params['assembly_ref'], text_types) or not len(params['assembly_ref']):
        raise ValueError('Pass in a valid assembly reference string')

    # Single-argument print(...) calls behave identically on Python 2 and 3.
    print("params['min_length']=%s, params['assembly_ref']=%s" %
          (params['min_length'], params['assembly_ref']))
    print("params['workspace_name']=%s" % (params['workspace_name']))
    print("self.callback_url=%s" % self.callback_url)
    print("self.scratch=%s" % self.scratch)
    print("config = ")
    pprint.pprint(self.config)

    ###############
    # Download ref
    ##############
    assembly_util = AssemblyUtil(self.callback_url)
    fasta_file = assembly_util.get_assembly_as_fasta({'ref': params['assembly_ref']})
    print("assembly fasta file = ")
    pprint.pprint(fasta_file)

    ###################################
    # Real business - filter the contig
    ###################################
    min_length = params['min_length']

    # Contigs at least min_length long
    good_contigs = []
    # total contigs regardless of length
    n_total = 0
    for record in SeqIO.parse(fasta_file['path'], 'fasta'):
        n_total += 1
        if len(record.seq) >= min_length:
            good_contigs.append(record)
    n_remaining = len(good_contigs)

    ##################
    # Output
    ##################
    workspace_name = params['workspace_name']
    filtered_path = os.path.join(self.scratch, 'filtered.fasta')
    SeqIO.write(good_contigs, filtered_path, 'fasta')

    # Upload the filtered data to the workspace
    new_ref = assembly_util.save_assembly_from_fasta({
        'file': {
            'path': filtered_path
        },
        'workspace_name': workspace_name,
        'assembly_name': fasta_file['assembly_name']
    })

    ################
    # Reporting
    ################
    text_message = "".join([
        'Filtered assembly to ',
        str(n_remaining),
        ' contigs out of ',
        str(n_total)
    ])

    # Data for creating the report, referencing the assembly we uploaded
    report_data = {
        'objects_created': [
            {'ref': new_ref, 'description': 'Filtered contigs'}
        ],
        'text_message': text_message
    }

    # Initialize the report
    kbase_report = KBaseReport(self.callback_url)
    report = kbase_report.create({
        'report': report_data,
        'workspace_name': workspace_name
    })

    # Return the report reference and name in our results
    returnVal = {
        'report_ref': report['ref'],
        'report_name': report['name'],
        'n_total': n_total,
        'n_remaining': n_remaining,
        'filtered_assembly_ref': new_ref
    }

    ###############
    # BBtools test
    ###############
    bbtools = BBTools(self.callback_url, service_ver='beta')

    # set up input files
    print("file['path'] = ")
    print(fasta_file['path'])
    rqc_filter_input = {
        "reads_file": fasta_file['path']
    }

    # set up parameters (there are many more options, see BBTools.spec)
    rqc_filter_params = {
        "qtrim": "rl",
        "maxns": 3,
        "minlength": 40,
        "maxmem": 5
    }

    # run the local RQCFilter function; its result is only printed and does
    # not influence returnVal
    result = bbtools.run_RQCFilter_local(rqc_filter_input, rqc_filter_params)
    print("result = ")
    pprint.pprint(result)
    #END filter_contigs

    # At some point might do deeper type checking...
    if not isinstance(returnVal, dict):
        raise ValueError('Method filter_contigs return value ' +
                         'returnVal is not type dict as required.')
    # return the results
    return [returnVal]
def predict_amr_phenotype(self, ctx, params):
    """
    The AMR prediction function specification

    For every input assembly the k-mers (k=31) are extracted from its FASTA
    file and fed to each SCM and CART model available for the species; the
    predictions are rendered into a report.

    :param params: instance of type "AMRPredictionParams" (Structure of
       input data for AMR prediction) -> structure: parameter
       "assembly_input_ref" of type "assembly_ref", parameter "species"
       of String, parameter "workspace_name" of String
    :returns: instance of type "AMRPredictionResults" (Structure of
       output of AMR prediction) -> structure: parameter "report_name" of
       String, parameter "report_ref" of String
    :raises ValueError: if a required parameter is missing or invalid
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN predict_amr_phenotype

    # Input validation
    for name in ['assembly_input_ref', 'species', 'workspace_name']:
        if name not in params:
            raise ValueError('Parameter "' + name + '" is required but missing')
    if not (isinstance(params['assembly_input_ref'], string_types) or
            isinstance(params['assembly_input_ref'], list)) or \
            not len(params['assembly_input_ref']):
        raise ValueError('Pass in a valid assembly reference string(s)')

    # Extract params: a single reference is treated as a one-element list
    if not isinstance(params["assembly_input_ref"], list):
        assemblies = [params["assembly_input_ref"]]
    else:
        assemblies = params["assembly_input_ref"]
    species = params["species"]

    # Get models for species
    scm_models = self.get_models_by_algorithm_and_species("scm", species)
    cart_models = self.get_models_by_algorithm_and_species("cart", species)
    models_by_algorithm = (("scm", scm_models), ("cart", cart_models))

    # Process assemblies
    predictions = {}
    assembly_util = AssemblyUtil(self.callback_url)
    for assembly_ref in assemblies:
        # Get the fasta file path and other info
        assembly = assembly_util.get_assembly_as_fasta({'ref': assembly_ref})

        # Extract the k-mers
        kmers = self.extract_kmers(assembly["path"], k=31)
        # Single pre-formatted argument: same output on Python 2 and 3.
        print("Kmers -- %s : %s" % (assembly["assembly_name"], len(kmers)))

        # Make predictions with every model of every algorithm
        assembly_predictions = {}
        for algorithm, models in models_by_algorithm:
            print("%s models" % algorithm.upper())
            algorithm_predictions = {}
            for antibiotic, model in models.items():
                p = model.predict(kmers)
                algorithm_predictions[antibiotic] = {"label": p[0], "why": p[1]}
            assembly_predictions[algorithm] = algorithm_predictions

        # Keyed by assembly name, so two assemblies sharing a name
        # overwrite each other.
        predictions[assembly["assembly_name"]] = assembly_predictions

    # Generate report
    text_message = "This is a test report for kover amr (text)"

    # Data for creating the report
    report_data = {
        'objects_created': [],
        'text_message': text_message,
        'direct_html': generate_html_prediction_report(predictions, species)
    }

    # Initialize the report
    kbase_report = KBaseReport(self.callback_url)
    # NOTE(review): 'file_links' is passed to create(); confirm this method
    # honours it (it may belong to create_extended_report instead).
    report = kbase_report.create({
        'report': report_data,
        'workspace_name': params['workspace_name'],
        'file_links': generate_csv_prediction_report(predictions, species, self.scratch)
    })

    output = {
        'report_ref': report['ref'],
        'report_name': report['name']
    }

    #END predict_amr_phenotype

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method predict_amr_phenotype return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def get_assembly_info(self, ref):
    """
    Fetch an assembly as FASTA and load all of its contigs.

    :param ref: reference of the assembly to download
    :return: dict with 'name' (the assembly name reported by AssemblyUtil)
             and 'contigs' (list of records parsed from the FASTA file)
    """
    fasta = AssemblyUtil(self.callback_url).get_assembly_as_fasta({'ref': ref})
    assembly_name = fasta["assembly_name"]
    contig_records = list(SeqIO.parse(fasta['path'], 'fasta'))
    return {'name': assembly_name, 'contigs': contig_records}
class ProkkaUtils: def __init__(self, config): self.scratch = config["scratch"] self.ctx = config['ctx']; self.callback_url = config["SDK_CALLBACK_URL"] self.ws_client = workspaceService(config["workspace-url"]) self.gfu = GenomeFileUtil(self.callback_url) self.au = AssemblyUtil(self.callback_url) self.kbr = KBaseReport(self.callback_url) self.dfu = DataFileUtil(self.callback_url) self.genome_api = GenomeAnnotationAPI(self.callback_url) self.sso_ref = None self.sso_event = None self.ec_to_sso = {} self.output_workspace = None @staticmethod def _get_input_value(params, key): """Get value of key after checking for its existence :param params: Params dictionary haystack :param key: Key to search in Params :return: Parameter Value :raises ValueError: raises an exception if the key doesn"t exist """ if not key in params: raise ValueError("Parameter " + key + " should be set in input parameters") return params[key] @staticmethod def _get_qualifier_value(qualifier): """Get first qualifier from the list of qualifiers :param qualifier: list contents of the qualifier from BCBio GFF Tools :return: first element in the list """ return qualifier[0] if (qualifier and len(qualifier) > 0) else None def download_seed_data(self): """Download Seed Data Ontology, and set the gene_ontology reference (sso_ref) and the create a table from ec numbers to sso (ec_to_sso) :return: None """ # Download Seed Reference Data sso_ret = self.ws_client.get_objects([{"ref": "KBaseOntology/seed_subsystem_ontology"}])[0] sso = sso_ret["data"] for sso_id in sso["term_hash"]: sso_name = sso["term_hash"][sso_id]["name"] if "(EC " in sso_name and sso_name.endswith(")"): ec = sso_name[sso_name.index("(EC ") + 4: -1].strip() sso_list = self.ec_to_sso.get(ec, None) if not sso_list: sso_list = [] self.ec_to_sso[ec] = sso_list sso_list.append(sso["term_hash"][sso_id]) print("EC found in SSO: " + str(len(self.ec_to_sso))) sso_info = sso_ret["info"] sso_ref = str(sso_info[6]) + "/" + str(sso_info[0]) + "/" + 
str(sso_info[4]) with open("/kb/module/work/seed_so.json", "w") as outfile: json.dump(sso, outfile, sort_keys=True, indent=4) self.sso_ref = sso_ref def inspect_assembly(self, assembly_meta, assembly_ref): """Check to see if assembly has too many contigs and might not be a metagenome or non prokaryotic dataset :param assembly_meta: information about the assembly reference :param assembly_ref: the assembly reference number :return: a tuple containing gc_content and dna_size """ gc_content = float(assembly_meta.get("GC content")) dna_size = int(assembly_meta.get("Size")) n_contigs = 0 if "N Contigs" in assembly_meta: n_contigs = int(assembly_meta.get("N Contigs")) else: contig = self.ws_client.get_objects([{"ref": assembly_ref}])[0] n_contigs = len(contig["data"]["contigs"]) if n_contigs >= 30000: message = """ Hmmm. There are over 30,000 contigs in this Assembly. It looks like you are trying to run Prokka on a metagenome or non-prokaryotic data set. If this is a metagenome data set we recommend using an App like MaxBin to first bin the contigs into genome-like bins. These bins can then be individually annotated as a single genome using Prokka. If this data comes from a Eukaryotic sample, KBase does not currently have an annotation app designed for Eukaryotes. Alternatively, you can try reducing the number of contigs using a filter app.") raise ValueError("Too many contigs for Prokka. See logs for details and suggestions """ print(message) raise ValueError("Too many contigs for Prokka. 
See logs for details and suggestions") assembly_info = namedtuple("assembly_info", "gc_content dna_size") return assembly_info(gc_content, dna_size) @staticmethod def create_renamed_assembly(assembly_fasta_filepath): """Rename records to be in the format of contig_N and output a new fasta file :param assembly_fasta_filepath: :return: The path to the fasta file with renamed contigs the number of contigs, the mapping from old ids to new ids, and the contigs as SeqRecords """ records = [] new_ids_to_old = {} contig_counter = 0 for record in SeqIO.parse(assembly_fasta_filepath, "fasta"): contig_counter += 1 old_id = record.id new_id = "contig_" + str(contig_counter) sequence = record.seq # it has type "Seq" record = SeqRecord(sequence, id=new_id, description="(" + old_id + ")") records.append(record) new_ids_to_old[new_id] = old_id renamed_assembly_fasta_filepath = assembly_fasta_filepath + "_renamed.fna" SeqIO.write(records, renamed_assembly_fasta_filepath, "fasta") renamed_assembly = namedtuple("renamed_assembly", "filepath contig_counter new_ids_to_old records") return renamed_assembly(renamed_assembly_fasta_filepath, contig_counter, new_ids_to_old, records) def run_prokka(self, params, subject_fasta_filepath): """Run Prokka :param params: Prokka parameters :param subject_fasta_filepath: The contigs or genes to run prokka against :return: The directory with all of the prokka output files """ output_dir = "/kb/module/work/tmp/temp_" + str(uuid.uuid4()) # --kingdom [X] Annotation mode: Archaea|Bacteria|Mitochondria|Viruses (default "Bacteria") kingdom = "Bacteria" if "kingdom" in params and params["kingdom"]: kingdom = params["kingdom"] prokka_cmd_list = ["perl", "/kb/prokka/bin/prokka", "--outdir", output_dir, "--prefix", "mygenome", "--kingdom", kingdom] # --genus [X] Genus name (triggers to use --usegenus) if "genus" in params and params["genus"]: prokka_cmd_list.extend(["--genus", str(params["genus"]), "--usegenus"]) # --gcode [N] Genetic code / Translation table 
(set if --kingdom is set) (default "0") if "gcode" in params and params["gcode"]: prokka_cmd_list.extend(["--gcode", str(params["gcode"])]) else: prokka_cmd_list.extend(["--gcode", "0"]) # --gram [X] Gram: -/neg +/pos (default "") if "gram" in params and params["gram"]: raise ValueError("gram parameter is not supported in current Prokka installation") # --metagenome Improve gene predictions for highly fragmented genomes (default OFF) if "metagenome" in params and params["metagenome"] == 1: prokka_cmd_list.append("--metagenome") # --rawproduct Do not clean up /product annotation (default OFF) if "rawproduct" in params and params["rawproduct"] == 1: prokka_cmd_list.append("--rawproduct") # --fast Fast mode - skip CDS /product searching (default OFF) if "fast" in params and params["fast"] == 1: prokka_cmd_list.append("--fast") # --mincontiglen [N] Minimum contig size [NCBI needs 200] (default "1") if "mincontiglen" in params and params["mincontiglen"]: prokka_cmd_list.extend(["--mincontiglen", str(params["mincontiglen"])]) # --evalue [n.n] Similarity e-value cut-off (default "1e-06") if "evalue" in params and params["evalue"]: prokka_cmd_list.extend(["--evalue", str(params["evalue"])]) # --rfam Enable searching for ncRNAs with Infernal+Rfam (SLOW!) 
(default "0") if "rfam" in params and params["rfam"] == 1: prokka_cmd_list.append("--rfam") # --norrna Don"t run rRNA search (default OFF) if "norrna" in params and params["norrna"] == 1: prokka_cmd_list.append("--norrna") # --notrna Don"t run tRNA search (default OFF) if "notrna" in params and params["notrna"] == 1: prokka_cmd_list.append("--notrna") prokka_cmd_list.append(subject_fasta_filepath) print("Prokka command line: " + str(prokka_cmd_list)) try: check_output(prokka_cmd_list, cwd=self.scratch) except CalledProcessError as e: pprint(e) return output_dir @staticmethod def retrieve_prokka_results(output_dir): """ Gather up the relevant prokka results, load the records from the results files :param output_dir: :return: Sequences from the .faa .ffn files and the gff_filepath """ faa_file = output_dir + "/mygenome.faa" cds_to_prot = {} for record in SeqIO.parse(faa_file, "fasta"): cds_to_prot[record.id] = str(record.seq) ffn_file = output_dir + "/mygenome.ffn" cds_to_dna = {} for record in SeqIO.parse(ffn_file, "fasta"): cds_to_dna[record.id] = str(record.seq) gff_file = output_dir + "/mygenome.gff" if not os.path.isfile(gff_file): raise ValueError("PROKKA output GFF file is not found") prokka_results = namedtuple("prokka_results", "cds_to_prot cds_to_dna gff_filepath") return prokka_results(cds_to_prot, cds_to_dna, gff_file) def parse_prokka_results(self, **prokka_parse_parameters): """ Go through the prokka results from the input contigs and then create the features, mrnas and cdss components of the KbaseGenome.Genome object :param prokka_parse_parameters: gff_filepath, mappings :return: Genome:features Genome:cdss Genome:mrnas report_message of genes discovered """ gff_filepath = prokka_parse_parameters["gff_filepath"] cds_to_dna = prokka_parse_parameters["cds_to_dna"] cds_to_prot = prokka_parse_parameters["cds_to_prot"] new_ids_to_old = prokka_parse_parameters["new_ids_to_old"] evidence = self.make_annotation_evidence() cdss = [] mrnas = [] features = [] 
non_hypothetical = 0 genes_with_ec = 0 genes_with_sso = 0 prot_lengths = [] with open(gff_filepath, "r") as f1: for rec in GFF.parse(f1): contig_id = new_ids_to_old[str(rec.id)] for ft in rec.features: loc = ft.location min_pos = int(loc.start) + 1 max_pos = int(loc.end) strand = "+" if loc.strand == 1 else "-" flen = max_pos - min_pos + 1 start = min_pos if strand == "+" else max_pos location = [[contig_id, start, strand, flen]] qualifiers = ft.qualifiers generated_id = self._get_qualifier_value(qualifiers.get("ID")) if not generated_id: # Skipping feature with no ID (mostly repeat regions) continue dna = cds_to_dna.get(generated_id) if not dna: # Skipping feature with no DNA (mostly repeat regions) continue name = self._get_qualifier_value(qualifiers.get("Name")) ec = self._get_qualifier_value(qualifiers.get("eC_number")) gene = self._get_qualifier_value(qualifiers.get("gene")) product = self._get_qualifier_value(qualifiers.get("product")) fid = generated_id aliases = [] if name: aliases.append(name) if gene: aliases.append(gene) if ec: aliases.append(ec) genes_with_ec += 1 md5 = hashlib.md5(dna).hexdigest() feature = {"id": fid, "location": location, "type": "gene", "aliases": aliases, "md5": md5, "dna_sequence": dna, "dna_sequence_length": len(dna), } if product: feature["function"] = product if product != "hypothetical protein": non_hypothetical += 1 if ec and ec in self.ec_to_sso: sso_list = self.ec_to_sso[ec] sso_terms = {} for sso_item in sso_list: sso_terms[sso_item["id"]] = {"id": sso_item["id"], "evidence": [evidence], "term_name": sso_item["name"], "ontology_ref": self.sso_ref, "term_lineage": []} feature["ontology_terms"] = {"SSO": sso_terms} genes_with_sso += 1 cds = None mrna = None prot = cds_to_prot.get(generated_id) if prot: cds_id = fid + "_CDS" mrna_id = fid + "_mRNA" prot_len = len(prot) prot_lengths.append(prot_len) feature["protein_translation"] = prot feature["protein_translation_length"] = prot_len feature["cdss"] = [cds_id] 
feature["mrnas"] = [mrna_id] cds = {"id": cds_id, "location": location, "md5": md5, "parent_gene": fid, "parent_mrna": mrna_id, "function": (product if product else ""), "ontology_terms": {}, "protein_translation": prot, "protein_translation_length": prot_len, "aliases": aliases} mrna = {"id": mrna_id, "location": location, "md5": md5, "parent_gene": fid, "cds": cds_id} features.append(feature) if cds: cdss.append(cds) if mrna: mrnas.append(mrna) # Prepare report report = "" report += "Number of genes predicted: " + str(len(features)) + "\n" report += "Number of protein coding genes: " + str(len(prot_lengths)) + "\n" report += "Number of genes with non-hypothetical function: " + str(non_hypothetical) + "\n" report += "Number of genes with EC-number: " + str(genes_with_ec) + "\n" report += "Number of genes with Seed Subsystem Ontology: " + str(genes_with_sso) + "\n" report += "Average protein length: " + str(int(sum(prot_lengths) / float(len(prot_lengths)))) + " aa.\n" annotated_assembly = namedtuple("annotated_assembly", "features cdss mrnas report_message") return annotated_assembly(features, cdss, mrnas, report) def get_new_annotations(self, gff_filepath): """ :param gff_filepath: A dictionary of ids with products and ec numbers :return: """ evidence = self.make_annotation_evidence() genome = {} with open(gff_filepath, "r") as f: for rec in GFF.parse(f): gid = rec.id gene_features = {"id": id} for feature in rec.features: qualifiers = feature.qualifiers if "product" in qualifiers: gene_features["function"] = " ".join(qualifiers["product"]) if "eC_number" in qualifiers: ec_numbers = qualifiers["eC_number"] sso_terms = dict() for ec in ec_numbers: sso_list = self.ec_to_sso.get(ec, []) for sso_item in sso_list: sso_terms[sso_item["id"]] = {"id": sso_item["id"], "evidence": [evidence], "term_name": sso_item["name"], "ontology_ref": self.sso_ref, "term_lineage": []} gene_features["ontology_terms"] = sso_terms genome[gid] = gene_features return genome def 
write_genome_to_fasta(self, genome_data): """ :param genome_data: :return: """ fasta_for_prokka_filepath = os.path.join(self.scratch, "features_" + str(uuid.uuid4()) + ".fasta") count = 0 with open(fasta_for_prokka_filepath, "w") as f: for item in genome_data["data"]["features"]: if "id" not in item or "dna_sequence" not in item: print("This feature does not have a valid dna sequence.") else: f.write(">" + item["id"] + "\n" + item["dna_sequence"] + "\n") count += 1 print("Finished printing to" + fasta_for_prokka_filepath) if os.stat(fasta_for_prokka_filepath).st_size == 0: raise Exception( "This genome does not contain features with DNA_SEQUENCES. Fasta file is empty.") return fasta_for_prokka_filepath def make_sso_ontology_event(self): """ :param sso_ref: Reference to the annotation library set :return: Ontology_event to be appended to the list of genome ontology events """ time_string = str( datetime.datetime.fromtimestamp(time.time()).strftime('%Y_%m_%d_%H_%M_%S')) yml_text = open('/kb/module/kbase.yml').read() version = re.search("module-version:\n\W+(.+)\n", yml_text).group(1) return { "method": "Prokka Annotation", "method_version": version, "timestamp": time_string, "id": "SSO", "ontology_ref": self.sso_ref } def make_annotation_evidence(self): """ :param sso_ref: Reference to the annotation library set :return: Ontology_event to be appended to the list of genome ontology events """ time_string = str( datetime.datetime.fromtimestamp(time.time()).strftime('%Y_%m_%d_%H_%M_%S')) yml_text = open('/kb/module/kbase.yml').read() version = re.search("module-version:\n\W+(.+)\n", yml_text).group(1) return { "method": "Prokka Annotation (Evidence)", "method_version": version, "timestamp": time_string, } def create_genome_ontology_fields(self, genome_data): # Make sure ontologies_events exist sso_event = self.make_sso_ontology_event() ontology_event_index = 0 if 'ontology_events' in genome_data['data']: genome_data['data']['ontology_events'].append(sso_event) 
ontology_event_index += len(genome_data['data']['ontology_events']) - 1 else: genome_data['data']['ontology_events'] = [sso_event] genome_obj_modified = namedtuple('genome_obj_modified', 'genome_data ontology_event_index') return genome_obj_modified(genome_data, ontology_event_index) @staticmethod def old_genome_ontologies(feature, new_ontology): if "ontology_terms" not in feature: feature["ontology_terms"] = {"SSO": {}} if "SSO" not in feature["ontology_terms"]: feature["ontology_terms"]["SSO"] = {} for key in new_ontology.keys(): feature["ontology_terms"]["SSO"][key] = new_ontology[key] return feature @staticmethod def new_genome_ontologies(feature, new_ontology, ontology_event_index): if "ontology_terms" not in feature: feature["ontology_terms"] = {"SSO": {}} if "SSO" not in feature["ontology_terms"]: feature["ontology_terms"]["SSO"] = {} for key in new_ontology.keys(): id = new_ontology[key]["id"] if id in feature["ontology_terms"]["SSO"]: feature["ontology_terms"]["SSO"][id].append(ontology_event_index) else: feature["ontology_terms"]["SSO"][id] = [ontology_event_index] return feature def annotate_genome_with_new_annotations(self, **annotation_args): """ :param annotation_args: genome_data, new_annotations from prokka, and the output_genome_name :type :return: """ genome_data = annotation_args["genome_data"] new_annotations = annotation_args["new_annotations"] new_genome = False if 'feature_counts' in genome_data['data']: new_genome = True genome_obj_modified = self.create_genome_ontology_fields(genome_data) genome_data = genome_obj_modified.genome_data ontology_event_index = genome_obj_modified.ontology_event_index stats = {"current_functions": len(genome_data["data"]["features"]), "new_functions": 0, "found_functions": 0, "new_ontologies": 0} function_report_filepath = os.path.join(self.scratch, "ontology_report") ontology_report_filepath = os.path.join(self.scratch, "function_report") onto_r = open(function_report_filepath, "w") func_r = 
open(ontology_report_filepath, "w") func_r.write("function_id current_function new_function\n") onto_r.write("function_id current_ontology new_ontology\n") for i, feature in enumerate(genome_data["data"]["features"]): fid = feature["id"] current_function = feature.get("function", "") current_functions = feature.get("functions", []) current_ontology = feature.get("ontology_terms", None) new_function = "" new_ontology = dict() if fid in new_annotations: # Set Function new_function = new_annotations[fid].get("function", "") if new_function and "hypothetical protein" not in new_function: if (new_function != current_function and new_function not in current_functions): stats['new_functions'] += 1 genome_data["data"]["features"][i]["function"] = new_function genome_data["data"]["features"][i]["functions"] = [new_function] stats['found_functions'] += 1 # Set Ontologies new_ontology = new_annotations[fid].get("ontology_terms", None) if new_ontology: stats['new_ontologies'] += 1 if new_genome: genome_data["data"]["features"][i] = self. \ new_genome_ontologies(feature, new_ontology, ontology_event_index) else: genome_data["data"]["features"][i] = self. 
\ old_genome_ontologies(feature, new_ontology) if current_function: func_r.write(json.dumps([fid, [current_function], [new_function]]) + "\n") else: func_r.write(json.dumps([fid, current_functions, [new_function]]) + "\n") onto_r.write(json.dumps([fid, current_ontology, new_ontology]) + "\n") func_r.close() onto_r.close() info = self.gfu.save_one_genome({"workspace": self.output_workspace, "name": annotation_args["output_genome_name"], "data": genome_data["data"], "provenance": self.ctx.provenance()})["info"] genome_ref = str(info[6]) + "/" + str(info[0]) + "/" + str(info[4]) annotated_genome = namedtuple("annotated_genome", "genome_ref function_report_filepath ontology_report_filepath stats") return annotated_genome(genome_ref, function_report_filepath, ontology_report_filepath, stats) def upload_file(self, filepath, message="Annotation report generated by kb_prokka"): """ Upload a file to shock :param filepath: File to upload :param message: Optional Upload Message :return: """ output_file_shock_id = self.dfu.file_to_shock({"file_path": filepath})["shock_id"] print("Uploaded filepath" + filepath + "to shock and got id" + output_file_shock_id) return {"shock_id": output_file_shock_id, "name": os.path.basename(filepath), "label": os.path.basename(filepath), "description": message} def report_annotated_genome(self, genome): """ Create report output with newly reannotated genome, and some stats :param genome: Reannotated Genome Reference, Report Files and Stats :return: Reference to Report Object """ genome_ref = genome.genome_ref stats = genome.stats file_links = [self.upload_file(genome.ontology_report_filepath), self.upload_file(genome.function_report_filepath)] report_message = ("Genome Ref:{0}\n" "Number of features sent into prokka:{1}\n" "New functions found:{2}\n" "Ontology terms found:{3}\n" ).format(genome_ref, stats["current_functions"], stats["new_functions"], stats["new_ontologies"]) report_info = self.kbr.create_extended_report( {"message": 
report_message, "objects_created": [{"ref": genome_ref, "description": "Annotated genome"}], "file_links": file_links, "report_object_name": "kb_prokka_report_" + str(uuid.uuid4()), "workspace_name": self.output_workspace }) return {"output_genome_ref": genome_ref, "report_name": report_info["name"], "report_ref": report_info["ref"]} def annotate_genome(self, params): """ User input an existing genome to re-annotate. :param params: Reference to the genome, Output File Name, UI Parameters :return: Report with Reannotated Genome and Stats about it """ self.download_seed_data() self.output_workspace = params["output_workspace"] genome_ref = self._get_input_value(params, "object_ref") output_name = self._get_input_value(params, "output_genome_name") # genome_data = self.dfu.get_objects({"object_refs": [genome_ref]})["data"][0] genome_data = \ self.genome_api.get_genome_v1({"genomes": [{"ref": genome_ref}], 'downgrade': 0})[ "genomes"][0] fasta_for_prokka_filepath = self.write_genome_to_fasta(genome_data) output_dir = self.run_prokka(params, fasta_for_prokka_filepath) prokka_results = self.retrieve_prokka_results(output_dir) new_annotations = self.get_new_annotations(prokka_results.gff_filepath) annotated_genome = self.annotate_genome_with_new_annotations(genome_data=genome_data, new_annotations=new_annotations, output_genome_name=output_name) return self.report_annotated_genome(annotated_genome) def annotate_assembly(self, params, assembly_info): """ Annotate an assembly with Prokka. The steps include to download the assembly as a fasta file, rename the contigs, run prokka against the contigs, parse the results, and finally, create and upload a genome object. 
:param params: object reference, output_genome_name and output_workspace :param assembly_info: Information used to determine if the assembly is too big :return: Report with newly annotated assembly as a genome, and stats about it """ self.download_seed_data() self.output_workspace = params["output_workspace"] assembly_ref = self._get_input_value(params, "object_ref") output_genome_name = self._get_input_value(params, "output_genome_name") output_workspace = self._get_input_value(params, "output_workspace") assembly_info = self.inspect_assembly(assembly_info[10], assembly_ref) orig_fasta_file = self.au.get_assembly_as_fasta({"ref": assembly_ref})["path"] # Rename Assembly and Keep Track of Old Contigs renamed_assembly = self.create_renamed_assembly(orig_fasta_file) # Run Prokka with the modified, renamed fasta file output_dir = self.run_prokka(params, renamed_assembly.filepath) # Prokka_results prokka_results = self.retrieve_prokka_results(output_dir) # Parse Results annotated_assembly = self.parse_prokka_results(gff_filepath=prokka_results.gff_filepath, cds_to_dna=prokka_results.cds_to_dna, cds_to_prot=prokka_results.cds_to_dna, new_ids_to_old=renamed_assembly.new_ids_to_old) # Force defaults for optional parameters that may be set to None scientific_name = 'Unknown' if 'scientific_name' in params and params['scientific_name']: scientific_name = params['scientific_name'] domain = "Bacteria" if 'kingdom' in params and params['kingdom']: domain = params['kingdom'] gcode = 0 if 'gcode' in params and params['gcode']: gcode = params['gcode'] genome = {"id": "Unknown", "features": annotated_assembly.features, "scientific_name": scientific_name, "domain": domain, "genetic_code": gcode, "assembly_ref": assembly_ref, "cdss": annotated_assembly.cdss, "mrnas": annotated_assembly.mrnas, "source": "PROKKA annotation pipeline", "gc_content": assembly_info.gc_content, "dna_size": assembly_info.dna_size, "reference_annotation": 0} info = self.gfu.save_one_genome({"workspace": 
output_workspace, "name": output_genome_name, "data": genome, "provenance": self.ctx.provenance()})["info"] genome_ref = str(info[6]) + "/" + str(info[0]) + "/" + str(info[4]) report_message = "Genome saved to: " + output_workspace + "/" + \ output_genome_name + "\n" + annotated_assembly.report_message report_info = self.kbr.create_extended_report( {"message": report_message, "objects_created": [{"ref": genome_ref, "description": "Annotated genome"}], "report_object_name": "kb_prokka_report_" + str(uuid.uuid4()), "workspace_name": output_workspace }) return {"output_genome_ref": genome_ref, "report_name": report_info["name"], "report_ref": report_info["ref"]}
def __init__(self, genome=None, callbackURL=''):
    """
    Download the genome's assembly and render contigs and features into self._contents.

    For each contig, a contig header is added, then the "+" strand features, then
    the "-" strand features, then the contig sequence.

    :param genome: genome annotation object; must provide get_assembly(), get_taxon(),
                   get_id(), get_proteins(), get_features() and get_feature_ids()
    :param callbackURL: URL handed to AssemblyUtil to fetch the assembly as FASTA
    """
    self._contents = StringIO.StringIO()
    self._ga = genome

    print('downloading assembly')
    au = AssemblyUtil(callbackURL)
    assembly_info = self._ga.get_assembly().get_info()
    assembly_ref = str(assembly_info['workspace_id']) + '/' + str(
        assembly_info['object_id']) + '/' + str(assembly_info['version'])
    print('Assembly reference = ' + assembly_ref)
    assembly_file_path = au.get_assembly_as_fasta({'ref': assembly_ref})['path']

    print('extracting taxonomy information')
    self._taxa = self._ga.get_taxon()
    self._tax_lineage = self._taxa.get_scientific_lineage()

    print('assembling feature and protein data')
    self._genome_name = str(self._ga.get_id())
    self._proteins = self._ga.get_proteins()
    self._features = self._ga.get_features()

    print('writing file')

    # read in fasta file to build the contig index
    contigs = {}
    for record in SeqIO.parse(assembly_file_path, "fasta"):
        contigs[record.id] = {
            'length': len(record.seq),
            'sequence': str(record.seq)
        }
    self._contigs = contigs

    # contig ids ordered by decreasing length
    contig_ids_by_length = sorted(contigs,
                                  key=lambda contig_id: contigs[contig_id]['length'],
                                  reverse=True)

    # organize features by location
    feature_ids_by_region = self._ga.get_feature_ids(
        group_by="region")["by_region"]

    def region_start(region):
        # Region keys look like "<start>-<end>"; order them by numeric start.
        return int(region.split("-")[0])

    # flatten the last level of the results to get a contiguous list per contig/strand
    feature_ids_by_contig = {}
    for cid in contig_ids_by_length:
        feature_ids_by_contig[cid] = {}
        if cid not in feature_ids_by_region:
            continue
        for strand in ("+", "-"):
            sorted_ids = []
            if strand in feature_ids_by_region[cid]:
                strand_regions = feature_ids_by_region[cid][strand]
                # key= replaces the Python-2-only cmp= argument; same ordering.
                for region in sorted(strand_regions.keys(), key=region_start):
                    for fid in self._sort_feature_ids(strand_regions[region]):
                        sorted_ids.append(fid)
            feature_ids_by_contig[cid][strand] = sorted_ids

    # Walk contigs in the length-sorted order computed above. Iterating the dict
    # directly loses that order on interpreters without ordered dicts.
    for cid in contig_ids_by_length:
        # add a header for the contig
        self._add_contig_header(cid)

        # add positive strand features, then minus strand features
        for strand in ("+", "-"):
            for fid in feature_ids_by_contig[cid].get(strand, []):
                self._add_feature(fid)

        self._add_contig_sequence(cid)
def KButil_Build_InSilico_Metagenomes_with_Grinder(self, ctx, params):
    """
    Use Grinder to generate in silico shotgun metagenomes from a set of Genome objects.

    Steps: parse the per-genome abundance table, fetch each genome's assembly as FASTA
    and join its contigs with runs of N, run Grinder, upload each generated FASTQ as a
    reads library (plus a ReadsSet when more than one library is produced), and build a
    report. Validation problems are collected and returned in the report message instead
    of running Grinder.

    :param params: instance of type
       "KButil_Build_InSilico_Metagenomes_with_Grinder_Params"
       (KButil_Build_InSilico_Metagenomes_with_Grinder() ** ** Use Grinder
       to generate in silico shotgun metagenomes) -> structure: parameter
       "workspace_name" of type "workspace_name" (** The workspace object
       refs are of form: ** ** objects = ws.get_objects([{'ref':
       params['workspace_id']+'/'+params['obj_name']}]) ** ** "ref" means
       the entire name combining the workspace id and the object name **
       "id" is a numerical identifier of the workspace or object, and
       should just be used for workspace ** "name" is a string identifier
       of a workspace or object. This is received from Narrative.),
       parameter "input_refs" of type "data_obj_ref", parameter
       "output_name" of type "data_obj_name", parameter "desc" of String,
       parameter "num_reads_per_lib" of Long, parameter "population_percs"
       of String, parameter "read_len_mean" of Long, parameter
       "read_len_stddev" of Double, parameter "pairs_flag" of Long,
       parameter "mate_orientation" of String, parameter "insert_len_mean"
       of Long, parameter "insert_len_stddev" of Double, parameter
       "mutation_dist" of String, parameter "mutation_ratio" of String,
       parameter "qual_good" of Long, parameter "qual_bad" of Long,
       parameter "len_bias_flag" of Long, parameter "random_seed" of Long
    :returns: instance of type
       "KButil_Build_InSilico_Metagenomes_with_Grinder_Output" ->
       structure: parameter "report_name" of type "data_obj_name",
       parameter "report_ref" of type "data_obj_ref"
    :raises ValueError: on a missing required param, an unreadable or wrongly typed input
       object, a non-zero Grinder exit code, an empty generated library, or a failed upload
    """
    # ctx is the context object
    # return variables are: returnVal
    #BEGIN KButil_Build_InSilico_Metagenomes_with_Grinder

    #### STEP 0: basic init
    ##
    console = []
    invalid_msgs = []
    report_text = ''
    self.log(console, 'Running KButil_Build_InSilico_Metagenomes_with_Grinder(): ')
    self.log(console, "\n" + pformat(params))

    # Auth
    token = ctx['token']

    # API Clients
    #SERVICE_VER = 'dev'  # DEBUG
    SERVICE_VER = 'release'
    wsClient = workspaceService(self.workspaceURL, token=token)
    readsUtils_Client = ReadsUtils(url=self.callbackURL, token=ctx['token'])  # SDK local
    # SetAPI is reached through the service wizard (dynamic service); SDK local doesn't work
    setAPI_Client = SetAPI(url=self.serviceWizardURL, token=ctx['token'])
    auClient = AssemblyUtil(self.callbackURL, token=ctx['token'], service_ver=SERVICE_VER)
    dfu = DFUClient(self.callbackURL)

    # param checks
    required_params = [
        'workspace_name', 'input_refs', 'output_name', 'num_reads_per_lib',
        'population_percs', 'read_len_mean', 'read_len_stddev', 'pairs_flag',
        'mate_orientation', 'insert_len_mean', 'insert_len_stddev',
        'mutation_dist', 'mutation_ratio', 'qual_good', 'qual_bad',
        'len_bias_flag', 'random_seed'
    ]
    for arg in required_params:
        if arg not in params or params[arg] is None or params[arg] == '':
            raise ValueError("Must define required param: '" + arg + "'")

    # cast to str unpredictable numerical params (mostly used in string context)
    numerical_params = [
        'num_reads_per_lib', 'read_len_mean', 'read_len_stddev', 'pairs_flag',
        'insert_len_mean', 'insert_len_stddev', 'qual_good', 'qual_bad',
        'len_bias_flag', 'random_seed'
    ]
    for arg in numerical_params:
        if arg not in params or params[arg] is None or params[arg] == '':
            continue
        params[arg] = str(params[arg])

    # load provenance
    provenance = [{}]
    if 'provenance' in ctx:
        provenance = ctx['provenance']
    provenance[0]['input_ws_objects'] = []
    for input_ref in params['input_refs']:
        provenance[0]['input_ws_objects'].append(input_ref)

    # set the output paths
    timestamp = int(
        (datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds() * 1000)
    output_dir = os.path.join(self.scratch, 'output.' + str(timestamp))
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    html_output_dir = os.path.join(output_dir, 'html')
    if not os.path.exists(html_output_dir):
        os.makedirs(html_output_dir)

    #### STEP 1: Parse population_percs and write to file
    ##
    abundance_str = params['population_percs'].strip()
    abundance_file_path = os.path.join(output_dir, 'my_abundances.txt')
    abundance_config_num_libs = 0
    abundance_config_num_libs_set = False
    grinder_genome_ids = []
    header = []
    out_buf = []

    for row in abundance_str.split("\n"):
        # str.split() drops leading/trailing whitespace, so an indented row no longer
        # yields an empty first column that would be recorded as a genome id.
        cols = row.split()
        if not cols:
            continue  # blank line
        if cols[0].upper() == "GENOME":
            header.extend(cols)
            continue
        grinder_genome_ids.append(cols[0])
        self.log(console, "GRINDER GENOME ID: '" + cols[0] + "'")  # DEBUG
        out_row = []
        for col in cols:
            if col == '%':
                continue
            if col.endswith('%'):
                col = col.rstrip('%')
            out_row.append(col)
        out_buf.append("\t".join(out_row))
        num_samples = len(out_row) - 1  # first col is genome id
        if not abundance_config_num_libs_set:
            abundance_config_num_libs_set = True
            abundance_config_num_libs = num_samples
        elif num_samples != abundance_config_num_libs:
            invalid_msgs.append(
                "inconsistent number of samples in population_percs input field"
            )

    # data validation
    if abundance_config_num_libs == 0:
        invalid_msgs.append(
            "unable to find sample percentages in population_percs input field"
        )
    sample_sums = [0] * max(abundance_config_num_libs, 0)
    for abund_row_str in out_buf:
        abund_row = abund_row_str.split()
        for sample_i, abund in enumerate(abund_row[1:]):
            if sample_i >= len(sample_sums):
                break  # extra columns; row already reported as inconsistent above
            try:
                sample_sums[sample_i] += float(abund)
            except ValueError:
                # Report instead of crashing on a non-numeric percentage.
                invalid_msgs.append(
                    "non-numeric value '" + abund + "' for genome '" + abund_row[0] +
                    "' in population_percs input field")
    for sample_i, sample_sum in enumerate(sample_sums):
        if sample_sum < 99.5 or sample_sum > 100.5:
            # The header row is optional; fall back to an empty label without one.
            sample_label = header[sample_i + 1] if sample_i + 1 < len(header) else ''
            self.log(
                invalid_msgs, "Sample: " + str(sample_i + 1) + " " +
                sample_label +
                " proportions is not summing to 100.0. Summing to: " +
                str(sample_sum))

    if len(invalid_msgs) == 0:
        with open(abundance_file_path, 'w') as abundance_fh:
            for out_line in out_buf:
                abundance_fh.write(out_line + "\n")
        # DEBUG
        with open(abundance_file_path, 'r') as abundance_fh:
            for out_line in abundance_fh:
                out_line = out_line.rstrip()
                self.log(console, "ABUNDANCE_CONFIG: '" + out_line + "'")

    #### STEP 2: get genome scaffold sequences
    ##
    if len(invalid_msgs) == 0:
        genomes_src_db_file_path = os.path.join(output_dir, 'genomes.fna')
        read_buf_size = 65536
        write_buf_size = 65536
        accepted_input_types = ["KBaseGenomes.Genome"]
        genome_refs = params['input_refs']
        genome_obj_names = []
        genome_sci_names = []
        assembly_refs = []

        # positions within a workspace object_info tuple
        [
            OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I,
            WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I
        ] = range(11)

        for i, input_ref in enumerate(genome_refs):
            # genome obj info
            try:
                input_obj_info = wsClient.get_object_info_new(
                    {'objects': [{'ref': input_ref}]})[0]
                input_obj_type = re.sub(
                    r'-[0-9]+\.[0-9]+$', "",
                    input_obj_info[TYPE_I])  # remove trailing version
                genome_obj_names.append(input_obj_info[NAME_I])
            except Exception as e:
                raise ValueError('Unable to get object from workspace: (' +
                                 input_ref + ')' + str(e))
            if input_obj_type not in accepted_input_types:
                raise ValueError("Input object of type '" + input_obj_type +
                                 "' not accepted.  Must be one of " +
                                 ", ".join(accepted_input_types))

            # genome obj data
            try:
                genome_obj = wsClient.get_objects([{'ref': input_ref}])[0]['data']
                genome_sci_names.append(genome_obj['scientific_name'])
            except Exception:
                raise ValueError("unable to fetch genome: " + input_ref)

            # Get assembly_refs
            genome_label = "Genome " + genome_obj_names[i] + " (ref:" + input_ref + ") " + \
                           genome_sci_names[i]
            assembly_ref = genome_obj.get('assembly_ref')
            contigset_ref = genome_obj.get('contigset_ref')
            if contigset_ref is None and assembly_ref is None:
                msg = genome_label + \
                      " MISSING BOTH contigset_ref AND assembly_ref.  Cannot process.  Exiting."
                self.log(console, msg)
                self.log(invalid_msgs, msg)
                continue
            elif assembly_ref is not None:
                msg = genome_label + " USING assembly_ref: " + str(assembly_ref)
                self.log(console, msg)
                assembly_refs.append(assembly_ref)
            else:
                msg = genome_label + " USING contigset_ref: " + str(contigset_ref)
                self.log(console, msg)
                assembly_refs.append(contigset_ref)

    # get fastas for scaffolds
    if len(invalid_msgs) == 0:
        contig_file_paths = []
        for genome_i, input_ref in enumerate(genome_refs):
            contig_file = auClient.get_assembly_as_fasta({
                'ref': assembly_refs[genome_i]
            }).get('path')
            sys.stdout.flush()
            contig_file_path = dfu.unpack_file({'file_path': contig_file})['file_path']
            contig_file_paths.append(contig_file_path)

        # reformat FASTA IDs for Grinder: one record per genome, contigs joined by Ns
        with open(genomes_src_db_file_path, 'w', write_buf_size) as genomes_src_db_fh:
            for genome_i, contig_file_path in enumerate(contig_file_paths):
                contig_seqs = []
                seq_parts = []
                with open(contig_file_path, 'r', read_buf_size) as contig_fh:
                    for contig_line in contig_fh:
                        contig_line = contig_line.rstrip()
                        if contig_line.startswith('>'):
                            # header line closes the previous contig, if any
                            if seq_parts:
                                contig_seqs.append(''.join(seq_parts))
                                seq_parts = []
                        elif contig_line != '':
                            seq_parts.append(contig_line)
                if seq_parts:
                    contig_seqs.append(''.join(seq_parts))

                # write joined contigs to file
                # NOTE: Using "-exclude_chars" grinder opt on N to avoid contig joins
                genome_seq = "NNNNNNNNNN".join(contig_seqs)
                genome_seq = genome_seq.upper()  # grinder might require upper case?
                genomes_src_db_fh.write(">" + grinder_genome_ids[genome_i] + "\n")
                genomes_src_db_fh.write(genome_seq + "\n")

        # DEBUG: log each record header and the start of its first sequence line
        toggle = 0
        with open(genomes_src_db_file_path, 'r', write_buf_size) as genomes_src_db_fh:
            for contig_line in genomes_src_db_fh:
                contig_line = contig_line.rstrip()
                if contig_line.startswith('>'):
                    self.log(console, 'GENOMES_SRC_DB: ' + contig_line)
                    toggle = 0
                elif toggle == 0:
                    self.log(console, 'GENOMES_SRC_DB: ' + contig_line[0:50] + '...')
                    toggle += 1

    #### STEP 3: Run Grinder
    ##
    if len(invalid_msgs) == 0:
        cmd = [
            self.GRINDER,
            # output
            '-base_name', params['output_name'],
            '-output_dir', output_dir,
            # contigs input
            '-reference_file', genomes_src_db_file_path,
            # abundances
            '-abundance_file', abundance_file_path,
            # library size
            '-total_reads', str(params['num_reads_per_lib']),
            # num libraries (overridden by abundance file?)
            '-num_libraries', str(abundance_config_num_libs),
            # read and insert lens
            '-read_dist', str(params['read_len_mean']), 'normal',
            str(params['read_len_stddev']),
        ]
        if str(params['pairs_flag']) == '1':
            cmd.extend(['-insert_dist', str(params['insert_len_mean']), 'normal',
                        str(params['insert_len_stddev'])])
            # mate orientation
            cmd.extend(['-mate_orientation', params['mate_orientation']])
        cmd.extend([
            # genome len bias
            '-length_bias', str(params['len_bias_flag']),
            # mutation model
            '-mutation_dist', str(params['mutation_dist']),
            '-mutation_ratio', str(params['mutation_ratio']),
            # qual scores
            '-fastq_output', '1',
            '-qual_levels', str(params['qual_good']), str(params['qual_bad']),
            # skip contig joins
            '-exclude_chars', 'NX',
            # explicitly request bidirectional
            '-unidirectional', '0',
        ])
        # random seed
        if 'random_seed' in params and params['random_seed'] is not None \
                and params['random_seed'] != '':
            cmd.extend(['-random_seed', str(params['random_seed'])])

        # RUN
        cmd_str = " ".join(cmd)  # for logging only
        self.log(console, "===========================================")
        self.log(console, "RUNNING: " + cmd_str)
        self.log(console, "===========================================")

        # Run the argv list directly with no shell: several arguments come straight from
        # user parameters and must not be interpreted by a shell.
        cmdProcess = subprocess.Popen(cmd,
                                      stdout=subprocess.PIPE,
                                      stderr=subprocess.STDOUT,
                                      shell=False)
        outputlines = []
        while True:
            line = cmdProcess.stdout.readline()
            outputlines.append(line)
            if not line:
                break
            self.log(console, line.replace('\n', ''))

        cmdProcess.stdout.close()
        cmdProcess.wait()
        self.log(console, 'return code: ' + str(cmdProcess.returncode) + '\n')
        if cmdProcess.returncode != 0:
            raise ValueError('Error running kb_grinder, return code: ' +
                             str(cmdProcess.returncode) + '\n')

        # capture output for report and paths to out files
        report_text_buf = []
        struct_file_paths = []
        struct_file_names = []
        fastq_file_paths = []
        for out_line in outputlines:
            out_line = out_line.rstrip()
            if 'Community structure' in out_line:
                clean_line = out_line.lstrip()
                struct_file_path = re.split(r'\s+', clean_line)[3]
                struct_file_paths.append(struct_file_path)
                struct_file_names.append(struct_file_path.split('/')[-1])
                self.log(console, "STRUCT_FILE_NAME: '" +
                         struct_file_path.split('/')[-1])  # DEBUG
            elif 'FASTQ file' in out_line:
                clean_line = out_line.lstrip()
                fastq_file_paths.append(re.split(r'\s+', clean_line)[3])
            else:
                report_text_buf.append(out_line)
        report_text += "\n".join(report_text_buf)

    #### STEP 4: Upload Read Libs and create reads set
    ##
    if len(invalid_msgs) == 0:
        lib_obj_refs = []
        lib_obj_names = []
        readsSet_items = []
        paired = str(params['pairs_flag']) == '1'

        for sample_i, fastq_file_path in enumerate(fastq_file_paths):
            if not os.path.isfile(fastq_file_path) \
                    or os.path.getsize(fastq_file_path) == 0:
                raise ValueError("empty read lib generated: " + fastq_file_path)

            # lib obj name
            # NOTE(review): this first branch can never be taken inside a loop over
            # fastq_file_paths; it may have been meant as `== 1`. Left unchanged.
            if len(fastq_file_paths) == 0:
                output_obj_name = params['output_name']
            elif paired:
                output_obj_name = params['output_name'] + '-sample' + str(
                    sample_i + 1) + ".PairedEndLib"
            else:
                output_obj_name = params['output_name'] + '-sample' + str(
                    sample_i + 1) + ".SingleEndLib"
            lib_obj_names.append(output_obj_name)

            # upload lib and get obj ref
            self.log(console, 'Uploading trimmed paired reads: ' + output_obj_name)
            sequencing_tech = 'artificial reads'
            interleaved = 1 if paired else 0
            lib_obj_ref = readsUtils_Client.upload_reads({
                'wsname': str(params['workspace_name']),
                'name': output_obj_name,
                'fwd_file': fastq_file_path,
                'interleaved': interleaved,
                'sequencing_tech': sequencing_tech
            })['obj_ref']
            lib_obj_refs.append(lib_obj_ref)
            os.remove(fastq_file_path)  # free up disk

            # add to readsSet
            readsSet_items.append({
                'ref': lib_obj_ref,
                'label': output_obj_name
            })

        # create readsset
        readsSet_obj_ref = None
        if len(lib_obj_refs) > 1:
            readsSet_obj = {
                'description': "Grinder Metagenome from " + " ".join(genome_obj_names),
                'items': readsSet_items
            }
            readsSet_obj_name = params['output_name']
            readsSet_obj_ref = setAPI_Client.save_reads_set_v1({
                'workspace_name': params['workspace_name'],
                'output_object_name': readsSet_obj_name,
                'data': readsSet_obj
            })['set_ref']

    #### STEP 5: Build report
    ##
    reportName = 'kb_grinder_report_' + str(uuid.uuid4())
    reportObj = {
        'objects_created': [],
        'message': '',
        'direct_html': '',
        'file_links': [],
        'html_links': [],
        'workspace_name': params['workspace_name'],
        'report_object_name': reportName
    }

    # message: validation failures replace the Grinder output text
    if len(invalid_msgs) > 0:
        report_text = "\n".join(invalid_msgs)
    reportObj['message'] = report_text

    if len(invalid_msgs) == 0:
        # objs
        if readsSet_obj_ref is not None:
            reportObj['objects_created'].append({
                'ref': readsSet_obj_ref,
                'desc': params['output_name'] + " ReadsSet"
            })
        for lib_obj_i, lib_obj_ref in enumerate(lib_obj_refs):
            reportObj['objects_created'].append({
                'ref': lib_obj_ref,
                'desc': lib_obj_names[lib_obj_i]
            })

        # downloadable data
        for data_i, data_path in enumerate(struct_file_paths):
            try:
                upload_ret = dfu.file_to_shock({
                    'file_path': data_path,
                    'make_handle': 0
                })
            except Exception:
                raise ValueError('error uploading ' + data_path + ' file to shock')
            reportObj['file_links'].append({
                'shock_id': upload_ret['shock_id'],
                'name': struct_file_names[data_i],
                'label': struct_file_names[data_i]
            })

    # save report object
    reportClient = KBaseReport(self.callbackURL,
                               token=ctx['token'],
                               service_ver=SERVICE_VER)
    report_info = reportClient.create_extended_report(reportObj)

    returnVal = {
        'report_name': report_info['name'],
        'report_ref': report_info['ref']
    }
    #END KButil_Build_InSilico_Metagenomes_with_Grinder

    # At some point might do deeper type checking...
    if not isinstance(returnVal, dict):
        raise ValueError(
            'Method KButil_Build_InSilico_Metagenomes_with_Grinder return value ' +
            'returnVal is not type dict as required.')
    # return the results
    return [returnVal]
def stage_input(self, input_ref, fasta_file_extension):
    '''
    Stage input based on an input data reference for CheckM

    input_ref can be a reference to an Assembly or ContigSet, an AssemblySet,
    BinnedContigs, a Genome, or a GenomeSet.

    This method creates a directory in the scratch area with the set of Fasta
    files; names will have the fasta_file_extension parameter tacked on. It
    then concatenates them into one summary Fasta file via
    self.cat_fasta_files.

        ex:

        staged_input = stage_input('124/15/1', 'fna')

        staged_input
        {"input_dir": '...', "folder_suffix": '...', "all_seq_fasta": '...'}

    Raises ValueError when a client cannot be instantiated, an object cannot
    be fetched, a Fasta file is missing or empty, a Genome has neither an
    assembly_ref nor a contigset_ref, or the object type is not supported.
    '''
    # config
    #SERVICE_VER = 'dev'
    SERVICE_VER = 'release'

    # positions of the fields read from a workspace object_info tuple
    NAME_I = 1
    TYPE_I = 2

    # every staged Fasta file must hold at least this much sequence
    min_fasta_len = 1

    # generate a folder in scratch to hold the input
    suffix = str(int(time.time() * 1000))
    input_dir = os.path.join(self.scratch, 'bins_' + suffix)
    all_seq_fasta = os.path.join(
        self.scratch, 'all_sequences_' + suffix + '.' + fasta_file_extension)
    if not os.path.exists(input_dir):
        os.makedirs(input_dir)

    # look up the name and (version-less) type of the input object
    ws = Workspace(self.ws_url)
    input_info = ws.get_object_info3(
        {'objects': [{'ref': input_ref}]})['infos'][0]
    obj_name = input_info[NAME_I]
    type_name = input_info[TYPE_I].split('-')[0]

    # auClient
    try:
        auClient = AssemblyUtil(self.callbackURL,
                                token=self.ctx['token'],
                                service_ver=SERVICE_VER)
    except Exception as e:
        raise ValueError(
            'Unable to instantiate auClient with callbackURL: ' +
            self.callbackURL + ' ERROR: ' + str(e))

    # setAPI_Client (dynamic service, so it goes through the service wizard)
    try:
        setAPI_Client = SetAPI(url=self.serviceWizardURL,
                               token=self.ctx['token'])
    except Exception as e:
        raise ValueError(
            'Unable to instantiate setAPI_Client with serviceWizardURL: ' +
            self.serviceWizardURL + ' ERROR: ' + str(e))

    # mguClient
    try:
        mguClient = MetagenomeUtils(self.callbackURL,
                                    token=self.ctx['token'],
                                    service_ver=SERVICE_VER)
    except Exception as e:
        raise ValueError(
            'Unable to instantiate mguClient with callbackURL: ' +
            self.callbackURL + ' ERROR: ' + str(e))

    def stage_assembly_fasta(assembly_ref, name):
        # Download one Assembly/ContigSet into input_dir as
        # <name>.<extension> and make sure the file exists and is not empty.
        filename = os.path.join(input_dir, name + '.' + fasta_file_extension)
        auClient.get_assembly_as_fasta({
            'ref': assembly_ref,
            'filename': filename
        })
        if not os.path.isfile(filename):
            raise ValueError(
                'Error generating fasta file from an Assembly or ContigSet with AssemblyUtil'
            )
        if not self.fasta_seq_len_at_least(filename, min_fasta_len):
            raise ValueError(
                'Assembly or ContigSet is empty in filename: ' + str(filename))

    # Standard Single Assembly
    #
    if type_name in ('KBaseGenomeAnnotations.Assembly',
                     'KBaseGenomes.ContigSet'):
        stage_assembly_fasta(input_ref, obj_name)

    # AssemblySet
    #
    elif type_name == 'KBaseSets.AssemblySet':
        try:
            assemblySet_obj = setAPI_Client.get_assembly_set_v1({
                'ref': input_ref,
                'include_item_info': 1
            })
        except Exception as e:
            raise ValueError('Unable to get object from workspace: (' +
                             input_ref + ')' + str(e))

        # resolve every member's name first, so nothing is downloaded if
        # any member cannot be looked up
        named_assemblies = []
        for assembly_item in assemblySet_obj['data']['items']:
            this_assembly_ref = assembly_item['ref']
            try:
                this_assembly_info = ws.get_object_info_new(
                    {'objects': [{'ref': this_assembly_ref}]})[0]
                this_assembly_name = this_assembly_info[NAME_I]
            except Exception as e:
                raise ValueError('Unable to get object from workspace: (' +
                                 this_assembly_ref + '): ' + str(e))
            named_assemblies.append((this_assembly_ref, this_assembly_name))

        # create file data (name for file is what's reported in results)
        for assembly_ref, this_name in named_assemblies:
            stage_assembly_fasta(assembly_ref, this_name)

    # Binned Contigs
    #
    elif type_name == 'KBaseMetagenomes.BinnedContigs':
        # download the bins as fasta and make that folder the input folder
        bin_file_dir = mguClient.binned_contigs_to_file({
            'input_ref': input_ref,
            'save_to_shock': 0
        })['bin_file_directory']
        os.rename(bin_file_dir, input_dir)
        self.set_fasta_file_extensions(input_dir, fasta_file_extension)

        # make sure no fasta file is empty; only the top level is inspected
        for (dirpath, dirnames, filenames) in os.walk(input_dir):
            for fasta_file in filenames:
                fasta_path = os.path.join(input_dir, fasta_file)
                if not self.fasta_seq_len_at_least(fasta_path, min_fasta_len):
                    raise ValueError(
                        'Binned Assembly is empty for fasta_path: ' +
                        str(fasta_path))
            break

    # Genome and GenomeSet
    #
    elif type_name in ('KBaseGenomes.Genome', 'KBaseSearch.GenomeSet'):
        if type_name == 'KBaseGenomes.Genome':
            genomeSet_refs = [input_ref]
        else:
            # get genomeSet_refs from GenomeSet object
            try:
                genomeSet_object = ws.get_objects2(
                    {'objects': [{'ref': input_ref}]})['data'][0]['data']
            except Exception as e:
                raise ValueError('Unable to fetch ' + str(input_ref) +
                                 ' object from workspace: ' + str(e))

            genomeSet_refs = []
            for genome_id, element in genomeSet_object['elements'].items():
                genome_ref = element.get('ref')
                if genome_ref is None or genome_ref == '':
                    raise ValueError('genome_ref not found for genome_id: ' +
                                     str(genome_id) + ' in genomeSet: ' +
                                     str(input_ref))
                genomeSet_refs.append(genome_ref)

        # resolve each genome to (object name, assembly or contigset ref)
        named_assemblies = []
        for this_input_ref in genomeSet_refs:
            try:
                objects = ws.get_objects2(
                    {'objects': [{'ref': this_input_ref}]})['data']
                genome_obj = objects[0]['data']
                genome_obj_name = objects[0]['info'][NAME_I]
                genome_sci_name = genome_obj['scientific_name']
            except Exception as e:
                raise ValueError("unable to fetch genome: " +
                                 this_input_ref + ": " + str(e))

            # report the genome's own ref (not the enclosing set's ref)
            label = ("Genome " + genome_obj_name + " (ref:" +
                     this_input_ref + ") " + genome_sci_name)
            assembly_ref = genome_obj.get('assembly_ref')
            contigset_ref = genome_obj.get('contigset_ref')

            # assembly_ref takes precedence over contigset_ref
            if assembly_ref is not None:
                print(label + " USING assembly_ref: " + str(assembly_ref))
                named_assemblies.append((assembly_ref, genome_obj_name))
            elif contigset_ref is not None:
                print(label + " USING contigset_ref: " + str(contigset_ref))
                named_assemblies.append((contigset_ref, genome_obj_name))
            else:
                raise ValueError(
                    label +
                    " MISSING BOTH contigset_ref AND assembly_ref. Cannot process. Exiting."
                )

        # create file data (name for file is what's reported in results)
        for assembly_ref, this_name in named_assemblies:
            stage_assembly_fasta(assembly_ref, this_name)

    # Unknown type slipped through
    #
    else:
        raise ValueError(
            'Cannot stage fasta file input directory from type: ' + type_name)

    # create summary fasta file with all bins
    self.cat_fasta_files(input_dir, fasta_file_extension, all_seq_fasta)

    return {
        'input_dir': input_dir,
        'folder_suffix': suffix,
        'all_seq_fasta': all_seq_fasta
    }
def get_promoter_for_gene(self, ctx, params):
    """
    Write the promoter sequence of every FeatureSet gene to a FASTA file.

    For each element of the FeatureSet that is also a feature of the genome,
    the first location of that feature is used: on the '+' strand the
    promoter is the promoter_length bases that end at the location's start
    coordinate; on the '-' strand it is the reverse complement of the
    promoter_length bases that begin there. Windows are clipped to the
    contig. An HTML report containing the same sequences is also created.

    :param params: instance of type "get_promoter_for_gene_input" (Genome
       is a KBase genome Featureset is a KBase featureset Promoter_length
       is the length of promoter requested for all genes) -> structure:
       parameter "workspace_name" of String, parameter "genome_ref" of
       String, parameter "featureSet_ref" of String, parameter
       "promoter_length" of Long
    :returns: instance of String (path of the promoter FASTA file)
    :raises ValueError: if the HTML report cannot be uploaded to shock.
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN get_promoter_for_gene
    dfu = DataFileUtil(self.callback_url)

    # NOTE(review): the workspace URL is hard-coded to the appdev
    # environment; confirm whether it should come from configuration.
    ws = Workspace('https://appdev.kbase.us/services/ws')

    # Only the feature ids/locations and the assembly ref are needed.
    subset = ws.get_object_subset([{
        'included':
        ['/features/[*]/location', '/features/[*]/id', '/assembly_ref'],
        'ref':
        params['genome_ref']
    }])
    genome_data = subset[0]['data']
    features = genome_data['features']
    assembly_ref = {'ref': genome_data['assembly_ref']}

    featureSet = dfu.get_objects(
        {'object_refs': [params['featureSet_ref']]})['data'][0]['data']

    print('Downloading Assembly data as a Fasta file.')
    assemblyUtil = AssemblyUtil(self.callback_url)
    fasta_file = assemblyUtil.get_assembly_as_fasta(assembly_ref)

    # First genome feature for each id (same winner as a first-match scan).
    genome_features = {}
    for genome_feature in features:
        genome_features.setdefault(genome_feature['id'], genome_feature)

    # (feature id, first location) for each FeatureSet element present in
    # the genome, kept in FeatureSet order. Locations are indexed as
    # [contig id, start, strand, ...].
    targets = []
    for feature in featureSet['elements']:
        genome_feature = genome_features.get(feature)
        if genome_feature is None:
            print('Could not find feature ' + feature + 'in genome')
        else:
            targets.append((feature, genome_feature['location'][0]))

    # contig id -> positions in `targets`, so the FASTA is read only once.
    targets_by_contig = {}
    for index, (_, location) in enumerate(targets):
        targets_by_contig.setdefault(location[0], []).append(index)

    #TODO: might need to offset the coordinates by 1?
    fasta_entries = [[] for _ in targets]
    if targets:
        for record in SeqIO.parse(fasta_file['path'], 'fasta'):
            for index in targets_by_contig.get(record.id, ()):
                feature, location = targets[index]
                position = location[1]
                strand = location[2]
                if strand == '+':
                    end = position
                    # clamp the start: a negative start would wrap around
                    start = max(end - params['promoter_length'], 0)
                    promoter = str(record.seq[start:end]).upper()
                elif strand == '-':
                    start = position
                    # slice ends are exclusive, so len() keeps the last base
                    end = min(start + params['promoter_length'],
                              len(record.seq))
                    window = record.seq[start:end]
                    # Seq.reverse_complement also handles IUPAC ambiguity
                    # codes
                    promoter = str(window.reverse_complement()).upper()
                else:
                    print('Error on orientation')
                    continue
                fasta_entries[index].append(
                    ">" + feature + "\n" + promoter + "\n")

    prom = ''.join(''.join(entries) for entries in fasta_entries)

    promOutputPath = '/kb/module/work/tmp/promFile.fa'
    with open(promOutputPath, 'w') as promFile:
        promFile.write(prom)

    # Build a one-page HTML report in a fresh, timestamped directory.
    timestamp = int(
        (datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds() *
        1000)
    html_output_dir = os.path.join(self.shared_folder,
                                   'output_html.' + str(timestamp))
    if not os.path.exists(html_output_dir):
        os.makedirs(html_output_dir)
    html_file = 'promoter.html'
    output_html_file_path = os.path.join(html_output_dir, html_file)

    html_report_lines = ('<html><body>' + '<pre>' + prom + '</pre>' +
                         '</body></html>')
    # default buffering: the with-block flushes and closes the file
    with open(output_html_file_path, 'w') as html_handle:
        html_handle.write(html_report_lines)

    try:
        html_upload_ret = dfu.file_to_shock({
            'file_path': html_output_dir,
            'make_handle': 0,
            'pack': 'zip'
        })
    except Exception as e:
        raise ValueError('error uploading HTML file to shock: ' + str(e))

    reportName = 'identify_promoter_report_' + str(uuid.uuid4())
    # NOTE(review): 'direct_html_index' looks like a typo of
    # 'direct_html_link_index' (also set here); kept unchanged, confirm
    # before removing.
    reportObj = {
        'objects_created': [],
        'message': '',
        'direct_html': '',
        'direct_html_index': 0,
        'direct_html_link_index': 0,
        'file_links': [],
        'html_links': [{
            'shock_id': html_upload_ret['shock_id'],
            'name': html_file,
            'label': 'View'
        }],
        'html_window_height': 220,
        'workspace_name': params['workspace_name'],
        'report_object_name': reportName
    }

    report = KBaseReport(self.callback_url, token=ctx['token'])
    report.create_extended_report(reportObj)

    # The method returns the FASTA path rather than the report reference.
    #TODO: get rid of this html maybe and move into find_motifs
    output = promOutputPath
    #END get_promoter_for_gene

    # At some point might do deeper type checking...
    try:
        string_types = basestring  # Python 2
    except NameError:
        string_types = str  # Python 3
    if not isinstance(output, string_types):
        raise ValueError('Method get_promoter_for_gene return value ' +
                         'output is not type basestring as required.')
    # return the results
    return [output]
def filter_contigs(self, ctx, params):
    """
    Drop contigs shorter than a minimum length and save the rest as a new
    Assembly.

    :param ctx: the context object (not read by this method).
    :param params: dict with the keys "workspace_name",
       "assembly_input_ref" and "min_length" (anything int() can parse;
       must not be negative).
    :returns: a one-element list holding a dict with the keys
       "report_name", "report_ref", "assembly_output",
       "n_initial_contigs", "n_contigs_removed" and "n_contigs_remaining".
    :raises ValueError: if a parameter is missing, or min_length is not an
       integer or is negative.
    """
    # ctx is the context object
    # return variables are: returnVal
    #BEGIN filter_contigs

    # Print statements to stdout/stderr are captured and available as the App log
    print('Starting Filter Contigs function. Params=')
    pprint(params)

    # Step 1 - Parse/examine the parameters and catch any errors.
    # Narrative Apps pre-validate their inputs, but this function can also
    # be called directly, so every parameter is checked here.
    print('Validating parameters.')
    for required in ('workspace_name', 'assembly_input_ref', 'min_length'):
        if required not in params:
            raise ValueError('Parameter ' + required +
                             ' is not set in input arguments')
    workspace_name = params['workspace_name']
    assembly_input_ref = params['assembly_input_ref']
    min_length_orig = params['min_length']

    try:
        min_length = int(min_length_orig)
    except (ValueError, TypeError):
        # TypeError covers None and other non-numeric, non-string values
        raise ValueError('Cannot parse integer from min_length parameter (' +
                         str(min_length_orig) + ')')
    if min_length < 0:
        raise ValueError('min_length parameter cannot be negative (' +
                         str(min_length) + ')')

    # Step 2 - Download the input data as a Fasta file.
    # The return object gives us the path to the file that was created.
    print('Downloading Assembly data as a Fasta file.')
    assemblyUtil = AssemblyUtil(self.callback_url)
    fasta_file = assemblyUtil.get_assembly_as_fasta({'ref': assembly_input_ref})

    # Step 3 - Perform the filter operation, saving the good contigs to a
    # new fasta file.
    n_total = 0
    good_contigs = []
    for record in SeqIO.parse(fasta_file['path'], 'fasta'):
        n_total += 1
        if len(record.seq) >= min_length:
            good_contigs.append(record)
    n_remaining = len(good_contigs)

    summary = ('Filtered Assembly to ' + str(n_remaining) +
               ' contigs out of ' + str(n_total))
    print(summary)
    filtered_fasta_file = os.path.join(self.shared_folder, 'filtered.fasta')
    SeqIO.write(good_contigs, filtered_fasta_file, 'fasta')

    # Step 4 - Save the new Assembly back to the system
    print('Uploading filtered Assembly data.')
    new_assembly = assemblyUtil.save_assembly_from_fasta({
        'file': {'path': filtered_fasta_file},
        'workspace_name': workspace_name,
        'assembly_name': fasta_file['assembly_name']
    })

    # Step 5 - Build a Report and return
    reportObj = {
        'objects_created': [{'ref': new_assembly,
                             'description': 'Filtered contigs'}],
        'text_message': summary
    }
    report = KBaseReport(self.callback_url)
    report_info = report.create({'report': reportObj,
                                 'workspace_name': params['workspace_name']})

    # STEP 6: construct the output to send back
    output = {'report_name': report_info['name'],
              'report_ref': report_info['ref'],
              'assembly_output': new_assembly,
              'n_initial_contigs': n_total,
              'n_contigs_removed': n_total - n_remaining,
              'n_contigs_remaining': n_remaining
              }
    print('returning:' + pformat(output))

    #END filter_contigs

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method filter_contigs return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
class JNixonHelloContigsFilter:
    '''
    Module Name:
    JNixonHelloContigsFilter

    Module Description:
    A KBase module: JNixonHelloContigsFilter
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = "https://github.com/nixonpjoshua/JNixonHelloContigsFilter.git"
    GIT_COMMIT_HASH = "16a66ab4d699d973e210ff92163fbe763009e6d3"

    #BEGIN_CLASS_HEADER
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.scratch = config['scratch']
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        # NOTE: despite its name, this attribute holds an AssemblyUtil client.
        self.dfu = AssemblyUtil(self.callback_url)
        #END_CONSTRUCTOR
        pass

    def filter_contigs(self, ctx, workspace_name, contigset, minimum):
        """
        Keep the contigs longer than `minimum` and save them as a new
        Assembly named after the input assembly.

        :param workspace_name: instance of String
        :param contigset: instance of String
        :param minimum: instance of Long
        :returns: instance of type "FilterContigResults" -> structure:
           parameter "report_name" of String, parameter "report_ref" of
           String, parameter "assembly_ref" of String, parameter
           "contig_count" of Long, parameter "filtered_contig_count" of Long
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN filter_contigs
        print(workspace_name)
        print(contigset)
        print(minimum)

        print('about to get fasta')
        fasta_file = self.dfu.get_assembly_as_fasta({'ref': contigset})
        print('got fasta')

        # A list (not a set) keeps the contigs in their input order and does
        # not depend on the records being hashable.
        total_count = 0
        kept_contigs = []
        for contig in SeqIO.parse(fasta_file['path'], 'fasta'):
            total_count += 1
            if len(contig) > minimum:
                kept_contigs.append(contig)
        filtered_count = len(kept_contigs)

        filtered_file = os.path.join(self.scratch, 'filtered.fasta')
        SeqIO.write(kept_contigs, filtered_file, 'fasta')

        new_assembly = self.dfu.save_assembly_from_fasta({
            'file': {'path': filtered_file},
            'workspace_name': workspace_name,
            'assembly_name': fasta_file['assembly_name']
        })

        reportObj = {
            'objects_created': [{
                'ref': new_assembly,
                'description': 'Filtered contigs'
            }],
            'text_message': 'Filtered Assembly to ' + str(filtered_count) +
                            ' contigs out of ' + str(total_count)
        }
        report = KBaseReport(self.callback_url)
        report_info = report.create({
            'report': reportObj,
            'workspace_name': workspace_name
        })

        returnVal = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
            # promised by the documented FilterContigResults structure
            'assembly_ref': new_assembly,
            'contig_count': total_count,
            'filtered_contig_count': filtered_count
        }
        #END filter_contigs

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method filter_contigs return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def status(self, ctx):
        """Return a one-element list describing the module's state and version."""
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
def tandem_repeats(self, ctx, params):
    """
    Run Tandem Repeats Finder on an Assembly and publish its output files
    as a report.

    :param params: instance of type "TandemRepeatsParams" -> structure:
       parameter "assembly_input_ref" of type "assembly_ref" (a KBase ID
       reference to an Assembly data object), parameter "workspace_name"
       of String, parameter "match_weight" of Long, parameter
       "mismatch_weight" of Long, parameter "delta_weight" of Long,
       parameter "prob_pm" of Long, parameter "prob_pi" of Long, parameter
       "min_score" of Long, parameter "max_period_size" of Long. The seven
       numeric parameters are optional; when absent, None or '' the values
       2 7 7 80 10 50 500 are used.
    :returns: instance of type "TandemRepeatsResults" -> structure:
       parameter "report_name" of String, parameter "report_ref" of String
    :raises ValueError: if a required parameter is missing, a numeric
       parameter is not an integer, a needed file is missing or empty, or
       no HTML output is found after the run.
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN tandem_repeats

    # init
    #
    console = []
    self.log(console, 'Running run_TandemRepeats with params=')
    self.log(console, "\n" + pformat(params))

    #### do some basic checks
    #
    if 'workspace_name' not in params:
        raise ValueError('workspace_name parameter is required')
    if 'assembly_input_ref' not in params:
        raise ValueError('assembly_input_ref parameter is required')
    assembly_input_ref = params['assembly_input_ref']

    # Numeric options in the positional order of the command line,
    # e.g. trf yoursequence.txt 2 7 7 80 10 50 500 -f -d -m
    # Validated before the download so bad input fails fast.
    numeric_defaults = (
        ('match_weight', 2),
        ('mismatch_weight', 7),
        ('delta_weight', 7),
        ('prob_pm', 80),
        ('prob_pi', 10),
        ('min_score', 50),
        ('max_period_size', 500),
    )
    trf_numeric = []
    for option_name, default in numeric_defaults:
        value = params.get(option_name)
        if value is None or value == '':
            value = default
        try:
            trf_numeric.append(str(int(value)))
        except (ValueError, TypeError):
            raise ValueError('Parameter ' + option_name +
                             ' must be an integer (' + str(value) + ')')

    print('Downloading Assembly data as a Fasta file.')
    assemblyUtil = AssemblyUtil(self.callback_url)
    input_fasta_file = assemblyUtil.get_assembly_as_fasta(
        {'ref': assembly_input_ref})['path']
    print('PATH:  ' + str(input_fasta_file))

    # check for necessary files
    if not os.path.isfile(self.TRF_bin):
        raise ValueError("no such file '" + self.TRF_bin + "'")
    if not os.path.isfile(input_fasta_file):
        raise ValueError("no such file '" + input_fasta_file + "'")
    if not os.path.getsize(input_fasta_file) > 0:
        raise ValueError("empty file '" + input_fasta_file + "'")

    ### Construct the command
    #
    trf_options = [str(input_fasta_file)] + trf_numeric + ["-m", "-f", "-d"]
    trf_cmd = [self.TRF_bin] + trf_options

    # Run Tandem Repeats Finder
    #
    self.log(console, 'RUNNING TandemRepeatsFinder:')
    self.log(console, '    ' + ' '.join(trf_cmd))
    p = subprocess.Popen(trf_cmd, cwd=self.scratch, shell=False)
    p.wait()
    # NOTE(review): the exit status is reported as a contig count and is
    # not treated as a failure indicator - confirm against TRF's docs.
    self.log(console, 'Returned Contigs: ' + str(p.returncode))
    if p.returncode != 0:
        print("Number of Contigs Analyzed " + str(p.returncode))

    # Check that TandemRepeatsFinder produced output
    #
    # The process runs with cwd=self.scratch, so its output files are
    # looked up there under "<input file name>.<numeric options>".
    # NOTE(review): assumes TRF names its outputs after the input's base
    # name - confirm against TRF's docs.
    output_prefix = os.path.join(
        self.scratch,
        '.'.join([os.path.basename(str(input_fasta_file))] + trf_numeric))
    print(os.listdir(self.scratch))
    print("INPUT file =  " + str(input_fasta_file))
    html_file = output_prefix + ".summary.html"
    mask_file = output_prefix + ".mask"
    data_file = output_prefix + ".dat"

    # fall back to the ".1.html" name when there is no summary page
    if not os.path.isfile(html_file):
        html_file = output_prefix + ".1.html"
    if not os.path.isfile(html_file):
        raise ValueError("failed to create TandemRepeats output: " + html_file)
    elif not os.path.getsize(html_file) > 0:
        raise ValueError("created empty file for TandemRepeats output: " + html_file)

    # Upload results
    #
    self.log(console, "UPLOADING RESULTS")  # DEBUG
    with open(html_file, 'r') as html_file_handle:
        html_buf = html_file_handle.read()
    self.log(console, html_buf + "\n")
    self.log(console, "BUILDING REPORT")  # DEBUG

    # Create report obj
    #
    # NOTE(review): the link dicts use the key 'desc'; confirm whether the
    # report service expects 'description' instead.
    html_link = {
        'path': html_file,
        'name': os.path.basename(html_file),
        'desc': 'HTML file output'
    }
    file_links = [
        dict(html_link),
        {
            'path': mask_file,
            'name': os.path.basename(mask_file),
            'desc': 'Masked file output'
        },
        {
            'path': data_file,
            'name': os.path.basename(data_file),
            'desc': 'Data file output'
        },
    ]
    html_links = [html_link]

    reportName = 'trf_report_' + str(uuid.uuid4())
    reportObj = {
        'objects_created': [],
        'message': '',
        'direct_html': '',
        'direct_html_link_index': 0,
        'file_links': file_links,
        'html_links': html_links,
        'workspace_name': params['workspace_name'],
        'report_object_name': reportName
    }

    SERVICE_VER = 'release'
    reportClient = KBaseReport(self.callback_url,
                               token=ctx['token'],
                               service_ver=SERVICE_VER)
    report_info = reportClient.create_extended_report(reportObj)

    # Done
    #
    self.log(console, "BUILDING RETURN OBJECT")
    output = {
        'report_name': report_info['name'],
        'report_ref': report_info['ref']
    }
    self.log(console, "run_TandemRepeats DONE")
    #END tandem_repeats

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method tandem_repeats return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def do_assembly(self, assemblyRef, wsName):
    """
    Run the VirSorter wrapper script on an Assembly and save a text report.

    The assembly is downloaded as FASTA, moved into a fresh working
    directory under /kb/module/work/tmp, and passed to
    wrapper_phage_contigs_sorter_iPlant.pl. The report message contains the
    command line, the captured stdout/stderr, and the contents of
    VIRSorter_global-phage-signal.csv with commas turned into tabs.

    :param assemblyRef: workspace reference of the Assembly to analyse.
    :param wsName: name of the workspace the report is saved to.
    :returns: tuple (report name, report reference).
    :raises IOError: if the expected VirSorter output file is missing.
    """
    callback_url = os.environ['SDK_CALLBACK_URL']
    print("SDK_CALLBACK_URL " + callback_url)
    au = AssemblyUtil(callback_url)
    input_fasta_file = au.get_assembly_as_fasta({'ref': assemblyRef})

    # Working directory with the layout handed to the wrapper:
    # <wdir>/input/<fasta>
    newtmp = "/kb/module/work/tmp/tmp_" + self.create_random_string()
    os.mkdir(newtmp)
    os.mkdir(newtmp + "/input")
    newfasta = newtmp + "/input/" + os.path.basename(
        input_fasta_file['path'])
    print("newfasta " + newfasta)
    os.rename(input_fasta_file['path'], newfasta)

    # Argument list (no shell), so paths are never re-parsed by a shell.
    args = [
        "wrapper_phage_contigs_sorter_iPlant.pl", "--db", "2", "--fna",
        newfasta, "--wdir", newtmp
    ]
    print(str(args))
    cmdstring = " ".join(args)

    print("Executing")
    # universal_newlines=True yields text (not bytes) on Python 2 and 3.
    cmdProcess = subprocess.Popen(args,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE,
                                  universal_newlines=True)
    stdout, stderr = cmdProcess.communicate()
    print("Done " + str(cmdProcess))
    print(" stdout: " + stdout)
    print(" stderr: " + stderr)

    print('Saving report')
    kbr = KBaseReport(self.callback_url, service_ver='dev')

    virout = newtmp + "/" + "VIRSorter_global-phage-signal.csv"
    if not os.path.isfile(virout):
        # Surface the tool's own diagnostics instead of a bare open() error.
        raise IOError("VirSorter did not produce " + virout +
                      " (exit code " + str(cmdProcess.returncode) + "): " +
                      str(stderr))
    with open(virout, 'r') as myfile:
        data = myfile.read().replace('\n', '')
    print("wsName " + str(wsName))

    # Newlines were dropped above; start a new line before each "##" and
    # turn the comma-separated columns into tab-separated ones.
    data = data.replace(",", "\t")
    data = data.replace("##", "\n##")

    report = ("cmdstring: " + str(cmdstring) + " stdout: " + str(stdout) +
              " stderr: " + str(stderr))
    report = report + "\n\n***** VirSorter output *****\n" + data

    report_data = {
        'message': report,
        'objects_created': None,
        'direct_html_link_index': None,
        'html_links': None,
        'report_object_name': 'kb_virsorter_' + str(uuid.uuid4()),
        'workspace_name': wsName
    }
    print("report_data")
    print(str(report_data))
    report_info = kbr.create_extended_report(report_data)

    reportName = report_info['name']
    reportRef = report_info['ref']
    return reportName, reportRef