def download_genome_to_json_files(token, genome_ref, target_dir):
    """Dump a genome object and the objects it references into ``target_dir``.

    The genome is fetched through DataFileUtil (callback URL taken from the
    ``SDK_CALLBACK_URL`` environment variable).  Its data and element 10 of its
    info list are written as ``genome.json`` / ``genome.meta.json``.  When
    present, the GenBank handle, the contig set or assembly (plus the
    assembly's FASTA file) and the assembly's taxon are downloaded as well, and
    the corresponding reference fields are rewritten in place to the local
    file names.

    :param token: auth token handed to DataFileUtil
    :param genome_ref: workspace reference of the genome object
    :param target_dir: output directory; created when it does not exist
    """
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    json_payloads = {}
    dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'], token=token,
                       service_ver='dev')

    def fetch(ref):
        # First (and only) entry returned for a single object reference.
        return dfu.get_objects({'object_refs': [ref]})['data'][0]

    def download_handle(handle_id, local_name):
        dfu.shock_to_file({
            'handle_id': handle_id,
            'file_path': os.path.join(target_dir, local_name)
        })

    genome_record = fetch(genome_ref)
    genome_obj = genome_record['data']
    json_payloads["genome.json"] = genome_obj
    json_payloads["genome.meta.json"] = genome_record['info'][10]

    if 'genbank_handle_ref' in genome_obj:
        download_handle(genome_obj['genbank_handle_ref'], "genome.gbk")
        genome_obj['genbank_handle_ref'] = "genome.gbk"

    if 'contigset_ref' in genome_obj:
        contigset_record = fetch(genome_obj['contigset_ref'])
        json_payloads["contigset.json"] = contigset_record['data']
        json_payloads["contigset.meta.json"] = contigset_record['info'][10]
        genome_obj['contigset_ref'] = "contigset.json"
    elif 'assembly_ref' in genome_obj:
        assembly_record = fetch(genome_obj['assembly_ref'])
        assembly_obj = assembly_record['data']
        json_payloads["assembly.json"] = assembly_obj
        json_payloads["assembly.meta.json"] = assembly_record['info'][10]
        genome_obj['assembly_ref'] = "assembly.json"

        download_handle(assembly_obj['fasta_handle_ref'], "assembly.fa")
        assembly_obj['fasta_handle_ref'] = "assembly.fa"
        assembly_obj['external_source_id'] = "assembly.fa"

        if 'taxon_ref' in assembly_obj:
            taxon_obj = fetch(assembly_obj['taxon_ref'])['data']
            json_payloads["taxon.json"] = taxon_obj
            assembly_obj['taxon_ref'] = "taxon.json"
            if 'taxon_ref' in genome_obj:
                genome_obj['taxon_ref'] = "taxon.json"
            # The parent taxon is not downloaded, so the link is blanked.
            taxon_obj['parent_taxon_ref'] = ""

    # The objects were edited in place above, so the dumped JSON carries the
    # rewritten (local file name) references.
    for local_name, payload in json_payloads.items():
        with open(os.path.join(target_dir, local_name), 'w') as out:
            json.dump(payload, out, sort_keys=True, indent=4)
def BuildFastaFromSequenceSet(self, ctx, params):
    """
    Write every sequence of a SequenceSet object to a FASTA file.

    :param params: instance of type "BuildSeqIn" -> structure: parameter
       "workspace_name" of String, parameter "SequenceSetRef" of String,
       parameter "fasta_outpath" of String
    :returns: instance of type "BuildSeqOut" -> structure: parameter
       "fasta_outpath" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN BuildFastaFromSequenceSet
    dfu = DataFileUtil(self.callback_url)
    get_objects_params = {'object_refs': [params['SequenceSetRef']]}
    SeqSet = dfu.get_objects(get_objects_params)['data'][0]['data']

    # The context manager closes (and flushes) the file even when a record is
    # missing 'sequence_id' or 'sequence' and the loop raises part-way through.
    with open(params['fasta_outpath'], 'w') as outFile:
        for s in SeqSet['sequences']:
            outFile.write('>' + s['sequence_id'] + '\n')
            outFile.write(s['sequence'] + '\n')

    output = {'fasta_outpath': params['fasta_outpath']}
    #END BuildFastaFromSequenceSet

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method BuildFastaFromSequenceSet return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def test_load_with_filter_and_options(self):
    # Uploads legacy_test.fna with min_contig_length=9 plus the optional
    # metadata fields, then checks the saved Assembly: only contig 's3' is
    # expected and every supplied option must be reflected in the object.
    impl = self.getImpl()
    scratch = self.__class__.cfg['scratch']
    fasta_name = "legacy_test.fna"
    shutil.copy(os.path.join("data", fasta_name), scratch)
    fasta_path = os.path.join(scratch, fasta_name)
    print('attempting upload')

    ctx = self.getContext()
    upload_params = {
        'file': {'path': fasta_path},
        'workspace_name': self.getWsName(),
        'assembly_name': 'FilteredAssembly',
        'min_contig_length': 9,
        'external_source': 'someplace',
        'external_source_id': 'id',
        'external_source_origination_date': 'sunday',
        'type': 'metagenome',
        'contig_info': {
            's3': {'is_circ': 0, 'description': 'somethin'}
        }
    }
    result = impl.save_assembly_from_fasta(ctx, upload_params)

    dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
    fetched = dfu.get_objects({'object_refs': [result[0]]})
    assembly = fetched['data'][0]['data']

    self.assertEqual(len(assembly['contigs']), 1)

    contig = assembly['contigs']['s3']
    expected_contig_fields = (
        ('md5', '4f339bd56e5f43ecb52e8682a790a111'),
        ('contig_id', 's3'),
        ('length', 18),
        ('is_circ', 0),
        ('description', 'somethin'),
    )
    for field, expected in expected_contig_fields:
        self.assertEqual(contig[field], expected)

    expected_assembly_fields = (
        ('dna_size', 18),
        ('gc_content', 0.44444),
        ('md5', 'eba4d1771060e19671a56832d159526e'),
        ('num_contigs', 1),
        ('type', 'metagenome'),
        ('external_source', 'someplace'),
        ('external_source_id', 'id'),
        ('external_source_origination_date', 'sunday'),
    )
    for field, expected in expected_assembly_fields:
        self.assertEqual(assembly[field], expected)
def generate_report(self, obj_refs, workspace_name):
    """
    generate_report: build a text summary of uploaded reads and save it as a report

    params:
    obj_refs: comma-separated workspace object references
              (return of upload_fastq_file)
    workspace_name: workspace name/ID the report is created in

    return:
    dict with 'report_name' and 'report_ref' of the created report
    """
    report_suffix = str(uuid.uuid4())
    ref_list = obj_refs.split(',')
    dfu = DataFileUtil(self.callback_url)

    # Collect the message fragments and join once at the end.
    fragments = ['Upload Finished\nUploaded Reads:\n']
    for ref in ref_list:
        fetched = dfu.get_objects({
            'object_refs': [ref],
            'ignore_errors': False
        })
        info = fetched.get('data')[0].get('info')
        fragments.append("Reads Name: " + str(info[1]) + '\n')
        fragments.append("Reads Type: " + str(info[2]) + '\n')

        reads_info = info[-1]
        if isinstance(reads_info, dict):
            # [1:-1] strips the outer braces of the JSON dump.
            pretty = json.dumps(reads_info, indent=1)[1:-1]
            fragments.append("Reads Info: " + pretty + '\n')

    report_client = KBaseReport(self.callback_url, token=self.token)
    created = report_client.create_extended_report({
        'message': ''.join(fragments),
        'workspace_name': workspace_name,
        'report_object_name': 'kb_upload_mothods_report_' + report_suffix
    })

    return {'report_name': created['name'], 'report_ref': created['ref']}
def test_filtered_everything(self):
    # With min_contig_length=500 no contig of legacy_test.fna is expected to
    # survive, so the saved Assembly must report zero size and no GC content.
    impl = self.getImpl()
    scratch = self.__class__.cfg['scratch']
    fasta_name = "legacy_test.fna"
    shutil.copy(os.path.join("data", fasta_name), scratch)
    fasta_path = os.path.join(scratch, fasta_name)
    print('attempting upload')

    ctx = self.getContext()
    upload_params = {
        'file': {'path': fasta_path},
        'workspace_name': self.getWsName(),
        'assembly_name': 'FilteredAssembly',
        'min_contig_length': 500
    }
    result = impl.save_assembly_from_fasta(ctx, upload_params)

    dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
    fetched = dfu.get_objects({'object_refs': [result[0]]})
    assembly = fetched['data'][0]['data']

    for field, expected in (('dna_size', 0),
                            ('gc_content', None),
                            ('num_contigs', 0)):
        self.assertEqual(assembly[field], expected)
def test_load_with_filter_and_options(self):
    # Saves legacy_test.fna with a 9 bp length filter and the optional
    # provenance/contig options, then verifies the stored Assembly object.
    util = self.getImpl()
    work_dir = self.__class__.cfg['scratch']
    source_name = "legacy_test.fna"
    shutil.copy(os.path.join("data", source_name), work_dir)
    local_fasta = os.path.join(work_dir, source_name)
    print('attempting upload')

    context = self.getContext()
    request = {'file': {'path': local_fasta}}
    request['workspace_name'] = self.getWsName()
    request['assembly_name'] = 'FilteredAssembly'
    request['min_contig_length'] = 9
    request['external_source'] = 'someplace'
    request['external_source_id'] = 'id'
    request['external_source_origination_date'] = 'sunday'
    request['type'] = 'metagenome'
    request['contig_info'] = {'s3': {'is_circ': 0,
                                     'description': 'somethin'}}
    result = util.save_assembly_from_fasta(context, request)

    dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
    saved = dfu.get_objects({'object_refs': [result[0]]})['data'][0]['data']
    check = self.assertEqual

    # Contig-level expectations: only 's3' should remain after filtering.
    check(len(saved['contigs']), 1)
    s3 = saved['contigs']['s3']
    check(s3['md5'], '4f339bd56e5f43ecb52e8682a790a111')
    check(s3['contig_id'], 's3')
    check(s3['length'], 18)
    check(s3['is_circ'], 0)
    check(s3['description'], 'somethin')

    # Assembly-level expectations.
    check(saved['dna_size'], 18)
    check(saved['gc_content'], 0.44444)
    check(saved['md5'], 'eba4d1771060e19671a56832d159526e')
    check(saved['num_contigs'], 1)
    check(saved['type'], 'metagenome')
    check(saved['external_source'], 'someplace')
    check(saved['external_source_id'], 'id')
    check(saved['external_source_origination_date'], 'sunday')
def download_fasta(refs, cb_url):
    """
    Download one FASTA file per workspace reference.

    Args:
      refs - list of workspace references; each must point at a Genome or an
        Assembly object (decided from the type string in the object info)
      cb_url - callback server URL
    Returns a list with the path of each downloaded fasta file, in the order
    of ``refs``.
    Raises TypeError for any other object type.
    """
    dfu = DataFileUtil(cb_url)
    assembly_util = AssemblyUtil(cb_url)
    fetched = dfu.get_objects({'object_refs': refs})['data']

    def assembly_ref_for(ws_obj, ws_ref):
        # Genomes are resolved to their assembly; assemblies are used as-is.
        type_name = ws_obj['info'][2]
        if 'KBaseGenomes.Genome' in type_name:
            return get_assembly_ref_from_genome(ws_ref, ws_obj)
        if 'KBaseGenomeAnnotations.Assembly' in type_name:
            return ws_ref
        raise TypeError('Invalid type ' + type_name + '. Must be an Assembly or Genome.')

    fasta_paths = []
    for ws_obj, ws_ref in zip(fetched, refs):
        target_ref = assembly_ref_for(ws_obj, ws_ref)
        download = assembly_util.get_assembly_as_fasta({'ref': target_ref})
        fasta_paths.append(download['path'])
    return fasta_paths
def DownloadMotifSet(self, ctx, params):
    """
    Write a MotifSet object to a file under /kb/module/work/tmp/.

    :param params: instance of type "DownloadParams" -> structure:
       parameter "ws_name" of String, parameter "source_ref" of String,
       parameter "format" of String
    :returns: instance of type "DownloadOutput" -> structure: parameter
       "destination_dir" of String

    Note: the body also reads params['outname'] (the output file name) and
    the returned dict actually carries the key 'destination_path'.
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN DownloadMotifSet
    dfu = DataFileUtil(self.callback_url)
    get_object_params = {'object_refs': [params['source_ref']]}

    # grab motifset object
    MSO = dfu.get_objects(get_object_params)['data'][0]['data']

    output = ''
    if params['format'] == 'MEME':
        output = MD.MotifSetToMEME(MSO)
    else:
        # Unsupported formats are only reported; an empty file is written.
        print('FORMAT IS NOT RECOGNIZED OR SUPPORTED')
        print('Supported Formats: MEME JASPAR TRANSFAC')
        print('Implemented: MEME')

    outFilePath = '/kb/module/work/tmp/' + params['outname']

    # The file must be opened for writing: the default mode is read-only, so
    # write() raised and a missing file raised even earlier.
    with open(outFilePath, 'w') as outFile:
        outFile.write(output)

    output = {'destination_path': outFilePath}
    # TODO: add this...
    #END DownloadMotifSet

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method DownloadMotifSet return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
class PangenomeDownload:
    """Exports a pangenome workspace object as TSV or Excel files."""

    def __init__(self, config):
        """Keep the config and create the PanGenomeAPI / DataFileUtil clients.

        Both clients are pointed at the ``SDK_CALLBACK_URL`` environment
        variable; ``config['scratch']`` is the directory output is written to.
        """
        self.cfg = config
        self.scratch = config['scratch']
        self.pga = PanGenomeAPI(os.environ['SDK_CALLBACK_URL'])
        self.dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])

    @staticmethod
    def validate_params(params, expected=("workspace_name", "pangenome_name")):
        """Raise ValueError when any key of ``expected`` is absent from ``params``.

        :param params: mapping of supplied parameters
        :param expected: iterable of required key names
        :raises ValueError: listing the missing keys (sorted, so the message
            is deterministic)
        """
        missing = set(expected) - set(params)
        if missing:
            raise ValueError("Required keys {} not in supplied parameters"
                             .format(", ".join(sorted(missing))))

    def _make_working_dir(self):
        """Create and return a fresh uniquely named directory under scratch."""
        working_dir = os.path.join(self.scratch,
                                   'pangenome-download-' + str(uuid.uuid4()))
        os.makedirs(working_dir)
        return working_dir

    def to_tsv(self, params):
        """Write the genome and ortholog tables as two tab-separated files.

        :param params: dict providing 'pangenome_ref'
        :returns: (pangenome id, {'genomes_path': ..., 'orthologs_path': ...})
        """
        files = {}
        working_dir = self._make_working_dir()

        pg_id, id_name_map, genome_df = self.make_genomes_df(
            params['pangenome_ref'])
        files['genomes_path'] = os.path.join(working_dir,
                                             pg_id + "_Genomes.tsv")
        genome_df.to_csv(files['genomes_path'], sep="\t")

        ortho_df = self.make_ortholog_df(params['pangenome_ref'], id_name_map)
        files['orthologs_path'] = os.path.join(working_dir,
                                               pg_id + "_Orthologs.tsv")
        ortho_df.to_csv(files['orthologs_path'], sep="\t")

        return pg_id, files

    def to_excel(self, params):
        """Write the genome and ortholog tables as sheets of one .xlsx file.

        :param params: dict providing 'pangenome_ref'
        :returns: (pangenome id, {'path': <xlsx path>})
        """
        files = {}
        working_dir = self._make_working_dir()

        pg_id, id_name_map, genome_df = self.make_genomes_df(
            params['pangenome_ref'])
        ortho_df = self.make_ortholog_df(params['pangenome_ref'], id_name_map)
        files['path'] = os.path.join(working_dir, pg_id + ".xlsx")

        # The context manager saves and closes the workbook, also on error;
        # ExcelWriter.save() no longer exists in pandas >= 2.0.
        with pandas.ExcelWriter(files['path']) as writer:
            genome_df.to_excel(writer, sheet_name="Genomes")
            ortho_df.to_excel(writer, sheet_name="Orthologs")

        return pg_id, files

    def make_genomes_df(self, pg_ref):
        """Fetch the pangenome summary via PanGenomeAPI.

        :returns: (summary['pangenome_id'], summary['genome_ref_name_map'],
            DataFrame built from summary['shared_family_map'])
        """
        summary = self.pga.compute_summary_from_pangenome({
            "pangenome_ref": pg_ref})
        return (summary['pangenome_id'],
                summary['genome_ref_name_map'],
                pandas.DataFrame(summary['shared_family_map']))

    def make_ortholog_df(self, pg_ref, id_name_map):
        """Build a DataFrame with one row per ortholog cluster.

        Fixed columns are "representative function", "type" and
        "protein sequence"; they are followed by one column per genome name
        (sorted) holding the ';'-joined first elements of the cluster's
        ortholog entries whose third element equals that genome's id.

        :param pg_ref: workspace reference of the pangenome object
        :param id_name_map: mapping of genome id -> genome name
        """
        pangen = self.dfu.get_objects(
            {'object_refs': [pg_ref]})['data'][0]['data']

        ortho = {}
        for cluster in pangen['orthologs']:
            row = {
                "representative function": cluster.get('function', ""),
                "type": cluster.get("type", ""),
                "protein sequence": cluster.get("protein_translation", ""),
            }
            for gid, name in id_name_map.items():
                row[name] = ";".join(
                    x[0] for x in cluster['orthologs'] if x[2] == gid)
            ortho[cluster['id']] = row

        columns = (["representative function", "type", "protein sequence"]
                   + sorted(id_name_map.values()))
        return pandas.DataFrame.from_dict(ortho, orient='index')[columns]

    def export(self, files, name, params):
        """Move ``files`` into a new package directory and package it.

        :param files: iterable of file paths to include
        :param name: prefix of the package directory name
        :param params: dict providing 'pangenome_ref'
        :returns: {'shock_id': ...} taken from the package_for_download result
        """
        export_package_dir = os.path.join(self.scratch,
                                          name + str(uuid.uuid4()))
        os.makedirs(export_package_dir)

        for path in files:
            shutil.move(path, os.path.join(export_package_dir,
                                           os.path.basename(path)))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [params['pangenome_ref']]
        })

        return {'shock_id': package_details['shock_id']}
class QualiMapRunner:
    """Runs QualiMap ``bamqc`` / ``multi-bamqc`` on alignment objects and
    packages the resulting HTML report."""

    QUALIMAP_PATH = '/kb/module/qualimap-bin/qualimap'
    JAVA_MEM_DEFAULT_SIZE = '16G'
    LARGE_BAM_FILE_SIZE = 20 * 1024 * 1024 * 1024  # 20 GB
    TIMEOUT = 72 * 60 * 60  # 72 hours

    def _get_file_size(self, file_path):
        """Return (and print) the size of ``file_path`` in bytes."""
        file_size = os.path.getsize(file_path)
        print('File size: {} -- {}'.format(file_size, file_path))
        return file_size

    def _large_file(self, file_path):
        """Return how many whole multiples of LARGE_BAM_FILE_SIZE the input is.

        A ``.txt`` path is read as a multi-sample config whose second
        tab-separated column is a BAM path; the sizes of all listed files are
        summed.  Any other path is measured directly.  The result is an int,
        0 for inputs below the threshold.
        """
        file_extension = os.path.splitext(file_path)[1]

        if file_extension == '.txt':
            total_file_size = 0
            with open(file_path, 'r') as f:
                for line in f:
                    # Strip the line ending first: on lines without a label
                    # the BAM path is the last column and would otherwise keep
                    # its trailing newline.
                    columns = line.rstrip('\r\n').split('\t')
                    total_file_size += self._get_file_size(columns[1])
            print('Total file size: {}'.format(total_file_size))
        else:
            total_file_size = self._get_file_size(file_path)

        # Floor division: '/' yields a non-zero float on Python 3, which made
        # every input look "large".
        multiplier = int(total_file_size) // int(self.LARGE_BAM_FILE_SIZE)
        print('setting number of windows multiplier to: {}'.format(multiplier))
        return multiplier

    def _window_options(self, file_path):
        """Return the extra ``-nw`` arguments for large inputs, else ``[]``."""
        multiplier = self._large_file(file_path)
        if not multiplier:
            return []
        window_size = multiplier * 400
        print('using larger window size: {} and Java memory: {}'.format(
            window_size, self.JAVA_MEM_DEFAULT_SIZE))
        # Flag and value are separate argv entries because the command is run
        # without a shell.
        return ['-nw', str(window_size)]  # increase size of windows

    def _timeout_handler(self, signum, frame):
        """SIGALRM handler: abort the run by raising ValueError."""
        print('Signal handler called with signal', signum)
        raise ValueError('QualiMap takes too long')

    def __init__(self, scratch_dir, callback_url, workspace_url, srv_wiz_url):
        """Store the scratch directory and create the service clients."""
        self.scratch_dir = scratch_dir
        self.rau = ReadsAlignmentUtils(callback_url)
        self.kbr = KBaseReport(callback_url)
        self.dfu = DataFileUtil(callback_url)
        self.gfu = GenomeFileUtil(callback_url)
        self.set_api = SetAPI(srv_wiz_url)
        self.ws = Workspace(workspace_url)
        self.valid_commands = ['bamqc', 'multi-bamqc']

    def run_app(self, params):
        """Validate ``params``, run QualiMap and optionally create a report.

        Any exception during the QualiMap run (including the TIMEOUT alarm) is
        caught and replaced by an empty HTML report so that a result is still
        returned.

        :param params: dict with 'input_ref' and optionally 'create_report'
            and 'output_workspace' (see validate_params)
        :returns: result dict of the run, extended by create_report when a
            report was requested
        """
        self.validate_params(params)
        print('Validated Params = ')
        pprint(params)

        run_info = self.get_run_info(params)

        if run_info.get('mode') not in ['single', 'multi']:
            raise ValueError(
                'Error in fetching the type to determine run settings.')

        run_error = False
        try:
            signal.signal(signal.SIGALRM, self._timeout_handler)
            signal.alarm(self.TIMEOUT)
            try:
                if run_info['mode'] == 'single':
                    result = self.run_bamqc(params['input_ref'],
                                            run_info['input_info'])
                else:
                    result = self.run_multi_sample_qc(params['input_ref'],
                                                      run_info['input_info'])
            finally:
                # Always cancel the alarm, otherwise a failed run leaves it
                # pending and it can fire during the fallback below.
                signal.alarm(0)
        except Exception:
            run_error = True

            workdir = os.path.join(self.scratch_dir,
                                   'qualimap_' + str(int(time.time() * 10000)))
            os.makedirs(workdir)

            with open(os.path.join(workdir, 'qualimapReport.html'),
                      'w') as report:
                report.write('<html><body><p></p></body></html>')

            package_info = self.package_output_folder(
                workdir, 'QualiMap_report',
                'EMPTY HTML report directory for QualiMap BAM QC',
                'qualimapReport.html')

            result = {
                'qc_result_folder_path': workdir,
                'qc_result_zip_info': package_info,
                'shock_id': None
            }
            error_msg = 'Running QualiMap returned an error:\n{}\n'.format(
                traceback.format_exc())
            error_msg += 'Generating simple report instead\n'
            print(error_msg)

        if params['create_report']:
            result = self.create_report(result, params['output_workspace'],
                                        run_error, params['input_ref'])

        return result

    def create_report(self, result, output_workspace, run_error=None,
                      input_ref=None):
        """Create the KBase report and add its name/ref to ``result``.

        When ``run_error`` is truthy the report only lists the input object
        (and, for sets, every alignment in it); otherwise it links the
        packaged HTML output from ``result['qc_result_zip_info']``.
        """
        if run_error:
            objects_created = []
            info = self.get_obj_info(input_ref)
            obj_type = self.get_type_from_obj_info(info)
            if obj_type in ['KBaseRNASeq.RNASeqAlignment']:
                objects_created.append({
                    'ref': input_ref,
                    'description': 'Alignment'
                })

            if obj_type in [
                    'KBaseRNASeq.RNASeqAlignmentSet',
                    'KBaseSets.ReadsAlignmentSet'
            ]:
                objects_created.append({
                    'ref': input_ref,
                    'description': 'AlignmentSet'
                })
                reads_alignment_info = self.get_alignments_from_set(input_ref)
                for alignment in reads_alignment_info:
                    objects_created.append({
                        'ref': alignment.get('ref'),
                        'description': 'Alignment'
                    })

            report_info = self.kbr.create_extended_report({
                'message': ' ',
                'objects_created': objects_created,
                'report_object_name': 'qualimap_report' + str(uuid.uuid4()),
                'workspace_name': output_workspace
            })
            result['report_name'] = report_info['name']
            result['report_ref'] = report_info['ref']
            return result

        qc_result_zip_info = result['qc_result_zip_info']
        report_info = self.kbr.create_extended_report({
            'message': '',
            'objects_created': [],
            'direct_html_link_index': 0,
            'html_links': [{
                'shock_id': qc_result_zip_info['shock_id'],
                'name': qc_result_zip_info['index_html_file_name'],
                'label': qc_result_zip_info['name']
            }],
            'report_object_name': 'qualimap_report' + str(uuid.uuid4()),
            'workspace_name': output_workspace
        })
        result['report_name'] = report_info['name']
        result['report_ref'] = report_info['ref']
        return result

    def get_gtf_file(self, input_ref, set_op=False):
        """Export the genome linked to an alignment as a GTF file.

        With ``set_op`` the first item of the alignment set is used.

        :raises ValueError: when the alignment object has no 'genome_id'
        :returns: path of the generated file
        """
        print('Start fetching GFF file from genome')

        if set_op:
            set_data = self.set_api.get_reads_alignment_set_v1({
                'ref': input_ref,
                'include_item_info': 1
            })
            input_ref = set_data['data']['items'][0]['ref']

        obj_data = self.dfu.get_objects(
            {"object_refs": [input_ref]})['data'][0]['data']

        genome_ref = obj_data.get('genome_id')
        if not genome_ref:
            raise ValueError(
                'Alignment is not associated with a Genome object')

        result_directory = os.path.join(self.scratch_dir, str(uuid.uuid4()))
        os.makedirs(result_directory)

        genome_gtf_file = self.gfu.genome_to_gff({
            'genome_ref': genome_ref,
            'is_gtf': True,
            'target_dir': result_directory
        })['file_path']

        return genome_gtf_file

    def run_bamqc(self, input_ref, input_info):
        """Run ``qualimap bamqc`` on a single alignment and package the output.

        :returns: dict with 'qc_result_folder_path' and 'qc_result_zip_info'
        """
        # download the input and setup a working dir
        alignment_info = self.rau.download_alignment({'source_ref': input_ref})
        bam_file_path = self.find_my_bam_file(
            alignment_info['destination_dir'])
        try:
            gtf_file = self.get_gtf_file(input_ref)
        except Exception:
            # Best effort: the run proceeds without annotation.
            gtf_file = ''

        workdir = os.path.join(self.scratch_dir,
                               'qualimap_' + str(int(time.time() * 10000)))

        options = [
            '-bam', bam_file_path, '-c', '-outdir', workdir, '-outformat',
            'html'
        ]

        if gtf_file:
            options += ['-gff', gtf_file]

        options.append('--java-mem-size={}'.format(
            self.JAVA_MEM_DEFAULT_SIZE))  # always use large mem
        options += self._window_options(bam_file_path)

        self.run_cli_command('bamqc', options)

        package_info = self.package_output_folder(
            workdir, 'QualiMap_report',
            'HTML report directory for QualiMap BAM QC', 'qualimapReport.html')

        return {
            'qc_result_folder_path': workdir,
            'qc_result_zip_info': package_info
        }

    def run_multi_sample_qc(self, input_ref, input_info):
        """Run ``qualimap multi-bamqc`` on an alignment set and package it.

        :returns: dict with 'qc_result_folder_path' and 'qc_result_zip_info'
        """
        # download the input and setup a working dir
        reads_alignment_info = self.get_alignments_from_set(input_ref)
        try:
            gtf_file = self.get_gtf_file(input_ref, set_op=True)
        except Exception:
            # Best effort: the run proceeds without annotation.
            gtf_file = ''

        suffix = 'qualimap_' + str(int(time.time() * 10000))
        workdir = os.path.join(self.scratch_dir, suffix)
        os.makedirs(workdir)

        input_file_path = self.create_multi_qualimap_cfg(
            reads_alignment_info, workdir)

        options = [
            '-d', input_file_path, '-r', '-c', '-outdir', workdir,
            '-outformat', 'html'
        ]

        if gtf_file:
            options += ['-gff', gtf_file]

        options += self._window_options(input_file_path)
        options.append('--java-mem-size={}'.format(
            self.JAVA_MEM_DEFAULT_SIZE))

        self.run_cli_command('multi-bamqc', options)

        package_info = self.package_output_folder(
            workdir, 'QualiMap_report',
            'HTML report directory for QualiMap Multi-sample BAM QC',
            'multisampleBamQcReport.html')

        return {
            'qc_result_folder_path': workdir,
            'qc_result_zip_info': package_info
        }

    def get_alignments_from_set(self, alignment_set_ref):
        """Download every alignment of a set.

        :returns: list of dicts with 'bam_file_path', 'ref', 'label' (None
            when the set item has no label) and 'info'
        """
        set_data = self.set_api.get_reads_alignment_set_v1({
            'ref': alignment_set_ref,
            'include_item_info': 1
        })
        items = set_data['data']['items']

        reads_alignment_data = []
        for alignment in items:
            alignment_info = self.rau.download_alignment(
                {'source_ref': alignment['ref']})
            bam_file_path = self.find_my_bam_file(
                alignment_info['destination_dir'])
            label = None
            if 'label' in alignment:
                label = alignment['label']
            reads_alignment_data.append({
                'bam_file_path': bam_file_path,
                'ref': alignment['ref'],
                'label': label,
                'info': alignment['info']
            })
        return reads_alignment_data

    def create_multi_qualimap_cfg(self, reads_alignment_info, workdir):
        """Write ``multi_input.txt`` for multi-bamqc into ``workdir``.

        Each line is ``name<TAB>bam path``; a third label column is added for
        every line as soon as at least one alignment has a label (missing
        labels become ``unlabeled``).  Repeated names get a ``_<n>`` suffix.

        :returns: path of the written config file
        """
        # Group by labels if there is at least one defined
        use_labels = any(alignment['label']
                         for alignment in reads_alignment_info)

        # write the file
        input_file_path = os.path.join(workdir, 'multi_input.txt')
        name_lookup = {}
        with open(input_file_path, 'w') as input_file:
            for alignment in reads_alignment_info:
                name = alignment['info'][1]
                if name in name_lookup:
                    name_lookup[name] += 1
                    name = name + '_' + str(name_lookup[name])
                else:
                    name_lookup[name] = 1

                input_file.write(name + '\t' + alignment['bam_file_path'])
                if use_labels:
                    if alignment['label']:
                        input_file.write('\t' + alignment['label'])
                    else:
                        input_file.write('\tunlabeled')
                input_file.write('\n')
        return input_file_path

    def get_run_info(self, params):
        """Decide between 'single' and 'multi' mode from the input's type.

        :raises ValueError: when the object type is not a supported alignment
            or alignment set
        """
        info = self.get_obj_info(params['input_ref'])
        obj_type = self.get_type_from_obj_info(info)
        if obj_type in ['KBaseRNASeq.RNASeqAlignment']:
            return {'mode': 'single', 'input_info': info}
        if obj_type in [
                'KBaseRNASeq.RNASeqAlignmentSet', 'KBaseSets.ReadsAlignmentSet'
        ]:
            return {'mode': 'multi', 'input_info': info}
        raise ValueError('Object type of input_ref is not valid, was: ' +
                         str(obj_type))

    def validate_params(self, params):
        """Check required fields and normalise ``params['create_report']``.

        ``create_report`` is set to True only when it was supplied with an
        integer value of 1, in which case a non-empty ``output_workspace`` is
        required; otherwise it is set to False.

        :raises ValueError: on missing 'input_ref' or 'output_workspace'
        """
        if 'input_ref' not in params:
            raise ValueError(
                'required parameter field "input_ref" was not set')

        create_report = False
        if 'create_report' in params:
            if int(params['create_report']) == 1:
                if not params.get('output_workspace'):
                    raise ValueError(
                        'If "create_report" was set, then "output_workspace" is required'
                    )
                create_report = True
        params['create_report'] = create_report

    def run_cli_command(self, command, options, cwd=None):
        """Run a QualiMap sub-command and wait for it to finish.

        :param command: one of ``self.valid_commands``
        :param options: list of command-line arguments
        :param cwd: working directory; defaults to the scratch directory
        :raises ValueError: for an unknown command or a non-zero exit code
        """
        if command not in self.valid_commands:
            raise ValueError('Invalid QualiMap command: ' + str(command))
        command = [self.QUALIMAP_PATH, command] + options
        print('Running: ' + ' '.join(command))

        if not cwd:
            cwd = self.scratch_dir

        p = subprocess.Popen(command, cwd=cwd, shell=False)
        exitCode = p.wait()

        if exitCode == 0:
            print('Success, exit code was: ' + str(exitCode))
        else:
            raise ValueError('Error running command: ' + ' '.join(command) +
                             '\n' + 'Exit Code: ' + str(exitCode))

    def find_my_bam_file(self, dirpath):
        """Return the single ``.bam`` file (case-insensitive) in ``dirpath``.

        :raises ValueError: when there is no BAM file or more than one
        """
        bam_path = None
        for f in os.listdir(dirpath):
            fullpath = os.path.join(dirpath, f)
            if os.path.isfile(fullpath) and f.lower().endswith('.bam'):
                if bam_path is not None:
                    raise ValueError(
                        'Error! Too many BAM files were downloaded for this alignment!'
                    )
                bam_path = fullpath
        if bam_path is None:
            raise ValueError(
                'Error! No BAM files were downloaded for this alignment!')
        return bam_path

    def package_output_folder(self, folder_path, zip_file_name,
                              zip_file_description, index_html_file):
        ''' Simple utility for packaging a folder and saving to shock '''
        output = self.dfu.file_to_shock({
            'file_path': folder_path,
            'make_handle': 0,
            'pack': 'zip'
        })
        return {
            'shock_id': output['shock_id'],
            'name': zip_file_name,
            'description': zip_file_description,
            'index_html_file_name': index_html_file
        }

    def get_type_from_obj_info(self, info):
        """Return the type string of an object info list without the part
        after the first '-'."""
        return info[2].split('-')[0]

    def get_obj_info(self, ref):
        """Return the workspace object info list for ``ref``."""
        return self.ws.get_object_info3({'objects': [{
            'ref': ref
        }]})['infos'][0]
class ImportExpressionMatrixUtil:
    """Imports a TSV file from the staging area as an expression matrix."""

    def __init__(self, config):
        """Read callback URL and token from ``config`` and create the clients."""
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.fv = KBaseFeatureValues(self.callback_url)
        self.uploader_utils = UploaderUtil(config)

    def import_tsv_as_expression_matrix_from_staging(self, params):
        '''
        import_tsv_as_expression_matrix_from_staging: wrapper method for
                                    KBaseFeatureValues.tsv_file_to_matrix

        required params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        matrix_name: output Expression Matrix file name
        workspace_name: workspace name/ID of the object

        optional params:
        genome_ref: optional reference to a Genome object that will be
                    used for mapping feature IDs to
        fill_missing_values: optional flag for filling in missing
                             values in matrix (default value is false)
        data_type: optional field, value is one of 'untransformed',
                   'log2_level', 'log10_level', 'log2_ratio', 'log10_ratio' or
                   'unknown' (last one is default value)
        data_scale: optional parameter (default value is '1.0')

        return:
        obj_ref: return object reference
        '''
        log('--->\nrunning ImportExpressionMatrixUtil.import_tsv_as_expression_matrix_from_staging\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_tsv_as_expression_matrix_from_staging_params(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')

        # Work on a shallow copy so the caller's dict is not extended with
        # the internal tsv_file_to_matrix arguments.
        import_matrix_params = dict(params)
        import_matrix_params['input_file_path'] = scratch_file_path
        import_matrix_params['output_ws_name'] = params.get('workspace_name')
        import_matrix_params['output_obj_name'] = params.get('matrix_name')

        ref = self.fv.tsv_file_to_matrix(import_matrix_params)

        # Update the workspace object related meta-data for staged file
        self.uploader_utils.update_staging_service(
            params.get('staging_file_subdir_path'),
            ref.get('output_matrix_ref'))

        returnVal = {'obj_ref': ref.get('output_matrix_ref')}

        return returnVal

    def validate_import_tsv_as_expression_matrix_from_staging_params(self, params):
        """
        validate_import_tsv_as_expression_matrix_from_staging_params:
            validates params passed to
            import_tsv_as_expression_matrix_from_staging method

        raises ValueError naming the first required parameter that is missing
        """
        # check for required parameters
        for p in ['staging_file_subdir_path', 'workspace_name', 'matrix_name']:
            if p not in params:
                raise ValueError('"' + p + '" parameter is required, but missing')

    def generate_report(self, obj_ref, params):
        """
        generate_report: generate summary report

        obj_ref: generated workspace object reference
                 (return of import_tsv_as_expression_matrix_from_staging)
        params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        workspace_name: workspace name/ID the report is created in

        return:
        dict with 'report_name' and 'report_ref' of the created report
        """
        uuid_string = str(uuid.uuid4())
        upload_message = 'Import Finished\n'

        get_objects_params = {
            'object_refs': [obj_ref],
            'ignore_errors': False
        }

        object_data = self.dfu.get_objects(get_objects_params)
        upload_message += "Expression Matrix Object Name: "
        upload_message += str(object_data.get('data')[0].get('info')[1]) + '\n'
        upload_message += 'Imported TSV File: {}\n'.format(
            params.get('staging_file_subdir_path'))

        report_params = {
            'message': upload_message,
            'workspace_name': params.get('workspace_name'),
            'report_object_name': 'kb_upload_mothods_report_' + uuid_string}

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output
class Utils:
    """Convert KBaseExperiments condition sets / cluster sets to and from files."""

    def __init__(self, config):
        """
        Build the service clients and default ontology identifiers.

        :param config: mapping providing 'scratch' and 'search-url'; the callback
                       URL is read from the SDK_CALLBACK_URL environment variable
        """
        self.cfg = config
        self.scratch = config['scratch']
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.dfu = DataFileUtil(self.callback_url)
        self.kbse = KBaseSearchEngine(config['search-url'])
        self.gen_api = GenericsAPI(self.callback_url)
        # Fallbacks used when no matching ontology term is found
        self.DEFAULT_ONTOLOGY_REF = "KbaseOntologies/Custom"
        self.DEFAULT_ONTOLOGY_ID = "Custom:Term"
        self.DEFAULT_UNIT_ID = "Custom:Unit"

    @staticmethod
    def validate_params(params, expected, opt_param=()):
        """
        Validates that required parameters are present. Warns if unexpected
        parameters appear

        :param params: supplied parameters (iterated for their keys)
        :param expected: iterable of required keys
        :param opt_param: iterable of optional keys that should not trigger a warning
        :raises ValueError: if any required key is missing
        """
        expected = set(expected)
        opt_param = set(opt_param)
        missing = expected - set(params)
        if missing:
            # sorted so the message is stable from run to run
            raise ValueError(
                "Required keys {} not in supplied parameters".format(
                    ", ".join(sorted(missing))))
        defined_param = expected | opt_param
        for param in params:
            if param not in defined_param:
                logging.warning(
                    "Unexpected parameter {} supplied".format(param))

    def get_conditions(self, params):
        """
        Fetch a condition set and group each condition's factors by ontology prefix.

        :param params: dict with 'condition_set_ref' and, optionally, 'conditions'
                       (the condition keys to keep; defaults to all of them)
        :return: {"conditions": {condition: {ontology_prefix: [factor + 'value']}}}
        """
        data = self.dfu.get_objects(
            {'object_refs': [params['condition_set_ref']]})['data'][0]['data']
        conditions = {}
        keep_keys = params.get('conditions', data['conditions'].keys())
        for key in keep_keys:
            conditions[key] = defaultdict(list)
            for factor, val in zip(data['factors'], data['conditions'][key]):
                # the text before the first ':' of the ontology id is the grouping key
                ont_abriv = factor['factor_ont_id'].split(":")[0]
                # copy first so the fetched factor definitions are left untouched
                entry = copy.copy(factor)
                entry['value'] = val
                conditions[key][ont_abriv].append(entry)
        return {"conditions": conditions}

    def file_to_condition_set(self, params):
        """
        Convert a user supplied file to a condition set

        :param params: dict with either 'input_file_path' or 'input_shock_id',
                       plus 'output_ws_id' and 'output_obj_name'
        :return: {"condition_set_ref": "<wsid>/<objid>/<version>"}
        :raises ValueError: if neither input key is supplied
        """
        if 'input_file_path' in params:
            scratch_file_path = params['input_file_path']
        elif 'input_shock_id' in params:
            scratch_file_path = self.dfu.shock_to_file({
                'shock_id': params['input_shock_id'],
                'file_path': self.scratch
            }).get('file_path')
        else:
            raise ValueError(
                "Must supply either a input_shock_id or input_file_path")

        # Try Excel first; fall back to tab-separated text if it is not a workbook
        try:
            df = pd.read_excel(scratch_file_path, dtype='str')
        except XLRDError:
            df = pd.read_csv(scratch_file_path, sep="\t", dtype='str')

        comp_set = self._df_to_cs_obj(df)
        info = self.dfu.save_objects({
            "id": params['output_ws_id'],
            "objects": [{
                "type": "KBaseExperiments.ConditionSet",
                "data": comp_set,
                "name": params['output_obj_name']
            }]
        })[0]
        return {"condition_set_ref": "%s/%s/%s" % (info[6], info[0], info[4])}

    def _conditionset_data_to_df(self, data):
        """
        Converts a condition set object data to a dataframe

        Factor columns are given readable headers (e.g. 'factor_ont_id' becomes
        'Factor ontology id'), which _df_to_cs_obj maps back on import.
        """
        factors = pd.DataFrame(data['factors'])
        # rename() returns a new frame; the result has to be kept for it to apply
        factors = factors.rename(
            columns=lambda x: x.replace("ont", "ontology").capitalize().replace("_", " "))
        conditions = pd.DataFrame(data['conditions'])
        cs_df = factors.join(conditions)

        return cs_df

    def _clusterset_data_to_df(self, data):
        """
        Converts a cluster set object data to a dataframe

        The original data matrix is fetched and a 'cluster' column holding the
        zero-based index of each item's cluster is appended.
        """
        original_matrix_ref = data.get('original_data')
        data_matrix = self.gen_api.fetch_data({
            'obj_ref': original_matrix_ref
        }).get('data_matrix')

        data_df = pd.read_json(data_matrix)
        clusters = data.get('clusters')

        id_names = [item
                    for cluster in clusters
                    for item in cluster.get('id_to_data_position').keys()]

        if set(data_df.columns.tolist()) == set(id_names):
            # cluster is based on condition
            data_df = data_df.T

        cluster_names = [None] * data_df.index.size

        for cluster_id, cluster in enumerate(clusters):
            for item_id in cluster.get('id_to_data_position').keys():
                cluster_names[data_df.index.get_loc(item_id)] = cluster_id

        data_df['cluster'] = cluster_names

        return data_df

    def _ws_obj_to_df(self, input_ref):
        """
        Converts workspace obj to a dataframe

        :return: (object name, dataframe, object type string)
        :raises ValueError: if the object is neither a ConditionSet nor a ClusterSet
        """
        res = self.dfu.get_objects({'object_refs': [input_ref]})['data'][0]
        name = res['info'][1]

        obj_type = res['info'][2]

        if "KBaseExperiments.ConditionSet" in obj_type:
            cs_df = self._conditionset_data_to_df(res['data'])
        elif "KBaseExperiments.ClusterSet" in obj_type:
            cs_df = self._clusterset_data_to_df(res['data'])
        else:
            err_msg = 'Ooops! [{}] is not supported.\n'.format(obj_type)
            err_msg += 'Please supply KBaseExperiments.ConditionSet or KBaseExperiments.ClusterSet'
            raise ValueError(err_msg)

        return name, cs_df, obj_type

    def _df_to_cs_obj(self, cs_df):
        """
        Converts a dataframe from a user file to a condition set object

        Columns whose header matches unit/factor describe the factors; every
        other column is treated as a condition.

        :raises ValueError: if there are no rows, no condition columns, or no
                            'Factor' column
        """
        condition_set = {'ontology_mapping_method': "User Curation"}
        cs_df.fillna('', inplace=True)
        if not len(cs_df):
            raise ValueError("No factors in supplied files")
        factor_df = cs_df.filter(regex="[Uu]nit|[Ff]actor")
        condition_df = cs_df.drop(factor_df.columns, axis=1)
        if not len(condition_df.columns):
            raise ValueError(
                "Unable to find any condition columns in supplied file")

        # normalise headers such as 'Factor ontology id' to 'factor_ont_id'
        factor_df = factor_df.rename(
            columns=lambda x: x.lower().replace(" ontology ", "_ont_").strip())
        if "factor" not in factor_df.columns:
            raise ValueError(
                "Unable to find a 'Factor' column in supplied file")
        factor_fields = ('factor', 'unit', 'factor_ont_id', 'unit_ont_id')
        factors = factor_df.filter(items=factor_fields).to_dict('records')

        condition_set['factors'] = [
            self._add_ontology_info(f) for f in factors
        ]
        condition_set['conditions'] = condition_df.to_dict('list')
        return condition_set

    def _search_ontologies(self, term, closest=False):
        """
        Match to an existing KBase ontology term
        :param term: Text to match
        :param closest: if false, term must exactly match an ontology ID
        :return: dict(ontology_ref, id), or None when nothing matches
        """
        params = {
            "object_types": ["OntologyTerm"],
            "match_filter": {
                "lookup_in_keys": {
                    "id": {
                        "value": term
                    }
                }
            },
            "access_filter": {
                "with_private": 0,
                "with_public": 1
            },
            "pagination": {
                "count": 1
            },
            "post_processing": {
                "skip_data": 1
            }
        }
        if closest:
            params['match_filter'] = {"full_text_in_all": term}
        res = self.kbse.search_objects(params)
        if not res['objects']:
            return None
        term = res['objects'][0]
        return {
            "ontology_ref": term['guid'].split(":")[1],
            "id": term['key_props']['id']
        }

    def _add_ontology_info(self, factor):
        """
        Searches KBASE ontologies for terms matching the user supplied factors
        and units. Add the references if found, otherwise the class defaults.

        :param factor: dict describing one factor (one row of the user file)
        :return: new dict with empty optional unit fields dropped and
                 ontology refs/ids filled in
        """
        optionals = {
            "unit",
            "unit_ont_id",
            "unit_ont_ref",
        }
        # drop optional fields the user left blank
        factor = {
            k: v
            for k, v in factor.items() if k not in optionals or v != ""
        }
        ont_info = self._search_ontologies(
            factor.get('factor_ont_id', "").replace("_", ":"))
        if ont_info:
            factor['factor_ont_ref'] = ont_info['ontology_ref']
            factor['factor_ont_id'] = ont_info['id']
        else:
            factor['factor_ont_ref'] = self.DEFAULT_ONTOLOGY_REF
            factor['factor_ont_id'] = self.DEFAULT_ONTOLOGY_ID

        if factor.get('unit'):
            ont_info = self._search_ontologies(
                factor.get('unit_ont_id', '').replace("_", ":"))
            if ont_info:
                factor['unit_ont_ref'] = ont_info['ontology_ref']
                factor['unit_ont_id'] = ont_info['id']
            else:
                factor['unit_ont_ref'] = self.DEFAULT_ONTOLOGY_REF
                factor['unit_ont_id'] = self.DEFAULT_UNIT_ID
        return factor

    def to_tsv(self, params):
        """
        Convert a condition set or cluster set to a TSV file

        :param params: dict with 'input_ref' and 'destination_dir'
        :return: (object name, {'file_path': path of the written file})
        """
        files = {}

        _id, df, obj_type = self._ws_obj_to_df(params['input_ref'])
        files['file_path'] = os.path.join(params['destination_dir'],
                                          _id + ".tsv")
        df.to_csv(files['file_path'], sep="\t", index=False)

        return _id, files

    def to_excel(self, params):
        """
        Convert a condition set or cluster set to an Excel file

        :param params: dict with 'input_ref' and 'destination_dir'
        :return: (object name, {'file_path': path of the written file})
        """
        files = {}

        _id, df, obj_type = self._ws_obj_to_df(params['input_ref'])
        files['file_path'] = os.path.join(params['destination_dir'],
                                          _id + ".xlsx")

        # the context manager writes and closes the workbook on exit
        with pd.ExcelWriter(files['file_path']) as writer:
            if "KBaseExperiments.ConditionSet" in obj_type:
                df.to_excel(writer, sheet_name="Conditions", index=False)
            elif "KBaseExperiments.ClusterSet" in obj_type:
                df.to_excel(writer, sheet_name="ClusterSet", index=True)
            # else is checked in `_ws_obj_to_df`

        return _id, files

    def export(self, file, name, input_ref):
        """
        Saves a set of files to SHOCK for export

        :param file: path of the file to export; it is moved into a fresh
                     package directory under scratch
        :param name: prefix for the package directory name
        :param input_ref: workspace reference packaged alongside the file
        :return: {'shock_id': id of the uploaded package}
        """
        export_package_dir = os.path.join(self.scratch,
                                          name + str(uuid.uuid4()))
        os.makedirs(export_package_dir)
        shutil.move(file,
                    os.path.join(export_package_dir, os.path.basename(file)))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [input_ref]
        })

        return {'shock_id': package_details['shock_id']}
def generate_report(self, obj_refs, params):
    """
    generate_report: generate summary report

    obj_refs: comma separated workspace object references of the imported reads
              (return of upload_fastq_file)
    params:
    fwd_staging_file_name: single-end fastq file name or forward/left paired-end
                           fastq file name from user's staging area
    rev_staging_file_name: reverse/right paired-end fastq file name from
                           user's staging area
    workspace_name: workspace name/ID that reads will be stored to

    return:
    dict with 'report_name' and 'report_ref' of the created report
    """

    report_suffix = str(uuid.uuid4())
    data_file_util = DataFileUtil(self.callback_url)

    forward_file = params.get('fwd_staging_file_name')
    reverse_file = params.get('rev_staging_file_name')

    message_parts = ['Import Finished\nImported Reads:\n']
    for ref in obj_refs.split(','):
        fetched = data_file_util.get_objects({
            'object_refs': [ref],
            'ignore_errors': False
        })
        info = fetched.get('data')[0].get('info')

        message_parts.append("Reads Name: " + str(info[1]) + '\n')
        message_parts.append("Reads Type: " + str(info[2]) + '\n')

        if not forward_file:
            # no staged file names given: fall back to the last entry of the
            # object info when it is a dict
            reads_info = info[-1]
            if isinstance(reads_info, dict):
                message_parts.append(
                    "Reads Info: " + json.dumps(reads_info, indent=1)[1:-1] + '\n')
            continue

        message_parts.append("Imported Reads File: %s" % forward_file)
        if reverse_file:
            message_parts.append(' and %s\n' % reverse_file)
        else:
            message_parts.append('\n')

    client = KBaseReport(self.callback_url, token=self.token)
    created = client.create_extended_report({
        'message': ''.join(message_parts),
        'workspace_name': params.get('workspace_name'),
        'report_object_name': 'kb_upload_mothods_report_' + report_suffix
    })

    return {
        'report_name': created['name'],
        'report_ref': created['ref']
    }
def generate_report(self, obj_refs, params):
    """
    generate_report: generate summary report

    obj_refs: comma separated workspace object references of the imported reads
              (return of upload_fastq_file)
    params:
    fwd_staging_file_name: single-end fastq file name or forward/left paired-end
                           fastq file name from user's staging area
    rev_staging_file_name: reverse/right paired-end fastq file name from
                           user's staging area
    urls_to_add: optional; when present its length is reported as the number
                 of imported reads objects
    workspace_name: workspace name/ID that reads will be stored to

    return:
    dict with 'report_name' and 'report_ref' of the created report
    """
    # Local import: numbers.Integral covers int (and long on Python 2), so the
    # check below no longer depends on the Python-2-only name `long`.
    import numbers

    uuid_string = str(uuid.uuid4())

    obj_refs_list = obj_refs.split(',')

    dfu = DataFileUtil(self.callback_url)

    reads_number = 1 if 'urls_to_add' not in params else len(
        params['urls_to_add'])
    upload_message = 'Import Finished\nImported Reads: {}\n'.format(
        reads_number)

    fwd_file = params.get('fwd_staging_file_name')
    rev_file = params.get('rev_staging_file_name')

    for obj_ref in obj_refs_list:
        get_objects_params = {
            'object_refs': [obj_ref],
            'ignore_errors': False
        }

        object_data = dfu.get_objects(get_objects_params)
        reads_object = object_data.get('data')[0]
        number_of_reads = reads_object.get('data').get('read_count')

        upload_message += "Reads Name: "
        upload_message += str(reads_object.get('info')[1]) + '\n'

        if fwd_file:
            if rev_file:
                upload_message += 'Imported Reads Files:\n'
                upload_message += 'Forward: {}\n'.format(fwd_file)
                upload_message += 'Reverse: {}\n'.format(rev_file)
            else:
                upload_message += 'Imported Reads File: {}\n'.format(fwd_file)
            # read_count may be absent; only report it when it is an integer
            if isinstance(number_of_reads, numbers.Integral):
                upload_message += 'Number of Reads: {:,}\n'.format(
                    number_of_reads)
        else:
            # no staged file names given: fall back to the last entry of the
            # object info when it is a dict
            reads_info = reads_object.get('info')[-1]
            if isinstance(reads_info, dict):
                upload_message += "Reads Info: "
                upload_message += json.dumps(reads_info,
                                             indent=1)[1:-1] + '\n'

    report_params = {
        'message': upload_message,
        'workspace_name': params.get('workspace_name'),
        'report_object_name': 'kb_upload_mothods_report_' + uuid_string
    }

    kbase_report_client = KBaseReport(self.callback_url, token=self.token)
    output = kbase_report_client.create_extended_report(report_params)

    report_output = {
        'report_name': output['name'],
        'report_ref': output['ref']
    }

    return report_output
class ImportGenbankUtil:
    """Import a staged GenBank file as a Genome object and build its HTML report."""

    def __init__(self, config):
        """
        Read connection settings from config, create a private scratch
        directory and the service clients.

        config: mapping providing 'SDK_CALLBACK_URL', 'KB_AUTH_TOKEN' and
                'scratch'; it is also handed unchanged to UploaderUtil
        """
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        # each instance works in its own uniquely named scratch sub-directory
        self.scratch = os.path.join(config['scratch'], 'import_GenBank_' + str(uuid.uuid4()))
        handler_utils._mkdir_p(self.scratch)
        self.dfu = DataFileUtil(self.callback_url)
        self.gfu = GenomeFileUtil(self.callback_url, service_ver='dev')
        self.uploader_utils = UploaderUtil(config)

    def import_genbank_from_staging(self, params):
        '''
        import_genbank_from_staging: wrapper method for GenomeFileUtil.genbank_to_genome

        required params:
        staging_file_subdir_path - subdirectory file path
        e.g.
          for file: /data/bulk/user_name/file_name
          staging_file_subdir_path is file_name
          for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
          staging_file_subdir_path is subdir_1/subdir_2/file_name
        genome_name - becomes the name of the object
        workspace_name - the name of the workspace it gets saved to.
        source - Source of the file typically something like RefSeq or Ensembl

        optional params:
        release - Release or version number of the data
              per example Ensembl has numbered releases of all their data: Release 31
        generate_ids_if_needed - If field used for feature id is not there,
              generate ids (default behavior is raising an exception)
        genetic_code - Genetic code of organism. Overwrites determined GC from
              taxon object
        type - Reference, Representative or User upload

        return:
        the value returned by GenomeFileUtil.genbank_to_genome
        (contains genome_ref, the object reference)

        raises:
        ValueError if a required parameter is missing
        '''

        log('--->\nrunning ImportGenbankUtil.import_genbank_from_staging\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_genbank_from_staging_params(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')

        # Build the importer arguments on a shallow copy: deleting the staging
        # path from the caller's own dict would hide it from later consumers of
        # the same params (e.g. the report generation methods).
        import_genbank_params = dict(params)
        import_genbank_params['file'] = {'path': scratch_file_path}
        del import_genbank_params['staging_file_subdir_path']

        returnVal = self.gfu.genbank_to_genome(import_genbank_params)

        # NOTE: updating the workspace object related meta-data for the staged
        # file is currently disabled:
        # self.uploader_utils.update_staging_service(
        #     download_staging_file_params.get('staging_file_subdir_path'),
        #     returnVal['genome_ref'])
        return returnVal

    def validate_import_genbank_from_staging_params(self, params):
        """
        validate_import_genbank_from_staging_params:
                    validates params passed to import_genbank_from_staging method

        raises ValueError naming the first required parameter that is absent
        """
        # check for required parameters
        for p in ['staging_file_subdir_path', 'genome_name', 'workspace_name', 'source']:
            if p not in params:
                raise ValueError('"' + p + '" parameter is required, but missing')

    def generate_html_report(self, genome_ref, params):
        """
        _generate_html_report: generate html summary report

        genome_ref: reference of the imported Genome object
        params: import parameters (accepted for interface compatibility; not
                read by this method)

        return:
        single-element list describing the zipped report uploaded to shock
        """
        log('start generating html report')
        genome_obj = self.dfu.get_objects({'object_refs': [genome_ref]})
        html_report = list()

        result_file_path = os.path.join(self.scratch, 'report.html')

        genome_entry = genome_obj.get('data')[0]
        genome_data = genome_entry.get('data')
        genome_info = genome_entry.get('info')
        genome_name = str(genome_info[1])
        # info[10] is a dict read here for the summary statistics
        genome_meta = genome_info[10]
        source = genome_meta.get('Source')
        num_contigs = genome_meta.get('Number contigs')
        size = genome_meta.get('Size')
        gc_content = genome_meta.get('GC content')
        warnings = genome_data.get('warnings', [])
        feature_counts = sorted(genome_data.get('feature_counts', {}).items())

        genome_overview_data = collections.OrderedDict()

        genome_overview_data['Name'] = '{} ({})'.format(genome_name, genome_ref)
        genome_overview_data['Date Uploaded'] = time.strftime("%c")
        genome_overview_data['Source'] = source
        genome_overview_data['Number of Contigs'] = num_contigs
        genome_overview_data['Size'] = size
        genome_overview_data['GC Content'] = gc_content
        genome_overview_data['Warnings'] = "\n".join(warnings)
        genome_overview_data.update(feature_counts)

        overview_content = ''
        overview_content += '<br/><table>\n'
        # items() works on both Python 2 and 3 (iteritems() is Python 2 only)
        for key, val in genome_overview_data.items():
            overview_content += '<tr><td><b>{}</b></td>'.format(key)
            overview_content += '<td>{}</td>'.format(val)
            overview_content += '</tr>\n'
        overview_content += '</table>'

        # the str() of these nested lists is substituted into the template
        feature_content = str([[str(k), v] for k, v in
                               genome_data.get('feature_counts', {}).items()
                               if k != 'gene'])
        contig_content = str([[str(c), l] for c, l in
                              zip(genome_data.get('contig_ids', []),
                                  genome_data.get('contig_lengths', []))])

        template_path = os.path.join(os.path.dirname(__file__),
                                     'report_template_genome.html')
        with open(template_path, 'r') as report_template_file:
            report_template = report_template_file.read()
        report_template = report_template.replace('<p>Overview_Content</p>',
                                                  overview_content)
        report_template = report_template.replace('*FEATURE_DATA*',
                                                  feature_content)
        report_template = report_template.replace('*CONTIG_DATA*',
                                                  contig_content)
        # the with-block closes the file; no explicit close() is needed
        with open(result_file_path, 'w') as result_file:
            result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({'file_path': self.scratch,
                                                  'pack': 'zip'})['shock_id']

        html_report.append({'shock_id': report_shock_id,
                            'name': os.path.basename(result_file_path),
                            'label': os.path.basename(result_file_path),
                            'description': 'HTML summary report for imported Genome'})
        return html_report

    def generate_report(self, genome_ref, params):
        """
        Create the extended report for an imported genome.

        :param genome_ref: Return Val from GenomeFileUtil for Uploaded genome
        :param params: import parameters; 'workspace_name' selects where the
                       report is saved
        :return: dict with 'report_name' and 'report_ref'
        """
        uuid_string = str(uuid.uuid4())

        objects_created = [{'ref': genome_ref,
                            'description': 'Imported Genome'}]

        output_html_files = self.generate_html_report(genome_ref, params)
        report_params = {
            'message': '',
            'workspace_name': params.get('workspace_name'),
            'objects_created': objects_created,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 300,
            'report_object_name': 'kb_genome_upload_report_' + uuid_string}

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output
class ImportAssemblyUtil:
    """Import a staged FASTA file as an Assembly object and build its HTML report."""

    def __init__(self, config):
        """
        Read connection settings from config, create a private scratch
        directory and the service clients.

        config: mapping providing 'SDK_CALLBACK_URL', 'KB_AUTH_TOKEN' and
                'scratch'; it is also handed unchanged to UploaderUtil
        """
        self.callback_url = config['SDK_CALLBACK_URL']
        # each instance works in its own uniquely named scratch sub-directory
        self.scratch = os.path.join(config['scratch'],
                                    'import_assembly_' + str(uuid.uuid4()))
        handler_utils._mkdir_p(self.scratch)
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.uploader_utils = UploaderUtil(config)

    def import_fasta_as_assembly_from_staging(self, params):
        '''
        import_fasta_as_assembly_from_staging: wrapper method for
                                    AssemblyUtil.save_assembly_from_fasta

        required params:
        staging_file_subdir_path - subdirectory file path
        e.g.
          for file: /data/bulk/user_name/file_name
          staging_file_subdir_path is file_name
          for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
          staging_file_subdir_path is subdir_1/subdir_2/file_name
        assembly_name - output Assembly file name
        workspace_name - the name of the workspace it gets saved to.

        return:
        obj_ref: return object reference

        raises:
        ValueError if a required parameter is missing
        '''
        log('--->\nrunning ImportAssemblyUtil.import_fasta_as_assembly_from_staging\n'
            + 'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_fasta_as_assembly_from_staging(params)

        staging_file_subdir_path = params.get('staging_file_subdir_path')
        download_staging_file_params = {
            'staging_file_subdir_path': staging_file_subdir_path
        }
        scratch_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')

        # shallow copy so the 'file' entry is not added to the caller's dict
        import_assembly_params = dict(params)
        import_assembly_params['file'] = {'path': scratch_file_path}

        ref = self.au.save_assembly_from_fasta(import_assembly_params)

        # Update the workspace object related meta-data for staged file
        self.uploader_utils.update_staging_service(staging_file_subdir_path, ref)

        returnVal = {'obj_ref': ref}
        return returnVal

    def validate_import_fasta_as_assembly_from_staging(self, params):
        """
        validate_import_fasta_as_assembly_from_staging:
                    validates params passed to
                    import_fasta_as_assembly_from_staging method

        raises ValueError naming the first required parameter that is absent
        """
        # check for required parameters
        for p in ['staging_file_subdir_path', 'workspace_name', 'assembly_name']:
            if p not in params:
                raise ValueError('"' + p + '" parameter is required, but missing')

    def generate_html_report(self, assembly_ref, assembly_object, params):
        """
        _generate_html_report: generate html summary report

        assembly_ref: reference of the imported Assembly object
        assembly_object: result of DataFileUtil.get_objects for that reference
        params: import parameters; 'staging_file_subdir_path' is shown as the
                uploaded file

        return:
        single-element list describing the zipped report uploaded to shock
        """
        log('start generating html report')
        html_report = list()

        assembly_entry = assembly_object.get('data')[0]
        assembly_data = assembly_entry.get('data')
        assembly_info = assembly_entry.get('info')

        result_file_path = os.path.join(self.scratch, 'report.html')

        assembly_name = str(assembly_info[1])
        assembly_file = params.get('staging_file_subdir_path')

        dna_size = assembly_data.get('dna_size')
        num_contigs = assembly_data.get('num_contigs')

        assembly_overview_data = collections.OrderedDict()

        assembly_overview_data['Name'] = '{} ({})'.format(
            assembly_name, assembly_ref)
        assembly_overview_data['Uploaded File'] = assembly_file
        assembly_overview_data['Date Uploaded'] = time.strftime("%c")
        assembly_overview_data['DNA Size'] = dna_size
        assembly_overview_data['Number of Contigs'] = num_contigs

        overview_content = ''
        overview_content += '<br/><table>\n'
        # items() works on both Python 2 and 3 (iteritems() is Python 2 only)
        for key, val in assembly_overview_data.items():
            overview_content += '<tr><td><b>{}</b></td>'.format(key)
            overview_content += '<td>{}</td>'.format(val)
            overview_content += '</tr>\n'
        overview_content += '</table>'

        contig_data = assembly_data.get('contigs').values()
        # the str() of this nested list is substituted into the template
        contig_content = str([[str(e['contig_id']), e['length']]
                              for e in contig_data])

        template_path = os.path.join(os.path.dirname(__file__),
                                     'report_template_assembly.html')
        with open(template_path, 'r') as report_template_file:
            report_template = report_template_file.read()
        report_template = report_template.replace(
            '<p>*Overview_Content*</p>', overview_content)
        report_template = report_template.replace(
            '*CONTIG_DATA*', contig_content)
        # the with-block closes the file; no explicit close() is needed
        with open(result_file_path, 'w') as result_file:
            result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({
            'file_path': self.scratch,
            'pack': 'zip'
        })['shock_id']

        html_report.append({
            'shock_id': report_shock_id,
            'name': os.path.basename(result_file_path),
            'label': os.path.basename(result_file_path),
            'description': 'HTML summary report for Imported Assembly'
        })
        return html_report

    def generate_report(self, obj_ref, params):
        """
        generate_report: generate summary report

        obj_ref: generated workspace object references.
                 (return of import_fasta_as_assembly_from_staging)
        params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        workspace_name: workspace name/ID that the report will be stored to

        return:
        dict with 'report_name' and 'report_ref' of the created report
        """
        uuid_string = str(uuid.uuid4())

        get_objects_params = {'object_refs': [obj_ref], 'ignore_errors': False}
        object_data = self.dfu.get_objects(get_objects_params)

        objects_created = [{
            'ref': obj_ref,
            'description': 'Imported Assembly'
        }]

        output_html_files = self.generate_html_report(obj_ref, object_data,
                                                      params)

        report_params = {
            'message': '',
            'workspace_name': params.get('workspace_name'),
            'objects_created': objects_created,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 270,
            'report_object_name': 'kb_upload_assembly_report_' + uuid_string
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output
class kb_fastqc:
    '''
    Module Name:
    kb_fastqc

    Module Description:
    A KBase module: kb_fastqc
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "1.0.4"
    GIT_URL = "https://github.com/Tianhao-Gu/kb_fastqc.git"
    GIT_COMMIT_HASH = "3f552db07e04f4b01eec0b38ec49546a2335d87e"

    #BEGIN_CLASS_HEADER

    def _get_input_file_ref_from_params(self, params):
        """
        Return the reference of the reads object to analyse.

        Uses 'input_file_ref' when present, otherwise builds
        '<input_ws>/<input_file>'.

        raises ValueError when neither form is completely specified
        """
        if 'input_file_ref' in params:
            return params['input_file_ref']
        # both parts are needed to build a reference, so either one missing
        # is an error
        if 'input_ws' not in params or 'input_file' not in params:
            raise ValueError('Either the "input_file_ref" field or the ' +
                             '"input_ws" with "input_file" fields ' +
                             'must be set.')
        return str(params['input_ws']) + '/' + str(params['input_file'])

    def create_report(self, token, ws, uuid_string, read_file_path):
        """
        Package the fastqc output found in read_file_path into a KBase report.

        token: auth token used for the report client
        ws: workspace the report is saved to
        uuid_string: suffix for the report object name
        read_file_path: directory containing the .zip/.html files to report;
                        an 'html' sub-directory is created in it

        Returns the value returned by KBaseReport.create_extended_report.
        """
        output_html_files = list()
        output_zip_files = list()
        first_file = ""
        html_string = ""
        html_count = 0
        with open('/kb/data/index_start.txt', 'r') as start_file:
            html_string = start_file.read()

        # Make HTML folder
        html_folder = os.path.join(read_file_path, 'html')
        os.mkdir(html_folder)
        for file_name in os.listdir(read_file_path):
            # label is everything after the first '.' of the file name
            label = ".".join(file_name.split(".")[1:])
            if file_name.endswith(".zip"):
                desc = 'Zip file generated by fastqc that contains ' + \
                       'original images seen in the report'
                output_zip_files.append({
                    'path': os.path.join(read_file_path, file_name),
                    'name': file_name,
                    'label': label,
                    'description': desc
                })
            if file_name.endswith(".html"):
                # Move html into html folder
                shutil.move(os.path.join(read_file_path, file_name),
                            os.path.join(html_folder, file_name))

                # the first page found is the one shown initially
                if first_file == "":
                    first_file = file_name

                html_string += " <button data-button=\"page " + str(html_count) + \
                               "\" data-page=\"" + file_name + "\">Page " + \
                               str(html_count + 1) + "</button>\n"
                html_count += 1

        html_string += " </div> </div> <div id=\"body\">\n"
        html_string += " <iframe id=\"content\" "
        html_string += "style=\"width: 100%; border: none; \" src=\"" + first_file + "\"></iframe>\n </div>"

        with open('/kb/data/index_end.txt', 'r') as end_file:
            html_string += end_file.read()

        with open(os.path.join(html_folder, "index.html"), 'w') as index_file:
            index_file.write(html_string)

        shock = self.dfu.file_to_shock({
            'file_path': html_folder,
            'make_handle': 0,
            'pack': 'zip'
        })
        desc = 'HTML files generated by fastqc that contains report on ' + \
               'quality of reads'
        output_html_files.append({
            'shock_id': shock['shock_id'],
            'name': 'index.html',
            'label': 'html files',
            'description': desc
        })

        report_params = {
            'direct_html_link_index': 0,
            'file_links': output_zip_files,
            'html_links': output_html_files,
            'workspace_name': ws,
            'report_object_name': 'kb_fastqc_report_' + uuid_string
        }
        kbase_report_client = KBaseReport(self.callback_url, token=token)
        output = kbase_report_client.create_extended_report(report_params)
        return output

    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.workspaceURL = config['workspace-url']
        self.scratch = os.path.abspath(config['scratch'])
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.dfu = DataFileUtil(self.callback_url)
        #END_CONSTRUCTOR
        pass

    def runFastQC(self, ctx, input_params):
        """
        :param input_params: instance of type "FastQCParams" -> structure:
           parameter "input_ws" of String, parameter "input_file" of String,
           parameter "input_file_ref" of String
        :returns: instance of type "FastQCOutput" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: reported_output
        #BEGIN runFastQC

        token = ctx['token']
        wsClient = workspaceService(self.workspaceURL, token=token)

        uuid_string = str(uuid.uuid4())
        read_file_path = os.path.join(self.scratch, uuid_string)
        os.mkdir(read_file_path)

        # try/finally so the temp reads directory is removed on failure as well
        try:
            input_file_ref = self._get_input_file_ref_from_params(input_params)

            library = None
            try:
                library = wsClient.get_objects2(
                    {'objects': [{
                        'ref': input_file_ref
                    }]})['data'][0]
            except Exception as e:
                raise ValueError(
                    'Unable to get read library object from workspace: (' +
                    input_file_ref + ')' + str(e))

            library_info = library['info']
            download_read_params = {'read_libraries': [], 'interleaved': "false"}
            if "SingleEnd" in library_info[2] or "PairedEnd" in library_info[2]:
                download_read_params['read_libraries'].append(
                    library_info[7] + "/" + library_info[1])
            elif "SampleSet" in library_info[2]:
                for sample_id in library['data']['sample_ids']:
                    if "/" in sample_id:
                        # already a full reference
                        download_read_params['read_libraries'].append(sample_id)
                    elif sample_id.isdigit():
                        # numeric object id: pair with info[6]; str() because
                        # that field may not be a string
                        download_read_params['read_libraries'].append(
                            str(library_info[6]) + "/" + sample_id)
                    else:
                        # object name: pair with info[7]
                        download_read_params['read_libraries'].append(
                            library_info[7] + "/" + sample_id)

            ru = ReadsUtils(os.environ['SDK_CALLBACK_URL'])
            ret = ru.download_reads(download_read_params)

            read_file_list = list()
            for reads_ref in ret['files']:
                obj_info = self.dfu.get_objects(
                    {'object_refs': [reads_ref]})['data'][0]['info']
                obj_name = obj_info[1]
                obj_ref_suffix = '_' + str(obj_info[6]) + '_' + str(
                    obj_info[0]) + '_' + str(obj_info[4])

                files = ret['files'][reads_ref]['files']

                fwd_name = files['fwd'].split('/')[-1]
                fwd_name = fwd_name.replace('.gz', '')
                # using object_name + ref_suffix + suffix as file name
                fwd_name = obj_name + obj_ref_suffix + '.' + fwd_name.split(
                    '.', 1)[-1]
                shutil.move(files['fwd'], os.path.join(read_file_path, fwd_name))
                read_file_list.append(os.path.join(read_file_path, fwd_name))

                if files['rev'] is not None:
                    rev_name = files['rev'].split('/')[-1]
                    rev_name = rev_name.replace('.gz', '')
                    rev_name = obj_name + obj_ref_suffix + '.' + rev_name.split(
                        '.', 1)[-1]
                    shutil.move(files['rev'],
                                os.path.join(read_file_path, rev_name))
                    read_file_list.append(os.path.join(read_file_path, rev_name))

            subprocess.check_output(["fastqc"] + read_file_list)
            # report = "Command run: "+" ".join(["fastqc"]+read_file_list)

            # 'input_ws' is absent when the caller supplied only
            # 'input_file_ref'; fall back to info[7] of the fetched object,
            # the same field used as the workspace part of references above.
            report_ws = input_params.get('input_ws', library_info[7])
            output = self.create_report(token, report_ws,
                                        uuid_string, read_file_path)
            reported_output = {
                'report_name': output['name'],
                'report_ref': output['ref']
            }
        finally:
            # Remove temp reads directory
            shutil.rmtree(read_file_path, ignore_errors=True)

        #END runFastQC

        # At some point might do deeper type checking...
        if not isinstance(reported_output, dict):
            raise ValueError('Method runFastQC return value ' +
                             'reported_output is not type dict as required.')
        # return the results
        return [reported_output]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
class ImportAssemblyUtil:
    """Import a FASTA file from the staging area as a KBase Assembly object.

    Wraps ``DataFileUtil.download_staging_file`` and
    ``AssemblyUtil.save_assembly_from_fasta`` and can build a short summary
    report for the imported object.
    """

    def __init__(self, config):
        """Create the service clients.

        config: mapping that must contain 'SDK_CALLBACK_URL' and
                'KB_AUTH_TOKEN'.
        """
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)

    def import_fasta_as_assembly_from_staging(self, params):
        '''
        import_fasta_as_assembly_from_staging: wrapper method for
                                    AssemblyUtil.save_assembly_from_fasta

        required params:
        staging_file_subdir_path - subdirectory file path
        e.g.
          for file: /data/bulk/user_name/file_name
          staging_file_subdir_path is file_name
          for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
          staging_file_subdir_path is subdir_1/subdir_2/file_name
        assembly_name - output Assembly file name
        workspace_name - the name of the workspace it gets saved to.

        return:
        obj_ref: return object reference

        raises:
        ValueError if a required parameter is missing.
        '''
        log('--->\nrunning ImportAssemblyUtil.import_fasta_as_assembly_from_staging\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_fasta_as_assembly_from_staging(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')

        # Work on a shallow copy so the caller's dict is not silently
        # extended with the internal 'file' entry.
        import_assembly_params = dict(params)
        import_assembly_params['file'] = {'path': scratch_file_path}

        ref = self.au.save_assembly_from_fasta(import_assembly_params)

        return {'obj_ref': ref}

    def validate_import_fasta_as_assembly_from_staging(self, params):
        """
        validate_import_fasta_as_assembly_from_staging:
                    validates params passed to
                    import_fasta_as_assembly_from_staging method

        raises:
        ValueError naming the first required parameter that is absent.
        """
        # check for required parameters
        required = ('staging_file_subdir_path', 'workspace_name',
                    'assembly_name')
        for p in required:
            if p not in params:
                raise ValueError('"' + p +
                                 '" parameter is required, but missing')

    def generate_report(self, obj_ref, params):
        """
        generate_report: generate summary report

        obj_ref: generated workspace object references.
                 (return of import_fasta_as_assembly_from_staging)
        params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        workspace_name: workspace name/ID that reads will be stored to

        return:
        dict with 'report_name' and 'report_ref'
        """
        # Local import: numbers.Integral matches int on Python 3 and both
        # int and long on Python 2, replacing the Python 2-only ``long``.
        import numbers

        uuid_string = str(uuid.uuid4())

        get_objects_params = {'object_refs': [obj_ref], 'ignore_errors': False}
        object_data = self.dfu.get_objects(get_objects_params)
        assembly_object = object_data.get('data')[0]
        assembly_data = assembly_object.get('data')

        base_count = assembly_data.get('base_counts')
        dna_size = assembly_data.get('dna_size')

        upload_message = 'Import Finished\n'
        upload_message += "Assembly Object Name: "
        upload_message += str(assembly_object.get('info')[1]) + '\n'
        upload_message += 'Imported Fasta File: {}\n'.format(
            params.get('staging_file_subdir_path'))

        if isinstance(dna_size, numbers.Integral):
            upload_message += 'DNA Size: {:,}\n'.format(dna_size)

        if isinstance(base_count, dict):
            # [2:-2] strips the enclosing "{\n" and "\n}" of the JSON dump.
            upload_message += 'Base Count:\n{}\n'.format(
                json.dumps(base_count, indent=1)[2:-2])

        report_params = {
            'message': upload_message,
            'workspace_name': params.get('workspace_name'),
            'report_object_name': 'kb_upload_mothods_report_' + uuid_string
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        return {'report_name': output['name'], 'report_ref': output['ref']}
def find_motifs(self, ctx, params):
    """
    Run MEME on a promoter FASTA file, upload the resulting motif set and
    build an HTML report for it.

    :param params: instance of type "find_motifs_params" (Genome is a
       KBase genome Featureset is a KBase featureset Promoter_length is
       the length of promoter requested for all genes) -> structure:
       parameter "workspace_name" of String, parameter "fastapath" of
       String, parameter "motif_min_length" of Long, parameter
       "motif_max_length" of Long
    :returns: instance of type "extract_output_params" -> structure:
       parameter "report_name" of String, parameter "report_ref" of String
    :raises ValueError: if the HTML report directory cannot be uploaded
       to shock.
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN find_motifs

    # TODO: use MotifUtils to parse output and create the object, and
    # move report creation into its own function.

    # Motif length bounds default to 8..16 when the caller omits them.
    motMin = params.get('motif_min_length', 8)
    motMax = params.get('motif_max_length', 16)

    promoterFastaFilePath = params['fastapath']

    MEMEMotifCommand = MEU.build_meme_command(promoterFastaFilePath)
    MEU.run_meme_command(MEMEMotifCommand)

    meme_out_path = '/kb/module/work/tmp/meme_out/meme.txt'
    meme_params = {
        'ws_name': params['workspace_name'],
        'path': meme_out_path,
        'obj_name': params['obj_name']
    }

    MOU = MotifUtils(self.callback_url)
    # One DataFileUtil client is enough for the whole method.
    dfu = DataFileUtil(self.callback_url)

    locDict = {}
    if 'SS_ref' in params:
        get_ss_params = {'object_refs': [params['SS_ref']]}
        SS = dfu.get_objects(get_ss_params)['data'][0]['data']
        for s in SS['sequences']:
            if s['source'] is not None:
                # NOTE(review): the key is the literal string
                # 'sequence_id', so every sequence overwrites the same
                # entry; this presumably should be keyed per sequence
                # (e.g. s['sequence_id']) -- confirm against what
                # MotifUtils.UploadFromMEME expects before changing.
                locDict['sequence_id'] = {
                    'contig': s['source']['location'][0][0],
                    'start': str(s['source']['location'][0][1])
                }

    if locDict:
        meme_params['absolute_locations'] = locDict
    meme_params['min_len'] = motMin
    meme_params['max_len'] = motMax

    obj_ref = MOU.UploadFromMEME(meme_params)['obj_ref']

    # Millisecond timestamp keeps the per-run HTML directory name unique.
    timestamp = int(
        (datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds()
        * 1000)
    timestamp = str(timestamp)
    htmlDir = self.shared_folder + '/html' + timestamp
    os.mkdir(htmlDir)

    get_obj_params = {'object_refs': [obj_ref]}
    memeMotifSet = dfu.get_objects(get_obj_params)['data'][0]['data']
    # TODO: replace MakeReport with a call to the motif set utilities.
    MakeReport(htmlDir, memeMotifSet)

    try:
        html_upload_ret = dfu.file_to_shock({
            'file_path': htmlDir,
            'make_handle': 0,
            'pack': 'zip'
        })
    except Exception as e:
        # Catch Exception (not a bare except) so KeyboardInterrupt /
        # SystemExit still propagate, and keep the underlying cause.
        raise ValueError('error uploading HTML file to shock: ' + str(e))

    reportName = 'MEMEMotifFinder_report_' + str(uuid.uuid4())

    reportObj = {
        'objects_created': [{
            'ref': obj_ref,
            'description': 'Motif Set generated by MEME'
        }],
        'message': '',
        'direct_html': '',
        'direct_html_link_index': 0,
        'file_links': [],
        'html_links': [{
            'shock_id': html_upload_ret['shock_id'],
            'name': 'index.html',
            'label': 'Save promoter_download.zip'
        }],
        'html_window_height': 220,
        'workspace_name': params['workspace_name'],
        'report_object_name': reportName
    }

    report = KBaseReport(self.callback_url, token=ctx['token'])
    report_info = report.create_extended_report(reportObj)
    output = {
        'report_name': report_info['name'],
        'report_ref': report_info['ref']
    }

    #END find_motifs

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method find_motifs return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
class ProkkaAnnotationTest(unittest.TestCase):
    """Integration tests for the kb_prokka ``annotate`` method.

    The tests talk to live services: the deployment config is read from
    the file named by KB_DEPLOYMENT_CONFIG and the auth token from
    KB_AUTH_TOKEN. Methods prefixed with ``X`` are not collected by the
    test runner.
    """

    @classmethod
    def setUpClass(cls):
        config_path = environ.get("KB_DEPLOYMENT_CONFIG", None)
        cls.cfg = {}
        parser = ConfigParser()
        parser.read(config_path)
        for key, value in parser.items("kb_prokka"):
            cls.cfg[key] = value

        # Token validation
        token = environ.get("KB_AUTH_TOKEN", None)
        auth_url = cls.cfg.get(
            "auth-service-url",
            "https://kbase.us/services/authorization/Sessions/Login")
        user_id = _KBaseAuth(auth_url).get_user(token)

        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        provenance_entry = {
            "service": "ProkkaAnnotation",
            "method": "please_never_use_it_in_production",
            "method_params": []
        }
        cls.ctx.update({
            "token": token,
            "user_id": user_id,
            "provenance": [provenance_entry],
            "authenticated": 1
        })

        cls.wsURL = cls.cfg["workspace-url"]
        cls.wsClient = workspaceService(cls.wsURL, token=token)
        cls.serviceImpl = kb_prokka(cls.cfg)

    @classmethod
    def tearDownClass(cls):
        # Only clean up when a test actually created the workspace.
        if not hasattr(cls, "wsName"):
            return
        cls.wsClient.delete_workspace({"workspace": cls.wsName})
        print("Test workspace was deleted")

    def getWsClient(self):
        return self.__class__.wsClient

    def getWsName(self):
        """Return the shared test workspace name, creating it on first use."""
        test_cls = self.__class__
        if not hasattr(test_cls, "wsName"):
            ws_name = "test_ProkkaAnnotation_" + str(int(time.time() * 1000))
            self.getWsClient().create_workspace({"workspace": ws_name})  # noqa
            test_cls.wsName = ws_name
        return test_cls.wsName

    def getImpl(self):
        return self.__class__.serviceImpl

    def getContext(self):
        return self.__class__.ctx

    def _run_annotation(self, genome_ref, genome_name):
        """Call ``annotate`` with the fixed test parameters; return its first result."""
        return self.getImpl().annotate(
            self.getContext(),
            {
                "object_ref": genome_ref,
                "output_workspace": self.getWsName(),
                "output_genome_name": genome_name,
                "evalue": None,
                "fast": 0,
                "gcode": 0,
                "genus": "genus",
                "kingdom": "Bacteria",
                "metagenome": 0,
                "mincontiglen": 1,
                "norrna": 0,
                "notrna": 0,
                "rawproduct": 0,
                "rfam": 1,
                "scientific_name": "RhodoBacter"
            })[0]

    def _save_genome(self, genome_name, genome_data):
        """Save ``genome_data`` into the test workspace and return the object info."""
        return self.gfu.save_one_genome({
            "workspace": self.getWsName(),
            "name": genome_name,
            "data": genome_data,
            "provenance": self.ctx.provenance()
        })["info"]

    # TODO: testGenomeOntologyEventsField -- test with ontology events
    # and test without.

    def Xtest_modify_old_genome(self):
        self.callback_url = os.environ["SDK_CALLBACK_URL"]
        self.gfu = GenomeFileUtil(self.callback_url)
        self.dfu = DataFileUtil(self.callback_url)

        old_genome = "30045/15/1"
        new_genome = "30045/14/1"
        genome_name = 'OldRhodo'

        genome_data_old = self.dfu.get_objects(
            {"object_refs": [old_genome]})["data"][0]
        genome_data_new = self.dfu.get_objects(
            {"object_refs": [new_genome]})["data"][0]

        # Two placeholder SSO terms attached to every feature.
        sso_terms = {}
        for key, label in (('SSO1', "1"), ('SSO2', "2")):
            sso_terms[key] = {
                "id": label,
                "evidence": [],
                "term_name": label,
                "ontology_ref": label,
                "term_lineage": []
            }

        print("ABOUT TO MODIFY OLD GENOME")
        for feature in genome_data_old['data']['features']:
            feature['ontology_terms'] = {"SSO": sso_terms}

        print("ABOUT TO MODIFY NEW GENOME")
        for feature in genome_data_new['data']['features']:
            feature['ontology_terms'] = {"SSO": sso_terms}

        print("ABOUT TO SAVE OLD GENOME")
        self._save_genome(genome_name, genome_data_old["data"])

        print("ABOUT TO SAVE NEW GENOME")
        self._save_genome(genome_name, genome_data_new["data"])

    def test_reannotate_RICKETS(self):
        # Earlier runs used '31932/5/1' and '32038/3/2'; only the last
        # assigned reference was ever in effect.
        genome_ref = '32132/5/1'
        genome_name = 'Aceti'

        self.callback_url = os.environ["SDK_CALLBACK_URL"]
        self.dfu = DataFileUtil(self.callback_url)

        result = self._run_annotation(genome_ref, genome_name)

        fetched = self.dfu.get_objects(
            {"object_refs": [result['output_genome_ref']]})
        genome_data = fetched["data"][0]['data']

        scratch = "/kb/module/work/tmp/"
        with open(scratch + 'OUTPUT_GENOME.txt', 'w+') as outfile:
            json.dump(genome_data, outfile)

    def Xtest_reannotate_new_genome(self):
        self._run_annotation('30045/14/1', 'NewRhodo')

    def Xtest_reannotate_old_genome(self):
        self._run_annotation('30045/15/1', 'OldRhodo')
class kb_plant_rast:
    '''
    Module Name:
    kb_plant_rast

    Module Description:
    A KBase module: kb_plant_rast
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = "https://github.com/kbaseapps/kb_plant_rast"
    GIT_COMMIT_HASH = "a652c0120abf90e97d0f0214f8ed4174f27b9a09"

    #BEGIN_CLASS_HEADER
    # A function needs strictly more than this many kmer hits to be kept.
    KMER_THRESHOLD = 1
    # Length of the signature kmers scanned for in each protein.
    KMER_LENGTH = 8
    # Proteins shorter than this are counted as 'short' and skipped.
    MIN_PROTEIN_LENGTH = 10
    # Tab-delimited table: function <TAB> comma-space separated kmers.
    KMERS_FILE = '/data/functions_kmers.txt'

    @staticmethod
    def _load_kmer_functions(path):
        """Parse the kmer table at *path*.

        Returns (functions, kmers_functions): the set of function names and
        a dict mapping each kmer to its function. Blank lines are skipped;
        the file is closed on exit.
        """
        functions = set()
        kmers_functions = dict()
        with open(path) as handle:
            for line in handle:
                line = line.strip()
                if not line:
                    continue
                (function_string, kmers_string) = line.split('\t')
                functions.add(function_string)
                for kmer in kmers_string.split(', '):
                    kmers_functions[kmer] = function_string
        return functions, kmers_functions

    @staticmethod
    def _count_kmer_hits(features, kmers_functions, kmer_length, min_length):
        """Count signature-kmer hits per feature.

        Returns (hit_proteins, hit_kmers, short) where hit_proteins maps
        feature id -> {function: hit count}, hit_kmers is the set of
        distinct kmers seen, and short counts features with no (or too
        short a) 'protein_translation'.
        """
        hit_proteins = dict()
        hit_kmers = set()
        short = 0
        for ftr in features:
            seq = ftr.get('protein_translation')
            if seq is None or len(seq) < min_length:
                short += 1
                continue
            for start in range(len(seq) - kmer_length + 1):
                kmer = seq[start:start + kmer_length]
                function = kmers_functions.get(kmer)
                if function is None:
                    continue
                counts = hit_proteins.setdefault(ftr['id'], dict())
                counts[function] = counts.get(function, 0) + 1
                hit_kmers.add(kmer)
        return hit_proteins, hit_kmers, short

    @staticmethod
    def _resolve_hits(hit_proteins, threshold):
        """Pick at most one function per feature.

        Functions with <= *threshold* hits are dropped. A feature left with
        no function is counted in 'few'; a feature whose top hit count is
        shared by several functions is counted in 'ambiguous'. Returns
        (assignments, few, ambiguous) with assignments mapping feature id
        -> winning function.
        """
        assignments = dict()
        few = 0
        ambiguous = 0
        for ftr_id, counts in hit_proteins.items():
            kept = {fn: n for fn, n in counts.items() if n > threshold}
            if not kept:
                few += 1
                continue
            top_number = max(kept.values())
            top_functions = [fn for fn, n in kept.items() if n == top_number]
            if len(top_functions) > 1:
                ambiguous += 1
                continue
            assignments[ftr_id] = top_functions[0]
        return assignments, few, ambiguous

    @staticmethod
    def _build_report_html(stats):
        """Render the HTML summary from the counters in *stats*.

        Reads 'ftrs', 'kmers', 'hit_kmers', 'hit_fns', 'hit_ftrs' and 'fns'.
        An empty function table yields 0% instead of dividing by zero.
        """
        total_fns = stats['fns']
        if total_fns:
            fraction_plantseed = (float(stats['hit_fns']) /
                                  float(total_fns)) * 100.0
        else:
            fraction_plantseed = 0.0
        return "".join([
            "<html><head><title>KBase Plant Rast Report</title></head><body>",
            "<p>The Plant Rast app has finished running. ",
            str(stats['ftrs']), " protein sequences were scanned for ",
            str(stats['kmers']), " signature kmers.</p>",
            "<p>The app found ", str(stats['hit_kmers']),
            " signature kmers and was able to predict ",
            str(stats['hit_fns']), " enzymatic functions for ",
            str(stats['hit_ftrs']), " protein sequences.</p>",
            "<p>This result indicates that, for this set of protein "
            "sequences, the app detected {0:.0f}%".format(fraction_plantseed),
            " of the enzymatic functions of plant primary metabolism that "
            "were curated as part of the PlantSEED project.</p></body></html>",
        ])
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.workspaceURL = config['workspace-url']
        self.token = os.environ['KB_AUTH_TOKEN']
        self.scratch = os.path.abspath(config['scratch'])
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.dfu = DataFileUtil(self.callback_url)
        self.gfu = GenomeFileUtil(self.callback_url)
        #END_CONSTRUCTOR
        pass

    def annotate_plant_transcripts(self, ctx, input):
        """
        Annotate the proteins of a plant genome with functions derived from
        signature kmers, save the updated genome and create a report.

        :param input: instance of type "AnnotatePlantTranscriptsParams" ->
           structure: parameter "input_ws" of String, parameter
           "input_genome" of String, parameter "output_genome" of String
        :returns: instance of type "AnnotatePlantTranscriptsResults" ->
           structure: parameter "report_name" of String, parameter
           "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN annotate_plant_transcripts

        # Retrieve plant genome
        genome_ref = input['input_ws'] + '/' + input['input_genome']
        plant_genome = self.dfu.get_objects(
            {'object_refs': [genome_ref]})['data'][0]
        genome_data = plant_genome['data']

        # Force upgrade
        genome_data.pop('feature_counts', None)

        # Prefer CDSs; fall back to features when there are none.
        use_cds = True
        features = genome_data.get('cdss') or []
        if not features:
            features = genome_data.get('features') or []
            use_cds = False

        if not features:
            raise Exception(
                "The genome does not contain any CDSs or features!")

        output = {'ftrs': len(features)}

        # Retrieve kmers
        functions, kmers_functions = self._load_kmer_functions(
            self.KMERS_FILE)
        output['fns'] = len(functions)
        output['kmers'] = len(kmers_functions)

        hit_proteins, hit_kmers, short = self._count_kmer_hits(
            features, kmers_functions, self.KMER_LENGTH,
            self.MIN_PROTEIN_LENGTH)
        output['short'] = short
        output['hit_kmers'] = len(hit_kmers)

        # Drop weakly supported functions and ambiguous features.
        assignments, few, ambiguous = self._resolve_hits(
            hit_proteins, self.KMER_THRESHOLD)
        output['few'] = few
        output['ambiguous'] = ambiguous
        output['hit_ftrs'] = len(assignments)
        output['hit_fns'] = len(set(assignments.values()))

        # Now, re-populate feature functions, and save genome object.
        # When annotating CDSs the parent feature gets the same function.
        parent_feature_index = dict()
        if use_cds:
            parent_feature_index = dict(
                (f['id'], i)
                for i, f in enumerate(genome_data.get('features') or []))

        for ftr in features:
            new_function = assignments.get(ftr['id'])
            if new_function is None:
                continue
            ftr['function'] = new_function
            if use_cds:
                # A CDS without a resolvable parent is annotated on its own.
                parent_idx = parent_feature_index.get(ftr.get('parent_gene'))
                if parent_idx is not None:
                    genome_data['features'][parent_idx]['function'] = \
                        new_function

        # Overwrite the input genome unless an output name was given.
        output_genome = input.get('output_genome', input['input_genome'])

        save_result = self.gfu.save_one_genome({
            'workspace': input['input_ws'],
            'name': output_genome,
            'data': genome_data,
            'upgrade': 1
        })

        html_string = self._build_report_html(output)

        info = save_result['info']
        saved_genome = "{}/{}/{}".format(info[6], info[0], info[4])
        description = ("Plant genome " + genome_data['id'] +
                       " annotated with metabolic functions")
        uuid_string = str(uuid.uuid4())
        report_params = {
            'objects_created': [{"ref": saved_genome,
                                 "description": description}],
            'direct_html': html_string,
            'workspace_name': input['input_ws'],
            'report_object_name': 'kb_plant_rast_report_' + uuid_string
        }
        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        report_client_output = kbase_report_client.create_extended_report(
            report_params)

        output['report_name'] = report_client_output['name']
        output['report_ref'] = report_client_output['ref']

        #END annotate_plant_transcripts

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method annotate_plant_transcripts return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {'state': "OK",
                     'message': "",
                     'version': self.VERSION,
                     'git_url': self.GIT_URL,
                     'git_commit_hash': self.GIT_COMMIT_HASH}
        #END_STATUS
        return [returnVal]
def assembly_metadata_report(self, ctx, params):
    """
    Write a plain-text summary of an Assembly object's metadata and wrap
    it in a KBase report.

    :param params: instance of type "AssemblyMetadataReportParams" ->
       structure: parameter "assembly_input_ref" of type "assembly_ref",
       parameter "workspace_name" of String, parameter "showContigs" of
       type "boolean" (A boolean. 0 = false, other = true.)
    :returns: instance of type "AssemblyMetadataResults" -> structure:
       parameter "report_name" of String, parameter "report_ref" of String
    :raises ValueError: if a required parameter is missing or showContigs
       is not 0 or 1.
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN assembly_metadata_report

    token = ctx['token']
    uuid_string = str(uuid.uuid4())
    write_file_path = os.path.join(self.scratch, uuid_string)

    # Print statements to stdout/stderr are captured and available as the App log
    print('Starting Assembly MetaData Report Function. Params=')
    pprint(params)

    # Step 1 - Parse/examine the parameters and catch any errors.
    # Narrative Apps validate basic types, but the method can also be
    # called directly, so the checks are repeated here.
    print('Validating parameters.')
    for required in ('workspace_name', 'assembly_input_ref', 'showContigs'):
        if required not in params:
            raise ValueError('Parameter ' + required +
                             ' is not set in input arguments')
    workspace_name = params['workspace_name']
    assembly_input_ref = params['assembly_input_ref']
    showContigs_orig = params['showContigs']

    try:
        showContigs = int(showContigs_orig)
    except (ValueError, TypeError):
        # TypeError covers None and other non-numeric objects.
        raise ValueError(
            'Cannot parse integer from showContigs parameter (' +
            str(showContigs_orig) + ')')
    if showContigs < 0:
        raise ValueError('showContigs parameter cannot be negative (' +
                         str(showContigs) + ')')
    if showContigs > 1:
        raise ValueError(
            'showContigs parameter cannot be greater than one (' +
            str(showContigs) + ')')

    # Step 2 - Fetch the Assembly object; only its metadata is used, no
    # FASTA file is downloaded.
    print('Retrieving Assembly object metadata.')
    data_file_cli = DataFileUtil(self.callback_url)
    assembly = data_file_cli.get_objects(
        {'object_refs': [assembly_input_ref]})
    assembly_metadata = assembly['data'][0]['data']

    # Step 3 - Build the report text.
    pieces = ["\nAssembly Metadata\n"]
    summary_fields = [
        'assembly_id', 'dna_size', 'gc_content', 'num_contigs',
        'fasta_handle_ref', 'md5', 'type', 'taxon_ref'
    ]
    for field in summary_fields:
        if field in assembly_metadata:
            pieces.append("\t{:20} = {}".format(
                field, assembly_metadata[field]) + "\n")

    handle_info = assembly_metadata.get('fasta_handle_info')
    if handle_info is not None and 'node_file_name' in handle_info:
        pieces.append("\tfilename             = " +
                      handle_info['node_file_name'] + "\n")

    pieces.append("BASE counts\n")
    base_counts = assembly_metadata.get('base_counts') or {}
    for base in base_counts:
        pieces.append("\t{:5} = {}".format(base, str(base_counts[base])) +
                      "\n")

    # The per-contig table is only emitted when the caller asked for it.
    if showContigs and 'contigs' in assembly_metadata:
        pieces.append("\nName\tLength\tGC content\tContigID\tDescription\n")
        contigs = assembly_metadata['contigs']
        contig_fields = ['length', 'gc_content', 'contig_id', 'description']
        for ctg in contigs:
            pieces.append(ctg)
            for field in contig_fields:
                if field in contigs[ctg]:
                    pieces.append("\t{}".format(contigs[ctg][field]))
                else:
                    pieces.append("\t")
            pieces.append("\n")
    report_text = "".join(pieces)

    # Step 4 - Write the text into a fresh per-run directory; it must
    # exist before the file can be opened.
    if not os.path.isdir(write_file_path):
        os.makedirs(write_file_path)
    report_path = os.path.join(write_file_path,
                               'assembly_metadata_report.txt')
    with open(report_path, "w") as report_txt:
        report_txt.write(report_text)

    print(report_text)

    # Step 5 - Build a Report and return the documented result structure.
    report_info = self.create_report(token, workspace_name, uuid_string,
                                     write_file_path)
    output = {
        'report_name': report_info['name'],
        'report_ref': report_info['ref']
    }

    print('returning: ' + pformat(output))
    #END assembly_metadata_report

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method assembly_metadata_report return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
class staging_downloader:
    """
    Export the large files behind a workspace object to the staging area.

    Each ``_download_*`` helper fetches the files for one object type into a
    fresh scratch sub-directory and returns that directory;
    ``export_to_staging`` then copies its content into the staging area.
    """

    # staging file prefix
    STAGING_GLOBAL_FILE_PREFIX = '/data/bulk/'
    STAGING_USER_FILE_PREFIX = '/staging/'

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path

        Does nothing for an empty path and tolerates a directory that
        already exists; any other OSError is re-raised.
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _get_staging_file_prefix(self, token_user):
        """
        _get_staging_file_prefix: return staging area file path prefix

        directory pattern:
            preferred to return user specific path: /staging/
            if this path is not visible to user, use global bulk path:
            /data/bulk/user_name/
        """
        if os.path.exists(self.STAGING_USER_FILE_PREFIX):
            return self.STAGING_USER_FILE_PREFIX
        else:
            return os.path.join(self.STAGING_GLOBAL_FILE_PREFIX, token_user)

    def _validate_export_params(self, params):
        """
        validates params passed to export_to_staging

        raises ValueError when a required parameter is missing
        """
        log('start validating export_to_staging params')

        # check for required parameters
        for p in ['input_ref', 'workspace_name', 'destination_dir']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def _generate_export_report(self, file_names, obj_name, workspace_name):
        """
        create a KBaseReport listing the exported files

        returns a dict with the 'report_name' and 'report_ref' of the report
        """
        log('start creating report')

        msg = 'Successfully exported object [{}] to staging area\n\n'.format(
            obj_name)
        msg += 'Exported files:\n' + '\n'.join(file_names)

        report_params = {
            'message': msg,
            'workspace_name': workspace_name,
            'report_object_name': 'staging_exporter_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    @staticmethod
    def _export_file_name(obj_name, obj_ref, source_file_name):
        """
        build the exported file name: <obj_name>_<ref with '/' as '_'>.<ext>

        <ext> is everything after the FIRST dot of source_file_name, so a
        compound extension such as 'sorted.bam' is kept whole.  A source
        name without any dot yields a name without extension instead of
        failing.
        """
        base_name = obj_name + '_' + obj_ref.replace('/', '_')
        _, dot, extension = source_file_name.partition('.')
        if not dot:
            return base_name
        return base_name + '.' + extension

    def _relocate_download(self, source_path, result_dir, obj_name, obj_ref):
        """
        move source_path into result_dir and give it its export file name

        returns the new file name (relative to result_dir)
        """
        source_file_name = os.path.basename(source_path)
        shutil.move(source_path, result_dir)

        new_file_name = self._export_file_name(obj_name, obj_ref,
                                               source_file_name)
        os.rename(os.path.join(result_dir, source_file_name),
                  os.path.join(result_dir, new_file_name))
        return new_file_name

    def _download_reads(self, reads_ref, reads_name):
        """
        download Reads as FASTQ

        the forward (and, when present, reverse) FASTQ files are packed
        into a single zip archive inside the returned directory
        """
        log('start downloading Reads file')

        download_params = {'read_libraries': [reads_ref]}
        download_ret = self.ru.download_reads(download_params)

        files = download_ret['files'][reads_ref]['files']

        # create the output directory and move the file there
        result_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_dir)

        fwd = files['fwd']
        rev = files.get('rev')

        result_zip_name = reads_name + '_' + reads_ref.replace(
            '/', '_') + '.FASTQ.zip'
        result_zip = os.path.join(result_dir, result_zip_name)

        with ZipFile(result_zip, 'w', ZIP_DEFLATED) as zipObj2:
            zipObj2.write(fwd, os.path.basename(fwd))
            if rev:
                zipObj2.write(rev, os.path.basename(rev))

        log('downloaded files:\n' + str(os.listdir(result_dir)))

        return result_dir

    def _download_assembly(self, assembly_ref, assembly_name):
        """
        download Assembly as FASTA
        """
        log('start downloading Assembly file')

        file_name = assembly_name + '_' + assembly_ref.replace('/', '_') + '.fa'

        download_params = {'ref': assembly_ref, 'filename': file_name}
        download_ret = self.au.get_assembly_as_fasta(download_params)

        # create the output directory and move the file there
        result_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_dir)
        shutil.move(download_ret.get('path'), result_dir)

        log('downloaded files:\n' + str(os.listdir(result_dir)))

        return result_dir

    def _download_alignment(self, alignment_ref, alignment_name,
                            export_alignment):
        """
        download Alignment as BAM or SAM

        export_alignment: dict of flags 'export_alignment_bam' and/or
                          'export_alignment_sam'; BAM is the default when
                          the dict is empty or None
        """
        log('start downloading Alignment file')

        if not export_alignment:
            log('start downloading BAM as default')
            export_alignment = {'export_alignment_bam': 1}

        # create the output directory and move the file there
        result_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_dir)

        # one download request per selected format, BAM first
        download_requests = []
        if export_alignment.get('export_alignment_bam'):
            download_requests.append({
                'source_ref': alignment_ref,
                'downloadBAI': True
            })
        if export_alignment.get('export_alignment_sam'):
            download_requests.append({
                'source_ref': alignment_ref,
                'downloadBAI': True,
                'downloadSAM': True
            })

        for download_params in download_requests:
            download_ret = self.rau.download_alignment(download_params)

            destination_dir = download_ret.get('destination_dir')

            for filename in os.listdir(destination_dir):
                new_file_name = self._export_file_name(
                    alignment_name, alignment_ref, filename)
                os.rename(os.path.join(destination_dir, filename),
                          os.path.join(destination_dir, new_file_name))

                shutil.copy2(os.path.join(destination_dir, new_file_name),
                             result_dir)

        log('downloaded files:\n' + str(os.listdir(result_dir)))

        return result_dir

    def _download_metagenome(self, metagenome_ref, metagenome_name):
        """
        download Annotated Metagenome Assembly as GFF
        """
        log("start downloading Annotated Metagenome Assembly files")
        result_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_dir)

        download_ret = self.gfu.metagenome_to_gff(
            {'metagenome_ref': metagenome_ref})
        gff_file = download_ret.get('file_path')

        self._relocate_download(gff_file, result_dir, metagenome_name,
                                metagenome_ref)

        return result_dir

    def _download_genome(self, genome_ref, genome_name, export_genome):
        """
        download Genome as GENBANK or GFF

        export_genome: dict of flags 'export_genome_genbank' and/or
                       'export_genome_gff'; GENBANK is the default when the
                       dict is empty or None
        """
        log('start downloading Genome file')

        if not export_genome:
            log('start downloading GENBANK as default')
            export_genome = {'export_genome_genbank': 1}

        # create the output directory and move the file there
        result_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_dir)

        if export_genome.get('export_genome_genbank'):
            download_params = {'genome_ref': genome_ref}
            download_ret = self.gfu.genome_to_genbank(download_params)

            genbank_file = download_ret.get('genbank_file').get('file_path')
            self._relocate_download(genbank_file, result_dir, genome_name,
                                    genome_ref)

        if export_genome.get('export_genome_gff'):
            download_params = {'genome_ref': genome_ref}
            download_ret = self.gfu.genome_to_gff(download_params)

            gff_file = download_ret.get('file_path')
            self._relocate_download(gff_file, result_dir, genome_name,
                                    genome_ref)

        log('downloaded files:\n' + str(os.listdir(result_dir)))

        return result_dir

    def __init__(self, config):
        """
        read service URLs, token and scratch path from config and create
        the service clients used by the download helpers
        """
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.scratch = config['scratch']

        self.dfu = DataFileUtil(self.callback_url)
        self.ru = ReadsUtils(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.gfu = GenomeFileUtil(self.callback_url)
        self.rau = ReadsAlignmentUtils(self.callback_url)

    def export_to_staging(self, ctx, params):
        """
        export large file associated with workspace object to staging area

        params:
        input_ref: workspace object reference
        workspace_name: workspace name objects to be saved to
        destination_dir: destination directory for downloaded files

        optional:
        generate_report: indicator for generating workspace report.
                         (default False)

        return:
        result_dir: scratch directory holding the downloaded files
        report_name / report_ref: only when generate_report is set

        raises ValueError for a missing parameter, an unsupported object
        type, or when copied files are missing from the staging directory
        """
        self._validate_export_params(params)

        input_ref = params.get('input_ref')
        workspace_name = params.get('workspace_name')
        destination_dir = params.get('destination_dir')
        generate_report = params.get('generate_report', False)

        obj_source = self.dfu.get_objects({"object_refs":
                                           [input_ref]})['data'][0]

        obj_info = obj_source.get('info')
        # type string is '<Module>.<Type>-<version>'; drop the version part
        obj_type = obj_info[2].split('-')[0]
        obj_name = obj_info[1]

        if obj_type in [
                'KBaseFile.PairedEndLibrary', 'KBaseFile.SingleEndLibrary'
        ]:
            result_dir = self._download_reads(input_ref, obj_name)
        elif obj_type in ['KBaseGenomeAnnotations.Assembly']:
            result_dir = self._download_assembly(input_ref, obj_name)
        elif obj_type in ['KBaseRNASeq.RNASeqAlignment']:
            result_dir = self._download_alignment(
                input_ref, obj_name, params.get('export_alignment'))
        elif obj_type in ['KBaseGenomes.Genome']:
            result_dir = self._download_genome(input_ref, obj_name,
                                               params.get('export_genome'))
        elif obj_type in ['KBaseMetagenomes.AnnotatedMetagenomeAssembly']:
            result_dir = self._download_metagenome(input_ref, obj_name)
        else:
            raise ValueError('Unexpected data type')

        staging_dir_prefix = self._get_staging_file_prefix(ctx['user_id'])
        # NOTE(review): os.path.join discards staging_dir_prefix when
        # destination_dir is an absolute path - confirm callers only pass
        # relative sub-directories.
        staging_dir = os.path.join(staging_dir_prefix, destination_dir)
        self._mkdir_p(staging_dir)

        files = os.listdir(result_dir)
        for file_name in files:
            shutil.copy2(os.path.join(result_dir, file_name), staging_dir)

        # every downloaded file must now be present in the staging directory
        if not (set(os.listdir(staging_dir)) >= set(files)):
            raise ValueError('Unexpected error occurred during copying files')

        returnVal = dict()
        returnVal['result_dir'] = result_dir

        if generate_report:
            report_output = self._generate_export_report(
                files, obj_name, workspace_name)
            returnVal.update(report_output)

        return returnVal
class ReadsAlignmentUtils:
    '''
    Module Name:
    ReadsAlignmentUtils

    Module Description:
    A KBase module: ReadsAlignmentUtils

    This module is intended for use by Aligners and Assemblers to upload and
    download alignment files. The alignment may be uploaded as a sam or bam
    file. If a sam file is given, it is converted to the sorted bam format
    and saved. Upon downloading, optional parameters may be provided to get
    files in sam and bai formats from the downloaded bam file. This utility
    also generates stats from the stored alignment.
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = "https://github.com/kbaseapps/ReadsAlignmentUtils.git"
    GIT_COMMIT_HASH = "a807d122b097a4c6713a81d5a82eef335835f77a"

    #BEGIN_CLASS_HEADER

    PARAM_IN_FILE = 'file_path'
    PARAM_IN_SRC_REF = 'source_ref'
    PARAM_IN_DST_REF = 'destination_ref'
    PARAM_IN_CONDITION = 'condition'
    PARAM_IN_READ_LIB_REF = 'read_library_ref'
    PARAM_IN_ASM_GEN_REF = 'assembly_or_genome_ref'

    PARAM_IN_ALIGNED_USING = 'aligned_using'
    PARAM_IN_ALIGNER_VER = 'aligner_version'
    PARAM_IN_ALIGNER_OPTS = 'aligner_opts'
    PARAM_IN_REPLICATE_ID = 'replicate_id'
    PARAM_IN_PLATFORM = 'platform'
    PARAM_IN_BOWTIE2_INDEX = 'bowtie2_index'
    PARAM_IN_SAMPLESET_REF = 'sampleset_ref'
    PARAM_IN_MAPPED_SAMPLE_ID = 'mapped_sample_id'

    PARAM_IN_DOWNLOAD_SAM = 'downloadSAM'
    PARAM_IN_DOWNLOAD_BAI = 'downloadBAI'
    PARAM_IN_VALIDATE = 'validate'

    INVALID_WS_OBJ_NAME_RE = re.compile('[^\\w\\|._-]')
    INVALID_WS_NAME_RE = re.compile('[^\\w:._-]')

    def _get_file_path_info(self, file_path):
        """
        Given a file path, returns the directory, file name, file base and
        file extension
        """
        dir_name, file_name = os.path.split(file_path)
        file_base, file_ext = os.path.splitext(file_name)

        return dir_name, file_name, file_base, file_ext

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path

        Does nothing for an empty path and tolerates a directory that
        already exists; any other OSError is re-raised.
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _check_required_param(self, in_params, param_list):
        """
        Checks if each of the params in the list are in the input params

        A param that is present but falsy (empty string, None, 0) counts as
        missing. Raises ValueError for the first missing param.
        """
        for param in param_list:
            if (param not in in_params or not in_params[param]):
                raise ValueError('{} parameter is required'.format(param))

    @staticmethod
    def _export_to_download_params(params):
        """
        Translates export_alignment params into download_alignment params by
        replacing 'export' with 'download' in every key (exportSAM ->
        downloadSAM, exportBAI -> downloadBAI); values are passed through.
        """
        # items() rather than the Python 2 only iteritems()
        return {key.replace('export', 'download'): val
                for key, val in params.items()}

    def _proc_ws_obj_params(self, ctx, params):
        """
        Checks the validity of workspace and object params and returns them

        Returns the workspace id and the object name/id taken from the
        'destination_ref' param. Raises ValueError when either part is
        missing or the workspace cannot be resolved.
        """
        dst_ref = params.get(self.PARAM_IN_DST_REF)

        ws_name_id, obj_name_id = os.path.split(dst_ref)

        if not bool(ws_name_id.strip()) or ws_name_id == '/':
            raise ValueError("Workspace name or id is required in " +
                             self.PARAM_IN_DST_REF)

        if not bool(obj_name_id.strip()):
            raise ValueError("Object name or id is required in " +
                             self.PARAM_IN_DST_REF)

        # ws_name_id comes out of os.path.split() on a string, so it is a
        # string here and the lookup below is always performed
        if not isinstance(ws_name_id, int):
            try:
                ws_name_id = self.dfu.ws_name_to_id(ws_name_id)
            except DFUError as se:
                prefix = se.message.split('.')[0]
                raise ValueError(prefix)

        self.__LOGGER.info('Obtained workspace name/id ' + str(ws_name_id))

        return ws_name_id, obj_name_id

    def _get_ws_info(self, obj_ref):
        """
        Returns the workspace object info for obj_ref; workspace errors are
        logged and re-raised
        """
        ws = Workspace(self.ws_url)
        try:
            info = ws.get_object_info_new({'objects': [{'ref': obj_ref}]})[0]
        except WorkspaceError as wse:
            self.__LOGGER.error('Logging workspace exception')
            self.__LOGGER.error(str(wse))
            raise
        return info

    def _proc_upload_alignment_params(self, ctx, params):
        """
        Checks the presence and validity of upload alignment params

        Returns (workspace id, object name/id, file path, reads library
        type). Raises ValueError for a missing param, a file that does not
        exist, or a referenced object of an unsupported type.
        """
        self._check_required_param(params, [
            self.PARAM_IN_DST_REF, self.PARAM_IN_FILE,
            self.PARAM_IN_CONDITION, self.PARAM_IN_READ_LIB_REF,
            self.PARAM_IN_ASM_GEN_REF
        ])

        ws_name_id, obj_name_id = self._proc_ws_obj_params(ctx, params)

        file_path = params.get(self.PARAM_IN_FILE)

        if not (os.path.isfile(file_path)):
            raise ValueError('File does not exist: ' + file_path)

        lib_type = self._get_ws_info(params.get(self.PARAM_IN_READ_LIB_REF))[2]
        valid_lib_types = ('KBaseFile.SingleEndLibrary',
                           'KBaseFile.PairedEndLibrary',
                           'KBaseAssembly.SingleEndLibrary',
                           'KBaseAssembly.PairedEndLibrary')
        if not lib_type.startswith(valid_lib_types):
            raise ValueError(self.PARAM_IN_READ_LIB_REF +
                             ' parameter should be of type' +
                             ' KBaseFile.SingleEndLibrary or' +
                             ' KBaseFile.PairedEndLibrary or' +
                             ' KBaseAssembly.SingleEndLibrary or' +
                             ' KBaseAssembly.PairedEndLibrary')

        obj_type = self._get_ws_info(params.get(self.PARAM_IN_ASM_GEN_REF))[2]
        valid_obj_types = ('KBaseGenomes.Genome',
                           'KBaseGenomeAnnotations.Assembly',
                           'KBaseGenomes.ContigSet')
        if not obj_type.startswith(valid_obj_types):
            raise ValueError(self.PARAM_IN_ASM_GEN_REF +
                             ' parameter should be of type' +
                             ' KBaseGenomes.Genome or' +
                             ' KBaseGenomeAnnotations.Assembly or' +
                             ' KBaseGenomes.ContigSet')

        return ws_name_id, obj_name_id, file_path, lib_type

    def _get_aligner_stats(self, bam_file):
        """
        Gets the aligner stats from BAM file
        """
        path, file = os.path.split(bam_file)
        return self.samtools.get_stats(file, path)

    def _validate(self, params):
        """
        Runs the SamTools validation on params['file_path'], passing
        params['ignore'] through when present, and returns its result
        (callers treat 0 as valid)
        """
        samt = SamTools(self.config, self.__LOGGER)
        path, file = os.path.split(params['file_path'])
        if 'ignore' in params:
            rval = samt.validate(ifile=file, ipath=path,
                                 ignore=params['ignore'])
        else:
            rval = samt.validate(ifile=file, ipath=path)

        return rval

    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.config = config
        self.__LOGGER = logging.getLogger('KBaseRNASeq')
        if 'log_level' in config:
            self.__LOGGER.setLevel(config['log_level'])
        else:
            self.__LOGGER.setLevel(logging.INFO)
        streamHandler = logging.StreamHandler(sys.stdout)
        # implicit concatenation instead of a backslash continuation inside
        # the literal, which embedded the source indentation in every record
        formatter = logging.Formatter(
            "%(asctime)s - %(filename)s - %(lineno)d - "
            "%(levelname)s - %(message)s")
        formatter.converter = time.gmtime
        streamHandler.setFormatter(formatter)
        self.__LOGGER.addHandler(streamHandler)
        self.__LOGGER.info("Logger was set")

        script_utils.check_sys_stat(self.__LOGGER)

        self.scratch = config['scratch']
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.ws_url = config['workspace-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.samtools = SamTools(config)
        #END_CONSTRUCTOR
        pass

    def validate_alignment(self, ctx, params):
        """
        :param params: instance of type "ValidateAlignmentParams" (* Input
           parameters for validating a reads alignment. For validation
           errors to ignore, see
           http://broadinstitute.github.io/picard/command-line-overview.html#ValidateSamFile)
           -> structure: parameter "file_path" of String, parameter "ignore"
           of list of String
        :returns: instance of type "ValidateAlignmentOutput" (* Results from
           validate alignment *) -> structure: parameter "validated" of type
           "boolean" (A boolean - 0 for false, 1 for true. @range (0, 1))
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN validate_alignment

        rval = self._validate(params)

        if rval == 0:
            returnVal = {'validated': True}
        else:
            returnVal = {'validated': False}

        #END validate_alignment

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method validate_alignment return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def upload_alignment(self, ctx, params):
        """
        Validates and uploads the reads alignment  *
        :param params: instance of type "UploadAlignmentParams" (* Required
           input parameters for uploading a reads alignment string
           destination_ref -  object reference of alignment destination. The
           object ref is 'ws_name_or_id/obj_name_or_id' where ws_name_or_id
           is the workspace name or id and obj_name_or_id is the object name
           or id file_path - File with the path of the sam or bam file to be
           uploaded. If a sam file is provided, it will be converted to the
           sorted bam format before being saved read_library_ref - workspace
           object ref of the read sample used to make the alignment file
           condition - assembly_or_genome_ref - workspace object ref of
           genome assembly or genome object that was used to build the
           alignment *) -> structure: parameter "destination_ref" of String,
           parameter "file_path" of String, parameter "read_library_ref" of
           String, parameter "condition" of String, parameter
           "assembly_or_genome_ref" of String, parameter "aligned_using" of
           String, parameter "aligner_version" of String, parameter
           "aligner_opts" of mapping from String to String, parameter
           "replicate_id" of String, parameter "platform" of String,
           parameter "bowtie2_index" of type "ws_bowtieIndex_id", parameter
           "sampleset_ref" of type "ws_Sampleset_ref", parameter
           "mapped_sample_id" of mapping from String to mapping from String
           to String, parameter "validate" of type "boolean" (A boolean - 0
           for false, 1 for true. @range (0, 1)), parameter "ignore" of list
           of String
        :returns: instance of type "UploadAlignmentOutput" (*  Output from
           uploading a reads alignment  *) -> structure: parameter "obj_ref"
           of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN upload_alignment

        self.__LOGGER.info(
            'Starting upload Reads Alignment, parsing parameters ')
        pprint(params)

        ws_name_id, obj_name_id, file_path, lib_type = \
            self._proc_upload_alignment_params(ctx, params)

        dir_name, file_name, file_base, file_ext = self._get_file_path_info(
            file_path)

        # "validate" is documented as a 0/1 flag, so test its truthiness;
        # an identity test against True never fires for the value 1
        if params.get(self.PARAM_IN_VALIDATE):
            if self._validate(params) == 1:
                raise Exception('{0} failed validation'.format(file_path))

        bam_file = file_path
        if file_ext.lower() == '.sam':
            bam_file = os.path.join(dir_name, file_base + '.bam')
            self.samtools.convert_sam_to_sorted_bam(ifile=file_name,
                                                    ipath=dir_name,
                                                    ofile=bam_file)

        uploaded_file = self.dfu.file_to_shock({
            'file_path': bam_file,
            'make_handle': 1
        })
        file_handle = uploaded_file['handle']
        file_size = uploaded_file['size']

        # NOTE(review): stats are computed on the input file, which is the
        # sam file when one was uploaded - confirm that is intended
        aligner_stats = self._get_aligner_stats(file_path)
        aligner_data = {
            'file': file_handle,
            'size': file_size,
            'condition': params.get(self.PARAM_IN_CONDITION),
            'read_sample_id': params.get(self.PARAM_IN_READ_LIB_REF),
            'library_type': lib_type,
            'genome_id': params.get(self.PARAM_IN_ASM_GEN_REF),
            'alignment_stats': aligner_stats
        }
        optional_params = [
            self.PARAM_IN_ALIGNED_USING, self.PARAM_IN_ALIGNER_VER,
            self.PARAM_IN_ALIGNER_OPTS, self.PARAM_IN_REPLICATE_ID,
            self.PARAM_IN_PLATFORM, self.PARAM_IN_BOWTIE2_INDEX,
            self.PARAM_IN_SAMPLESET_REF, self.PARAM_IN_MAPPED_SAMPLE_ID
        ]
        for opt_param in optional_params:
            if opt_param in params and params[opt_param] is not None:
                aligner_data[opt_param] = params[opt_param]

        self.__LOGGER.info('=========  Adding extra_provenance_refs')
        self.__LOGGER.info(params.get(self.PARAM_IN_READ_LIB_REF))
        self.__LOGGER.info(params.get(self.PARAM_IN_ASM_GEN_REF))
        self.__LOGGER.info('=======================================')

        res = self.dfu.save_objects({
            "id": ws_name_id,
            "objects": [{
                "type": "KBaseRNASeq.RNASeqAlignment",
                "data": aligner_data,
                "name": obj_name_id,
                "extra_provenance_input_refs": [
                    params.get(self.PARAM_IN_READ_LIB_REF),
                    params.get(self.PARAM_IN_ASM_GEN_REF)
                ]
            }]
        })[0]
        self.__LOGGER.info('save complete')

        # object info tuple -> 'workspace id/object id/version' reference
        returnVal = {
            'obj_ref': str(res[6]) + '/' + str(res[0]) + '/' + str(res[4])
        }

        self.__LOGGER.info('Uploaded object: ')
        self.__LOGGER.info(returnVal)

        #END upload_alignment

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method upload_alignment return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def download_alignment(self, ctx, params):
        """
        Downloads alignment files in .bam, .sam and .bai formats. Also
        downloads alignment stats *
        :param params: instance of type "DownloadAlignmentParams" (* Required
           input parameters for downloading a reads alignment string
           source_ref -  object reference of alignment source. The object ref
           is 'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the
           workspace name or id and obj_name_or_id is the object name or id
           *) -> structure: parameter "source_ref" of String, parameter
           "downloadSAM" of type "boolean" (A boolean - 0 for false, 1 for
           true. @range (0, 1)), parameter "downloadBAI" of type "boolean" (A
           boolean - 0 for false, 1 for true. @range (0, 1)), parameter
           "validate" of type "boolean" (A boolean - 0 for false, 1 for true.
           @range (0, 1)), parameter "ignore" of list of String
        :returns: instance of type "DownloadAlignmentOutput" (*  The output
           of the download method.  *) -> structure: parameter
           "destination_dir" of String, parameter "stats" of type
           "AlignmentStats" -> structure: parameter "properly_paired" of
           Long, parameter "multiple_alignments" of Long, parameter
           "singletons" of Long, parameter "alignment_rate" of Double,
           parameter "unmapped_reads" of Long, parameter "mapped_reads" of
           Long, parameter "total_reads" of Long
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN download_alignment

        self.__LOGGER.info('Running download_alignment with params:\n' +
                           pformat(params))

        inref = params.get(self.PARAM_IN_SRC_REF)
        if not inref:
            raise ValueError('{} parameter is required'.format(
                self.PARAM_IN_SRC_REF))

        try:
            alignment = self.dfu.get_objects({'object_refs': [inref]})['data']
        except DFUError as e:
            self.__LOGGER.error(
                'Logging stacktrace from workspace exception:\n' + e.data)
            raise

        # set the output dir
        uuid_str = str(uuid.uuid4())
        output_dir = os.path.join(self.scratch, 'download_' + uuid_str)
        self._mkdir_p(output_dir)

        file_ret = self.dfu.shock_to_file({
            'shock_id': alignment[0]['data']['file']['id'],
            'file_path': output_dir
        })
        if zipfile.is_zipfile(file_ret.get('file_path')):
            with zipfile.ZipFile(file_ret.get('file_path')) as z:
                z.extractall(output_dir)

        # drop any archive once its content has been extracted
        for f in glob.glob(output_dir + '/*.zip'):
            os.remove(f)

        bam_files = glob.glob(output_dir + '/*.bam')

        uuid_prefix = uuid_str[:8]
        if len(bam_files) == 0:
            raise ValueError("Alignment object does not contain a bam file")

        for bam_file_path in bam_files:
            dir_name, file_name, file_base, file_ext = \
                self._get_file_path_info(bam_file_path)
            if params.get(self.PARAM_IN_VALIDATE, False):
                validate_params = {'file_path': bam_file_path}
                # forward the documented "ignore" list to the validator
                if 'ignore' in params:
                    validate_params['ignore'] = params['ignore']
                if self._validate(validate_params) == 1:
                    raise Exception(
                        '{0} failed validation'.format(bam_file_path))

            if params.get(self.PARAM_IN_DOWNLOAD_BAI, False):
                bai_file = uuid_prefix + '_' + file_base + '.bai'
                bai_file_path = os.path.join(output_dir, bai_file)
                self.samtools.create_bai_from_bam(ifile=file_name,
                                                  ipath=output_dir,
                                                  ofile=bai_file)
                if not os.path.isfile(bai_file_path):
                    raise ValueError('Error creating {}'.format(bai_file_path))

            if params.get(self.PARAM_IN_DOWNLOAD_SAM, False):
                sam_file = uuid_prefix + '_' + file_base + '.sam'
                sam_file_path = os.path.join(output_dir, sam_file)
                self.samtools.convert_bam_to_sam(ifile=file_name,
                                                 ipath=output_dir,
                                                 ofile=sam_file)
                if not os.path.isfile(sam_file_path):
                    raise ValueError('Error creating {}'.format(sam_file_path))

        returnVal = {
            'destination_dir': output_dir,
            'stats': alignment[0]['data']['alignment_stats']
        }

        #END download_alignment

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method download_alignment return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def export_alignment(self, ctx, params):
        """
        Wrapper function for use by in-narrative downloaders to download
        alignments from shock *
        :param params: instance of type "ExportParams" (* Required input
           parameters for exporting a reads alignment string source_ref -
           object reference of alignment source. The object ref is
           'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the
           workspace name or id and obj_name_or_id is the object name or id
           *) -> structure: parameter "source_ref" of String, parameter
           "exportSAM" of type "boolean" (A boolean - 0 for false, 1 for
           true. @range (0, 1)), parameter "exportBAI" of type "boolean" (A
           boolean - 0 for false, 1 for true. @range (0, 1)), parameter
           "validate" of type "boolean" (A boolean - 0 for false, 1 for true.
           @range (0, 1)), parameter "ignore" of list of String
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_alignment

        inref = params.get(self.PARAM_IN_SRC_REF)
        if not inref:
            raise ValueError('{} parameter is required'.format(
                self.PARAM_IN_SRC_REF))

        if params.get(self.PARAM_IN_VALIDATE, False) or \
           params.get('exportBAI', False) or \
           params.get('exportSAM', False):
            # Need to validate or convert files. Use download_alignment
            download_params = self._export_to_download_params(params)

            download_retVal = self.download_alignment(ctx, download_params)[0]

            export_dir = download_retVal['destination_dir']

            # package and load to shock
            ret = self.dfu.package_for_download({
                'file_path': export_dir,
                'ws_refs': [inref]
            })
            output = {'shock_id': ret['shock_id']}
        else:
            # return shock id from the object
            try:
                alignment = self.dfu.get_objects({'object_refs':
                                                  [inref]})['data']
            except DFUError as e:
                self.__LOGGER.error(
                    'Logging stacktrace from workspace exception:\n' + e.data)
                raise
            output = {'shock_id': alignment[0]['data']['file']['id']}

        #END export_alignment

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method export_alignment return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
class ImportMediaUtil:
    """
    Imports Media objects into a workspace from TSV or EXCEL files that sit
    in the staging area.
    """

    def __init__(self, config):
        """
        read callback URL and token from config and create the service
        clients used by the import methods
        """
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.fba = fba_tools(self.callback_url)
        self.uploader_utils = UploaderUtil(config)

    def _stage_media_params(self, params):
        """
        _stage_media_params: validate params, download the staging file and
                             build the params for the FBA file converters

        returns a copy of params with an added 'media_file' entry pointing
        at the downloaded file; the caller's params dict is left untouched
        """
        self.validate_import_media_from_staging_params(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')

        # shallow copy so that adding 'media_file' does not leak back into
        # the dict owned by the caller
        import_media_params = dict(params)
        import_media_params['media_file'] = {'path': scratch_file_path}

        return import_media_params

    def import_media_from_staging(self, params):
        '''
        import_media_from_staging: wrapper method for
                                   FBAFileUtil.tsv_file_to_media
                                   and
                                   FBAFileUtil.excel_file_to_media

        The file is first read as TSV; when that fails it is read as EXCEL.

        required params:
        staging_file_subdir_path - subdirectory file path
        e.g.
          for file: /data/bulk/user_name/file_name
          staging_file_subdir_path is file_name
          for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
          staging_file_subdir_path is subdir_1/subdir_2/file_name
        media_name - output Media file name
        workspace_name - the name of the workspace it gets saved to.

        return:
        obj_ref: return object reference

        raises ValueError when a required param is missing or the file is
        neither a valid TSV nor a valid EXCEL file
        '''
        log('--->\nrunning ImportMediaUtil.import_media_from_staging\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        import_media_params = self._stage_media_params(params)

        # "except Exception" rather than a bare except, so KeyboardInterrupt
        # and SystemExit are not turned into a ValueError
        try:
            ref = self.fba.tsv_file_to_media(import_media_params)
        except Exception:
            try:
                ref = self.fba.excel_file_to_media(import_media_params)
            except Exception:
                raise ValueError(
                    '"{}" is not a valid EXCEL nor TSV file'.format(
                        params.get('staging_file_subdir_path')))

        # Update the workspace object related meta-data for staged file
        self.uploader_utils.update_staging_service(
            params.get('staging_file_subdir_path'), ref.get('ref'))

        returnVal = {'obj_ref': ref.get('ref')}

        return returnVal

    def import_tsv_as_media_from_staging(self, params):
        '''
        import_tsv_as_media_from_staging: wrapper method for
                                          FBAFileUtil.tsv_file_to_media

        required params:
        staging_file_subdir_path - subdirectory file path
        e.g.
          for file: /data/bulk/user_name/file_name
          staging_file_subdir_path is file_name
          for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
          staging_file_subdir_path is subdir_1/subdir_2/file_name
        media_name - output Media file name
        workspace_name - the name of the workspace it gets saved to.

        return:
        obj_ref: return object reference
        '''
        log('--->\nrunning ImportMediaUtil.import_tsv_as_media_from_staging\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        import_media_params = self._stage_media_params(params)

        ref = self.fba.tsv_file_to_media(import_media_params)

        returnVal = {'obj_ref': ref.get('ref')}

        return returnVal

    def import_excel_as_media_from_staging(self, params):
        '''
        import_excel_as_media_from_staging: wrapper method for
                                            FBAFileUtil.excel_file_to_media

        required params:
        staging_file_subdir_path - subdirectory file path
        e.g.
          for file: /data/bulk/user_name/file_name
          staging_file_subdir_path is file_name
          for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
          staging_file_subdir_path is subdir_1/subdir_2/file_name
        media_name - output Media file name
        workspace_name - the name of the workspace it gets saved to.

        return:
        obj_ref: return object reference
        '''
        log('--->\nrunning ImportMediaUtil.import_excel_as_media_from_staging\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        import_media_params = self._stage_media_params(params)

        ref = self.fba.excel_file_to_media(import_media_params)

        returnVal = {'obj_ref': ref.get('ref')}

        return returnVal

    def validate_import_media_from_staging_params(self, params):
        """
        validate_import_media_from_staging_params:
                    validates params passed to
                    import_excel(tsv)_as_media_from_staging method

        raises ValueError for the first required parameter that is missing
        """
        # check for required parameters
        for p in ['staging_file_subdir_path', 'workspace_name', 'media_name']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def generate_report(self, obj_ref, params):
        """
        generate_report: generate summary report

        obj_ref: generated workspace object references. (return of
                 import_excel(tsv)_as_media_from_staging)
        params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        workspace_name: workspace name/ID that reads will be stored to

        return:
        report_name / report_ref of the created report
        """
        uuid_string = str(uuid.uuid4())
        upload_message = 'Import Finished\n'

        get_objects_params = {
            'object_refs': [obj_ref],
            'ignore_errors': False
        }

        object_data = self.dfu.get_objects(get_objects_params)

        upload_message += "Media Object Name: "
        # element 1 of the object info is used as the object name
        upload_message += str(object_data.get('data')[0].get('info')[1]) + '\n'
        upload_message += 'Imported File: {}\n'.format(
            params.get('staging_file_subdir_path'))

        report_params = {
            'message': upload_message,
            'workspace_name': params.get('workspace_name'),
            'report_object_name': 'kb_upload_mothods_report_' + uuid_string}

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'],
                         'report_ref': output['ref']}

        return report_output
class RNASeqDownloaderUtils:
    """
    Helpers that fetch RNASeq workspace objects (Alignment, Expression,
    DifferentialExpression) and re-upload their files to Shock so that they
    can be offered for download.
    """

    def __init__(self, config):
        """
        config: mapping that must provide 'scratch', 'SDK_CALLBACK_URL' and
                'KB_AUTH_TOKEN'
        """
        # Never write the auth token to the log; log a redacted copy instead.
        loggable_config = dict(config)
        if 'KB_AUTH_TOKEN' in loggable_config:
            loggable_config['KB_AUTH_TOKEN'] = '<redacted>'
        log('--->\nInitializing RNASeqDownloaderUtils instance:\n config: %s' % loggable_config)

        self.scratch = config['scratch']
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url, token=self.token)
        self.rau = ReadsAlignmentUtils(self.callback_url, token=self.token)

    def download_RNASeq(self, params):
        """
        download_RNASeq: download RNASeq Alignment/Expression/DifferentialExpression zip file

        params:
        input_ref: RNASeq object reference ID
        rna_seq_type: one of ['RNASeqAlignment',
                              'RNASeqExpression',
                              'RNASeqDifferentialExpression']

        return:
        shock_id: Shock ID of stored zip file

        raises:
        ValueError: when a required parameter is missing or rna_seq_type is
                    not one of the supported types
        """
        log('--->\nrunning RNASeqDownloaderUtils.download_RNASeq:\nparams: %s' % params)

        # Validate params
        self.validate_download_rna_seq_alignment_parameters(params)

        # Download RNASeq zip file
        # RNASeq Alignment, Expression and DifferentialExpression
        # have the same object_data/handle_data structure
        returnVal = self._download_rna_seq_zip(params.get('input_ref'))

        return returnVal

    def download_RNASeq_Alignment(self, params):
        """
        download_RNASeq_Alignment: download the files of an RNASeq Alignment
        and store them in Shock as a zip archive

        params:
        input_ref: RNASeq object reference ID
        rna_seq_type: 'RNASeqAlignment'
        download_file_type: 'bam' or 'sam' (these are the only two types
                            this method implements)

        return:
        shock_id: Shock ID of stored zip file

        raises:
        ValueError: when a required parameter is missing, rna_seq_type is not
                    supported, or download_file_type is neither 'bam' nor 'sam'
        """
        log('--->\nrunning RNASeqDownloaderUtils.download_RNASeq_Alignment:\nparams: %s' % params)

        # Validate params
        self.validate_download_rna_seq_alignment_parameters(params)

        input_ref = params.get('input_ref')
        download_file_type = params.get('download_file_type')

        # Reject unsupported types up front. Previously they fell through both
        # branches and crashed with an UnboundLocalError on shock_id.
        if download_file_type not in ('bam', 'sam'):
            raise ValueError(
                'Unexpected download_file_type: %s' % download_file_type)

        if download_file_type == 'bam':
            destination_dir = self.rau.download_alignment({
                'source_ref': input_ref,
                'downloadBAI': True
            })['destination_dir']
        else:
            destination_dir = self.rau.download_alignment({
                'source_ref': input_ref,
                'downloadSAM': True,
                'downloadBAI': True
            })['destination_dir']

            # Only the SAM output should be packaged: drop every file whose
            # name matches the bam pattern (re.match anchors at the start only,
            # so names that merely contain '.bam' are removed as well).
            for file_name in os.listdir(destination_dir):
                if re.match(r'.*\.bam', file_name):
                    log('removing file: {}'.format(file_name))
                    os.remove(os.path.join(destination_dir, file_name))

        shock_id = self._upload_dir_to_shock(destination_dir)

        returnVal = dict()
        returnVal['shock_id'] = shock_id

        return returnVal

    def validate_download_rna_seq_alignment_parameters(self, params):
        """
        validate_download_rna_seq_alignment_parameters:
                        validates params passed to download_rna_seq_alignment method

        raises:
        ValueError: when 'input_ref' or 'rna_seq_type' is missing, or when
                    rna_seq_type is not a supported RNASeq type
        """
        # check required parameters
        for p in ['input_ref', 'rna_seq_type']:
            if p not in params:
                raise ValueError('"' + p + '" parameter is required, but missing')

        # check supported RNASeq types
        valid_rnaseq_types = [
            'RNASeqAlignment',
            'RNASeqExpression',
            'RNASeqDifferentialExpression'
        ]
        if params['rna_seq_type'] not in valid_rnaseq_types:
            raise ValueError('Unexpected RNASeq type: %s' % params['rna_seq_type'])

    def _download_rna_seq_zip(self, input_ref):
        """
        _download_rna_seq_zip: download RNASeq's archive zip file

        returns:
        shock_id: Shock ID of stored zip file
        """
        # get object data
        object_data = self._get_object_data(input_ref)
        log('---> getting object data\n object_date: %s' % json.dumps(object_data, indent=1))

        # get handle data (log the handle itself, not the whole object again)
        handle = self._get_handle_data(object_data)
        log('---> getting handle data\n handle data: %s' % json.dumps(handle, indent=1))

        # make tmp directory for downloading
        dstdir = os.path.join(self.scratch, 'tmp')
        if not os.path.exists(dstdir):
            os.makedirs(dstdir)

        try:
            # download original zip file and save to tmp directory
            handle_id = handle.get('hid')
            original_zip_file_path = self._download_original_zip_file(
                handle_id, dstdir)

            log('---> loading %s to shock' % original_zip_file_path)
            shock_id = self._upload_to_shock(original_zip_file_path)
        finally:
            # always clean up the tmp directory, even when a transfer fails
            log('---> removing folder: %s' % dstdir)
            shutil.rmtree(dstdir)

        returnVal = {"shock_id": shock_id}

        return returnVal

    def _get_object_data(self, input_ref):
        """
        _get_object_data: get object_data using DataFileUtil
        """
        get_objects_params = {
            'object_refs': [input_ref],
            'ignore_errors': False
        }

        object_data = self.dfu.get_objects(get_objects_params)

        return object_data

    def _get_handle_data(self, object_data):
        """
        _get_handle_data: get Handle from object_data

        Reads object_data['data'][0]['data']['file'].

        raises:
        ValueError: when object_data does not have the expected layout, has
                    no 'file' entry, or the handle has no 'hid'
        """
        try:
            handle = object_data.get('data')[0].get('data').get('file')
        except (AttributeError, IndexError, KeyError, TypeError):
            # any of these means a level of the expected nesting is absent
            error_msg = "Unexpected object format. Refer to DataFileUtil.get_objects definition\n"
            error_msg += "object_data:\n%s" % json.dumps(object_data, indent=1)
            raise ValueError(error_msg)

        if handle is None:
            error_msg = "object_data does NOT have Handle(file key)\n"
            error_msg += "object_data:\n%s" % json.dumps(object_data, indent=1)
            raise ValueError(error_msg)
        elif handle.get('hid') is None:
            error_msg = "Handle does have NOT HandleId(hid key)\n"
            error_msg += "handle_data:\n%s" % json.dumps(handle, indent=1)
            raise ValueError(error_msg)
        else:
            return handle

    def _download_original_zip_file(self, handle_id, dstdir):
        """
        _download_original_zip_file: download original archive .zip file using DataFileUtil

        returns the 'file_path' reported by DataFileUtil.shock_to_file
        """
        shock_to_file_params = {'handle_id': handle_id, 'file_path': dstdir}
        original_zip_file = self.dfu.shock_to_file(shock_to_file_params)

        original_zip_file_path = original_zip_file.get('file_path')

        return original_zip_file_path

    def _upload_to_shock(self, file_path):
        """
        _upload_to_shock: upload target file to shock using DataFileUtil

        returns the 'shock_id' reported by DataFileUtil.file_to_shock
        """
        file_to_shock_params = {'file_path': file_path}
        shock_file = self.dfu.file_to_shock(file_to_shock_params)

        shock_id = shock_file.get('shock_id')

        return shock_id

    def _upload_dir_to_shock(self, directory):
        """
        _upload_dir_to_shock: upload target directory to shock using
        DataFileUtil, asking it to pack the directory as a zip

        returns the 'shock_id' reported by DataFileUtil.file_to_shock
        """
        file_to_shock_params = {'file_path': directory, 'pack': 'zip'}
        shock_file = self.dfu.file_to_shock(file_to_shock_params)

        shock_id = shock_file.get('shock_id')

        return shock_id
class ExpressionUtils:
    '''
    Module Name:
    ExpressionUtils

    Module Description:
    A KBase module: ExpressionUtils

    This module is intended for use by Assemblers to upload RNASeq Expression files
    (gtf, fpkm and ctab). This module generates the ctab files and tpm data if they are absent.
    The expression files are uploaded as a single compressed file.This module also generates
    expression levels and tpm expression levels from the input files and saves them in the
    workspace object. Once uploaded, the expression files can be downloaded onto an output directory.
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.1.1"
    GIT_URL = "https://github.com/JamesJeffryes/ExpressionUtils.git"
    GIT_COMMIT_HASH = "62ce653aa5c5b39a597486613bc140b173a35c99"

    #BEGIN_CLASS_HEADER

    # Names of the keys accepted in the params dictionaries of the methods below.
    PARAM_IN_SRC_DIR = 'source_dir'
    PARAM_IN_SRC_REF = 'source_ref'
    PARAM_IN_DST_REF = 'destination_ref'
    PARAM_IN_ALIGNMENT_REF = 'alignment_ref'

    PARAM_IN_GENOME_REF = 'genome_ref'
    PARAM_IN_ANNOTATION_ID = 'annotation_id'
    PARAM_IN_BAM_FILE_PATH = 'bam_file_path'
    PARAM_IN_DESCRIPTION = 'description'
    PARAM_IN_DATA_QUAL_LEVEL = 'data_quality_level'
    PARAM_IN_PROC_COMMENTS = 'processing_comments'
    PARAM_IN_PLATFORM = 'platform'
    PARAM_IN_MAPPED_SAMPLE_ID = 'mapped_sample_id'
    PARAM_IN_ORIG_MEDIAN = 'original_median'
    PARAM_IN_EXT_SRC_DATE = 'external_source_date'
    PARAM_IN_TRANSCRIPTS = 'transcripts'
    PARAM_IN_SRC = 'source'

    def _check_required_param(self, in_params, param_list):
        """
        Check if each of the params in the list are in the input params

        Raises ValueError for the first param that is absent or falsy.
        """
        for param in param_list:
            if (param not in in_params or not in_params[param]):
                raise ValueError('{} parameter is required'.format(param))

    def _proc_ws_obj_params(self, ctx, params):
        """
        Check the validity of workspace and object params and return them

        Splits 'destination_ref' into workspace and object parts and resolves
        the workspace part through DataFileUtil.ws_name_to_id.

        Raises ValueError when either part is empty or the lookup fails.
        """
        dst_ref = params.get(self.PARAM_IN_DST_REF)

        ws_name_id, obj_name_id = os.path.split(dst_ref)

        if not bool(ws_name_id.strip()) or ws_name_id == '/':
            raise ValueError("Workspace name or id is required in " +
                             self.PARAM_IN_DST_REF)

        if not bool(obj_name_id.strip()):
            raise ValueError("Object name or id is required in " +
                             self.PARAM_IN_DST_REF)

        dfu = DataFileUtil(self.callback_url)

        # NOTE(review): os.path.split always yields a str here, so this check
        # is always true and the lookup below always runs - confirm whether
        # numeric workspace ids were meant to bypass it.
        if not isinstance(ws_name_id, int):
            try:
                ws_name_id = dfu.ws_name_to_id(ws_name_id)
            except DFUError as se:
                # report only the first sentence of the service message
                prefix = se.message.split('.')[0]
                raise ValueError(prefix)

        self.__LOGGER.info('Obtained workspace name/id ' + str(ws_name_id))

        return ws_name_id, obj_name_id

    def _proc_upload_expression_params(self, ctx, params):
        """
        Check the presence and validity of upload expression params

        Returns (workspace id, object name, source directory).
        Raises ValueError when a required param is missing or the source
        directory does not exist or is empty.
        """
        self._check_required_param(params, [
            self.PARAM_IN_DST_REF, self.PARAM_IN_SRC_DIR,
            self.PARAM_IN_ALIGNMENT_REF
        ])

        ws_name_id, obj_name_id = self._proc_ws_obj_params(ctx, params)

        source_dir = params.get(self.PARAM_IN_SRC_DIR)

        if not (os.path.isdir(source_dir)):
            raise ValueError('Source directory does not exist: ' + source_dir)

        if not os.listdir(source_dir):
            raise ValueError('Source directory is empty: ' + source_dir)

        return ws_name_id, obj_name_id, source_dir

    def _get_ws_info(self, obj_ref):
        """
        Return the workspace object-info entry for obj_ref; workspace errors
        are logged and re-raised.
        """
        ws = Workspace(self.ws_url)
        try:
            info = ws.get_object_info_new({'objects': [{'ref': obj_ref}]})[0]
        except WorkspaceError as wse:
            self.__LOGGER.error('Logging workspace exception')
            self.__LOGGER.error(str(wse))
            raise
        return info

    def _get_genome_ref(self, assembly_or_genome_ref, params):
        """
        Return the genome reference to use: the explicit 'genome_ref' param
        when given, otherwise assembly_or_genome_ref if its workspace type
        starts with 'KBaseGenomes.Genome'. Raises ValueError otherwise.
        """
        if self.PARAM_IN_GENOME_REF in params and params[
                self.PARAM_IN_GENOME_REF] is not None:
            return params[self.PARAM_IN_GENOME_REF]

        obj_type = self._get_ws_info(assembly_or_genome_ref)[2]
        if obj_type.startswith('KBaseGenomes.Genome'):
            return assembly_or_genome_ref

        raise ValueError('Alignment object does not contain genome_ref; '
                         '"{}" parameter is required'.format(
                             self.PARAM_IN_GENOME_REF))

    def _get_expression_levels(self, source_dir, genome_ref, transcripts=False):
        """
        Compute expression levels from 'genes.fpkm_tracking' (or from
        't_data.ctab' when transcripts is truthy) found in source_dir.

        Raises ValueError when the required file is absent.
        """
        fpkm_file_path = os.path.join(source_dir, 'genes.fpkm_tracking')
        if transcripts:
            fpkm_file_path = os.path.join(source_dir, 't_data.ctab')

        if not os.path.isfile(fpkm_file_path):
            raise ValueError('{} file is required'.format(fpkm_file_path))

        # the id column index differs between the two file formats
        id_col = 5 if transcripts else 0
        self.__LOGGER.info(
            'Generating expression levels from {}'.format(fpkm_file_path))
        return self.expression_utils.get_expression_levels(
            fpkm_file_path, genome_ref, id_col)

    def _gen_ctab_files(self, params, alignment_ref):
        """
        Generate ctab files in the source directory when fewer than five are
        present, using 'transcripts.gtf' and a bam file taken either from the
        'bam_file_path' param or from the downloaded alignment object.

        Raises ValueError when the gtf file or a usable bam file is missing,
        or when the table maker reports a failure.
        """
        source_dir = params.get(self.PARAM_IN_SRC_DIR)
        if len(glob.glob(source_dir + '/*.ctab')) < 5:

            self.__LOGGER.info(' ======= Generating ctab files ==========')
            gtf_file = os.path.join(source_dir, 'transcripts.gtf')
            if not os.path.isfile(gtf_file):
                raise ValueError(
                    "{} file is required to generate ctab files, found missing"
                    .format(gtf_file))

            if self.PARAM_IN_BAM_FILE_PATH in params and \
                    params[self.PARAM_IN_BAM_FILE_PATH] is not None:
                bam_file_path = params[self.PARAM_IN_BAM_FILE_PATH]
            else:
                self.__LOGGER.info(
                    'Downloading bam file from alignment object')
                rau = ReadsAlignmentUtils(self.callback_url)
                alignment_retVal = rau.download_alignment(
                    {'source_ref': alignment_ref})
                alignment_dir = alignment_retVal.get('destination_dir')

                allbamfiles = glob.glob(alignment_dir + '/*.bam')
                if len(allbamfiles) == 0:
                    # report the directory that was searched (this used to
                    # reference an undefined name and raise a NameError)
                    raise ValueError('bam file does not exist in {}'.format(
                        alignment_dir))
                elif len(allbamfiles) == 1:
                    bam_file_path = allbamfiles[0]
                elif len(allbamfiles) > 1:
                    # several candidates: prefer the well-known file names
                    tmp_file_path = os.path.join(alignment_dir,
                                                 'accepted_hits.bam')
                    if os.path.isfile(tmp_file_path):
                        bam_file_path = tmp_file_path
                    else:
                        tmp_file_path = os.path.join(
                            alignment_dir, 'accepted_hits_sorted.bam')
                        if os.path.isfile(tmp_file_path):
                            bam_file_path = tmp_file_path
                        else:
                            raise ValueError(
                                'accepted_hits.bam, accepted_hits_sorted.bam or other bam file not found in {}'
                                .format(alignment_dir))

            result = self.table_maker.build_ctab_files(
                ref_genome_path=gtf_file,
                alignment_path=bam_file_path,
                output_dir=source_dir)
            if result != 0:
                raise ValueError('Tablemaker failed')

    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.__LOGGER = logging.getLogger('ExpressionUtils')
        self.__LOGGER.setLevel(logging.INFO)
        # getLogger returns the same logger object on every call, so attach the
        # stdout handler only once; otherwise each new instance would add
        # another handler and every message would be printed repeatedly.
        if not self.__LOGGER.handlers:
            streamHandler = logging.StreamHandler(sys.stdout)
            formatter = logging.Formatter(
                "%(asctime)s - %(filename)s - %(lineno)d - %(levelname)s - %(message)s"
            )
            formatter.converter = time.gmtime
            streamHandler.setFormatter(formatter)
            self.__LOGGER.addHandler(streamHandler)
        self.__LOGGER.info("Logger was set")

        self.config = config
        self.scratch = config['scratch']
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.ws_url = config['workspace-url']
        self.config['SDK_CALLBACK_URL'] = self.callback_url
        self.expression_utils = Expression_Utils(self.config)
        self.dfu = DataFileUtil(self.callback_url)
        self.table_maker = TableMaker(config, self.__LOGGER)
        self.expr_matrix_utils = ExprMatrixUtils(config, self.__LOGGER)
        #END_CONSTRUCTOR
        pass

    def upload_expression(self, ctx, params):
        """
        Uploads the expression *
        :param params: instance of type "UploadExpressionParams" (* Required
           input parameters for uploading a reads expression data string
           destination_ref - object reference of expression data. The object
           ref is 'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the
           workspace name or id and obj_name_or_id is the object name or id
           string source_dir - directory with the files to be uploaded string
           alignment_ref - alignment workspace object reference *) ->
           structure: parameter "destination_ref" of String, parameter
           "source_dir" of String, parameter "alignment_ref" of String,
           parameter "genome_ref" of String, parameter "annotation_id" of
           String, parameter "bam_file_path" of String, parameter
           "transcripts" of type "boolean" (A boolean - 0 for false, 1 for
           true. @range (0, 1)), parameter "data_quality_level" of Long,
           parameter "original_median" of Double, parameter "description" of
           String, parameter "platform" of String, parameter "source" of
           String, parameter "external_source_date" of String, parameter
           "processing_comments" of String
        :returns: instance of type "UploadExpressionOutput" (* Output from
           upload expression *) -> structure: parameter "obj_ref" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN upload_expression

        self.__LOGGER.info('Starting upload expression, parsing parameters ')
        pprint(params)

        ws_name_id, obj_name_id, source_dir = self._proc_upload_expression_params(
            ctx, params)

        alignment_ref = params.get(self.PARAM_IN_ALIGNMENT_REF)
        try:
            alignment_obj = self.dfu.get_objects(
                {'object_refs': [alignment_ref]})['data'][0]
        except DFUError as e:
            self.__LOGGER.error(
                'Logging stacktrace from workspace exception:\n' + e.data)
            raise

        alignment = alignment_obj['data']
        assembly_or_genome_ref = alignment['genome_id']

        genome_ref = self._get_genome_ref(assembly_or_genome_ref, params)

        # NOTE(review): with 'transcripts' set this reads t_data.ctab before
        # _gen_ctab_files has had a chance to generate it - confirm that this
        # ordering is intended.
        expression_levels, tpm_expression_levels = self._get_expression_levels(
            source_dir, genome_ref, params.get(self.PARAM_IN_TRANSCRIPTS))

        self._gen_ctab_files(params, alignment_ref)

        uploaded_file = self.dfu.file_to_shock({
            'file_path': source_dir,
            'make_handle': 1,
            'pack': 'zip'
        })

        # move the zipfile created in the source directory one level up
        parent_dir, dir_name = os.path.split(source_dir)
        zip_name = dir_name + '.zip'
        if os.path.isfile(os.path.join(source_dir, zip_name)):
            shutil.move(os.path.join(source_dir, zip_name),
                        os.path.join(parent_dir, zip_name))

        file_handle = uploaded_file['handle']
        file_size = uploaded_file['size']

        expression_data = {
            'numerical_interpretation': 'FPKM',
            'genome_id': genome_ref,
            'mapped_rnaseq_alignment': {
                alignment['read_sample_id']: alignment_ref
            },
            'condition': alignment['condition'],
            'file': file_handle,
            'expression_levels': expression_levels,
            'tpm_expression_levels': tpm_expression_levels
        }
        additional_params = [
            self.PARAM_IN_ANNOTATION_ID, self.PARAM_IN_DESCRIPTION,
            self.PARAM_IN_DATA_QUAL_LEVEL, self.PARAM_IN_PLATFORM,
            self.PARAM_IN_PROC_COMMENTS, self.PARAM_IN_MAPPED_SAMPLE_ID,
            self.PARAM_IN_ORIG_MEDIAN, self.PARAM_IN_EXT_SRC_DATE,
            self.PARAM_IN_SRC
        ]

        # copy over only the optional params that were actually supplied
        for opt_param in additional_params:
            if opt_param in params and params[opt_param] is not None:
                expression_data[opt_param] = params[opt_param]

        extra_provenance_input_refs = list()
        extra_provenance_input_refs.append(
            params.get(self.PARAM_IN_ALIGNMENT_REF))
        if self.PARAM_IN_GENOME_REF in params and params.get(
                self.PARAM_IN_GENOME_REF) is not None:
            extra_provenance_input_refs.append(
                params.get(self.PARAM_IN_GENOME_REF))

        self.__LOGGER.info('=========== Adding extra_provenance_refs')
        self.__LOGGER.info(str(extra_provenance_input_refs))
        self.__LOGGER.info('==========================================')

        res = self.dfu.save_objects({
            "id": ws_name_id,
            "objects": [{
                "type": "KBaseRNASeq.RNASeqExpression",
                "data": expression_data,
                "name": obj_name_id,
                "extra_provenance_input_refs": extra_provenance_input_refs
            }]
        })[0]

        self.__LOGGER.info('save complete')

        # build the reference from elements 6, 0 and 4 of the saved object info
        returnVal = {
            'obj_ref': str(res[6]) + '/' + str(res[0]) + '/' + str(res[4])
        }

        self.__LOGGER.info('Uploaded object: ')
        print(returnVal)
        #END upload_expression

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method upload_expression return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def download_expression(self, ctx, params):
        """
        Downloads expression *
        :param params: instance of type "DownloadExpressionParams" (*
           Required input parameters for downloading expression string
           source_ref - object reference of expression source. The object ref
           is 'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the
           workspace name or id and obj_name_or_id is the object name or id
           *) -> structure: parameter "source_ref" of String
        :returns: instance of type "DownloadExpressionOutput" (* The output
           of the download method. *) -> structure: parameter
           "destination_dir" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN download_expression

        self.__LOGGER.info('Running download_expression with params:\n' +
                           pformat(params))

        inref = params.get(self.PARAM_IN_SRC_REF)
        if not inref:
            raise ValueError(self.PARAM_IN_SRC_REF + ' parameter is required')

        try:
            expression = self.dfu.get_objects({'object_refs': [inref]})['data']
        except DFUError as e:
            self.__LOGGER.error(
                'Logging stacktrace from workspace exception:\n' + e.data)
            raise

        # set the output dir (named after the current time in milliseconds)
        timestamp = int(
            (datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds()
            * 1000)
        output_dir = os.path.join(self.scratch, 'download_' + str(timestamp))
        os.mkdir(output_dir)

        self.dfu.shock_to_file({
            'shock_id': expression[0]['data']['file']['id'],
            'file_path': output_dir,
            'unpack': 'unpack'
        })

        if not os.listdir(output_dir):
            raise ValueError('No files were downloaded: ' + output_dir)

        # the archive itself is not part of the result once it is unpacked
        for f in glob.glob(output_dir + '/*.zip'):
            os.remove(f)

        returnVal = {'destination_dir': output_dir}

        #END download_expression

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method download_expression return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def export_expression(self, ctx, params):
        """
        Wrapper function for use by in-narrative downloaders to download
        expressions from shock *
        :param params: instance of type "ExportParams" (* Required input
           parameters for exporting expression string source_ref - object
           reference of expression source. The object ref is
           'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the
           workspace name or id and obj_name_or_id is the object name or id
           *) -> structure: parameter "source_ref" of String
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_expression

        inref = params.get(self.PARAM_IN_SRC_REF)
        if not inref:
            raise ValueError(self.PARAM_IN_SRC_REF + ' parameter is required')

        try:
            expression = self.dfu.get_objects({'object_refs': [inref]})['data']
        except DFUError as e:
            self.__LOGGER.error(
                'Logging stacktrace from workspace exception:\n' + e.data)
            raise

        output = {'shock_id': expression[0]['data']['file']['id']}

        #END export_expression

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method export_expression return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def get_expressionMatrix(self, ctx, params):
        """
        :param params: instance of type "getExprMatrixParams" (* Following
           are the required input parameters to get Expression Matrix *) ->
           structure: parameter "workspace_name" of String, parameter
           "output_obj_name" of String, parameter "expressionset_ref" of
           String
        :returns: instance of type "getExprMatrixOutput" -> structure:
           parameter "exprMatrix_FPKM_ref" of String, parameter
           "exprMatrix_TPM_ref" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN get_expressionMatrix
        fpkm_ref, tpm_ref = self.expr_matrix_utils.get_expression_matrix(
            params)

        returnVal = {
            'exprMatrix_FPKM_ref': fpkm_ref,
            'exprMatrix_TPM_ref': tpm_ref
        }
        #END get_expressionMatrix

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method get_expressionMatrix return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def status(self, ctx):
        """Return a one-element list with the module's state and version info."""
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
class AssemblyToFasta:
    """
    Writes the sequence data of a workspace Assembly or legacy ContigSet
    object to a FASTA file, and optionally packages it for download.
    """

    def __init__(self, callback_url, scratch):
        """
        callback_url: URL handed to the DataFileUtil client
        scratch: directory in which FASTA files and export packages are written
        """
        self.scratch = scratch
        self.dfu = DataFileUtil(callback_url)

    def export_as_fasta(self, ctx, params):
        '''
        Used almost exclusively for download only

        params: must contain 'input_ref', the reference of the object to export
        returns: {'shock_id': ...} as reported by DataFileUtil.package_for_download
        raises: ValueError when 'input_ref' is missing
        '''
        # validate parameters
        if 'input_ref' not in params:
            raise ValueError(
                'Cannot export Assembly- not input_ref field defined.')

        # export to a file
        fasta = self.assembly_as_fasta(ctx, {'ref': params['input_ref']})

        # create the output directory and move the file there; the directory
        # may be left over from an earlier export of an object with the same
        # name, which must not make this export fail
        export_package_dir = os.path.join(self.scratch, fasta['assembly_name'])
        if not os.path.isdir(export_package_dir):
            os.makedirs(export_package_dir)
        shutil.move(
            fasta['path'],
            os.path.join(export_package_dir, os.path.basename(fasta['path'])))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [params['input_ref']]
        })

        return {'shock_id': package_details['shock_id']}

    def assembly_as_fasta(self, ctx, params):
        '''
        main function that accepts a ref to an object and writes a file

        params: 'ref' (required) is the object reference; 'filename'
                (optional) overrides the default '<object name>.fa'
        returns: {'path': <fasta file path>, 'assembly_name': <object name>}
        raises: ValueError when 'ref' is missing or the object type is neither
                KBaseGenomes.ContigSet nor KBaseGenomeAnnotations.Assembly
        '''
        self.validate_params(params)

        print('downloading ws object data (' + params['ref'] + ')')
        assembly_object = self.dfu.get_objects(
            {'object_refs': [params['ref']]})['data'][0]
        ws_type = assembly_object['info'][2]
        obj_name = assembly_object['info'][1]

        if 'filename' in params:
            output_filename = params['filename']
        else:
            output_filename = obj_name + '.fa'

        output_fasta_file_path = os.path.join(self.scratch, output_filename)

        if 'KBaseGenomes.ContigSet' in ws_type:
            self.process_legacy_contigset(output_fasta_file_path,
                                          assembly_object['data'])
        elif 'KBaseGenomeAnnotations.Assembly' in ws_type:
            self.process_assembly(output_fasta_file_path,
                                  assembly_object['data'])
        else:
            raise ValueError(
                'Cannot write data to fasta; invalid WS type (' + ws_type +
                '). Supported types are KBaseGenomes.ContigSet and ' +
                'KBaseGenomeAnnotations.Assembly')

        return {'path': output_fasta_file_path, 'assembly_name': obj_name}

    def fasta_rows_generator_from_contigset(self, contig_list):
        '''
        generates SeqRecords iterator for writing from a legacy contigset object
        '''
        for contig in contig_list:
            # a missing or empty description becomes an empty string
            description = ''
            if 'description' in contig and contig['description']:
                description = contig['description']
            yield SeqRecord(Seq(contig['sequence'], SingleLetterAlphabet),
                            id=contig['id'],
                            description=description)

    def process_legacy_contigset(self, output_fasta_path, data):
        '''
        write the contigs embedded in a legacy ContigSet object as FASTA
        '''
        SeqIO.write(self.fasta_rows_generator_from_contigset(data['contigs']),
                    output_fasta_path,
                    "fasta")

    def process_assembly(self, output_fasta_path, data):
        '''
        download the FASTA file referenced by the Assembly's
        'fasta_handle_ref', asking DataFileUtil to uncompress it
        '''
        self.dfu.shock_to_file({
            'handle_id': data['fasta_handle_ref'],
            'file_path': output_fasta_path,
            'unpack': 'uncompress'
        })

    def validate_params(self, params):
        '''
        raise ValueError unless every required key ('ref') is in params
        '''
        for key in ['ref']:
            if key not in params:
                raise ValueError('required "' + key + '" field was not defined')
class FeatureSetDownload:
    """
    Exports a workspace FeatureSet as a TSV file and packages files for
    download.
    """

    def __init__(self, config):
        """
        config: mapping that must provide 'scratch' and 'workspace-url';
                the callback URL is read from the SDK_CALLBACK_URL
                environment variable
        """
        self.cfg = config
        self.scratch = config['scratch']
        self.gsu = GenomeSearchUtil(os.environ['SDK_CALLBACK_URL'])
        self.dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        self.ws = Workspace(config["workspace-url"])

    @staticmethod
    def validate_params(params, expected={"workspace_name", "featureset_name"}):
        """
        Raise ValueError unless every key in `expected` is present in `params`.

        The default for `expected` is never mutated; it is copied into a new
        set before use.
        """
        expected = set(expected)
        pkeys = set(params)
        missing = expected - pkeys
        if missing:
            # sort so that the message does not depend on set iteration order
            raise ValueError("Required keys {} not in supplied parameters"
                             .format(", ".join(sorted(missing))))

    def to_tsv(self, params):
        """
        Write the FeatureSet referenced by params['featureset_ref'] to
        '<featureset name>.tsv' inside a fresh working directory under scratch.

        returns: (featureset name, {'file_path': <path of the TSV file>})
        """
        working_dir = os.path.join(self.scratch,
                                   'featureset-download-'+str(uuid.uuid4()))
        os.makedirs(working_dir)
        header = ['Feature Id', 'Aliases', 'Genome', 'Type', 'Function']
        fs_name, fs_dicts = self.make_featureset_dict(params['featureset_ref'])
        files = {'file_path': "{}/{}.tsv".format(working_dir, fs_name)}
        # the context manager guarantees the file is flushed and closed before
        # the path is handed back to the caller
        with open(files['file_path'], 'w') as tsv_file:
            writer = csv.DictWriter(tsv_file, header, delimiter='\t',
                                    lineterminator='\n')
            writer.writeheader()
            for feat in fs_dicts:
                writer.writerow(feat)
        return fs_name, files

    def make_featureset_dict(self, fs_ref):
        """
        Fetch the FeatureSet `fs_ref` and look up its features genome by genome.

        returns: (featureset name, list of row dicts keyed by 'Feature Id',
                 'Aliases', 'Genome', 'Type' and 'Function')
        """
        features = []
        ret = self.dfu.get_objects({'object_refs': [fs_ref]})['data'][0]
        feat_set = ret['data']
        fs_name = ret['info'][1]

        # group the feature ids by the first entry of each element's value,
        # which is used below as the genome reference
        feat_by_genome = defaultdict(list)
        for k, v in feat_set['elements'].items():
            feat_by_genome[v[0]].append(k)

        for genome, fids in feat_by_genome.items():
            genome_name = self.ws.get_object_info3(
                {'objects': [{'ref': genome}]})['infos'][0][1]
            # one search per genome, sized to return all requested features
            res = self.gsu.search({'ref': genome,
                                   'structured_query': {'feature_id': fids},
                                   'sort_by': [['contig_id', 1]],
                                   'start': 0,
                                   'limit': len(fids)
                                   })

            for feat in res['features']:
                features.append({'Feature Id': feat['feature_id'],
                                 'Aliases': ", ".join(feat['aliases'].keys()),
                                 'Genome': "{} ({})".format(genome_name, genome),
                                 'Type': feat['feature_type'],
                                 'Function': feat['function']
                                 })
        return fs_name, features

    def export(self, files, name, params):
        """
        Move `files` into a new package directory under scratch and package it
        for download together with the reference params['featureset_ref'].

        files: iterable of file paths to include
        name: prefix of the package directory name
        returns: {'shock_id': ...} as reported by DataFileUtil.package_for_download
        """
        export_package_dir = os.path.join(self.scratch, name+str(uuid.uuid4()))
        os.makedirs(export_package_dir)

        for file_path in files:
            shutil.move(file_path,
                        os.path.join(export_package_dir,
                                     os.path.basename(file_path)))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [params['featureset_ref']]
        })

        return {'shock_id': package_details['shock_id']}
class MetagenomeFileUtils: def _validate_merge_bins_from_binned_contig_params(self, params): """ _validate_merge_bins_from_binned_contig_params: validates params passed to merge_bins_from_binned_contig method """ log('Start validating merge_bins_from_binned_contig params') # check for required parameters for p in [ 'old_binned_contig_ref', 'bin_merges', 'output_binned_contig_name', 'workspace_name' ]: if p not in params: raise ValueError( '"{}" parameter is required, but missing'.format(p)) bin_merges = params.get('bin_merges') if not isinstance(bin_merges, list): error_msg = 'expecting a list for bin_merges param, ' error_msg += 'but getting a [{}]'.format(type(bin_merges)) raise ValueError(error_msg) for bin_merge in bin_merges: for p in ['new_bin_id', 'bin_to_merge']: if p not in bin_merge: raise ValueError( '"{}" key is required in bin_merges, but missing'. format(p)) bin_to_merge = bin_merge.get('bin_to_merge') if not isinstance(bin_to_merge, list): error_msg = 'expecting a list for bin_to_merge, ' error_msg += 'but getting a [{}]'.format(type(bin_to_merge)) raise ValueError(error_msg) def _validate_remove_bins_from_binned_contig_params(self, params): """ _validate_remove_bins_from_binned_contig_params: validates params passed to remove_bins_from_binned_contig method """ log('Start validating remove_bins_from_binned_contig params') # check for required parameters for p in [ 'old_binned_contig_ref', 'bins_to_remove', 'output_binned_contig_name', 'workspace_name' ]: if p not in params: raise ValueError( '"{}" parameter is required, but missing'.format(p)) bins_to_remove = params.get('bins_to_remove') if not isinstance(bins_to_remove, list): error_msg = 'expecting a list for bins_to_remove param, ' error_msg += 'but getting a [{}]'.format(type(bins_to_remove)) raise ValueError(error_msg) def _validate_file_to_binned_contigs_params(self, params): """ _validate_file_to_binned_contigs_params: validates params passed to file_to_binned_contigs method """ log('Start 
validating file_to_binned_contigs params') # check for required parameters for p in [ 'assembly_ref', 'file_directory', 'binned_contig_name', 'workspace_name' ]: if p not in params: raise ValueError( '"{}" parameter is required, but missing'.format(p)) def _validate_binned_contigs_to_file_params(self, params): """ _validate_binned_contigs_to_file_params: validates params passed to binned_contigs_to_file method """ log('Start validating binned_contigs_to_file params') # check for required parameters for p in ['input_ref']: if p not in params: raise ValueError( '"{}" parameter is required, but missing'.format(p)) def _validate_extract_binned_contigs_as_assembly_params(self, params): """ _validate_extract_binned_contigs_as_assembly_params: validates params passed to extract_binned_contigs_as_assembly method """ log('Start validating extract_binned_contigs_as_assembly params') # check for required parameters for p in [ 'binned_contig_obj_ref', 'extracted_assemblies', 'assembly_suffix', 'workspace_name' ]: if p not in params: raise ValueError( '"{}" parameter is required, but missing'.format(p)) # convert comma-separated list of bins into a list of individual ids (the python # comprehension construction deals with the fact that split(',') returns a list of # length one, [''], for an empty string input extracted_assemblies = [ x for x in params.get('extracted_assemblies').split(',') if x ] # parameter assembly_set_name is required if extracted_assemblies list has more # than one element if len(extracted_assemblies) > 1 and 'assembly_set_name' not in params: raise ValueError( '"assembly_set_names" parameter is required for more than one extracted assembly' ) def _mkdir_p(self, path): """ _mkdir_p: make directory for given path """ if not path: return try: os.makedirs(path) except OSError as exc: if exc.errno == errno.EEXIST and os.path.isdir(path): pass else: raise def _get_bin_ids(self, file_directory): """ _get_bin_ids: getting bin contig ids from files NOTE: This 
method is very specific to MaxBin2 app result. Bin contig files generated by MaxBin2 follow 'header.0xx.fasta' name pattern """ bin_ids = [] result_files = os.listdir(file_directory) for file in result_files: if re.match(r'.*\.\d{3}\.fasta', file): bin_ids.append(file) log('generated bin ids:\n{}'.format('\n'.join(bin_ids))) return bin_ids def _process_summary_file(self, bin_id, lines): """ _process_summary_file: process header.summary file content getting GC content (gc), Genome size (sum_contig_len) and Completeness (cov) from header.summary file NOTE: This method is very specific to MaxBin2 app result. header.summary file could be one of below fomat: Bin name Abundance Completeness Genome size GC content maxbin_output.001.fasta 0.00 97.2% 2690533 52.9 Bin name Completeness Genome size GC content maxbin_output.001.fasta 97.2% 2690533 52.9 """ for line in lines: line_list = line.split('\t') if line_list[0] == bin_id: if len(line_list) == 5: gc = round(float(line_list[4]) / 100, 5) sum_contig_len = int(line_list[3]) cov = round(float(line_list[2].partition('%')[0]) / 100, 5) elif len(line_list) == 4: gc = round(float(line_list[3]) / 100, 5) sum_contig_len = int(line_list[2]) cov = round(float(line_list[1].partition('%')[0]) / 100, 5) return gc, sum_contig_len, cov def _get_total_contig_len(self, file_directory): """ _get_total_contig_len: process header.summary file content getting total contig length from header.summary file NOTE: This method is very specific to MaxBin2 app result. 
""" log('generating total contig length') total_contig_len = 0 file_list = os.listdir(file_directory) for file in file_list: if file.endswith('.summary'): with open(os.path.join(file_directory, file), 'r') as summary_file: lines = summary_file.readlines() for line in lines[1:]: line_list = line.split('\t') if len(line_list) == 5: total_contig_len += int(line_list[3]) elif len(line_list) == 4: total_contig_len += int(line_list[2]) log('generated total contig length: {}'.format(total_contig_len)) return total_contig_len def _generate_contig_bin_summary(self, bin_id, file_directory): """ _generate_contig_bin_summary: getting ContigBin summary from header.summary file NOTE: This method is very specific to MaxBin2 app result. """ log('generating summary for bin_id: {}'.format(bin_id)) file_list = os.listdir(file_directory) for file in file_list: if file.endswith('.summary'): with open(os.path.join(file_directory, file), 'r') as summary_file: lines = summary_file.readlines() gc, sum_contig_len, cov = self._process_summary_file( bin_id, lines) log('generated GC content: {}, Genome size: {} '.format( gc, sum_contig_len)) log('and Completeness: {} for bin_id: {}'.format(cov, bin_id)) return gc, sum_contig_len, cov def _generate_contigs(self, file_name, file_directory, assembly_ref): """ _generate_contigs: generate contigs from assembly object file_name: file name of fasta file file_directory: fasta file directory assembly_ref: associated assembly object reference """ log('start generating contig objects for file: {}'.format(file_name)) assembly = self.dfu.get_objects({'object_refs': [assembly_ref]})['data'][0] assembly_contigs = assembly.get('data').get('contigs') contigs = {} for record in SeqIO.parse(os.path.join(file_directory, file_name), "fasta"): contig_id = record.id contig = assembly_contigs.get(contig_id) if contig: # using assembly object data contig_gc = contig.get('gc_content') sequence_length = contig.get('length') else: log('cannot find contig [{}] from 
assembly.'.format(contig_id)) log('computing contig info') sequence = str(record.seq).upper() sequence_length = len(sequence) contig_gc_len = 0 contig_gc_len += sequence.count('G') contig_gc_len += sequence.count('C') contig_gc = round( float(contig_gc_len) / float(sequence_length), 5) contig = {'gc': contig_gc, 'len': sequence_length} contigs[contig_id] = contig log('complete generating contig objects for file: {}'.format( file_name)) return contigs def _generate_contig_bin(self, bin_id, file_directory, assembly_ref): """ _generate_contig_bin: gerneate ContigBin structure """ log('start generating BinnedContig info for bin: {}'.format(bin_id)) # generate ContigBin summery info gc, sum_contig_len, cov = self._generate_contig_bin_summary( bin_id, file_directory) # generate Contig info contigs = self._generate_contigs(bin_id, file_directory, assembly_ref) contig_bin = { 'bid': bin_id, 'contigs': contigs, 'n_contigs': len(contigs), 'gc': gc, 'sum_contig_len': sum_contig_len, 'cov': cov } log('complete generating BinnedContig info for bin: {}'.format(bin_id)) return contig_bin def _get_contig_file(self, assembly_ref): """ _get_contig_file: get contif file from GenomeAssembly object """ log('retrieving contig file from assembly: {}'.format(assembly_ref)) contig_file = self.au.get_assembly_as_fasta({ 'ref': assembly_ref }).get('path') sys.stdout.flush() contig_file = self.dfu.unpack_file({'file_path': contig_file})['file_path'] log('saved contig file to: {}'.format(contig_file)) return contig_file def _get_contig_string(self, contig_id, assembly_contig_file, parsed_assembly): """ _get_contig_string: find and return contig string from assembly contig file """ # parsed_assembly = SeqIO.to_dict(SeqIO.parse(assembly_contig_file, "fasta")) contig_record = parsed_assembly.get(contig_id) if contig_record: string_contig = '' string_contig += '>{}\n'.format(contig_id) string_contig += str(contig_record.seq).upper() string_contig += '\n' else: error_msg = 'Cannot find contig [{}] 
from file [{}].'.format( contig_id, assembly_contig_file) raise ValueError(error_msg) return string_contig def _pack_file_to_shock(self, result_files): """ _pack_file_to_shock: pack files in result_files list and save in shock """ log('start packing and uploading files:\n{}'.format( '\n'.join(result_files))) output_directory = os.path.join( self.scratch, 'packed_binned_contig_' + str(uuid.uuid4())) self._mkdir_p(output_directory) result_file = os.path.join( output_directory, 'packed_binned_contig_' + str(uuid.uuid4()) + '.zip') with zipfile.ZipFile(result_file, 'w', zipfile.ZIP_DEFLATED, allowZip64=True) as zip_file: for file in result_files: zip_file.write(file, os.path.basename(file)) shock_id = self.dfu.file_to_shock({ 'file_path': result_file }).get('shock_id') log('saved file to shock: {}'.format(shock_id)) return shock_id def _generate_report(self, report_message, params): """ generate_report: generate summary report """ log('Generating report') uuid_string = str(uuid.uuid4()) upload_message = 'Job Finished\n\n' upload_message += report_message log('Report message:\n{}'.format(upload_message)) report_params = { 'message': upload_message, 'workspace_name': params.get('workspace_name'), 'report_object_name': 'MetagenomeUtils_report_' + uuid_string } kbase_report_client = KBaseReport(self.callback_url) output = kbase_report_client.create_extended_report(report_params) report_output = { 'report_name': output['name'], 'report_ref': output['ref'] } return report_output def _generate_report_message(self, new_binned_contig_ref): """ _generate_report_message: generate a report message for BinnedContig object """ report_message = '' binned_contig = self.dfu.get_objects( {'object_refs': [new_binned_contig_ref]})['data'][0] binned_contig_info = binned_contig.get('info') binned_contig_name = binned_contig_info[1] report_message += 'Generated BinnedContig: {} [{}]\n'.format( binned_contig_name, new_binned_contig_ref) binned_contig_count = 0 total_bins = 
binned_contig.get('data').get('bins') total_bins_count = len(total_bins) bin_ids = [] for bin in total_bins: binned_contig_count += len(bin.get('contigs')) bin_ids.append(bin.get('bid')) report_message += '--------------------------\nSummary:\n\n' report_message += 'Binned contigs: {}\n'.format(binned_contig_count) report_message += 'Total size of bins: {}\n'.format(total_bins_count) report_message += 'Bin IDs:\n{}\n'.format('\n'.join(bin_ids)) return report_message def _merge_bins(self, new_bin_id, bin_objects_to_merge): """ _merge_bins: merge a list of bins into new_bin_id """ total_contigs = {} total_gc_count = 0 total_sum_contig_len = 0 total_cov_len = 0 for bin in bin_objects_to_merge: total_contigs.update(bin.get('contigs')) sum_contig_len = bin.get('sum_contig_len') total_sum_contig_len += sum_contig_len total_gc_count += sum_contig_len * bin.get('gc') total_cov_len += sum_contig_len * bin.get('cov') contig_bin = { 'bid': new_bin_id, 'contigs': total_contigs, 'n_contigs': len(total_contigs), 'gc': round(float(total_gc_count) / total_sum_contig_len, 5), 'sum_contig_len': total_sum_contig_len, 'cov': round(float(total_cov_len) / total_sum_contig_len, 5) } return contig_bin def _save_binned_contig(self, binned_contigs, workspace_name, binned_contig_name): """ _build_binned_contig: save BinnedContig object """ workspace_name = workspace_name if isinstance(workspace_name, int) or workspace_name.isdigit(): workspace_id = workspace_name else: workspace_id = self.dfu.ws_name_to_id(workspace_name) object_type = 'KBaseMetagenomes.BinnedContigs' save_object_params = { 'id': workspace_id, 'objects': [{ 'type': object_type, 'data': binned_contigs, 'name': binned_contig_name }] } dfu_oi = self.dfu.save_objects(save_object_params)[0] new_binned_contig_ref = str(dfu_oi[6]) + '/' + str( dfu_oi[0]) + '/' + str(dfu_oi[4]) return new_binned_contig_ref def _check_bin_merges(self, bin_merges): """ _check_bin_merges: checking bin_merges """ bin_id_list = map(lambda item: 
item.get('bin_to_merge'), bin_merges) bin_ids = [] map(lambda item: map(lambda bin_id: bin_ids.append(bin_id), item), bin_id_list) for bin_id in bin_id_list: if len(bin_id) <= 1: raise ValueError( "Please provide at least two bin_ids to merge") for id in bin_id: if bin_ids.count(id) > 1: raise ValueError( "Same bin [{}] appears in muliple merges".format(id)) new_bin_id_list = map(lambda item: item.get('new_bin_id'), bin_merges) for new_bin_id in new_bin_id_list: if new_bin_id_list.count(new_bin_id) > 1: raise ValueError( "Same new Bin ID [{}] appears in muliple merges".format( id)) def __init__(self, config): self.callback_url = config['SDK_CALLBACK_URL'] self.scratch = config['scratch'] self.shock_url = config['shock-url'] self.dfu = DataFileUtil(self.callback_url) self.au = AssemblyUtil(self.callback_url) self.setapi = SetAPI(self.callback_url) self.wss = workspaceService(config['workspace-url']) def file_to_binned_contigs(self, params): """ file_to_binned_contigs: Generating BinnedContigs ojbect from files input params: file_directory: file directory containing compressed/unpacked contig file(s) to build BinnedContig object assembly_ref: metagenome assembly object reference binned_contig_name: BinnedContig object name workspace_name: the name/id of the workspace it gets saved to return params: binned_contig_obj_ref: generated result BinnedContig object reference """ log('--->\nrunning MetagenomeFileUtils.file_to_binned_contigs\n' + 'params:\n{}'.format(json.dumps(params, indent=1))) self._validate_file_to_binned_contigs_params(params) file_directory = params.get('file_directory') assembly_ref = params.get('assembly_ref') log('starting generating BinnedContig object') bin_ids = self._get_bin_ids(file_directory) bins = [] for bin_id in bin_ids: contig_bin = self._generate_contig_bin(bin_id, file_directory, assembly_ref) bins.append(contig_bin) log('finished generating BinnedContig object') total_contig_len = self._get_total_contig_len(file_directory) 
binned_contigs = { 'assembly_ref': assembly_ref, 'bins': bins, 'total_contig_len': total_contig_len } binned_contig_obj_ref = self._save_binned_contig( binned_contigs, params.get('workspace_name'), params.get('binned_contig_name')) returnVal = {'binned_contig_obj_ref': binned_contig_obj_ref} log('successfully saved BinnedContig object') return returnVal def binned_contigs_to_file(self, params): """ binned_contigs_to_file: Convert BinnedContig object to fasta files and pack them to shock input params: input_ref: BinnedContig object reference optional params: save_to_shock: saving result bin files to shock. default to True bin_id_list: only extract bin_id_list return params: shock_id: saved packed file shock id bin_file_directory: directory that contains all bin files """ log('--->\nrunning MetagenomeFileUtils.binned_contigs_to_file\n' + 'params:\n{}'.format(json.dumps(params, indent=1))) self._validate_binned_contigs_to_file_params(params) binned_contig_object = self.dfu.get_objects( {'object_refs': [params.get('input_ref')]})['data'][0] assembly_ref = binned_contig_object.get('data').get('assembly_ref') assembly_contig_file = self._get_contig_file(assembly_ref) log('parsing assembly file [{}] to dictionary'.format( assembly_contig_file)) parsed_assembly = SeqIO.to_dict( SeqIO.parse(assembly_contig_file, "fasta")) bins = binned_contig_object.get('data').get('bins') result_directory = os.path.join( self.scratch, 'binned_contig_files_' + str(uuid.uuid4())) self._mkdir_p(result_directory) result_files = [] bin_id_list = params.get('bin_id_list') for bin in bins: bin_id = bin.get('bid') if bin_id_list: if bin_id in bin_id_list: log('processing bin: {}'.format(bin_id)) with open(os.path.join(result_directory, bin_id), 'w') as file: contigs = bin.get('contigs') for contig_id in contigs.keys(): contig_string = self._get_contig_string( contig_id, assembly_contig_file, parsed_assembly) file.write(contig_string) result_files.append(os.path.join(result_directory, bin_id)) 
log('saved contig file to: {}'.format(result_files[-1])) else: log('processing bin: {}'.format(bin_id)) with open(os.path.join(result_directory, bin_id), 'w') as file: contigs = bin.get('contigs') for contig_id in contigs.keys(): contig_string = self._get_contig_string( contig_id, assembly_contig_file, parsed_assembly) file.write(contig_string) result_files.append(os.path.join(result_directory, bin_id)) log('saved contig file to: {}'.format(result_files[-1])) if params.get('save_to_shock') or params.get('save_to_shock') is None: shock_id = self._pack_file_to_shock(result_files) else: shock_id = None returnVal = { 'shock_id': shock_id, 'bin_file_directory': result_directory } return returnVal def _get_object_name_from_ref(self, obj_ref): """given the object reference, return the object_name as a string""" return (self.wss.get_object_info_new({"objects": [{ 'ref': obj_ref }]})[0][1]) def extract_binned_contigs_as_assembly(self, params): """ extract_binned_contigs_as_assembly: extract one/multiple Bins from BinnedContigs as Assembly input params: binned_contig_obj_ref: BinnedContig object reference extracted_assemblies: a string, a comma-separated list of bin_ids to be extracted workspace_name: the name of the workspace it gets saved to return params: assembly_ref_list: a list of generated result Assembly object reference report_name: report name generated by KBaseReport report_ref: report reference generated by KBaseReport """ log('--->\nrunning MetagenomeFileUtils.extract_binned_contigs_as_assembly\n' + 'params:\n{}'.format(json.dumps(params, indent=1))) self._validate_extract_binned_contigs_as_assembly_params(params) # convert comma-separated list of bins into a list of individual ids (the python # comprehension construction deals with the fact that split(',') returns a list of # length one, [''], for an empty string input extracted_assemblies = [ x for x in params.get('extracted_assemblies').split(',') if x ] binned_contig_obj_ref = 
params.get('binned_contig_obj_ref') contigs_to_file_ret = self.binned_contigs_to_file({ 'input_ref': binned_contig_obj_ref, 'save_to_shock': False, 'bin_id_list': extracted_assemblies }) bin_file_directory = contigs_to_file_ret.get('bin_file_directory') # bin_files will be either a list of the bin contig files corresponding to the # target bin ids, or a list of all bin contig files if extracted_assemblies is empty bin_files = os.listdir(bin_file_directory) # if extracted_assemblies is empty list, create a full one here if not extracted_assemblies: extracted_assemblies = bin_files log("extracted_assemblies was empty, is now " + pformat(extracted_assemblies)) generated_assembly_ref_list = [] assembly_suffix = params.get('assembly_suffix').strip() for bin_id in extracted_assemblies: if bin_id not in map(os.path.basename, bin_files): error_msg = 'bin_id [{}] cannot be found in BinnedContig '.format( bin_id) error_msg += '[{}]'.format(binned_contig_obj_ref) raise ValueError(error_msg) else: output_assembly_name = bin_id + assembly_suffix log('saving assembly: {}'.format(output_assembly_name)) for bin_file in bin_files: if os.path.basename(bin_file) == bin_id: log('starting generating assembly from {}'.format( bin_id)) assembly_params = { 'file': { 'path': os.path.join(bin_file_directory, bin_file) }, 'workspace_name': params.get('workspace_name'), 'assembly_name': output_assembly_name } assembly_ref = self.au.save_assembly_from_fasta( assembly_params) log('finished generating assembly from {}'.format( bin_id)) generated_assembly_ref_list.append(assembly_ref) setref = None if (len(generated_assembly_ref_list) > 1): binned_contig_object_name = self._get_object_name_from_ref( binned_contig_obj_ref) assembly_set_name = params.get('assembly_set_name') log("saving assembly set {0}".format(assembly_set_name)) setref = self.setapi.save_assembly_set_v1({ 'workspace': params.get('workspace_name'), 'output_object_name': assembly_set_name, 'data': { 'description': 'binned 
assemblies from {0}'.format( binned_contig_object_name), 'items': [{ 'ref': r } for r in generated_assembly_ref_list] } }) log("save assembly set_ref is {0}".format(setref.get('set_ref'))) report_message = 'Generated Assembly Reference: {}'.format( ', '.join(generated_assembly_ref_list)) reportVal = self._generate_report(report_message, params) returnVal = {'assembly_ref_list': generated_assembly_ref_list} returnVal.update(reportVal) if setref: returnVal.update({'assembly_set_ref': setref}) return returnVal def remove_bins_from_binned_contig(self, params): """ remove_bins_from_binned_contig: remove a list of bins from BinnedContig object input params: old_binned_contig_ref: Original BinnedContig object reference bins_to_remove: a list of bin ids to be removed output_binned_contig_name: Name for the output BinnedContigs object workspace_name: the name of the workspace new object gets saved to return params: new_binned_contig_ref: newly created BinnedContig object referece """ log('--->\nrunning MetagenomeFileUtils.remove_bins_from_binned_contig\n' + 'params:\n{}'.format(json.dumps(params, indent=1))) self._validate_remove_bins_from_binned_contig_params(params) binned_contig_object = self.dfu.get_objects( {'object_refs': [params.get('old_binned_contig_ref')]})['data'][0] assembly_ref = binned_contig_object.get('data').get('assembly_ref') total_contig_len = int( binned_contig_object.get('data').get('total_contig_len')) old_bins = binned_contig_object.get('data').get('bins') bins_to_remove = params.get('bins_to_remove') for bin in list(old_bins): bin_id = bin.get('bid') if bin_id in bins_to_remove: log('removing bin_id: {}'.format(bin_id)) old_bins.remove(bin) total_contig_len -= int(bin.get('sum_contig_len')) log('removed bin_id: {} from BinnedContig object'.format( bin_id)) binned_contigs = { 'assembly_ref': assembly_ref, 'bins': old_bins, 'total_contig_len': total_contig_len } new_binned_contig_ref = self._save_binned_contig( binned_contigs, 
params.get('workspace_name'), params.get('output_binned_contig_name')) returnVal = {'new_binned_contig_ref': new_binned_contig_ref} log('successfully saved BinnedContig object') return returnVal def merge_bins_from_binned_contig(self, params): """ merge_bins_from_binned_contig: merge a list of bins from BinnedContig object input params: old_binned_contig_ref: Original BinnedContig object reference bin_merges: a list of bin merges dicts new_bin_id: newly created bin id bin_to_merge: list of bins to merge output_binned_contig_name: Name for the output BinnedContigs object workspace_name: the name of the workspace new object gets saved to return params: new_binned_contig_ref: newly created BinnedContig object referece """ log('--->\nrunning MetagenomeFileUtils.merge_bins_from_binned_contig\n' + 'params:\n{}'.format(json.dumps(params, indent=1))) self._validate_merge_bins_from_binned_contig_params(params) bin_merges = params.get('bin_merges') self._check_bin_merges(bin_merges) binned_contig_object = self.dfu.get_objects( {'object_refs': [params.get('old_binned_contig_ref')]})['data'][0] assembly_ref = binned_contig_object.get('data').get('assembly_ref') total_contig_len = int( binned_contig_object.get('data').get('total_contig_len')) bins = binned_contig_object.get('data').get('bins') old_bin_ids = map(lambda item: item.get('bid'), bins) for bin_merge in bin_merges: new_bin_id = bin_merge.get('new_bin_id') bin_id_to_merge = bin_merge.get('bin_to_merge') if set(bin_id_to_merge) <= set(old_bin_ids): bin_objects_to_merge = [] for bin in list(bins): bin_id = bin.get('bid') if bin_id in bin_id_to_merge: bin_objects_to_merge.append(bin) log('removing bin_id: {}'.format(bin_id)) bins.remove(bin) total_contig_len -= int(bin.get('sum_contig_len')) log('removed bin_id: {} from BinnedContig object'. 
format(bin_id)) new_bin = self._merge_bins(new_bin_id, bin_objects_to_merge) log('appending bin_id: {}'.format(new_bin_id)) bins.append(new_bin) total_contig_len += int(new_bin.get('sum_contig_len')) log('appended bin_id: {} to BinnedContig object'.format( new_bin_id)) else: bad_bin_ids = list(set(bin_id_to_merge) - set(old_bin_ids)) error_msg = 'bin_id: [{}] '.format(', '.join(bad_bin_ids)) error_msg += 'is not listed in BinnedContig object' raise ValueError(error_msg) binned_contigs = { 'assembly_ref': assembly_ref, 'bins': bins, 'total_contig_len': total_contig_len } new_binned_contig_ref = self._save_binned_contig( binned_contigs, params.get('workspace_name'), params.get('output_binned_contig_name')) returnVal = {'new_binned_contig_ref': new_binned_contig_ref} log('successfully saved BinnedContig object') return returnVal def edit_bins_from_binned_contig(self, params): """ edit_bins_from_binned_contig: merge/remove a list of bins from BinnedContig object a wrapper method of: merge_bins_from_binned_contig remove_bins_from_binned_contig input params: old_binned_contig_ref: Original BinnedContig object reference bins_to_remove: a list of bin ids to be removed bin_merges: a list of bin merges dicts new_bin_id: newly created bin id bin_to_merge: list of bins to merge output_binned_contig_name: Name for the output BinnedContigs object workspace_name: the name of the workspace new object gets saved to return params: new_binned_contig_ref: newly created BinnedContig object referece report_name: report name generated by KBaseReport report_ref: report reference generated by KBaseReport """ log('--->\nrunning MetagenomeFileUtils.edit_bins_from_binned_contig\n' + 'params:\n{}'.format(json.dumps(params, indent=1))) input_params = params.copy() if params.get('bins_to_remove'): bins_to_remove = input_params.get('bins_to_remove') if isinstance(bins_to_remove, string_types): input_params['bins_to_remove'] = bins_to_remove.split(',') new_binned_contig_ref = 
self.remove_bins_from_binned_contig( input_params).get('new_binned_contig_ref') input_params['old_binned_contig_ref'] = new_binned_contig_ref if params.get('bin_merges'): new_binned_contig_ref = self.merge_bins_from_binned_contig( input_params).get('new_binned_contig_ref') returnVal = {'new_binned_contig_ref': new_binned_contig_ref} report_message = self._generate_report_message(new_binned_contig_ref) reportVal = self._generate_report(report_message, params) returnVal.update(reportVal) return returnVal
def get_promoter_for_gene(self, ctx, params):
    """
    Write the promoter sequences of the features in a FeatureSet to a FASTA
    file, publish them in an HTML report, and return the FASTA path.

    :param params: instance of type "get_promoter_for_gene_input" (Genome
       is a KBase genome Featureset is a KBase featureset Promoter_length
       is the length of promoter requested for all genes) -> structure:
       parameter "workspace_name" of String, parameter "genome_ref" of
       String, parameter "featureSet_ref" of String, parameter
       "promoter_length" of Long
    :returns: instance of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN get_promoter_for_gene
    dfu = DataFileUtil(self.callback_url)
    objectRefs = {'object_refs': [params['featureSet_ref']]}
    # NOTE(review): the workspace URL is hard-coded to the appdev environment;
    # confirm whether it should come from the service configuration instead.
    ws = Workspace('https://appdev.kbase.us/services/ws')
    subset = ws.get_object_subset([{
        'included':
        ['/features/[*]/location', '/features/[*]/id', '/assembly_ref'],
        'ref':
        params['genome_ref']
    }])
    features = subset[0]['data']['features']
    aref = subset[0]['data']['assembly_ref']
    objects = dfu.get_objects(objectRefs)
    featureSet = objects['data'][0]['data']
    assembly_ref = {'ref': aref}

    print('Downloading Assembly data as a Fasta file.')
    assemblyUtil = AssemblyUtil(self.callback_url)
    fasta_file = assemblyUtil.get_assembly_as_fasta(assembly_ref)

    # Index genome features by id once; setdefault keeps the first occurrence,
    # matching the previous "scan and break on first match" lookup.
    feature_by_id = {}
    for f in features:
        feature_by_id.setdefault(f['id'], f)

    promoter_length = params['promoter_length']
    complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}

    #TODO: add some error checking logic to the bounds of the promoter
    prom = ""
    for feature in featureSet['elements']:
        matched = feature_by_id.get(feature)
        if matched is None:
            print('Could not find feature ' + feature + 'in genome')
            continue

        # The code reads the location tuple as: [0] the FASTA record id,
        # [1] a coordinate, [2] the strand; [1] is presumably the feature
        # start - confirm against the Genome spec.
        attributes = matched['location'][0]

        for record in SeqIO.parse(fasta_file['path'], 'fasta'):
            if record.id != attributes[0]:
                continue

            if attributes[2] == '+':
                #might need to offset by 1?
                end = attributes[1]
                # clamp the START: a negative start would make the slice wrap
                # around to the end of the sequence
                start = max(end - promoter_length, 0)
                promoter = str(record.seq[start:end]).upper()
            elif attributes[2] == '-':
                start = attributes[1]
                # slice ends are exclusive, so clamp to len (not len - 1) to
                # keep the final base
                end = min(start + promoter_length, len(record.seq))
                forward = str(record.seq[start:end]).upper()
                # reverse complement; bases outside ACGTN become N instead of
                # raising KeyError
                promoter = ''.join(
                    complement.get(base, 'N') for base in reversed(forward))
            else:
                print('Error on orientation')
                continue

            prom += ">" + feature + "\n"
            prom += promoter + "\n"

    promOutputPath = '/kb/module/work/tmp/promFile.fa'
    with open(promOutputPath, 'w') as promFile:
        promFile.write(prom)

    timestamp = int(
        (datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds() *
        1000)
    html_output_dir = os.path.join(self.shared_folder,
                                   'output_html.' + str(timestamp))
    if not os.path.exists(html_output_dir):
        os.makedirs(html_output_dir)
    html_file = 'promoter.html'
    output_html_file_path = os.path.join(html_output_dir, html_file)

    html_report_lines = '<html><body>'
    html_report_lines += '<pre>' + prom + '</pre>'
    html_report_lines += '</body></html>'

    # default buffering: unbuffered text mode (buffering=0) is rejected by
    # Python 3, and the with-block flushes on exit anyway
    with open(output_html_file_path, 'w') as html_handle:
        html_handle.write(html_report_lines)

    try:
        html_upload_ret = dfu.file_to_shock({
            'file_path': html_output_dir,
            'make_handle': 0,
            'pack': 'zip'
        })
    except Exception as err:
        # keep the underlying cause visible instead of discarding it
        raise ValueError(
            'error uploading HTML file to shock: {}'.format(err))

    reportName = 'identify_promoter_report_' + str(uuid.uuid4())

    reportObj = {
        'objects_created': [],
        'message': '',
        'direct_html': '',
        'direct_html_index': 0,
        'direct_html_link_index': 0,
        'file_links': [],
        'html_links': [{
            'shock_id': html_upload_ret['shock_id'],
            'name': html_file,
            'label': 'View'
        }],
        'html_window_height': 220,
        'workspace_name': params['workspace_name'],
        'report_object_name': reportName
    }

    report = KBaseReport(self.callback_url, token=ctx['token'])
    # the report is created for its side effect; the method returns the path
    report.create_extended_report(reportObj)

    #changing output to be path string
    #TODO: get rid of this html maybe and move into find_motifs
    output = promOutputPath
    #END get_promoter_for_gene

    # At some point might do deeper type checking...
    try:
        string_type = basestring  # Python 2
    except NameError:
        string_type = str  # Python 3 has no basestring
    if not isinstance(output, string_type):
        raise ValueError('Method get_promoter_for_gene return value ' +
                         'output is not type basestring as required.')
    # return the results
    return [output]
class variation_importer_utils:
    """Validate a VCF file and import it as a KBase Variation object.

    The public entry point is :meth:`validate_vcf`. It copies the VCF and
    sample-location files into a private scratch directory, parses the VCF
    header, checks the contigs against the genome's assembly, runs an
    external VCF validator, optionally saves a ``KBaseGwasData.Variations``
    object and finally builds an extended KBase report.
    """

    def __init__(self, utility_params):
        """Create a private scratch directory and the service clients.

        :param utility_params: dict read for the keys ``scratch``,
            ``srv-wiz-url``, ``callback_url`` and ``token``.
        """
        self.params = utility_params
        # Every instance works in its own uniquely named sub-directory.
        self.scratch = os.path.join(utility_params['scratch'],
                                    'variation_importer_' + str(uuid.uuid4()))
        os.mkdir(self.scratch)
        self.service_wiz_url = utility_params['srv-wiz-url']
        self.callback_url = utility_params['callback_url']
        self.dfu = DataFileUtil(self.callback_url)
        self.kbr = KBaseReport(self.callback_url,
                               token=utility_params['token'])

    def _create_fake_location_data(self):
        """Return a randomly generated location dict (placeholder data)."""
        # ``range`` behaves identically to ``xrange`` for 20 items and also
        # exists on Python 3.
        description = "".join(
            [random.choice(string.ascii_letters) for _ in range(20)])
        location = {
            'lat': random.uniform(-90, 90),
            'lon': random.uniform(-180, 180),
            'elevation': random.uniform(0, 100),
            'description': description
        }
        return location

    def _create_fake_straininfo(self, genotype_id):
        """Return a strain record for ``genotype_id`` with a fake location."""
        straininfo = {
            'source_id': genotype_id,
            'location_info': self._create_fake_location_data()
        }
        return straininfo

    def _create_fake_population(self, genotypes):
        """Return a population dict with one fake strain per genotype."""
        population = {'description': 'Faker population data.', 'strains': []}
        for genome in genotypes:
            population['strains'].append(self._create_fake_straininfo(genome))
        return population

    def _create_fake_kinship_matrix(self):
        """Return a fixed 2x2 placeholder kinship matrix."""
        kinship = {
            'row_ids': ['one', 'two'],
            'col_ids': ['one', 'two'],
            'kinship_coefficients': [[0.1, 0.1], [0.1, 0.1]]
        }
        return kinship

    def _compare(self, s, t):
        """Return True when ``s`` and ``t`` hold the same items, in any order.

        Duplicates are significant: the comparison is on item counts.
        """
        return Counter(s) == Counter(t)

    def pretend_download_staging_file(self, vcf_filename, scratch):
        """Copy ``vcf_filename`` from ``/kb/module/data/`` into ``scratch``.

        :returns: ``{'copy_file_path': <path of the copy>}``
        """
        vcf_filepath = os.path.join(scratch, vcf_filename)
        shutil.copy('/kb/module/data/' + vcf_filename, vcf_filepath)
        return {'copy_file_path': vcf_filepath}

    def _generate_population(self, location_filepath, genotypes,
                             population_description="None Provided"):
        """Build the population dict from a tab-separated location file.

        :param location_filepath: TSV whose first three columns are
            ``id``, ``latitude`` and ``longitude``; ``elevation`` and
            ``description`` are optional.
        :param genotypes: sample IDs read from the VCF ``#CHROM`` line.
        :param population_description: free text stored on the population.
        :returns: ``{'description': ..., 'strains': [...]}``
        :raises ValueError: if the location IDs differ from ``genotypes`` or
            the first three column names are not the expected ones.
        """
        locations = pd.read_csv(location_filepath, delimiter='\t')

        # Drop any missing data from id, latitude, or longitude.
        locations.dropna(subset=['id', 'latitude', 'longitude'], inplace=True)

        # Compare the location IDs with the genotype IDs.
        location_ids = locations.iloc[:, 0].astype(str).tolist()
        if not self._compare(location_ids, genotypes):
            log("Location IDs do not match Sample IDs in Variation file!")
            raise ValueError(
                "Location IDs do not match Sample IDs in Variation file!")

        col_names = [x.lower() for x in locations.columns.values]
        expected_columns = ['id', 'latitude', 'longitude']
        optional_columns = ['elevation', 'description']

        # Check that first three columns match the expected columns.
        if not self._compare(col_names[0:3], expected_columns):
            raise ValueError("Missing or unexpected column names in {}".format(
                location_filepath))

        # If optional columns are not present, give default value for each.
        for col in optional_columns:
            if col not in col_names:
                if col == 'elevation':
                    locations[col] = 0.0
                else:
                    locations[col] = "None provided."

        population = {'description': population_description, 'strains': []}
        for _, row in locations.iterrows():
            population['strains'].append({
                'source_id': str(row['id']),
                'location_info': {
                    'lat': row['latitude'],
                    'lon': row['longitude'],
                    'elevation': row['elevation'],
                    'description': row['description']
                }
            })
        return population

    @staticmethod
    def _find_validation_log(validation_output_dir):
        """Return the name of a ``.txt`` file in the directory, or None."""
        for name in os.listdir(validation_output_dir):
            if name.endswith('.txt'):
                return name
        return None

    def _validate_vcf(self, vcf_filepath, vcf_version):
        """Run the external VCF validator on ``vcf_filepath``.

        ``vcf_validator_linux`` is used for version 4.1 and above, the
        vcftools ``vcf-validator`` otherwise.

        :returns: ``(validation_log_path, validator_return_code)``
        :raises Exception: if the validator left no ``.txt`` log file in its
            output directory.
        """
        validation_output_dir = os.path.join(
            self.scratch, 'validation_' + str(uuid.uuid4()))
        os.mkdir(validation_output_dir)

        if vcf_version >= 4.1:
            print("Using vcf_validator_linux...")
            validator_cmd = ["vcf_validator_linux",
                             "-i", vcf_filepath,
                             "-o", validation_output_dir]
        else:
            print("Using vcftools to validate...")
            validator_cmd = ["vcf-validator", vcf_filepath]
            print("VCF version below 4.1. No validation logging.")

        print("Validator command: {}".format(validator_cmd))

        p = subprocess.Popen(validator_cmd,
                             cwd=self.scratch,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)
        # Drain the pipe so the child cannot block on a full buffer.
        while True:
            line = p.stdout.readline()
            if not line:
                break
        p.wait()

        # Look the log up safely: indexing an empty listing used to raise
        # IndexError before the intended error below could be reported.
        validation_output_filename = self._find_validation_log(
            validation_output_dir)
        if not validation_output_filename:
            print('Validator did not generate log file!')
            raise Exception("Validator did not generate a log file.")

        validation_output_filepath = os.path.join(validation_output_dir,
                                                  validation_output_filename)
        log("Validator output filepath: {}".format(validation_output_filepath))
        log("Return code from validator {}".format(p.returncode))
        return validation_output_filepath, p.returncode

    # Retrieve contigs from assembly file.
    def _get_contigs_from_assembly(self, assembly_ref, type='Assembly'):
        """Return the ``contigs`` mapping of the assembly object.

        :param assembly_ref: workspace reference of the assembly.
        :param type: unused; kept for interface compatibility.
        :returns: the object's ``contigs`` dict, as stored.
        :raises ValueError: if the object cannot be fetched.
        """
        try:
            assembly_data = self.dfu.get_objects(
                {'object_refs': [assembly_ref]})['data'][0]['data']
        except Exception as e:
            print("Unable to retrieve Assembly reference: {}".format(
                assembly_ref))
            raise ValueError(e)
        # The raw mapping is what has always been returned; a derived
        # key -> contig_id dict used to be built here but was never used.
        return assembly_data['contigs']

    @staticmethod
    def _parse_contig_id(line):
        """Return the ID value of a ``##contig=<...>`` header line.

        The ID ends at the next ``,`` or ``>``, so additional fields such as
        ``length`` do not leak into it. Returns None when the line carries
        no ``ID=`` field.
        """
        for marker in ('<ID=', ',ID='):
            _, found, rest = line.partition(marker)
            if found:
                break
        else:
            return None
        end = len(rest)
        for stop in (',', '>'):
            pos = rest.find(stop)
            if pos != -1:
                end = min(end, pos)
        return rest[:end].strip()

    def _get_version_contigs_genotypes(self, vcf_filepath):
        """Read version, contig IDs and genotype IDs from a VCF header.

        Plain and ``.gz`` files are supported. Reading stops at the
        ``#CHROM`` line; the sample columns (index 9 onwards) of that line
        are the genotypes.

        :returns: ``(version, contigs, genotypes)``
        :raises ValueError: if the first line is not a well-formed
            ``##fileformat=`` line.
        """
        contigs = []
        genotypes = []
        version = ''
        opener = gzip.open if vcf_filepath.endswith('.gz') else open
        with opener(vcf_filepath, 'rt') as vcf:
            line = vcf.readline()
            tokens = line.split('=')
            # A ``##fileformat`` line without '=' is just as malformed.
            if len(tokens) < 2 or not tokens[0].startswith('##fileformat'):
                log("Invalid VCF.  ##fileformat line in meta is improperly formatted."
                    )
                raise ValueError(
                    "Invalid VCF.  ##fileformat line in meta is improperly formatted."
                )
            # e.g. "VCFv4.1\n": the last four characters hold the number.
            version = float(tokens[1][-4:].rstrip())
            log("VCF version: {}".format(version))

            for line in vcf:
                if line.startswith("#CHROM"):
                    log("#CHROM encountered, exiting loop.")
                    genotypes = line.split()[9:]
                    log("Number Genotypes in vcf: {}".format(len(genotypes)))
                    break
                if line.startswith('##contig'):
                    contig_id = self._parse_contig_id(line)
                    if contig_id:
                        contigs.append(contig_id)
                    else:
                        log("Skipping contig line without an ID: {}".format(
                            line.rstrip()))
        return version, contigs, genotypes

    # Arabidopsis ref: 18590/2/8
    def _get_assembly_ref_from_genome(self, genome_ref):
        """Return the assembly reference the genome object points to.

        :raises Exception: if the GenomeAnnotationAPI call fails.
        """
        ga = GenomeAnnotationAPI(self.service_wiz_url)
        inputs_get_assembly = {'ref': genome_ref}
        try:
            assembly_object_ref = ga.get_assembly(inputs_get_assembly)
        except Exception as e:
            print(
                "Unable to retrieve Assembly reference ID from Genome ref_id: {}"
                .format(genome_ref))
            raise Exception(e)
        return assembly_object_ref

    def _generate_output_file_list(self):
        """Zip the result files under scratch and describe the archive.

        Archives, VCF files, HTML files and ``.DS_Store`` are left out. Files
        are stored by base name only, without their directory.

        :returns: one-element list in the report ``file_links`` format.
        """
        log('Start packing result files')
        output_files = list()
        result_file = os.path.join(self.scratch,
                                   'variation_importer_results.zip')
        excluded_extensions = ('.zip', '.vcf', '.vcf.gz', '.html',
                               '.DS_Store')

        with zipfile.ZipFile(result_file, 'w', zipfile.ZIP_DEFLATED,
                             allowZip64=True) as zip_file:
            for root, _, files in os.walk(self.scratch):
                for file_name in files:
                    if not file_name.endswith(excluded_extensions):
                        zip_file.write(os.path.join(root, file_name),
                                       file_name)

        output_files.append({
            'path': result_file,
            'name': os.path.basename(result_file),
            'label': os.path.basename(result_file),
            'description': 'File(s) generated by Variation Importer'
        })
        log("Importer output generated: {}".format(output_files))
        return output_files

    def _generate_report(self, params, variation_results,
                         variation_file_path):
        """Create the extended KBase report for an import run.

        :param params: reads ``additional_output_type`` and
            ``workspace_name``.
        :param variation_results: metadata dict built by ``validate_vcf``.
        :param variation_file_path: path of the imported VCF file.
        :returns: dict with ``report_name``, ``report_ref`` and
            ``variation_ref``.
        """
        stats_results = self._generate_variation_stats(
            params['additional_output_type'], variation_file_path)
        html_report = self._generate_html_report(variation_results,
                                                 stats_results)
        file_links = self._generate_output_file_list()

        objects = []
        if variation_results['valid_variation_file']:
            objects = [{
                'ref': variation_results['variation_obj_ref'],
                'description': 'Variation Object created by VCF Importer'
            }]

        report_params = {
            'objects_created': objects,
            'message': '',
            'direct_html_link_index': 0,
            'file_links': file_links,
            'html_links': html_report,
            'html_window_height': 330,
            'workspace_name': params['workspace_name'],
            'report_object_name':
            'variation_importer_report_' + str(uuid.uuid4())
        }
        kbr_output = self.kbr.create_extended_report(report_params)
        report_output = {
            'report_name': kbr_output['name'],
            'report_ref': kbr_output['ref'],
            'variation_ref': variation_results['variation_obj_ref']
        }
        log("Returning from _generate_report!")
        return report_output

    def _generate_html_report(self, variation_results, stats_output=None):
        """Render the HTML report, upload it, and return its link list.

        The template's ``Validation_Results``, ``Variation_Statistics`` and
        ``Visualization_Results`` placeholders are replaced with generated
        markup. PNG files found in the statistics image directory are moved
        next to the report and embedded.

        :param variation_results: metadata dict built by ``validate_vcf``.
        :param stats_output: dict returned by ``_generate_variation_stats``;
            ``None`` is treated as "no statistics available".
        :returns: one-element list in the report ``html_links`` format.
        :raises ValueError: if the upload to shock fails.
        """
        html_report = list()
        # Accept the documented default instead of failing on ``None.get``.
        stats_output = stats_output or {}
        print("Validation output filepath passed to html report: {}".format(
            variation_results['validation_output_filepath']))
        try:
            report_dir = os.path.join(self.scratch, 'html')
            os.mkdir(report_dir)

            # NOTE(review): ``template_dir`` is not defined in this class;
            # presumably a module-level path to the HTML template - confirm.
            with open(template_dir, 'r') as html, open(
                    variation_results['validation_output_filepath'],
                    'r') as validation:
                validation_content = '<p><h4>{} '.format(
                    variation_results['variation_filename'])
                if variation_results.get('valid_variation_file'):
                    validation_content += '<em><i>is</i> a valid </em> variation file.'
                else:
                    validation_content += '<em><i>is not</i> a valid </em>variation file. Details below.'
                validation_content += '</h4></p>'

                report = html.read()

                # Discard the first line of the validation file. It is irrelevant.
                validation.readline()

                validation_content += '<p><h4>Errors and warning generated by VCF validator:</h4></p>'
                validation_content += '<ul>'
                for line in validation.readlines():
                    validation_content += '<li>{}</li>'.format(line)
                validation_content += '</ul>'

                if variation_results.get('invalid_contigs'):
                    # The placeholder names the file holding the valid
                    # contigs. It used to be filled with a ``genome_ref``
                    # entry that the metadata never contains.
                    validation_content += '<h4>The following Contigs were not found in the reference genome. The possible contigs have been written to the file {}. Please see the associated links to download.</h4>'.format(
                        'valid_contigs.txt')
                    validation_content += '<ul>'
                    for contig in variation_results.get('invalid_contigs'):
                        validation_content += '<li>{}</li>'.format(contig)
                    validation_content += '</ul>'

                report = report.replace('Validation_Results',
                                        validation_content)

                if stats_output.get('stats_file_dir'):
                    summary_results = '<p><h4>Summary Statistics</h4></p>'
                    summary_results += (' <table> <tr> '
                                        '<th>Number of SNPs</th> '
                                        '<th>Number of Genotypes </th> '
                                        '</tr> ')
                    summary_results += '<tr>'
                    summary_results += '<td>{}</td><td>{}</td>'.format(
                        'To be added later',
                        variation_results['num_genotypes'])
                    summary_results += '</tr></table>'
                    report = report.replace('Variation_Statistics',
                                            summary_results)

                # visualization
                image_content = ''
                if stats_output.get('stats_img_dir'):
                    image_dir = stats_output.get('stats_img_dir')
                    for image_path in glob.glob(
                            os.path.join(image_dir, '*.png')):
                        shutil.move(image_path, report_dir)
                    for image_path in glob.glob(report_dir + "/*.png"):
                        image = image_path.replace(report_dir + '/', '')
                        caption = image.replace('.png', '')
                        image_content += '<p style="text-align:center"><img align="center" src="{}" ' \
                            '></a><a target="_blank"><br>' \
                            '<p align="center">{}</p></p>'.format(image, caption)
                # Also covers an image directory the plots never reached.
                if not image_content:
                    image_content += 'No visualizations generated.'

                report = report.replace("Visualization_Results",
                                        image_content)
        except Exception:
            print("Error generating HTML report.")
            raise

        report_file_path = os.path.join(report_dir, 'index.html')
        with open(report_file_path, 'w') as output:
            output.write(report)

        try:
            html_upload_ret = self.dfu.file_to_shock({
                'file_path': report_file_path,
                'make_handle': 0,
                'pack': 'zip'
            })
            log("Variation HTML report to shock ref: {}".format(
                html_upload_ret))
        except Exception:
            raise ValueError('Error uploading HTML to shock')

        html_report.append({
            'shock_id': html_upload_ret['shock_id'],
            'name': os.path.basename(report_file_path),
            'label': os.path.basename(report_file_path),
            'description': 'HTML report for Variation Importer'
        })
        return html_report

    def _generate_variation_stats(self, additional_output_type,
                                  variation_filepath):
        """Run PLINK on the VCF and plot MAF / HWE figures with R.

        PLINK is called with ``--freq --hardy``; its output files are moved
        into a fresh ``stats_*`` directory, and the two R scripts write PNG
        files into a fresh ``stats_images_*`` directory. Failures of the R
        or awk steps are logged, not raised.

        :param additional_output_type: currently unused.
        :param variation_filepath: path of the VCF file.
        :returns: ``{'stats_file_dir': ..., 'stats_img_dir': ...}``
        :raises ValueError: on the first PLINK output line starting with
            ``Error:``.
        """
        file_output_directory = os.path.join(self.scratch,
                                             'stats_' + str(uuid.uuid4()))
        os.mkdir(file_output_directory)

        image_output_directory = os.path.join(
            self.scratch, 'stats_images_' + str(uuid.uuid4()))
        os.mkdir(image_output_directory)

        # TODO: Validate user supplied params and build PLINK command
        # ``--out`` takes the VCF path as prefix, so PLINK writes its
        # results next to the VCF file.
        plink_cmd = ["plink",
                     '--vcf', variation_filepath,
                     '--freq',
                     '--hardy',
                     '--out', variation_filepath]

        print("PLINK arguments: {}".format(plink_cmd))

        plink_output = {
            "errors": [],
            "warnings": []
        }
        p = subprocess.Popen(plink_cmd,
                             cwd=file_output_directory,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)
        while True:
            line = p.stdout.readline()
            if not line:
                break
            tokens = line.split(':')
            if tokens[0] == 'Error':
                plink_output['errors'].append(line)
                raise ValueError('PLINK 1.9 error: ' + line)
            elif tokens[0] == 'Warning':
                plink_output['warnings'].append(line)
                print(line)

        p.stdout.close()
        p.wait()

        plink_output_filepath = os.path.join(file_output_directory,
                                             'plink_cli_output.txt')
        with open(plink_output_filepath, 'w') as plink:
            for data in plink_output:
                plink.write("{}: {}\n".format(data, plink_output[data]))

        # Collect every "<vcf name>.<ext>" file PLINK left in scratch.
        plink_prefix = os.path.basename(variation_filepath) + '.'
        plink_output_files = [
            f for f in os.listdir(self.scratch) if f.startswith(plink_prefix)
        ]
        for file_name in plink_output_files:
            shutil.move(os.path.join(self.scratch, file_name),
                        file_output_directory)

        if p.returncode != 0:
            log("PLINK encountered an error during runtime.  Please see log file."
                )

        variation_filename = os.path.basename(variation_filepath)
        base_filepath = os.path.join(file_output_directory,
                                     variation_filename)
        freq_filepath = base_filepath + '.frq'

        maf_script_filepath = '/kb/module/lib/VariationImporter/Utils/MAF_check.R'
        hwe_script_filepath = '/kb/module/lib/VariationImporter/Utils/HWE.R'
        log("Frequency filepath: {}".format(freq_filepath))

        # TODO: make function to do Rscript calls.
        # generate visualizations and store in directory
        maf_command = ['Rscript', '--no-save', '--vanilla',
                       maf_script_filepath,
                       freq_filepath,
                       "Minor Allele Frequencies.png"]
        r = subprocess.Popen(maf_command,
                             cwd=image_output_directory,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)
        r.wait()
        if r.returncode != 0:
            log("Error creating MAF histogram in R")

        hwe_filepath = base_filepath + '.hwe'
        zoom_filepath = hwe_filepath + '.zoom'
        # NOTE(review): the paths are interpolated into a shell command
        # line; a file name with shell metacharacters would be interpreted
        # by the shell - confirm the names are trusted.
        zoom_command = '''awk '{{ if ($9 < 0.00001) print $0 }}' {} > {}'''.format(
            hwe_filepath, zoom_filepath)
        try:
            z = subprocess.Popen(zoom_command,
                                 cwd=file_output_directory,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.STDOUT,
                                 shell=True)
            z.wait()
            if z.returncode != 0:
                log("Error creating HWE zoom file.")
        except Exception as e:
            log("Error creating zoom HWE file: {}".format(e))

        hwe_command = ['Rscript', '--no-save', '--vanilla',
                       hwe_script_filepath,
                       hwe_filepath,
                       "Hardy-Weinberg Equilibrium.png",
                       zoom_filepath,
                       "Hardy-Weinberg Equilibrium Zoom.png"]
        # This line used to be labelled "MAF command".
        print("HWE command: {}".format(hwe_command))
        h = subprocess.Popen(hwe_command,
                             cwd=image_output_directory,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)
        h.wait()
        if h.returncode != 0:
            log("Error generating HWE Zoom plot")

        return {
            'stats_file_dir': file_output_directory,
            'stats_img_dir': image_output_directory
        }

    def _save_variation_to_ws(self, workspace_name, variation_obj,
                              variation_filepath, kinship_matrix):
        """Upload the VCF to shock and save the Variations object.

        The shock ID of the upload is stored on ``variation_obj`` as
        ``variation_file_reference`` before saving.

        :param kinship_matrix: currently unused.
        :returns: reference string built from fields 6, 0 and 4 of the
            object info returned by ``save_objects``.
        :raises ValueError: if the upload to shock fails.
        """
        ws_id = self.dfu.ws_name_to_id(workspace_name)
        try:
            vcf_shock_return = self.dfu.file_to_shock({
                'file_path': variation_filepath,
                'make_handle': 1,
                'pack': 'gzip'
            })
        except Exception as e:
            print("Error uploading file to shock!")
            raise ValueError(e)

        variation_obj['variation_file_reference'] = vcf_shock_return.get(
            'shock_id')

        # NOTE(review): the object name is hard-coded, so each save targets
        # the same workspace object name - confirm this is intended.
        info = self.dfu.save_objects({
            'id': ws_id,
            'objects': [{
                'type': 'KBaseGwasData.Variations',
                'data': variation_obj,
                'name': 'TestVariationImporterName'
            }]
        })[0]

        variation_ref = "%s/%s/%s" % (info[6], info[0], info[4])
        log("Variation reference created: {}".format(variation_ref))
        return variation_ref

    def validate_vcf(self, params):
        """Validate a VCF file, import it when valid, and build the report.

        :param params: dict containing all input parameters. Read keys:
            ``staging_file_subdir_path``, ``location_file_subdir_path``,
            ``genome_ref``, ``workspace_name`` and (for the report)
            ``additional_output_type``.
        :returns: the dict produced by ``_generate_report``.
        :raises Exception: if an input file cannot be copied.
        :raises ValueError: if the VCF has no contig header lines, is older
            than version 4.1, or the genome/assembly cannot be retrieved.
        """
        returnVal = {}
        valid_vcf_file = True

        # Each input is fetched exactly once, in its own ``try``, so a
        # failure names the file that actually could not be copied.
        try:
            vcf_filepath = self.pretend_download_staging_file(
                params['staging_file_subdir_path'],
                self.scratch).get('copy_file_path')
        except Exception:
            raise Exception("Unable to download {} from staging area.".format(
                params['staging_file_subdir_path']))

        try:
            location_filepath = self.pretend_download_staging_file(
                params['location_file_subdir_path'],
                self.scratch).get('copy_file_path')
        except Exception:
            raise Exception("Unable to download {} from staging area.".format(
                params['location_file_subdir_path']))

        # Check file size
        log("{} file size: {}".format(vcf_filepath,
                                      os.path.getsize(vcf_filepath)))
        log('\nValidating {}...'.format(vcf_filepath))

        vcf_version, vcf_contigs, vcf_genotypes = \
            self._get_version_contigs_genotypes(vcf_filepath)

        if not vcf_contigs:
            log("No contig data in {} header.".format(vcf_filepath))
            raise ValueError(
                "No contig data in {} header.".format(vcf_filepath))

        if vcf_version < 4.1:
            log("VCF file is version {}.  Must be at least version 4.1".format(
                vcf_version))
            raise ValueError(
                "VCF file is version {}.  Must be at least version 4.1".format(
                    vcf_version))

        # Generate population object
        population = self._generate_population(location_filepath,
                                               vcf_genotypes)

        # Retrieve Assembly object reference associated with genome.
        try:
            assembly_ref = self._get_assembly_ref_from_genome(
                params['genome_ref'])
        except Exception as e:
            print("Unable to retrieve {}".format(params['genome_ref']))
            raise ValueError(e)

        # Retrieve contig list from Assembly object.
        try:
            assembly_contigs = self._get_contigs_from_assembly(assembly_ref)
        except Exception as e:
            print("Unable to retrieve contigs from Assembly ref: {}".format(
                assembly_ref))
            raise ValueError(e)

        log("Length of assembly contigs: {}".format(len(assembly_contigs)))

        # Compare contig IDs from VCF to those in the Assembly object
        invalid_contigs = [
            contig for contig in vcf_contigs
            if contig not in assembly_contigs
        ]

        if invalid_contigs:
            log("Invalid contig IDs found in {}".format(vcf_filepath))
            valid_contig_filepath = os.path.join(self.scratch,
                                                 'valid_contigs.txt')
            log("Writing valid contigs to file: {}".format(
                valid_contig_filepath))
            with open(valid_contig_filepath, 'w') as icf:
                for contig in assembly_contigs:
                    icf.write(contig + '\n')
            valid_vcf_file = False

        validation_output_filepath, returncode = self._validate_vcf(
            vcf_filepath, vcf_version)

        if returncode != 0:
            valid_vcf_file = False

        kinship_matrix = self._create_fake_kinship_matrix()

        variation_obj_ref = ''
        if valid_vcf_file:
            variation_object = {
                "genome": params['genome_ref'],
                "population": population,
                "contigs": vcf_contigs,
                "comment": "Comments go here",
                "assay": "Assay data goes gere.",
                "originator": "PI/Lab info goes here",
                "pubmed_id": "PubMed ID goes here",
                "kinship_info": kinship_matrix
            }
            variation_obj_ref = self._save_variation_to_ws(
                params['workspace_name'], variation_object, vcf_filepath,
                kinship_matrix)

        log("Variation object reference: {}".format(variation_obj_ref))
        variation_report_metadata = {
            'valid_variation_file': valid_vcf_file,
            'variation_obj_ref': variation_obj_ref,
            'variation_filename': os.path.basename(vcf_filepath),
            'validation_output_filepath': validation_output_filepath,
            'vcf_version': vcf_version,
            'num_genotypes': len(vcf_genotypes),
            'num_contigs': len(vcf_contigs),
            'invalid_contigs': invalid_contigs
        }

        returnVal = self._generate_report(params, variation_report_metadata,
                                          vcf_filepath)
        return returnVal