# Assumed shared imports for the concatenated module files below. The
# generated sources normally carry their own import headers (lost here);
# the KBase client paths follow the standard SDK layout of this era, and
# module-local helpers (UploaderUtil, handler_utils, parse, log,
# gwas_results_utils, kb_gffupload) are assumed importable from each package.
import collections
import fnmatch
import json
import os
import shutil
import subprocess
import sys
import time
import uuid
from pprint import pprint

from AssemblyUtil.AssemblyUtilClient import AssemblyUtil
from DataFileUtil.DataFileUtilClient import DataFileUtil
from GenomeFileUtil.GenomeFileUtilClient import GenomeFileUtil
from KBaseFeatureValues.KBaseFeatureValuesClient import KBaseFeatureValues
from KBaseReport.KBaseReportClient import KBaseReport
from ReadsUtils.ReadsUtilsClient import ReadsUtils
from Workspace.WorkspaceClient import Workspace
from fba_tools.fba_toolsClient import fba_tools


class genome_wide_association_studies:
    '''
    Module Name:
    genome_wide_association_studies

    Module Description:
    A KBase module: genome_wide_association_studies
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = "git@github.com:pranjan77/genome_wide_association_studies"
    GIT_COMMIT_HASH = "fb9bc0c5d00a0314e9f832d30a996c7448e61db9"

    #BEGIN_CLASS_HEADER
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.config = config
        self.scratch = os.path.abspath(config['scratch'])
        self.callbackURL = os.environ['SDK_CALLBACK_URL']
        # self.shared_folder = os.path.abspath(config['scratch'])
        self.dfu = DataFileUtil(self.callbackURL)
        #END_CONSTRUCTOR
        pass

    def import_gwas_data(self, ctx, import_gwas_data_params):
        """
        :param import_gwas_data_params: instance of type
           "import_gwas_data_params" (Insert your typespec information here.)
           -> structure: parameter "input_shock_id" of String, parameter
           "input_file_path" of String
        :returns: instance of type "Run_import_gwas_data_result" ->
           structure: parameter "report_ref" of String, parameter
           "report_name" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN import_gwas_data
        # Download file from staging area
        params = import_gwas_data_params
        print(params)
        # download_staging_file_params = {
        #     'staging_file_subdir_path': params.get('staging_file_subdir_path')
        # }
        # try:
        #     scratch_file_path = self.dfu.download_staging_file(
        #         download_staging_file_params).get('copy_file_path')
        # except Exception:
        #     raise ValueError('error downloading staging file')
        gwas_utils = gwas_results_utils.gwas_results_utils(self.config)
        returnVal = gwas_utils.run_import_gwas_results(params)
        #END import_gwas_data

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method import_gwas_data return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def import_snp_data(self, ctx, import_snp_data_params):
        """
        :param import_snp_data_params: instance of type
           "import_snp_data_params" -> structure: parameter "input_shock_id"
           of String, parameter "input_file_path" of String
        :returns: instance of type "Run_import_snp_data_result" ->
           structure: parameter "report_ref" of String, parameter
           "report_name" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN import_snp_data
        params = import_snp_data_params
        print(params)
        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        try:
            scratch_file_path = self.dfu.download_staging_file(
                download_staging_file_params).get('copy_file_path')
        except Exception:
            raise ValueError('error downloading staging file')
        print(scratch_file_path)
        # TODO: build the SNP object and a report; empty placeholder result
        # for now so the type check below passes.
        returnVal = {}
        #END import_snp_data

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method import_snp_data return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def import_trait_data(self, ctx, import_trait_data_params):
        """
        :param import_trait_data_params: instance of type
           "import_trait_data_params" -> structure: parameter
           "input_shock_id" of String, parameter "input_file_path" of String
        :returns: instance of type "Run_import_trait_data_result" ->
           structure: parameter "report_ref" of String, parameter
           "report_name" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN import_trait_data
        params = import_trait_data_params
        print(params)
        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        try:
            scratch_file_path = self.dfu.download_staging_file(
                download_staging_file_params).get('copy_file_path')
        except Exception:
            raise ValueError('error downloading staging file')
        # TODO: build the trait object and a report; placeholder result.
        returnVal = {}
        #END import_trait_data

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method import_trait_data return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def import_network_data(self, ctx, import_network_data_params):
        """
        :param import_network_data_params: instance of type
           "import_network_data_params" -> structure: parameter
           "input_shock_id" of String, parameter "input_file_path" of String
        :returns: instance of type "Run_import_network_data_result" ->
           structure: parameter "report_ref" of String, parameter
           "report_name" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN import_network_data
        params = import_network_data_params
        print(params)
        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        try:
            scratch_file_path = self.dfu.download_staging_file(
                download_staging_file_params).get('copy_file_path')
        except Exception:
            raise ValueError('error downloading staging file')
        # TODO: build the network object and a report; placeholder result.
        returnVal = {}
        #END import_network_data

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method import_network_data return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def import_motif_data(self, ctx, import_motif_data_params):
        """
        :param import_motif_data_params: instance of type
           "import_motif_data_params" -> structure: parameter
           "input_shock_id" of String, parameter "input_file_path" of String
        :returns: instance of type "Run_import_motif_data_result" ->
           structure: parameter "report_ref" of String, parameter
           "report_name" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN import_motif_data
        # TODO: implement the motif importer; placeholder result for now.
        returnVal = {}
        #END import_motif_data

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method import_motif_data return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
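# --- Usage sketch (hypothetical) -------------------------------------------
# Minimal driver showing how the SDK server would invoke import_gwas_data.
# The config values, context, and params below are placeholders; a real run
# happens inside a KBase job container where SDK_CALLBACK_URL is set.
def _example_import_gwas_data():
    config = {'scratch': '/kb/module/work/tmp'}     # assumed scratch dir
    impl = genome_wide_association_studies(config)
    params = {
        'input_shock_id': '',                       # placeholder
        'input_file_path': 'gwas_results.tsv',      # placeholder staging path
    }
    ctx = {'token': 'AUTH_TOKEN'}                   # placeholder context
    result = impl.import_gwas_data(ctx, params)[0]
    print(result.get('report_name'), result.get('report_ref'))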
class ImportMediaUtil:
    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.fba = fba_tools(self.callback_url)
        self.uploader_utils = UploaderUtil(config)

    def import_media_from_staging(self, params):
        '''
        import_media_from_staging: wrapper method for
        FBAFileUtil.tsv_file_to_media and FBAFileUtil.excel_file_to_media

        required params:
        staging_file_subdir_path - subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        media_name - output Media file name
        workspace_name - the name of the workspace it gets saved to

        return:
        obj_ref: return object reference
        '''
        log('--->\nrunning ImportMediaUtil.import_media_from_staging\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_media_from_staging_params(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')

        import_media_params = params
        import_media_params['media_file'] = {'path': scratch_file_path}

        # Try TSV first, then fall back to Excel.
        try:
            ref = self.fba.tsv_file_to_media(import_media_params)
        except Exception:
            try:
                ref = self.fba.excel_file_to_media(import_media_params)
            except Exception:
                raise ValueError(
                    '"{}" is not a valid EXCEL nor TSV file'.format(
                        params.get('staging_file_subdir_path')))

        # Update the workspace object related meta-data for staged file
        self.uploader_utils.update_staging_service(
            params.get('staging_file_subdir_path'), ref.get('ref'))

        returnVal = {'obj_ref': ref.get('ref')}
        return returnVal

    def import_tsv_as_media_from_staging(self, params):
        '''
        import_tsv_as_media_from_staging: wrapper method for
        FBAFileUtil.tsv_file_to_media

        required params:
        staging_file_subdir_path - subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        media_name - output Media file name
        workspace_name - the name of the workspace it gets saved to

        return:
        obj_ref: return object reference
        '''
        log('--->\nrunning ImportMediaUtil.import_tsv_as_media_from_staging\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_media_from_staging_params(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')

        import_media_params = params
        import_media_params['media_file'] = {'path': scratch_file_path}

        ref = self.fba.tsv_file_to_media(import_media_params)

        returnVal = {'obj_ref': ref.get('ref')}
        return returnVal

    def import_excel_as_media_from_staging(self, params):
        '''
        import_excel_as_media_from_staging: wrapper method for
        FBAFileUtil.excel_file_to_media

        required params:
        staging_file_subdir_path - subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        media_name - output Media file name
        workspace_name - the name of the workspace it gets saved to

        return:
        obj_ref: return object reference
        '''
        log('--->\nrunning ImportMediaUtil.import_excel_as_media_from_staging\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_media_from_staging_params(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')

        import_media_params = params
        import_media_params['media_file'] = {'path': scratch_file_path}

        ref = self.fba.excel_file_to_media(import_media_params)

        returnVal = {'obj_ref': ref.get('ref')}
        return returnVal

    def validate_import_media_from_staging_params(self, params):
        """
        validate_import_media_from_staging_params:
            validates params passed to import_excel(tsv)_as_media_from_staging
            method
        """
        # check for required parameters
        for p in ['staging_file_subdir_path', 'workspace_name', 'media_name']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def generate_report(self, obj_ref, params):
        """
        generate_report: generate summary report

        obj_ref: generated workspace object references.
                 (return of import_excel(tsv)_as_media_from_staging)
        params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        workspace_name: workspace name/ID that reads will be stored to
        """
        uuid_string = str(uuid.uuid4())
        upload_message = 'Import Finished\n'

        get_objects_params = {
            'object_refs': [obj_ref],
            'ignore_errors': False
        }
        object_data = self.dfu.get_objects(get_objects_params)

        upload_message += "Media Object Name: "
        upload_message += str(object_data.get('data')[0].get('info')[1]) + '\n'
        upload_message += 'Imported File: {}\n'.format(
            params.get('staging_file_subdir_path'))

        report_params = {
            'message': upload_message,
            'workspace_name': params.get('workspace_name'),
            'report_object_name': 'kb_upload_methods_report_' + uuid_string}

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'],
                         'report_ref': output['ref']}
        return report_output
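# --- Usage sketch (hypothetical) -------------------------------------------
# Shows the params contract for import_media_from_staging followed by report
# generation. Config keys and the staging path are placeholders; this only
# runs inside a KBase job environment.
def _example_import_media(config):
    util = ImportMediaUtil(config)
    params = {
        'staging_file_subdir_path': 'subdir_1/my_media.tsv',  # placeholder
        'media_name': 'my_media',
        'workspace_name': 'my_workspace',
    }
    result = util.import_media_from_staging(params)
    report = util.generate_report(result['obj_ref'], params)
    return report  # {'report_name': ..., 'report_ref': ...}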
class ImportGFFFastaUtil:
    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.gfu = kb_gffupload(self.callback_url, service_ver='dev')

    def import_gff_fasta_from_staging(self, params):
        '''
        import_gff_fasta_from_staging: wrapper method for
        kb_gffupload.fasta_gff_to_genome

        required params:
        fasta_file: fasta file from user's staging area
        gff_file: gff file from user's staging area
        genome_name: output genome object name
        workspace_name: workspace name that genome will be stored to

        file paths for both fasta and gff files must be subdirectory file
        paths in staging area
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name

        return:
        genome_ref: return object reference
        report_name: name of generated report (if any)
        report_ref: report reference (if any)
        '''
        log('--->\nrunning ImportGFFFastaUtil.import_gff_fasta_from_staging\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_gff_fasta_from_staging_params(params)

        # If not testing, fetch from staging
        if 'test' not in params or params['test'] != 1:
            for key in ('fasta_file', 'gff_file'):
                file_path = params[key]
                file_name = os.path.basename(file_path)
                download_staging_file_params = {
                    'staging_file_subdir_path': file_name
                }
                dfu_returnVal = self.dfu.download_staging_file(
                    download_staging_file_params)
                params[key] = dfu_returnVal['copy_file_path']

        print(params)
        returnVal = self.gfu.fasta_gff_to_genome(params)
        return returnVal

    def validate_import_gff_fasta_from_staging_params(self, params):
        """
        validate_import_gff_fasta_from_staging_params:
            validates params passed to fasta_gff_to_genome method
        """
        # check for required parameters
        for p in ['genome_name', 'workspace_name', 'fasta_file', 'gff_file']:
            if p not in params:
                raise ValueError(
                    '"' + p + '" parameter is required, but missing')

        # for now must use workspace name, but no ws_id_to_name() function
        # available
        if str(params["workspace_name"]).isdigit():
            raise ValueError(
                '"' + params["workspace_name"] +
                '" parameter is a workspace id and workspace name is required')

    def validate_import_genbank_from_staging_params(self, params):
        """
        validate_import_genbank_from_staging_params:
            validates params passed to import_genbank_from_staging method
        """
        # check for required parameters
        for p in [
            'staging_file_subdir_path',
            'genome_name',
            'workspace_name',
            'source'
        ]:
            if p not in params:
                raise ValueError(
                    '"' + p + '" parameter is required, but missing')
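# --- Usage sketch (hypothetical) -------------------------------------------
# Demonstrates the parameter validation above: a missing required key or a
# numeric workspace_name raises ValueError before any files are fetched.
# All values are placeholders.
def _example_validate_gff_fasta(config):
    util = ImportGFFFastaUtil(config)
    params = {
        'fasta_file': 'genome.fa',       # placeholder staging paths
        'gff_file': 'genome.gff',
        'genome_name': 'my_genome',
        'workspace_name': '12345',       # a workspace *id*, not a name
    }
    try:
        util.validate_import_gff_fasta_from_staging_params(params)
    except ValueError as err:
        print(err)  # '"12345" parameter is a workspace id and ...'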
class ImportExpressionMatrixUtil:
    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.fv = KBaseFeatureValues(self.callback_url)
        self.uploader_utils = UploaderUtil(config)

    def import_tsv_as_expression_matrix_from_staging(self, params):
        '''
        import_tsv_as_expression_matrix_from_staging: wrapper method for
        KBaseFeatureValues.tsv_file_to_matrix

        required params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        matrix_name: output Expression Matrix file name
        workspace_name: workspace name/ID of the object

        optional params:
        genome_ref: optional reference to a Genome object that will be used
                    for mapping feature IDs
        fill_missing_values: optional flag for filling in missing values in
                             matrix (default value is false)
        data_type: optional field, value is one of 'untransformed',
                   'log2_level', 'log10_level', 'log2_ratio', 'log10_ratio'
                   or 'unknown' (last one is default value)
        data_scale: optional parameter (default value is '1.0')

        return:
        obj_ref: return object reference
        '''
        log('--->\nrunning ImportExpressionMatrixUtil.'
            'import_tsv_as_expression_matrix_from_staging\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_tsv_as_expression_matrix_from_staging_params(
            params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')

        import_matrix_params = params
        import_matrix_params['input_file_path'] = scratch_file_path
        import_matrix_params['output_ws_name'] = params.get('workspace_name')
        import_matrix_params['output_obj_name'] = params.get('matrix_name')

        ref = self.fv.tsv_file_to_matrix(import_matrix_params)

        # Update the workspace object related meta-data for staged file
        self.uploader_utils.update_staging_service(
            params.get('staging_file_subdir_path'),
            ref.get('output_matrix_ref'))

        returnVal = {'obj_ref': ref.get('output_matrix_ref')}
        return returnVal

    def validate_import_tsv_as_expression_matrix_from_staging_params(
            self, params):
        """
        validate_import_tsv_as_expression_matrix_from_staging_params:
            validates params passed to
            import_tsv_as_expression_matrix_from_staging method
        """
        # check for required parameters
        for p in ['staging_file_subdir_path', 'workspace_name', 'matrix_name']:
            if p not in params:
                raise ValueError(
                    '"' + p + '" parameter is required, but missing')

    def generate_report(self, obj_ref, params):
        """
        generate_report: generate summary report

        obj_ref: generated workspace object references.
                 (return of import_tsv_as_expression_matrix_from_staging)
        params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        workspace_name: workspace name/ID that reads will be stored to
        """
        uuid_string = str(uuid.uuid4())
        upload_message = 'Import Finished\n'

        get_objects_params = {
            'object_refs': [obj_ref],
            'ignore_errors': False
        }
        object_data = self.dfu.get_objects(get_objects_params)

        upload_message += "Expression Matrix Object Name: "
        upload_message += str(object_data.get('data')[0].get('info')[1]) + '\n'
        upload_message += 'Imported TSV File: {}\n'.format(
            params.get('staging_file_subdir_path'))

        report_params = {
            'message': upload_message,
            'workspace_name': params.get('workspace_name'),
            'report_object_name': 'kb_upload_methods_report_' + uuid_string}

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'],
                         'report_ref': output['ref']}
        return report_output
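# --- Usage sketch (hypothetical) -------------------------------------------
# Params contract for the expression matrix importer, including two of the
# optional fields. Values are placeholders; runs only in a KBase job.
def _example_import_expression_matrix(config):
    util = ImportExpressionMatrixUtil(config)
    params = {
        'staging_file_subdir_path': 'expression.tsv',  # placeholder
        'matrix_name': 'my_matrix',
        'workspace_name': 'my_workspace',
        'fill_missing_values': 0,        # optional, defaults to false
        'data_type': 'log2_level',       # optional
    }
    result = util.import_tsv_as_expression_matrix_from_staging(params)
    return util.generate_report(result['obj_ref'], params)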
class ImportFBAModelUtil:
    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.fba = fba_tools(self.callback_url)
        self.uploader_utils = UploaderUtil(config)

    def import_fbamodel_from_staging(self, params):
        log('--->\nrunning {}.{}\n params:\n{}'.format(
            self.__class__.__name__, sys._getframe().f_code.co_name,
            json.dumps(params, indent=1)))

        self._check_param(params, ['model_file', 'file_type',
                                   'workspace_name', 'model_name', 'biomass'],
                          ['genome', 'compounds_file'])
        if params['file_type'] == 'tsv' and not params.get('compounds_file',
                                                           None):
            raise ValueError('A compound file is required for tsv upload.')

        fba_tools_params = params.copy()
        for infile in ['model_file', 'compounds_file']:
            if not params.get(infile, None):
                continue
            download_staging_file_params = {
                'staging_file_subdir_path': params[infile]
            }
            scratch_file_path = self.dfu.download_staging_file(
                download_staging_file_params).get('copy_file_path')
            fba_tools_params[infile] = {'path': scratch_file_path}

        if params['file_type'] == 'sbml':
            res = self.fba.sbml_file_to_model(fba_tools_params)
        elif params['file_type'] == 'excel':
            res = self.fba.excel_file_to_model(fba_tools_params)
        elif params['file_type'] == 'tsv':
            res = self.fba.tsv_file_to_model(fba_tools_params)
        else:
            raise ValueError('"{}" is not a valid import file_type'.format(
                params['file_type']))

        # Update the workspace object related meta-data for staged file
        self.uploader_utils.update_staging_service(
            download_staging_file_params.get('staging_file_subdir_path'),
            res['ref'])
        return {'obj_ref': res['ref']}

    @staticmethod
    def _check_param(in_params, req_param, opt_param=list()):
        """
        Check if each of the params in the list are in the input params
        """
        for param in req_param:
            if param not in in_params:
                raise ValueError(
                    'Required parameter "{}" is missing'.format(param))
        defined_param = set(req_param + opt_param)
        for param in in_params:
            if param not in defined_param:
                print('WARNING: received unexpected parameter "{}"'.format(
                    param))

    def generate_report(self, obj_ref, params):
        """
        generate_report: generate summary report

        obj_ref: generated workspace object references.
                 (return of import_excel(tsv)_as_media_from_staging)
        params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        workspace_name: workspace name/ID that reads will be stored to
        """
        uuid_string = str(uuid.uuid4())
        upload_message = 'Import Finished\n'

        upload_message += "FBAModel Object Name: "
        upload_message += params['model_name'] + '\n'
        upload_message += 'Imported File: {}\n'.format(
            params.get('model_file'))

        report_params = {
            'message': upload_message,
            'objects_created': [{'ref': obj_ref,
                                 'description': 'Imported FBAModel'}],
            'workspace_name': params.get('workspace_name'),
            'report_object_name': 'kb_upload_methods_report_' + uuid_string
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }
        return report_output
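# --- Usage sketch ------------------------------------------------------------
# Demonstrates _check_param (a staticmethod, callable without a config):
# required keys must all be present, while undeclared keys only trigger a
# printed warning rather than an exception. Values are placeholders.
def _example_check_fbamodel_params():
    params = {
        'model_file': 'model.sbml', 'file_type': 'sbml',
        'workspace_name': 'my_workspace', 'model_name': 'my_model',
        'biomass': ['bio1'],
        'typo_key': 1,   # undeclared -> prints a WARNING, does not raise
    }
    ImportFBAModelUtil._check_param(
        params,
        ['model_file', 'file_type', 'workspace_name', 'model_name', 'biomass'],
        ['genome', 'compounds_file'])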
class ImportAssemblyUtil:
    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = os.path.join(config['scratch'],
                                    'import_assembly_' + str(uuid.uuid4()))
        handler_utils._mkdir_p(self.scratch)
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.uploader_utils = UploaderUtil(config)

    def import_fasta_as_assembly_from_staging(self, params):
        '''
        import_fasta_as_assembly_from_staging: wrapper method for
        AssemblyUtil.save_assembly_from_fasta

        required params:
        staging_file_subdir_path - subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        assembly_name - output Assembly file name
        workspace_name - the name of the workspace it gets saved to

        return:
        obj_ref: return object reference
        '''
        log('--->\nrunning ImportAssemblyUtil.'
            'import_fasta_as_assembly_from_staging\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_fasta_as_assembly_from_staging(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')

        import_assembly_params = params
        import_assembly_params['file'] = {'path': scratch_file_path}

        ref = self.au.save_assembly_from_fasta(import_assembly_params)

        # Update the workspace object related meta-data for staged file
        self.uploader_utils.update_staging_service(
            params.get('staging_file_subdir_path'), ref)

        returnVal = {'obj_ref': ref}
        return returnVal

    def validate_import_fasta_as_assembly_from_staging(self, params):
        """
        validate_import_fasta_as_assembly_from_staging:
            validates params passed to import_fasta_as_assembly_from_staging
            method
        """
        # check for required parameters
        for p in [
            'staging_file_subdir_path',
            'workspace_name',
            'assembly_name'
        ]:
            if p not in params:
                raise ValueError(
                    '"' + p + '" parameter is required, but missing')

    def generate_html_report(self, assembly_ref, assembly_object, params):
        """
        _generate_html_report: generate html summary report
        """
        log('start generating html report')
        html_report = list()

        assembly_data = assembly_object.get('data')[0].get('data')
        assembly_info = assembly_object.get('data')[0].get('info')

        result_file_path = os.path.join(self.scratch, 'report.html')

        assembly_name = str(assembly_info[1])
        assembly_file = params.get('staging_file_subdir_path')
        dna_size = assembly_data.get('dna_size')
        num_contigs = assembly_data.get('num_contigs')

        assembly_overview_data = collections.OrderedDict()
        assembly_overview_data['Name'] = '{} ({})'.format(assembly_name,
                                                          assembly_ref)
        assembly_overview_data['Uploaded File'] = assembly_file
        assembly_overview_data['Date Uploaded'] = time.strftime("%c")
        assembly_overview_data['DNA Size'] = dna_size
        assembly_overview_data['Number of Contigs'] = num_contigs

        overview_content = ''
        overview_content += '<br/><table>\n'
        for key, val in assembly_overview_data.items():
            overview_content += '<tr><td><b>{}</b></td>'.format(key)
            overview_content += '<td>{}</td>'.format(val)
            overview_content += '</tr>\n'
        overview_content += '</table>'

        contig_data = assembly_data.get('contigs').values()
        contig_content = str([[str(e['contig_id']), e['length']]
                              for e in contig_data])

        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(os.path.dirname(__file__),
                                   'report_template_assembly.html'),
                      'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>*Overview_Content*</p>', overview_content)
                report_template = report_template.replace(
                    '*CONTIG_DATA*', contig_content)
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({
            'file_path': self.scratch,
            'pack': 'zip'
        })['shock_id']

        html_report.append({
            'shock_id': report_shock_id,
            'name': os.path.basename(result_file_path),
            'label': os.path.basename(result_file_path),
            'description': 'HTML summary report for Imported Assembly'
        })
        return html_report

    def generate_report(self, obj_ref, params):
        """
        generate_report: generate summary report

        obj_ref: generated workspace object references.
                 (return of import_fasta_as_assembly_from_staging)
        params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        workspace_name: workspace name/ID that reads will be stored to
        """
        uuid_string = str(uuid.uuid4())

        get_objects_params = {'object_refs': [obj_ref],
                              'ignore_errors': False}
        object_data = self.dfu.get_objects(get_objects_params)

        objects_created = [{'ref': obj_ref,
                            'description': 'Imported Assembly'}]

        output_html_files = self.generate_html_report(obj_ref, object_data,
                                                      params)

        report_params = {
            'message': '',
            'workspace_name': params.get('workspace_name'),
            'objects_created': objects_created,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 270,
            'report_object_name': 'kb_upload_assembly_report_' + uuid_string
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }
        return report_output
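# --- Usage sketch (hypothetical) -------------------------------------------
# Import a FASTA from staging as an Assembly, then build the HTML overview
# report (name, size, contig count). Placeholders throughout; requires a
# KBase job environment.
def _example_import_assembly_with_report(config):
    util = ImportAssemblyUtil(config)
    params = {
        'staging_file_subdir_path': 'contigs.fasta',  # placeholder
        'assembly_name': 'my_assembly',
        'workspace_name': 'my_workspace',
    }
    result = util.import_fasta_as_assembly_from_staging(params)
    return util.generate_report(result['obj_ref'], params)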
class ImportGenbankUtil:
    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.gfu = GenomeFileUtil(self.callback_url)

    def import_genbank_from_staging(self, params):
        '''
        import_genbank_from_staging: wrapper method for
        GenomeFileUtil.genbank_to_genome

        required params:
        staging_file_subdir_path - subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        genome_name - becomes the name of the object
        workspace_name - the name of the workspace it gets saved to
        source - Source of the file, typically something like RefSeq or
                 Ensembl

        optional params:
        release - Release or version number of the data, e.g. Ensembl has
                  numbered releases of all their data: Release 31
        generate_ids_if_needed - If the field used for feature id is not
                                 there, generate ids (default behavior is
                                 raising an exception)
        genetic_code - Genetic code of organism. Overwrites determined GC
                       from taxon object
        type - Reference, Representative or User upload

        return:
        genome_ref: return object reference
        '''
        log('--->\nrunning ImportGenbankUtil.import_genbank_from_staging\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_genbank_from_staging_params(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')

        import_genbank_params = params
        import_genbank_params['file'] = {'path': scratch_file_path}
        del import_genbank_params['staging_file_subdir_path']

        returnVal = self.gfu.genbank_to_genome(import_genbank_params)
        return returnVal

    def validate_import_genbank_from_staging_params(self, params):
        """
        validate_import_genbank_from_staging_params:
            validates params passed to import_genbank_from_staging method
        """
        # check for required parameters
        for p in ['staging_file_subdir_path', 'genome_name',
                  'workspace_name', 'source']:
            if p not in params:
                raise ValueError(
                    '"' + p + '" parameter is required, but missing')
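# --- Usage sketch (hypothetical) -------------------------------------------
# Params contract for the GenBank importer, including one optional flag.
# Values are placeholders; runs only inside a KBase job environment.
def _example_import_genbank(config):
    util = ImportGenbankUtil(config)
    params = {
        'staging_file_subdir_path': 'my_genome.gbk',  # placeholder
        'genome_name': 'my_genome',
        'workspace_name': 'my_workspace',
        'source': 'RefSeq',
        'generate_ids_if_needed': 1,   # optional
    }
    return util.import_genbank_from_staging(params)  # {'genome_ref': ...}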
class ImportGenbankUtil:
    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.scratch = os.path.join(config['scratch'],
                                    'import_GenBank_' + str(uuid.uuid4()))
        handler_utils._mkdir_p(self.scratch)
        self.dfu = DataFileUtil(self.callback_url)
        self.gfu = GenomeFileUtil(self.callback_url, service_ver='dev')
        self.uploader_utils = UploaderUtil(config)

    def import_genbank_from_staging(self, params):
        '''
        import_genbank_from_staging: wrapper method for
        GenomeFileUtil.genbank_to_genome

        required params:
        staging_file_subdir_path - subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        genome_name - becomes the name of the object
        workspace_name - the name of the workspace it gets saved to
        source - Source of the file, typically something like RefSeq or
                 Ensembl

        optional params:
        release - Release or version number of the data, e.g. Ensembl has
                  numbered releases of all their data: Release 31
        generate_ids_if_needed - If the field used for feature id is not
                                 there, generate ids (default behavior is
                                 raising an exception)
        genetic_code - Genetic code of organism. Overwrites determined GC
                       from taxon object
        type - Reference, Representative or User upload

        return:
        genome_ref: return object reference
        '''
        log('--->\nrunning ImportGenbankUtil.import_genbank_from_staging\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_genbank_from_staging_params(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')

        import_genbank_params = params
        import_genbank_params['file'] = {'path': scratch_file_path}
        del import_genbank_params['staging_file_subdir_path']

        returnVal = self.gfu.genbank_to_genome(import_genbank_params)

        # Update the workspace object related meta-data for staged file
        # self.uploader_utils.update_staging_service(
        #     download_staging_file_params.get('staging_file_subdir_path'),
        #     returnVal['genome_ref'])
        return returnVal

    def validate_import_genbank_from_staging_params(self, params):
        """
        validate_import_genbank_from_staging_params:
            validates params passed to import_genbank_from_staging method
        """
        # check for required parameters
        for p in ['staging_file_subdir_path', 'genome_name',
                  'workspace_name', 'source']:
            if p not in params:
                raise ValueError(
                    '"' + p + '" parameter is required, but missing')

    def generate_html_report(self, genome_ref, params):
        """
        _generate_html_report: generate html summary report
        """
        log('start generating html report')
        genome_obj = self.dfu.get_objects({'object_refs': [genome_ref]})
        html_report = list()
        result_file_path = os.path.join(self.scratch, 'report.html')

        genome_name = str(genome_obj.get('data')[0].get('info')[1])
        genome_file = params.get('staging_file_subdir_path')

        genome_data = genome_obj.get('data')[0].get('data')
        genome_info = genome_obj.get('data')[0].get('info')
        source = genome_info[10].get('Source')
        num_contigs = genome_info[10].get('Number contigs')
        size = genome_info[10].get('Size')
        gc_content = genome_info[10].get('GC content')
        warnings = genome_data.get('warnings', [])
        feature_counts = sorted(list(genome_data.get('feature_counts',
                                                     {}).items()))

        genome_overview_data = collections.OrderedDict()
        genome_overview_data['Name'] = '{} ({})'.format(genome_name,
                                                        genome_ref)
        # genome_overview_data['Uploaded File'] = genome_file
        genome_overview_data['Date Uploaded'] = time.strftime("%c")
        genome_overview_data['Source'] = source
        genome_overview_data['Number of Contigs'] = num_contigs
        genome_overview_data['Size'] = size
        genome_overview_data['GC Content'] = gc_content
        genome_overview_data['Warnings'] = "\n".join(warnings)
        genome_overview_data.update(feature_counts)

        overview_content = ''
        overview_content += '<br/><table>\n'
        for key, val in genome_overview_data.items():
            overview_content += '<tr><td><b>{}</b></td>'.format(key)
            overview_content += '<td>{}</td>'.format(val)
            overview_content += '</tr>\n'
        overview_content += '</table>'

        feature_content = str(
            [[str(k), v] for k, v in
             genome_data.get('feature_counts', {}).items() if k != 'gene'])
        contig_content = str(
            [[str(c), l] for c, l in
             zip(genome_data.get('contig_ids', []),
                 genome_data.get('contig_lengths', []))])

        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(os.path.dirname(__file__),
                                   'report_template_genome.html'),
                      'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>Overview_Content</p>', overview_content)
                report_template = report_template.replace(
                    '*FEATURE_DATA*', feature_content)
                report_template = report_template.replace(
                    '*CONTIG_DATA*', contig_content)
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({'file_path': self.scratch,
                                                  'pack': 'zip'})['shock_id']

        html_report.append({'shock_id': report_shock_id,
                            'name': os.path.basename(result_file_path),
                            'label': os.path.basename(result_file_path),
                            'description': 'HTML summary report for imported'
                                           ' Genome'})
        return html_report

    def generate_report(self, genome_ref, params):
        """
        :param genome_ref: Return Val from GenomeFileUtil for Uploaded genome
                           Need to get report warnings and message from it.
        :return:
        """
        uuid_string = str(uuid.uuid4())

        objects_created = [{'ref': genome_ref,
                            'description': 'Imported Genome'}]

        output_html_files = self.generate_html_report(genome_ref, params)
        report_params = {
            'message': '',
            'workspace_name': params.get('workspace_name'),
            'objects_created': objects_created,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 300,
            'report_object_name': 'kb_genome_upload_report_' + uuid_string}

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'],
                         'report_ref': output['ref']}
        return report_output
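# --- Usage sketch (hypothetical) -------------------------------------------
# End-to-end flow for the report-generating variant above: import from
# staging, then render the HTML summary keyed off the genome's workspace
# metadata. `params` is the same dict shown for the earlier GenBank example.
def _example_genbank_with_report(config, params):
    util = ImportGenbankUtil(config)
    result = util.import_genbank_from_staging(dict(params))
    return util.generate_report(result['genome_ref'], params)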
class ImportAssemblyUtil:
    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)

    def import_fasta_as_assembly_from_staging(self, params):
        '''
        import_fasta_as_assembly_from_staging: wrapper method for
        AssemblyUtil.save_assembly_from_fasta

        required params:
        staging_file_subdir_path - subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        assembly_name - output Assembly file name
        workspace_name - the name of the workspace it gets saved to

        return:
        obj_ref: return object reference
        '''
        log('--->\nrunning ImportAssemblyUtil.'
            'import_fasta_as_assembly_from_staging\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_fasta_as_assembly_from_staging(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')

        import_assembly_params = params
        import_assembly_params['file'] = {'path': scratch_file_path}

        ref = self.au.save_assembly_from_fasta(import_assembly_params)

        returnVal = {'obj_ref': ref}
        return returnVal

    def validate_import_fasta_as_assembly_from_staging(self, params):
        """
        validate_import_fasta_as_assembly_from_staging:
            validates params passed to import_fasta_as_assembly_from_staging
            method
        """
        # check for required parameters
        for p in [
            'staging_file_subdir_path',
            'workspace_name',
            'assembly_name'
        ]:
            if p not in params:
                raise ValueError(
                    '"' + p + '" parameter is required, but missing')

    def generate_report(self, obj_ref, params):
        """
        generate_report: generate summary report

        obj_ref: generated workspace object references.
                 (return of import_fasta_as_assembly_from_staging)
        params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        workspace_name: workspace name/ID that reads will be stored to
        """
        uuid_string = str(uuid.uuid4())
        upload_message = 'Import Finished\n'

        get_objects_params = {'object_refs': [obj_ref],
                              'ignore_errors': False}
        object_data = self.dfu.get_objects(get_objects_params)
        base_count = object_data.get('data')[0].get('data').get('base_counts')
        dna_size = object_data.get('data')[0].get('data').get('dna_size')

        upload_message += "Assembly Object Name: "
        upload_message += str(object_data.get('data')[0].get('info')[1]) + '\n'
        upload_message += 'Imported Fasta File: {}\n'.format(
            params.get('staging_file_subdir_path'))
        if isinstance(dna_size, int):
            upload_message += 'DNA Size: {:,}\n'.format(dna_size)
        if isinstance(base_count, dict):
            upload_message += 'Base Count:\n{}\n'.format(
                json.dumps(base_count, indent=1)[2:-2])

        report_params = {
            'message': upload_message,
            'workspace_name': params.get('workspace_name'),
            'report_object_name': 'kb_upload_methods_report_' + uuid_string
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }
        return report_output
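# --- Worked example ----------------------------------------------------------
# Illustrates the message formatting used in generate_report above: a
# thousands separator for dna_size and a pretty-printed base_counts dict
# with its outer braces sliced off. Sample values, not real output.
def _example_assembly_report_message():
    dna_size = 4641652
    base_count = {'A': 100, 'C': 90, 'G': 95, 'T': 105}
    message = 'DNA Size: {:,}\n'.format(dna_size)
    message += 'Base Count:\n{}\n'.format(
        json.dumps(base_count, indent=1)[2:-2])
    print(message)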
class CompoundSetUtils:
    '''
    Module Name:
    CompoundSetUtils

    Module Description:
    A KBase module: CompoundSetUtils
    Contains tools for import & export of compound sets
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = "https://github.com/kbaseapps/CompoundSetUtils.git"
    GIT_COMMIT_HASH = "53bac077a8efaaea9ead90d5557b1af1c0b23394"

    #BEGIN_CLASS_HEADER
    @staticmethod
    def _check_required_param(in_params, param_list):
        """
        Check if each of the params in the list are in the input params
        """
        for param in param_list:
            if param not in in_params or not in_params[param]:
                raise ValueError('{} parameter is required'.format(param))

    def _save_to_ws_and_report(self, ctx, method, workspace, source,
                               compoundset):
        """Save compound set to the workspace and make report"""
        provenance = [{}]
        if 'provenance' in ctx:
            provenance = ctx['provenance']
        if 'model' in method:
            provenance[0]['input_ws_objects'] = workspace + '/' + source
        provenance[0]['service'] = 'CompoundSetUtils'
        provenance[0]['method'] = method
        info = self.ws_client.save_objects(
            {'workspace': workspace,
             "objects": [{
                 "type": "KBaseBiochem.CompoundSet",
                 "data": compoundset,
                 "name": compoundset['name']
             }]})[0]
        compoundset_ref = "%s/%s/%s" % (info[6], info[0], info[4])
        report_params = {
            'objects_created': [{'ref': compoundset_ref,
                                 'description': 'Compound Set'}],
            'message': 'Imported %s as %s' % (source, compoundset_ref),
            'workspace_name': workspace,
            'report_object_name': 'compound_set_creation_report'
        }

        # Construct the output to send back
        report_client = KBaseReport(self.callback_url)
        report_info = report_client.create_extended_report(report_params)
        output = {'report_name': report_info['name'],
                  'report_ref': report_info['ref'],
                  'compoundset_ref': compoundset_ref}
        return output
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.config = config
        self.scratch = config['scratch']
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.ws_url = config['workspace-url']
        self.ws_client = Workspace(self.ws_url)
        self.dfu = DataFileUtil(self.callback_url)
        #END_CONSTRUCTOR
        pass

    def compound_set_from_file(self, ctx, params):
        """
        CompoundSetFromFile
        string staging_file_path
        :param params: instance of type "compoundset_upload_params" ->
           structure: parameter "workspace_name" of String, parameter
           "staging_file_path" of String, parameter "compound_set_name" of
           String
        :returns: instance of type "compoundset_upload_results" ->
           structure: parameter "report_name" of String, parameter
           "report_ref" of String, parameter "compoundset_ref" of type
           "obj_ref"
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN compound_set_from_file
        self._check_required_param(params, ['workspace_name',
                                            'staging_file_path',
                                            'compound_set_name'])
        scratch_file_path = self.dfu.download_staging_file(
            {'staging_file_subdir_path': params['staging_file_path']}
        ).get('copy_file_path')
        # I probably should be uploading the raw files to shock

        ext = os.path.splitext(scratch_file_path)[1]
        file_name = os.path.basename(scratch_file_path)
        if ext == '.sdf':
            compounds = parse.read_sdf(scratch_file_path)
        elif ext == '.tsv':
            compounds = parse.read_tsv(scratch_file_path)
        else:
            raise ValueError('Invalid input file type. Expects .tsv or .sdf')

        compoundset = {
            'id': params['compound_set_name'],
            'name': params['compound_set_name'],
            'description': 'Compound Set produced from %s' % file_name,
            'compounds': compounds,
        }

        output = self._save_to_ws_and_report(ctx, 'compound_set_from_file',
                                             params['workspace_name'],
                                             params['staging_file_path'],
                                             compoundset)
        #END compound_set_from_file

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method compound_set_from_file return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def compound_set_to_file(self, ctx, params):
        """
        CompoundSetToFile
        string compound_set_name
        string output_format
        :param params: instance of type "compoundset_download_params" ->
           structure: parameter "workspace_name" of String, parameter
           "compound_set_name" of String, parameter "output_format" of
           String
        :returns: instance of type "compoundset_download_results" ->
           structure: parameter "report_name" of String, parameter
           "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN compound_set_to_file
        self._check_required_param(params, ['workspace_name',
                                            'compound_set_name',
                                            'output_format'])
        compoundset = self.ws_client.get_objects2({'objects': [
            {'workspace': params['workspace_name'],
             'name': params['compound_set_name']}]})['data'][0]['data']
        ext = params['output_format']
        out = "%s/%s.%s" % (self.scratch, compoundset['name'], ext)
        if ext == 'sdf':
            outfile_path = parse.write_sdf(compoundset, out)
        elif ext == 'tsv':
            outfile_path = parse.write_tsv(compoundset, out)
        else:
            raise ValueError('Invalid output file type. Expects tsv or sdf')

        report_files = [{'path': outfile_path,
                         'name': os.path.basename(outfile_path),
                         'label': os.path.basename(outfile_path),
                         'description': 'A compound set in %s format' % ext}]
        report_params = {
            'objects_created': [],
            'message': 'Converted %s compound set to %s format.' % (
                params['compound_set_name'], params['output_format']),
            'file_links': report_files,
            'workspace_name': params['workspace_name'],
            'report_object_name': 'compound_set_download_report'
        }

        # Construct the output to send back
        report_client = KBaseReport(self.callback_url)
        report_info = report_client.create_extended_report(report_params)
        output = {'report_name': report_info['name'],
                  'report_ref': report_info['ref']}
        #END compound_set_to_file

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method compound_set_to_file return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def compound_set_from_model(self, ctx, params):
        """
        CompoundSetFromModel
        required:
        string workspace_name
        string model_name
        string compound_set_name
        :param params: instance of type "compoundset_from_model_params" ->
           structure: parameter "workspace_name" of String, parameter
           "model_name" of String, parameter "compound_set_name" of String
        :returns: instance of type "compoundset_upload_results" ->
           structure: parameter "report_name" of String, parameter
           "report_ref" of String, parameter "compoundset_ref" of type
           "obj_ref"
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN compound_set_from_model
        self._check_required_param(params, ['workspace_name', 'model_name',
                                            'compound_set_name'])
        model = self.ws_client.get_objects2({'objects': [
            {'workspace': params['workspace_name'],
             'name': params['model_name']}]})['data'][0]['data']
        compounds, undef = parse.parse_model(model)
        compoundset = {
            'id': params['compound_set_name'],
            'name': params['compound_set_name'],
            'description': 'Compound Set produced from %s, a metabolic model'
                           % model['id'],
            'compounds': compounds,
        }

        output = self._save_to_ws_and_report(ctx, 'compound_set_from_model',
                                             params['workspace_name'],
                                             params['model_name'],
                                             compoundset)
        #END compound_set_from_model

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method compound_set_from_model return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {'state': "OK",
                     'message': "",
                     'version': self.VERSION,
                     'git_url': self.GIT_URL,
                     'git_commit_hash': self.GIT_COMMIT_HASH}
        #END_STATUS
        return [returnVal]
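# --- Usage sketch (hypothetical) -------------------------------------------
# Round trip through CompoundSetUtils: import a compound set from a staged
# file, then export it back out as SDF. Workspace/file names are
# placeholders; `ctx` is the SDK-provided context dict.
def _example_compound_set_roundtrip(config, ctx):
    csu = CompoundSetUtils(config)
    up = csu.compound_set_from_file(ctx, {
        'workspace_name': 'my_workspace',
        'staging_file_path': 'compounds.tsv',   # .tsv or .sdf
        'compound_set_name': 'my_compounds',
    })[0]
    down = csu.compound_set_to_file(ctx, {
        'workspace_name': 'my_workspace',
        'compound_set_name': 'my_compounds',
        'output_format': 'sdf',                 # 'tsv' or 'sdf'
    })[0]
    return up['compoundset_ref'], down['report_ref']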
class ImportGFFFastaUtil:
    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.dfu = DataFileUtil(self.callback_url)
        self.gfu = GenomeFileUtil(self.callback_url)
        self.uploader_utils = UploaderUtil(config)

    def import_gff_fasta_from_staging(self, params):
        """
        import_gff_fasta_from_staging: wrapper method for
        GenomeFileUtil.fasta_gff_to_genome

        required params:
        fasta_file: fasta file from user's staging area
        gff_file: gff file from user's staging area
        genome_name: output genome object name
        workspace_name: workspace name that genome will be stored to

        file paths for both fasta and gff files must be subdirectory file
        paths in staging area
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name

        optional params:
        scientific_name: proper name for species, key for taxonomy lookup.
                         Defaults to 'unknown_taxon'
        source: Source of the GenBank file. Defaults to 'User'
        taxon_wsname - where the reference taxons are. Defaults to
                       'ReferenceTaxons'
        taxon_reference - if defined, will try to link the Genome to the
                          specified taxonomy object
        release: Release or version of the source data
        genetic_code: Genetic code for the organism
        type: 'Reference', 'User upload', 'Representative'

        return:
        genome_ref: return object reference
        report_name: name of generated report (if any)
        report_ref: report reference (if any)
        """
        log('--->\nrunning ImportGFFFastaUtil.import_gff_fasta_from_staging\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_gff_fasta_from_staging_params(params)

        for key in ('fasta_file', 'gff_file'):
            file_path = params[key]
            download_staging_file_params = {
                'staging_file_subdir_path': file_path
            }
            dfu_returnVal = self.dfu.download_staging_file(
                download_staging_file_params)
            params[key] = {'path': dfu_returnVal['copy_file_path']}

        returnVal = self.gfu.fasta_gff_to_genome(params)

        # Update the workspace object related meta-data for staged file.
        # NOTE: download_staging_file_params leaks from the loop above, so
        # only the last staged file (the gff_file) is recorded here.
        self.uploader_utils.update_staging_service(
            download_staging_file_params.get('staging_file_subdir_path'),
            returnVal['genome_ref'])
        return returnVal

    def validate_import_gff_fasta_from_staging_params(self, params):
        """
        validate_import_gff_fasta_from_staging_params:
            validates params passed to import_gff_fasta_from_staging method
        """
        # check for required parameters
        for p in ['genome_name', 'workspace_name', 'fasta_file', 'gff_file']:
            if p not in params:
                raise ValueError(
                    '"' + p + '" parameter is required, but missing')

        # for now must use workspace name, but no ws_id_to_name() function
        # available
        if str(params["workspace_name"]).isdigit():
            error_msg = ('"{}" parameter is a workspace id and workspace '
                         'name is required').format(params["workspace_name"])
            raise ValueError(error_msg)
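# --- Usage sketch (hypothetical) -------------------------------------------
# Params contract for the GenomeFileUtil-backed GFF+FASTA importer above,
# with one optional field. Staging paths and names are placeholders.
def _example_import_gff_fasta(config):
    util = ImportGFFFastaUtil(config)
    params = {
        'fasta_file': 'subdir_1/genome.fa',    # staging paths (placeholders)
        'gff_file': 'subdir_1/genome.gff',
        'genome_name': 'my_genome',
        'workspace_name': 'my_workspace',
        'scientific_name': 'Arabidopsis thaliana',  # optional
    }
    return util.import_gff_fasta_from_staging(params)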
class ImportSRAUtil: SRA_TOOLKIT_PATH = '/kb/deployment/bin/fastq-dump' def _run_command(self, command): """ _run_command: run command and print result """ log('Start executing command:\n{}'.format(command)) pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True) output = pipe.communicate()[0] exitCode = pipe.returncode if (exitCode == 0): log('Executed command:\n{}\n'.format(command) + 'Exit Code: {}\nOutput:\n{}'.format(exitCode, output)) else: error_msg = 'Error running command:\n{}\n'.format(command) error_msg += 'Exit Code: {}\nOutput:\n{}'.format(exitCode, output) raise ValueError(error_msg) def _check_fastq_dump_result(self, tmp_dir, sra_name): """ _check_fastq_dump_result: check fastq_dump result is PE or SE """ return os.path.exists(tmp_dir + '/' + sra_name + '/1') def _sra_to_fastq(self, scratch_sra_file_path, params): """ _sra_to_fastq: convert SRA file to FASTQ file(s) """ tmp_dir = os.path.join(self.scratch, str(uuid.uuid4())) handler_utils._mkdir_p(tmp_dir) command = self.SRA_TOOLKIT_PATH + ' --split-3 -T -O ' command += tmp_dir + ' ' + scratch_sra_file_path self._run_command(command) sra_name = os.path.basename(scratch_sra_file_path).partition('.')[0] paired_end = self._check_fastq_dump_result(tmp_dir, sra_name) if paired_end: self._validate_paired_end_advanced_params(params) fwd_file = os.path.join(tmp_dir, sra_name, '1', 'fastq') os.rename(fwd_file, fwd_file + '.fastq') fwd_file = fwd_file + '.fastq' rev_file = os.path.join(tmp_dir, sra_name, '2', 'fastq') os.rename(rev_file, rev_file + '.fastq') rev_file = rev_file + '.fastq' else: self._validate_single_end_advanced_params(params) fwd_file = os.path.join(tmp_dir, sra_name, 'fastq') os.rename(fwd_file, fwd_file + '.fastq') fwd_file = fwd_file + '.fastq' rev_file = None fastq_file_path = {'fwd_file': fwd_file, 'rev_file': rev_file} return fastq_file_path def _validate_single_end_advanced_params(self, params): """ _validate_single_end_advanced_params: validate advanced params for single end reads """ if (params.get('insert_size_mean') or params.get('insert_size_std_dev') or params.get('read_orientation_outward')): error_msg = 'Advanced params "Mean Insert Size", "St. Dev. 
of Insert Size" or ' error_msg += '"Reads Orientation Outward" is Paried End Reads specific' raise ValueError(error_msg) if 'interleaved' in params: del params['interleaved'] def _validate_paired_end_advanced_params(self, params): """ _validate_paired_end_advanced_params: validate advanced params for paired end reads """ sequencing_tech = params.get('sequencing_tech') if sequencing_tech in ['PacBio CCS', 'PacBio CLR']: error_msg = 'Sequencing Technology: "PacBio CCS" or "PacBio CLR" ' error_msg += 'is Single End Reads specific' raise ValueError(error_msg) def _validate_upload_staging_file_availability(self, staging_file_subdir_path): """ _validate_upload_file_path_availability: validates file availability in user's staging area """ pass # TODO ftp_server needs to be fixed for subdir # list = ftp_service(self.callback_url).list_files() # if staging_file_subdir_path not in list: # error_msg = 'Target file: {} is NOT available.\n'.format( # staging_file_subdir_path.rpartition('/')[-1]) # error_msg += 'Available files:\n {}'.format("\n".join(list)) # raise ValueError(error_msg) def __init__(self, config): self.callback_url = config['SDK_CALLBACK_URL'] self.token = config['KB_AUTH_TOKEN'] self.scratch = os.path.join(config['scratch'], 'import_SRA_' + str(uuid.uuid4())) handler_utils._mkdir_p(self.scratch) self.dfu = DataFileUtil(self.callback_url) self.ru = ReadsUtils(self.callback_url) self.uploader_utils = UploaderUtil(config) def import_sra_from_staging(self, params): ''' import_sra_from_staging: wrapper method for GenomeFileUtil.genbank_to_genome required params: staging_file_subdir_path: subdirectory file path e.g. for file: /data/bulk/user_name/file_name staging_file_subdir_path is file_name for file: /data/bulk/user_name/subdir_1/subdir_2/file_name staging_file_subdir_path is subdir_1/subdir_2/file_name sequencing_tech: sequencing technology name: output reads file name workspace_name: workspace name/ID of the object Optional Params: single_genome: whether the reads are from a single genome or a metagenome. 
insert_size_mean: mean (average) insert length insert_size_std_dev: standard deviation of insert lengths read_orientation_outward: whether reads in a pair point outward return: obj_ref: return object reference ''' log('--->\nrunning ImportSRAUtil.import_sra_from_staging\n' + 'params:\n{}'.format(json.dumps(params, indent=1))) self.validate_import_sra_from_staging_params(params) download_staging_file_params = { 'staging_file_subdir_path': params.get('staging_file_subdir_path') } scratch_sra_file_path = self.dfu.download_staging_file( download_staging_file_params).get('copy_file_path') log('Downloaded staging file to: {}'.format(scratch_sra_file_path)) fastq_file_path = self._sra_to_fastq(scratch_sra_file_path, params) import_sra_reads_params = params import_sra_reads_params.update(fastq_file_path) workspace_name_or_id = params.get('workspace_name') if str(workspace_name_or_id).isdigit(): import_sra_reads_params['wsid'] = int(workspace_name_or_id) else: import_sra_reads_params['wsname'] = str(workspace_name_or_id) log('--->\nrunning ReadsUtils.upload_reads\nparams:\n{}'.format( json.dumps(import_sra_reads_params, indent=1))) returnVal = self.ru.upload_reads(import_sra_reads_params) """ Update the workspace object related meta-data for staged file """ self.uploader_utils.update_staging_service( params.get('staging_file_subdir_path'), returnVal['obj_ref']) return returnVal def import_sra_from_web(self, params): ''' import_sra_from_web: wrapper method for GenomeFileUtil.genbank_to_genome required params: download_type: download type for web source fastq file ('Direct Download', 'FTP', 'DropBox', 'Google Drive') workspace_name: workspace name/ID of the object sra_urls_to_add: dict of SRA file URLs required params: file_url: SRA file URL sequencing_tech: sequencing technology name: output reads file name Optional Params: single_genome: whether the reads are from a single genome or a metagenome. 
insert_size_mean: mean (average) insert length insert_size_std_dev: standard deviation of insert lengths read_orientation_outward: whether reads in a pair point outward return: obj_ref: return object reference ''' log('--->\nrunning ImportSRAUtil.import_sra_from_web\n' + 'params:\n{}'.format(json.dumps(params, indent=1))) self.validate_import_sra_from_web_params(params) download_type = params.get('download_type') workspace_name = params.get('workspace_name') obj_refs = [] uploaded_files = [] for sra_url_to_add in params.get('sra_urls_to_add'): download_web_file_params = { 'download_type': download_type, 'file_url': sra_url_to_add.get('file_url') } scratch_sra_file_path = self.dfu.download_web_file( download_web_file_params).get('copy_file_path') log('Downloaded web file to: {}'.format(scratch_sra_file_path)) fastq_file_path = self._sra_to_fastq(scratch_sra_file_path, sra_url_to_add) import_sra_reads_params = sra_url_to_add import_sra_reads_params.update(fastq_file_path) workspace_name_or_id = workspace_name if str(workspace_name_or_id).isdigit(): import_sra_reads_params['wsid'] = int(workspace_name_or_id) else: import_sra_reads_params['wsname'] = str(workspace_name_or_id) log('--->\nrunning ReadsUtils.upload_reads\nparams:\n{}'.format( json.dumps(import_sra_reads_params, indent=1))) obj_ref = self.ru.upload_reads(import_sra_reads_params).get( 'obj_ref') obj_refs.append(obj_ref) uploaded_files.append(sra_url_to_add.get('file_url')) return {'obj_refs': obj_refs, 'uploaded_files': uploaded_files} def validate_import_sra_from_staging_params(self, params): """ validate_import_genbank_from_staging_params: validates params passed to import_genbank_from_staging method """ # check for required parameters for p in [ 'staging_file_subdir_path', 'sequencing_tech', 'name', 'workspace_name' ]: if p not in params: raise ValueError('"' + p + '" parameter is required, but missing') self._validate_upload_staging_file_availability( params.get('staging_file_subdir_path')) def validate_import_sra_from_web_params(self, params): """ validate_import_genbank_from_staging_params: validates params passed to import_genbank_from_staging method """ # check for required parameters for p in ['download_type', 'workspace_name', 'sra_urls_to_add']: if p not in params: raise ValueError( '"{}" parameter is required, but missing'.format(p)) if not isinstance(params.get('sra_urls_to_add'), list): raise ValueError('sra_urls_to_add is not type list as required') for sra_url_to_add in params.get('sra_urls_to_add'): for p in ['file_url', 'sequencing_tech', 'name']: if p not in sra_url_to_add: raise ValueError( '"{}" parameter is required, but missing'.format(p)) def generate_report(self, obj_refs_list, params): """ generate_report: generate summary report obj_refs: generated workspace object references. (return of import_sra_from_staging/web) params: staging_file_subdir_path: subdirectory file path e.g. 
                for file: /data/bulk/user_name/file_name
                staging_file_subdir_path is file_name
                for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
                staging_file_subdir_path is subdir_1/subdir_2/file_name
            workspace_name: workspace name/ID that reads will be stored to
        """
        uuid_string = str(uuid.uuid4())

        objects_created = list()
        objects_data = list()
        for obj_ref in obj_refs_list:
            get_objects_params = {
                'object_refs': [obj_ref],
                'ignore_errors': False
            }
            objects_data.append(self.dfu.get_objects(get_objects_params))
            objects_created.append({'ref': obj_ref,
                                    'description': 'Imported Reads'})

        output_html_files = self.generate_html_report(objects_data, params,
                                                      uuid_string)

        report_params = {
            'message': '',
            'workspace_name': params.get('workspace_name'),
            'objects_created': objects_created,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 460,
            'report_object_name': 'kb_sra_upload_report_' + uuid_string
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'],
                         'report_ref': output['ref']}

        return report_output

    def generate_html_report(self, reads_objs, params, uuid_string):
        """
        generate_html_report: generate html summary report
        """
        log('Start generating html report')
        pprint(params)

        result_file_path = os.path.join(self.scratch, 'report.html')
        html_report = list()
        objects_content = ''

        for index, reads_obj in enumerate(reads_objs):
            idx = str(index)
            reads_data = reads_obj.get('data')[0].get('data')
            reads_info = reads_obj.get('data')[0].get('info')
            reads_ref = str(reads_info[6]) + '/' + str(
                reads_info[0]) + '/' + str(reads_info[4])
            reads_obj_name = str(reads_info[1])

            with open(os.path.join(os.path.dirname(__file__),
                                   'report_template_sra/table_panel.html'),
                      'r') as object_content_file:
                report_template = object_content_file.read()
                report_template = report_template.replace('_NUM', str(idx))
                report_template = report_template.replace('OBJECT_NAME',
                                                          reads_obj_name)
                if index == 0:
                    report_template = report_template.replace(
                        'panel-collapse collapse',
                        'panel-collapse collapse in')

            objects_content += report_template

            base_percentages = ''
            for key, val in reads_data.get('base_percentages').iteritems():
                base_percentages += '{}({}%) '.format(key, val)

            reads_overview_data = collections.OrderedDict()
            reads_overview_data['Name'] = '{} ({})'.format(reads_obj_name,
                                                           reads_ref)
            reads_overview_data['Uploaded File'] = params.get(
                'uploaded_files')[index]
            reads_overview_data['Date Uploaded'] = time.strftime("%c")
            reads_overview_data['Number of Reads'] = '{:,}'.format(
                reads_data.get('read_count'))

            reads_type = reads_info[2].lower()
            if 'single' in reads_type:
                reads_overview_data['Type'] = 'Single End'
            elif 'paired' in reads_type:
                reads_overview_data['Type'] = 'Paired End'
            else:
                reads_overview_data['Type'] = 'Unknown'

            reads_overview_data['Platform'] = reads_data.get(
                'sequencing_tech', 'Unknown')

            reads_single_genome = str(reads_data.get('single_genome',
                                                     'Unknown'))
            if '0' in reads_single_genome:
                reads_overview_data['Single Genome'] = 'No'
            elif '1' in reads_single_genome:
                reads_overview_data['Single Genome'] = 'Yes'
            else:
                reads_overview_data['Single Genome'] = 'Unknown'

            insert_size_mean = params.get('insert_size_mean', 'Not Specified')
            if insert_size_mean is not None:
                reads_overview_data['Insert Size Mean'] = str(
                    insert_size_mean)
            else:
                reads_overview_data['Insert Size Mean'] = 'Not Specified'

            insert_size_std_dev = params.get('insert_size_std_dev',
                                             'Not Specified')
            if insert_size_std_dev is not None:
                reads_overview_data['Insert Size Std Dev'] = str(
                    insert_size_std_dev)
            else:
                reads_overview_data['Insert Size Std Dev'] = 'Not Specified'

            reads_outward_orientation = str(reads_data.get(
                'read_orientation_outward', 'Unknown'))
            if '0' in reads_outward_orientation:
                reads_overview_data['Outward Read Orientation'] = 'No'
            elif '1' in reads_outward_orientation:
                reads_overview_data['Outward Read Orientation'] = 'Yes'
            else:
                reads_overview_data['Outward Read Orientation'] = 'Unknown'

            reads_stats_data = collections.OrderedDict()
            reads_stats_data['Number of Reads'] = '{:,}'.format(
                reads_data.get('read_count'))
            reads_stats_data['Total Number of Bases'] = '{:,}'.format(
                reads_data.get('total_bases'))
            reads_stats_data['Mean Read Length'] = str(
                reads_data.get('read_length_mean'))
            reads_stats_data['Read Length Std Dev'] = str(
                reads_data.get('read_length_stdev'))
            dup_reads_percent = '{:.2f}'.format(
                float(reads_data.get('number_of_duplicates') * 100) /
                reads_data.get('read_count'))
            reads_stats_data['Number of Duplicate Reads(%)'] = \
                '{} ({}%)'.format(
                    str(reads_data.get('number_of_duplicates')),
                    dup_reads_percent)
            reads_stats_data['Phred Type'] = str(
                reads_data.get('phred_type'))
            reads_stats_data['Quality Score Mean'] = '{0:.2f}'.format(
                reads_data.get('qual_mean'))
            reads_stats_data['Quality Score (Min/Max)'] = '{}/{}'.format(
                str(reads_data.get('qual_min')),
                str(reads_data.get('qual_max')))
            reads_stats_data['GC Percentage'] = str(
                round(reads_data.get('gc_content') * 100, 2)) + '%'
            reads_stats_data['Base Percentages'] = base_percentages

            overview_content = ''
            for key, val in reads_overview_data.iteritems():
                overview_content += '<tr><td><b>{}</b></td>'.format(key)
                overview_content += '<td>{}</td>'.format(val)
                overview_content += '</tr>'

            stats_content = ''
            for key, val in reads_stats_data.iteritems():
                stats_content += '<tr><td><b>{}</b></td>'.format(key)
                stats_content += '<td>{}</td>'.format(val)
                stats_content += '</tr>'

            objects_content = objects_content.replace(
                '###OVERVIEW_CONTENT###', overview_content)
            objects_content = objects_content.replace(
                '###STATS_CONTENT###', stats_content)

        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(os.path.dirname(__file__),
                                   'report_template_sra/report_head.html'),
                      'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '###TABLE_PANELS_CONTENT###', objects_content)
                result_file.write(report_template)

        shutil.copytree(
            os.path.join(os.path.dirname(__file__),
                         'report_template_sra/bootstrap-3.3.7'),
            os.path.join(self.scratch, 'bootstrap-3.3.7'))
        shutil.copy(
            os.path.join(os.path.dirname(__file__),
                         'report_template_sra/jquery-3.2.1.min.js'),
            os.path.join(self.scratch, 'jquery-3.2.1.min.js'))

        # remove leftover .gz files so they are not packed into the report
        matched_files = []
        for root, dirnames, filenames in os.walk(self.scratch):
            for filename in fnmatch.filter(filenames, '*.gz'):
                matched_files.append(os.path.join(root, filename))

        for gz_file in matched_files:
            print('Removing ' + gz_file)
            os.remove(gz_file)

        report_shock_id = self.dfu.file_to_shock({
            'file_path': self.scratch,
            'pack': 'zip'
        })['shock_id']

        html_report.append({
            'shock_id': report_shock_id,
            'name': os.path.basename(result_file_path),
            'label': os.path.basename(result_file_path),
            'description': 'HTML summary report for Imported Reads'
        })

        return html_report
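
# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal, hypothetical driver showing how the web-import path above is
# typically chained: import one or more SRA URLs, then summarize the uploads
# in a KBase report. The class name and constructor signature are assumed to
# match the ImportSRAUtil variant defined below; all parameter values are
# placeholders.
def _example_import_sra_from_web(config):
    importer = ImportSRAUtil(config)
    params = {
        'download_type': 'Direct Download',
        'workspace_name': 'my_workspace',                 # placeholder
        'sra_urls_to_add': [{
            'file_url': 'https://example.org/reads.sra',  # placeholder URL
            'sequencing_tech': 'Illumina',
            'name': 'my_reads'
        }]
    }
    result = importer.import_sra_from_web(params)
    # generate_html_report indexes params['uploaded_files'], so the report
    # call needs the uploaded file list alongside the original params
    params['uploaded_files'] = result['uploaded_files']
    return importer.generate_report(result['obj_refs'], params)
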
class ImportSRAUtil:

    SRA_TOOLKIT_PATH = '/kb/deployment/bin/fastq-dump'

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _run_command(self, command):
        """
        _run_command: run command and print result
        """
        log('Start executing command:\n{}'.format(command))
        pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        output = pipe.communicate()[0]
        exitCode = pipe.returncode

        if exitCode == 0:
            log('Executed command:\n{}\n'.format(command) +
                'Exit Code: {}\nOutput:\n{}'.format(exitCode, output))
        else:
            error_msg = 'Error running command:\n{}\n'.format(command)
            error_msg += 'Exit Code: {}\nOutput:\n{}'.format(exitCode, output)
            raise ValueError(error_msg)

    def _check_fastq_dump_result(self, tmp_dir, sra_name):
        """
        _check_fastq_dump_result: check whether the fastq_dump result is
        paired-end (PE) or single-end (SE)
        """
        return os.path.exists(tmp_dir + '/' + sra_name + '/1')

    def _sra_to_fastq(self, scratch_sra_file_path):
        """
        _sra_to_fastq: convert SRA file to FASTQ file(s)
        """
        tmp_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(tmp_dir)

        command = self.SRA_TOOLKIT_PATH + ' --split-3 -T -O '
        command += tmp_dir + ' ' + scratch_sra_file_path

        self._run_command(command)

        sra_name = os.path.basename(scratch_sra_file_path).partition('.')[0]
        paired_end = self._check_fastq_dump_result(tmp_dir, sra_name)

        if paired_end:
            fwd_file = os.path.join(tmp_dir, sra_name, '1', 'fastq')
            os.rename(fwd_file, fwd_file + '.fastq')
            fwd_file = fwd_file + '.fastq'

            rev_file = os.path.join(tmp_dir, sra_name, '2', 'fastq')
            os.rename(rev_file, rev_file + '.fastq')
            rev_file = rev_file + '.fastq'
        else:
            fwd_file = os.path.join(tmp_dir, sra_name, 'fastq')
            os.rename(fwd_file, fwd_file + '.fastq')
            fwd_file = fwd_file + '.fastq'
            rev_file = None

        fastq_file_path = {'fwd_file': fwd_file,
                           'rev_file': rev_file}
        return fastq_file_path

    def _validate_upload_staging_file_availability(
            self, staging_file_subdir_path):
        """
        _validate_upload_staging_file_availability:
            validates file availability in user's staging area
        """
        pass
        # TODO ftp_server needs to be fixed for subdir
        # list = ftp_service(self.callback_url).list_files()
        # if staging_file_subdir_path not in list:
        #     error_msg = 'Target file: {} is NOT available.\n'.format(
        #         staging_file_subdir_path.rpartition('/')[-1])
        #     error_msg += 'Available files:\n {}'.format("\n".join(list))
        #     raise ValueError(error_msg)

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.scratch = config['scratch']

        self.dfu = DataFileUtil(self.callback_url)
        self.ru = ReadsUtils(self.callback_url)

    def import_sra_from_staging(self, params):
        '''
        import_sra_from_staging: download an SRA file from the staging area,
        convert it to FASTQ and upload it as a reads object

        required params:
        staging_file_subdir_path: subdirectory file path
        e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        sequencing_tech: sequencing technology
        name: output reads file name
        workspace_name: workspace name/ID of the object

        Optional Params:
        single_genome: whether the reads are from a single genome or a
                       metagenome.
        insert_size_mean: mean (average) insert length
        insert_size_std_dev: standard deviation of insert lengths
        read_orientation_outward: whether reads in a pair point outward

        return:
        obj_ref: return object reference
        '''
        log('--->\nrunning ImportSRAUtil.import_sra_from_staging\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_sra_from_staging_params(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_sra_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')
        log('Downloaded staging file to: {}'.format(scratch_sra_file_path))

        fastq_file_path = self._sra_to_fastq(scratch_sra_file_path)

        import_sra_reads_params = params
        import_sra_reads_params.update(fastq_file_path)

        workspace_name_or_id = params.get('workspace_name')
        if str(workspace_name_or_id).isdigit():
            import_sra_reads_params['wsid'] = int(workspace_name_or_id)
        else:
            import_sra_reads_params['wsname'] = str(workspace_name_or_id)

        log('--->\nrunning ReadsUtils.upload_reads\nparams:\n{}'.format(
            json.dumps(import_sra_reads_params, indent=1)))
        returnVal = self.ru.upload_reads(import_sra_reads_params)

        return returnVal

    def validate_import_sra_from_staging_params(self, params):
        """
        validate_import_sra_from_staging_params:
            validates params passed to import_sra_from_staging method
        """
        # check for required parameters
        for p in ['staging_file_subdir_path', 'sequencing_tech', 'name',
                  'workspace_name']:
            if p not in params:
                raise ValueError(
                    '"' + p + '" parameter is required, but missing')

        self._validate_upload_staging_file_availability(
            params.get('staging_file_subdir_path'))

    def generate_report(self, obj_ref, params):
        """
        generate_report: generate summary report

        obj_ref: generated workspace object reference
                 (return of import_sra_from_staging)
        params:
            staging_file_subdir_path: subdirectory file path
            e.g.
                for file: /data/bulk/user_name/file_name
                staging_file_subdir_path is file_name
                for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
                staging_file_subdir_path is subdir_1/subdir_2/file_name
            workspace_name: workspace name/ID that reads will be stored to
        """
        uuid_string = str(uuid.uuid4())

        upload_message = 'Import Finished\n'

        get_objects_params = {
            'object_refs': [obj_ref],
            'ignore_errors': False
        }

        object_data = self.dfu.get_objects(get_objects_params)

        number_of_reads = object_data.get('data')[0].get('data').get(
            'read_count')

        upload_message += "Reads Name: "
        upload_message += str(
            object_data.get('data')[0].get('info')[1]) + '\n'
        upload_message += 'Imported Reads File: {}\n'.format(
            params.get('staging_file_subdir_path'))
        if isinstance(number_of_reads, (int, long)):
            upload_message += 'Number of Reads: {:,}\n'.format(
                number_of_reads)

        report_params = {
            'message': upload_message,
            'workspace_name': params.get('workspace_name'),
            'report_object_name': 'kb_upload_methods_report_' + uuid_string
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'],
                         'report_ref': output['ref']}

        return report_output
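
# --- Usage sketch (illustrative, not part of the original module) ---
# Hypothetical staging-area import using the ImportSRAUtil class above; all
# parameter values are placeholders, and the config keys mirror what
# __init__ expects (SDK_CALLBACK_URL, KB_AUTH_TOKEN, scratch).
def _example_import_sra_from_staging(config):
    importer = ImportSRAUtil(config)
    params = {
        'staging_file_subdir_path': 'subdir_1/reads.sra',  # placeholder path
        'sequencing_tech': 'Illumina',
        'name': 'my_reads',
        'workspace_name': 'my_workspace'
    }
    upload_info = importer.import_sra_from_staging(params)
    # the returned obj_ref feeds directly into the summary report
    return importer.generate_report(upload_info['obj_ref'], params)
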
class variation_importer_utils:

    def __init__(self, utility_params):
        self.params = utility_params
        # self.scratch = utility_params['scratch']
        self.scratch = os.path.join(utility_params['scratch'],
                                    'variation_importer_' + str(uuid.uuid4()))
        os.mkdir(self.scratch)
        self.service_wiz_url = utility_params['srv-wiz-url']
        self.callback_url = utility_params['callback_url']

        self.dfu = DataFileUtil(self.callback_url)
        self.kbr = KBaseReport(self.callback_url,
                               token=utility_params['token'])

    def _create_fake_location_data(self):
        location = {
            'lat': random.uniform(-90, 90),
            'lon': random.uniform(-180, 180),
            'elevation': random.uniform(0, 100),
            'description': "".join([random.choice(string.ascii_letters)
                                    for n in xrange(20)])
        }
        return location

    def _create_fake_straininfo(self, genotype_id):
        straininfo = {
            'source_id': genotype_id,
            'location_info': self._create_fake_location_data()
        }
        return straininfo

    def _create_fake_population(self, genotypes):
        population = {'description': 'Faker population data.', 'strains': []}
        for genome in genotypes:
            population['strains'].append(
                self._create_fake_straininfo(genome))
        return population

    def _create_fake_kinship_matrix(self):
        kinship = {
            'row_ids': ['one', 'two'],
            'col_ids': ['one', 'two'],
            'kinship_coefficients': [
                [0.1, 0.1],
                [0.1, 0.1]
            ]
        }
        return kinship

    def _compare(self, s, t):
        return Counter(s) == Counter(t)

    def pretend_download_staging_file(self, vcf_filename, scratch):
        vcf_filepath = os.path.join(scratch, vcf_filename)
        shutil.copy('/kb/module/data/' + vcf_filename, vcf_filepath)
        return {'copy_file_path': vcf_filepath}

    def _generate_population(self, location_filepath, genotypes,
                             population_description="None Provided"):
        locations = pd.read_csv(location_filepath, delimiter='\t')

        # Drop any missing data from id, latitude, or longitude.
        locations.dropna(subset=['id', 'latitude', 'longitude'],
                         inplace=True)

        # Compare the location IDs with the genotype IDs
        if not self._compare(locations.iloc[:, 0].astype(str).tolist(),
                             genotypes):
            log("Location IDs do not match Sample IDs in Variation file!")
            raise ValueError(
                "Location IDs do not match Sample IDs in Variation file!")

        col_names = [x.lower() for x in locations.columns.values]
        expected_columns = ['id', 'latitude', 'longitude']
        optional_columns = ['elevation', 'description']

        # Check that the first three columns match the expected columns.
        if not self._compare(col_names[0:3], expected_columns):
            raise ValueError(
                "Missing or unexpected column names in {}".format(
                    location_filepath))

        # If optional columns are not present, give default value for each.
        for col in optional_columns:
            if col not in col_names:
                if col == 'elevation':
                    locations[col] = 0.0
                else:
                    locations[col] = "None provided."

        population = {'description': population_description, 'strains': []}
        for idx, row in locations.iterrows():
            population['strains'].append({
                'source_id': str(row['id']),
                'location_info': {
                    'lat': row['latitude'],
                    'lon': row['longitude'],
                    'elevation': row['elevation'],
                    'description': row['description']
                }
            })

        return population

    def _validate_vcf(self, vcf_filepath, vcf_version):
        validation_output_dir = os.path.join(
            self.scratch, 'validation_' + str(uuid.uuid4()))
        os.mkdir(validation_output_dir)

        # TODO: Make this choice more robust.
        # Attempt conversion to 4.1?
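        # Validator choice (context, assumption-based): vcf_validator_linux
        # (EBI's vcf-validator binary) handles VCF >= 4.1 and writes a *.txt
        # summary into the -o output directory, which is parsed below; for
        # older files the code falls back to vcftools' vcf-validator, which
        # only reports to stdout, hence "No validation logging".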
        if vcf_version >= 4.1:
            print("Using vcf_validator_linux...")
            validator_cmd = ["vcf_validator_linux"]
            validator_cmd.append("-i")
            validator_cmd.append(vcf_filepath)
            validator_cmd.append("-o")
            validator_cmd.append(validation_output_dir)
        else:
            print("Using vcftools to validate...")
            validator_cmd = ["vcf-validator"]
            validator_cmd.append(vcf_filepath)
            print("VCF version below 4.1. No validation logging.")

        print("Validator command: {}".format(validator_cmd))
        p = subprocess.Popen(validator_cmd,
                             cwd=self.scratch,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)

        validator_output = []
        while True:
            line = p.stdout.readline()
            if not line:
                break
            validator_output.append(line)

        p.wait()

        # check for the log file before indexing so an empty directory
        # raises the intended error instead of an IndexError
        validation_output_files = [
            f for f in os.listdir(validation_output_dir)
            if f.endswith('.txt')
        ]
        if not validation_output_files:
            print('Validator did not generate log file!')
            raise Exception("Validator did not generate a log file.")
        validation_output_filename = validation_output_files[0]
        validation_output_filepath = os.path.join(
            validation_output_dir, validation_output_filename)

        log("Validator output filepath: {}".format(
            validation_output_filepath))
        log("Return code from validator {}".format(p.returncode))

        return validation_output_filepath, p.returncode

    # Retrieve contigs from assembly file.
    def _get_contigs_from_assembly(self, assembly_ref, type='Assembly'):
        try:
            assembly_data = self.dfu.get_objects(
                {'object_refs': [assembly_ref]})['data'][0]['data']
        except Exception as e:
            print("Unable to retrieve Assembly reference: {}".format(
                assembly_ref))
            raise ValueError(e)
        raw_contigs = assembly_data['contigs']
        contigs = {}

        # NOTE: this id-to-contig_id map is built but unused; the raw
        # contigs dict is what gets returned below.
        for key, value in raw_contigs.iteritems():
            contigs[str(key)] = value['contig_id']

        return raw_contigs

    def _get_version_contigs_genotypes(self, vcf_filepath):
        contigs = []
        genotypes = []
        version = ''
        with (gzip.open if vcf_filepath.endswith('.gz') else open)(
                vcf_filepath, 'rt') as vcf:
            line = vcf.readline()
            tokens = line.split('=')
            if not tokens[0].startswith('##fileformat'):
                log("Invalid VCF. ##fileformat line in meta is improperly "
                    "formatted.")
                raise ValueError("Invalid VCF. ##fileformat line in meta is "
                                 "improperly formatted.")
            version = float(tokens[1][-4:].rstrip())
            log("VCF version: {}".format(version))
            for line in vcf:
                if line.startswith("#CHROM"):
                    log("#CHROM encountered, exiting loop.")
                    genotypes = line.split()[9:]
                    log("Number Genotypes in vcf: {}".format(len(genotypes)))
                    break
                tokens = line.split("=")
                if tokens[0].startswith('##contig'):
                    contigs.append(tokens[2][:-2])

        return version, contigs, genotypes

    # Arabidopsis ref: 18590/2/8
    def _get_assembly_ref_from_genome(self, genome_ref):
        ga = GenomeAnnotationAPI(self.service_wiz_url)
        inputs_get_assembly = {'ref': genome_ref}
        try:
            assembly_object_ref = ga.get_assembly(inputs_get_assembly)
        except Exception as e:
            print("Unable to retrieve Assembly reference ID from Genome "
                  "ref_id: {}".format(genome_ref))
            raise Exception(e)

        return assembly_object_ref

    def _generate_output_file_list(self):
        log('Start packing result files')
        output_files = list()

        result_file = os.path.join(self.scratch,
                                   'variation_importer_results.zip')
        excluded_extensions = ['.zip', '.vcf', '.vcf.gz', '.html',
                               '.DS_Store']
        with zipfile.ZipFile(result_file, 'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as zip_file:
            for root, dirs, files in os.walk(self.scratch):
                for file in files:
                    if not file.endswith(tuple(excluded_extensions)):
                        zip_file.write(os.path.join(root, file), file)

        output_files.append({
            'path': result_file,
            'name': os.path.basename(result_file),
            'label': os.path.basename(result_file),
            'description': 'File(s) generated by Variation Importer'
        })
        log("Importer output generated: {}".format(output_files))

        return output_files

    def _generate_report(self, params, variation_results,
                         variation_file_path):

        stats_results = self._generate_variation_stats(None,
                                                       variation_file_path)

        html_report = self._generate_html_report(variation_results,
                                                 stats_results)

        file_links = self._generate_output_file_list()

        objects = []
        if variation_results['valid_variation_file']:
            objects = [{
                'ref': variation_results['variation_obj_ref'],
                'description': 'Variation Object created by VCF Importer'
            }]

        report_params = {
            'objects_created': objects,
            'message': '',
            'direct_html_link_index': 0,
            'file_links': file_links,
            'html_links': html_report,
            'html_window_height': 330,
            'workspace_name': params['workspace_name'],
            'report_object_name': 'variation_importer_report_' +
                                  str(uuid.uuid4())
        }
        kbr_output = self.kbr.create_extended_report(report_params)
        report_output = {
            'report_name': kbr_output['name'],
            'report_ref': kbr_output['ref'],
            'variation_ref': variation_results['variation_obj_ref']
        }
        log("Returning from _generate_report!")
        return report_output

    def _generate_html_report(self, variation_results, stats_output=None):
        """
        _generate_html_report: generate html report from output files
        """
        html_report = list()
        print("Validation output filepath passed to html report: {}".format(
            variation_results['validation_output_filepath']))
        try:
            report_dir = os.path.join(self.scratch, 'html')
            os.mkdir(report_dir)

            # NOTE: template_dir (the path to the HTML report template) is
            # not defined in this class; it must be supplied at module level.
            with open(template_dir, 'r') as html, open(
                    variation_results['validation_output_filepath'],
                    'r') as validation:
                validation_content = '<p><h4>{} '.format(
                    variation_results['variation_filename'])
                if variation_results.get('valid_variation_file'):
                    validation_content += ('<em><i>is</i> a valid </em> '
                                           'variation file.')
                else:
                    validation_content += ('<em><i>is not</i> a valid </em>'
                                           'variation file. Details below.')
                validation_content += '</h4></p>'

                report = html.read()

                # Discard the first line of the validation file.
                # It is irrelevant.
                validation.readline()

                validation_content += ('<p><h4>Errors and warnings generated '
                                       'by VCF validator:</h4></p>')
                validation_content += '<ul>'
                for line in validation.readlines():
                    validation_content += '<li>{}</li>'.format(line)
                validation_content += '</ul>'

                if variation_results.get('invalid_contigs'):
                    validation_content += (
                        '<h4>The following contigs were not found in the '
                        'reference genome ({}). The valid contigs have been '
                        'written to the file {}. Please see the associated '
                        'links to download.</h4>'.format(
                            variation_results.get('genome_ref'),
                            'valid_contigs.txt'))
                    validation_content += '<ul>'
                    for contig in variation_results.get('invalid_contigs'):
                        validation_content += '<li>{}</li>'.format(contig)
                    validation_content += '</ul>'

                # if not variation_results.get('contigs'):
                #     validation_content += '<h4>No contig information was
                #     included in the VCF file header! Please recreate the
                #     VCF file with each contig described in the meta
                #     description</h4>'

                report = report.replace('Validation_Results',
                                        validation_content)

                if stats_output.get('stats_file_dir'):
                    summary_results = '<p><h4>Summary Statistics</h4></p>'
                    summary_results += '''
                        <table>
                            <tr>
                                <th>Number of SNPs</th>
                                <th>Number of Genotypes </th>
                            </tr>
                    '''
                    summary_results += '<tr>'
                    summary_results += '<td>{}</td><td>{}</td>'.format(
                        'To be added later',
                        variation_results['num_genotypes'])
                    summary_results += '</tr></table>'
                    report = report.replace('Variation_Statistics',
                                            summary_results)

                # visualization
                image_content = ''
                if stats_output.get('stats_img_dir'):
                    image_dir = stats_output.get('stats_img_dir')

                    for file in glob.glob(os.path.join(image_dir, '*.png')):
                        shutil.move(file, report_dir)

                    for image in glob.glob(report_dir + "/*.png"):
                        image = image.replace(report_dir + '/', '')
                        caption = image.replace('.png', '')
                        image_content += (
                            '<p style="text-align:center">'
                            '<img align="center" src="{}"></a>'
                            '<a target="_blank"><br>'
                            '<p align="center">{}</p></p>'.format(image,
                                                                  caption))
                else:
                    image_content += 'No visualizations generated.'
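
                # Template convention (inferred from the replaces in this
                # method): the HTML template contains plain-text placeholder
                # tokens ('Validation_Results', 'Variation_Statistics',
                # 'Visualization_Results'); str.replace on a missing token
                # is a no-op, so a mismatched template fails silently.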
                report = report.replace("Visualization_Results",
                                        image_content)
        except Exception as e:
            print("Error generating HTML report.")
            raise

        report_file_path = os.path.join(report_dir, 'index.html')
        with open(report_file_path, 'w') as output:
            output.write(report)

        try:
            html_upload_ret = self.dfu.file_to_shock({
                'file_path': report_file_path,
                'make_handle': 0,
                'pack': 'zip'
            })
            log("Variation HTML report to shock ref: {}".format(
                html_upload_ret))
        except:
            raise ValueError('Error uploading HTML to shock')

        html_report.append({
            'shock_id': html_upload_ret['shock_id'],
            'name': os.path.basename(report_file_path),
            'label': os.path.basename(report_file_path),
            'description': 'HTML report for Variation Importer'
        })

        return html_report

    def _generate_variation_stats(self, cmd_line_args, variation_filepath):
        """
        _generate_variation_stats: run PLINK on the variation file and
        collect summary statistics and visualizations

        :param cmd_line_args: optional ';'-separated extra PLINK arguments
        :param variation_filepath: path to the VCF file to analyze
        """
        file_output_directory = os.path.join(self.scratch,
                                             'stats_' + str(uuid.uuid4()))
        os.mkdir(file_output_directory)

        image_output_directory = os.path.join(
            self.scratch, 'stats_images_' + str(uuid.uuid4()))
        os.mkdir(image_output_directory)

        # TODO: Validate user supplied params and build PLINK command
        plink_cmd = ["plink"]
        plink_cmd.append('--vcf')
        plink_cmd.append(variation_filepath)

        if cmd_line_args is not None:
            cmds = cmd_line_args.split(';')
            for cmd in cmds:
                plink_cmd.append(cmd)
        # plink_cmd.append('--recode12')
        # plink_cmd.append('transpose')
        # plink_cmd.append('--output-missing-genotype')
        # plink_cmd.append("0")
        plink_cmd.append('--freq')
        plink_cmd.append('--hardy')
        # plink_cmd.append('gz')
        plink_cmd.append('--out')
        plink_cmd.append(variation_filepath)

        print("PLINK arguments: {}".format(plink_cmd))

        plink_output = {
            "errors": [],
            "warnings": []
            # "notes": []
        }
        p = subprocess.Popen(plink_cmd,
                             cwd=file_output_directory,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)
        while True:
            line = p.stdout.readline()
            if not line:
                break
            # log(line)
            tokens = line.split(':')
            if tokens[0] == 'Error':
                plink_output['errors'].append(line)
                raise ValueError('PLINK 1.9 error: ' + line)
            elif tokens[0] == 'Warning':
                plink_output['warnings'].append(line)
                print(line)
            # elif tokens[0] == 'Note':
            #     plink_output['notes'].append(line)
            #     print(line)

        p.stdout.close()
        p.wait()

        plink_output_filepath = os.path.join(file_output_directory,
                                             'plink_cli_output.txt')
        with open(plink_output_filepath, 'w') as plink:
            for data in plink_output:
                plink.write("{}: {}\n".format(data, plink_output[data]))

        plink_output_files = [
            f for f in os.listdir(self.scratch)
            if f.startswith(os.path.basename(variation_filepath) + '.')
        ]

        for file in plink_output_files:
            shutil.move(os.path.join(self.scratch, file),
                        file_output_directory)

        if p.returncode != 0:
            log("PLINK encountered an error during runtime. "
                "Please see log file.")

        variation_filename = os.path.basename(variation_filepath)
        base_filepath = os.path.join(file_output_directory,
                                     variation_filename)
        freq_filepath = base_filepath + '.frq'

        maf_script_filepath = ('/kb/module/lib/kb_variation_importer/'
                               'Utils/MAF_check.R')
        hwe_script_filepath = ('/kb/module/lib/kb_variation_importer/'
                               'Utils/HWE.R')

        log("Frequency filepath: {}".format(freq_filepath))
        # TODO: make function to do Rscript calls.
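        # (Sketch of the helper implied by the TODO, assumption-based: each
        # Rscript invocation below has the shape
        #     Rscript --no-save --vanilla <script.R> <input> <output.png> ...
        # so a wrapper taking the script path and its arguments, running it
        # in image_output_directory, and checking the return code would
        # remove the duplication.)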
        # generate visualizations and store in directory
        maf_command = ['Rscript']
        maf_command.append('--no-save')
        maf_command.append('--vanilla')
        maf_command.append(maf_script_filepath)
        maf_command.append(freq_filepath)
        maf_command.append("Minor Allele Frequencies.png")
        print("MAF command: {}".format(maf_command))

        r = subprocess.Popen(maf_command,
                             cwd=image_output_directory,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)
        r.wait()
        if r.returncode != 0:
            log("Error creating MAF histogram in R")

        hwe_filepath = base_filepath + '.hwe'
        zoom_filepath = hwe_filepath + '.zoom'
        log("HWE filepath: {}".format(hwe_filepath))
        zoom_command = \
            '''awk '{{ if ($9 < 0.00001) print $0 }}' {} > {}'''.format(
                hwe_filepath, zoom_filepath)
        log("Zoom cmd: {}".format(zoom_command))
        try:
            z = subprocess.Popen(zoom_command,
                                 cwd=file_output_directory,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.STDOUT,
                                 shell=True)
            z.wait()
            if z.returncode != 0:
                log("Error creating HWE zoom file.")
        except Exception as e:
            log("Error creating zoom HWE file: {}".format(e))

        hwe_command = ['Rscript']
        hwe_command.append('--no-save')
        hwe_command.append('--vanilla')
        hwe_command.append(hwe_script_filepath)
        hwe_command.append(hwe_filepath)
        hwe_command.append("Hardy-Weinberg Equilibrium.png")
        hwe_command.append(zoom_filepath)
        hwe_command.append("Hardy-Weinberg Equilibrium Zoom.png")
        print("HWE command: {}".format(hwe_command))

        h = subprocess.Popen(hwe_command,
                             cwd=image_output_directory,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)
        h.wait()
        if h.returncode != 0:
            log("Error generating HWE Zoom plot")

        return {
            'stats_file_dir': file_output_directory,
            'stats_img_dir': image_output_directory
        }

    def _save_variation_to_ws(self, workspace_name, variation_object_name,
                              variation_obj, variation_filepath,
                              kinship_matrix):

        ws_id = self.dfu.ws_name_to_id(workspace_name)
        try:
            vcf_shock_return = self.dfu.file_to_shock({
                'file_path': variation_filepath,
                'make_handle': 1,
                'pack': 'gzip'
            })
        except Exception as e:
            print("Error uploading file to shock!")
            raise ValueError(e)

        variation_obj['variation_file_reference'] = vcf_shock_return.get(
            'shock_id')

        info = self.dfu.save_objects({
            'id': ws_id,
            'objects': [{
                'type': 'KBaseGwasData.Variations',
                'data': variation_obj,
                'name': variation_object_name
            }]
        })[0]

        variation_ref = "%s/%s/%s" % (info[6], info[0], info[4])
        log("Variation reference created: {}".format(variation_ref))
        return variation_ref

    def validate_vcf(self, params):
        """
        validate_vcf: validate a VCF file from the staging area and, if it
        passes, save it as a KBaseGwasData.Variations object

        :param params: dict containing all input parameters.
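
        Expected keys (inferred from the method body; treat as a sketch):
            variation_file_subdir_path: VCF file path in the staging area
            variation_attributes_subdir_path: tab-delimited location
                (attributes) file path
            genome_ref: reference to the KBase Genome object
            workspace_name: workspace to save the Variation object to
            variation_object_name: name for the saved Variation object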
""" returnVal = {} valid_vcf_file = True try: # vcf_filepath = self.pretend_download_staging_file( # params['variation_file_subdir_path'], self.scratch).get('copy_file_path') vcf_filepath = self.dfu.download_staging_file({ 'staging_file_subdir_path': params['variation_file_subdir_path'] }).get('copy_file_path') location_filepath = self.pretend_download_staging_file( params['variation_attributes_subdir_path'], self.scratch).get('copy_file_path') except Exception as e: raise Exception("Unable to download {} from staging area.".format( params['variation_file_subdir_path'])) try: location_filepath = self.pretend_download_staging_file( params['variation_attributes_subdir_path'], self.scratch).get('copy_file_path') except Exception as e: raise Exception("Unable to download {} from staging area.".format( params['variation_attributes_subdir_path'])) # Check file size log("{} file size: {}".format(vcf_filepath, os.path.getsize(vcf_filepath))) log('\nValidating {}...'.format(vcf_filepath)) vcf_version, vcf_contigs, vcf_genotypes = self._get_version_contigs_genotypes( vcf_filepath) if not vcf_contigs: log("No contig data in {} header.".format(vcf_filepath)) raise ValueError( "No contig data in {} header.".format(vcf_filepath)) if (vcf_version < 4.1): log("VCF file is version {}. Must be at least version 4.1".format( vcf_version)) raise ValueError( "VCF file is version {}. Must be at least version 4.1".format( vcf_version)) # Generate population object population = self._generate_population(location_filepath, vcf_genotypes) # Retrieve Assembly object reference associated with genome. try: assembly_ref = self._get_assembly_ref_from_genome( params['genome_ref']) except Exception as e: print("Unable to retrieve {}".format(params['genome_ref'])) raise ValueError(e) # Retrieve contig list from Assembly object. 
        try:
            assembly_contigs = self._get_contigs_from_assembly(assembly_ref)
        except Exception as e:
            print("Unable to retrieve contigs from Assembly ref: {}".format(
                assembly_ref))
            raise ValueError(e)

        log("Length of assembly contigs: {}".format(len(assembly_contigs)))

        # Compare contig IDs from VCF to those in the Assembly object
        invalid_contigs = []
        for contig in vcf_contigs:
            if contig not in assembly_contigs.keys():
                invalid_contigs.append(contig)

        if invalid_contigs:
            log("Invalid contig IDs found in {}".format(vcf_filepath))
            valid_contig_filepath = os.path.join(self.scratch,
                                                 'valid_contigs.txt')
            log("Writing valid contigs to file: {}".format(
                valid_contig_filepath))
            with open(valid_contig_filepath, 'w') as icf:
                for contig in assembly_contigs:
                    icf.write(contig + '\n')
            valid_vcf_file = False

        validation_output_filepath, returncode = self._validate_vcf(
            vcf_filepath, vcf_version)

        if returncode != 0:
            valid_vcf_file = False

        kinship_matrix = self._create_fake_kinship_matrix()

        variation_obj_ref = ''
        if valid_vcf_file:
            variation_object = {
                "genome": params['genome_ref'],
                "population": population,
                "contigs": vcf_contigs,
                "comment": "Comments go here",
                "assay": "Assay data goes here.",
                "originator": "PI/Lab info goes here",
                "pubmed_id": "PubMed ID goes here",
                "kinship_info": kinship_matrix
            }

            variation_obj_ref = self._save_variation_to_ws(
                params['workspace_name'],
                params['variation_object_name'],
                variation_object,
                vcf_filepath,
                kinship_matrix)
            log("Variation object reference: {}".format(variation_obj_ref))

        variation_report_metadata = {
            'valid_variation_file': valid_vcf_file,
            'variation_obj_ref': variation_obj_ref,
            'variation_filename': os.path.basename(vcf_filepath),
            'validation_output_filepath': validation_output_filepath,
            'vcf_version': vcf_version,
            'num_genotypes': len(vcf_genotypes),
            'num_contigs': len(vcf_contigs),
            'invalid_contigs': invalid_contigs
        }

        returnVal = self._generate_report(params, variation_report_metadata,
                                          vcf_filepath)

        return returnVal
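
# --- Usage sketch (illustrative, not part of the original module) ---
# Hypothetical end-to-end call of the variation importer above. The
# utility_params keys mirror __init__'s expectations (scratch, srv-wiz-url,
# callback_url, token) and all values below are placeholders; genome_ref
# reuses the Arabidopsis example ref noted in the comments above.
def _example_validate_vcf(utility_params):
    importer = variation_importer_utils(utility_params)
    params = {
        'variation_file_subdir_path': 'variants.vcf',         # placeholder
        'variation_attributes_subdir_path': 'locations.tsv',  # placeholder
        'genome_ref': '18590/2/8',
        'workspace_name': 'my_workspace',
        'variation_object_name': 'my_variation'
    }
    # returns the report name/ref plus the new Variation object reference
    return importer.validate_vcf(params)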