# -*- coding: utf-8 -*-
#BEGIN_HEADER
# NOTE: the original header block was not included in this listing. The
# imports below are assumptions, reconstructed from the names used in the
# class: os.environ, the DataFileUtil/sleep_job/simpleapp/alans_job SDK
# clients, and the Pool(ncpus=...) call, whose signature matches pathos.
import os

from pathos.multiprocessing import ProcessingPool as Pool  # assumed
from DataFileUtil.DataFileUtilClient import DataFileUtil   # assumed client path
from sleep_job.sleep_jobClient import sleep_job            # assumed client path
from simpleapp.simpleappClient import simpleapp as simpleapp_client  # assumed
from alans_job.alans_jobClient import alans_job            # assumed client path
#END_HEADER


class simpleapp:
    '''
    Module Name:
    simpleapp

    Module Description:
    A KBase module: simpleapp
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = "https://[email protected]/bio-boris/newapp.git"
    GIT_COMMIT_HASH = "6ea6b3b856d755d466af415d5f7473a82daf1f26"

    #BEGIN_CLASS_HEADER
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.dfu = DataFileUtil(self.callback_url)
        self.sj = sleep_job(self.callback_url)
        self.sac = simpleapp_client(self.callback_url)
        self.aj = alans_job(self.callback_url, service_ver='dev')
        #END_CONSTRUCTOR
        pass

    def simple_add(self, ctx, params):
        """
        :param params: instance of type "SimpleParams" (Insert your typespec
           information here.) -> structure: parameter "base_number" of Long,
           parameter "workspace_name" of String
        :returns: instance of type "SimpleResults" -> structure: parameter
           "new_number" of Long
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN simple_add
        input_number = int(params['base_number'])
        input_number += 100
        output = {'new_number': input_number}
        # exercise the callback server: fetch a small file via DataFileUtil
        path = self.dfu.download_web_file(
            {'file_url': "http://kbase.us/wp-content/uploads/2016/09/Kbase_Logo_newWeb.png",
             'download_type': 'Direct Download'}).get('copy_file_path')
        print("About to do some refdata")
        return_value = self.aj.run_alans_job({'workspace_name': params['workspace_name']})
        print("got")
        print(return_value)
        #END simple_add

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method simple_add return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def simple_add_multiprocessing(self, ctx, params):
        """
        :param params: instance of type "SimpleParams" (Insert your typespec
           information here.) -> structure: parameter "base_number" of Long,
           parameter "workspace_name" of String
        :returns: instance of type "SimpleResults" -> structure: parameter
           "new_number" of Long
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN simple_add_multiprocessing
        import random
        random_nums = random.sample(range(1, 1001), 1000)
        p = Pool(ncpus=int(params['pool_size']))
        print("About to start with pool size of: " + str(params['pool_size']))
        f = self.sac.simple_add
        # build one params dict per job; renamed from `params` so the method
        # argument is not clobbered
        job_params = []
        for number in random_nums:
            job_params.append({'base_number': str(number)})
        print(p.map(f, job_params))
        output = {'new_number': 0}
        #END simple_add_multiprocessing

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method simple_add_multiprocessing return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def simple_add_with_sleep(self, ctx, params):
        """
        :param params: instance of type "SimpleParams" (Insert your typespec
           information here.) -> structure: parameter "base_number" of Long,
           parameter "workspace_name" of String
        :returns: instance of type "SimpleResults" -> structure: parameter
           "new_number" of Long
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN simple_add_with_sleep
        input_number = int(params['base_number'])
        input_number += 100
        output = {'new_number': input_number}
        path = self.dfu.download_web_file(
            {'file_url': "http://kbase.us/wp-content/uploads/2016/09/Kbase_Logo_newWeb.png",
             'download_type': 'Direct Download'}).get('copy_file_path')
        print("About to sleep for " + str(params['base_number']))
        sleep_params = {'input_sleep': int(params['base_number'])}
        self.sj.simple_sleep(sleep_params)
        #END simple_add_with_sleep

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method simple_add_with_sleep return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def simple_add_with_sleep_client(self, ctx, params):
        """
        :param params: instance of type "SimpleParams" (Insert your typespec
           information here.) -> structure: parameter "base_number" of Long,
           parameter "workspace_name" of String
        :returns: instance of type "SimpleResults" -> structure: parameter
           "new_number" of Long
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN simple_add_with_sleep_client
        # NOTE: this method body was empty in the original listing, which
        # leaves `output` undefined and makes the type check below raise a
        # NameError. A placeholder result is returned so the stub is runnable.
        output = {'new_number': 0}
        #END simple_add_with_sleep_client

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method simple_add_with_sleep_client return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def simple_add_hpc_client_group(self, ctx, params):
        """
        :param params: instance of type "SimpleParams" (Insert your typespec
           information here.) -> structure: parameter "base_number" of Long,
           parameter "workspace_name" of String
        :returns: instance of type "SimpleResults" -> structure: parameter
           "new_number" of Long
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN simple_add_hpc_client_group
        input_number = int(params['base_number'])
        input_number += 100
        output = {'new_number': input_number}
        path = self.dfu.download_web_file(
            {'file_url': "http://kbase.us/wp-content/uploads/2016/09/Kbase_Logo_newWeb.png",
             'download_type': 'Direct Download'}).get('copy_file_path')
        #END simple_add_hpc_client_group

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method simple_add_hpc_client_group return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def simple_add_hpc_client_group_extra_simple(self, ctx, params):
        """
        :param params: instance of type "SimpleParams" (Insert your typespec
           information here.) -> structure: parameter "base_number" of Long,
           parameter "workspace_name" of String
        :returns: instance of type "SimpleResults" -> structure: parameter
           "new_number" of Long
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN simple_add_hpc_client_group_extra_simple
        input_number = int(params['base_number'])
        input_number += 100
        output = {'new_number': input_number}
        #END simple_add_hpc_client_group_extra_simple

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method simple_add_hpc_client_group_extra_simple return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def example_method_logs(self, ctx, params):
        """
        :param params: instance of type "SimpleParams" (Insert your typespec
           information here.) -> structure: parameter "base_number" of Long,
           parameter "workspace_name" of String
        :returns: instance of type "SimpleResults" -> structure: parameter
           "new_number" of Long
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN example_method_logs
        import random

        def sentence_generator():
            nouns = ("puppy", "car", "rabbit", "girl", "monkey")
            verbs = ("runs", "hits", "jumps", "drives", "barfs")
            adv = ("crazily.", "dutifully.", "foolishly.", "merrily.", "occasionally.")
            adj = ("adorable", "clueless", "dirty", "odd", "stupid")
            num = random.randrange(0, 5)
            sentence = nouns[num] + ' ' + verbs[num] + ' ' + adv[num] + ' ' + adj[num]
            return sentence

        # spam roughly 32 million characters to stdout to stress log capture
        chars = 32000000
        while chars > 0:
            generated = sentence_generator() * 5
            chars -= len(generated)
            print(generated)
        print("This is the end of the program bro")
        print("This is the end of the program bro")
        output = {'new_number': chars}
        #END example_method_logs

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method example_method_logs return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {'state': "OK",
                     'message': "",
                     'version': self.VERSION,
                     'git_url': self.GIT_URL,
                     'git_commit_hash': self.GIT_COMMIT_HASH}
        #END_STATUS
        return [returnVal]
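
# --- Illustrative usage sketch (not part of the module above) ---------------
# A minimal example of exercising one of the generated methods from a unit
# test. The context dict, the empty config, and the callback URL value are
# assumptions based on the standard KBase SDK test harness; simple_add also
# needs a live callback server for the DataFileUtil and alans_job calls.
#
#     import os
#     os.environ['SDK_CALLBACK_URL'] = 'http://localhost:9999'  # placeholder
#     impl = simpleapp(config={})
#     ctx = {'token': os.environ.get('KB_AUTH_TOKEN'), 'provenance': []}
#     result = impl.simple_add(ctx, {'base_number': '5',
#                                    'workspace_name': 'my_workspace'})[0]
#     assert result['new_number'] == 105  # base_number + 100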
# -*- coding: utf-8 -*-
# NOTE: the original import block was not part of this listing; the imports
# below are assumptions reconstructed from the names used in the class.
import os
import sys
import uuid
from collections import Counter
from hashlib import md5

from Bio import SeqIO  # Biopython, assumed from the SeqIO.parse/write calls
from DataFileUtil.DataFileUtilClient import DataFileUtil  # assumed client path


class FastaToAssembly:

    def __init__(self, callback_url, scratch):
        self.scratch = scratch
        self.dfu = DataFileUtil(callback_url)

        # Note added X due to kb|g.1886.fasta
        self.valid_chars = "-ACGTUWSMKRYBDHVNX"
        self.amino_acid_specific_characters = "PLIFQE"

    def import_fasta(self, ctx, params):
        print('validating parameters')
        self.validate_params(params)

        print('staging input files')
        fasta_file_path = self.stage_input(params)

        if 'min_contig_length' in params:
            min_contig_length = int(params['min_contig_length'])
            print('filtering FASTA file by contig length (min len=' +
                  str(min_contig_length) + 'bp)')
            fasta_file_path = self.filter_contigs_by_length(fasta_file_path,
                                                            min_contig_length)

        print('parsing FASTA file: ' + str(fasta_file_path))
        assembly_data = self.parse_fasta(fasta_file_path, params)
        print(' - parsed ' + str(assembly_data['num_contigs']) + ' contigs, ' +
              str(assembly_data['dna_size']) + 'bp')

        print('saving assembly to KBase')

        # save file to shock and build handle
        fasta_file_handle_info = self.save_fasta_file_to_shock(fasta_file_path)

        # construct the output object
        assembly_object_to_save = self.build_assembly_object(assembly_data,
                                                             fasta_file_handle_info,
                                                             params)

        # save to WS and return
        if 'workspace_id' in params:
            workspace_id = int(params['workspace_id'])
        else:
            workspace_id = self.dfu.ws_name_to_id(params['workspace_name'])
        assembly_info = self.save_assembly_object(workspace_id,
                                                  params['assembly_name'],
                                                  assembly_object_to_save)

        return assembly_info

    def build_assembly_object(self, assembly_data, fasta_file_handle_info, params):
        ''' construct the WS object data to save based on the parsed info and params '''
        assembly_data['assembly_id'] = params['assembly_name']
        assembly_data['fasta_handle_ref'] = fasta_file_handle_info['handle']['hid']
        assembly_data['fasta_handle_info'] = fasta_file_handle_info

        assembly_data['type'] = 'Unknown'
        if 'type' in params:
            assembly_data['type'] = params['type']

        if 'taxon_ref' in params:
            assembly_data['taxon_ref'] = params['taxon_ref']

        if 'external_source' in params:
            assembly_data['external_source'] = params['external_source']

        if 'external_source_id' in params:
            assembly_data['external_source_id'] = params['external_source_id']

        if 'external_source_origination_date' in params:
            assembly_data['external_source_origination_date'] = \
                params['external_source_origination_date']

        return assembly_data

    def parse_fasta(self, fasta_file_path, params):
        ''' Do the actual work of inspecting each contig '''

        # variables to store running counts of things
        total_length = 0
        base_counts = {'A': 0, 'G': 0, 'C': 0, 'T': 0}
        md5_list = []

        # map from contig_id to contig_info
        all_contig_data = {}
        extra_contig_info = {}
        if 'contig_info' in params:
            extra_contig_info = params['contig_info']

        for record in SeqIO.parse(fasta_file_path, "fasta"):
            # SeqRecord(seq=Seq('TTAT...', SingleLetterAlphabet()),
            #           id='gi|113968346|ref|NC_008321.1|',
            #           name='gi|113968346|ref|NC_008321.1|',
            #           description='gi|113968346|ref|NC_008321.1| Shewanella sp. MR-4 chromosome, complete genome',
            #           dbxrefs=[])

            sequence = str(record.seq).upper()

            contig_info = {
                'contig_id': record.id,
                'name': record.id,
                'description': record.description[len(record.id):].strip(),
                'length': len(record.seq)
            }

            # 1) compute sequence character statistics running total
            total_length += contig_info['length']
            sequence_count_table = dict(Counter(sequence))
            for character in sequence_count_table:
                if character in base_counts:
                    base_counts[character] = base_counts[character] + sequence_count_table[character]
                else:
                    base_counts[character] = sequence_count_table[character]
                if character not in self.valid_chars:
                    if character in self.amino_acid_specific_characters:
                        raise ValueError('This FASTA file may have amino acids in it instead '
                                         'of the required nucleotides.')
                    raise ValueError("This FASTA file has non-nucleic acid characters: {0}".format(character))

            # 2) record number of 'N' characters (only set if there are some)
            Ncount = 0
            if 'N' in sequence_count_table:
                Ncount = sequence_count_table['N']
            contig_info['Ncount'] = Ncount

            # 2b) record if the contig is circular
            if record.id in extra_contig_info:
                if 'is_circ' in extra_contig_info[record.id]:
                    contig_info['is_circ'] = int(extra_contig_info[record.id]['is_circ'])
                if 'description' in extra_contig_info[record.id]:
                    contig_info['description'] = str(extra_contig_info[record.id]['description'])

            # 3) record md5 checksum (hashlib.md5 requires bytes under Python 3)
            contig_md5 = md5(sequence.encode('utf-8')).hexdigest()
            contig_info['md5'] = contig_md5
            md5_list.append(contig_md5)

            # 4) record the all important GC to ~3 significant digits
            GC_count = 0
            for base in ['G', 'C']:
                if base in sequence_count_table:
                    GC_count += sequence_count_table[base]
            contig_info['gc_content'] = round(float(GC_count) / float(contig_info['length']), 5)

            # 5) add to contig list
            if contig_info['contig_id'] in all_contig_data:
                raise ValueError('The FASTA header key ' + contig_info['contig_id'] +
                                 ' appears more than once in the file')
            all_contig_data[contig_info['contig_id']] = contig_info

        # Aggregate stats for the data
        total_gc_content = None
        if total_length > 0:
            total_gc_content = round(float(base_counts['G'] + base_counts['C']) / float(total_length), 5)
        assembly_data = {
            'md5': md5(",".join(sorted(md5_list)).encode('utf-8')).hexdigest(),
            'base_counts': base_counts,
            'dna_size': total_length,
            'gc_content': total_gc_content,
            'contigs': all_contig_data,
            'num_contigs': len(all_contig_data)
        }
        return assembly_data

    def fasta_filter_contigs_generator(self, fasta_record_iter, min_contig_length):
        ''' generator that yields only the SeqRecords at least min_contig_length long '''
        rows = 0
        rows_added = 0
        for record in fasta_record_iter:
            rows += 1
            if len(record.seq) >= min_contig_length:
                rows_added += 1
                yield record
        print(' - filtered out ' + str(rows - rows_added) + ' of ' + str(rows) +
              ' contigs that were shorter than ' + str(min_contig_length) + 'bp.')

    def filter_contigs_by_length(self, fasta_file_path, min_contig_length):
        ''' removes all contigs shorter than the min_contig_length provided '''
        filtered_fasta_file_path = fasta_file_path + '.filtered.fa'

        fasta_record_iter = SeqIO.parse(fasta_file_path, 'fasta')
        SeqIO.write(self.fasta_filter_contigs_generator(fasta_record_iter, min_contig_length),
                    filtered_fasta_file_path, 'fasta')

        return filtered_fasta_file_path

    def save_assembly_object(self, workspace_id, assembly_name, obj_data):
        print('Saving Assembly to Workspace')
        sys.stdout.flush()
        obj_info = self.dfu.save_objects({'id': workspace_id,
                                          'objects': [{'type': 'KBaseGenomeAnnotations.Assembly',
                                                       'data': obj_data,
                                                       'name': assembly_name
                                                       }]
                                          })[0]
        return obj_info

    def save_fasta_file_to_shock(self, fasta_file_path):
        ''' Given the path to the file, upload to shock and return Handle information
            returns:
                typedef structure {
                    string shock_id;
                    Handle handle;
                    string node_file_name;
                    string size;
                } FileToShockOutput;
        '''
        print('Uploading FASTA file (' + str(fasta_file_path) + ') to SHOCK')
        sys.stdout.flush()
        return self.dfu.file_to_shock({'file_path': fasta_file_path, 'make_handle': 1})

    def stage_input(self, params):
        ''' Set up the input directory by fetching the file and return the path to it '''
        file_path = None
        if 'file' in params:
            file_path = os.path.abspath(params['file']['path'])
        elif 'shock_id' in params:
            print('Downloading file from SHOCK node: ' + str(params['shock_id']))
            sys.stdout.flush()
            input_directory = os.path.join(self.scratch,
                                           'assembly-upload-staging-' + str(uuid.uuid4()))
            os.makedirs(input_directory)
            file_name = self.dfu.shock_to_file({'file_path': input_directory,
                                                'shock_id': params['shock_id']
                                                })['node_file_name']
            file_path = os.path.join(input_directory, file_name)
        elif 'ftp_url' in params:
            print('Downloading file from: ' + str(params['ftp_url']))
            sys.stdout.flush()
            file_path = self.dfu.download_web_file({'file_url': params['ftp_url'],
                                                    'download_type': 'FTP'
                                                    })['copy_file_path']

        # extract the file if it is compressed
        if file_path is not None:
            unpacked_file = self.dfu.unpack_file({'file_path': file_path})
            return unpacked_file['file_path']

        raise ValueError('No valid FASTA could be extracted based on the input parameters')

    def validate_params(self, params):
        for key in ('workspace_name', 'assembly_name'):
            if key not in params:
                raise ValueError('required "' + key + '" field was not defined')

        # one and only one of 'file', 'shock_id', or 'ftp_url' is required
        input_count = 0
        for key in ('file', 'shock_id', 'ftp_url'):
            if key in params and params[key] is not None:
                input_count = input_count + 1
                if key == 'file':
                    if not isinstance(params[key], dict) or 'path' not in params[key]:
                        raise ValueError('when specifying a FASTA file input, '
                                         '"path" field was not defined in "file"')

        if input_count == 0:
            raise ValueError('required FASTA file as input, set as either '
                             '"file", "shock_id", or "ftp_url"')
        if input_count > 1:
            raise ValueError('required exactly one FASTA file as input source, '
                             'you set more than one of these fields: '
                             '"file", "shock_id", or "ftp_url"')
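
# --- Illustrative usage sketch (not part of the module above) ---------------
# fasta_filter_contigs_generator is a plain generator over Biopython
# SeqRecords and touches no instance state, so it can be sanity-checked
# without any KBase services. The file name below is made up.
#
#     from Bio import SeqIO
#     fta = FastaToAssembly.__new__(FastaToAssembly)  # skip __init__ (no DataFileUtil needed)
#     records = SeqIO.parse('/tmp/example.fa', 'fasta')
#     kept = list(fta.fasta_filter_contigs_generator(records, min_contig_length=500))
#     # prints: " - filtered out N of M contigs that were shorter than 500bp."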
# -*- coding: utf-8 -*-
# NOTE: the original import block and the log() helper were not part of this
# listing; the imports and the minimal log() below are assumptions
# reconstructed from the names used in the class. The two dict.iteritems()
# calls in the original were replaced with .items() for Python 3.
import collections
import fnmatch
import json
import os
import shutil
import subprocess
import time
import uuid
from pprint import pprint

from DataFileUtil.DataFileUtilClient import DataFileUtil  # assumed client path
from ReadsUtils.ReadsUtilsClient import ReadsUtils        # assumed client path
from KBaseReport.KBaseReportClient import KBaseReport     # assumed client path
from kb_uploadmethods.Utils import handler_utils          # assumed helper with _mkdir_p
from kb_uploadmethods.Utils.UploaderUtil import UploaderUtil  # assumed


def log(message, prefix_newline=False):
    """Minimal logging helper (assumed; the original was not shown)."""
    print(('\n' if prefix_newline else '') + str(message))


class ImportSRAUtil:

    SRA_TOOLKIT_PATH = '/kb/deployment/bin/fastq-dump'

    def _run_command(self, command):
        """
        _run_command: run command and print result
        """
        log('Start executing command:\n{}'.format(command))
        pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        output = pipe.communicate()[0]
        exitCode = pipe.returncode

        if exitCode == 0:
            log('Executed command:\n{}\n'.format(command) +
                'Exit Code: {}\nOutput:\n{}'.format(exitCode, output))
        else:
            error_msg = 'Error running command:\n{}\n'.format(command)
            error_msg += 'Exit Code: {}\nOutput:\n{}'.format(exitCode, output)
            raise ValueError(error_msg)

    def _check_fastq_dump_result(self, tmp_dir, sra_name):
        """
        _check_fastq_dump_result: check whether fastq-dump produced
        paired-end (PE) or single-end (SE) output
        """
        return os.path.exists(tmp_dir + '/' + sra_name + '/1')

    def _sra_to_fastq(self, scratch_sra_file_path, params):
        """
        _sra_to_fastq: convert SRA file to FASTQ file(s)
        """
        tmp_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        handler_utils._mkdir_p(tmp_dir)

        command = self.SRA_TOOLKIT_PATH + ' --split-3 -T -O '
        command += tmp_dir + ' ' + scratch_sra_file_path

        self._run_command(command)

        sra_name = os.path.basename(scratch_sra_file_path).partition('.')[0]
        paired_end = self._check_fastq_dump_result(tmp_dir, sra_name)

        if paired_end:
            self._validate_paired_end_advanced_params(params)
            fwd_file = os.path.join(tmp_dir, sra_name, '1', 'fastq')
            os.rename(fwd_file, fwd_file + '.fastq')
            fwd_file = fwd_file + '.fastq'

            rev_file = os.path.join(tmp_dir, sra_name, '2', 'fastq')
            os.rename(rev_file, rev_file + '.fastq')
            rev_file = rev_file + '.fastq'
        else:
            self._validate_single_end_advanced_params(params)
            fwd_file = os.path.join(tmp_dir, sra_name, 'fastq')
            os.rename(fwd_file, fwd_file + '.fastq')
            fwd_file = fwd_file + '.fastq'
            rev_file = None

        fastq_file_path = {'fwd_file': fwd_file,
                           'rev_file': rev_file}
        return fastq_file_path

    def _validate_single_end_advanced_params(self, params):
        """
        _validate_single_end_advanced_params: validate advanced params for single end reads
        """
        if (params.get('insert_size_mean')
                or params.get('insert_size_std_dev')
                or params.get('read_orientation_outward')):
            error_msg = 'Advanced params "Mean Insert Size", "St. Dev. of Insert Size" or '
            error_msg += '"Reads Orientation Outward" are specific to Paired End Reads'
            raise ValueError(error_msg)

        if 'interleaved' in params:
            del params['interleaved']

    def _validate_paired_end_advanced_params(self, params):
        """
        _validate_paired_end_advanced_params: validate advanced params for paired end reads
        """
        sequencing_tech = params.get('sequencing_tech')

        if sequencing_tech in ['PacBio CCS', 'PacBio CLR']:
            error_msg = 'Sequencing Technology: "PacBio CCS" or "PacBio CLR" '
            error_msg += 'is specific to Single End Reads'
            raise ValueError(error_msg)

    def _validate_upload_staging_file_availability(self, staging_file_subdir_path):
        """
        _validate_upload_staging_file_availability: validate file availability
        in user's staging area
        """
        pass
        # TODO ftp_server needs to be fixed for subdir
        # list = ftp_service(self.callback_url).list_files()
        # if staging_file_subdir_path not in list:
        #     error_msg = 'Target file: {} is NOT available.\n'.format(
        #         staging_file_subdir_path.rpartition('/')[-1])
        #     error_msg += 'Available files:\n {}'.format("\n".join(list))
        #     raise ValueError(error_msg)

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.scratch = os.path.join(config['scratch'], 'import_SRA_' + str(uuid.uuid4()))
        handler_utils._mkdir_p(self.scratch)
        self.dfu = DataFileUtil(self.callback_url)
        self.ru = ReadsUtils(self.callback_url)
        self.uploader_utils = UploaderUtil(config)

    def import_sra_from_staging(self, params):
        '''
        import_sra_from_staging: import an SRA file from the user's staging
        area, convert it to FASTQ and upload it as a reads object via
        ReadsUtils.upload_reads

        required params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        sequencing_tech: sequencing technology
        name: output reads file name
        workspace_name: workspace name/ID of the object

        Optional Params:
        single_genome: whether the reads are from a single genome or a metagenome.
        insert_size_mean: mean (average) insert length
        insert_size_std_dev: standard deviation of insert lengths
        read_orientation_outward: whether reads in a pair point outward

        return:
        obj_ref: return object reference
        '''
        log('--->\nrunning ImportSRAUtil.import_sra_from_staging\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_sra_from_staging_params(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_sra_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')
        log('Downloaded staging file to: {}'.format(scratch_sra_file_path))

        fastq_file_path = self._sra_to_fastq(scratch_sra_file_path, params)

        import_sra_reads_params = params
        import_sra_reads_params.update(fastq_file_path)

        workspace_name_or_id = params.get('workspace_name')
        if str(workspace_name_or_id).isdigit():
            import_sra_reads_params['wsid'] = int(workspace_name_or_id)
        else:
            import_sra_reads_params['wsname'] = str(workspace_name_or_id)

        log('--->\nrunning ReadsUtils.upload_reads\nparams:\n{}'.format(
            json.dumps(import_sra_reads_params, indent=1)))
        returnVal = self.ru.upload_reads(import_sra_reads_params)

        # update the workspace-object-related metadata for the staged file
        self.uploader_utils.update_staging_service(
            params.get('staging_file_subdir_path'), returnVal['obj_ref'])
        return returnVal

    def import_sra_from_web(self, params):
        '''
        import_sra_from_web: download SRA files from web URLs, convert them to
        FASTQ and upload them as reads objects via ReadsUtils.upload_reads

        required params:
        download_type: download type for web source FASTQ file
                       ('Direct Download', 'FTP', 'DropBox', 'Google Drive')
        workspace_name: workspace name/ID of the object
        sra_urls_to_add: dict of SRA file URLs
            required params:
            file_url: SRA file URL
            sequencing_tech: sequencing technology
            name: output reads file name

            Optional Params:
            single_genome: whether the reads are from a single genome or a metagenome.
            insert_size_mean: mean (average) insert length
            insert_size_std_dev: standard deviation of insert lengths
            read_orientation_outward: whether reads in a pair point outward

        return:
        obj_ref: return object reference
        '''
        log('--->\nrunning ImportSRAUtil.import_sra_from_web\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_sra_from_web_params(params)

        download_type = params.get('download_type')
        workspace_name = params.get('workspace_name')

        obj_refs = []
        uploaded_files = []

        for sra_url_to_add in params.get('sra_urls_to_add'):
            download_web_file_params = {
                'download_type': download_type,
                'file_url': sra_url_to_add.get('file_url')
            }
            scratch_sra_file_path = self.dfu.download_web_file(
                download_web_file_params).get('copy_file_path')
            log('Downloaded web file to: {}'.format(scratch_sra_file_path))

            fastq_file_path = self._sra_to_fastq(scratch_sra_file_path,
                                                 sra_url_to_add)

            import_sra_reads_params = sra_url_to_add
            import_sra_reads_params.update(fastq_file_path)

            workspace_name_or_id = workspace_name
            if str(workspace_name_or_id).isdigit():
                import_sra_reads_params['wsid'] = int(workspace_name_or_id)
            else:
                import_sra_reads_params['wsname'] = str(workspace_name_or_id)

            log('--->\nrunning ReadsUtils.upload_reads\nparams:\n{}'.format(
                json.dumps(import_sra_reads_params, indent=1)))
            obj_ref = self.ru.upload_reads(import_sra_reads_params).get('obj_ref')
            obj_refs.append(obj_ref)
            uploaded_files.append(sra_url_to_add.get('file_url'))

        return {'obj_refs': obj_refs, 'uploaded_files': uploaded_files}

    def validate_import_sra_from_staging_params(self, params):
        """
        validate_import_sra_from_staging_params: validates params passed to
        import_sra_from_staging method
        """
        # check for required parameters
        for p in ['staging_file_subdir_path', 'sequencing_tech', 'name', 'workspace_name']:
            if p not in params:
                raise ValueError('"' + p + '" parameter is required, but missing')

        self._validate_upload_staging_file_availability(
            params.get('staging_file_subdir_path'))

    def validate_import_sra_from_web_params(self, params):
        """
        validate_import_sra_from_web_params: validates params passed to
        import_sra_from_web method
        """
        # check for required parameters
        for p in ['download_type', 'workspace_name', 'sra_urls_to_add']:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

        if not isinstance(params.get('sra_urls_to_add'), list):
            raise ValueError('sra_urls_to_add is not type list as required')

        for sra_url_to_add in params.get('sra_urls_to_add'):
            for p in ['file_url', 'sequencing_tech', 'name']:
                if p not in sra_url_to_add:
                    raise ValueError('"{}" parameter is required, but missing'.format(p))

    def generate_report(self, obj_refs_list, params):
        """
        generate_report: generate summary report

        obj_refs_list: generated workspace object references
                       (return of import_sra_from_staging/web)
        params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        workspace_name: workspace name/ID that reads will be stored to
        """
        uuid_string = str(uuid.uuid4())

        objects_created = list()
        objects_data = list()

        for obj_ref in obj_refs_list:
            get_objects_params = {
                'object_refs': [obj_ref],
                'ignore_errors': False
            }
            objects_data.append(self.dfu.get_objects(get_objects_params))
            objects_created.append({'ref': obj_ref,
                                    'description': 'Imported Reads'})

        output_html_files = self.generate_html_report(objects_data, params, uuid_string)

        report_params = {
            'message': '',
            'workspace_name': params.get('workspace_name'),
            'objects_created': objects_created,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 460,
            'report_object_name': 'kb_sra_upload_report_' + uuid_string
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'],
                         'report_ref': output['ref']}
        return report_output

    def generate_html_report(self, reads_objs, params, uuid_string):
        """
        generate_html_report: generate html summary report
        """
        log('Start generating html report')
        pprint(params)

        result_file_path = os.path.join(self.scratch, 'report.html')
        html_report = list()
        objects_content = ''

        for index, reads_obj in enumerate(reads_objs):
            idx = str(index)
            reads_data = reads_obj.get('data')[0].get('data')
            reads_info = reads_obj.get('data')[0].get('info')
            reads_ref = str(reads_info[6]) + '/' + str(reads_info[0]) + '/' + str(reads_info[4])
            reads_obj_name = str(reads_info[1])

            with open(os.path.join(os.path.dirname(__file__),
                                   'report_template_sra/table_panel.html'),
                      'r') as object_content_file:
                report_template = object_content_file.read()
                report_template = report_template.replace('_NUM', str(idx))
                report_template = report_template.replace('OBJECT_NAME', reads_obj_name)
                if index == 0:
                    report_template = report_template.replace('panel-collapse collapse',
                                                              'panel-collapse collapse in')
            objects_content += report_template

            base_percentages = ''
            for key, val in reads_data.get('base_percentages').items():
                base_percentages += '{}({}%) '.format(key, val)

            reads_overview_data = collections.OrderedDict()

            reads_overview_data['Name'] = '{} ({})'.format(reads_obj_name, reads_ref)
            reads_overview_data['Uploaded File'] = params.get('uploaded_files')[index]
            reads_overview_data['Date Uploaded'] = time.strftime("%c")
            reads_overview_data['Number of Reads'] = '{:,}'.format(reads_data.get('read_count'))

            reads_type = reads_info[2].lower()
            if 'single' in reads_type:
                reads_overview_data['Type'] = 'Single End'
            elif 'paired' in reads_type:
                reads_overview_data['Type'] = 'Paired End'
            else:
                reads_overview_data['Type'] = 'Unknown'
            reads_overview_data['Platform'] = reads_data.get('sequencing_tech', 'Unknown')

            reads_single_genome = str(reads_data.get('single_genome', 'Unknown'))
            if '0' in reads_single_genome:
                reads_overview_data['Single Genome'] = 'No'
            elif '1' in reads_single_genome:
                reads_overview_data['Single Genome'] = 'Yes'
            else:
                reads_overview_data['Single Genome'] = 'Unknown'

            insert_size_mean = params.get('insert_size_mean', 'Not Specified')
            if insert_size_mean is not None:
                reads_overview_data['Insert Size Mean'] = str(insert_size_mean)
            else:
                reads_overview_data['Insert Size Mean'] = 'Not Specified'

            insert_size_std_dev = params.get('insert_size_std_dev', 'Not Specified')
            if insert_size_std_dev is not None:
                reads_overview_data['Insert Size Std Dev'] = str(insert_size_std_dev)
            else:
                reads_overview_data['Insert Size Std Dev'] = 'Not Specified'

            reads_outward_orientation = str(reads_data.get('read_orientation_outward', 'Unknown'))
            if '0' in reads_outward_orientation:
                reads_overview_data['Outward Read Orientation'] = 'No'
            elif '1' in reads_outward_orientation:
                reads_overview_data['Outward Read Orientation'] = 'Yes'
            else:
                reads_overview_data['Outward Read Orientation'] = 'Unknown'

            reads_stats_data = collections.OrderedDict()

            reads_stats_data['Number of Reads'] = '{:,}'.format(reads_data.get('read_count'))
            reads_stats_data['Total Number of Bases'] = '{:,}'.format(reads_data.get('total_bases'))
            reads_stats_data['Mean Read Length'] = str(reads_data.get('read_length_mean'))
            reads_stats_data['Read Length Std Dev'] = str(reads_data.get('read_length_stdev'))
            dup_reads_percent = '{:.2f}'.format(
                float(reads_data.get('number_of_duplicates') * 100) /
                reads_data.get('read_count'))
            reads_stats_data['Number of Duplicate Reads(%)'] = '{} ({}%)'.format(
                str(reads_data.get('number_of_duplicates')), dup_reads_percent)
            reads_stats_data['Phred Type'] = str(reads_data.get('phred_type'))
            reads_stats_data['Quality Score Mean'] = '{0:.2f}'.format(reads_data.get('qual_mean'))
            reads_stats_data['Quality Score (Min/Max)'] = '{}/{}'.format(
                str(reads_data.get('qual_min')), str(reads_data.get('qual_max')))
            reads_stats_data['GC Percentage'] = str(round(reads_data.get('gc_content') * 100, 2)) + '%'
            reads_stats_data['Base Percentages'] = base_percentages

            overview_content = ''
            for key, val in reads_overview_data.items():
                overview_content += '<tr><td><b>{}</b></td>'.format(key)
                overview_content += '<td>{}</td>'.format(val)
                overview_content += '</tr>'

            stats_content = ''
            for key, val in reads_stats_data.items():
                stats_content += '<tr><td><b>{}</b></td>'.format(key)
                stats_content += '<td>{}</td>'.format(val)
                stats_content += '</tr>'

            objects_content = objects_content.replace('###OVERVIEW_CONTENT###', overview_content)
            objects_content = objects_content.replace('###STATS_CONTENT###', stats_content)

        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(os.path.dirname(__file__),
                                   'report_template_sra/report_head.html'),
                      'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace('###TABLE_PANELS_CONTENT###',
                                                          objects_content)
                result_file.write(report_template)

        shutil.copytree(os.path.join(os.path.dirname(__file__),
                                     'report_template_sra/bootstrap-3.3.7'),
                        os.path.join(self.scratch, 'bootstrap-3.3.7'))
        shutil.copy(os.path.join(os.path.dirname(__file__),
                                 'report_template_sra/jquery-3.2.1.min.js'),
                    os.path.join(self.scratch, 'jquery-3.2.1.min.js'))

        # drop any leftover .gz archives so they are not packed into the report
        matched_files = []
        for root, dirnames, filenames in os.walk(self.scratch):
            for filename in fnmatch.filter(filenames, '*.gz'):
                matched_files.append(os.path.join(root, filename))

        for gz_file in matched_files:
            print('Removing ' + gz_file)
            os.remove(gz_file)

        report_shock_id = self.dfu.file_to_shock({'file_path': self.scratch,
                                                  'pack': 'zip'})['shock_id']

        html_report.append({'shock_id': report_shock_id,
                            'name': os.path.basename(result_file_path),
                            'label': os.path.basename(result_file_path),
                            'description': 'HTML summary report for Imported Reads'})
        return html_report
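
# --- Illustrative usage sketch (not part of the module above) ---------------
# Typical end-to-end flow, as wired up by the generated Impl file. The config
# keys are the ones read in __init__ above; the paths and names are
# placeholders.
#
#     config = {'SDK_CALLBACK_URL': os.environ['SDK_CALLBACK_URL'],
#               'KB_AUTH_TOKEN': os.environ['KB_AUTH_TOKEN'],
#               'scratch': '/kb/module/work/tmp'}
#     importer = ImportSRAUtil(config)
#     result = importer.import_sra_from_staging({
#         'staging_file_subdir_path': 'subdir_1/reads.sra',
#         'sequencing_tech': 'Illumina',
#         'name': 'my_reads',
#         'workspace_name': 'my_workspace'})
#     report = importer.generate_report(
#         [result['obj_ref']],
#         {'workspace_name': 'my_workspace',
#          'uploaded_files': ['subdir_1/reads.sra']})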