def _proc_ws_obj_params(self, ctx, params):
    """
    Check the validity of workspace and object params and return them
    """
    dst_ref = params.get(self.PARAM_IN_DST_REF)

    ws_name_id, obj_name_id = os.path.split(dst_ref)

    if not bool(ws_name_id.strip()) or ws_name_id == '/':
        raise ValueError("Workspace name or id is required in " +
                         self.PARAM_IN_DST_REF)

    if not bool(obj_name_id.strip()):
        raise ValueError("Object name or id is required in " +
                         self.PARAM_IN_DST_REF)

    dfu = DataFileUtil(self.callback_url)

    if not isinstance(ws_name_id, int):
        try:
            ws_name_id = dfu.ws_name_to_id(ws_name_id)
        except DFUError as se:
            prefix = se.message.split('.')[0]
            raise ValueError(prefix)

    self.__LOGGER.info('Obtained workspace name/id ' + str(ws_name_id))

    return ws_name_id, obj_name_id
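# --- Illustrative sketch (not part of the original module) ---
# A minimal example of how the destination reference handled above is split
# into its workspace and object parts; the reference value is hypothetical.
import os

dst_ref = 'my_workspace/my_object'   # hypothetical "<workspace>/<object>" reference
ws_name_id, obj_name_id = os.path.split(dst_ref)
print(ws_name_id)   # 'my_workspace'
print(obj_name_id)  # 'my_object'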
def UploadFromMEME(self, ctx, params):
    """
    :param params: instance of type "UploadGibbsInParams" -> structure:
       parameter "path" of String, parameter "ws_name" of String,
       parameter "obj_name" of String
    :returns: instance of type "UploadOutput" -> structure:
       parameter "obj_ref" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN UploadFromMEME
    print('Extracting motifs')
    motifList = MU.parse_meme_output(params['path'])
    print(motifList)

    MSO = {}
    MSO['Condition'] = 'Temp'
    MSO['SequenceSet_ref'] = '123'
    MSO['Motifs'] = []
    MSO['Alphabet'] = ['A', 'C', 'G', 'T']
    MSO['Background'] = {}
    for letter in MSO['Alphabet']:
        MSO['Background'][letter] = 0.0

    MSU.parseMotifList(motifList, MSO)
    MSU.CheckLength(MSO, params['min_len'], params['max_len'])

    if 'absolute_locations' in params:
        for motif in MSO['Motifs']:
            for loc in motif['Motif_Locations']:
                if loc['sequence_id'] in params['absolute_locations']:
                    loc['sequence_id'] = params['contig']
                    absStart = int(params['start'])
                    loc['start'] = absStart
                    loc['end'] = absStart + loc['end']

    dfu = DataFileUtil(self.callback_url)
    save_objects_params = {}
    save_objects_params['id'] = dfu.ws_name_to_id(params['ws_name'])
    save_objects_params['objects'] = [{'type': 'KBaseGeneRegulation.MotifSet',
                                       'data': MSO,
                                       'name': params['obj_name']}]

    info = dfu.save_objects(save_objects_params)[0]
    print('SAVED OBJECT')
    print(info)
    motif_set_ref = "%s/%s/%s" % (info[6], info[0], info[4])
    print(motif_set_ref)
    output = {'obj_ref': motif_set_ref}
    print(output)
    #END UploadFromMEME

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method UploadFromMEME return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def _save_to_ws_and_report(self, ws_id, source, assembly_data):
    dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
    workspace_id = dfu.ws_name_to_id(self.getWsName())
    print("Workspace id: {}".format(workspace_id))
    info = dfu.save_objects({
        # use the numerical workspace id looked up above rather than a
        # hard-coded value
        'id': workspace_id,
        "objects": [{
            "type": "KBaseGenomeAnnotations.Assembly-3.0",
            "data": assembly_data,
            "name": ws_id
        }]
    })[0]
    # print("Data from save to ws: {}".format(json.dumps(info, indent=2)))
    assembly_ref = "%s/%s/%s" % (info[6], info[0], info[4])
    return assembly_ref
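# --- Illustrative sketch (not part of the original module) ---
# The reference assembled above follows the KBase "ws_id/obj_id/version"
# convention. The tuple below is a hypothetical object_info value in the usual
# workspace order
# (obj_id, name, type, save_date, version, saved_by, ws_id, workspace, chsum, size, meta).
info = (2, 'my_assembly', 'KBaseGenomeAnnotations.Assembly-3.0',
        '2018-01-01T00:00:00+0000', 1, 'someuser', 18590,
        'my_workspace', 'abc123', 12345, {})
assembly_ref = "%s/%s/%s" % (info[6], info[0], info[4])
print(assembly_ref)  # '18590/2/1'  -> workspace id / object id / version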
class FastaToAssembly: def __init__(self, callback_url, scratch): self.scratch = scratch self.dfu = DataFileUtil(callback_url) # Note added X due to kb|g.1886.fasta self.valid_chars = "-ACGTUWSMKRYBDHVNX" self.amino_acid_specific_characters = "PLIFQE" def import_fasta(self, ctx, params): print('validating parameters') self.validate_params(params) print('staging input files') fasta_file_path = self.stage_input(params) if 'min_contig_length' in params: min_contig_length = int(params['min_contig_length']) print('filtering fasta file by contig length (min len=' + str(min_contig_length) + 'bp)') fasta_file_path = self.filter_contigs_by_length(fasta_file_path, min_contig_length) print('parsing FASTA file: ' + str(fasta_file_path)) assembly_data = self.parse_fasta(fasta_file_path, params) print(' - parsed ' + str(assembly_data['num_contigs']) + ' contigs, ' + str(assembly_data['dna_size']) + 'bp') print('saving assembly to KBase') # save file to shock and build handle fasta_file_handle_info = self.save_fasta_file_to_shock(fasta_file_path) # construct the output object assembly_object_to_save = self.build_assembly_object(assembly_data, fasta_file_handle_info, params) # save to WS and return if 'workspace_id' in params: workspace_id = int(params['workspace_id']) else: workspace_id = self.dfu.ws_name_to_id(params['workspace_name']) assembly_info = self.save_assembly_object(workspace_id, params['assembly_name'], assembly_object_to_save) return assembly_info def build_assembly_object(self, assembly_data, fasta_file_handle_info, params): ''' construct the WS object data to save based on the parsed info and params ''' assembly_data['assembly_id'] = params['assembly_name'] assembly_data['fasta_handle_ref'] = fasta_file_handle_info['handle']['hid'] assembly_data['fasta_handle_info'] = fasta_file_handle_info assembly_data['type'] = 'Unknown' if 'type' in params: assembly_data['type'] = params['type'] if 'taxon_ref' in params: assembly_data['taxon_ref'] = params['taxon_ref'] if 'external_source' in params: assembly_data['external_source'] = params['external_source'] if 'external_source_id' in params: assembly_data['external_source_id'] = params['external_source_id'] if 'external_source_origination_date' in params: assembly_data['external_source_origination_date'] = params['external_source_origination_date'] return assembly_data def parse_fasta(self, fasta_file_path, params): ''' Do the actual work of inspecting each contig ''' # variables to store running counts of things total_length = 0 base_counts = {'A': 0, 'G': 0, 'C': 0, 'T': 0} md5_list = [] # map from contig_id to contig_info all_contig_data = {} extra_contig_info = {} if'contig_info' in params: extra_contig_info = params['contig_info'] for record in SeqIO.parse(fasta_file_path, "fasta"): # SeqRecord(seq=Seq('TTAT...', SingleLetterAlphabet()), # id='gi|113968346|ref|NC_008321.1|', # name='gi|113968346|ref|NC_008321.1|', # description='gi|113968346|ref|NC_008321.1| Shewanella sp. 
MR-4 chromosome, complete genome', # dbxrefs=[]) sequence = str(record.seq).upper() contig_info = { 'contig_id': record.id, 'name': record.id, 'description': record.description[len(record.id):].strip(), 'length': len(record.seq) } # 1) compute sequence character statistics running total total_length += contig_info['length'] sequence_count_table = dict(Counter(sequence)) for character in sequence_count_table: if character in base_counts: base_counts[character] = base_counts[character] + sequence_count_table[character] else: base_counts[character] = sequence_count_table[character] if character not in self.valid_chars: if character in self.amino_acid_specific_characters: raise ValueError('This fasta file may have amino acids in it instead ' + 'of the required nucleotides.') raise ValueError("This FASTA file has non nucleic acid characters : {0}".format(character)) # 2) record number of 'N' characters (only set if there are some) Ncount = 0 if 'N' in sequence_count_table: Ncount = sequence_count_table['N'] contig_info['Ncount'] = Ncount # 2b) record if the contig is circular if record.id in extra_contig_info: if 'is_circ' in extra_contig_info[record.id]: contig_info['is_circ'] = int(extra_contig_info[record.id]['is_circ']) if 'description' in extra_contig_info[record.id]: contig_info['description'] = str(extra_contig_info[record.id]['description']) # 3) record md5 checksum contig_md5 = md5(sequence).hexdigest() contig_info['md5'] = contig_md5 md5_list.append(contig_md5) # 4) record the all important GC to ~3 significant digits GC_count = 0 for base in ['G', 'C']: if base in sequence_count_table: GC_count += sequence_count_table[base] contig_info['gc_content'] = round(float(GC_count) / float(contig_info['length']), 5) # 5) add to contig list if contig_info['contig_id'] in all_contig_data: raise ValueError('The fasta header key ' + contig_info['contig_id'] + 'appears more than once in the file') all_contig_data[contig_info['contig_id']] = contig_info # Aggregate stats for the data total_gc_content = None if total_length > 0: total_gc_content = round(float(base_counts['G'] + base_counts['C']) / float(total_length), 5) assembly_data = { 'md5': md5(",".join(sorted(md5_list))).hexdigest(), 'base_counts': base_counts, 'dna_size': total_length, 'gc_content': total_gc_content, 'contigs': all_contig_data, 'num_contigs': len(all_contig_data) } return assembly_data def fasta_filter_contigs_generator(self, fasta_record_iter, min_contig_length): ''' generates SeqRecords iterator for writing from a legacy contigset object ''' rows = 0 rows_added = 0 for record in fasta_record_iter: rows += 1 if len(record.seq) >= min_contig_length: rows_added += 1 yield record print(' - filtered out ' + str(rows - rows_added) + ' of ' + str(rows) + ' contigs that were shorter than ' + str(min_contig_length) + 'bp.') def filter_contigs_by_length(self, fasta_file_path, min_contig_length): ''' removes all contigs less than the min_contig_length provided ''' filtered_fasta_file_path = fasta_file_path + '.filtered.fa' fasta_record_iter = SeqIO.parse(fasta_file_path, 'fasta') SeqIO.write(self.fasta_filter_contigs_generator(fasta_record_iter, min_contig_length), filtered_fasta_file_path, 'fasta') return filtered_fasta_file_path def save_assembly_object(self, workspace_id, assembly_name, obj_data): print('Saving Assembly to Workspace') sys.stdout.flush() obj_info = self.dfu.save_objects({'id': workspace_id, 'objects': [{'type': 'KBaseGenomeAnnotations.Assembly', 'data': obj_data, 'name': assembly_name }] })[0] return obj_info def 
save_fasta_file_to_shock(self, fasta_file_path): ''' Given the path to the file, upload to shock and return Handle information returns: typedef structure { string shock_id; Handle handle; string node_file_name; string size; } FileToShockOutput; ''' print('Uploading fasta file (' + str(fasta_file_path) + ') to SHOCK') sys.stdout.flush() return self.dfu.file_to_shock({'file_path': fasta_file_path, 'make_handle': 1}) def stage_input(self, params): ''' Setup the input_directory by fetching the files and returning the path to the file''' file_path = None if 'file' in params: file_path = os.path.abspath(params['file']['path']) elif 'shock_id' in params: print('Downloading file from SHOCK node: ' + str(params['shock_id'])) sys.stdout.flush() input_directory = os.path.join(self.scratch, 'assembly-upload-staging-' + str(uuid.uuid4())) os.makedirs(input_directory) file_name = self.dfu.shock_to_file({'file_path': input_directory, 'shock_id': params['shock_id'] })['node_file_name'] file_path = os.path.join(input_directory, file_name) elif 'ftp_url' in params: print('Downloading file from: ' + str(params['ftp_url'])) sys.stdout.flush() file_path = self.dfu.download_web_file({'file_url': params['ftp_url'], 'download_type': 'FTP' })['copy_file_path'] # extract the file if it is compressed if file_path is not None: unpacked_file = self.dfu.unpack_file({'file_path': file_path}) return unpacked_file['file_path'] raise ValueError('No valid fasta could be extracted based on the input parameters') def validate_params(self, params): for key in ('workspace_name', 'assembly_name'): if key not in params: raise ValueError('required "' + key + '" field was not defined') # one and only one of either 'file', 'shock_id', or ftp_url is required input_count = 0 for key in ('file', 'shock_id', 'ftp_url'): if key in params and params[key] is not None: input_count = input_count + 1 if key == 'file': if not isinstance(params[key], dict) or 'path' not in params[key]: raise ValueError('when specifying a fasta file input, "path" field was not defined in "file"') if input_count == 0: raise ValueError('required fasta file as input, set as either "file", "shock_id", or "ftp_url"') if input_count > 1: raise ValueError('required exactly one fasta file as input source, you set more than one of ' + 'these fields: "file", "shock_id", or "ftp_url"')
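# --- Illustrative sketch (not part of the original module) ---
# Hypothetical driver code for FastaToAssembly.import_fasta. The callback URL,
# scratch directory, and file path are placeholders; exactly one of
# 'file', 'shock_id' or 'ftp_url' may be supplied.
importer = FastaToAssembly(callback_url='http://localhost:5000',
                           scratch='/kb/module/work/tmp')
params = {
    'workspace_name': 'my_workspace',                     # or 'workspace_id': <int>
    'assembly_name': 'my_assembly',
    'file': {'path': '/kb/module/work/tmp/contigs.fa'},
    'min_contig_length': 500,                             # optional length filter
}
assembly_info = importer.import_fasta(ctx=None, params=params)
print(assembly_info)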
class ExprMatrixUtils: """ Constains a set of functions for expression levels calculations. """ PARAM_IN_WS_NAME = 'workspace_name' PARAM_IN_OBJ_NAME = 'output_obj_name' PARAM_IN_EXPSET_REF = 'expressionset_ref' def __init__(self, config, logger=None): self.config = config self.logger = logger self.callback_url = os.environ['SDK_CALLBACK_URL'] self.scratch = os.path.join(config['scratch'], 'EM_' + str(uuid.uuid4())) self.ws_url = config['workspace-url'] self.ws_client = Workspace(self.ws_url) self.dfu = DataFileUtil(self.callback_url) pass def process_params(self, params): """ validates params passed to gen expression matrix method """ for p in [self.PARAM_IN_EXPSET_REF, self.PARAM_IN_OBJ_NAME, self.PARAM_IN_WS_NAME ]: if p not in params: raise ValueError('"{}" parameter is required, but missing'.format(p)) ws_name_id = params.get(self.PARAM_IN_WS_NAME) if not isinstance(ws_name_id, int): try: ws_name_id = self.dfu.ws_name_to_id(ws_name_id) except DFUError as se: prefix = se.message.split('.')[0] raise ValueError(prefix) self.ws_id = ws_name_id def get_expressionset_data(self, expressionset_ref): expr_set_obj = self.ws_client.get_objects2( {'objects': [{'ref': expressionset_ref}]})['data'][0] expr_set_obj_type = expr_set_obj.get('info')[2] expr_set_data = dict() expr_set_data['ws_name'] = expr_set_obj.get('info')[7] expr_set_data['obj_name'] = expr_set_obj.get('info')[1] if re.match('KBaseRNASeq.RNASeqExpressionSet-\d.\d', expr_set_obj_type): expr_set_data['genome_ref'] = expr_set_obj['data']['genome_id'] expr_obj_refs = list() for expr_obj in expr_set_obj['data']['mapped_expression_ids']: expr_obj_refs.append(expr_obj.values()[0]) expr_set_data['expr_obj_refs'] = expr_obj_refs elif re.match('KBaseSets.ExpressionSet-\d.\d', expr_set_obj_type): items = expr_set_obj.get('data').get('items') expr_obj_refs = list() for item in items: expr_obj_refs.append(item['ref']) expr_obj = self.ws_client.get_objects2( {'objects': [{'ref': expr_obj_refs[0]}]})['data'][0] expr_set_data['genome_ref'] = expr_obj['data']['genome_id'] expr_set_data['expr_obj_refs'] = expr_obj_refs else: raise TypeError(self.PARAM_IN_EXPSET_REF + ' should be of type ' + 'KBaseRNASeq.RNASeqExpressionSet ' + 'or KBaseSets.ExpressionSet') return expr_set_data def save_expression_matrix(self, tables, expr_set_data, em_obj_name, hidden = 0): all_rows = {} # build a dictionary of keys only which is a union of all row ids (gene_ids) self.logger.info( '***** length of tables is {0}'.format( len( tables ))) for table in tables: for r in table.keys(): all_rows[r] = [] for gene_id in all_rows.keys(): row = [] for table in tables: if ( gene_id in table ): #logger.info( 'append ' + gene_id ) #logger.info( pformat( table[gene_id])) #all_rows[gene_id].append( table[gene_id] ) row.append( table[gene_id] ) else: #logger.info( 'append 0' ) row.append( 0 ) all_rows[gene_id] = row #logger.info( all_rows[gene_id]) em_data = { 'genome_ref': expr_set_data['genome_ref'], 'scale': 'log2', 'type': 'level', 'data': { 'row_ids': [], 'values': [], 'col_ids': expr_set_data['expr_obj_names'] }, 'feature_mapping' : {}, 'condition_mapping': expr_set_data['condition_map'] } # we need to load row-by-row to preserve the order self.logger.info('loading expression matrix data') for gene_id in all_rows.keys(): em_data['feature_mapping'][gene_id] = gene_id em_data['data']['row_ids'].append(gene_id) em_data['data']['values'].append(all_rows[gene_id]) try: self.logger.info( 'saving em_data em_name {0}'.format(em_obj_name)) obj_info = self.dfu.save_objects({'id': 
self.ws_id, 'objects': [ { 'type': 'KBaseFeatureValues.ExpressionMatrix', 'data': em_data, 'name': em_obj_name, 'hidden': hidden, 'extra_provenance_input_refs': [ em_data.get('genome_ref'), self.params[self.PARAM_IN_EXPSET_REF]] } ]})[0] self.logger.info('ws save return:\n' + pformat(obj_info)) except Exception as e: self.logger.exception(e) raise Exception('Failed Saving Expression Matrix to Workspace') return str(obj_info[6]) + '/' + str(obj_info[0]) + '/' + str(obj_info[4]) def get_expression_matrix(self, params): self.process_params(params) self.params = params expressionset_ref = params.get(self.PARAM_IN_EXPSET_REF) expr_set_data = self.get_expressionset_data(expressionset_ref) expr_obj_names = list() fpkm_tables = list() tpm_tables = list() condition_map = dict() tpm_table = None for expr_obj_ref in expr_set_data['expr_obj_refs']: try: self.logger.info('*** getting expression set {0} from workspace ****' .format(expr_obj_ref)) expr = self.ws_client.get_objects2( {'objects': [{'ref': expr_obj_ref}]})['data'][0] except Exception, e: self.logger.exception(e) raise Exception('Unable to download expression object {0} from workspace {1}'. format(expr_obj_ref, expr_set_data['ws_name'])) expr_name = expr.get('info')[1] expr_obj_names.append(expr_name) condition_map.update({expr_name: expr.get('data').get('condition')}) num_interp = expr.get('data').get('numerical_interpretation') if num_interp != 'FPKM': raise Exception( 'Did not get expected FPKM value from numerical interpretation key from \ Expression object {0}, instead got '.format(expr_obj_ref, num_interp)) pr_comments = expr.get('data').get('processing_comments', None) # log2 Normalized if pr_comments is not None: self.logger.info('pr_comments are {0}'.format(pr_comments)) fpkm_table = expr.get('data').get('expression_levels') # QUESTION: is this really FPKM levels? self.logger.info('FPKM keycount: {0}'.format(len(fpkm_table.keys()))) fpkm_tables.append(fpkm_table) tpm_table = None # Cufflinks doesn't generate TPM if 'tpm_expression_levels' in expr['data']: # so we need to check for this key tpm_table = expr.get('data').get('tpm_expression_levels') self.logger.info('TPM keycount: {0}'.format(len(tpm_table.keys()))) tpm_tables.append(tpm_table) expr_set_data['expr_obj_names'] = expr_obj_names expr_set_data['condition_map'] = condition_map output_obj_name = params.get(self.PARAM_IN_OBJ_NAME) fpkm_ref = self.save_expression_matrix(fpkm_tables, expr_set_data, '{0}_FPKM_ExpressionMatrix'.format(output_obj_name)) tpm_ref = None if tpm_table is not None: tpm_ref = self.save_expression_matrix(tpm_tables, expr_set_data, '{0}_TPM_ExpressionMatrix'.format(output_obj_name)) return fpkm_ref, tpm_ref
def _proc_upload_reads_params(self, ctx, params):
    fwdid = params.get('fwd_id')
    if not fwdid:
        raise ValueError('No reads file provided')
    wsid = params.get('wsid')
    wsname = params.get('wsname')
    if not self.xor(wsid, wsname):
        raise ValueError(
            'Exactly one of the workspace ID or name must be provided')
    dfu = DataFileUtil(self.callback_url, token=ctx['token'])
    if wsname:
        self.log('Translating workspace name to id')
        if not isinstance(wsname, six.string_types):
            raise ValueError('wsname must be a string')
        wsid = dfu.ws_name_to_id(wsname)
        self.log('translation done')
    del wsname
    objid = params.get('objid')
    name = params.get('name')
    if not self.xor(objid, name):
        raise ValueError(
            'Exactly one of the object ID or name must be provided')
    revid = params.get('rev_id')
    interleaved = 1 if params.get('interleaved') else 0
    kbtype = 'KBaseFile.SingleEndLibrary'
    single_end = True
    if interleaved or revid:
        kbtype = 'KBaseFile.PairedEndLibrary'
        single_end = False
    if revid:
        interleaved = 0

    seqtype = params.get('sequencing_tech')
    if not seqtype:
        raise ValueError('The sequencing technology must be provided')

    sg = 1
    if 'single_genome' in params and not params['single_genome']:
        sg = 0
    o = {'sequencing_tech': seqtype,
         'single_genome': sg,
         # 'read_count': params.get('read_count'),
         # 'read_size': params.get('read_size'),
         # 'gc_content': params.get('gc_content')
         }
    self._add_field(o, params, 'strain')
    self._add_field(o, params, 'source')
    ism = params.get('insert_size_mean')
    self._check_pos(ism, 'insert_size_mean')
    issd = params.get('insert_size_std_dev')
    self._check_pos(issd, 'insert_size_std_dev')
    if not single_end:
        o.update({'insert_size_mean': ism,
                  'insert_size_std_dev': issd,
                  'interleaved': interleaved,
                  'read_orientation_outward': 1 if params.get(
                      'read_orientation_outward') else 0
                  })
    return o, wsid, name, objid, kbtype, single_end, fwdid, revid
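# --- Illustrative sketch (not part of the original module) ---
# A hypothetical parameter set for the helper above. Supplying 'rev_id'
# (or 'interleaved') makes the target type KBaseFile.PairedEndLibrary;
# all ids and values are placeholders.
params = {
    'fwd_id': 'shock-node-id-forward',
    'rev_id': 'shock-node-id-reverse',
    'wsname': 'my_workspace',          # exactly one of wsid/wsname
    'name': 'my_reads',                # exactly one of objid/name
    'sequencing_tech': 'Illumina',
    'single_genome': 1,
    'insert_size_mean': 450,
    'insert_size_std_dev': 50,
}
# With these inputs the helper returns kbtype == 'KBaseFile.PairedEndLibrary'
# and single_end == False.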
def upload_html_set(self, ctx, params): """ Upload an HTML file set to the KBase data stores. :param params: instance of type "UploadHTMLSetInput" (Input to the upload_html_set function. Required arguments: One of: wsid - the id of the workspace where the reads will be saved (preferred). wsname - the name of the workspace where the reads will be saved. One of: objid - the id of the workspace object to save over name - the name to which the workspace object will be saved path - the path to the directory with the HTML files. This directory will be compressed and loaded into the KBase stores.) -> structure: parameter "wsid" of Long, parameter "wsname" of String, parameter "objid" of Long, parameter "name" of String, parameter "path" of String :returns: instance of type "UploadHTMLSetOutput" (Output of the upload_html_set function. obj_ref - a reference to the new Workspace object in the form X/Y/Z, where X is the workspace ID, Y is the object ID, and Z is the version.) -> structure: parameter "obj_ref" of String """ # ctx is the context object # return variables are: out #BEGIN upload_html_set del ctx wsid = params.get('wsid') wsname = params.get('wsname') if not self.xor(wsid, wsname): raise ValueError( 'Exactly one of the workspace ID or name must be provided') dfu = DataFileUtil(self.callback_url) if wsname: self.log('Translating workspace name to id') if not isinstance(wsname, six.string_types): raise ValueError('wsname must be a string') wsid = dfu.ws_name_to_id(wsname) self.log('translation done') del wsname objid = params.get('objid') name = params.get('name') if not self.xor(objid, name): raise ValueError( 'Exactly one of the object ID or name must be provided') htmlpath = params.get('path') if not htmlpath: raise ValueError('path parameter is required') htmlpath = os.path.abspath(os.path.expanduser(htmlpath)) if not os.path.isdir(htmlpath): raise ValueError('path must be a directory') zipfile = dfu.pack_file({ 'file_path': htmlpath, 'pack': 'zip' })['file_path'] if os.path.getsize(zipfile) > self.MAX_ZIP_SIZE: os.remove(zipfile) raise ValueError('Zipfile from specified directory is greater ' + 'than maximum size allowed: ' + str(self.MAX_ZIP_SIZE)) fh, tf = tempfile.mkstemp(dir=self.scratch) os.close(fh) with open(tf, 'w') as objfile, open(zipfile, 'rb') as z: objfile.write('{"file":"') d = z.read(self.CHUNKSIZE) while d: objfile.write(base64.b64encode(d)) d = z.read(self.CHUNKSIZE) objfile.write('"}') os.remove(zipfile) so = { 'type': 'HTMLFileSetUtils.HTMLFileSet-0.1', # TODO release 'data_json_file': tf } if name: so['name'] = name else: so['objid'] = objid wsio = WsLargeDataIO(self.callback_url, service_ver='dev') # TODO remove dev @IgnorePep8 ret = wsio.save_objects({'id': wsid, 'objects': [so]})[0] os.remove(tf) out = {'obj_ref': str(ret[6]) + '/' + str(ret[0]) + '/' + str(ret[4])} #END upload_html_set # At some point might do deeper type checking... if not isinstance(out, dict): raise ValueError('Method upload_html_set return value ' + 'out is not type dict as required.') # return the results return [out]
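# --- Illustrative sketch (not part of the original module) ---
# The chunked base64 embedding above avoids holding the whole zip file in
# memory. A standalone Python 2 sketch of the same idea; note that the chunk
# size must be a multiple of 3, otherwise each chunk's encoding would carry
# its own '=' padding and the concatenated output would not be valid base64.
import base64

CHUNK = 3 * 65536  # multiple of 3 so per-chunk encodings concatenate cleanly

def write_file_as_base64_json(src_path, dst_path):
    """Stream src_path into dst_path as {"file": "<base64 data>"}."""
    with open(dst_path, 'w') as out, open(src_path, 'rb') as src:
        out.write('{"file":"')
        chunk = src.read(CHUNK)
        while chunk:
            out.write(base64.b64encode(chunk))
            chunk = src.read(CHUNK)
        out.write('"}')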
def find_motifs(self, ctx, params): """ :param params: instance of type "get_promoter_for_gene_input" (Genome is a KBase genome Featureset is a KBase featureset Promoter_length is the length of promoter requested for all genes) -> structure: parameter "workspace_name" of String, parameter "genome_ref" of String, parameter "featureSet_ref" of String, parameter "promoter_length" of Long :returns: instance of type "get_promoter_for_gene_output_params" -> structure: parameter "report_name" of String, parameter "report_ref" of String """ # ctx is the context object # return variables are: output #BEGIN find_motifs #TODO: have these guys return output paths for key, value in params.iteritems(): print key if 'motif_min_length' not in params: params['motif_min_length'] = 8 if 'motif_max_length' not in params: params['motif_max_length'] = 16 motMin = params['motif_min_length'] motMax = params['motif_max_length'] promoterFastaFilePath = self.get_promoter_for_gene(ctx, params)[0] gibbsCommandList = [] for i in range(motMin, motMax + 1, 2): gibbsCommandList.append( GU.build_gibbs_command(promoterFastaFilePath, i)) for g in gibbsCommandList: GU.run_gibbs_command(g) #gibbsCommand = GU.build_gibbs_command(promoterFastaFilePath) #GU.run_gibbs_command(gibbsCommand) #print(promoterFastaFilePath) homerMotifCommand = HU.build_homer_motif_command(promoterFastaFilePath) homerLocationCommand = HU.build_homer_location_command( promoterFastaFilePath) os.mkdir(self.shared_folder + '/homer_out') #print(homerMotifCommand) HU.run_homer_command(homerMotifCommand) HU.run_homer_command(homerLocationCommand) MEMEMotifCommand = MEU.build_meme_command(promoterFastaFilePath) MEU.run_meme_command(MEMEMotifCommand) gibbsMotifList = GU.parse_gibbs_output(motMin, motMax) homerMotifList = HU.parse_homer_output() memeMotifList = MEU.parse_meme_output() timestamp = int( (datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds() * 1000) timestamp = str(timestamp) htmlDir = self.shared_folder + '/html' + timestamp os.mkdir(htmlDir) lineCount = 0 with open(promoterFastaFilePath, 'r') as pFile: for line in pFile: lineCount += 1 numFeat = lineCount / 2 with open(promoterFastaFilePath, 'r') as pFile: fileStr = pFile.read() promHtmlStr = '<html><body> ' + fileStr + ' </body></html>' with open(htmlDir + '/promoters.html', 'w') as promHTML: promHTML.write(promHtmlStr) JsonPath = '/kb/module/work/tmp' subprocess.call([ 'python', '/kb/module/lib/identify_promoter/Utils/makeReport.py', JsonPath + '/gibbs.json', htmlDir + '/gibbs.html', str(numFeat) ]) subprocess.call([ 'python', '/kb/module/lib/identify_promoter/Utils/makeReport.py', JsonPath + '/homer_out/homer.json', htmlDir + '/homer.html', str(numFeat) ]) subprocess.call([ 'python', '/kb/module/lib/identify_promoter/Utils/makeReport.py', JsonPath + '/meme_out/meme.json', htmlDir + '/meme.html', str(numFeat) ]) fullMotifList = [] for h in homerMotifList: add = True for g in gibbsMotifList: if h['Iupac_signature'] == g['Iupac_signature']: add = False break for m in memeMotifList: if m['Iupac_signature'] == h['Iupac_signature']: add = False break if add: fullMotifList.append(h) for g in gibbsMotifList: add = True for m in memeMotifList: if m['Iupac_signature'] == g['Iupac_signature']: add = False break if add: fullMotifList.append(g) for m in memeMotifList: fullMotifList.append(m) #What needs to happen here: #call makeLogo for each of the json outputs(capture these from somewhere) dfu = DataFileUtil(self.callback_url) parsed = ['gibbs.html', 'homer.html', 'meme.html', 
'promoters.html'] indexHtmlStr = '<html>' #use js to load the page content for p in parsed: indexHtmlStr += '<head><script src="https://ajax.googleapis.com/ajax/libs/jquery/2.0.0/jquery.min.js"></script> <script> $(function(){$("#' + p.replace( '.html', '_content') + '").load("' + p + '"); });</script> ' indexHtmlStr += """<style> body {font-family: Arial;} /* Style the tab */ .tab { overflow: hidden; border: 1px solid #ccc; background-color: #f1f1f1; } /* Style the buttons inside the tab */ .tab button { background-color: inherit; float: left; border: none; outline: none; cursor: pointer; padding: 14px 16px; transition: 0.3s; font-size: 17px; } /* Change background color of buttons on hover */ .tab button:hover { background-color: #ddd; } /* Create an active/current tablink class */ .tab button.active { background-color: #ccc; } /* Style the tab content */ .tabcontent { display: none; padding: 6px 12px; border: 1px solid #ccc; border-top: none; } </style></head> """ indexHtmlStr += '<body>' #adding tabs indexHtmlStr += '<div class="tab">\n' for p in parsed: indexHtmlStr += '<button class="tablinks" onclick="openReport(event, \'' + p.replace( '.html', '_content') + '\')">' + p.replace('.html', '') + '</button>' indexHtmlStr += '</div>' for p in parsed: indexHtmlStr += '<div id="' + p.replace( '.html', '_content') + '" class="tabcontent"></div>' indexHtmlStr += """<script> function openReport(evt, reportName) { var i, tabcontent, tablinks; tabcontent = document.getElementsByClassName("tabcontent"); for (i = 0; i < tabcontent.length; i++) { tabcontent[i].style.display = "none"; } tablinks = document.getElementsByClassName("tablinks"); for (i = 0; i < tablinks.length; i++) { tablinks[i].className = tablinks[i].className.replace(" active", ""); } document.getElementById(reportName).style.display = "block"; evt.currentTarget.className += " active"; } </script>""" #for p in parsed: # indexHtmlStr += '<a href="' + p + '">' + p.replace('.html','') +' Output</a>\n' #indexHtmlStr += '</body></html>' with open(htmlDir + '/index.html', 'w') as html_handle: html_handle.write(str(indexHtmlStr)) #plt.rcParams['figure.dpi'] = 300 #htmlFiles = ['index.html','gibbs.html','homer.html'] #shockParamsList = [] #for f in htmlFiles: # shockParamsList.append({'file_path': htmlDir + f ,'make_handle': 0, 'pack': 'zip'}) try: html_upload_ret = dfu.file_to_shock({ 'file_path': htmlDir, 'make_handle': 0, 'pack': 'zip' }) except: raise ValueError('error uploading HTML file to shock') #Create motif set object from MotifList #TODO set parameters correctly #add narrative support to set MSO = {} MSO['Condition'] = 'Temp' MSO['FeatureSet_ref'] = '123' MSO['Motifs'] = [] MSO['Alphabet'] = ['A', 'C', 'G', 'T'] MSO['Background'] = {} for letter in MSO['Alphabet']: MSO['Background'][letter] = 0.0 MSU.parseMotifList(fullMotifList, MSO) objname = 'MotifSet' + str( int((datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds() * 1000)) #Pass motif set into this save_objects_params = {} #save_objects_params['id'] = self.ws_info[0] #save_objects_params['id'] = long(params['workspace_name'].split('_')[1]) save_objects_params['id'] = dfu.ws_name_to_id(params['workspace_name']) save_objects_params['objects'] = [{ 'type': 'KBaseGwasData.MotifSet', 'data': MSO, 'name': objname }] info = dfu.save_objects(save_objects_params)[0] motif_set_ref = "%s/%s/%s" % (info[6], info[0], info[4]) #object_upload_ret = dfu.file_to_shock() reportName = 'identify_promoter_report_' + str(uuid.uuid4()) reportObj = { 'objects_created': [{ 'ref': 
motif_set_ref, 'description': 'Motif Set generated by identify promoter' }], 'message': '', 'direct_html': None, 'direct_html_index': 0, 'file_links': [], 'html_links': [], 'html_window_height': 220, 'workspace_name': params['workspace_name'], 'report_object_name': reportName } # attach to report obj #reportObj['direct_html'] = None reportObj['direct_html'] = '' reportObj['direct_html_link_index'] = 0 reportObj['html_links'] = [{ 'shock_id': html_upload_ret['shock_id'], #'name': 'promoter_download.zip', 'name': 'index.html', 'label': 'Save promoter_download.zip' }] report = KBaseReport(self.callback_url, token=ctx['token']) #report_info = report.create({'report':reportObj, 'workspace_name':input_params['input_ws']}) report_info = report.create_extended_report(reportObj) output = { 'report_name': report_info['name'], 'report_ref': report_info['ref'] } #END find_motifs # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError('Method find_motifs return value ' + 'output is not type dict as required.') # return the results return [output]
class variation_importer_utils: def __init__(self, utility_params): self.params = utility_params # self.scratch = utility_params['scratch'] self.scratch = os.path.join(utility_params['scratch'], 'variation_importer_' + str(uuid.uuid4())) os.mkdir(self.scratch) self.service_wiz_url = utility_params['srv-wiz-url'] self.callback_url = utility_params['callback_url'] self.dfu = DataFileUtil(self.callback_url) self.kbr = KBaseReport(self.callback_url, token=utility_params['token']) def _create_fake_location_data(self): location = { 'lat': random.uniform(-90, 90), 'lon': random.uniform(-180, 180), 'elevation': random.uniform(0, 100), 'description': "".join([random.choice(string.ascii_letters) for n in xrange(20)]) } return location def _create_fake_straininfo(self, genotype_id): straininfo = { 'source_id': genotype_id, 'location_info': self._create_fake_location_data() } return straininfo def _create_fake_population(self, genotypes): population = {'description': 'Faker population data.', 'strains': []} for genome in genotypes: population['strains'].append(self._create_fake_straininfo(genome)) return population def _create_fake_kinship_matrix(self): kinship = { 'row_ids': ['one', 'two'], 'col_ids': ['one', 'two'], 'kinship_coefficients': [[0.1, 0.1], [0.1, 0.1]] } return kinship def _compare(self, s, t): return Counter(s) == Counter(t) def pretend_download_staging_file(self, vcf_filename, scratch): vcf_filepath = os.path.join(scratch, vcf_filename) shutil.copy('/kb/module/data/' + vcf_filename, vcf_filepath) return {'copy_file_path': vcf_filepath} def _generate_population(self, location_filepath, genotypes, population_description="None Provided"): locations = pd.read_csv(location_filepath, delimiter='\t') # Drop any missing data from id, latitude, or longitude. locations.dropna(subset=['id', 'latitude', 'longitude'], inplace=True) # Compare the location IDs with the genotype IDs if not (self._compare(locations.iloc[:, 0].astype(str).tolist(), genotypes)): log("Location IDs do not match Sample IDs in Variation file!") raise ValueError( "Location IDs do not match Sample IDs in Variation file!") col_names = [x.lower() for x in locations.columns.values] expected_columns = ['id', 'latitude', 'longitude'] optional_columns = ['elevation', 'description'] # CHeck that first three columns match the expected columns. if not (self._compare(col_names[0:3], expected_columns)): raise ValueError("Missing or unexpected column names in {}".format( location_filepath)) # If optional columns are not present, give default value for each. for col in optional_columns: if col not in col_names: if col == 'elevation': locations[col] = 0.0 else: locations[col] = "None provided." 
population = {'description': population_description, 'strains': []} for idx, row in locations.iterrows(): population['strains'].append({ 'source_id': str(row['id']), 'location_info': { 'lat': row['latitude'], 'lon': row['longitude'], 'elevation': row['elevation'], 'description': row['description'] } }) return population def _validate_vcf(self, vcf_filepath, vcf_version): validation_output_dir = os.path.join(self.scratch, 'validation_' + str(uuid.uuid4())) os.mkdir(validation_output_dir) if vcf_version >= 4.1: print("Using vcf_validator_linux...") validator_cmd = ["vcf_validator_linux"] validator_cmd.append("-i") validator_cmd.append(vcf_filepath) validator_cmd.append("-o") validator_cmd.append(validation_output_dir) else: print("Using vcftools to validate...") validator_cmd = ["vcf-validator"] validator_cmd.append(vcf_filepath) print("VCF version below 4.1. No validation logging.") print("Validator command: {}".format(validator_cmd)) p = subprocess.Popen(validator_cmd, cwd=self.scratch, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=False) validator_output = [] while True: line = p.stdout.readline() if not line: break validator_output.append(line) p.wait() validation_output_filename = [ f for f in os.listdir(validation_output_dir) if f.endswith('.txt') ][0] validation_output_filepath = os.path.join(validation_output_dir, validation_output_filename) if not validation_output_filename: print('Validator did not generate log file!') raise Exception("Validator did not generate a log file.") log("Validator output filepath: {}".format(validation_output_filepath)) log("Return code from validator {}".format(p.returncode)) return validation_output_filepath, p.returncode # Retrieve contigs from assembly file. def _get_contigs_from_assembly(self, assembly_ref, type='Assembly'): try: assembly_data = self.dfu.get_objects( {'object_refs': [assembly_ref]})['data'][0]['data'] except Exception as e: print("Unable to retrieve Assembly reference: {}".format( assembly_ref)) raise ValueError(e) raw_contigs = assembly_data['contigs'] contigs = {} # Contigs returns just a dict with key and contig_id for key, value in raw_contigs.iteritems(): contigs[str(key)] = value['contig_id'] return raw_contigs def _get_version_contigs_genotypes(self, vcf_filepath): contigs = [] genotypes = [] version = '' with (gzip.open if vcf_filepath.endswith('.gz') else open)( vcf_filepath, 'rt') as vcf: line = vcf.readline() tokens = line.split('=') if not (tokens[0].startswith('##fileformat')): log("Invalid VCF. ##fileformat line in meta is improperly formatted." ) raise ValueError( "Invalid VCF. ##fileformat line in meta is improperly formatted." 
) version = float(tokens[1][-4:].rstrip()) log("VCF version: {}".format(version)) for line in vcf: if line.startswith("#CHROM"): log("#CHROM encountered, exiting loop.") genotypes = line.split()[9:] log("Number Genotypes in vcf: {}".format(len(genotypes))) break tokens = line.split("=") if tokens[0].startswith('##contig'): contigs.append(tokens[2][:-2]) return version, contigs, genotypes # Arabidopsis ref: 18590/2/8 def _get_assembly_ref_from_genome(self, genome_ref): ga = GenomeAnnotationAPI(self.service_wiz_url) inputs_get_assembly = {'ref': genome_ref} try: assembly_object_ref = ga.get_assembly(inputs_get_assembly) except Exception as e: print( "Unable to retrieve Assembly reference ID from Genome ref_id: {}" .format(genome_ref)) raise Exception(e) return assembly_object_ref def _generate_output_file_list(self): log('Start packing result files') output_files = list() result_file = os.path.join(self.scratch, 'variation_importer_results.zip') excluded_extensions = ['.zip', '.vcf', '.vcf.gz', '.html', '.DS_Store'] with zipfile.ZipFile(result_file, 'w', zipfile.ZIP_DEFLATED, allowZip64=True) as zip_file: for root, dirs, files in os.walk(self.scratch): for file in files: if not (file.endswith(tuple(excluded_extensions)) # file.endswith('.zip') or # file.endswith('.vcf') or # file.endswith('.vcf.gz') or # file.endswith('.html') or # file.endswith('.DS_Store') ): zip_file.write(os.path.join(root, file), file) output_files.append({ 'path': result_file, 'name': os.path.basename(result_file), 'label': os.path.basename(result_file), 'description': 'File(s) generated by Variation Importer' }) log("Importer output generated: {}".format(output_files)) return output_files def _generate_report(self, params, variation_results, variation_file_path): stats_results = self._generate_variation_stats( params['additional_output_type'], variation_file_path) html_report = self._generate_html_report(variation_results, stats_results) file_links = self._generate_output_file_list() objects = [] if (variation_results['valid_variation_file']): objects = [{ 'ref': variation_results['variation_obj_ref'], 'description': 'Variation Object created by VCF Importer' }] report_params = { 'objects_created': objects, 'message': '', 'direct_html_link_index': 0, 'file_links': file_links, 'html_links': html_report, 'html_window_height': 330, 'workspace_name': params['workspace_name'], 'report_object_name': 'variation_importer_report_' + str(uuid.uuid4()) } kbr_output = self.kbr.create_extended_report(report_params) report_output = { 'report_name': kbr_output['name'], 'report_ref': kbr_output['ref'], 'variation_ref': variation_results['variation_obj_ref'] } log("Returning from _generate_report!") return report_output def _generate_html_report(self, variation_results, stats_output=None): """ _generate_html_report: generate html report from output files """ html_report = list() print("Validation output filepath passed to html report: {}".format( variation_results['validation_output_filepath'])) try: report_dir = os.path.join(self.scratch, 'html') os.mkdir(report_dir) with open(template_dir, 'r') as html, open( variation_results['validation_output_filepath'], 'r') as validation: validation_content = '<p><h4>{} '.format( variation_results['variation_filename']) if variation_results.get('valid_variation_file'): validation_content += '<em><i>is</i> a valid </em> variation file.' else: validation_content += '<em><i>is not</i> a valid </em>variation file. Details below.' 
validation_content += '</h4></p>' report = html.read() # Discard the first line of the validation file. It is irrelevant. validation.readline() validation_content += '<p><h4>Errors and warning generated by VCF validator:</h4></p>' validation_content += '<ul>' for line in validation.readlines(): validation_content += '<li>{}</li>'.format(line) validation_content += '</ul>' if variation_results.get('invalid_contigs'): validation_content += '<h4>The following Contigs were not found in the reference genome. The possible contigs have been written to the file {}. Please see the associated links to download.</h4>'.format( variation_results.get('genome_ref'), 'valid_contigs.txt') validation_content += '<ul>' for contig in variation_results.get('invalid_contigs'): validation_content += '<li>{}</li>'.format(contig) validation_content += '</ul>' # if not variation_results.get('contigs'): # validation_content += '<h4>No contig information was included in the VCF file header! Please recreate the VCF file with each contig described in the meta description </h4>' report = report.replace('Validation_Results', validation_content) if (stats_output.get('stats_file_dir')): summary_results = '<p><h4>Summary Statistics</h4></p>' summary_results += ''' <table> <tr> <th>Number of SNPs</th> <th>Number of Genotypes </th> </tr> ''' summary_results += '<tr>' summary_results += '<td>{}</td><td>{}</td>'.format( 'To be added later', variation_results['num_genotypes']) summary_results += '</tr></table>' report = report.replace('Variation_Statistics', summary_results) # visualization image_content = '' if (stats_output.get('stats_img_dir')): image_dir = stats_output.get('stats_img_dir') for file in glob.glob(os.path.join(image_dir, '*.png')): shutil.move(file, report_dir) for image in glob.glob(report_dir + "/*.png"): image = image.replace(report_dir + '/', '') caption = image.replace(report_dir + '/', '').replace('.png', '') image_content += '<p style="text-align:center"><img align="center" src="{}" ' \ '></a><a target="_blank"><br>' \ '<p align="center">{}</p></p>'.format(image, caption) else: image_content += 'No visualizations generated.' 
report = report.replace("Visualization_Results", image_content) except Exception as e: print("Error generating HTML report.") raise report_file_path = os.path.join(report_dir, 'index.html') with open(report_file_path, 'w') as output: output.write(report) try: html_upload_ret = self.dfu.file_to_shock({ 'file_path': report_file_path, 'make_handle': 0, 'pack': 'zip' }) log("Variation HTML report to shock ref: {}".format( html_upload_ret)) except: raise ValueError('Error uploading HTML to shock') html_report.append({ 'shock_id': html_upload_ret['shock_id'], 'name': os.path.basename(report_file_path), 'label': os.path.basename(report_file_path), 'description': 'HTML report for Variation Importer' }) return html_report def _generate_variation_stats(self, additional_output_type, variation_filepath): """ :param commments go here """ file_output_directory = os.path.join(self.scratch, 'stats_' + str(uuid.uuid4())) os.mkdir(file_output_directory) image_output_directory = os.path.join( self.scratch, 'stats_images_' + str(uuid.uuid4())) os.mkdir(image_output_directory) # TODO: Validate user supplied params and build PLINK command plink_cmd = ["plink"] plink_cmd.append('--vcf') plink_cmd.append(variation_filepath) # plink_cmd.append('--recode12') # plink_cmd.append('transpose') # plink_cmd.append('--output-missing-genotype') # plink_cmd.append("0") plink_cmd.append('--freq') plink_cmd.append('--hardy') # plink_cmd.append('gz') plink_cmd.append('--out') plink_cmd.append(variation_filepath) print("PLINK arguments: {}".format(plink_cmd)) plink_output = { "errors": [], "warnings": [] # "notes" : [] } p = subprocess.Popen(plink_cmd, cwd=file_output_directory, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=False) while True: line = p.stdout.readline() if not line: break # log(line) tokens = line.split(':') if (tokens[0] == 'Error'): plink_output['errors'].append(line) raise ValueError('PLINK 1.9 error: ' + line) elif (tokens[0] == 'Warning'): plink_output['warnings'].append(line) print(line) # elif(tokens[0] == 'Note'): # plink_output['notes'].append(line) # print(line) p.stdout.close() p.wait() plink_output_filepath = os.path.join(file_output_directory, 'plink_cli_output.txt') with open(plink_output_filepath, 'w') as plink: for data in plink_output: plink.write("{}: {}\n".format(data, plink_output[data])) plink_output_files = [ f for f in os.listdir(self.scratch) if f.startswith(os.path.basename(variation_filepath) + '.') ] for file in plink_output_files: shutil.move(os.path.join(self.scratch, file), file_output_directory) if p.returncode != 0: log("PLINK encountered an error during runtime. Please see log file." ) variation_filename = os.path.basename(variation_filepath) base_filepath = os.path.join(file_output_directory, variation_filename) freq_filepath = base_filepath + '.frq' maf_script_filepath = '/kb/module/lib/VariationImporter/Utils/MAF_check.R' hwe_script_filepath = '/kb/module/lib/VariationImporter/Utils/HWE.R' log("Frequency filepath: {}".format(freq_filepath)) # TODO: make function to do Rscript calls. 
# generate visualizations and store in directory maf_command = ['Rscript'] maf_command.append('--no-save') maf_command.append('--vanilla') maf_command.append(maf_script_filepath) maf_command.append(freq_filepath) maf_command.append("Minor Allele Frequencies.png") r = subprocess.Popen(maf_command, cwd=image_output_directory, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=False) r.wait() if r.returncode != 0: log("Error creating MAF histogram in R") hwe_filepath = base_filepath + '.hwe' zoom_filepath = hwe_filepath + '.zoom' zoom_command = '''awk '{{ if ($9 < 0.00001) print $0 }}' {} > {}'''.format( hwe_filepath, zoom_filepath) try: z = subprocess.Popen(zoom_command, cwd=file_output_directory, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True) z.wait() if z.returncode != 0: log("Error creating HWE zoom file.") except Exception as e: log("Error creating zoom HWE file: {}".format(e)) hwe_command = ['Rscript'] hwe_command.append('--no-save') hwe_command.append('--vanilla') hwe_command.append(hwe_script_filepath) hwe_command.append(hwe_filepath) hwe_command.append("Hardy-Weinberg Equilibrium.png") hwe_command.append(zoom_filepath) hwe_command.append("Hardy-Weinberg Equilibrium Zoom.png") print("MAF command: {}".format(hwe_command)) h = subprocess.Popen(hwe_command, cwd=image_output_directory, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=False) h.wait() if h.returncode != 0: log("Error generating HWE Zoom plot") return { 'stats_file_dir': file_output_directory, 'stats_img_dir': image_output_directory } def _save_variation_to_ws(self, workspace_name, variation_obj, variation_filepath, kinship_matrix): ws_id = self.dfu.ws_name_to_id(workspace_name) try: vcf_shock_return = self.dfu.file_to_shock({ 'file_path': variation_filepath, 'make_handle': 1, 'pack': 'gzip' }) except Exception as e: print("Error uploading file to shock!") raise ValueError(e) variation_obj['variation_file_reference'] = vcf_shock_return.get( 'shock_id') info = self.dfu.save_objects({ 'id': ws_id, 'objects': [{ 'type': 'KBaseGwasData.Variations', 'data': variation_obj, 'name': 'TestVariationImporterName' }] })[0] variation_ref = "%s/%s/%s" % (info[6], info[0], info[4]) log("Variation reference created: {}".format(variation_ref)) return variation_ref def validate_vcf(self, params): """ :param params: dict containing all input parameters. """ returnVal = {} valid_vcf_file = True try: vcf_filepath = self.pretend_download_staging_file( params['staging_file_subdir_path'], self.scratch).get('copy_file_path') location_filepath = self.pretend_download_staging_file( params['location_file_subdir_path'], self.scratch).get('copy_file_path') except Exception as e: raise Exception("Unable to download {} from staging area.".format( params['staging_file_subdir_path'])) try: location_filepath = self.pretend_download_staging_file( params['location_file_subdir_path'], self.scratch).get('copy_file_path') except Exception as e: raise Exception("Unable to download {} from staging area.".format( params['location_file_subdir_path'])) # Check file size log("{} file size: {}".format(vcf_filepath, os.path.getsize(vcf_filepath))) log('\nValidating {}...'.format(vcf_filepath)) vcf_version, vcf_contigs, vcf_genotypes = self._get_version_contigs_genotypes( vcf_filepath) if not vcf_contigs: log("No contig data in {} header.".format(vcf_filepath)) raise ValueError( "No contig data in {} header.".format(vcf_filepath)) if (vcf_version < 4.1): log("VCF file is version {}. 
Must be at least version 4.1".format( vcf_version)) raise ValueError( "VCF file is version {}. Must be at least version 4.1".format( vcf_version)) # Generate population object population = self._generate_population(location_filepath, vcf_genotypes) # Retrieve Assembly object reference associated with genome. try: assembly_ref = self._get_assembly_ref_from_genome( params['genome_ref']) except Exception as e: print("Unable to retrieve {}".format(params['genome_ref'])) raise ValueError(e) # Retrieve contig list from Assembly object. try: assembly_contigs = self._get_contigs_from_assembly(assembly_ref) except Exception as e: print("Unable to retrieve contigs from Assembly ref: {}".format( assembly_ref)) raise ValueError(e) log("Length of assembly contigs: {}".format(len(assembly_contigs))) # Compare contig IDs from VCF to those in the Assembly object invalid_contigs = [] for contig in vcf_contigs: if contig not in assembly_contigs.keys(): invalid_contigs.append(contig) if invalid_contigs: log("Invalid contig IDs found in {}".format(vcf_filepath)) valid_contig_filepath = os.path.join(self.scratch, 'valid_contigs.txt') log("Writing valid contigs to file: {}".format( valid_contig_filepath)) with open(valid_contig_filepath, 'w') as icf: for contig in assembly_contigs: icf.write(contig + '\n') valid_vcf_file = False validation_output_filepath, returncode = self._validate_vcf( vcf_filepath, vcf_version) if returncode != 0: valid_vcf_file = False kinship_matrix = self._create_fake_kinship_matrix() variation_obj_ref = '' if valid_vcf_file: variation_object = { "genome": params['genome_ref'], "population": population, "contigs": vcf_contigs, "comment": "Comments go here", "assay": "Assay data goes gere.", "originator": "PI/Lab info goes here", "pubmed_id": "PubMed ID goes here", "kinship_info": kinship_matrix } variation_obj_ref = self._save_variation_to_ws( params['workspace_name'], variation_object, vcf_filepath, kinship_matrix) log("Variation object reference: {}".format(variation_obj_ref)) variation_report_metadata = { 'valid_variation_file': valid_vcf_file, 'variation_obj_ref': variation_obj_ref, 'variation_filename': os.path.basename(vcf_filepath), 'validation_output_filepath': validation_output_filepath, 'vcf_version': vcf_version, 'num_genotypes': len(vcf_genotypes), 'num_contigs': len(vcf_contigs), 'invalid_contigs': invalid_contigs } returnVal = self._generate_report(params, variation_report_metadata, vcf_filepath) return returnVal
class GenomeInterface: def _validate_save_one_genome_params(self, params): """ _validate_save_one_genome_params: validates params passed to save_one_genome method """ log('start validating save_one_genome params') # check for required parameters for p in ['workspace', 'name', 'data']: if p not in params: raise ValueError( '"{}" parameter is required, but missing'.format(p)) def _check_shock_response(self, response, errtxt): """ _check_shock_response: check shock node response (Copied from DataFileUtil) """ log('start checking shock response') if not response.ok: try: err = json.loads(response.content)['error'][0] except: # this means shock is down or not responding. self.log("Couldn't parse response error content from Shock: " + response.content) response.raise_for_status() raise ValueError(errtxt + str(err)) def _own_handle(self, genome_data, handle_property): """ _own_handle: check that handle_property point to shock nodes owned by calling user """ log('start checking handle {} ownership'.format(handle_property)) if handle_property in genome_data: handle_id = genome_data[handle_property] hs = HandleService(self.handle_url, token=self.token) handles = hs.hids_to_handles([handle_id]) shock_id = handles[0]['id'] # Copy from DataFileUtil.own_shock_node implementation: header = {'Authorization': 'Oauth {}'.format(self.token)} res = requests.get(self.shock_url + '/node/' + shock_id + '/acl/?verbosity=full', headers=header, allow_redirects=True) self._check_shock_response( res, 'Error getting ACLs for Shock node {}: '.format(shock_id)) owner = res.json()['data']['owner']['username'] user_id = self.auth_client.get_user(self.token) if owner != user_id: log('start copying node to owner: {}'.format(user_id)) dfu_shock = self.dfu.copy_shock_node({ 'shock_id': shock_id, 'make_handle': True }) handle_id = dfu_shock['handle']['hid'] genome_data[handle_property] = handle_id def _check_dna_sequence_in_features(self, genome): """ _check_dna_sequence_in_features: check dna sequence in each feature """ log('start checking dna sequence in each feature') if 'features' in genome: features_to_work = {} for feature in genome['features']: if not ('dna_sequence' in feature and feature['dna_sequence']): features_to_work[feature['id']] = feature['location'] if len(features_to_work) > 0: aseq = AssemblySequenceAPI(self.sw_url, token=self.token) get_dna_params = {'requested_features': features_to_work} if 'assembly_ref' in genome: get_dna_params['assembly_ref'] = genome['assembly_ref'] elif 'contigset_ref' in genome: get_dna_params['contigset_ref'] = genome['contigset_ref'] else: # Nothing to do (it may be test genome without contigs)... 
return dna_sequences = aseq.get_dna_sequences( get_dna_params)['dna_sequences'] for feature in genome['features']: if feature['id'] in dna_sequences: feature['dna_sequence'] = dna_sequences[feature['id']] feature['dna_sequence_length'] = len( feature['dna_sequence']) def __init__(self, config): self.ws_url = config.workspaceURL self.handle_url = config.handleURL self.shock_url = config.shockURL self.sw_url = config.srvWizURL self.token = config.token self.auth_service_url = config.authServiceUrl self.callback_url = config.callbackURL self.ws = Workspace(self.ws_url, token=self.token) self.auth_client = _KBaseAuth(self.auth_service_url) self.dfu = DataFileUtil(self.callback_url) def save_one_genome(self, params): log('start saving genome object') self._validate_save_one_genome_params(params) workspace = params['workspace'] name = params['name'] data = params['data'] # check all handles point to shock nodes owned by calling user self._own_handle(data, 'genbank_handle_ref') self._own_handle(data, 'gff_handle_ref') self._check_dna_sequence_in_features(data) if 'hidden' in params and str( params['hidden']).lower() in ('yes', 'true', 't', '1'): hidden = 1 else: hidden = 0 if isinstance(workspace, int) or workspace.isdigit(): workspace_id = workspace else: workspace_id = self.dfu.ws_name_to_id(workspace) dfu_save_params = { 'id': workspace_id, 'objects': [{ 'type': 'KBaseGenomes.Genome', 'data': data, 'name': name, 'hidden': hidden }] } dfu_oi = self.dfu.save_objects(dfu_save_params)[0] returnVal = {'info': dfu_oi} return returnVal
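# --- Illustrative sketch (not part of the original module) ---
# A hypothetical call to GenomeInterface.save_one_genome showing the accepted
# forms of 'workspace' and 'hidden'; 'config' and 'genome_data' are placeholders.
gi = GenomeInterface(config)
result = gi.save_one_genome({
    'workspace': 'my_workspace',   # a workspace name, numeric id, or digit string
    'name': 'my_genome',
    'data': genome_data,           # a KBaseGenomes.Genome-shaped dict
    'hidden': 'true',              # any of yes/true/t/1 (case-insensitive) saves hidden
})
print(result['info'])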
class CufflinksUtils: CUFFLINKS_TOOLKIT_PATH = '/opt/cufflinks/' GFFREAD_TOOLKIT_PATH = '/opt/cufflinks/' def __init__(self, config): """ :param config: :param logger: :param directory: Working directory :param urls: Service urls """ # BEGIN_CONSTRUCTOR self.ws_url = config["workspace-url"] self.ws_url = config["workspace-url"] self.callback_url = config['SDK_CALLBACK_URL'] self.srv_wiz_url = config['srv-wiz-url'] self.token = config['KB_AUTH_TOKEN'] self.shock_url = config['shock-url'] self.dfu = DataFileUtil(self.callback_url) self.gfu = GenomeFileUtil(self.callback_url) self.au = AssemblyUtil(self.callback_url) self.rau = ReadsAlignmentUtils(self.callback_url) self.set_api = SetAPI(self.srv_wiz_url, service_ver='dev') self.eu = ExpressionUtils(self.callback_url) self.ws = Workspace(self.ws_url, token=self.token) self.scratch = os.path.join(config['scratch'], str(uuid.uuid4())) self._mkdir_p(self.scratch) self.tool_used = "Cufflinks" self.tool_version = os.environ['VERSION'] # END_CONSTRUCTOR pass def parse_FPKMtracking_calc_TPM(self, filename): """ Generates TPM from FPKM :return: """ fpkm_dict = {} tpm_dict = {} gene_col = 0 fpkm_col = 9 sum_fpkm = 0.0 with open(filename) as f: next(f) for line in f: larr = line.split("\t") gene_id = larr[gene_col] if gene_id != "": fpkm = float(larr[fpkm_col]) sum_fpkm = sum_fpkm + fpkm fpkm_dict[gene_id] = math.log(fpkm + 1, 2) tpm_dict[gene_id] = fpkm if sum_fpkm == 0.0: log("Warning: Unable to calculate TPM values as sum of FPKM values is 0" ) else: for g in tpm_dict: tpm_dict[g] = math.log((tpm_dict[g] / sum_fpkm) * 1e6 + 1, 2) return fpkm_dict, tpm_dict def _mkdir_p(self, path): """ _mkdir_p: make directory for given path """ if not path: return try: os.makedirs(path) except OSError as exc: if exc.errno == errno.EEXIST and os.path.isdir(path): pass else: raise def _validate_run_cufflinks_params(self, params): """ _validate_run_cufflinks_params: Raises an exception if params are invalid """ log('Start validating run_cufflinks params') # check for required parameters for p in ['alignment_object_ref', 'workspace_name', 'genome_ref']: if p not in params: raise ValueError( '"{}" parameter is required, but missing'.format(p)) def _run_command(self, command): """ _run_command: run command and print result """ log('Start executing command:\n{}'.format(command)) pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True) output = pipe.communicate()[0] exitCode = pipe.returncode if (exitCode == 0): log('Executed command:\n{}\n'.format(command) + 'Exit Code: {}\nOutput:\n{}'.format(exitCode, output)) else: error_msg = 'Error running command:\n{}\n'.format(command) error_msg += 'Exit Code: {}\nOutput:\n{}'.format(exitCode, output) raise ValueError(error_msg) def _run_gffread(self, gff_path, gtf_path): """ _run_gffread: run gffread script ref: http://cole-trapnell-lab.github.io/cufflinks/file_formats/#the-gffread-utility """ log('converting gff to gtf') command = self.GFFREAD_TOOLKIT_PATH + '/gffread ' command += "-E {0} -T -o {1}".format(gff_path, gtf_path) self._run_command(command) def _create_gtf_annotation_from_genome(self, genome_ref): """ Create reference annotation file from genome """ ref = self.ws.get_object_subset([{ 'ref': genome_ref, 'included': ['contigset_ref', 'assembly_ref'] }]) if 'contigset_ref' in ref[0]['data']: contig_id = ref[0]['data']['contigset_ref'] elif 'assembly_ref' in ref[0]['data']: contig_id = ref[0]['data']['assembly_ref'] if contig_id is None: raise ValueError( "Genome at {0} does not have reference to the assembly 
object". format(genome_ref)) print(contig_id) log("Generating GFF file from Genome") try: ret = self.au.get_assembly_as_fasta({'ref': contig_id}) output_file = ret['path'] mapping_filename = c_mapping.create_sanitized_contig_ids( output_file) os.remove(output_file) # get the GFF ret = self.gfu.genome_to_gff({'genome_ref': genome_ref}) genome_gff_file = ret['file_path'] c_mapping.replace_gff_contig_ids(genome_gff_file, mapping_filename, to_modified=True) gtf_ext = ".gtf" if not genome_gff_file.endswith(gtf_ext): gtf_path = os.path.splitext(genome_gff_file)[0] + '.gtf' self._run_gffread(genome_gff_file, gtf_path) else: gtf_path = genome_gff_file log("gtf file : " + gtf_path) except Exception: raise ValueError( "Generating GTF file from Genome Annotation object Failed : {}" .format("".join(traceback.format_exc()))) return gtf_path def _get_gtf_file(self, alignment_ref): """ _get_gtf_file: get the reference annotation file (in GTF or GFF3 format) """ result_directory = self.scratch alignment_data = self.ws.get_objects2( {'objects': [{ 'ref': alignment_ref }]})['data'][0]['data'] genome_ref = alignment_data.get('genome_id') # genome_name = self.ws.get_object_info([{"ref": genome_ref}], includeMetadata=None)[0][1] # ws_gtf = genome_name+"_GTF_Annotation" genome_data = self.ws.get_objects2({'objects': [{ 'ref': genome_ref }]})['data'][0]['data'] gff_handle_ref = genome_data.get('gff_handle_ref') if gff_handle_ref: log('getting reference annotation file from genome') annotation_file = self.dfu.shock_to_file({ 'handle_id': gff_handle_ref, 'file_path': result_directory, 'unpack': 'unpack' })['file_path'] else: annotation_file = self._create_gtf_annotation_from_genome( genome_ref) return annotation_file def _get_gtf_file_from_genome_ref(self, genome_ref): """ _get_gtf_file: get the reference annotation file (in GTF or GFF3 format) """ result_directory = self.scratch genome_data = self.ws.get_objects2({'objects': [{ 'ref': genome_ref }]})['data'][0]['data'] gff_handle_ref = genome_data.get('gff_handle_ref') if gff_handle_ref: log('getting reference annotation file from genome') annotation_file = self.dfu.shock_to_file({ 'handle_id': gff_handle_ref, 'file_path': result_directory, 'unpack': 'unpack' })['file_path'] else: annotation_file = self._create_gtf_annotation_from_genome( genome_ref) return annotation_file def _get_input_file(self, alignment_ref): """ _get_input_file: get input BAM file from Alignment object """ bam_file_dir = self.rau.download_alignment( {'source_ref': alignment_ref})['destination_dir'] files = os.listdir(bam_file_dir) bam_file_list = [ file for file in files if re.match(r'.*\_sorted\.bam', file) ] if not bam_file_list: bam_file_list = [ file for file in files if re.match(r'.*(?<!sorted)\.bam', file) ] if not bam_file_list: raise ValueError('Cannot find .bam file from alignment {}'.format( alignment_ref)) bam_file_name = bam_file_list[0] bam_file = os.path.join(bam_file_dir, bam_file_name) return bam_file def _generate_command(self, params): """ _generate_command: generate cufflinks command """ cufflinks_command = '/opt/cufflinks/cufflinks' cufflinks_command += (' -q --no-update-check -p ' + str(params.get('num_threads', 1))) if 'max_intron_length' in params and params[ 'max_intron_length'] is not None: cufflinks_command += (' --max-intron-length ' + str(params['max_intron_length'])) if 'min_intron_length' in params and params[ 'min_intron_length'] is not None: cufflinks_command += (' --min-intron-length ' + str(params['min_intron_length'])) if 'overhang_tolerance' in params and 
params[ 'overhang_tolerance'] is not None: cufflinks_command += (' --overhang-tolerance ' + str(params['overhang_tolerance'])) cufflinks_command += " -o {0} -G {1} {2}".format( params['result_directory'], params['gtf_file'], params['input_file']) log('Generated cufflinks command: {}'.format(cufflinks_command)) return cufflinks_command def _process_rnaseq_alignment_object(self, params): """ _process_alignment_object: process KBaseRNASeq.RNASeqAlignment type input object """ log('start processing RNASeqAlignment object\nparams:\n{}'.format( json.dumps(params, indent=1))) alignment_ref = params.get('alignment_ref') result_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(result_directory) params['result_directory'] = str(result_directory) # input files params['input_file'] = self._get_input_file(alignment_ref) if not params.get('gtf_file'): params['gtf_file'] = self._get_gtf_file(alignment_ref) if '/' not in params['genome_ref']: params['genome_ref'] = params['workspace_name'] + '/' + params[ 'genome_ref'] command = self._generate_command(params) self._run_command(command) expression_obj_ref = self._save_rnaseq_expression( result_directory, alignment_ref, params.get('workspace_name'), params.get('genome_ref'), params['gtf_file'], params['expression_suffix']) returnVal = { 'result_directory': result_directory, 'expression_obj_ref': expression_obj_ref, 'alignment_ref': alignment_ref } expression_name = self.ws.get_object_info([{ "ref": expression_obj_ref }], includeMetadata=None)[0][1] widget_params = { "output": expression_name, "workspace": params.get('workspace_name') } returnVal.update(widget_params) return returnVal def _process_kbasesets_alignment_object(self, params): """ _process_alignment_object: process KBaseRNASeq.RNASeqAlignment type input object """ log('start processing KBaseSets object\nparams:\n{}'.format( json.dumps(params, indent=1))) alignment_ref = params.get('alignment_ref') result_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(result_directory) params['result_directory'] = str(result_directory) # input files params['input_file'] = self._get_input_file(alignment_ref) if not params.get('gtf_file'): params['gtf_file'] = self._get_gtf_file(alignment_ref) command = self._generate_command(params) self._run_command(command) expression_obj_ref = self._save_kbasesets_expression( result_directory, alignment_ref, params.get('workspace_name'), params.get('genome_ref'), params.get('gtf_file'), params.get('expression_suffix')) returnVal = { 'result_directory': result_directory, 'expression_obj_ref': expression_obj_ref, 'alignment_ref': alignment_ref } expression_name = self.ws.get_object_info([{ "ref": expression_obj_ref }], includeMetadata=None)[0][1] widget_params = { "output": expression_name, "workspace": params.get('workspace_name') } returnVal.update(widget_params) return returnVal def _generate_html_report(self, result_directory, obj_ref): """ _generate_html_report: generate html summary report """ log('Start generating html report') html_report = list() output_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(output_directory) result_file_path = os.path.join(output_directory, 'report.html') expression_object = self.ws.get_objects2( {'objects': [{ 'ref': obj_ref }]})['data'][0] expression_object_type = expression_object.get('info')[2] Overview_Content = '' if re.match('KBaseRNASeq.RNASeqExpression-\d.\d', expression_object_type): Overview_Content += '<p>Generated Expression Object:</p><p>{}</p>'.format( 
expression_object.get('info')[1]) elif re.match('KBaseRNASeq.RNASeqExpressionSet-\d.\d', expression_object_type): Overview_Content += '<p>Generated Expression Set Object:</p><p>{}</p>'.format( expression_object.get('info')[1]) Overview_Content += '<br><p>Generated Expression Object:</p>' for expression_ref in expression_object['data'][ 'sample_expression_ids']: expression_name = self.ws.get_object_info( [{ "ref": expression_ref }], includeMetadata=None)[0][1] Overview_Content += '<p>{}</p>'.format(expression_name) elif re.match('KBaseSets.ExpressionSet-\d.\d', expression_object_type): pprint(expression_object) Overview_Content += '<p>Generated Expression Set Object:</p><p>{}</p>'.format( expression_object.get('info')[1]) Overview_Content += '<br><p>Generated Expression Object:</p>' for expression_ref in expression_object['data']['items']: expression_name = self.ws.get_object_info( [{ "ref": expression_ref['ref'] }], includeMetadata=None)[0][1] condition = expression_ref['label'] Overview_Content += '<p>condition:{0}; expression_name: {1}</p>'.format( condition, expression_name) with open(result_file_path, 'w') as result_file: with open( os.path.join(os.path.dirname(__file__), 'report_template.html'), 'r') as report_template_file: report_template = report_template_file.read() report_template = report_template.replace( '<p>Overview_Content</p>', Overview_Content) result_file.write(report_template) html_report.append({ 'path': result_file_path, 'name': os.path.basename(result_file_path), 'label': os.path.basename(result_file_path), 'description': 'HTML summary report for Cufflinks App' }) return html_report def _save_rnaseq_expression(self, result_directory, alignment_ref, workspace_name, genome_ref, gtf_file, expression_suffix): """ _save_rnaseq_expression: save Expression object to workspace """ log('start saving Expression object') alignment_object_name = self.ws.get_object_info( [{ "ref": alignment_ref }], includeMetadata=None)[0][1] # set expression name if re.match('.*_[Aa]lignment$', alignment_object_name): expression_name = re.sub('_[Aa]lignment$', expression_suffix, alignment_object_name) else: # assume user specified suffix expression_name = alignment_object_name + expression_suffix expression_ref = self.eu.upload_expression({ 'destination_ref': workspace_name + '/' + expression_name, 'source_dir': result_directory, 'alignment_ref': alignment_ref, 'tool_used': self.tool_used, 'tool_version': self.tool_version })['obj_ref'] return expression_ref def _save_kbasesets_expression(self, result_directory, alignment_ref, workspace_name, genome_ref, gtf_file, expression_suffix): """ _save_kbasesets_expression: save Expression object to workspace using ExpressionUtils and SetAPI """ log('start saving Expression object') alignment_info = self.ws.get_object_info3( {'objects': [{ "ref": alignment_ref }]}) alignment_object_name = alignment_info['infos'][0][1] # set expression name if re.match('.*_[Aa]lignment$', alignment_object_name): expression_name = re.sub('_[Aa]lignment$', expression_suffix, alignment_object_name) else: # assume user specified suffix expression_name = alignment_object_name + expression_suffix expression_ref = self.eu.upload_expression({ 'destination_ref': workspace_name + '/' + expression_name, 'source_dir': result_directory, 'alignment_ref': alignment_ref, 'tool_used': self.tool_used, 'tool_version': self.tool_version })['obj_ref'] return expression_ref def _save_rnaseq_expression_set(self, alignment_expression_map, alignment_set_ref, workspace_name, expression_set_name): 
""" _save_rnaseq_expression_set: save ExpressionSet object to workspace """ log('start saving ExpressionSet object') if isinstance(workspace_name, int) or workspace_name.isdigit(): workspace_id = workspace_name else: workspace_id = self.dfu.ws_name_to_id(workspace_name) expression_set_data = self._generate_expression_set_data( alignment_expression_map, alignment_set_ref, expression_set_name) object_type = 'KBaseRNASeq.RNASeqExpressionSet' save_object_params = { 'id': workspace_id, 'objects': [{ 'type': object_type, 'data': expression_set_data, 'name': expression_set_name }] } dfu_oi = self.dfu.save_objects(save_object_params)[0] expression_set_ref = str(dfu_oi[6]) + '/' + str(dfu_oi[0]) + '/' + str( dfu_oi[4]) return expression_set_ref def _save_kbasesets_expression_set(self, alignment_expression_map, alignment_set_ref, workspace_name, expression_set_name): """ _save_kbasesets_expression_set: save ExpressionSet object to workspace """ log('start saving ExpressionSet object') if isinstance(workspace_name, int) or workspace_name.isdigit(): workspace_id = workspace_name else: workspace_id = self.dfu.ws_name_to_id(workspace_name) expression_set_data = self._generate_expression_set_data( alignment_expression_map, alignment_set_ref, expression_set_name) object_type = 'KBaseRNASeq.RNASeqExpressionSet' save_object_params = { 'id': workspace_id, 'objects': [{ 'type': object_type, 'data': expression_set_data, 'name': expression_set_name }] } dfu_oi = self.dfu.save_objects(save_object_params)[0] expression_set_ref = str(dfu_oi[6]) + '/' + str(dfu_oi[0]) + '/' + str( dfu_oi[4]) return expression_set_ref def _generate_report(self, obj_ref, workspace_name, result_directory, exprMatrix_FPKM_ref=None, exprMatrix_TPM_ref=None): """ _generate_report: generate summary report """ log('creating report') output_files = self._generate_output_file_list(result_directory) output_html_files = self._generate_html_report(result_directory, obj_ref) expression_object = self.ws.get_objects2( {'objects': [{ 'ref': obj_ref }]})['data'][0] expression_info = expression_object['info'] expression_data = expression_object['data'] expression_object_type = expression_info[2] if re.match('KBaseRNASeq.RNASeqExpression-\d+.\d+', expression_object_type): objects_created = [{ 'ref': obj_ref, 'description': 'Expression generated by Cufflinks' }] elif re.match('KBaseRNASeq.RNASeqExpressionSet-\d+.\d+', expression_object_type): objects_created = [{ 'ref': obj_ref, 'description': 'Expression generated by Cufflinks' }] elif re.match('KBaseSets.ExpressionSet-\d+.\d+', expression_object_type): objects_created = [{ 'ref': obj_ref, 'description': 'ExpressionSet generated by Cufflinks' }] items = expression_data['items'] for item in items: objects_created.append({ 'ref': item['ref'], 'description': 'Expression generated by Cufflinks' }) objects_created.append({ 'ref': exprMatrix_FPKM_ref, 'description': 'FPKM ExpressionMatrix generated by Cufflinks' }) objects_created.append({ 'ref': exprMatrix_TPM_ref, 'description': 'TPM ExpressionMatrix generated by Cufflinks' }) report_params = { 'message': '', 'workspace_name': workspace_name, 'file_links': output_files, 'objects_created': objects_created, 'html_links': output_html_files, 'direct_html_link_index': 0, 'html_window_height': 366, 'report_object_name': 'kb_cufflinks_report_' + str(uuid.uuid4()) } kbase_report_client = KBaseReport(self.callback_url, token=self.token) output = kbase_report_client.create_extended_report(report_params) report_output = { 'report_name': output['name'], 'report_ref': 
output['ref'] } return report_output def _parse_FPKMtracking(self, filename, metric): result = {} pos1 = 0 if metric == 'FPKM': pos2 = 7 if metric == 'TPM': pos2 = 8 with open(filename) as f: next(f) for line in f: larr = line.split("\t") if larr[pos1] != "": try: result[larr[pos1]] = math.log(float(larr[pos2]) + 1, 2) except ValueError: result[larr[pos1]] = math.log(1, 2) return result def _generate_output_file_list(self, result_directory): """ _generate_output_file_list: zip result files and generate file_links for report """ log('Start packing result files') output_files = list() output_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(output_directory) result_file = os.path.join(output_directory, 'cufflinks_result.zip') with zipfile.ZipFile(result_file, 'w', zipfile.ZIP_DEFLATED, allowZip64=True) as zip_file: for root, dirs, files in os.walk(result_directory): for file in files: if not (file.endswith('.DS_Store')): zip_file.write( os.path.join(root, file), os.path.join(os.path.basename(root), file)) output_files.append({ 'path': result_file, 'name': os.path.basename(result_file), 'label': os.path.basename(result_file), 'description': 'File(s) generated by Cufflinks App' }) return output_files def _generate_expression_data(self, result_directory, alignment_ref, gtf_file, workspace_name, expression_suffix): """ _generate_expression_data: generate Expression object with cufflinks output files """ alignment_data_object = self.ws.get_objects2( {'objects': [{ 'ref': alignment_ref }]})['data'][0] # set expression name alignment_object_name = alignment_data_object['info'][1] if re.match('.*_[Aa]lignment$', alignment_object_name): expression_name = re.sub('_[Aa]lignment$', expression_suffix, alignment_object_name) else: # assume user specified suffix expression_name = alignment_object_name + expression_suffix expression_data = { 'id': expression_name, 'type': 'RNA-Seq', 'numerical_interpretation': 'FPKM', 'processing_comments': 'log2 Normalized', 'tool_used': self.tool_used, 'tool_version': self.tool_version } alignment_data = alignment_data_object['data'] condition = alignment_data.get('condition') expression_data.update({'condition': condition}) genome_id = alignment_data.get('genome_id') expression_data.update({'genome_id': genome_id}) read_sample_id = alignment_data.get('read_sample_id') expression_data.update( {'mapped_rnaseq_alignment': { read_sample_id: alignment_ref }}) exp_dict, tpm_exp_dict = self.parse_FPKMtracking_calc_TPM( os.path.join(result_directory, 'genes.fpkm_tracking')) expression_data.update({'expression_levels': exp_dict}) expression_data.update({'tpm_expression_levels': tpm_exp_dict}) handle = self.dfu.file_to_shock({ 'file_path': result_directory, 'pack': 'zip', 'make_handle': True })['handle'] expression_data.update({'file': handle}) return expression_data def _generate_expression_set_data(self, alignment_expression_map, alignment_set_ref, expression_set_name): """ _generate_expression_set_data: generate ExpressionSet object with cufflinks output files """ alignment_set_data_object = self.ws.get_objects2( {'objects': [{ 'ref': alignment_set_ref }]})['data'][0] alignment_set_data = alignment_set_data_object['data'] expression_set_data = { 'tool_used': self.tool_used, 'tool_version': self.tool_version, 'id': expression_set_name, 'alignmentSet_id': alignment_set_ref, 'genome_id': alignment_set_data.get('genome_id'), 'sampleset_id': alignment_set_data.get('sampleset_id') } sample_expression_ids = [] mapped_expression_objects = [] mapped_expression_ids 
= [] for alignment_expression in alignment_expression_map: alignment_ref = alignment_expression.get('alignment_ref') expression_ref = alignment_expression.get('expression_obj_ref') sample_expression_ids.append(expression_ref) mapped_expression_ids.append({alignment_ref: expression_ref}) alignment_name = self.ws.get_object_info( [{ "ref": alignment_ref }], includeMetadata=None)[0][1] expression_name = self.ws.get_object_info( [{ "ref": expression_ref }], includeMetadata=None)[0][1] mapped_expression_objects.append({alignment_name: expression_name}) expression_set_data['sample_expression_ids'] = sample_expression_ids expression_set_data[ 'mapped_expression_objects'] = mapped_expression_objects expression_set_data['mapped_expression_ids'] = mapped_expression_ids return expression_set_data def _process_alignment_set_object(self, params, alignment_object_type): """ _process_alignment_set_object: process KBaseRNASeq.RNASeqAlignmentSet type input object and KBaseSets.ReadsAlignmentSet type object """ log('start processing KBaseRNASeq.RNASeqAlignmentSet object or KBaseSets.ReadsAlignmentSet object' '\nparams:\n{}'.format(json.dumps(params, indent=1))) alignment_set_ref = params.get('alignment_set_ref') if re.match('^KBaseRNASeq.RNASeqAlignmentSet-\d*', alignment_object_type): params['gtf_file'] = self._get_gtf_file(alignment_set_ref) else: if not '/' in params['genome_ref']: params['genome_ref'] = params['workspace_name'] + '/' + params[ 'genome_ref'] params['gtf_file'] = self._get_gtf_file_from_genome_ref( params['genome_ref']) alignment_set = self.set_api.get_reads_alignment_set_v1({ 'ref': alignment_set_ref, 'include_item_info': 0, 'include_set_item_ref_paths': 1 }) mul_processor_params = [] for alignment in alignment_set["data"]["items"]: alignment_ref = alignment['ref_path'] alignment_upload_params = params.copy() alignment_upload_params['alignment_ref'] = alignment_ref mul_processor_params.append(alignment_upload_params) # use the following when you want to run the cmd sequentially # self._process_kbasesets_alignment_object(mul_processor_params[0]) cpus = min(params.get('num_threads'), multiprocessing.cpu_count()) pool = Pool(ncpus=cpus) log('running _process_alignment_object with {} cpus'.format(cpus)) alignment_expression_map = pool.map( self._process_kbasesets_alignment_object, mul_processor_params) result_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(result_directory) expression_items = list() for proc_alignment_return in alignment_expression_map: expression_obj_ref = proc_alignment_return.get( 'expression_obj_ref') alignment_ref = proc_alignment_return.get('alignment_ref') alignment_info = self.ws.get_object_info3({ 'objects': [{ "ref": alignment_ref }], 'includeMetadata': 1 }) condition = alignment_info['infos'][0][10]['condition'] expression_items.append({ "ref": expression_obj_ref, "label": condition, }) expression_name = self.ws.get_object_info( [{ "ref": expression_obj_ref }], includeMetadata=None)[0][1] self._run_command('cp -R {} {}'.format( proc_alignment_return.get('result_directory'), os.path.join(result_directory, expression_name))) expression_set = { "description": "generated by kb_cufflinks", "items": expression_items } expression_set_info = self.set_api.save_expression_set_v1({ "workspace": params['workspace_name'], "output_object_name": params['expression_set_name'], "data": expression_set }) returnVal = { 'result_directory': result_directory, 'expression_obj_ref': expression_set_info['set_ref'] } widget_params = { "output": 
params.get('expression_set_name'), "workspace": params.get('workspace_name') } returnVal.update(widget_params) return returnVal def _generate_output_object_name(self, params, alignment_object_type, alignment_object_name): """ Generates the output object name based on input object type and name and stores it in params with key equal to 'expression' or 'expression_set' based on whether the input object is an alignment or alignment_set. :param params: module input params :param alignment_object_type: input alignment object type :param alignment_object_name: input alignment object name :param alignment_object_data: input alignment object data """ expression_set_suffix = params['expression_set_suffix'] expression_suffix = params['expression_suffix'] if re.match('^KBaseRNASeq.RNASeqAlignment-\d*', alignment_object_type): if re.match('.*_[Aa]lignment$', alignment_object_name): params['expression_name'] = re.sub('_[Aa]lignment$', expression_suffix, alignment_object_name) else: # assume user specified suffix params[ 'expression_name'] = alignment_object_name + expression_suffix if re.match('^KBaseRNASeq.RNASeqAlignmentSet-\d*', alignment_object_type): if re.match('.*_[Aa]lignment_[Ss]et$', alignment_object_name): # set expression set name params['expression_set_name'] = re.sub('_[Aa]lignment_[Ss]et$', expression_set_suffix, alignment_object_name) else: # assume user specified suffix params[ 'expression_set_name'] = alignment_object_name + expression_set_suffix if re.match('^KBaseSets.ReadsAlignmentSet-\d*', alignment_object_type): if re.match('.*_[Aa]lignment_[Ss]et$', alignment_object_name): # set expression set name params['expression_set_name'] = re.sub('_[Aa]lignment_[Ss]et$', expression_set_suffix, alignment_object_name) else: # assume user specified suffix params[ 'expression_set_name'] = alignment_object_name + expression_set_suffix def _save_expression_matrix(self, expressionset_ref, workspace_name): """ _save_expression_matrix: save FPKM and TPM ExpressionMatrix """ log('start saving ExpressionMatrix object') expression_set_name = self.ws.get_object_info( [{ "ref": expressionset_ref }], includeMetadata=None)[0][1] output_obj_name_prefix = re.sub('_*[Ee]xpression_*[Ss]et', '', expression_set_name) upload_expression_matrix_params = { 'expressionset_ref': expressionset_ref, 'output_obj_name': output_obj_name_prefix, 'workspace_name': workspace_name } expression_matrix_refs = self.eu.get_expressionMatrix( upload_expression_matrix_params) return expression_matrix_refs def run_cufflinks_app(self, params): log('--->\nrunning CufflinksUtil.run_cufflinks_app\n' + 'params:\n{}'.format(json.dumps(params, indent=1))) self._validate_run_cufflinks_params(params) alignment_object_ref = params.get('alignment_object_ref') alignment_object_info = self.ws.get_object_info3( {"objects": [{ "ref": alignment_object_ref }]})['infos'][0] alignment_object_type = alignment_object_info[2] alignment_object_name = alignment_object_info[1] # get output object name self._generate_output_object_name(params, alignment_object_type, alignment_object_name) log('--->\nalignment object type: \n' + '{}'.format(alignment_object_type)) if re.match('^KBaseRNASeq.RNASeqAlignment-\d*', alignment_object_type): params.update({'alignment_ref': alignment_object_ref}) returnVal = self._process_rnaseq_alignment_object(params) report_output = self._generate_report( returnVal.get('expression_obj_ref'), params.get('workspace_name'), returnVal.get('result_directory')) returnVal.update(report_output) elif 
re.match('^KBaseRNASeq.RNASeqAlignmentSet-\d*', alignment_object_type) or \ re.match('^KBaseSets.ReadsAlignmentSet-\d*', alignment_object_type): params.update({'alignment_set_ref': alignment_object_ref}) returnVal = self._process_alignment_set_object( params, alignment_object_type) expression_matrix_refs = self._save_expression_matrix( returnVal['expression_obj_ref'], params.get('workspace_name')) returnVal.update(expression_matrix_refs) report_output = self._generate_report( returnVal['expression_obj_ref'], params.get('workspace_name'), returnVal['result_directory'], expression_matrix_refs['exprMatrix_FPKM_ref'], expression_matrix_refs['exprMatrix_TPM_ref']) returnVal.update(report_output) else: raise ValueError( 'Unsupported alignment object type (expected RNASeqAlignment, RNASeqAlignmentSet ' 'or ReadsAlignmentSet)\nObject info:\n{}'.format( alignment_object_info)) return returnVal
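# A minimal, standalone sketch of the FPKM -> TPM conversion performed by
# parse_FPKMtracking_calc_TPM in CufflinksUtils above: each gene's FPKM is rescaled
# by the sum of FPKM over all genes (times 1e6), and both metrics are stored
# log2(x + 1) transformed. The function below is illustrative only and is not part
# of this module; it takes a {gene_id: fpkm} dict instead of reading a
# genes.fpkm_tracking file.
import math

def fpkm_to_log2_tpm(fpkm_by_gene):
    """Return ({gene: log2(FPKM+1)}, {gene: log2(TPM+1)}) for a dict of raw FPKM values."""
    total_fpkm = sum(fpkm_by_gene.values())
    log2_fpkm = {g: math.log(v + 1, 2) for g, v in fpkm_by_gene.items()}
    if total_fpkm == 0.0:
        # mirrors the warning branch above: TPM cannot be computed when sum(FPKM) is 0
        return log2_fpkm, {g: 0.0 for g in fpkm_by_gene}
    log2_tpm = {g: math.log((v / total_fpkm) * 1e6 + 1, 2)
                for g, v in fpkm_by_gene.items()}
    return log2_fpkm, log2_tpm

# Example: two genes with FPKM 10 and 30 give TPM 250000 and 750000 before the log transform.
# fpkm_to_log2_tpm({'geneA': 10.0, 'geneB': 30.0})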
class FastaGFFToGenome: def __init__(self, config): self.cfg = config self.dfu = DataFileUtil(self.cfg.callbackURL) def import_file(self, params): # 1) validate parameters self._validate_import_file_params(params) # 2) construct the input directory staging area input_directory = os.path.join(self.cfg.sharedFolder, 'fast_gff_upload_' + str(uuid.uuid4())) os.makedirs(input_directory) file_paths = self._stage_input(params, input_directory) # 3) extract out the parameters params = self._set_parsed_params(params) # 4) do the upload result = self.upload_genome( shock_service_url=self.cfg.shockURL, handle_service_url=self.cfg.handleURL, workspace_service_url=self.cfg.workspaceURL, callback_url=self.cfg.callbackURL, input_fasta_file=file_paths["fasta_file"], input_gff_file=file_paths["gff_file"], workspace_name=params['workspace_name'], core_genome_name=params['genome_name'], scientific_name=params['scientific_name'], taxon_wsname=params['taxon_wsname'], taxon_reference=params['taxon_reference'], source=params['source'], genome_type=params['type'], release=params['release']) # 5) generate report output_data_ref = params['workspace_name'] + "/" + params['genome_name'] reportObj = { 'objects_created': [{ 'ref': output_data_ref, 'description': 'KBase Genome object' }], 'text_message': result['report_string'] } reportClient = KBaseReport(os.environ['SDK_CALLBACK_URL']) report_info = reportClient.create({ 'report': reportObj, 'workspace_name': params['workspace_name'] }) # 6) clear the temp directory shutil.rmtree(input_directory) # 7) return the result info = result['genome_info'] details = { 'genome_ref': str(info[6]) + '/' + str(info[0]) + '/' + str(info[4]), 'genome_info': info, 'report_name': report_info['name'], 'report_ref': report_info['ref'] } return details def upload_genome(self, shock_service_url=None, handle_service_url=None, workspace_service_url=None, callback_url=None, input_gff_file=None, input_fasta_file=None, workspace_name=None, core_genome_name=None, scientific_name="unknown_taxon", taxon_wsname='ReferenceTaxons', taxon_reference=None, source=None, release=None, genome_type=None): # retrieve taxon taxonomy, taxon_reference = self._retrieve_taxon( taxon_reference, taxon_wsname, scientific_name) # reading in Fasta file assembly = self._retrieve_fasta_file(input_fasta_file, core_genome_name, scientific_name, source) if taxon_reference is not None: assembly["taxon_ref"] = taxon_reference # reading in GFF file feature_list = self._retrieve_gff_file(input_gff_file) # compile links between features feature_hierarchy = self._generate_feature_hierarchy(feature_list) # retrieve genome feature list (genome_features_list, genome_mrnas_list, genome_cdss_list) = self._retrieve_genome_feature_list( feature_list, feature_hierarchy, assembly) # remove sequences before loading for contig in assembly["contigs"]: del assembly["contigs"][contig]["sequence"] aUtil = AssemblyUtil(callback_url) assembly_ref = aUtil.save_assembly_from_fasta({ 'file': { 'path': input_fasta_file, 'assembly_name': assembly['assembly_id'] }, 'workspace_name': workspace_name, 'assembly_name': assembly['assembly_id'] }) # generate genome info genome = self._gen_genome_info(core_genome_name, scientific_name, assembly_ref, genome_features_list, genome_cdss_list, genome_mrnas_list, source, assembly, taxon_reference, taxonomy, input_gff_file) workspace_id = self.dfu.ws_name_to_id(workspace_name) genome_info = self.dfu.save_objects({ "id": workspace_id, "objects": [{ "name": core_genome_name, "type": "KBaseGenomes.Genome", "data": 
genome }] })[0] report_string = '' return {'genome_info': genome_info, 'report_string': report_string} def _validate_import_file_params(self, params): """ validate_import_file_params: validates params passed to FastaGFFToGenome.import_file method """ # check for required parameters for p in ['workspace_name', 'genome_name', 'fasta_file', 'gff_file']: if p not in params: raise ValueError( '"{}" parameter is required, but missing'.format(p)) # one and only one of 'path', or 'shock_id' is required for key in ('fasta_file', 'gff_file'): file = params[key] if not isinstance(file, dict): raise ValueError( 'Required "{}" field must be a map/dict'.format(key)) n_valid_fields = 0 if 'path' in file and file['path'] is not None: n_valid_fields += 1 if 'shock_id' in file and file['shock_id'] is not None: n_valid_fields += 1 if 'ftp_url' in file and file['ftp_url'] is not None: n_valid_fields += 1 raise ValueError( 'FTP link is currently not supported for FastaGFFToGenome') if n_valid_fields < 1: error_msg = 'Required "{}" field must include one source: '.format( key) error_msg += 'path | shock_id' raise ValueError(error_msg) if n_valid_fields > 1: error_msg = 'Required "{}" field has too many sources specified: '.format( key) error_msg += str(file.keys()) raise ValueError(error_msg) # check for valid type param valid_types = ['Reference', 'User upload', 'Representative'] if params.get('type') and params['type'] not in valid_types: error_msg = 'Entered value for type is not one of the valid entries of ' error_msg += '[' + ''.join('"' + str(e) + '", ' for e in valid_types)[0:-2] + ']' raise ValueError(error_msg) def _set_parsed_params(self, params): log('Setting params') # default params default_params = { 'taxon_wsname': self.cfg.raw['taxon-workspace-name'], 'scientific_name': 'unknown_taxon', 'taxon_reference': None, 'source': 'User', 'release': None, 'type': 'User upload', 'metadata': {} } for field in default_params: if field not in params: params[field] = default_params[field] log(json.dumps(params, indent=1)) return params def _stage_input(self, params, input_directory): """ stage_input: Setup the input_directory by fetching the files and uncompressing if needed """ file_paths = dict() for key in ('fasta_file', 'gff_file'): file = params[key] file_path = None if 'path' in file and file['path'] is not None: local_file_path = file['path'] file_path = os.path.join(input_directory, os.path.basename(local_file_path)) log('Moving file from {} to {}'.format(local_file_path, file_path)) shutil.copy2(local_file_path, file_path) if 'shock_id' in file and file['shock_id'] is not None: # handle shock file log('Downloading file from SHOCK node: {}-{}'.format( self.cfg.sharedFolder, file['shock_id'])) sys.stdout.flush() file_name = self.dfu.shock_to_file({ 'file_path': input_directory, 'shock_id': file['shock_id'] })['node_file_name'] file_path = os.path.join(input_directory, file_name) # extract the file if it is compressed if file_path is not None: print("staged input file =" + file_path) sys.stdout.flush() dfUtil_result = self.dfu.unpack_file({'file_path': file_path}) file_paths[key] = dfUtil_result['file_path'] else: raise ValueError( 'No valid files could be extracted based on the input') return file_paths def _retrieve_taxon(self, taxon_reference, taxon_wsname, scientific_name): """ _retrieve_taxon: retrieve taxonomy and taxon_reference """ taxon_id = -1 taxon_object_name = "unknown_taxon" # retrieve lookup object if scientific name provided if (taxon_reference is None and scientific_name is not 
"unknown_taxon"): # retrieve taxon lookup object then find taxon id taxon_lookup = self.dfu.get_objects({ 'object_refs': [taxon_wsname + "/taxon_lookup"], 'ignore_errors': 0 })['data'][0]['data']['taxon_lookup'] if (scientific_name[0:3] in taxon_lookup and scientific_name in taxon_lookup[scientific_name[0:3]]): taxon_id = taxon_lookup[scientific_name[0:3]][scientific_name] taxon_object_name = "{}_taxon".format(str(taxon_id)) # retrieve Taxon object taxon_info = {} if (taxon_reference is None): taxon_info = self.dfu.get_objects({ 'object_refs': [taxon_wsname + "/" + taxon_object_name], 'ignore_errors': 0 })['data'][0] taxon_reference = "{}/{}/{}".format(taxon_info['info'][6], taxon_info['info'][0], taxon_info['info'][4]) else: taxon_info = self.dfu.get_objects({ "object_refs": [taxon_reference], 'ignore_errors': 0 })['data'][0] taxonomy = taxon_info['data']['scientific_lineage'] return taxonomy, taxon_reference def _retrieve_fasta_file(self, input_fasta_file, core_genome_name, scientific_name, source): """ _retrieve_fasta_file: retrieve info from fasta_file https://www.biostars.org/p/710/ """ log("Reading FASTA file") assembly = { "contigs": {}, "dna_size": 0, "gc_content": 0, "md5": [], "base_counts": {} } contig_seq_start = 0 input_file_handle = open(input_fasta_file, 'rb') # alternate header and sequence faiter = (x[1] for x in itertools.groupby(input_file_handle, lambda line: line[0] == ">")) for header in faiter: # drop the ">" header = header.next()[1:].strip() # join all sequence lines to one. seq = "".join(s.strip() for s in faiter.next()) try: fasta_header, fasta_description = header.split(' ', 1) except: fasta_header = header fasta_description = None # Handle record seq = seq.upper() # Build contig objects for Assembly seq_count = dict(collections.Counter(seq)) # to delete at end, but required for now contig_dict = {"sequence": seq} Ncount = 0 if "N" in seq_count: Ncount = seq_count["N"] contig_dict["Ncount"] = Ncount for character in seq_count: if character in assembly["base_counts"]: assembly["base_counts"][character] += seq_count[character] else: assembly["base_counts"][character] = seq_count[character] contig_seq_length = len(seq) assembly["dna_size"] += contig_seq_length contig_gc_length = seq.count("G") contig_gc_length += seq.count("C") contig_dict["gc_content"] = float("{0:.2f}".format( float(contig_gc_length) / float(contig_seq_length))) assembly["gc_content"] += contig_gc_length contig_dict["contig_id"] = fasta_header contig_dict["name"] = fasta_header contig_dict["length"] = contig_seq_length contig_dict["md5"] = hashlib.md5(seq).hexdigest() assembly["md5"].append(contig_dict["md5"]) if fasta_description is not None: contig_dict["description"] = fasta_description contig_dict["is_circular"] = "Unknown" contig_dict["start_position"] = contig_seq_start contig_dict["num_bytes"] = sys.getsizeof(contig_dict["sequence"]) assembly["contigs"][fasta_header] = contig_dict # used for start of next sequence and total gc_content contig_seq_start += contig_seq_length assembly["gc_content"] = float("{0:.2f}".format( float(assembly["gc_content"]) / float(contig_seq_start))) assembly["md5"] = hashlib.md5(",".join(assembly["md5"])).hexdigest() assembly["assembly_id"] = core_genome_name + "_assembly" assembly["name"] = scientific_name assembly["external_source"] = source assembly["external_source_id"] = os.path.basename(input_fasta_file) assembly["external_source_origination_date"] = str( os.stat(input_fasta_file).st_ctime) assembly["num_contigs"] = len(assembly["contigs"].keys()) 
assembly["type"] = "Unknown" assembly[ "notes"] = "Note MD5s are generated from uppercasing the sequences" return assembly def _retrieve_gff_file(self, input_gff_file): """ _retrieve_gff_file: retrieve info from gff_file """ log("Reading GFF file") feature_list = dict() is_phytozome = 0 is_patric = 0 gff_file_handle = open(input_gff_file, 'rb') current_line = gff_file_handle.readline() line_count = 0 while (current_line != ''): current_line = current_line.strip() if (current_line.isspace() or current_line == "" or current_line.startswith("#")): pass else: #Split line (contig_id, source_id, feature_type, start, end, score, strand, phase, attributes) = current_line.split('\t') #Checking to see if Phytozome if ("phytozome" in source_id or "Phytozome" in source_id): is_phytozome = 1 #Checking to see if Phytozome if ("PATRIC" in source_id): is_patric = 1 #PATRIC prepends their contig ids with some gibberish if (is_patric and "|" in contig_id): contig_id = contig_id.split("|", 1)[1] #Features grouped by contigs first if (contig_id not in feature_list): feature_list[contig_id] = list() #Populating basic feature object ftr = { 'contig': contig_id, 'source': source_id, 'type': feature_type, 'start': int(start), 'end': int(end), 'score': score, 'strand': strand, 'phase': phase, 'attributes': attributes } #Populating with attribute key-value pair #This is where the feature id is from for attribute in attributes.split(";"): attribute = attribute.strip() #Sometimes empty string if (attribute == ""): continue #Use of 1 to limit split as '=' character can also be made available later #Sometimes lack of "=", assume spaces instead if ("=" in attribute): key, value = attribute.split("=", 1) elif (" " in attribute): key, value = attribute.split(" ", 1) else: log("Warning: attribute " + attribute + " cannot be separated into key,value pair") ftr[key] = value feature_list[contig_id].append(ftr) current_line = gff_file_handle.readline() gff_file_handle.close() #Some GFF/GTF files don't use "ID" so we go through the possibilities feature_list = self._add_missing_identifiers(feature_list) #Most bacterial files have only CDSs #In order to work with prokaryotic and eukaryotic gene structure synonymously #Here we add feature dictionaries representing the parent gene and mRNAs feature_list = self._add_missing_parents(feature_list) #Phytozome has the annoying habit of editing their identifiers so we fix them if (is_phytozome): self._update_phytozome_features(feature_list) #All identifiers need to be checked so that they follow the same general rules #Rules are listed within the function itself feature_list = self._update_identifiers(feature_list) #If phytozome, the edited files need to be re-printed as GFF so that it works better with RNA-Seq pipeline if (is_phytozome): self._print_phytozome_gff(input_gff_file, feature_list) return feature_list def _add_missing_identifiers(self, feature_list): #General rule is to iterate through a range of possibilities if "ID" is missing for contig in feature_list.keys(): for i in range(len(feature_list[contig])): if ("ID" not in feature_list[contig][i]): for key in ("transcriptId", "proteinId", "PACid", "pacid", "Parent"): if (key in feature_list[contig][i]): feature_list[contig][i]['ID'] = feature_list[ contig][i][key] break #If the process fails, throw an error for ftr_type in ("gene", "mRNA", "CDS"): if (ftr_type not in feature_list[contig][i]): continue if ("ID" not in feature_list[contig][i]): log("Error: Cannot find unique ID to utilize in GFF attributes: "+ \ 
feature_list[contig][i]['contig']+"."+ \ feature_list[contig][i]['source']+"."+ \ feature_list[contig][i]['type']+": "+ \ feature_list[contig][i]['attributes']) return feature_list def _generate_feature_hierarchy(self, feature_list): feature_hierarchy = {contig: {} for contig in feature_list} #Need to remember mRNA/gene links for CDSs mRNA_gene_dict = {} exon_list_position_dict = {} for contig in feature_list: for i in range(len(feature_list[contig])): ftr = feature_list[contig][i] if ("gene" in ftr["type"]): feature_hierarchy[contig][ftr["ID"]] = { "utrs": [], "mrnas": [], "cdss": [], "index": i } if ("UTR" in ftr["type"]): feature_hierarchy[contig][mRNA_gene_dict[ ftr["Parent"]]]["utrs"].append({ "id": ftr["ID"], "index": i }) if ("RNA" in ftr["type"]): feature_hierarchy[contig][ftr["Parent"]]["mrnas"].append({ "id": ftr["ID"], "index": i, "cdss": [] }) mRNA_gene_dict[ftr["ID"]] = ftr["Parent"] exon_list_position_dict[ftr["ID"]] = len( feature_hierarchy[contig][ftr["Parent"]]["mrnas"]) - 1 if ("CDS" in ftr["type"]): feature_hierarchy[contig][mRNA_gene_dict[ftr["Parent"]]]["mrnas"]\ [exon_list_position_dict[ftr["Parent"]]]["cdss"].append( { "id": ftr["ID"], "index" : i } ) return feature_hierarchy def _add_missing_parents(self, feature_list): #General rules is if CDS or RNA missing parent, add them for contig in feature_list.keys(): ftrs = feature_list[contig] new_ftrs = [] for i in range(len(ftrs)): if ("Parent" not in ftrs[i]): #Assuming parent doesn't exist at all, so create de novo instead of trying to find it if ("RNA" in ftrs[i]["type"] or "CDS" in ftrs[i]["type"]): new_gene_ftr = copy.deepcopy(ftrs[i]) new_gene_ftr["type"] = "gene" ftrs[i]["Parent"] = new_gene_ftr["ID"] new_ftrs.append(new_gene_ftr) if ("CDS" in ftrs[i]["type"]): new_rna_ftr = copy.deepcopy(ftrs[i]) new_rna_ftr["type"] = "mRNA" new_ftrs.append(new_rna_ftr) ftrs[i]["Parent"] = new_rna_ftr["ID"] new_ftrs.append(ftrs[i]) feature_list[contig] = new_ftrs return feature_list def _update_phytozome_features(self, feature_list): #General rule is to use the "Name" field where possible #And update parent attribute correspondingly for contig in feature_list.keys(): feature_position_dict = {} for i in range(len(feature_list[contig])): #Maintain old_id for reference #Sometimes ID isn't available, so use PACid old_id = None for key in ("ID", "PACid", "pacid"): if (key in feature_list[contig][i]): old_id = feature_list[contig][i][key] break if (old_id is None): #This should be an error print ("Cannot find unique ID, PACid, or pacid in GFF attributes: ",\ feature_list[contig][i][contig],feature_list[contig][i][source],feature_list[contig][i][attributes]) continue #Retain old_id feature_position_dict[old_id] = i #In Phytozome, gene and mRNA have "Name" field, CDS do not if ("Name" in feature_list[contig][i]): feature_list[contig][i]["ID"] = feature_list[contig][i][ "Name"] if ("Parent" in feature_list[contig][i]): #Update Parent to match new ID of parent ftr feature_list[contig][i]["Parent"] = feature_list[contig][ feature_position_dict[feature_list[contig][i] ["Parent"]]]["ID"] return feature_list def _update_identifiers(self, feature_list): #General rules: #1) Genes keep identifier #2) RNAs keep identifier only if its different from gene, otherwise append ".mRNA" #3) CDS always uses RNA identifier with ".CDS" appended #4) CDS appended with an incremented digit CDS_count_dict = dict() mRNA_parent_dict = dict() for contig in feature_list.keys(): for ftr in feature_list[contig]: if ("Parent" in ftr): #Retain old_id of parents old_id 
= ftr["ID"] if (ftr["ID"] == ftr["Parent"] or "CDS" in ftr["type"]): ftr["ID"] = ftr["Parent"] + "." + ftr["type"] #link old to new ids for mRNA to use with CDS if ("RNA" in ftr["type"]): mRNA_parent_dict[old_id] = ftr["ID"] if ("CDS" in ftr["type"]): #Increment CDS identifier if (ftr["ID"] not in CDS_count_dict): CDS_count_dict[ftr["ID"]] = 1 else: CDS_count_dict[ftr["ID"]] += 1 ftr["ID"] = ftr["ID"] + "." + str( CDS_count_dict[ftr["ID"]]) #Recall new mRNA id for parent ftr["Parent"] = mRNA_parent_dict[ftr["Parent"]] return feature_list def _print_phytozome_gff(self, input_gff_file, feature_list): #Write modified feature ids to new file input_gff_file = input_gff_file.replace("gene", "edited_gene") + ".gz" try: print "Printing to new file: " + input_gff_file gff_file_handle = gzip.open(input_gff_file, 'wb') except: print "Failed to open" for contig in sorted(feature_list.iterkeys()): for ftr in feature_list[contig]: #Re-build attributes attributes_dict = {} for attribute in ftr["attributes"].split(";"): attribute = attribute.strip() #Sometimes empty string if (attribute == ""): continue #Use of 1 to limit split as '=' character can also be made available later #Sometimes lack of "=", assume spaces instead if ("=" in attribute): key, value = attribute.split("=", 1) elif (" " in attribute): key, value = attribute.split(" ", 1) else: log("Warning: attribute " + attribute + " cannot be separated into key,value pair") if (ftr[key] != value): value = ftr[key] attributes_dict[key] = value ftr["attributes"] = ";".join(key + "=" + attributes_dict[key] for key in attributes_dict.keys()) new_line = "\t".join( str(ftr[key]) for key in [ 'contig', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes' ]) gff_file_handle.write(new_line) gff_file_handle.close() return def _retrieve_genome_feature_list(self, feature_list, feature_hierarchy, assembly): genome_features_list = list() genome_mrnas_list = list() genome_cdss_list = list() genome_translation_issues = list() for contig in feature_hierarchy: for gene in feature_hierarchy[contig]: #We only iterate through the gene objects #And then for each gene object, retrieve the necessary mRNA and CDS objects indirectly ftr = feature_list[contig][feature_hierarchy[contig][gene] ["index"]] contig_sequence = assembly["contigs"][ ftr["contig"]]["sequence"] gene_ftr = self._convert_ftr_object( ftr, contig_sequence ) #reverse-complementation for negative strands done here #Add non-optional terms gene_ftr["mrnas"] = list() gene_ftr["cdss"] = list() gene_ftr["ontology_terms"] = dict() #Retaining longest sequences for gene feature longest_protein_length = 0 longest_protein_sequence = "" for mRNA in feature_hierarchy[contig][gene]["mrnas"]: ######################################################## # Construct mRNA Ftr ######################################################## ftr = feature_list[contig][mRNA["index"]] contig_sequence = assembly["contigs"][ ftr["contig"]]["sequence"] mRNA_ftr = self._convert_ftr_object( ftr, contig_sequence ) #reverse-complementation for negative strands done here #Modify mrna object for use in mrna array #Objects will be un-used until further notice mRNA_ftr['parent_gene'] = gene_ftr['id'] #If there are CDS, then New CDS ID without incrementation as they were aggregated if (len(mRNA['cdss']) > 0): mRNA_ftr['cds'] = mRNA_ftr['id'] + ".CDS" else: mRNA_ftr['cds'] = "" #Add to mrnas array genome_mrnas_list.append(mRNA_ftr) #Add ids to gene_ftr arrays gene_ftr["mrnas"].append(mRNA_ftr["id"]) 
######################################################## # Construct transcript, protein sequence, UTR, CDS locations ######################################################## #At time of writing, all of this aggregation should probably be done in a single function cds_exons_locations_array = list() cds_cdna_sequence = str() protein_sequence = str() if (len(mRNA["cdss"]) > 0): (cds_exons_locations_array, cds_cdna_sequence, protein_sequence) = \ self._cds_aggregation_translation(mRNA["cdss"],feature_list[contig],assembly,genome_translation_issues) UTRs = list() if ("utrs" in feature_hierarchy[contig][gene] and len(feature_hierarchy[contig][gene]["utrs"]) > 0): for UTR in feature_hierarchy[contig][gene]["utrs"]: ftr = feature_list[contig][UTR["index"]] if ("Parent" in ftr and ftr["Parent"] == mRNA_ftr["id"]): UTRs.append(ftr) mrna_exons_locations_array = copy.deepcopy( cds_exons_locations_array) mrna_transcript_sequence = str(cds_cdna_sequence) if (len(UTRs) > 0): (mrna_exons_locations_array, mrna_transcript_sequence) = \ self._utr_aggregation(UTRs,assembly,mrna_exons_locations_array,cds_cdna_sequence) #Update sequence and locations mRNA_ftr["dna_sequence"] = mrna_transcript_sequence mRNA_ftr["dna_sequence_length"] = len( mrna_transcript_sequence) mRNA_ftr["location"] = mrna_exons_locations_array mRNA_ftr["md5"] = hashlib.md5( mRNA_ftr["dna_sequence"]).hexdigest() #Remove DNA del mRNA_ftr["dna_sequence"] del mRNA_ftr["dna_sequence_length"] #Skip CDS if not present if (len(mRNA["cdss"]) == 0): continue #Remove asterix representing stop codon if present if (len(protein_sequence) > 0 and protein_sequence[-1] == '*'): protein_sequence = protein_sequence[:-1] #Save longest sequence if (len(protein_sequence) > longest_protein_length): longest_protein_length = len(protein_sequence) longest_protein_sequence = protein_sequence ######################################################## # Construct CDS Ftr ######################################################## CDS_ftr = dict() CDS_ftr['type'] = 'CDS' #New CDS ID without incrementation as they were aggregated CDS_ftr['id'] = mRNA_ftr['id'] + '.CDS' #Add gene/mrna links CDS_ftr['parent_gene'] = gene_ftr['id'] CDS_ftr['parent_mrna'] = mRNA_ftr['id'] #Update sequence and locations CDS_ftr["dna_sequence"] = cds_cdna_sequence CDS_ftr["dna_sequence_length"] = len(cds_cdna_sequence) CDS_ftr["location"] = cds_exons_locations_array CDS_ftr["md5"] = hashlib.md5( CDS_ftr["dna_sequence"]).hexdigest() #Add protein CDS_ftr["protein_translation"] = str( protein_sequence).upper() CDS_ftr["protein_translation_length"] = len( CDS_ftr["protein_translation"]) #Only generate md5 for dna sequences #CDS_ftr["md5"] = hashlib.md5(CDS_ftr["protein_translation"]).hexdigest() #Add empty non-optional fields for populating in future CDS_ftr["ontology_terms"] = dict() if ("aliases" not in CDS_ftr): CDS_ftr["aliases"] = list() if ("function" not in CDS_ftr): CDS_ftr["function"] = "" #Add to cdss array genome_cdss_list.append(CDS_ftr) #Add ids to gene_ftr arrays gene_ftr["cdss"].append(CDS_ftr["id"]) gene_ftr["protein_translation"] = longest_protein_sequence gene_ftr["protein_translation_length"] = longest_protein_length genome_features_list.append(gene_ftr) msg = "Genome features processed: {} genes, {} RNAs, and {} CDSs\n".format( len(genome_features_list), len(genome_mrnas_list), len(genome_cdss_list)) msg += "{} mRNA(s) had errors during translation".format( len(genome_translation_issues)) log(msg) return genome_features_list, genome_mrnas_list, genome_cdss_list def 
_gen_genome_info(self, core_genome_name, scientific_name, assembly_ref, genome_features_list, genome_cdss_list, genome_mrnas_list, source, assembly, taxon_reference, taxonomy, input_gff_file): """ _gen_genome_info: generate genome info """ genome = dict() genome["id"] = core_genome_name genome["scientific_name"] = scientific_name genome["assembly_ref"] = assembly_ref genome["features"] = genome_features_list genome["cdss"] = genome_cdss_list genome["mrnas"] = genome_mrnas_list genome["source"] = source genome["domain"] = "Eukaryota" genome["genetic_code"] = 1 genome["gc_content"] = assembly["gc_content"] genome["dna_size"] = assembly["dna_size"] if taxon_reference is not None: genome["taxon_ref"] = taxon_reference genome["taxonomy"] = taxonomy gff_file_to_shock = self.dfu.file_to_shock({ 'file_path': input_gff_file, 'make_handle': 1, 'pack': "gzip" }) gff_handle_ref = gff_file_to_shock['handle']['hid'] genome['gff_handle_ref'] = gff_handle_ref return genome def _convert_ftr_object(self, old_ftr, contig): new_ftr = dict() new_ftr["id"] = old_ftr["ID"] dna_sequence = Seq(contig[old_ftr["start"] - 1:old_ftr["end"]], IUPAC.ambiguous_dna) # reverse complement if (old_ftr["strand"] == "-"): dna_sequence = dna_sequence.reverse_complement() old_start = old_ftr["start"] old_ftr["start"] = old_ftr["end"] old_ftr["end"] = old_start new_ftr["dna_sequence"] = str(dna_sequence).upper() new_ftr["dna_sequence_length"] = len(dna_sequence) new_ftr["md5"] = hashlib.md5(str(dna_sequence)).hexdigest() new_ftr["location"] = [[ old_ftr["contig"], old_ftr["start"], old_ftr["strand"], len(dna_sequence) ]] new_ftr["type"] = old_ftr["type"] new_ftr["aliases"] = list() for key in ("transcriptId", "proteinId", "PACid", "pacid"): if (key in old_ftr.keys()): new_ftr["aliases"].append(key + ":" + old_ftr[key]) return new_ftr def _utr_aggregation(self, utr_list, assembly, exons, exon_sequence): #create copies of locations and transcript utrs_exons = list(exons) utr_exon_sequence = exon_sequence five_prime_dna_sequence = "" three_prime_dna_sequence = "" five_prime_locations = list() three_prime_locations = list() for UTR in (utr_list): contig_sequence = assembly["contigs"][UTR["contig"]]["sequence"] UTR_ftr = self._convert_ftr_object( UTR, contig_sequence ) #reverse-complementation for negative strands done here #aggregate sequences and locations if ("five_prime" in UTR_ftr["id"]): five_prime_dna_sequence += UTR_ftr["dna_sequence"] five_prime_locations.append(UTR_ftr["location"][0]) if ("three_prime" in UTR_ftr["id"]): three_prime_dna_sequence += UTR_ftr["dna_sequence"] three_prime_locations.append(UTR_ftr["location"][0]) #Handle five_prime UTRs if (len(five_prime_locations) > 0): #Sort UTRs by "start" (reverse-complement UTRs in Phytozome appear to be incorrectly ordered in the GFF file) five_prime_locations = sorted(five_prime_locations, key=lambda x: x[1]) #Merge last UTR with CDS if "next" to each other if( ( utrs_exons[0][2] == "+" and five_prime_locations[-1][1]+five_prime_locations[-1][3] == utrs_exons[0][1] ) or \ ( utrs_exons[0][2] == "-" and five_prime_locations[-1][1]-five_prime_locations[-1][3] == utrs_exons[0][1] ) ): #Remove last UTR last_five_prime_location = five_prime_locations[-1] five_prime_locations = five_prime_locations[:-1] #"Add" last UTR to first exon utrs_exons[0][1] = last_five_prime_location[1] utrs_exons[0][3] += last_five_prime_location[3] #Prepend other UTRs if available if (len(five_prime_locations) > 0): utrs_exons = five_prime_locations + utrs_exons utr_exon_sequence = 
five_prime_dna_sequence + utr_exon_sequence #Handle three_prime UTRs if (len(three_prime_locations) > 0): #Sort UTRs by "start" (reverse-complement UTRs in Phytozome appear to be incorrectly ordered in the GFF file three_prime_locations = sorted(three_prime_locations, key=lambda x: x[1]) #Merge first UTR with CDS if "next to each other if( ( utrs_exons[0][2] == "+" and utrs_exons[-1][1]+utrs_exons[-1][3] == three_prime_locations[0][1] ) or \ ( utrs_exons[0][2] == "-" and utrs_exons[-1][1]-utrs_exons[-1][3] == three_prime_locations[0][1] ) ): #Remove first UTR first_three_prime_location = three_prime_locations[0] three_prime_locations = three_prime_locations[1:] #"Add" first UTR to last exon utrs_exons[-1][3] += first_three_prime_location[3] #Append other UTRs if available if (len(three_prime_locations) > 0): utrs_exons = utrs_exons + three_prime_locations utr_exon_sequence += three_prime_dna_sequence return (utrs_exons, utr_exon_sequence) def _cds_aggregation_translation(self, cds_list, feature_list, assembly, issues): dna_sequence = "" locations = list() # collect phases, and lengths of exons # right now, this is only for the purpose of error reporting phases = list() exons = list() #Saving parent mRNA identifier Parent_mRNA = cds_list[0]["id"] for CDS in (cds_list): ftr = feature_list[CDS["index"]] phases.append(ftr["phase"]) Parent_mRNA = ftr["Parent"] contig_sequence = assembly["contigs"][ftr["contig"]]["sequence"] CDS_ftr = self._convert_ftr_object( ftr, contig_sequence ) #reverse-complementation for negative strands done here exons.append(len(CDS_ftr["dna_sequence"])) # Remove base(s) according to phase, but only for first CDS if (CDS == cds_list[0] and int(ftr["phase"]) != 0): log("Adjusting phase for first CDS: " + CDS["id"]) CDS_ftr["dna_sequence"] = CDS_ftr["dna_sequence"][ int(ftr["phase"]):] #aggregate sequences and locations dna_sequence += CDS_ftr["dna_sequence"] locations.append(CDS_ftr["location"][0]) # translate sequence dna_sequence_obj = Seq(dna_sequence, IUPAC.ambiguous_dna) rna_sequence = dna_sequence_obj.transcribe() # incomplete gene model with no start codon if str(rna_sequence.upper())[:3] not in codon_table.start_codons: msg = "Missing start codon for {}. Possibly incomplete gene model.".format( Parent_mRNA) log(msg) # You should never have this problem, needs to be reported rather than "fixed" codon_count = len(str(rna_sequence)) % 3 if codon_count != 0: msg = "Number of bases for RNA sequence for {} ".format( Parent_mRNA) msg += "is not divisible by 3. " msg += "The resulting protein may well be mis-translated." log(msg) issues.append(Parent_mRNA) protein_sequence = Seq("") try: protein_sequence = rna_sequence.translate() except CodonTable.TranslationError as te: log("TranslationError for: " + feature_object["id"], phases, exons, " : " + str(te)) return (locations, dna_sequence.upper(), str(protein_sequence).upper())
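# A minimal sketch of the CDS aggregation and translation step implemented in
# _cds_aggregation_translation above, assuming Biopython's Seq is available. Each
# CDS piece is expected to be already in transcript orientation (reverse-complemented
# for '-' strand features, as _convert_ftr_object does); the first piece is trimmed
# by its GFF phase before the pieces are joined and translated. Names below are
# illustrative and not part of this module.
from Bio.Seq import Seq

def translate_cds_pieces(cds_pieces):
    """cds_pieces: list of (dna_string, phase) in transcript order; returns (cdna, protein)."""
    cdna = ""
    for i, (seq, phase) in enumerate(cds_pieces):
        if i == 0 and int(phase) != 0:
            seq = seq[int(phase):]  # drop incomplete leading codon bases, as the code above does
        cdna += seq.upper()
    if len(cdna) % 3 != 0:
        print("Warning: coding sequence length not divisible by 3; translation may be unreliable")
    protein = str(Seq(cdna).translate())
    if protein.endswith("*"):
        protein = protein[:-1]  # strip the stop-codon asterisk, matching the behaviour above
    return cdna, protein

# Example: translate_cds_pieces([("ATGGCC", "0"), ("TGA", "0")]) -> ("ATGGCCTGA", "MA")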
class GenericsUtil: def _validate_fetch_data_params(self, params): """ _validate_fetch_data_params: validates params passed to fetch_data method """ log('start validating fetch_data params') # check for required parameters for p in ['obj_ref']: if p not in params: raise ValueError( '"{}" parameter is required, but missing'.format(p)) def _validate_import_matrix_from_excel_params(self, params): """ _validate_import_matrix_from_excel_params: validates params passed to import_matrix_from_excel method """ log('start validating import_matrix_from_excel params') # check for required parameters for p in ['obj_type', 'matrix_name', 'workspace_name']: if p not in params: raise ValueError( '"{}" parameter is required, but missing'.format(p)) obj_type = params.get('obj_type') if obj_type not in MATRIX_TYPE: raise ValueError('Unknown matrix object type: {}'.format(obj_type)) if params.get('input_file_path'): file_path = params.get('input_file_path') elif params.get('input_shock_id'): file_path = self.dfu.shock_to_file({ 'shock_id': params['input_shock_id'], 'file_path': self.scratch }).get('file_path') elif params.get('input_staging_file_path'): file_path = self.dfu.download_staging_file({ 'staging_file_subdir_path': params.get('input_staging_file_path') }).get('copy_file_path') else: error_msg = "Must supply either a input_shock_id or input_file_path " error_msg += "or input_staging_file_path" raise ValueError(error_msg) refs_key = [ 'col_conditionset_ref', 'row_conditionset_ref', 'genome_ref', 'diff_expr_matrix_ref' ] refs = {k: v for k, v in params.items() if k in refs_key} return (obj_type, file_path, params.get('workspace_name'), params.get('matrix_name'), refs) def _upload_to_shock(self, file_path): """ _upload_to_shock: upload target file to shock using DataFileUtil """ log('Start uploading file to shock: {}'.format(file_path)) file_to_shock_params = {'file_path': file_path, 'pack': 'zip'} shock_id = self.dfu.file_to_shock(file_to_shock_params).get('shock_id') return shock_id def _upload_dir_to_shock(self, directory): """ _upload_dir_to_shock: upload target dir to shock using DataFileUtil """ log('Start uploading directory to shock: {}'.format(directory)) file_to_shock_params = {'file_path': directory, 'pack': 'zip'} shock_file = self.dfu.file_to_shock(file_to_shock_params) shock_id = shock_file.get('shock_id') return shock_id def _mkdir_p(self, path): """ _mkdir_p: make directory for given path """ if not path: return try: os.makedirs(path) except OSError as exc: if exc.errno == errno.EEXIST and os.path.isdir(path): pass else: raise def _generate_html_string(self, df): """ _generate_html_string: generating a html string from df template used: https://developers.google.com/chart/interactive/docs/gallery/table https://developers.google.com/chart/interactive/docs/reference#formatters """ dtypes = df.dtypes columns = df.columns column_str = '' number_columns = [] for idx, column in enumerate(columns): dtype = dtypes[idx].name if 'int' in dtype or 'float' in dtype: column_str += "data.addColumn('number', '{}')\n".format(column) number_columns.append(column) else: column_str += "data.addColumn('string', '{}')\n".format(column) data_str = "data.addRows({})".format(df.values.tolist()) formatter_str = '' for number_column in number_columns: mean = round(df[number_column].mean(), 2) column_n = columns.tolist().index(number_column) formatter_str += "var formatter_{} = ".format(column_n) formatter_str += "new google.visualization.BarFormat({base: " formatter_str += str(mean) formatter_str += ", width: 
120});\n" formatter_str += "formatter_{}.format(data, {});\n".format( column_n, column_n) return column_str, data_str, formatter_str def _find_between(self, s, start, end): """ _find_between: find string in between start and end """ return re.search('{}(.*){}'.format(start, end), s).group(1) def _find_type_spec(self, obj_type): """ _find_type_spec: find body spec of type """ obj_type_name = self._find_between(obj_type, '\.', '\-') type_info = self.wsClient.get_type_info(obj_type) type_spec = type_info.get('spec_def') type_spec_list = type_spec.split(obj_type_name + ';') obj_type_spec = type_spec_list[0].split('structure')[-1] log('Found spec for {}\n{}\n'.format(obj_type, obj_type_spec)) return obj_type_spec def _find_constraints(self, obj_type): """ _find_constraints: retrieve constraints (@contains, rowsum, unique) """ type_info = self.wsClient.get_type_info(obj_type) type_desc = type_info.get('description') constraints = {'contains': [], 'rowsum': [], 'unique': []} unique = [ item.split('\n')[0].strip() for item in type_desc.split('@unique')[1:] ] constraints['unique'] = unique contains = [ item.split('\n')[0].strip() for item in type_desc.split('@contains')[1:] ] constraints['contains'] = contains return constraints def _find_generics_type(self, obj_type): """ _find_generics_type: try to find generics type in an object """ log('Start finding generics type and name') obj_type_spec = self._find_type_spec(obj_type) if not obj_type_spec: raise ValueError('Cannot retrieve spec for: {}'.format(obj_type)) generics_types = [ generics_type for generics_type in GENERICS_TYPE if generics_type in obj_type_spec ] if not generics_types: error_msg = 'Cannot find generics type in spec:\n{}\n'.format( obj_type_spec) raise ValueError(error_msg) generics_module = dict() for generics_type in generics_types: for item in obj_type_spec.split(generics_type)[1:]: generics_type_name = item.split(';')[0].strip().split( ' ')[-1].strip() generics_module.update({generics_type_name: generics_type}) log('Found generics type:\n{}\n'.format(generics_module)) return generics_module def _convert_data(self, data, generics_module): """ _convert_data: convert data to df based on data_type """ data_types = generics_module.values() if not set(GENERICS_TYPE) >= set(data_types): raise ValueError( 'Found unknown generics data type in:\n{}\n'.format( data_types)) if data_types == ['FloatMatrix2D']: key = generics_module.keys()[generics_module.values().index( 'FloatMatrix2D')] values = data[key]['values'] index = data[key]['row_ids'] columns = data[key]['col_ids'] df = pd.DataFrame(values, index=index, columns=columns) # elif 'FloatMatrix2D' in data_types: # default case # key = generics_module.keys()[generics_module.values().index('FloatMatrix2D')] # values = data[key]['values'] # index = data[key]['row_ids'] # columns = data[key]['col_ids'] # df = pd.DataFrame(values, index=index, columns=columns) else: raise ValueError('Unexpected Error') return df.to_json() def _retrieve_data(self, obj_ref, generics_module=None): """ _retrieve_data: retrieve object data and return a dataframe in json format """ log('Start retrieving data') obj_source = self.dfu.get_objects({"object_refs": [obj_ref]})['data'][0] obj_info = obj_source.get('info') obj_data = obj_source.get('data') if not generics_module: generics_module = self._find_generics_type(obj_info[2]) try: data = { k: v for k, v in obj_data.items() if k in generics_module.keys() } except KeyError: raise ValueError('Retrieved wrong generics type name') data_matrix = 
self._convert_data(data, generics_module) return data_matrix def _get_col_cond_list(self, col_mapping, col_conditionset_ref, cols): """ _get_col_cond_list: generate col condition list for excel """ col_cond_list = [] conditionset_data = self.dfu.get_objects( {"object_refs": [col_conditionset_ref]})['data'][0]['data'] col_condition_names = [ factor.get('factor') for factor in conditionset_data.get('factors') ] for col in cols: condition_id = col_mapping.get(col) if condition_id: col_cond_list.append( conditionset_data.get('conditions').get(condition_id)) else: col_cond_list.append([''] * len(col_condition_names)) col_cond_list = map(list, zip(*col_cond_list)) for idx, col_array in enumerate(col_cond_list): col_array.insert(0, col_condition_names[idx]) return col_cond_list def _get_row_cond_list(self, row_mapping, row_conditionset_ref, rows): """ _get_row_cond_list: generate row condition list for excel """ row_cond_list = [] conditionset_data = self.dfu.get_objects( {"object_refs": [row_conditionset_ref]})['data'][0]['data'] row_condition_names = [ factor.get('factor') for factor in conditionset_data.get('factors') ] row_cond_list.append(row_condition_names) for row in rows: condition_id = row_mapping.get(row) if condition_id: row_cond_list.append( conditionset_data.get('conditions').get(condition_id)) else: row_cond_list.append([''] * len(row_condition_names)) return row_cond_list def _get_data_list(self, cols, rows, values): """ _get_data_list: generate data value list for excel """ data_arrays = [] cols.insert(0, '') data_arrays.append(cols) for idx, row in enumerate(rows): values[idx].insert(0, row) data_arrays += values return data_arrays def _merge_cond_list(self, excel_list, col_cond_list, row_cond_list): """ _merge_cond_list: merge lists for excel """ col_cond_len = len(col_cond_list) for item in excel_list[:col_cond_len]: row_len = len(row_cond_list[0]) if row_cond_list else 0 item[0:0] = [''] * row_len if row_cond_list: for idx, item in enumerate(excel_list[col_cond_len:]): item[0:0] = row_cond_list[idx] def _is_number(s): """ _is_number: string is a numeric """ try: float(s) return True except ValueError: pass return False def _gen_excel(self, excel_list, obj_name): """ _gen_excel: create excel """ result_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(result_directory) file_path = os.path.join(result_directory, '{}.xlsx'.format(obj_name)) log('Start writing to file: {}'.format(file_path)) workbook = xlsxwriter.Workbook(file_path, {'nan_inf_to_errors': True}) worksheet = workbook.add_worksheet() row = 1 for data_entry in excel_list: for idx, cell_data in enumerate(data_entry): worksheet.write(row, idx, cell_data) row += 1 workbook.close() return file_path def _write_mapping_sheet(self, file_path, sheet_name, mapping, index): """ _write_mapping_sheet: write mapping to sheet """ df_dict = collections.OrderedDict() df_dict[index[0]] = [] df_dict[index[1]] = [] for key, value in mapping.items(): df_dict.get(index[0]).append(key) df_dict.get(index[1]).append(value) df = pd.DataFrame.from_dict(df_dict) with pd.ExcelWriter(file_path, engine='openpyxl') as writer: writer.book = load_workbook(file_path) df.to_excel(writer, sheet_name=sheet_name) def _filter_constraints(self, constraints, data): contains_constraints = constraints.get('contains') filtered_constraints = [] for contains_constraint in contains_constraints: in_values = contains_constraint.split(' ')[1:] missing_key = True for in_value in in_values: if in_value.startswith('values'): search_value = 
re.search('{}(.*){}'.format('\(', '\)'), in_value).group(1) unique_list = search_value.split('.') key = unique_list[0] elif ':' in in_value: key = in_value.split(':')[0] else: unique_list = in_value.split('.') key = unique_list[0] if key in data: missing_key = False break if missing_key: filtered_constraints.append(contains_constraint) for x in filtered_constraints: contains_constraints.remove(x) return constraints def _retrieve_value(self, data, value): log('Getting value for {}'.format(value)) retrieve_data = [] m_data = DotMap(data) if value.startswith( 'values'): # TODO: nested values e.g. values(values(ids)) search_value = re.search('{}(.*){}'.format('\(', '\)'), value).group(1) unique_list = search_value.split('.') m_data_cp = m_data.copy() for attr in unique_list: m_data_cp = getattr(m_data_cp, attr) retrieve_data = m_data_cp.values() elif ':' in value: obj_ref = getattr(m_data, value.split(':')[0]) if obj_ref: included = value.split(':')[1] included = '/' + included.replace('.', '/') ref_data = self.wsClient.get_objects2( {'objects': [{ 'ref': obj_ref, 'included': [included] }]})['data'][0]['data'] m_ref_data = DotMap(ref_data) if ref_data: if '*' not in included: for key in included.split('/')[1:]: m_ref_data = getattr(m_ref_data, key) else: keys = included.split('/')[1:] m_ref_data = [ x.get(keys[2]) for x in ref_data.get(keys[0]) ] # TODO: only works for 2 level nested data like '/features/[*]/id' retrieve_data = list(m_ref_data) else: unique_list = value.split('.') m_data_cp = m_data.copy() for attr in unique_list: m_data_cp = getattr(m_data_cp, attr) retrieve_data = list(m_data_cp) log('Retrieved value (first 20):\n{}\n'.format(retrieve_data[:20])) return retrieve_data def _generate_report(self, matrix_obj_ref, workspace_name): """ _generate_report: generate summary report """ report_params = { 'message': '', 'objects_created': [{ 'ref': matrix_obj_ref, 'description': 'Imported Matrix' }], 'workspace_name': workspace_name, 'report_object_name': 'import_matrix_from_excel_' + str(uuid.uuid4()) } kbase_report_client = KBaseReport(self.callback_url, token=self.token) output = kbase_report_client.create_extended_report(report_params) report_output = { 'report_name': output['name'], 'report_ref': output['ref'] } return report_output def _validate(self, constraints, data): """ _validate: validate data """ validated = True failed_constraints = {'contains': [], 'rowsum': [], 'unique': []} unique_constraints = constraints.get('unique') for unique_constraint in unique_constraints: retrieved_value = self._retrieve_value(data, unique_constraint) if len(set(retrieved_value)) != len(retrieved_value): validated = False failed_constraints['unique'].append(unique_constraint) contains_constraints = constraints.get('contains') for contains_constraint in contains_constraints: value = contains_constraint.split(' ')[0] in_values = contains_constraint.split(' ')[1:] retrieved_in_values = [] for in_value in in_values: retrieved_in_values += self._retrieve_value(data, in_value) if not (set(self._retrieve_value(data, value)) <= set(retrieved_in_values)): validated = False failed_constraints['contains'].append(contains_constraint) return validated, failed_constraints def _process_mapping_sheet(self, file_path, sheet_name): """ _process_mapping: process mapping sheet """ try: df = pd.read_excel(file_path, sheet_name=sheet_name) except XLRDError: return dict() else: mapping = {value[0]: value[1] for value in df.values.tolist()} return mapping def _process_conditionset_sheet(self, file_path, sheet_name, 
matrix_name, workspace_id): """ _process_conditionset_sheet: process condition set sheet """ try: df = pd.read_excel(file_path, sheet_name=sheet_name) except XLRDError: return '' else: obj_name = '{}_{}'.format(sheet_name, matrix_name) result_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(result_directory) file_path = os.path.join(result_directory, '{}.xlsx'.format(obj_name)) df.to_excel(file_path) import_condition_set_params = { 'output_obj_name': obj_name, 'output_ws_id': workspace_id, 'input_file_path': file_path } ref = self.cu.file_to_condition_set(import_condition_set_params) return ref.get('condition_set_ref') def _file_to_data(self, file_path, refs, matrix_name, workspace_id): log('Start reading and converting excel file data') data = refs try: pd.read_excel(file_path) except XLRDError: # TODO: convert csv file to excel log('Found csv file') raise ValueError('Please provide .xlsx file only') # processing data sheet try: df = pd.read_excel(file_path, sheet_name='data') except XLRDError: raise ValueError('Cannot find <data> sheetss') else: df.fillna(0, inplace=True) matrix_data = { 'row_ids': df.index.tolist(), 'col_ids': df.columns.tolist(), 'values': df.values.tolist() } data.update({'data': matrix_data}) # processing col/row_mapping col_mapping = self._process_mapping_sheet(file_path, 'col_mapping') data.update({'col_mapping': col_mapping}) row_mapping = self._process_mapping_sheet(file_path, 'row_mapping') data.update({'row_mapping': row_mapping}) # processing col/row_conditionset col_conditionset_ref = self._process_conditionset_sheet( file_path, 'col_conditionset', matrix_name, workspace_id) data.update({'col_conditionset_ref': col_conditionset_ref}) row_conditionset_ref = self._process_conditionset_sheet( file_path, 'row_conditionset', matrix_name, workspace_id) data.update({'row_conditionset_ref': row_conditionset_ref}) # processing metadata metadata = self._process_mapping_sheet(file_path, 'metadata') data.update(metadata) return data def _build_header_str(self, factor_names): header_str = '' width = 100.0 / len(factor_names) header_str += '<tr class="header">' header_str += '<th style="width:{0:.2f}%;">Feature ID</th>'.format( width) for factor_name in factor_names: header_str += '<th style="width:{0:.2f}%;"'.format(width) header_str += '>{}</th>'.format(factor_name) header_str += '</tr>' return header_str def _build_html_str(self, row_mapping, conditionset_data, row_ids): log('Start building html replacement') factor_names = [ factor.get('factor') for factor in conditionset_data.get('factors') ] header_str = self._build_header_str(factor_names) table_str = '' conditions = conditionset_data.get('conditions') for feature_id, factor_id in row_mapping.items(): if feature_id in row_ids: feature_conditions = conditions.get(factor_id) table_str += '<tr>' table_str += '<td>{}</td>'.format(feature_id) for feature_condition in feature_conditions: table_str += '<td>{}</td>'.format(feature_condition) table_str += '</tr>' return header_str, table_str def _generate_search_html_report(self, header_str, table_str): html_report = list() output_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(output_directory) result_file_path = os.path.join(output_directory, 'search.html') shutil.copy2(os.path.join(os.path.dirname(__file__), 'kbase_icon.png'), output_directory) shutil.copy2( os.path.join(os.path.dirname(__file__), 'search_icon.png'), output_directory) with open(result_file_path, 'w') as result_file: with open( 
os.path.join(os.path.dirname(__file__), 'search_template.html'), 'r') as report_template_file: report_template = report_template_file.read() report_template = report_template.replace( '//HEADER_STR', header_str) report_template = report_template.replace( '//TABLE_STR', table_str) result_file.write(report_template) report_shock_id = self.dfu.file_to_shock({ 'file_path': output_directory, 'pack': 'zip' })['shock_id'] html_report.append({ 'shock_id': report_shock_id, 'name': os.path.basename(result_file_path), 'label': os.path.basename(result_file_path), 'description': 'HTML summary report for Search Matrix App' }) return html_report def _generate_search_report(self, header_str, table_str, workspace_name): log('Start creating report') output_html_files = self._generate_search_html_report( header_str, table_str) report_params = { 'message': '', 'workspace_name': workspace_name, 'html_links': output_html_files, 'direct_html_link_index': 0, 'html_window_height': 366, 'report_object_name': 'kb_matrix_filter_report_' + str(uuid.uuid4()) } kbase_report_client = KBaseReport(self.callback_url, token=self.token) output = kbase_report_client.create_extended_report(report_params) report_output = { 'report_name': output['name'], 'report_ref': output['ref'] } return report_output def _filter_value_data(self, value_data, feature_ids): filtered_value_data = dict() filtered_value_data['col_ids'] = value_data['col_ids'] feature_ids = feature_ids.split(',') filtered_value_data['row_ids'] = feature_ids filtered_value_data['values'] = list() values = value_data['values'] row_ids = value_data['row_ids'] for feature_id in feature_ids: idx = row_ids.index(feature_id) filtered_value_data['values'].append(values[idx]) return filtered_value_data def __init__(self, config): self.ws_url = config["workspace-url"] self.callback_url = config['SDK_CALLBACK_URL'] self.token = config['KB_AUTH_TOKEN'] self.shock_url = config['shock-url'] self.srv_wiz_url = config['srv-wiz-url'] self.scratch = config['scratch'] self.dfu = DataFileUtil(self.callback_url) self.wsClient = workspaceService(self.ws_url, token=self.token) self.cu = ConditionUtils(self.callback_url, service_ver="dev") def filter_matrix(self, params): """ filter_matrix: create sub-matrix based on input feature_ids or group by factor name arguments: matrix_obj_ref: object reference of a matrix workspace_name: workspace name feature_ids: string of feature ids that result matrix contains filtered_matrix_name: name of newly created filtered matrix object """ matrix_obj_ref = params.get('matrix_obj_ref') workspace_name = params.get('workspace_name') feature_ids = params.get('feature_ids') filtered_matrix_name = params.get('filtered_matrix_name') matrix_source = self.dfu.get_objects({"object_refs": [matrix_obj_ref]})['data'][0] matrix_info = matrix_source.get('info') matrix_data = matrix_source.get('data') matrix_type = self._find_between(matrix_info[2], '\.', '\-') value_data = matrix_data.get('data') filtered_value_data = self._filter_value_data(value_data, feature_ids) matrix_data['data'] = filtered_value_data if not isinstance(workspace_name, int): workspace_id = self.dfu.ws_name_to_id(workspace_name) else: workspace_id = workspace_name filtered_matrix_obj_ref = self.save_object({ 'obj_type': 'KBaseMatrices.{}'.format(matrix_type), 'obj_name': filtered_matrix_name, 'data': matrix_data, 'workspace_name': workspace_id })['obj_ref'] returnVal = {'matrix_obj_refs': [filtered_matrix_obj_ref]} report_output = self._generate_report(filtered_matrix_obj_ref, workspace_name) 
returnVal.update(report_output) return returnVal def search_matrix(self, params): """ search_matrix: generate a HTML report that allows users to select feature ids arguments: matrix_obj_ref: object reference of a matrix workspace_name: workspace name """ matrix_obj_ref = params.get('matrix_obj_ref') workspace_name = params.get('workspace_name') matrix_source = self.dfu.get_objects({"object_refs": [matrix_obj_ref]})['data'][0] matrix_data = matrix_source.get('data') row_mapping = matrix_data.get('row_mapping') row_conditionset_ref = matrix_data.get('row_conditionset_ref') row_ids = matrix_data['data']['row_ids'] if not (row_mapping and row_conditionset_ref): raise ValueError( 'Matrix obejct is missing either row_mapping or row_conditionset_ref' ) conditionset_data = self.dfu.get_objects( {"object_refs": [row_conditionset_ref]})['data'][0]['data'] header_str, table_str = self._build_html_str(row_mapping, conditionset_data, row_ids) returnVal = self._generate_search_report(header_str, table_str, workspace_name) return returnVal def import_matrix_from_excel(self, params): """ import_matrix_from_excel: import matrix object from excel arguments: obj_type: one of ExpressionMatrix, FitnessMatrix, DifferentialExpressionMatrix matrix_name: matrix object name workspace_name: workspace name matrix object to be saved to input_shock_id: file shock id or input_file_path: absolute file path or input_staging_file_path: staging area file path optional arguments: col_conditionset_ref: column ConditionSet reference row_conditionset_ref: row ConditionSet reference genome_ref: genome reference matrix_obj_ref: Matrix reference """ (obj_type, file_path, workspace_name, matrix_name, refs) = self._validate_import_matrix_from_excel_params(params) if not isinstance(workspace_name, int): workspace_id = self.dfu.ws_name_to_id(workspace_name) else: workspace_id = workspace_name data = self._file_to_data(file_path, refs, matrix_name, workspace_id) matrix_obj_ref = self.save_object({ 'obj_type': 'KBaseMatrices.{}'.format(obj_type), 'obj_name': matrix_name, 'data': data, 'workspace_name': workspace_id })['obj_ref'] returnVal = {'matrix_obj_ref': matrix_obj_ref} report_output = self._generate_report(matrix_obj_ref, workspace_name) returnVal.update(report_output) return returnVal def save_object(self, params): """ save_object: validate data constraints and save matrix object arguments: obj_type: saving object data type obj_name: saving object name data: data to be saved workspace_name: workspace name matrix object to be saved to return: obj_ref: object reference """ log('Starting saving object') obj_type = params.get('obj_type') module_name = obj_type.split('.')[0] type_name = obj_type.split('.')[1] types = self.wsClient.get_module_info({ 'mod': module_name }).get('types') for module_type in types: if self._find_between(module_type, '\.', '\-') == type_name: obj_type = module_type break data = dict((k, v) for k, v in params.get('data').iteritems() if v) validate = self.validate_data({'obj_type': obj_type, 'data': data}) if not validate.get('validated'): log('Data failed type checking') failed_constraints = validate.get('failed_constraints') error_msg = 'Object {} failed type checking:\n'.format( params.get('obj_name')) if failed_constraints.get('unique'): unique_values = failed_constraints.get('unique') error_msg += 'Object should have unique field: {}\n'.format( unique_values) if failed_constraints.get('contains'): contained_values = failed_constraints.get('contains') for contained_value in contained_values: subset_value = 
contained_value.split(' ')[0] super_value = ' '.join(contained_value.split(' ')[1:]) error_msg += 'Object field [{}] should contain field [{}]\n'.format( super_value, subset_value) raise ValueError(error_msg) workspace_name = params.get('workspace_name') if not isinstance(workspace_name, int): ws_name_id = self.dfu.ws_name_to_id(workspace_name) else: ws_name_id = workspace_name info = self.dfu.save_objects({ "id": ws_name_id, "objects": [{ "type": obj_type, "data": data, "name": params.get('obj_name') }] })[0] return {"obj_ref": "%s/%s/%s" % (info[6], info[0], info[4])} def validate_data(self, params): """ validate_data: validate data arguments: obj_type: obj type e.g.: 'KBaseMatrices.ExpressionMatrix-1.1' data: obj data to be validated return: validated: True or False """ constraints = self._find_constraints(params.get('obj_type')) data = params.get('data') constraints = self._filter_constraints(constraints, data) validated, failed_constraints = self._validate(constraints, data) returnVal = { 'validated': validated, 'failed_constraints': failed_constraints } return returnVal def generate_matrix_html(self, params): """ generate_matrix_html: generate a html page for given data arguments: df: a pandas dataframe return: html_string: html as a string format """ column_str, data_str, formatter_str = self._generate_html_string( params.get('df')) with open( os.path.join(os.path.dirname(__file__), 'matrix_page_template.html'), 'r') as matrix_page_template_file: html_string = matrix_page_template_file.read() html_string = html_string.replace('// ADD_COL', column_str) html_string = html_string.replace('// ADD_DATA', data_str) html_string = html_string.replace('// ADD_FORMATTER', formatter_str) returnVal = {'html_string': html_string} return returnVal def fetch_data(self, params): """ fetch_data: fetch generics data as pandas dataframe for a generics data object arguments: obj_ref: generics object reference optional arguments: generics_module: the generics data module to be retrieved from e.g. for an given data type like below: typedef structure { FloatMatrix2D data; condition_set_ref condition_set_ref; } SomeGenericsMatrix; generics_module should be {'data': 'FloatMatrix2D', 'condition_set_ref': 'condition_set_ref'} return: data_matrix: a pandas dataframe in json format """ log('--->\nrunning GenericsUtil.fetch_data\n' + 'params:\n{}'.format(json.dumps(params, indent=1))) self._validate_fetch_data_params(params) try: data_matrix = self._retrieve_data(params.get('obj_ref'), params.get('generics_module')) except Exception: error_msg = 'Running fetch_data returned an error:\n{}\n'.format( traceback.format_exc()) error_msg += 'Please try to specify generics type and name as generics_module\n' raise ValueError(error_msg) returnVal = {'data_matrix': data_matrix} return returnVal def export_matrix(self, params): """ export_matrix: univeral downloader for matrix data object arguments: obj_ref: generics object reference optional arguments: generics_module: select the generics data to be retrieved from e.g. 
for a given data type like below:
        typedef structure {
            FloatMatrix2D data;
            condition_set_ref condition_set_ref;
        } SomeGenericsMatrix;
        and only data is needed
        generics_module should be
        {'data': 'FloatMatrix2D'}
        """
        log('Start exporting matrix')

        if 'input_ref' in params:
            params['obj_ref'] = params.pop('input_ref')

        obj_source = self.dfu.get_objects(
            {"object_refs": [params.get('obj_ref')]})['data'][0]
        obj_data = obj_source.get('data')

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)
        file_path = os.path.join(result_directory,
                                 '{}.xlsx'.format(obj_source.get('info')[1]))

        data_matrix = self.fetch_data(params).get('data_matrix')
        df = pd.read_json(data_matrix)
        df.to_excel(file_path, sheet_name='data')

        if obj_data.get('col_mapping'):
            self._write_mapping_sheet(file_path, 'col_mapping',
                                      obj_data.get('col_mapping'),
                                      ['col_name', 'condition_name'])
            obj_data.pop('col_mapping')

        if obj_data.get('row_mapping'):
            self._write_mapping_sheet(file_path, 'row_mapping',
                                      obj_data.get('row_mapping'),
                                      ['row_name', 'condition_name'])
            obj_data.pop('row_mapping')

        try:
            obj_data.pop('data')
        except KeyError:
            log('Missing key [data]')

        self._write_mapping_sheet(file_path, 'metadata', obj_data,
                                  ['name', 'value'])

        shock_id = self._upload_to_shock(file_path)

        return {'shock_id': shock_id}
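# --- Illustrative usage sketch (not from the original module) ---
# fetch_data() returns the FloatMatrix2D portion of a generics object as a pandas
# dataframe serialized to JSON, which export_matrix() above reads back with
# pd.read_json(). A hedged caller-side example; the config values and the object
# reference below are placeholders, not real data.
#
# import pandas as pd
#
# gu = GenericsUtil(config)   # config must supply workspace-url, SDK_CALLBACK_URL, scratch, etc.
# ret = gu.fetch_data({
#     'obj_ref': '1/2/3',                              # placeholder object reference
#     'generics_module': {'data': 'FloatMatrix2D'},    # optional hint when auto-detection fails
# })
# df = pd.read_json(ret['data_matrix'])
# print(df.shape)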
class EditAlignmentSet:
    """
    Contains a set of functions for editing reads alignment sets.
    """

    PARAM_IN_WS_NAME_ID = 'workspace_name'
    PARAM_IN_OBJ_NAME_ID = 'output_object_name'
    PARAM_IN_ALIGNSET_REF = 'alignment_set_ref'
    PARAM_IN_ALIGNS_ADD = 'alignments_to_add'
    PARAM_IN_ALIGNS_RM = 'alignments_to_remove'

    def __init__(self, config, logger=None):
        self.config = config
        self.logger = logger
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.scratch = os.path.join(config['scratch'], 'EAS_' + str(uuid.uuid4()))
        self.ws_url = config['workspace-url']
        self.ws_client = Workspace(self.ws_url)
        self.dfu = DataFileUtil(self.callback_url)
        self.setAPI = SetAPI(self.callback_url)

    def _process_params(self, params):
        """
        validates params passed to the edit_alignment_set method
        """
        for p in [
                self.PARAM_IN_ALIGNSET_REF, self.PARAM_IN_OBJ_NAME_ID,
                self.PARAM_IN_WS_NAME_ID
        ]:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

        ws_name_id = params.get(self.PARAM_IN_WS_NAME_ID)
        if not isinstance(ws_name_id, int):
            try:
                ws_name_id = self.dfu.ws_name_to_id(ws_name_id)
            except DFUError as se:
                prefix = se.message.split('.')[0]
                raise ValueError(prefix)

        alignments_to_add = params.get(self.PARAM_IN_ALIGNS_ADD)
        alignments_to_remove = params.get(self.PARAM_IN_ALIGNS_RM)

        if alignments_to_add is None and alignments_to_remove is None:
            raise ValueError(
                'Either "alignments_to_remove" or "alignments_to_add" should be given')

        return ws_name_id

    def _get_type_from_obj_info(self, info):
        return info[2].split('-')[0]

    def _get_obj_info(self, ref):
        return self.ws_client.get_object_info3({'objects': [{
            'ref': ref
        }]})['infos'][0]

    def _get_set_items(self, alignment_set_ref):
        obj_info = self._get_obj_info(alignment_set_ref)
        obj_type = self._get_type_from_obj_info(obj_info)

        if obj_type in ['KBaseSets.ReadsAlignmentSet']:
            set_data = self.setAPI.get_reads_alignment_set_v1(
                {'ref': alignment_set_ref})
            items = set_data['data']['items']
        elif obj_type in ['KBaseRNASeq.RNASeqAlignmentSet']:
            alignmentset_obj = self.ws_client.get_objects2(
                {'objects': [{
                    'ref': alignment_set_ref
                }]})['data'][0]
            # Add each alignment object to align_item and add it to items list
            items = list()
            for alignment_ref in alignmentset_obj['data']['sample_alignments']:
                align_item = dict()
                align_item['ref'] = alignment_ref
                items.append(align_item)
        else:
            raise ValueError(
                '"alignment_set_ref" should be of type KBaseSets.ReadsAlignmentSet or ' +
                'KBaseRNASeq.RNASeqAlignmentSet')

        return items

    def _add_alignments(self, alignment_set_items, alignment_refs_list):
        for alignment_ref in alignment_refs_list:
            found = False
            for set_item in alignment_set_items:
                if set_item.get('ref') == alignment_ref:
                    print('{} already in the input Alignment Set. Not added'.
format(alignment_ref)) found = True break if not found: alignment_set_items.append({'ref': alignment_ref}) return alignment_set_items def _remove_alignments(self, input_alignment_set, alignment_set_items, alignments_to_remove): for input_item in input_alignment_set: if not (input_item.get('ref') in alignments_to_remove): alignment_set_items.append(input_item) return alignment_set_items def _save_alignment_set(self, ws_name, obj_name, set_data): res = self.setAPI.save_reads_alignment_set_v1({ "workspace": ws_name, "output_object_name": obj_name, "data": set_data }) return res.get('set_ref') def edit_alignment_set(self, params): ws_name_id = self._process_params(params) obj_name = params.get(self.PARAM_IN_OBJ_NAME_ID) alignment_set_ref = params.get(self.PARAM_IN_ALIGNSET_REF) print('INPUT ALIGNMENT SET REF: ' + alignment_set_ref) input_alignment_set = self._get_set_items(alignment_set_ref) alignments_to_remove = params.get(self.PARAM_IN_ALIGNS_RM, None) alignments_to_add = params.get(self.PARAM_IN_ALIGNS_ADD, None) set_items = list() if alignments_to_remove is not None: set_items = self._remove_alignments(input_alignment_set, set_items, alignments_to_remove) if alignments_to_add is not None: set_items = self._add_alignments(set_items, alignments_to_add) set_data = { 'description': 'Edited from {}'.format(alignment_set_ref), 'items': set_items } output_alignment_set_ref = self._save_alignment_set( ws_name_id, obj_name, set_data) return output_alignment_set_ref
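# --- Illustrative sketch (not from the original module) ---
# When 'alignments_to_remove' is given, edit_alignment_set() keeps the existing
# items minus those refs; refs from 'alignments_to_add' are then appended unless
# already present. A small standalone function capturing that remove-then-add
# behaviour; the function name and sample refs are invented for illustration.
def merge_set_items(current_refs, refs_to_add=None, refs_to_remove=None):
    """Return alignment set items as {'ref': ...} dicts after removals/additions."""
    refs_to_remove = set(refs_to_remove or [])
    items = [{'ref': r} for r in current_refs if r not in refs_to_remove]
    present = set(item['ref'] for item in items)
    for ref in (refs_to_add or []):
        if ref not in present:
            items.append({'ref': ref})
            present.add(ref)
    return items

# merge_set_items(['1/2/3', '1/4/5'], refs_to_add=['1/6/7'], refs_to_remove=['1/2/3'])
# -> [{'ref': '1/4/5'}, {'ref': '1/6/7'}]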
def upload_genome(shock_service_url=None,
                  handle_service_url=None,
                  workspace_service_url=None,
                  callback_url=None,
                  input_gff_file=None,
                  input_fasta_file=None,
                  workspace_name=None,
                  core_genome_name=None,
                  scientific_name="unknown_taxon",
                  taxon_wsname='ReferenceTaxons',
                  taxon_reference=None,
                  source=None,
                  release=None,
                  genome_type=None):

    assembly_ref = None
    gff_handle_ref = None

    time_string = str(
        datetime.datetime.fromtimestamp(
            time.time()).strftime('%Y_%m_%d_%H_%M_%S'))

    dfUtil = DataFileUtil(callback_url)

    ###########################################
    #Retrieve taxon
    #Taxon lookup dependent on full genus
    #Example: Athaliana Arabidopsis thaliana
    ###########################################

    #default to taxon_id = -1
    taxon_object_name = "unknown_taxon"

    #Retrieve lookup object if scientific name provided
    if (taxon_reference is None and scientific_name != "unknown_taxon"):
        #Need to retrieve taxon lookup object then find taxon id
        taxon_lookup = dfUtil.get_objects({
            'object_refs': [taxon_wsname + "/taxon_lookup"],
            'ignore_errors': 0
        })['data'][0]['data']['taxon_lookup']

        if (scientific_name[0:3] in taxon_lookup and
                scientific_name in taxon_lookup[scientific_name[0:3]]):
            taxon_id = taxon_lookup[scientific_name[0:3]][scientific_name]
            taxon_object_name = "%s_taxon" % (str(taxon_id))

    #Retrieve Taxon object
    taxon_info = {}
    if (taxon_reference is None):
        taxon_info = dfUtil.get_objects({
            'object_refs': [taxon_wsname + "/" + taxon_object_name],
            'ignore_errors': 0
        })['data'][0]
        taxon_reference = "%s/%s/%s" % (taxon_info['info'][6],
                                        taxon_info['info'][0],
                                        taxon_info['info'][4])
    else:
        taxon_info = dfUtil.get_objects({
            'object_refs': [taxon_reference],
            'ignore_errors': 0
        })['data'][0]

    taxonomy = taxon_info['data']['scientific_lineage']
    ###########################################
    #End taxonomy retrieval
    ###########################################

    ###########################################
    #Create logger
    ###########################################
    logger = logging.getLogger(__file__)
    logger.setLevel(logging.INFO)

    # send messages to sys.stderr
    streamHandler = logging.StreamHandler(sys.stderr)
    formatter = logging.Formatter(
        "%(asctime)s - %(filename)s - %(lineno)d - %(levelname)s - %(message)s"
    )
    formatter.converter = time.gmtime
    streamHandler.setFormatter(formatter)
    logger.addHandler(streamHandler)
    ###########################################
    #End logger creation
    ###########################################

    ##########################################
    #Reading in Fasta file, Code taken from https://www.biostars.org/p/710/
    ##########################################
    logger.info("Reading FASTA file.")

    assembly = {
        "contigs": {},
        "dna_size": 0,
        "gc_content": 0,
        "md5": [],
        "base_counts": {}
    }
    contig_seq_start = 0

    input_file_handle = open(input_fasta_file, 'rb')

    # alternate header and sequence
    faiter = (x[1] for x in itertools.groupby(input_file_handle,
                                              lambda line: line[0] == ">"))
    for header in faiter:
        # drop the ">"
        header = header.next()[1:].strip()
        # join all sequence lines to one.
seq = "".join(s.strip() for s in faiter.next()) try: fasta_header, fasta_description = header.split(' ', 1) except: fasta_header = header fasta_description = None #Handle record seq = seq.upper() #Build contig objects for Assembly seq_count = dict(collections.Counter(seq)) #to delete at end, but required for now contig_dict = {"sequence": seq} Ncount = 0 if "N" in seq_count: Ncount = seq_count["N"] contig_dict["Ncount"] = Ncount for character in seq_count: if character in assembly["base_counts"]: assembly["base_counts"][character] += seq_count[character] else: assembly["base_counts"][character] = seq_count[character] contig_seq_length = len(seq) assembly["dna_size"] += contig_seq_length contig_gc_length = seq.count("G") contig_gc_length += seq.count("C") contig_dict["gc_content"] = float("{0:.2f}".format( float(contig_gc_length) / float(contig_seq_length))) assembly["gc_content"] += contig_gc_length contig_dict["contig_id"] = fasta_header contig_dict["name"] = fasta_header contig_dict["length"] = contig_seq_length contig_dict["md5"] = hashlib.md5(seq).hexdigest() assembly["md5"].append(contig_dict["md5"]) if fasta_description is not None: contig_dict["description"] = fasta_description contig_dict["is_circular"] = "Unknown" contig_dict["start_position"] = contig_seq_start contig_dict["num_bytes"] = sys.getsizeof(contig_dict["sequence"]) assembly["contigs"][fasta_header] = contig_dict #used for start of next sequence and total gc_content contig_seq_start += contig_seq_length assembly["gc_content"] = float("{0:.2f}".format( float(assembly["gc_content"]) / float(contig_seq_start))) assembly["md5"] = hashlib.md5(",".join(assembly["md5"])).hexdigest() assembly["assembly_id"] = core_genome_name + "_assembly" assembly["name"] = scientific_name assembly["external_source"] = source assembly["external_source_id"] = os.path.basename(input_fasta_file) assembly["external_source_origination_date"] = str( os.stat(input_fasta_file).st_ctime) assembly["num_contigs"] = len(assembly["contigs"].keys()) assembly["type"] = "Unknown" assembly[ "notes"] = "Note MD5s are generated from uppercasing the sequences" if taxon_reference is not None: assembly["taxon_ref"] = taxon_reference logger.info("Reading GFF file.") header = list() feature_list = dict() original_CDS_count = dict() original_feature_ids = dict() # gff_file_handle = gzip.open(input_gff_file, 'rb') gff_file_handle = open(input_gff_file, 'rb') current_line = gff_file_handle.readline() gff_object = dict() while (current_line != ''): current_line = current_line.strip() if (current_line.startswith("##") or current_line.startswith("#!")): header.append(current_line) if ('headers' not in gff_object): gff_object['headers'] = list() gff_object['headers'].append(current_line) else: if ('features' not in gff_object): gff_object['features'] = list() contig_id, source_id, feature_type, start, end, score, strand, phase, attributes = current_line.split( '\t') attributes_dict = dict() for attribute in attributes.split(";"): if (attribute == "" or "=" not in attribute): continue key, value = attribute.split("=", 1) attributes_dict[key] = value #ID should be transferred from Name or Parent old_id = None for key in ("ID", "PACid", "pacid"): if (key in attributes_dict): old_id = attributes_dict[key] break if (old_id is None): eprint( "Cannot find unique ID, PACid, or pacid in GFF attributes: " + attributes) continue if ("Name" in attributes_dict): attributes_dict["ID"] = attributes_dict["Name"] else: attributes_dict["ID"] = original_feature_ids[ attributes_dict["Parent"]] 
+ "." + feature_type #if CDS have to increment if (feature_type == "CDS"): if (attributes_dict["ID"] not in original_CDS_count): original_CDS_count[attributes_dict["ID"]] = 1 else: original_CDS_count[attributes_dict["ID"]] += 1 attributes_dict["ID"] += "." + str( original_CDS_count[attributes_dict["ID"]]) #Update parent if ("Parent" in attributes_dict): attributes_dict["Parent"] = original_feature_ids[ attributes_dict["Parent"]] original_feature_ids[old_id] = attributes_dict["ID"] #recreate line for GFF partial_line, attributes = current_line.rsplit('\t', 1) new_line = partial_line + "\t" + ";".join( key + "=" + attributes_dict[key] for key in attributes_dict.keys()) gff_object['features'].append(new_line) if (contig_id not in assembly["contigs"]): logger.warn("Missing contig: " + contig_id) if (contig_id not in feature_list): feature_list[contig_id] = list() feature = { 'type': feature_type, 'start': int(start), 'end': int(end), 'score': score, 'strand': strand, 'phase': phase } for attribute in attributes.split(";"): if (attribute == "" or "=" not in attribute): continue key, value = attribute.split("=", 1) feature[key] = value #Append contig identifier feature["contig"] = contig_id feature_list[contig_id].append(feature) current_line = gff_file_handle.readline() gff_file_handle.close() #Writing updated lines to gff_file_handle input_gff_file = input_gff_file.replace("gene", "edited_gene") gff_file_handle = gzip.open(input_gff_file, 'wb') if ('headers' in gff_object): gff_file_handle.write("\n".join(gff_object["headers"])) gff_file_handle.write("\n".join(gff_object["features"])) gff_file_handle.close() #New code inserted to better handle feature identifiers #Start by extracting and group them first features_identifiers_dict = dict() features_identifiers_list = list() features_identifiers_count = dict() features_parents_dict = dict() features_name_id_dict = dict() CDS_count = dict() for contig in sorted(feature_list): for feature in feature_list[contig]: #We're only considering gene, mRNA, and CDS for brevity's sake if (feature["type"] not in ("gene", "mRNA", "CDS")): continue #gene and mRNA always have name, CDS do not if ("Name" not in feature): feature["Name"] = None #Update parent following name/id switch if ("Parent" in feature and feature["Parent"] in features_name_id_dict): feature["Parent"] = features_name_id_dict[feature["Parent"]] #ID should be transferred to Name, but need to maintain parent if (feature["Name"] is not None): features_name_id_dict[feature["ID"]] = feature["Name"] feature["ID"] = feature["Name"] else: feature["ID"] = feature["Parent"] + "." + feature["type"] #if CDS have to increment if (feature["type"] == "CDS"): if (feature["ID"] not in CDS_count): CDS_count[feature["ID"]] = 1 else: CDS_count[feature["ID"]] += 1 feature["ID"] += "." 
+ str(CDS_count[feature["ID"]]) #Collect if (feature["type"] == "gene"): features_identifiers_dict[feature["ID"]] = dict() if (feature["type"] == "mRNA"): features_identifiers_dict[feature["Parent"]][ feature["ID"]] = dict() features_parents_dict[feature["ID"]] = feature["Parent"] if (feature["type"] == "CDS"): features_identifiers_dict[features_parents_dict[ feature["Parent"]]][feature["Parent"]][feature["ID"]] = 1 features_identifiers_list.append(feature) features_identifiers_count[ feature["ID"]] = len(features_identifiers_list) - 1 updated_features_identifiers_dict = dict() updated_features_list = list() updated_features_identifiers_count = dict() updated_features_parents_dict = dict() updated_CDS_count = dict() for gene in sorted(features_identifiers_dict): #retrieve original object gene_ftr = features_identifiers_list[features_identifiers_count[gene]] #store gene updated_features_identifiers_dict[gene_ftr["ID"]] = dict() updated_features_list.append(gene_ftr) updated_features_identifiers_count[ gene_ftr["ID"]] = len(updated_features_list) - 1 for mRNA in sorted(features_identifiers_dict[gene], key=lambda x: features_identifiers_count[x]): #retrieve feature mRNA_ftr = features_identifiers_list[ features_identifiers_count[mRNA]] if ("PAC" in mRNA[0:3]): if ("Name" in mRNA_ftr): mRNA_ftr["ID"] = mRNA_ftr["Name"] updated_features_identifiers_dict[gene_ftr["ID"]][ mRNA_ftr["ID"]] = dict() updated_features_parents_dict[mRNA_ftr["ID"]] = mRNA_ftr["Parent"] updated_features_list.append(mRNA_ftr) updated_features_identifiers_count[ mRNA_ftr["ID"]] = len(updated_features_list) - 1 for CDS in sorted(features_identifiers_dict[gene][mRNA], key=lambda x: features_identifiers_count[x]): #retrieve feature CDS_ftr = features_identifiers_list[ features_identifiers_count[CDS]] if ("PAC" in CDS[0:3]): CDS_ftr["ID"] = mRNA_ftr["ID"] + ".CDS" if (CDS_ftr["ID"] not in updated_CDS_count): updated_CDS_count[CDS_ftr["ID"]] = 1 else: updated_CDS_count[CDS_ftr["ID"]] += 1 CDS_ftr["ID"] += "." 
+ str( updated_CDS_count[CDS_ftr["ID"]]) CDS_ftr["Parent"] = mRNA_ftr["ID"] updated_features_identifiers_dict[gene_ftr["ID"]][ mRNA_ftr["ID"]][CDS_ftr["ID"]] = 1 updated_features_parents_dict[ CDS_ftr["ID"]] = CDS_ftr["Parent"] updated_features_list.append(CDS_ftr) updated_features_identifiers_count[ CDS_ftr["ID"]] = len(updated_features_list) - 1 genome_features_list = list() genome_mrnas_list = list() genome_cdss_list = list() for gene in sorted(updated_features_identifiers_dict): #retrieve updated object gene_ftr = updated_features_list[ updated_features_identifiers_count[gene]] gene_object = convert_ftr_object( gene_ftr, assembly["contigs"][gene_ftr["contig"]]["sequence"]) gene_object["type"] = "gene" #New terms, TODO, move to end of gene loop gene_object["cdss"] = list() gene_object["mrnas"] = list() #use function of longest CDS for gene longest_protein_length = 0 longest_protein_sequence = "" for mRNA in sorted( updated_features_identifiers_dict[gene], key=lambda x: updated_features_identifiers_count[x]): #retrieve updated object mRNA_ftr = updated_features_list[ updated_features_identifiers_count[mRNA]] feature_object = convert_ftr_object( mRNA_ftr, assembly["contigs"][mRNA_ftr["contig"]]["sequence"]) feature_object['parent_gene'] = gene_object['id'] mrna_object = copy.deepcopy(feature_object) cds_object = copy.deepcopy(feature_object) cds_object['id'] = mrna_object['id'] + ".CDS" mrna_object['cds'] = cds_object['id'] cds_object['parent_mrna'] = mrna_object['id'] del mrna_object["dna_sequence"] del mrna_object["dna_sequence_length"] cds_object["ontology_terms"] = dict() gene_object["mrnas"].append(mrna_object["id"]) gene_object["cdss"].append(cds_object["id"]) #CDS aggregation needs to be done in order to build protein sequence and list of locations CDS_list = sorted( updated_features_identifiers_dict[gene][mRNA], key=lambda x: updated_features_identifiers_count[x]) dna_sequence = "" locations = list() #collect phases, and lengths of exons #right now, this is only for the purpose of error reporting phases = list() exons = list() for CDS in (CDS_list): #retrieve updated partial CDS add_ftr = updated_features_list[ updated_features_identifiers_count[CDS]] phases.append(add_ftr["phase"]) add_ftr_obj = convert_ftr_object( add_ftr, assembly["contigs"][add_ftr["contig"]]["sequence"]) exons.append(len(add_ftr_obj["dna_sequence"])) #Remove base(s) according to phase, but only for first CDS if (CDS == CDS_list[0] and int(add_ftr["phase"]) != 0): logger.info("Adjusting phase for first CDS: " + CDS) add_ftr_obj["dna_sequence"] = add_ftr_obj["dna_sequence"][ int(add_ftr["phase"]):] dna_sequence += add_ftr_obj["dna_sequence"] locations.append(add_ftr_obj["location"][0]) #translate sequence dna_sequence_obj = Seq(dna_sequence, IUPAC.ambiguous_dna) rna_sequence = dna_sequence_obj.transcribe() #Incomplete gene model with no start codon #Translate as is if str(rna_sequence.upper())[:3] not in codon_table.start_codons: logger.info("Missing start codon for " + feature_object["id"] + " Assuming incomplete gene model.") #temp_seq = 'AUG'+str(rna_sequence.upper())[3:] #rna_sequence = Seq(temp_seq, IUPAC.ambiguous_dna) #You should never have this problem, needs to be reported rather than "fixed" codon_count = len(str(rna_sequence)) % 3 if codon_count != 0: logger.info( "Number of bases for RNA sequence for " + feature_object["id"] + " is not divisible by 3. The resulting protein may well be mis-translated." 
) #temp_seq = str(rna_sequence.upper())+"N" #if codon_count == 1: # temp_seq+="N" #new_codon_count=len(temp_seq) % 3 #rna_sequence = Seq(temp_seq, IUPAC.ambiguous_dna) protein_sequence = Seq("") try: protein_sequence = rna_sequence.translate() #cds=True) except CodonTable.TranslationError as te: logger.info("TranslationError for: " + feature_object["id"], phases, exons, " : " + str(te)) cds_object["protein_translation"] = str(protein_sequence).upper() cds_object["protein_translation_length"] = len( cds_object["protein_translation"]) cds_object["md5"] = hashlib.md5( cds_object["protein_translation"]).hexdigest() if (cds_object["protein_translation_length"] > longest_protein_length): longest_protein_length = cds_object[ "protein_translation_length"] longest_protein_sequence = cds_object["protein_translation"] del cds_object["dna_sequence"] del cds_object["dna_sequence_length"] if ("aliases" not in cds_object): cds_object["aliases"] = list() if ("function" not in cds_object): cds_object["function"] = "" #End of mRNA loop genome_mrnas_list.append(mrna_object) genome_cdss_list.append(cds_object) #End of gene loop gene_object["ontology_terms"] = dict() gene_object["protein_translation"] = longest_protein_sequence gene_object["protein_translation_length"] = longest_protein_length genome_features_list.append(gene_object) #remove sequences before loading for contig in assembly["contigs"]: del assembly["contigs"][contig]["sequence"] # assembly_string = simplejson.dumps(assembly, sort_keys=True, indent=4, ensure_ascii=False) # assembly_file = open("Bulk_Phytozome_Upload/"+assembly["name"]+'.json', 'w+') # assembly_file.write(assembly_string) # assembly_file.close() if (assembly_ref == None): #Upload FASTA to shock #Need to gunzip file first gunzipped_fasta_file = input_fasta_file # gunzipped_fasta_file=input_fasta_file[0:-3] # with gzip.open(input_fasta_file, 'rb') as f_in: # with open(gunzipped_fasta_file, 'wb') as f_out: # shutil.copyfileobj(f_in, f_out) token = os.environ.get('KB_AUTH_TOKEN') logger.info("Attempting Assembly save for %s" % (assembly["assembly_id"])) aUtil = AssemblyUtil(callback_url) assembly_ref = aUtil.save_assembly_from_fasta({ 'file': { 'path': gunzipped_fasta_file, 'assembly_name': assembly['assembly_id'] }, 'workspace_name': workspace_name, 'assembly_name': assembly['assembly_id'] }) logger.info("Assembly saved for %s" % (assembly["name"])) #Remove gunzipped file #os.remove(input_fasta_file[0:-3]) genome = dict() genome["id"] = core_genome_name genome["scientific_name"] = scientific_name genome["assembly_ref"] = assembly_ref genome["features"] = genome_features_list genome["cdss"] = genome_cdss_list genome["mrnas"] = genome_mrnas_list genome["source"] = source genome["domain"] = "Eukaryota" genome["genetic_code"] = 1 genome["gc_content"] = assembly["gc_content"] genome["dna_size"] = assembly["dna_size"] if taxon_reference is not None: genome["taxon_ref"] = taxon_reference genome["taxonomy"] = taxonomy UserMeta = dict() UserMeta['Taxonomy'] = taxonomy UserMeta['Source'] = source UserMeta['Domain'] = "Eukaryota" UserMeta['Source ID'] = core_genome_name UserMeta['Name'] = scientific_name UserMeta['Genetic code'] = 1 UserMeta['GC content'] = assembly["gc_content"] UserMeta['Size'] = assembly["dna_size"] UserMeta['Number contigs'] = assembly['num_contigs'] #id_source_version_array = core_genome_name.split("_") #version = "_".join(id_source_version_array[2:]) #UserMeta['Version']=version #UserMeta['url']=''; if (gff_handle_ref == None): token = os.environ.get('KB_AUTH_TOKEN') 
file_upload = dfUtil.file_to_shock({ 'file_path': input_gff_file, 'make_handle': 1, 'pack': "gzip" }) gff_handle_ref = file_upload['handle']['hid'] genome['gff_handle_ref'] = gff_handle_ref # genome_string = simplejson.dumps(genome, sort_keys=True, indent=4, ensure_ascii=False) # genome_file = open("Bulk_Phytozome_Upload/"+core_genome_name+'.json', 'w+') # genome_file.write(genome_string) # genome_file.close() logger.info("Attempting Genome save for %s" % (core_genome_name)) workspace_id = dfUtil.ws_name_to_id(workspace_name) genome_info = dfUtil.save_objects({ "id": workspace_id, "objects": [{ "name": core_genome_name, "type": "KBaseGenomes.Genome", "data": genome }] })[0] logger.info("Genome saved for %s" % (core_genome_name)) return {'genome_info': genome_info, 'report_string': ""}
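# --- Illustrative sketch (not from the original code) ---
# Throughout these modules, a workspace object reference is built from the object
# info tuple returned by DataFileUtil.save_objects() as
# "<workspace id>/<object id>/<version>", i.e. info[6], info[0] and info[4].
# A hedged helper showing that convention; the sample info list below is made up.
def obj_info_to_ref(info):
    """Build a 'ws_id/obj_id/version' reference from a workspace object info list."""
    return "%s/%s/%s" % (info[6], info[0], info[4])

# sample_info = [12, 'MyGenome', 'KBaseGenomes.Genome-17.0', '2018-01-01T00:00:00', 3,
#                'someuser', 4567, 'my_workspace', 'somemd5', 1000, {}]
# obj_info_to_ref(sample_info) -> '4567/12/3'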