def upload_interleaved_reads(callback_url, reads_file, ws_name, reads_obj_name, source_reads_upa):
    """
    callback_url = as usual.
    reads_file = full path to the reads file to upload
    ws_name = the workspace to use for uploading the reads file
    reads_obj_name = the name of the new reads object to save as
    source_reads_upa = if not None, the source UPA for the original reads file.
    """
    # unfortunately, the ReadsUtils only accepts uncompressed fq files - this should
    # be fixed on the KBase side
    dfu = DataFileUtil(callback_url)
    reads_unpacked = dfu.unpack_file({'file_path': reads_file})['file_path']

    ru = ReadsUtils(callback_url)
    new_reads_upa = ru.upload_reads({
        'fwd_file': reads_unpacked,
        'interleaved': 1,
        'wsname': ws_name,
        'name': reads_obj_name,
        'source_reads_ref': source_reads_upa
    })['obj_ref']
    print('saved ' + str(reads_unpacked) + ' to ' + str(new_reads_upa))
    return new_reads_upa
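# Illustrative usage sketch only: the callback URL, file path, workspace name, and
# source UPA below are placeholders, not values taken from this module.
def _example_upload_interleaved_reads():
    callback_url = 'http://localhost:9999'              # SDK callback service (placeholder)
    reads_file = '/kb/module/work/tmp/reads.fastq.gz'   # interleaved FASTQ; may be gzipped
    new_upa = upload_interleaved_reads(callback_url, reads_file,
                                       'my_workspace', 'my_new_reads',
                                       '12345/6/7')     # source reads UPA, or None
    print('new reads object: ' + new_upa)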
class FastaToAssembly:
    def __init__(self, callback_url, scratch):
        self.scratch = scratch
        self.dfu = DataFileUtil(callback_url)
        # Note added X due to kb|g.1886.fasta
        self.valid_chars = "-ACGTUWSMKRYBDHVNX"
        self.amino_acid_specific_characters = "PLIFQE"

    def import_fasta(self, ctx, params):
        print('validating parameters')
        self.validate_params(params)

        print('staging input files')
        fasta_file_path = self.stage_input(params)

        if 'min_contig_length' in params:
            min_contig_length = int(params['min_contig_length'])
            print('filtering fasta file by contig length (min len=' + str(min_contig_length) + 'bp)')
            fasta_file_path = self.filter_contigs_by_length(fasta_file_path, min_contig_length)

        print('parsing FASTA file: ' + str(fasta_file_path))
        assembly_data = self.parse_fasta(fasta_file_path, params)
        print(' - parsed ' + str(assembly_data['num_contigs']) + ' contigs, ' +
              str(assembly_data['dna_size']) + 'bp')

        print('saving assembly to KBase')

        # save file to shock and build handle
        fasta_file_handle_info = self.save_fasta_file_to_shock(fasta_file_path)
        # construct the output object
        assembly_object_to_save = self.build_assembly_object(assembly_data, fasta_file_handle_info, params)

        # save to WS and return
        if 'workspace_id' in params:
            workspace_id = int(params['workspace_id'])
        else:
            workspace_id = self.dfu.ws_name_to_id(params['workspace_name'])
        assembly_info = self.save_assembly_object(workspace_id, params['assembly_name'], assembly_object_to_save)
        return assembly_info

    def build_assembly_object(self, assembly_data, fasta_file_handle_info, params):
        ''' construct the WS object data to save based on the parsed info and params '''
        assembly_data['assembly_id'] = params['assembly_name']
        assembly_data['fasta_handle_ref'] = fasta_file_handle_info['handle']['hid']
        assembly_data['fasta_handle_info'] = fasta_file_handle_info

        assembly_data['type'] = 'Unknown'
        if 'type' in params:
            assembly_data['type'] = params['type']

        if 'taxon_ref' in params:
            assembly_data['taxon_ref'] = params['taxon_ref']

        if 'external_source' in params:
            assembly_data['external_source'] = params['external_source']

        if 'external_source_id' in params:
            assembly_data['external_source_id'] = params['external_source_id']

        if 'external_source_origination_date' in params:
            assembly_data['external_source_origination_date'] = params['external_source_origination_date']

        return assembly_data

    def parse_fasta(self, fasta_file_path, params):
        ''' Do the actual work of inspecting each contig '''

        # variables to store running counts of things
        total_length = 0
        base_counts = {'A': 0, 'G': 0, 'C': 0, 'T': 0}
        md5_list = []

        # map from contig_id to contig_info
        all_contig_data = {}
        extra_contig_info = {}
        if 'contig_info' in params:
            extra_contig_info = params['contig_info']

        for record in SeqIO.parse(fasta_file_path, "fasta"):
            # SeqRecord(seq=Seq('TTAT...', SingleLetterAlphabet()),
            #           id='gi|113968346|ref|NC_008321.1|',
            #           name='gi|113968346|ref|NC_008321.1|',
            #           description='gi|113968346|ref|NC_008321.1| Shewanella sp. MR-4 chromosome, complete genome',
            #           dbxrefs=[])
            sequence = str(record.seq).upper()

            contig_info = {
                'contig_id': record.id,
                'name': record.id,
                'description': record.description[len(record.id):].strip(),
                'length': len(record.seq)
            }

            # 1) compute sequence character statistics running total
            total_length += contig_info['length']
            sequence_count_table = dict(Counter(sequence))
            for character in sequence_count_table:
                if character in base_counts:
                    base_counts[character] = base_counts[character] + sequence_count_table[character]
                else:
                    base_counts[character] = sequence_count_table[character]
                if character not in self.valid_chars:
                    if character in self.amino_acid_specific_characters:
                        raise ValueError('This fasta file may have amino acids in it instead ' +
                                         'of the required nucleotides.')
                    raise ValueError("This FASTA file has non-nucleic acid characters: {0}".format(character))

            # 2) record number of 'N' characters (only set if there are some)
            Ncount = 0
            if 'N' in sequence_count_table:
                Ncount = sequence_count_table['N']
            contig_info['Ncount'] = Ncount

            # 2b) record if the contig is circular
            if record.id in extra_contig_info:
                if 'is_circ' in extra_contig_info[record.id]:
                    contig_info['is_circ'] = int(extra_contig_info[record.id]['is_circ'])
                if 'description' in extra_contig_info[record.id]:
                    contig_info['description'] = str(extra_contig_info[record.id]['description'])

            # 3) record md5 checksum
            contig_md5 = md5(sequence).hexdigest()
            contig_info['md5'] = contig_md5
            md5_list.append(contig_md5)

            # 4) record the all-important GC to ~3 significant digits
            GC_count = 0
            for base in ['G', 'C']:
                if base in sequence_count_table:
                    GC_count += sequence_count_table[base]
            contig_info['gc_content'] = round(float(GC_count) / float(contig_info['length']), 5)

            # 5) add to contig list
            if contig_info['contig_id'] in all_contig_data:
                raise ValueError('The fasta header key ' + contig_info['contig_id'] +
                                 ' appears more than once in the file')
            all_contig_data[contig_info['contig_id']] = contig_info

        # Aggregate stats for the data
        total_gc_content = None
        if total_length > 0:
            total_gc_content = round(float(base_counts['G'] + base_counts['C']) / float(total_length), 5)
        assembly_data = {
            'md5': md5(",".join(sorted(md5_list))).hexdigest(),
            'base_counts': base_counts,
            'dna_size': total_length,
            'gc_content': total_gc_content,
            'contigs': all_contig_data,
            'num_contigs': len(all_contig_data)
        }
        return assembly_data

    def fasta_filter_contigs_generator(self, fasta_record_iter, min_contig_length):
        ''' generator that yields only the SeqRecords at least min_contig_length long '''
        rows = 0
        rows_added = 0
        for record in fasta_record_iter:
            rows += 1
            if len(record.seq) >= min_contig_length:
                rows_added += 1
                yield record
        print(' - filtered out ' + str(rows - rows_added) + ' of ' + str(rows) +
              ' contigs that were shorter than ' + str(min_contig_length) + 'bp.')

    def filter_contigs_by_length(self, fasta_file_path, min_contig_length):
        ''' removes all contigs less than the min_contig_length provided '''
        filtered_fasta_file_path = fasta_file_path + '.filtered.fa'
        fasta_record_iter = SeqIO.parse(fasta_file_path, 'fasta')
        SeqIO.write(self.fasta_filter_contigs_generator(fasta_record_iter, min_contig_length),
                    filtered_fasta_file_path, 'fasta')
        return filtered_fasta_file_path

    def save_assembly_object(self, workspace_id, assembly_name, obj_data):
        print('Saving Assembly to Workspace')
        sys.stdout.flush()
        obj_info = self.dfu.save_objects({'id': workspace_id,
                                          'objects': [{'type': 'KBaseGenomeAnnotations.Assembly',
                                                       'data': obj_data,
                                                       'name': assembly_name
                                                       }]
                                          })[0]
        return obj_info

    def save_fasta_file_to_shock(self, fasta_file_path):
        '''
        Given the path to the file, upload to shock and return Handle information
        returns:
            typedef structure {
                string shock_id;
                Handle handle;
                string node_file_name;
                string size;
            } FileToShockOutput;
        '''
        print('Uploading fasta file (' + str(fasta_file_path) + ') to SHOCK')
        sys.stdout.flush()
        return self.dfu.file_to_shock({'file_path': fasta_file_path, 'make_handle': 1})

    def stage_input(self, params):
        ''' Setup the input_directory by fetching the files and returning the path to the file '''
        file_path = None
        if 'file' in params:
            file_path = os.path.abspath(params['file']['path'])
        elif 'shock_id' in params:
            print('Downloading file from SHOCK node: ' + str(params['shock_id']))
            sys.stdout.flush()
            input_directory = os.path.join(self.scratch, 'assembly-upload-staging-' + str(uuid.uuid4()))
            os.makedirs(input_directory)
            file_name = self.dfu.shock_to_file({'file_path': input_directory,
                                                'shock_id': params['shock_id']
                                                })['node_file_name']
            file_path = os.path.join(input_directory, file_name)
        elif 'ftp_url' in params:
            print('Downloading file from: ' + str(params['ftp_url']))
            sys.stdout.flush()
            file_path = self.dfu.download_web_file({'file_url': params['ftp_url'],
                                                    'download_type': 'FTP'
                                                    })['copy_file_path']

        # extract the file if it is compressed
        if file_path is not None:
            unpacked_file = self.dfu.unpack_file({'file_path': file_path})
            return unpacked_file['file_path']

        raise ValueError('No valid fasta could be extracted based on the input parameters')

    def validate_params(self, params):
        for key in ('workspace_name', 'assembly_name'):
            if key not in params:
                raise ValueError('required "' + key + '" field was not defined')

        # one and only one of 'file', 'shock_id', or 'ftp_url' is required
        input_count = 0
        for key in ('file', 'shock_id', 'ftp_url'):
            if key in params and params[key] is not None:
                input_count = input_count + 1
                if key == 'file':
                    if not isinstance(params[key], dict) or 'path' not in params[key]:
                        raise ValueError('when specifying a fasta file input, "path" field was not defined in "file"')

        if input_count == 0:
            raise ValueError('required fasta file as input, set as either "file", "shock_id", or "ftp_url"')

        if input_count > 1:
            raise ValueError('required exactly one fasta file as input source, you set more than one of ' +
                             'these fields: "file", "shock_id", or "ftp_url"')
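# Illustrative usage sketch only: the callback URL, scratch path, workspace, and file
# names below are placeholders.  Per validate_params(), exactly one of 'file',
# 'shock_id', or 'ftp_url' may be supplied.
def _example_import_fasta():
    importer = FastaToAssembly(callback_url='http://localhost:9999',
                               scratch='/kb/module/work/tmp')
    params = {
        'workspace_name': 'my_workspace',                     # required
        'assembly_name': 'my_assembly',                       # required
        'file': {'path': '/kb/module/work/tmp/contigs.fa'},   # local file input
        'min_contig_length': 500,                             # optional length filter
    }
    # ctx is not used by import_fasta(), so None is acceptable here
    return importer.import_fasta(None, params)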
    def stage_input(self, params):
        ''' Setup the input_directory by fetching the files and uncompressing if needed. '''

        # construct the input directory where we stage files
        input_directory = os.path.join(self.cfg.sharedFolder, 'genome-upload-staging-' + str(uuid.uuid4()))
        os.makedirs(input_directory)

        # at this point, the 'file' input is validated, so we don't have to catch any special cases
        # we expect one and only one of path, shock_id, or ftp_url

        # determine how to get the file: if it is from shock, download it. If it
        # is just sitting there, then use it. Move the file to the staging input directory
        file = params['file']
        genbank_file_path = None

        if 'path' in file and file['path'] is not None:
            # copy the local file to the input staging directory
            # (NOTE: could just move it, but then this method would have the side effect of moving your
            # file which another SDK module might have an open handle on)
            local_file_path = file['path']
            genbank_file_path = os.path.join(input_directory, os.path.basename(local_file_path))
            shutil.copy2(local_file_path, genbank_file_path)

        if 'shock_id' in file and file['shock_id'] is not None:
            # handle shock file
            print('Downloading file from SHOCK node: ' + str(self.cfg.shockURL) + ' - ' + str(file['shock_id']))
            sys.stdout.flush()
            dfUtil = DataFileUtil(self.cfg.callbackURL)
            file_name = dfUtil.shock_to_file({'file_path': input_directory,
                                              'shock_id': file['shock_id']
                                              })['node_file_name']
            genbank_file_path = os.path.join(input_directory, file_name)

        if 'ftp_url' in file and file['ftp_url'] is not None:
            # Note that the Transform originally had a script_utils.download_from_urls method
            # that, if the url is a folder, pulls all subfiles. That code recently broke when
            # fetching from NCBI (not clear if it is our issue or NCBI), but for now just
            # support the most common case - an FTP to a single file.
            print('Downloading file from: ' + str(file['ftp_url']))
            sys.stdout.flush()

            url = urlparse(file['ftp_url'])
            if url.scheme != 'ftp' and url.scheme != 'http':
                raise ValueError('Only FTP/HTTP servers are supported')
            file_name = 'genome.gbk'
            if url.path != '':
                file_name = url.path.split('/')[-1]

            req = urllib2.Request(file['ftp_url'])
            response = urllib2.urlopen(req)
            file_data = response.read()

            genbank_file_path = os.path.join(input_directory, file_name)
            with open(genbank_file_path, "w") as genbank_file:
                genbank_file.write(file_data)

        # extract the file if it is compressed
        if genbank_file_path is not None:
            print("staged input file =" + genbank_file_path)
            sys.stdout.flush()
            dfUtil = DataFileUtil(self.cfg.callbackURL)
            dfUtil.unpack_file({'file_path': genbank_file_path})
        else:
            raise ValueError('No valid files could be extracted based on the input')

        return input_directory
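# Illustrative only: the three mutually exclusive shapes of the 'file' parameter
# accepted by the GenBank stage_input() above.  All paths, node IDs, and URLs are
# placeholders; 'importer' stands for an instance of the surrounding uploader class,
# whose name is not shown in this excerpt.
def _example_stage_genbank_input(importer):
    params_local = {'file': {'path': '/kb/module/work/tmp/my_genome.gbk'}}
    params_shock = {'file': {'shock_id': 'f4c05a2e-0000-0000-0000-placeholder'}}
    params_ftp = {'file': {'ftp_url': 'ftp://ftp.example.org/genomes/my_genome.gbk.gz'}}
    # stage_input() copies or downloads the file into a fresh staging directory,
    # unpacks it if compressed, and returns the staging directory path.
    return importer.stage_input(params_local)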
def KButil_Build_InSilico_Metagenomes_with_Grinder(self, ctx, params): """ :param params: instance of type "KButil_Build_InSilico_Metagenomes_with_Grinder_Params" (KButil_Build_InSilico_Metagenomes_with_Grinder() ** ** Use Grinder to generate in silico shotgun metagenomes) -> structure: parameter "workspace_name" of type "workspace_name" (** The workspace object refs are of form: ** ** objects = ws.get_objects([{'ref': params['workspace_id']+'/'+params['obj_name']}]) ** ** "ref" means the entire name combining the workspace id and the object name ** "id" is a numerical identifier of the workspace or object, and should just be used for workspace ** "name" is a string identifier of a workspace or object. This is received from Narrative.), parameter "input_refs" of type "data_obj_ref", parameter "output_name" of type "data_obj_name", parameter "desc" of String, parameter "num_reads_per_lib" of Long, parameter "population_percs" of String, parameter "read_len_mean" of Long, parameter "read_len_stddev" of Double, parameter "pairs_flag" of Long, parameter "mate_orientation" of String, parameter "insert_len_mean" of Long, parameter "insert_len_stddev" of Double, parameter "mutation_dist" of String, parameter "mutation_ratio" of String, parameter "qual_good" of Long, parameter "qual_bad" of Long, parameter "len_bias_flag" of Long, parameter "random_seed" of Long :returns: instance of type "KButil_Build_InSilico_Metagenomes_with_Grinder_Output" -> structure: parameter "report_name" of type "data_obj_name", parameter "report_ref" of type "data_obj_ref" """ # ctx is the context object # return variables are: returnVal #BEGIN KButil_Build_InSilico_Metagenomes_with_Grinder #### STEP 0: basic init ## console = [] invalid_msgs = [] report_text = '' self.log(console, 'Running KButil_Build_InSilico_Metagenomes_with_Grinder(): ') self.log(console, "\n" + pformat(params)) # Auth token = ctx['token'] headers = {'Authorization': 'OAuth ' + token} env = os.environ.copy() env['KB_AUTH_TOKEN'] = token # API Clients #SERVICE_VER = 'dev' # DEBUG SERVICE_VER = 'release' wsClient = workspaceService(self.workspaceURL, token=token) readsUtils_Client = ReadsUtils(url=self.callbackURL, token=ctx['token']) # SDK local #setAPI_Client = SetAPI (url=self.callbackURL, token=ctx['token']) # for SDK local. 
local doesn't work for SetAPI setAPI_Client = SetAPI(url=self.serviceWizardURL, token=ctx['token']) # for dynamic service auClient = AssemblyUtil(self.callbackURL, token=ctx['token'], service_ver=SERVICE_VER) dfu = DFUClient(self.callbackURL) # param checks required_params = [ 'workspace_name', 'input_refs', 'output_name', 'num_reads_per_lib', 'population_percs', 'read_len_mean', 'read_len_stddev', 'pairs_flag', 'mate_orientation', 'insert_len_mean', 'insert_len_stddev', 'mutation_dist', 'mutation_ratio', 'qual_good', 'qual_bad', 'len_bias_flag', 'random_seed' ] for arg in required_params: if arg not in params or params[arg] == None or params[arg] == '': raise ValueError("Must define required param: '" + arg + "'") # cast to str unpredictable numerical params (mostly used in string context) numerical_params = [ 'num_reads_per_lib', 'read_len_mean', 'read_len_stddev', 'pairs_flag', 'insert_len_mean', 'insert_len_stddev', 'qual_good', 'qual_bad', 'len_bias_flag', 'random_seed' ] for arg in numerical_params: if arg not in params or params[arg] == None or params[arg] == '': continue params[arg] = str(params[arg]) # load provenance provenance = [{}] if 'provenance' in ctx: provenance = ctx['provenance'] provenance[0]['input_ws_objects'] = [] for input_ref in params['input_refs']: provenance[0]['input_ws_objects'].append(input_ref) # set the output paths timestamp = int( (datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds() * 1000) output_dir = os.path.join(self.scratch, 'output.' + str(timestamp)) if not os.path.exists(output_dir): os.makedirs(output_dir) html_output_dir = os.path.join(output_dir, 'html') if not os.path.exists(html_output_dir): os.makedirs(html_output_dir) #### STEP 1: Parse population_percs and write to file ## abundance_str = params['population_percs'].strip() abundance_file_path = os.path.join(output_dir, 'my_abundances.txt') abundance_config_num_libs = 0 abundance_config_num_libs_set = False grinder_genome_ids = [] header = [] out_buf = [] for row in abundance_str.split("\n"): cols = re.split(r'\s+', row) if cols[0].upper() == "GENOME": for col in cols: if col == '': continue header.append(col) continue grinder_genome_ids.append(cols[0]) self.log(console, "GRINDER GENOME ID: '" + cols[0] + "'") # DEBUG out_row = [] for col in cols: if col == '': continue elif col == '%': continue elif col.endswith('%'): col = col.rstrip('%') out_row.append(col) out_buf.append("\t".join(out_row)) num_samples = len(out_row) - 1 # first col is genome id if not abundance_config_num_libs_set: abundance_config_num_libs_set = True abundance_config_num_libs = num_samples elif num_samples != abundance_config_num_libs: invalid_msgs.append( "inconsistent number of samples in population_percs input field" ) # data validation if abundance_config_num_libs == 0: invalid_msgs.append( "unable to find sample percentages in population_percs input field" ) sample_sums = [] for row_i, abund_row_str in enumerate(out_buf): abund_row = abund_row_str.split() for sample_i, abund in enumerate(abund_row[1:]): if row_i == 0: sample_sums.append(0) #self.log (console, "row_i: "+str(row_i)+" sample_i: "+str(sample_i)) # DEBUG sample_sums[sample_i] += float(abund) for sample_i, sample_sum in enumerate(sample_sums): if sample_sum < 99.5 or sample_sum > 100.5: self.log( invalid_msgs, "Sample: " + str(sample_i + 1) + " " + header[sample_i + 1] + " proportions is not summing to 100.0. 
Summing to: " + str(sample_sum)) if len(invalid_msgs) == 0: with open(abundance_file_path, 'w') as abundance_fh: for out_line in out_buf: abundance_fh.write(out_line + "\n") # DEBUG with open(abundance_file_path, 'r') as abundance_fh: for out_line in abundance_fh.readlines(): out_line = out_line.rstrip() self.log(console, "ABUNDANCE_CONFIG: '" + out_line + "'") #### STEP 2: get genome scaffold sequences ## if len(invalid_msgs) == 0: genomes_src_db_file_path = os.path.join(output_dir, 'genomes.fna') read_buf_size = 65536 write_buf_size = 65536 accepted_input_types = ["KBaseGenomes.Genome"] genome_refs = params['input_refs'] genome_obj_names = [] genome_sci_names = [] assembly_refs = [] for i, input_ref in enumerate(genome_refs): # genome obj info try: [ OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I, WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I ] = range(11) # object_info tuple input_obj_info = wsClient.get_object_info_new( {'objects': [{ 'ref': input_ref }]})[0] input_obj_type = re.sub( '-[0-9]+\.[0-9]+$', "", input_obj_info[TYPE_I]) # remove trailing version genome_obj_names.append(input_obj_info[NAME_I]) except Exception as e: raise ValueError('Unable to get object from workspace: (' + input_ref + ')' + str(e)) if input_obj_type not in accepted_input_types: raise ValueError("Input object of type '" + input_obj_type + "' not accepted. Must be one of " + ", ".join(accepted_input_types)) # genome obj data try: genome_obj = wsClient.get_objects([{ 'ref': input_ref }])[0]['data'] genome_sci_names.append(genome_obj['scientific_name']) except: raise ValueError("unable to fetch genome: " + input_ref) # Get assembly_refs if ('contigset_ref' not in genome_obj or genome_obj['contigset_ref'] == None) \ and ('assembly_ref' not in genome_obj or genome_obj['assembly_ref'] == None): msg = "Genome " + genome_obj_names[ i] + " (ref:" + input_ref + ") " + genome_sci_names[ i] + " MISSING BOTH contigset_ref AND assembly_ref. Cannot process. Exiting." 
self.log(console, msg) self.log(invalid_msgs, msg) continue elif 'assembly_ref' in genome_obj and genome_obj[ 'assembly_ref'] != None: msg = "Genome " + genome_obj_names[ i] + " (ref:" + input_ref + ") " + genome_sci_names[ i] + " USING assembly_ref: " + str( genome_obj['assembly_ref']) self.log(console, msg) assembly_refs.append(genome_obj['assembly_ref']) elif 'contigset_ref' in genome_obj and genome_obj[ 'contigset_ref'] != None: msg = "Genome " + genome_obj_names[ i] + " (ref:" + input_ref + ") " + genome_sci_names[ i] + " USING contigset_ref: " + str( genome_obj['contigset_ref']) self.log(console, msg) assembly_refs.append(genome_obj['contigset_ref']) # get fastas for scaffolds if len(invalid_msgs) == 0: contig_file_paths = [] for genome_i, input_ref in enumerate(genome_refs): contig_file = auClient.get_assembly_as_fasta({ 'ref': assembly_refs[genome_i] }).get('path') sys.stdout.flush() contig_file_path = dfu.unpack_file({'file_path': contig_file})['file_path'] contig_file_paths.append(contig_file_path) # reformat FASTA IDs for Grinder with open(genomes_src_db_file_path, 'w', write_buf_size) as genomes_src_db_fh: for genome_i, contig_file_path in enumerate(contig_file_paths): #self.log(console,str(genome_i)+" CONTIG_FILE: "+contig_file_path) # DEBUG #contig_ids = [] with open(contig_file_path, 'r', read_buf_size) as contig_fh: genome_seq = '' contig_seq = '' contig_seqs = [] for contig_line in contig_fh.readlines(): contig_line = contig_line.rstrip() if contig_line.startswith('>'): #contig_id = contig_line.strip()[1:].split(' ')[0] #contig_ids.append(contig_id) #genomes_src_db_fh.write(">"+grinder_genome_ids[genome_i]+"\n") if contig_seq != '': contig_seqs.append(contig_seq) contig_seq = '' continue else: #genomes_src_db_fh.write(contig_line) contig_seq += contig_line if contig_seq != '': contig_seqs.append(contig_seq) contig_seq = '' # write joined contigs to file genome_seq = "NNNNNNNNNN".join( contig_seqs ) # NOTE: Using "-exclude_chars" grinder opt on N to avoid contig joins genome_seq = genome_seq.upper( ) # grinder might require upper case? genomes_src_db_fh.write(">" + grinder_genome_ids[genome_i] + "\n") genomes_src_db_fh.write(genome_seq + "\n") genome_seq = '' contig_seqs = [] # DEBUG #for contig_id in contig_ids: # self.log(console, "\tCONTIG_ID: "+contig_id) # DEBUG # DEBUG toggle = 0 with open(genomes_src_db_file_path, 'r', write_buf_size) as genomes_src_db_fh: for contig_line in genomes_src_db_fh.readlines(): contig_line = contig_line.rstrip() if contig_line.startswith('>'): self.log(console, 'GENOMES_SRC_DB: ' + contig_line) genome_id = contig_line[1:] toggle = 0 elif toggle == 0: #elif genome_id == 'G3': self.log( console, 'GENOMES_SRC_DB: ' + contig_line[0:50] + '...') toggle += 1 #### STEP 3: Run Grinder ## if len(invalid_msgs) == 0: cmd = [] cmd.append(self.GRINDER) # output cmd.append('-base_name') cmd.append(params['output_name']) cmd.append('-output_dir') cmd.append(output_dir) # contigs input cmd.append('-reference_file') cmd.append(genomes_src_db_file_path) # abundances cmd.append('-abundance_file') cmd.append(abundance_file_path) # library size cmd.append('-total_reads') cmd.append(str(params['num_reads_per_lib'])) # num libraries (overridden by abundance file?) 
cmd.append('-num_libraries') cmd.append(str(abundance_config_num_libs)) # read and insert lens cmd.append('-read_dist') cmd.append(str(params['read_len_mean'])) cmd.append('normal') cmd.append(str(params['read_len_stddev'])) if str(params['pairs_flag']) == '1': cmd.append('-insert_dist') cmd.append(str(params['insert_len_mean'])) cmd.append('normal') cmd.append(str(params['insert_len_stddev'])) # mate orientation cmd.append('-mate_orientation') cmd.append(params['mate_orientation']) # genome len bias cmd.append('-length_bias') cmd.append(str(params['len_bias_flag'])) # mutation model cmd.append('-mutation_dist') cmd.append(str(params['mutation_dist'])) cmd.append('-mutation_ratio') cmd.append(str(params['mutation_ratio'])) # qual scores cmd.append('-fastq_output') cmd.append('1') cmd.append('-qual_levels') cmd.append(str(params['qual_good'])) cmd.append(str(params['qual_bad'])) # skip contig joins cmd.append('-exclude_chars') cmd.append('NX') # explicitly request bidirectional cmd.append('-unidirectional') cmd.append('0') # random seed if 'random_seed' in params and params[ 'random_seed'] != None and params['random_seed'] != '': cmd.append('-random_seed') cmd.append(str(params['random_seed'])) # RUN cmd_str = " ".join(cmd) self.log(console, "===========================================") self.log(console, "RUNNING: " + cmd_str) self.log(console, "===========================================") cmdProcess = subprocess.Popen(cmd_str, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True) outputlines = [] while True: line = cmdProcess.stdout.readline() outputlines.append(line) if not line: break self.log(console, line.replace('\n', '')) cmdProcess.stdout.close() cmdProcess.wait() self.log(console, 'return code: ' + str(cmdProcess.returncode) + '\n') if cmdProcess.returncode != 0: raise ValueError('Error running kb_grinder, return code: ' + str(cmdProcess.returncode) + '\n') #report_text += "\n".join(outputlines) #report_text += "cmdstring: " + cmdstring + " stdout: " + stdout + " stderr " + stderr # capture output for report and paths to out files report_text_buf = [] struct_file_paths = [] struct_file_names = [] fastq_file_paths = [] for out_line in outputlines: out_line = out_line.rstrip() if 'Community structure' in out_line: clean_line = out_line.lstrip() struct_file_path = re.split(r'\s+', clean_line)[3] struct_file_paths.append(struct_file_path) struct_file_names.append(struct_file_path.split('/')[-1]) self.log(console, "STRUCT_FILE_NAME: '" + struct_file_path.split('/')[-1]) # DEBUG elif 'FASTQ file' in out_line: clean_line = out_line.lstrip() fastq_file_paths.append(re.split(r'\s+', clean_line)[3]) else: report_text_buf.append(out_line) report_text += "\n".join(report_text_buf) #### STEP 4: Upload Read Libs and create reads set ## if len(invalid_msgs) == 0: lib_obj_refs = [] lib_obj_names = [] readsSet_items = [] for sample_i, fastq_file_path in enumerate(fastq_file_paths): if not os.path.isfile (fastq_file_path) \ or os.path.getsize (fastq_file_path) == 0: raise ValueError("empty read lib generated: " + fastq_file_path) else: # lib obj name if len(fastq_file_paths) == 0: output_obj_name = params['output_name'] else: if str(params['pairs_flag']) == '1': output_obj_name = params[ 'output_name'] + '-sample' + str( sample_i + 1) + ".PairedEndLib" else: output_obj_name = params[ 'output_name'] + '-sample' + str( sample_i + 1) + ".SingleEndLib" lib_obj_names.append(output_obj_name) # upload lib and get obj ref self.log( console, 'Uploading trimmed paired reads: ' + output_obj_name) 
sequencing_tech = 'artificial reads' if str(params['pairs_flag']) == '1': interleaved = 1 else: interleaved = 0 lib_obj_ref = readsUtils_Client.upload_reads({ 'wsname': str(params['workspace_name']), 'name': output_obj_name, 'fwd_file': fastq_file_path, 'interleaved': interleaved, 'sequencing_tech': sequencing_tech })['obj_ref'] lib_obj_refs.append(lib_obj_ref) os.remove(fastq_file_path) # free up disk # add to readsSet readsSet_items.append({ 'ref': lib_obj_ref, 'label': output_obj_name }) # create readsset readsSet_obj_ref = None if len(lib_obj_refs) > 1: readsSet_obj = { 'description': "Grinder Metagenome from " + " ".join(genome_obj_names), 'items': readsSet_items } readsSet_obj_name = params['output_name'] readsSet_obj_ref = setAPI_Client.save_reads_set_v1({ 'workspace_name': params['workspace_name'], 'output_object_name': readsSet_obj_name, 'data': readsSet_obj })['set_ref'] #### STEP 5: Build report ## reportName = 'kb_grinder_report_' + str(uuid.uuid4()) reportObj = { 'objects_created': [], #'text_message': '', # or is it 'message'? 'message': '', # or is it 'text_message'? 'direct_html': '', #'direct_html_link_index': 0, 'file_links': [], 'html_links': [], 'workspace_name': params['workspace_name'], 'report_object_name': reportName } # message if len(invalid_msgs) > 0: report_text = "\n".join(invalid_msgs) reportObj['message'] = report_text if len(invalid_msgs) == 0: # objs if readsSet_obj_ref != None: reportObj['objects_created'].append({ 'ref': readsSet_obj_ref, 'desc': params['output_name'] + " ReadsSet" }) for lib_obj_i, lib_obj_ref in enumerate(lib_obj_refs): reportObj['objects_created'].append({ 'ref': lib_obj_ref, 'desc': lib_obj_names[lib_obj_i] }) # downloadable data for data_i, data_path in enumerate(struct_file_paths): try: upload_ret = dfu.file_to_shock({ 'file_path': data_path, #'pack': 'zip'}) 'make_handle': 0 }) except: raise ValueError('error uploading ' + data_path + ' file to shock') reportObj['file_links'].append({ 'shock_id': upload_ret['shock_id'], 'name': struct_file_names[data_i], 'label': struct_file_names[data_i] }) # html report """ try: html_upload_ret = dfu.file_to_shock({'file_path': html_output_dir, 'make_handle': 0, 'pack': 'zip'}) except: raise ValueError ('error uploading html report to shock') reportObj['direct_html_link_index'] = 0 reportObj['html_links'] = [{'shock_id': html_upload_ret['shock_id'], 'name': html_file, 'label': params['output_name']+' HTML' } ] """ # save report object # SERVICE_VER = 'release' reportClient = KBaseReport(self.callbackURL, token=ctx['token'], service_ver=SERVICE_VER) #report_info = report.create({'report':reportObj, 'workspace_name':params['workspace_name']}) report_info = reportClient.create_extended_report(reportObj) returnVal = { 'report_name': report_info['name'], 'report_ref': report_info['ref'] } #END KButil_Build_InSilico_Metagenomes_with_Grinder # At some point might do deeper type checking... if not isinstance(returnVal, dict): raise ValueError( 'Method KButil_Build_InSilico_Metagenomes_with_Grinder return value ' + 'returnVal is not type dict as required.') # return the results return [returnVal]
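# Illustrative parameter sketch for KButil_Build_InSilico_Metagenomes_with_Grinder();
# all refs, names, and Grinder option values are placeholders.  population_percs is
# the whitespace-delimited table parsed in STEP 1 above: a header row starting with
# GENOME plus one column per output library, then one row per genome whose first
# column is used as the Grinder reference ID; each sample column must sum to ~100(%).
def _example_grinder_params():
    return {
        'workspace_name': 'my_workspace',
        'input_refs': ['12345/1/1', '12345/2/1', '12345/3/1'],   # KBaseGenomes.Genome refs
        'output_name': 'my_insilico_metagenome',
        'num_reads_per_lib': 100000,
        'population_percs': ("GENOME\tSample1\tSample2\n"
                             "G1\t50%\t20%\n"
                             "G2\t30%\t30%\n"
                             "G3\t20%\t50%\n"),
        'read_len_mean': 150, 'read_len_stddev': 10,
        'pairs_flag': 1, 'mate_orientation': 'FR',
        'insert_len_mean': 450, 'insert_len_stddev': 50,
        'mutation_dist': 'poly4 3e-3 3.3e-8', 'mutation_ratio': '80 20',
        'qual_good': 30, 'qual_bad': 10,
        'len_bias_flag': 0, 'random_seed': 1,
    }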
class FastaGFFToGenome: def __init__(self, config): self.cfg = config self.dfu = DataFileUtil(self.cfg.callbackURL) def import_file(self, params): # 1) validate parameters self._validate_import_file_params(params) # 2) construct the input directory staging area input_directory = os.path.join(self.cfg.sharedFolder, 'fast_gff_upload_' + str(uuid.uuid4())) os.makedirs(input_directory) file_paths = self._stage_input(params, input_directory) # 3) extract out the parameters params = self._set_parsed_params(params) # 4) do the upload result = self.upload_genome( shock_service_url=self.cfg.shockURL, handle_service_url=self.cfg.handleURL, workspace_service_url=self.cfg.workspaceURL, callback_url=self.cfg.callbackURL, input_fasta_file=file_paths["fasta_file"], input_gff_file=file_paths["gff_file"], workspace_name=params['workspace_name'], core_genome_name=params['genome_name'], scientific_name=params['scientific_name'], taxon_wsname=params['taxon_wsname'], taxon_reference=params['taxon_reference'], source=params['source'], genome_type=params['type'], release=params['release']) # 5) generate report output_data_ref = params['workspace_name'] + "/" + params['genome_name'] reportObj = { 'objects_created': [{ 'ref': output_data_ref, 'description': 'KBase Genome object' }], 'text_message': result['report_string'] } reportClient = KBaseReport(os.environ['SDK_CALLBACK_URL']) report_info = reportClient.create({ 'report': reportObj, 'workspace_name': params['workspace_name'] }) # 6) clear the temp directory shutil.rmtree(input_directory) # 7) return the result info = result['genome_info'] details = { 'genome_ref': str(info[6]) + '/' + str(info[0]) + '/' + str(info[4]), 'genome_info': info, 'report_name': report_info['name'], 'report_ref': report_info['ref'] } return details def upload_genome(self, shock_service_url=None, handle_service_url=None, workspace_service_url=None, callback_url=None, input_gff_file=None, input_fasta_file=None, workspace_name=None, core_genome_name=None, scientific_name="unknown_taxon", taxon_wsname='ReferenceTaxons', taxon_reference=None, source=None, release=None, genome_type=None): # retrieve taxon taxonomy, taxon_reference = self._retrieve_taxon( taxon_reference, taxon_wsname, scientific_name) # reading in Fasta file assembly = self._retrieve_fasta_file(input_fasta_file, core_genome_name, scientific_name, source) if taxon_reference is not None: assembly["taxon_ref"] = taxon_reference # reading in GFF file feature_list = self._retrieve_gff_file(input_gff_file) # compile links between features feature_hierarchy = self._generate_feature_hierarchy(feature_list) # retrieve genome feature list (genome_features_list, genome_mrnas_list, genome_cdss_list) = self._retrieve_genome_feature_list( feature_list, feature_hierarchy, assembly) # remove sequences before loading for contig in assembly["contigs"]: del assembly["contigs"][contig]["sequence"] aUtil = AssemblyUtil(callback_url) assembly_ref = aUtil.save_assembly_from_fasta({ 'file': { 'path': input_fasta_file, 'assembly_name': assembly['assembly_id'] }, 'workspace_name': workspace_name, 'assembly_name': assembly['assembly_id'] }) # generate genome info genome = self._gen_genome_info(core_genome_name, scientific_name, assembly_ref, genome_features_list, genome_cdss_list, genome_mrnas_list, source, assembly, taxon_reference, taxonomy, input_gff_file) workspace_id = self.dfu.ws_name_to_id(workspace_name) genome_info = self.dfu.save_objects({ "id": workspace_id, "objects": [{ "name": core_genome_name, "type": "KBaseGenomes.Genome", "data": 
genome }] })[0] report_string = '' return {'genome_info': genome_info, 'report_string': report_string} def _validate_import_file_params(self, params): """ validate_import_file_params: validates params passed to FastaGFFToGenome.import_file method """ # check for required parameters for p in ['workspace_name', 'genome_name', 'fasta_file', 'gff_file']: if p not in params: raise ValueError( '"{}" parameter is required, but missing'.format(p)) # one and only one of 'path', or 'shock_id' is required for key in ('fasta_file', 'gff_file'): file = params[key] if not isinstance(file, dict): raise ValueError( 'Required "{}" field must be a map/dict'.format(key)) n_valid_fields = 0 if 'path' in file and file['path'] is not None: n_valid_fields += 1 if 'shock_id' in file and file['shock_id'] is not None: n_valid_fields += 1 if 'ftp_url' in file and file['ftp_url'] is not None: n_valid_fields += 1 raise ValueError( 'FTP link is currently not supported for FastaGFFToGenome') if n_valid_fields < 1: error_msg = 'Required "{}" field must include one source: '.format( key) error_msg += 'path | shock_id' raise ValueError(error_msg) if n_valid_fields > 1: error_msg = 'Required "{}" field has too many sources specified: '.format( key) error_msg += str(file.keys()) raise ValueError(error_msg) # check for valid type param valid_types = ['Reference', 'User upload', 'Representative'] if params.get('type') and params['type'] not in valid_types: error_msg = 'Entered value for type is not one of the valid entries of ' error_msg += '[' + ''.join('"' + str(e) + '", ' for e in valid_types)[0:-2] + ']' raise ValueError(error_msg) def _set_parsed_params(self, params): log('Setting params') # default params default_params = { 'taxon_wsname': self.cfg.raw['taxon-workspace-name'], 'scientific_name': 'unknown_taxon', 'taxon_reference': None, 'source': 'User', 'release': None, 'type': 'User upload', 'metadata': {} } for field in default_params: if field not in params: params[field] = default_params[field] log(json.dumps(params, indent=1)) return params def _stage_input(self, params, input_directory): """ stage_input: Setup the input_directory by fetching the files and uncompressing if needed """ file_paths = dict() for key in ('fasta_file', 'gff_file'): file = params[key] file_path = None if 'path' in file and file['path'] is not None: local_file_path = file['path'] file_path = os.path.join(input_directory, os.path.basename(local_file_path)) log('Moving file from {} to {}'.format(local_file_path, file_path)) shutil.copy2(local_file_path, file_path) if 'shock_id' in file and file['shock_id'] is not None: # handle shock file log('Downloading file from SHOCK node: {}-{}'.format( self.cfg.sharedFolder, file['shock_id'])) sys.stdout.flush() file_name = self.dfu.shock_to_file({ 'file_path': input_directory, 'shock_id': file['shock_id'] })['node_file_name'] file_path = os.path.join(input_directory, file_name) # extract the file if it is compressed if file_path is not None: print("staged input file =" + file_path) sys.stdout.flush() dfUtil_result = self.dfu.unpack_file({'file_path': file_path}) file_paths[key] = dfUtil_result['file_path'] else: raise ValueError( 'No valid files could be extracted based on the input') return file_paths def _retrieve_taxon(self, taxon_reference, taxon_wsname, scientific_name): """ _retrieve_taxon: retrieve taxonomy and taxon_reference """ taxon_id = -1 taxon_object_name = "unknown_taxon" # retrieve lookup object if scientific name provided if (taxon_reference is None and scientific_name is not 
"unknown_taxon"): # retrieve taxon lookup object then find taxon id taxon_lookup = self.dfu.get_objects({ 'object_refs': [taxon_wsname + "/taxon_lookup"], 'ignore_errors': 0 })['data'][0]['data']['taxon_lookup'] if (scientific_name[0:3] in taxon_lookup and scientific_name in taxon_lookup[scientific_name[0:3]]): taxon_id = taxon_lookup[scientific_name[0:3]][scientific_name] taxon_object_name = "{}_taxon".format(str(taxon_id)) # retrieve Taxon object taxon_info = {} if (taxon_reference is None): taxon_info = self.dfu.get_objects({ 'object_refs': [taxon_wsname + "/" + taxon_object_name], 'ignore_errors': 0 })['data'][0] taxon_reference = "{}/{}/{}".format(taxon_info['info'][6], taxon_info['info'][0], taxon_info['info'][4]) else: taxon_info = self.dfu.get_objects({ "object_refs": [taxon_reference], 'ignore_errors': 0 })['data'][0] taxonomy = taxon_info['data']['scientific_lineage'] return taxonomy, taxon_reference def _retrieve_fasta_file(self, input_fasta_file, core_genome_name, scientific_name, source): """ _retrieve_fasta_file: retrieve info from fasta_file https://www.biostars.org/p/710/ """ log("Reading FASTA file") assembly = { "contigs": {}, "dna_size": 0, "gc_content": 0, "md5": [], "base_counts": {} } contig_seq_start = 0 input_file_handle = open(input_fasta_file, 'rb') # alternate header and sequence faiter = (x[1] for x in itertools.groupby(input_file_handle, lambda line: line[0] == ">")) for header in faiter: # drop the ">" header = header.next()[1:].strip() # join all sequence lines to one. seq = "".join(s.strip() for s in faiter.next()) try: fasta_header, fasta_description = header.split(' ', 1) except: fasta_header = header fasta_description = None # Handle record seq = seq.upper() # Build contig objects for Assembly seq_count = dict(collections.Counter(seq)) # to delete at end, but required for now contig_dict = {"sequence": seq} Ncount = 0 if "N" in seq_count: Ncount = seq_count["N"] contig_dict["Ncount"] = Ncount for character in seq_count: if character in assembly["base_counts"]: assembly["base_counts"][character] += seq_count[character] else: assembly["base_counts"][character] = seq_count[character] contig_seq_length = len(seq) assembly["dna_size"] += contig_seq_length contig_gc_length = seq.count("G") contig_gc_length += seq.count("C") contig_dict["gc_content"] = float("{0:.2f}".format( float(contig_gc_length) / float(contig_seq_length))) assembly["gc_content"] += contig_gc_length contig_dict["contig_id"] = fasta_header contig_dict["name"] = fasta_header contig_dict["length"] = contig_seq_length contig_dict["md5"] = hashlib.md5(seq).hexdigest() assembly["md5"].append(contig_dict["md5"]) if fasta_description is not None: contig_dict["description"] = fasta_description contig_dict["is_circular"] = "Unknown" contig_dict["start_position"] = contig_seq_start contig_dict["num_bytes"] = sys.getsizeof(contig_dict["sequence"]) assembly["contigs"][fasta_header] = contig_dict # used for start of next sequence and total gc_content contig_seq_start += contig_seq_length assembly["gc_content"] = float("{0:.2f}".format( float(assembly["gc_content"]) / float(contig_seq_start))) assembly["md5"] = hashlib.md5(",".join(assembly["md5"])).hexdigest() assembly["assembly_id"] = core_genome_name + "_assembly" assembly["name"] = scientific_name assembly["external_source"] = source assembly["external_source_id"] = os.path.basename(input_fasta_file) assembly["external_source_origination_date"] = str( os.stat(input_fasta_file).st_ctime) assembly["num_contigs"] = len(assembly["contigs"].keys()) 
assembly["type"] = "Unknown" assembly[ "notes"] = "Note MD5s are generated from uppercasing the sequences" return assembly def _retrieve_gff_file(self, input_gff_file): """ _retrieve_gff_file: retrieve info from gff_file """ log("Reading GFF file") feature_list = dict() is_phytozome = 0 is_patric = 0 gff_file_handle = open(input_gff_file, 'rb') current_line = gff_file_handle.readline() line_count = 0 while (current_line != ''): current_line = current_line.strip() if (current_line.isspace() or current_line == "" or current_line.startswith("#")): pass else: #Split line (contig_id, source_id, feature_type, start, end, score, strand, phase, attributes) = current_line.split('\t') #Checking to see if Phytozome if ("phytozome" in source_id or "Phytozome" in source_id): is_phytozome = 1 #Checking to see if Phytozome if ("PATRIC" in source_id): is_patric = 1 #PATRIC prepends their contig ids with some gibberish if (is_patric and "|" in contig_id): contig_id = contig_id.split("|", 1)[1] #Features grouped by contigs first if (contig_id not in feature_list): feature_list[contig_id] = list() #Populating basic feature object ftr = { 'contig': contig_id, 'source': source_id, 'type': feature_type, 'start': int(start), 'end': int(end), 'score': score, 'strand': strand, 'phase': phase, 'attributes': attributes } #Populating with attribute key-value pair #This is where the feature id is from for attribute in attributes.split(";"): attribute = attribute.strip() #Sometimes empty string if (attribute == ""): continue #Use of 1 to limit split as '=' character can also be made available later #Sometimes lack of "=", assume spaces instead if ("=" in attribute): key, value = attribute.split("=", 1) elif (" " in attribute): key, value = attribute.split(" ", 1) else: log("Warning: attribute " + attribute + " cannot be separated into key,value pair") ftr[key] = value feature_list[contig_id].append(ftr) current_line = gff_file_handle.readline() gff_file_handle.close() #Some GFF/GTF files don't use "ID" so we go through the possibilities feature_list = self._add_missing_identifiers(feature_list) #Most bacterial files have only CDSs #In order to work with prokaryotic and eukaryotic gene structure synonymously #Here we add feature dictionaries representing the parent gene and mRNAs feature_list = self._add_missing_parents(feature_list) #Phytozome has the annoying habit of editing their identifiers so we fix them if (is_phytozome): self._update_phytozome_features(feature_list) #All identifiers need to be checked so that they follow the same general rules #Rules are listed within the function itself feature_list = self._update_identifiers(feature_list) #If phytozome, the edited files need to be re-printed as GFF so that it works better with RNA-Seq pipeline if (is_phytozome): self._print_phytozome_gff(input_gff_file, feature_list) return feature_list def _add_missing_identifiers(self, feature_list): #General rule is to iterate through a range of possibilities if "ID" is missing for contig in feature_list.keys(): for i in range(len(feature_list[contig])): if ("ID" not in feature_list[contig][i]): for key in ("transcriptId", "proteinId", "PACid", "pacid", "Parent"): if (key in feature_list[contig][i]): feature_list[contig][i]['ID'] = feature_list[ contig][i][key] break #If the process fails, throw an error for ftr_type in ("gene", "mRNA", "CDS"): if (ftr_type not in feature_list[contig][i]): continue if ("ID" not in feature_list[contig][i]): log("Error: Cannot find unique ID to utilize in GFF attributes: "+ \ 
feature_list[contig][i]['contig']+"."+ \ feature_list[contig][i]['source']+"."+ \ feature_list[contig][i]['type']+": "+ \ feature_list[contig][i]['attributes']) return feature_list def _generate_feature_hierarchy(self, feature_list): feature_hierarchy = {contig: {} for contig in feature_list} #Need to remember mRNA/gene links for CDSs mRNA_gene_dict = {} exon_list_position_dict = {} for contig in feature_list: for i in range(len(feature_list[contig])): ftr = feature_list[contig][i] if ("gene" in ftr["type"]): feature_hierarchy[contig][ftr["ID"]] = { "utrs": [], "mrnas": [], "cdss": [], "index": i } if ("UTR" in ftr["type"]): feature_hierarchy[contig][mRNA_gene_dict[ ftr["Parent"]]]["utrs"].append({ "id": ftr["ID"], "index": i }) if ("RNA" in ftr["type"]): feature_hierarchy[contig][ftr["Parent"]]["mrnas"].append({ "id": ftr["ID"], "index": i, "cdss": [] }) mRNA_gene_dict[ftr["ID"]] = ftr["Parent"] exon_list_position_dict[ftr["ID"]] = len( feature_hierarchy[contig][ftr["Parent"]]["mrnas"]) - 1 if ("CDS" in ftr["type"]): feature_hierarchy[contig][mRNA_gene_dict[ftr["Parent"]]]["mrnas"]\ [exon_list_position_dict[ftr["Parent"]]]["cdss"].append( { "id": ftr["ID"], "index" : i } ) return feature_hierarchy def _add_missing_parents(self, feature_list): #General rules is if CDS or RNA missing parent, add them for contig in feature_list.keys(): ftrs = feature_list[contig] new_ftrs = [] for i in range(len(ftrs)): if ("Parent" not in ftrs[i]): #Assuming parent doesn't exist at all, so create de novo instead of trying to find it if ("RNA" in ftrs[i]["type"] or "CDS" in ftrs[i]["type"]): new_gene_ftr = copy.deepcopy(ftrs[i]) new_gene_ftr["type"] = "gene" ftrs[i]["Parent"] = new_gene_ftr["ID"] new_ftrs.append(new_gene_ftr) if ("CDS" in ftrs[i]["type"]): new_rna_ftr = copy.deepcopy(ftrs[i]) new_rna_ftr["type"] = "mRNA" new_ftrs.append(new_rna_ftr) ftrs[i]["Parent"] = new_rna_ftr["ID"] new_ftrs.append(ftrs[i]) feature_list[contig] = new_ftrs return feature_list def _update_phytozome_features(self, feature_list): #General rule is to use the "Name" field where possible #And update parent attribute correspondingly for contig in feature_list.keys(): feature_position_dict = {} for i in range(len(feature_list[contig])): #Maintain old_id for reference #Sometimes ID isn't available, so use PACid old_id = None for key in ("ID", "PACid", "pacid"): if (key in feature_list[contig][i]): old_id = feature_list[contig][i][key] break if (old_id is None): #This should be an error print ("Cannot find unique ID, PACid, or pacid in GFF attributes: ",\ feature_list[contig][i][contig],feature_list[contig][i][source],feature_list[contig][i][attributes]) continue #Retain old_id feature_position_dict[old_id] = i #In Phytozome, gene and mRNA have "Name" field, CDS do not if ("Name" in feature_list[contig][i]): feature_list[contig][i]["ID"] = feature_list[contig][i][ "Name"] if ("Parent" in feature_list[contig][i]): #Update Parent to match new ID of parent ftr feature_list[contig][i]["Parent"] = feature_list[contig][ feature_position_dict[feature_list[contig][i] ["Parent"]]]["ID"] return feature_list def _update_identifiers(self, feature_list): #General rules: #1) Genes keep identifier #2) RNAs keep identifier only if its different from gene, otherwise append ".mRNA" #3) CDS always uses RNA identifier with ".CDS" appended #4) CDS appended with an incremented digit CDS_count_dict = dict() mRNA_parent_dict = dict() for contig in feature_list.keys(): for ftr in feature_list[contig]: if ("Parent" in ftr): #Retain old_id of parents old_id 
= ftr["ID"] if (ftr["ID"] == ftr["Parent"] or "CDS" in ftr["type"]): ftr["ID"] = ftr["Parent"] + "." + ftr["type"] #link old to new ids for mRNA to use with CDS if ("RNA" in ftr["type"]): mRNA_parent_dict[old_id] = ftr["ID"] if ("CDS" in ftr["type"]): #Increment CDS identifier if (ftr["ID"] not in CDS_count_dict): CDS_count_dict[ftr["ID"]] = 1 else: CDS_count_dict[ftr["ID"]] += 1 ftr["ID"] = ftr["ID"] + "." + str( CDS_count_dict[ftr["ID"]]) #Recall new mRNA id for parent ftr["Parent"] = mRNA_parent_dict[ftr["Parent"]] return feature_list def _print_phytozome_gff(self, input_gff_file, feature_list): #Write modified feature ids to new file input_gff_file = input_gff_file.replace("gene", "edited_gene") + ".gz" try: print "Printing to new file: " + input_gff_file gff_file_handle = gzip.open(input_gff_file, 'wb') except: print "Failed to open" for contig in sorted(feature_list.iterkeys()): for ftr in feature_list[contig]: #Re-build attributes attributes_dict = {} for attribute in ftr["attributes"].split(";"): attribute = attribute.strip() #Sometimes empty string if (attribute == ""): continue #Use of 1 to limit split as '=' character can also be made available later #Sometimes lack of "=", assume spaces instead if ("=" in attribute): key, value = attribute.split("=", 1) elif (" " in attribute): key, value = attribute.split(" ", 1) else: log("Warning: attribute " + attribute + " cannot be separated into key,value pair") if (ftr[key] != value): value = ftr[key] attributes_dict[key] = value ftr["attributes"] = ";".join(key + "=" + attributes_dict[key] for key in attributes_dict.keys()) new_line = "\t".join( str(ftr[key]) for key in [ 'contig', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes' ]) gff_file_handle.write(new_line) gff_file_handle.close() return def _retrieve_genome_feature_list(self, feature_list, feature_hierarchy, assembly): genome_features_list = list() genome_mrnas_list = list() genome_cdss_list = list() genome_translation_issues = list() for contig in feature_hierarchy: for gene in feature_hierarchy[contig]: #We only iterate through the gene objects #And then for each gene object, retrieve the necessary mRNA and CDS objects indirectly ftr = feature_list[contig][feature_hierarchy[contig][gene] ["index"]] contig_sequence = assembly["contigs"][ ftr["contig"]]["sequence"] gene_ftr = self._convert_ftr_object( ftr, contig_sequence ) #reverse-complementation for negative strands done here #Add non-optional terms gene_ftr["mrnas"] = list() gene_ftr["cdss"] = list() gene_ftr["ontology_terms"] = dict() #Retaining longest sequences for gene feature longest_protein_length = 0 longest_protein_sequence = "" for mRNA in feature_hierarchy[contig][gene]["mrnas"]: ######################################################## # Construct mRNA Ftr ######################################################## ftr = feature_list[contig][mRNA["index"]] contig_sequence = assembly["contigs"][ ftr["contig"]]["sequence"] mRNA_ftr = self._convert_ftr_object( ftr, contig_sequence ) #reverse-complementation for negative strands done here #Modify mrna object for use in mrna array #Objects will be un-used until further notice mRNA_ftr['parent_gene'] = gene_ftr['id'] #If there are CDS, then New CDS ID without incrementation as they were aggregated if (len(mRNA['cdss']) > 0): mRNA_ftr['cds'] = mRNA_ftr['id'] + ".CDS" else: mRNA_ftr['cds'] = "" #Add to mrnas array genome_mrnas_list.append(mRNA_ftr) #Add ids to gene_ftr arrays gene_ftr["mrnas"].append(mRNA_ftr["id"]) 
######################################################## # Construct transcript, protein sequence, UTR, CDS locations ######################################################## #At time of writing, all of this aggregation should probably be done in a single function cds_exons_locations_array = list() cds_cdna_sequence = str() protein_sequence = str() if (len(mRNA["cdss"]) > 0): (cds_exons_locations_array, cds_cdna_sequence, protein_sequence) = \ self._cds_aggregation_translation(mRNA["cdss"],feature_list[contig],assembly,genome_translation_issues) UTRs = list() if ("utrs" in feature_hierarchy[contig][gene] and len(feature_hierarchy[contig][gene]["utrs"]) > 0): for UTR in feature_hierarchy[contig][gene]["utrs"]: ftr = feature_list[contig][UTR["index"]] if ("Parent" in ftr and ftr["Parent"] == mRNA_ftr["id"]): UTRs.append(ftr) mrna_exons_locations_array = copy.deepcopy( cds_exons_locations_array) mrna_transcript_sequence = str(cds_cdna_sequence) if (len(UTRs) > 0): (mrna_exons_locations_array, mrna_transcript_sequence) = \ self._utr_aggregation(UTRs,assembly,mrna_exons_locations_array,cds_cdna_sequence) #Update sequence and locations mRNA_ftr["dna_sequence"] = mrna_transcript_sequence mRNA_ftr["dna_sequence_length"] = len( mrna_transcript_sequence) mRNA_ftr["location"] = mrna_exons_locations_array mRNA_ftr["md5"] = hashlib.md5( mRNA_ftr["dna_sequence"]).hexdigest() #Remove DNA del mRNA_ftr["dna_sequence"] del mRNA_ftr["dna_sequence_length"] #Skip CDS if not present if (len(mRNA["cdss"]) == 0): continue #Remove asterix representing stop codon if present if (len(protein_sequence) > 0 and protein_sequence[-1] == '*'): protein_sequence = protein_sequence[:-1] #Save longest sequence if (len(protein_sequence) > longest_protein_length): longest_protein_length = len(protein_sequence) longest_protein_sequence = protein_sequence ######################################################## # Construct CDS Ftr ######################################################## CDS_ftr = dict() CDS_ftr['type'] = 'CDS' #New CDS ID without incrementation as they were aggregated CDS_ftr['id'] = mRNA_ftr['id'] + '.CDS' #Add gene/mrna links CDS_ftr['parent_gene'] = gene_ftr['id'] CDS_ftr['parent_mrna'] = mRNA_ftr['id'] #Update sequence and locations CDS_ftr["dna_sequence"] = cds_cdna_sequence CDS_ftr["dna_sequence_length"] = len(cds_cdna_sequence) CDS_ftr["location"] = cds_exons_locations_array CDS_ftr["md5"] = hashlib.md5( CDS_ftr["dna_sequence"]).hexdigest() #Add protein CDS_ftr["protein_translation"] = str( protein_sequence).upper() CDS_ftr["protein_translation_length"] = len( CDS_ftr["protein_translation"]) #Only generate md5 for dna sequences #CDS_ftr["md5"] = hashlib.md5(CDS_ftr["protein_translation"]).hexdigest() #Add empty non-optional fields for populating in future CDS_ftr["ontology_terms"] = dict() if ("aliases" not in CDS_ftr): CDS_ftr["aliases"] = list() if ("function" not in CDS_ftr): CDS_ftr["function"] = "" #Add to cdss array genome_cdss_list.append(CDS_ftr) #Add ids to gene_ftr arrays gene_ftr["cdss"].append(CDS_ftr["id"]) gene_ftr["protein_translation"] = longest_protein_sequence gene_ftr["protein_translation_length"] = longest_protein_length genome_features_list.append(gene_ftr) msg = "Genome features processed: {} genes, {} RNAs, and {} CDSs\n".format( len(genome_features_list), len(genome_mrnas_list), len(genome_cdss_list)) msg += "{} mRNA(s) had errors during translation".format( len(genome_translation_issues)) log(msg) return genome_features_list, genome_mrnas_list, genome_cdss_list def 
_gen_genome_info(self, core_genome_name, scientific_name, assembly_ref, genome_features_list, genome_cdss_list, genome_mrnas_list, source, assembly, taxon_reference, taxonomy, input_gff_file): """ _gen_genome_info: generate genome info """ genome = dict() genome["id"] = core_genome_name genome["scientific_name"] = scientific_name genome["assembly_ref"] = assembly_ref genome["features"] = genome_features_list genome["cdss"] = genome_cdss_list genome["mrnas"] = genome_mrnas_list genome["source"] = source genome["domain"] = "Eukaryota" genome["genetic_code"] = 1 genome["gc_content"] = assembly["gc_content"] genome["dna_size"] = assembly["dna_size"] if taxon_reference is not None: genome["taxon_ref"] = taxon_reference genome["taxonomy"] = taxonomy gff_file_to_shock = self.dfu.file_to_shock({ 'file_path': input_gff_file, 'make_handle': 1, 'pack': "gzip" }) gff_handle_ref = gff_file_to_shock['handle']['hid'] genome['gff_handle_ref'] = gff_handle_ref return genome def _convert_ftr_object(self, old_ftr, contig): new_ftr = dict() new_ftr["id"] = old_ftr["ID"] dna_sequence = Seq(contig[old_ftr["start"] - 1:old_ftr["end"]], IUPAC.ambiguous_dna) # reverse complement if (old_ftr["strand"] == "-"): dna_sequence = dna_sequence.reverse_complement() old_start = old_ftr["start"] old_ftr["start"] = old_ftr["end"] old_ftr["end"] = old_start new_ftr["dna_sequence"] = str(dna_sequence).upper() new_ftr["dna_sequence_length"] = len(dna_sequence) new_ftr["md5"] = hashlib.md5(str(dna_sequence)).hexdigest() new_ftr["location"] = [[ old_ftr["contig"], old_ftr["start"], old_ftr["strand"], len(dna_sequence) ]] new_ftr["type"] = old_ftr["type"] new_ftr["aliases"] = list() for key in ("transcriptId", "proteinId", "PACid", "pacid"): if (key in old_ftr.keys()): new_ftr["aliases"].append(key + ":" + old_ftr[key]) return new_ftr def _utr_aggregation(self, utr_list, assembly, exons, exon_sequence): #create copies of locations and transcript utrs_exons = list(exons) utr_exon_sequence = exon_sequence five_prime_dna_sequence = "" three_prime_dna_sequence = "" five_prime_locations = list() three_prime_locations = list() for UTR in (utr_list): contig_sequence = assembly["contigs"][UTR["contig"]]["sequence"] UTR_ftr = self._convert_ftr_object( UTR, contig_sequence ) #reverse-complementation for negative strands done here #aggregate sequences and locations if ("five_prime" in UTR_ftr["id"]): five_prime_dna_sequence += UTR_ftr["dna_sequence"] five_prime_locations.append(UTR_ftr["location"][0]) if ("three_prime" in UTR_ftr["id"]): three_prime_dna_sequence += UTR_ftr["dna_sequence"] three_prime_locations.append(UTR_ftr["location"][0]) #Handle five_prime UTRs if (len(five_prime_locations) > 0): #Sort UTRs by "start" (reverse-complement UTRs in Phytozome appear to be incorrectly ordered in the GFF file) five_prime_locations = sorted(five_prime_locations, key=lambda x: x[1]) #Merge last UTR with CDS if "next" to each other if( ( utrs_exons[0][2] == "+" and five_prime_locations[-1][1]+five_prime_locations[-1][3] == utrs_exons[0][1] ) or \ ( utrs_exons[0][2] == "-" and five_prime_locations[-1][1]-five_prime_locations[-1][3] == utrs_exons[0][1] ) ): #Remove last UTR last_five_prime_location = five_prime_locations[-1] five_prime_locations = five_prime_locations[:-1] #"Add" last UTR to first exon utrs_exons[0][1] = last_five_prime_location[1] utrs_exons[0][3] += last_five_prime_location[3] #Prepend other UTRs if available if (len(five_prime_locations) > 0): utrs_exons = five_prime_locations + utrs_exons utr_exon_sequence = 
five_prime_dna_sequence + utr_exon_sequence #Handle three_prime UTRs if (len(three_prime_locations) > 0): #Sort UTRs by "start" (reverse-complement UTRs in Phytozome appear to be incorrectly ordered in the GFF file three_prime_locations = sorted(three_prime_locations, key=lambda x: x[1]) #Merge first UTR with CDS if "next to each other if( ( utrs_exons[0][2] == "+" and utrs_exons[-1][1]+utrs_exons[-1][3] == three_prime_locations[0][1] ) or \ ( utrs_exons[0][2] == "-" and utrs_exons[-1][1]-utrs_exons[-1][3] == three_prime_locations[0][1] ) ): #Remove first UTR first_three_prime_location = three_prime_locations[0] three_prime_locations = three_prime_locations[1:] #"Add" first UTR to last exon utrs_exons[-1][3] += first_three_prime_location[3] #Append other UTRs if available if (len(three_prime_locations) > 0): utrs_exons = utrs_exons + three_prime_locations utr_exon_sequence += three_prime_dna_sequence return (utrs_exons, utr_exon_sequence) def _cds_aggregation_translation(self, cds_list, feature_list, assembly, issues): dna_sequence = "" locations = list() # collect phases, and lengths of exons # right now, this is only for the purpose of error reporting phases = list() exons = list() #Saving parent mRNA identifier Parent_mRNA = cds_list[0]["id"] for CDS in (cds_list): ftr = feature_list[CDS["index"]] phases.append(ftr["phase"]) Parent_mRNA = ftr["Parent"] contig_sequence = assembly["contigs"][ftr["contig"]]["sequence"] CDS_ftr = self._convert_ftr_object( ftr, contig_sequence ) #reverse-complementation for negative strands done here exons.append(len(CDS_ftr["dna_sequence"])) # Remove base(s) according to phase, but only for first CDS if (CDS == cds_list[0] and int(ftr["phase"]) != 0): log("Adjusting phase for first CDS: " + CDS["id"]) CDS_ftr["dna_sequence"] = CDS_ftr["dna_sequence"][ int(ftr["phase"]):] #aggregate sequences and locations dna_sequence += CDS_ftr["dna_sequence"] locations.append(CDS_ftr["location"][0]) # translate sequence dna_sequence_obj = Seq(dna_sequence, IUPAC.ambiguous_dna) rna_sequence = dna_sequence_obj.transcribe() # incomplete gene model with no start codon if str(rna_sequence.upper())[:3] not in codon_table.start_codons: msg = "Missing start codon for {}. Possibly incomplete gene model.".format( Parent_mRNA) log(msg) # You should never have this problem, needs to be reported rather than "fixed" codon_count = len(str(rna_sequence)) % 3 if codon_count != 0: msg = "Number of bases for RNA sequence for {} ".format( Parent_mRNA) msg += "is not divisible by 3. " msg += "The resulting protein may well be mis-translated." log(msg) issues.append(Parent_mRNA) protein_sequence = Seq("") try: protein_sequence = rna_sequence.translate() except CodonTable.TranslationError as te: log("TranslationError for: " + feature_object["id"], phases, exons, " : " + str(te)) return (locations, dna_sequence.upper(), str(protein_sequence).upper())
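# Illustrative usage sketch only: the config object, workspace, scientific name, and
# file paths are placeholders; in a real SDK module they come from the deploy config
# and the method params.  Defaults for the optional fields are filled in by
# _set_parsed_params() above.
def _example_import_fasta_gff(config):
    importer = FastaGFFToGenome(config)
    return importer.import_file({
        'workspace_name': 'my_workspace',
        'genome_name': 'my_genome',
        'fasta_file': {'path': '/kb/module/work/tmp/assembly.fa'},    # or {'shock_id': ...}
        'gff_file': {'path': '/kb/module/work/tmp/annotation.gff3'},  # or {'shock_id': ...}
        'scientific_name': 'Arabidopsis thaliana',
        'source': 'Phytozome',
        'type': 'User upload',   # one of 'Reference' | 'Representative' | 'User upload'
    })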