class GenbankToGenome: def __init__(self, config): self.cfg = config self.gi = GenomeInterface(config) self.dfu = DataFileUtil(config.callbackURL) self.aUtil = AssemblyUtil(config.callbackURL) self.ws = Workspace(config.workspaceURL) self._messages = [] self.time_string = str( datetime.datetime.fromtimestamp( time.time()).strftime('%Y_%m_%d_%H_%M_%S')) yml_text = open('/kb/module/kbase.yml').read() self.version = re.search("module-version:\n\W+(.+)\n", yml_text).group(1) self.generate_parents = False self.generate_ids = False self.genes = OrderedDict() self.mrnas = OrderedDict() self.cdss = OrderedDict() self.noncoding = [] self.ontologies_present = defaultdict(dict) self.ontology_events = list() self.skiped_features = Counter() self.feature_counts = Counter() self.orphan_types = Counter() self.contig_seq = {} self.circ_contigs = set() self.features_spaning_zero = set() self.genome_warnings = [] self.genome_suspect = False self.defects = Counter() self.spoofed_genes = 0 self.excluded_features = ('source', 'exon', 'fasta_record') self.ont_mappings = load_ontology_mappings('/kb/module/data') self.code_table = 11 self.re_api_url = config.re_api_url # dict with feature 'id's that have been used more than once. self.used_twice_identifiers = {} self.default_params = { 'source': 'Genbank', 'taxon_wsname': self.cfg.raw['taxon-workspace-name'], 'taxon_lookup_obj_name': self.cfg.raw['taxon-lookup-object-name'], 'ontology_wsname': self.cfg.raw['ontology-workspace-name'], 'ontology_GO_obj_name': self.cfg.raw['ontology-gene-ontology-obj-name'], 'ontology_PO_obj_name': self.cfg.raw['ontology-plant-ontology-obj-name'], 'release': None, 'genetic_code': 11, 'generate_ids_if_needed': 0, 'metadata': {} } @property def messages(self): return "\n".join(self._messages) def refactored_import(self, ctx, params): # 1) validate parameters and extract defaults self.validate_params(params) # 2) construct the input directory staging area input_directory = self.stage_input(params) # 3) update default params self.default_params.update(params) params = self.default_params self.generate_parents = params.get('generate_missing_genes') self.generate_ids = params.get('generate_ids_if_needed') if params.get('genetic_code'): self.code_table = params['genetic_code'] # 4) Do the upload files = self._find_input_files(input_directory) consolidated_file = self._join_files_skip_empty_lines(files) genome = self.parse_genbank(consolidated_file, params) if params.get('genetic_code'): genome["genetic_code"] = params['genetic_code'] result = self.gi.save_one_genome({ 'workspace': params['workspace_name'], 'name': params['genome_name'], 'data': genome, "meta": params['metadata'], }) ref = f"{result['info'][6]}/{result['info'][0]}/{result['info'][4]}" logging.info(f"Genome saved to {ref}") # 5) clear the temp directory shutil.rmtree(input_directory) # 6) return the result info = result['info'] details = {'genome_ref': ref, 'genome_info': info} return details @staticmethod def validate_params(params): if 'workspace_name' not in params: raise ValueError('required "workspace_name" field was not defined') if 'genome_name' not in params: raise ValueError('required "genome_name" field was not defined') if 'file' not in params: raise ValueError('required "file" field was not defined') # one and only one of 'path', 'shock_id', or 'ftp_url' is required file = params['file'] if not isinstance(file, dict): raise ValueError('required "file" field must be a map/dict') sources = ('path', 'shock_id', 'ftp_url') n_valid_fields = sum(1 for f in sources if file.get(f)) if n_valid_fields < 1: raise ValueError(f'required "file" field must include one source: ' f'{", ".join(sources)}') if n_valid_fields > 1: raise ValueError( f'required "file" field has too many sources specified: ' f'{", ".join(file.keys())}') if params.get('genetic_code'): if not (isinstance(params['genetic_code'], int) and 0 < params['genetic_code'] < 32): raise ValueError(f"Invalid genetic code specified: {params}") def stage_input(self, params): """ Setup the input_directory by fetching the files and uncompressing if needed. """ # construct the input directory where we stage files input_directory = os.path.join( self.cfg.sharedFolder, f'genome-upload-staging-{uuid.uuid4()}') os.makedirs(input_directory) # at this point, the 'file' input is validated, so we don't have to catch any special cases # we expect one and only one of path, shock_id, or ftp_url # determine how to get the file: if it is from shock, download it. If it # is just sitting there, then use it. Move the file to the staging input directory file = params['file'] genbank_file_path = None if file.get('path') is not None: # copy the local file to the input staging directory # (NOTE: could just move it, but then this method would have the side effect of moving your # file which another SDK module might have an open handle on) local_file_path = file['path'] genbank_file_path = os.path.join(input_directory, os.path.basename(local_file_path)) shutil.copy2(local_file_path, genbank_file_path) if 'shock_id' in file and file['shock_id'] is not None: # handle shock file logging.info( f'Downloading file from SHOCK node: {self.cfg.shockURL} - {file["shock_id"]}' ) sys.stdout.flush() file_name = self.dfu.shock_to_file({ 'file_path': input_directory, 'shock_id': file['shock_id'] })['node_file_name'] genbank_file_path = os.path.join(input_directory, file_name) if 'ftp_url' in file and file['ftp_url'] is not None: logging.info('Downloading file from: ' + str(file['ftp_url'])) local_file_path = self.dfu.download_web_file({ 'file_url': file['ftp_url'], 'download_type': 'FTP' })['copy_file_path'] genbank_file_path = os.path.join(input_directory, os.path.basename(local_file_path)) shutil.copy2(local_file_path, genbank_file_path) # extract the file if it is compressed if genbank_file_path is not None: logging.info("staged input file =" + genbank_file_path) self.dfu.unpack_file({'file_path': genbank_file_path}) else: raise ValueError( 'No valid files could be extracted based on the input') return input_directory def parse_genbank(self, file_path, params): logging.info("Saving original file to shock") shock_res = self.dfu.file_to_shock({ 'file_path': file_path, 'make_handle': 1, 'pack': 'gzip', }) # Write and save assembly file assembly_ref = self._save_assembly(file_path, params) assembly_data = self.dfu.get_objects({ 'object_refs': [assembly_ref], 'ignore_errors': 0 })['data'][0]['data'] genome = { "id": params['genome_name'], "original_source_file_name": os.path.basename(file_path), "assembly_ref": assembly_ref, "gc_content": assembly_data['gc_content'], "dna_size": assembly_data['dna_size'], "md5": assembly_data['md5'], "genbank_handle_ref": shock_res['handle']['hid'], "publications": set(), "contig_ids": [], "contig_lengths": [], } genome['source'], genome['genome_tiers'] = self.gi.determine_tier( params['source']) if params.get('genome_type'): genome['genome_type'] = params['genome_type'] # Set taxonomy-related fields in the genome # Also validates the given taxon ID if params.get('taxon_id'): set_taxon_data(int(params['taxon_id']), self.re_api_url, genome) else: set_default_taxon_data(genome) dates = [] # Parse data from genbank file contigs = Bio.SeqIO.parse(file_path, "genbank") for record in contigs: r_annot = record.annotations logging.info("parsing contig: " + record.id) try: dates.append(time.strptime(r_annot.get('date'), "%d-%b-%Y")) except (TypeError, ValueError): pass genome['contig_ids'].append(record.id) genome['contig_lengths'].append(len(record)) genome["publications"] |= self._get_pubs(r_annot) # only do the following once(on the first contig) if "source_id" not in genome: genome["source_id"] = record.id.split('.')[0] organism = r_annot.get('organism', 'Unknown Organism') if params.get('scientific_name'): genome['scientific_name'] = params['scientific_name'] else: genome['scientific_name'] = organism self.code_table = genome['genetic_code'] genome["molecule_type"] = r_annot.get('molecule_type', 'DNA') genome['notes'] = r_annot.get('comment', "").replace('\\n', '\n') self._parse_features(record, genome['source']) genome.update(self.get_feature_lists()) genome['num_contigs'] = len(genome['contig_ids']) # add dates dates.sort() if dates: genome['external_source_origination_date'] = time.strftime( "%d-%b-%Y", dates[0]) if dates[0] != dates[-1]: genome['external_source_origination_date'] += " _ " + \ time.strftime("%d-%b-%Y", dates[-1]) if self.ontologies_present: genome['ontologies_present'] = dict(self.ontologies_present) genome["ontology_events"] = self.ontology_events genome['feature_counts'] = dict(self.feature_counts) # can't serialize a set genome['publications'] = list(genome['publications']) if len(genome['cdss']) and (self.defects['cds_seq_not_matching'] / float(len(genome['cdss'])) > 0.02): self.genome_warnings.append( warnings["genome_inc_translation"].format( self.defects['cds_seq_not_matching'], len(genome['cdss']))) self.genome_suspect = 1 if self.defects['bad_parent_loc']: self.genome_warnings.append( f"There were {self.defects['bad_parent_loc']} parent/child " "relationships that were not able to be determined. Some of " "these may have splice variants that may be valid relationships." ) if self.defects['spoofed_genes']: self.genome_warnings.append(warnings['spoofed_genome'].format( self.defects['spoofed_genes'])) genome['suspect'] = 1 if self.defects['not_trans_spliced']: self.genome_warnings.append( warnings['genome_not_trans_spliced'].format( self.defects['not_trans_spliced'])) genome['suspect'] = 1 if self.genome_warnings: genome['warnings'] = self.genome_warnings if self.genome_suspect: genome['suspect'] = 1 logging.info(f"Feature Counts: {genome['feature_counts']}") return genome def _save_assembly(self, genbank_file, params): """Convert genbank file to fasta and sve as assembly""" contigs = Bio.SeqIO.parse(genbank_file, "genbank") assembly_id = f"{params['genome_name']}_assembly" fasta_file = f"{self.cfg.sharedFolder}/{params['genome_name']}_assembly.fasta" out_contigs = [] extra_info = defaultdict(dict) for in_contig in contigs: if in_contig.annotations.get('topology', "") == 'circular': extra_info[in_contig.id]['is_circ'] = 1 self.circ_contigs.add(in_contig.id) elif in_contig.annotations.get('topology', "") == 'linear': extra_info[in_contig.id]['is_circ'] = 0 out_contigs.append(in_contig) self.contig_seq[in_contig.id] = in_contig.seq.upper() assembly_ref = params.get("use_existing_assembly") if assembly_ref: if not re.match("\d+\/\d+\/\d+", assembly_ref): raise ValueError( f"Assembly ref: {assembly_ref} is not a valid format. Must" f" be in numerical <ws>/<object>/<version> format.") ret = self.dfu.get_objects({'object_refs': [assembly_ref]})['data'][0] if "KBaseGenomeAnnotations.Assembly" not in ret['info'][2]: raise ValueError( f"{assembly_ref} is not a reference to an assembly") unmatched_ids = list() unmatched_ids_md5s = list() for current_contig in self.contig_seq.keys(): current_contig_md5 = hashlib.md5( str(self.contig_seq[current_contig]).encode( 'utf8')).hexdigest() if current_contig in ret['data']['contigs']: if current_contig_md5 != ret['data']['contigs'][ current_contig]['md5']: unmatched_ids_md5s.append(current_contig) else: unmatched_ids.append(current_contig) if len(unmatched_ids) > 0: raise ValueError(warnings['assembly_ref_extra_contigs'].format( ", ".join(unmatched_ids))) if len(unmatched_ids_md5s) > 0: raise ValueError(warnings["assembly_ref_diff_seq"].format( ", ".join(unmatched_ids_md5s))) logging.info(f"Using supplied assembly: {assembly_ref}") return assembly_ref logging.info("Saving sequence as Assembly object") Bio.SeqIO.write(out_contigs, fasta_file, "fasta") assembly_ref = self.aUtil.save_assembly_from_fasta({ 'file': { 'path': fasta_file }, 'workspace_name': params['workspace_name'], 'assembly_name': assembly_id, 'type': params.get('genome_type', 'isolate'), 'contig_info': extra_info }) logging.info(f"Assembly saved to {assembly_ref}") return assembly_ref def _find_input_files(self, input_directory): logging.info("Scanning for Genbank Format files.") valid_extensions = [".gbff", ".gbk", ".gb", ".genbank", ".dat", ".gbf"] files = os.listdir(os.path.abspath(input_directory)) logging.info("Genbank Files : " + ", ".join(files)) genbank_files = [ x for x in files if os.path.splitext(x)[-1].lower() in valid_extensions ] if len(genbank_files) == 0: raise Exception( f"The input directory does not have any files with one of the " f"following extensions {','.join(valid_extensions)}.") logging.info(f"Found {len(genbank_files)} genbank files") input_files = [] for genbank_file in genbank_files: input_files.append(os.path.join(input_directory, genbank_file)) return input_files def _join_files_skip_empty_lines(self, input_files): """ Applies strip to each line of each input file. Args: input_files: Paths to input files in Genbank format. Returns: Path to resulting file (currenly it's the same file as input). """ if len(input_files) == 0: raise ValueError("NO GENBANK FILE") temp_dir = os.path.join(os.path.dirname(input_files[0]), "combined") if not os.path.exists(temp_dir): os.makedirs(temp_dir) ret_file = os.path.join(temp_dir, os.path.basename(input_files[0])) # take in Genbank file and remove all empty lines from it. with open(ret_file, 'w', buffering=2**20) as f_out: for input_file in input_files: with open(input_file, 'r') as f_in: for line in f_in: line = line.rstrip('\r\n') if line.strip(): f_out.write(line + '\n') return ret_file def _get_pubs(self, r_annotations): """Get a contig's publications""" pub_list = [] for in_pub in r_annotations.get('references', []): # don't add blank pubs if not in_pub.authors: continue out_pub = [ 0, # pmid "", # source in_pub.title, "", # web address "", # date in_pub.authors, in_pub.journal, ] date_match = re.match("\((\d{4})\)", in_pub.journal) if date_match: out_pub[4] = date_match.group(1) if in_pub.pubmed_id: out_pub[0:4] = [ int(in_pub.pubmed_id), "PubMed", in_pub.title, f"http://www.ncbi.nlm.nih.gov/pubmed/{in_pub.pubmed_id}" ] pub_list.append(tuple(out_pub)) logging.info(f"Parsed {len(pub_list)} publication records") return set(pub_list) def _get_id(self, feat, tags=None): """Assign a id to a feature based on the first tag that exists""" _id = "" if not tags: tags = ['locus_tag', 'kbase_id'] for t in tags: _id = feat.qualifiers.get(t, [""])[0] if _id: break if not _id: if feat.type == 'gene': if not self.generate_ids: raise ValueError( f"Unable to find a valid id for gene " f"among these tags: {', '.join(tags)}. Correct the " f"file or rerun with generate_ids\n {feat}") self.orphan_types['gene'] += 1 _id = f"gene_{self.orphan_types['gene']}" if 'rna' in feat.type.lower() or feat.type in { 'CDS', 'sig_peptide', 'five_prime_UTR', 'three_prime_UTR' }: _id = f"gene_{self.orphan_types['gene']}" return _id def _parse_features(self, record, source): def _location(feat): """Convert to KBase style location objects""" strand_trans = ("", "+", "-") loc = [] for part in feat.location.parts: contig_id = part.ref if part.ref else record.id if part.strand >= 0: begin = int(part.start) + 1 else: begin = int(part.end) loc.append( (contig_id, begin, strand_trans[part.strand], len(part))) return loc def _warn(message): if message not in out_feat.get('warnings', []): out_feat['warnings'] = out_feat.get('warnings', []) + [message] def _check_suspect_location(parent=None): if 'trans_splicing' in out_feat.get('flags', []): return if out_feat['location'] == sorted( out_feat['location'], reverse=(in_feature.location.strand == -1)): return if record.id in self.circ_contigs and \ in_feature.location.start == 0 \ and in_feature.location.end == len(record): self.features_spaning_zero.add(out_feat['id']) return if parent and parent['id'] in self.features_spaning_zero: return _warn(warnings['not_trans_spliced']) self.defects['not_trans_spliced'] += 1 for in_feature in record.features: if in_feature.type in self.excluded_features: self.skiped_features[in_feature.type] += 1 continue feat_seq = self._get_seq(in_feature, record.id) if source == "Ensembl": _id = self._get_id(in_feature, ['gene', 'locus_tag']) else: _id = self._get_id(in_feature) # The following is common to all the feature types out_feat = { "id": "_".join([_id, in_feature.type]), "location": _location(in_feature), "dna_sequence": str(feat_seq), "dna_sequence_length": len(feat_seq), "md5": hashlib.md5(str(feat_seq).encode('utf8')).hexdigest(), } if not _id: out_feat['id'] = in_feature.type # validate input feature # note that end is the larger number regardless of strand if int(in_feature.location.end) > len(record): self.genome_warnings.append( warnings["coordinates_off_end"].format(out_feat['id'])) self.genome_suspect = 1 continue for piece in in_feature.location.parts: if not isinstance(piece.start, ExactPosition) \ or not isinstance(piece.end, ExactPosition): _warn(warnings["non_exact_coordinates"]) self.feature_counts[in_feature.type] += 1 # add optional fields if 'note' in in_feature.qualifiers: out_feat['note'] = in_feature.qualifiers["note"][0] out_feat.update(self._get_aliases_flags_functions(in_feature)) ont, db_xrefs = self._get_ontology_db_xrefs(in_feature) if ont: out_feat['ontology_terms'] = ont if db_xrefs: out_feat['db_xrefs'] = db_xrefs if 'inference' in in_feature.qualifiers: out_feat['inference_data'] = parse_inferences( in_feature.qualifiers['inference']) _check_suspect_location(self.genes.get(_id)) # add type specific features if in_feature.type == 'CDS': self.process_cds(_id, feat_seq, in_feature, out_feat) elif in_feature.type == 'gene': self.process_gene(_id, out_feat) elif in_feature.type == 'mRNA': self.process_mrna(_id, out_feat) else: self.noncoding.append( self.process_noncoding(_id, in_feature.type, out_feat)) def get_feature_lists(self): """sort genes into their final arrays""" coding = [] for g in self.genes.values(): if len(g['cdss']): if g['mrnas'] and len(g['mrnas']) != len(g['cdss']): msg = "The length of the mrna and cdss arrays are not equal" g['warnings'] = g.get('warnings', []) + [msg] # remove duplicates that may arise from CDS info propagation for key in ('functions', 'aliases', 'db_xrefs'): if key in g: g[key] = list(set(g[key])) if not g['mrnas']: del g['mrnas'] del g['type'] coding.append(g) self.feature_counts["protein_encoding_gene"] += 1 else: del g['mrnas'], g['cdss'] self.noncoding.append(g) self.feature_counts["non_coding_genes"] += 1 self.feature_counts["non_coding_features"] = len(self.noncoding) return { 'features': coding, 'non_coding_features': self.noncoding, 'cdss': list(self.cdss.values()), 'mrnas': list(self.mrnas.values()) } def _get_seq(self, feat, contig): """Extract the DNA sequence for a feature""" seq = [] for part in feat.location.parts: strand = part.strand # handle trans-splicing across contigs if part.ref: part_contig = part.ref else: part_contig = contig if strand >= 0: seq.append( str(self.contig_seq[part_contig][part.start:part.end])) else: seq.append( str(self.contig_seq[part_contig] [part.start:part.end].reverse_complement())) return "".join(seq) def _create_ontology_event(self, ontology_type): """Creates the ontology_event if necessary Returns the index of the ontology event back.""" if ontology_type not in self.ont_mappings: raise ValueError(f"{ontology_type} is not a supported ontology") if "event_index" not in self.ont_mappings[ontology_type]: self.ont_mappings[ontology_type]['event_index'] = len( self.ontology_events) if ontology_type == "GO": ontology_ref = "KBaseOntology/gene_ontology" elif ontology_type == "PO": ontology_ref = "KBaseOntology/plant_ontology" else: ontology_ref = f"KBaseOntology/{ontology_type.lower()}_ontology" self.ontology_events.append({ "method": "GenomeFileUtils Genbank uploader from annotations", "method_version": self.version, "timestamp": self.time_string, "id": ontology_type, "ontology_ref": ontology_ref }) return self.ont_mappings[ontology_type]['event_index'] def _get_ontology_db_xrefs(self, feature): """Splits the ontology info from the other db_xrefs""" ontology = defaultdict(dict) db_xrefs = [] for key in ("GO_process", "GO_function", "GO_component"): ontology_event_index = self._create_ontology_event("GO") for term in feature.qualifiers.get(key, []): sp = term.split(" - ") ontology['GO'][sp[0]] = [ontology_event_index] self.ontologies_present['GO'][ sp[0]] = self.ont_mappings['GO'].get(sp[0], '') for ref in feature.qualifiers.get('db_xref', []): if ref.startswith('GO:'): ontology['GO'][ref] = [self._create_ontology_event("GO")] self.ontologies_present['GO'][ref] = self.ont_mappings[ 'GO'].get(ref, '') elif ref.startswith('PO:'): ontology['PO'][ref] = [self._create_ontology_event("PO")] self.ontologies_present['PO'][ref] = self.ont_mappings[ 'PO'].get(ref, '') elif ref.startswith('KO:'): ontology['KO'][ref] = [self._create_ontology_event("KO")] self.ontologies_present['KO'][ref] = self.ont_mappings[ 'KO'].get(ref, '') elif ref.startswith('COG'): ontology['COG'][ref] = [self._create_ontology_event("COG")] self.ontologies_present['COG'][ref] = self.ont_mappings[ 'COG'].get(ref, '') elif ref.startswith('PF'): ontology['PFAM'][ref] = [self._create_ontology_event("PFAM")] self.ontologies_present['PFAM'][ref] = self.ont_mappings[ 'PFAM'].get(ref, '') elif ref.startswith('TIGR'): ontology['TIGRFAM'][ref] = [ self._create_ontology_event("TIGRFAM") ] self.ontologies_present['TIGRFAM'][ref] = self.ont_mappings[ 'TIGRFAM'].get(ref, '') elif ":" not in ref: db_xrefs.append(tuple(["Unknown_Source", ref])) else: db_xrefs.append(tuple(ref.split(":", 1))) return dict(ontology), sorted(db_xrefs) @staticmethod def _get_aliases_flags_functions(feat): """Get the values for aliases flags and features from qualifiers""" alias_keys = { 'locus_tag', 'old_locus_tag', 'protein_id', 'transcript_id', 'gene', 'EC_number', 'gene_synonym' } result = defaultdict(list) for key, val_list in feat.qualifiers.items(): if key in alias_keys: result['aliases'].extend([(key, val) for val in val_list]) # flags have no other information associated with them if val_list == ['']: result['flags'].append(key) if key == 'function': result['functional_descriptions'].extend( val_list[0].split('; ')) if key == 'product': result['functions'] = val_list return result def _find_parent_gene(self, potential_id, feature): """Unfortunately, Genbank files don't have a parent ID and the features can be out of order at times. To account for this, the this function works backwards from the end of list of IDs and stops when if finds a parent with valid coordinates or it hits the maximum number of tries""" if potential_id in self.genes: lookup_attempts = 0 while lookup_attempts < MAX_PARENT_LOOKUPS: if is_parent(self.genes[potential_id], feature): return potential_id lookup_attempts += 1 try: potential_id = list( self.genes.keys())[-(lookup_attempts + 1)] except IndexError: break # no more genes that could match exist self.defects['bad_parent_loc'] += 1 return None def assign_new_id(self, _id): """given a feature id that has already been used, add a unique modifier to it""" _id_modifier = self.used_twice_identifiers.get(_id, 1) self.used_twice_identifiers[_id] = _id_modifier + 1 return _id + "." + str(_id_modifier) def process_gene(self, _id, out_feat): out_feat.update({ "id": _id, "type": 'gene', "mrnas": [], 'cdss': [], }) if _id in self.genes: _id = self.assign_new_id(_id) out_feat.update({"id": _id}) # raise ValueError(f"Duplicate gene ID: {_id}") self.genes[_id] = out_feat def process_noncoding(self, gene_id, feat_type, out_feat): out_feat["type"] = feat_type # this prevents big misc_features from blowing up the genome size if out_feat['dna_sequence_length'] > MAX_MISC_FEATURE_SIZE: del out_feat['dna_sequence'] gene_id = self._find_parent_gene(gene_id, out_feat) if gene_id: if 'children' not in self.genes[gene_id]: self.genes[gene_id]['children'] = [] out_feat['id'] += "_" + str( len(self.genes[gene_id]['children']) + 1) self.genes[gene_id]['children'].append(out_feat['id']) out_feat['parent_gene'] = gene_id else: self.orphan_types[feat_type] += 1 out_feat['id'] += "_" + str(self.orphan_types[feat_type]) return out_feat def process_mrna(self, gene_id, out_feat): if gene_id not in self.genes and self.generate_parents: self.process_gene(gene_id, copy.copy(out_feat)) gene_id = self._find_parent_gene(gene_id, out_feat) if gene_id: out_feat['id'] = "_".join( (gene_id, "mRNA", str(len(self.genes[gene_id]['mrnas']) + 1))) self.genes[gene_id]['mrnas'].append(out_feat['id']) out_feat['parent_gene'] = gene_id else: self.orphan_types['mrna'] += 1 out_feat['id'] = f"mRNA_{self.orphan_types['mrna']}" out_feat['warnings'] = out_feat.get('warnings', []) + [ 'Unable to find parent gene for ' + str(out_feat['id']) ] self.mrnas[out_feat['id']] = out_feat def process_cds(self, gene_id, feat_seq, in_feature, out_feat): # Associate CDS with parents cds_warnings = out_feat.get('warnings', []) validated_gene_id = self._find_parent_gene(gene_id, out_feat) if validated_gene_id: out_feat['id'] = "_".join( (validated_gene_id, "CDS", str(len(self.genes[validated_gene_id]['cdss']) + 1))) self.genes[validated_gene_id]['cdss'].append(out_feat['id']) out_feat['parent_gene'] = validated_gene_id elif self.generate_parents and gene_id not in self.genes: new_feat = copy.copy(out_feat) new_feat['id'] = gene_id new_feat['warnings'] = [warnings['spoofed_gene']] self.orphan_types['gene'] += 1 self.defects['spoofed_genes'] += 1 self.process_gene(new_feat['id'], new_feat) out_feat['id'] = "_".join( (gene_id, "CDS", str(len(self.genes[gene_id]['cdss']) + 1))) self.genes[gene_id]['cdss'].append(out_feat['id']) out_feat['parent_gene'] = gene_id else: self.orphan_types['cds'] += 1 out_feat['id'] = f"CDS_{self.orphan_types['cds']}" cds_warnings.append( f"Unable to find parent gene for {out_feat['id']}") # there is a 1 to 1 relationship of mRNA to CDS so XXX_mRNA_1 will match XXX_CDS_1 mrna_id = out_feat["id"].replace('CDS', 'mRNA') if mrna_id in self.mrnas: if not is_parent(self.mrnas[mrna_id], out_feat): cds_warnings.append(warnings['cds_mrna_cds'].format(mrna_id)) self.mrnas[mrna_id]['warnings'] = self.mrnas[mrna_id].get( 'warnings', []) + [warnings['cds_mrna_mrna']] self.defects['bad_parent_loc'] += 1 else: out_feat['parent_mrna'] = mrna_id self.mrnas[mrna_id]['cds'] = out_feat['id'] # process protein prot_seq = in_feature.qualifiers.get("translation", [""])[0] # allow a little slack to account for frameshift and stop codon if prot_seq and abs(len(prot_seq) * 3 - len(feat_seq)) > 4: cds_warnings.append(warnings["inconsistent_CDS_length"].format( len(feat_seq), len(prot_seq))) self.genome_warnings.append( warnings['genome_inc_CDS_length'].format( out_feat['id'], len(feat_seq), len(prot_seq))) self.genome_suspect = 1 try: if prot_seq and prot_seq != Seq.translate( feat_seq, self.code_table, cds=True).strip("*"): cds_warnings.append(warnings["inconsistent_translation"]) self.defects['cds_seq_not_matching'] += 1 except TranslationError as e: cds_warnings.append("Unable to verify protein sequence:" + str(e)) if not prot_seq: try: prot_seq = Seq.translate(feat_seq, self.code_table, cds=True).strip("*") cds_warnings.append(warnings["no_translation_supplied"]) except TranslationError as e: cds_warnings.append(warnings["no_translation_supplied"] + str(e)) out_feat.update({ "protein_translation": prot_seq, "protein_md5": hashlib.md5(prot_seq.encode('utf8')).hexdigest(), "protein_translation_length": len(prot_seq), }) if out_feat.get('parent_gene'): propagate_cds_props_to_gene(out_feat, self.genes[out_feat['parent_gene']]) if cds_warnings: out_feat['warnings'] = cds_warnings self.cdss[out_feat['id']] = out_feat
def save_one_genome(self, ctx, params): """ :param params: instance of type "SaveOneGenomeParams" -> structure: parameter "workspace" of String, parameter "name" of String, parameter "data" of type "Genome" (Genome object holds much of the data relevant for a genome in KBase Genome publications should be papers about the genome, not papers about certain features of the genome (which go into the Feature object) Should the Genome object have a list of feature ids? (in addition to having a list of feature_refs) Should the Genome object contain a list of contig_ids too? @optional assembly_ref quality close_genomes analysis_events features source_id source contigs contig_ids publications md5 taxonomy gc_content complete dna_size num_contigs contig_lengths contigset_ref @metadata ws gc_content as GC content @metadata ws taxonomy as Taxonomy @metadata ws md5 as MD5 @metadata ws dna_size as Size @metadata ws genetic_code as Genetic code @metadata ws domain as Domain @metadata ws source_id as Source ID @metadata ws source as Source @metadata ws scientific_name as Name @metadata ws length(close_genomes) as Close genomes @metadata ws length(features) as Number features @metadata ws num_contigs as Number contigs) -> structure: parameter "id" of type "Genome_id" (KBase genome ID @id kb), parameter "scientific_name" of String, parameter "domain" of String, parameter "genetic_code" of Long, parameter "dna_size" of Long, parameter "num_contigs" of Long, parameter "contigs" of list of type "Contig" (Type spec for a "Contig" subobject in the "ContigSet" object Contig_id id - ID of contig in contigset string md5 - unique hash of contig sequence string sequence - sequence of the contig string description - Description of the contig (e.g. everything after the ID in a FASTA file) @optional length md5 genetic_code cell_compartment replicon_geometry replicon_type name description complete) -> structure: parameter "id" of type "Contig_id" (ContigSet contig ID @id external), parameter "length" of Long, parameter "md5" of String, parameter "sequence" of String, parameter "genetic_code" of Long, parameter "cell_compartment" of String, parameter "replicon_type" of String, parameter "replicon_geometry" of String, parameter "name" of String, parameter "description" of String, parameter "complete" of type "Bool", parameter "contig_lengths" of list of Long, parameter "contig_ids" of list of type "Contig_id" (ContigSet contig ID @id external), parameter "source" of String, parameter "source_id" of type "source_id" (Reference to a source_id @id external), parameter "md5" of String, parameter "taxonomy" of String, parameter "gc_content" of Double, parameter "complete" of Long, parameter "publications" of list of type "publication" (Structure for a publication (from ER API) also want to capture authors, journal name (not in ER)) -> tuple of size 7: parameter "id" of Long, parameter "source_db" of String, parameter "article_title" of String, parameter "link" of String, parameter "pubdate" of String, parameter "authors" of String, parameter "journal_name" of String, parameter "features" of list of type "Feature" (Structure for a single feature of a genome Should genome_id contain the genome_id in the Genome object, the workspace id of the Genome object, a genomeref, something else? Should sequence be in separate objects too? We may want to add additional fields for other CDM functions (e.g., atomic regulons, coexpressed fids, co_occurring fids,...) @optional orthologs quality feature_creation_event md5 location function ontology_terms protein_translation protein_families subsystems publications subsystem_data aliases annotations regulon_data atomic_regulons coexpressed_fids co_occurring_fids dna_sequence protein_translation_length dna_sequence_length) -> structure: parameter "id" of type "Feature_id" (KBase Feature ID @id external), parameter "location" of list of tuple of size 4: type "Contig_id" (ContigSet contig ID @id external), Long, String, Long, parameter "type" of String, parameter "function" of String, parameter "ontology_terms" of mapping from String to mapping from String to type "OntologyData" -> structure: parameter "id" of String, parameter "ontology_ref" of String, parameter "term_lineage" of list of String, parameter "term_name" of String, parameter "evidence" of list of type "OntologyEvidence" (@optional translation_provenance alignment_evidence) -> structure: parameter "method" of String, parameter "method_version" of String, parameter "timestamp" of String, parameter "translation_provenance" of tuple of size 3: parameter "ontologytranslation_ref" of String, parameter "namespace" of String, parameter "source_term" of String, parameter "alignment_evidence" of list of tuple of size 4: parameter "start" of Long, parameter "stop" of Long, parameter "align_length" of Long, parameter "identify" of Double, parameter "md5" of String, parameter "protein_translation" of String, parameter "dna_sequence" of String, parameter "protein_translation_length" of Long, parameter "dna_sequence_length" of Long, parameter "publications" of list of type "publication" (Structure for a publication (from ER API) also want to capture authors, journal name (not in ER)) -> tuple of size 7: parameter "id" of Long, parameter "source_db" of String, parameter "article_title" of String, parameter "link" of String, parameter "pubdate" of String, parameter "authors" of String, parameter "journal_name" of String, parameter "subsystems" of list of String, parameter "protein_families" of list of type "ProteinFamily" (Structure for a protein family @optional query_begin query_end subject_begin subject_end score evalue subject_description release_version) -> structure: parameter "id" of String, parameter "subject_db" of String, parameter "release_version" of String, parameter "subject_description" of String, parameter "query_begin" of Long, parameter "query_end" of Long, parameter "subject_begin" of Long, parameter "subject_end" of Long, parameter "score" of Double, parameter "evalue" of Double, parameter "aliases" of list of String, parameter "orthologs" of list of tuple of size 2: String, Double, parameter "annotations" of list of type "annotation" (a notation by a curator of the genome object) -> tuple of size 3: parameter "comment" of String, parameter "annotator" of String, parameter "annotation_time" of Double, parameter "subsystem_data" of list of type "subsystem_data" (Structure for subsystem data (from CDMI API)) -> tuple of size 3: parameter "subsystem" of String, parameter "variant" of String, parameter "role" of String, parameter "regulon_data" of list of type "regulon_data" (Structure for regulon data (from CDMI API)) -> tuple of size 3: parameter "regulon_id" of String, parameter "regulon_set" of list of type "Feature_id" (KBase Feature ID @id external), parameter "tfs" of list of type "Feature_id" (KBase Feature ID @id external), parameter "atomic_regulons" of list of type "atomic_regulon" (Structure for an atomic regulon (from CDMI API)) -> tuple of size 2: parameter "atomic_regulon_id" of String, parameter "atomic_regulon_size" of Long, parameter "coexpressed_fids" of list of type "coexpressed_fid" (Structure for coexpressed fids (from CDMI API)) -> tuple of size 2: parameter "scored_fid" of type "Feature_id" (KBase Feature ID @id external), parameter "score" of Double, parameter "co_occurring_fids" of list of type "co_occurring_fid" (Structure for co-occurring fids (from CDMI API)) -> tuple of size 2: parameter "scored_fid" of type "Feature_id" (KBase Feature ID @id external), parameter "score" of Double, parameter "quality" of type "Feature_quality_measure" (@optional weighted_hit_count hit_count existence_priority overlap_rules pyrrolysylprotein truncated_begin truncated_end existence_confidence frameshifted selenoprotein) -> structure: parameter "truncated_begin" of type "Bool", parameter "truncated_end" of type "Bool", parameter "existence_confidence" of Double, parameter "frameshifted" of type "Bool", parameter "selenoprotein" of type "Bool", parameter "pyrrolysylprotein" of type "Bool", parameter "overlap_rules" of list of String, parameter "existence_priority" of Double, parameter "hit_count" of Double, parameter "weighted_hit_count" of Double, parameter "feature_creation_event" of type "Analysis_event" (@optional tool_name execution_time parameters hostname) -> structure: parameter "id" of type "Analysis_event_id", parameter "tool_name" of String, parameter "execution_time" of Double, parameter "parameters" of list of String, parameter "hostname" of String, parameter "contigset_ref" of type "ContigSet_ref" (Reference to a ContigSet object containing the contigs for this genome in the workspace @id ws KBaseGenomes.ContigSet), parameter "assembly_ref" of type "Assembly_ref" (Reference to an Assembly object in the workspace @id ws KBaseGenomeAnnotations.Assembly), parameter "quality" of type "Genome_quality_measure" (@optional frameshift_error_rate sequence_error_rate) -> structure: parameter "frameshift_error_rate" of Double, parameter "sequence_error_rate" of Double, parameter "close_genomes" of list of type "Close_genome" (@optional genome closeness_measure) -> structure: parameter "genome" of type "Genome_id" (KBase genome ID @id kb), parameter "closeness_measure" of Double, parameter "analysis_events" of list of type "Analysis_event" (@optional tool_name execution_time parameters hostname) -> structure: parameter "id" of type "Analysis_event_id", parameter "tool_name" of String, parameter "execution_time" of Double, parameter "parameters" of list of String, parameter "hostname" of String, parameter "hidden" of type "boolean" (A boolean - 0 for false, 1 for true. @range (0, 1)) :returns: instance of type "SaveGenomeResult" -> structure: parameter "info" of type "object_info" (Information about an object, including user provided metadata. obj_id objid - the numerical id of the object. obj_name name - the name of the object. type_string type - the type of the object. timestamp save_date - the save date of the object. obj_ver ver - the version of the object. username saved_by - the user that saved or copied the object. ws_id wsid - the workspace containing the object. ws_name workspace - the workspace containing the object. string chsum - the md5 checksum of the object. int size - the size of the object in bytes. usermeta meta - arbitrary user-supplied metadata about the object.) -> tuple of size 11: parameter "objid" of type "obj_id" (The unique, permanent numerical ID of an object.), parameter "name" of type "obj_name" (A string used as a name for an object. Any string consisting of alphanumeric characters and the characters |._- that is not an integer is acceptable.), parameter "type" of type "type_string" (A type string. Specifies the type and its version in a single string in the format [module].[typename]-[major].[minor]: module - a string. The module name of the typespec containing the type. typename - a string. The name of the type as assigned by the typedef statement. major - an integer. The major version of the type. A change in the major version implies the type has changed in a non-backwards compatible way. minor - an integer. The minor version of the type. A change in the minor version implies that the type has changed in a way that is backwards compatible with previous type definitions. In many cases, the major and minor versions are optional, and if not provided the most recent version will be used. Example: MyModule.MyType-3.1), parameter "save_date" of type "timestamp" (A time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z (representing the UTC timezone) or the difference in time to UTC in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC time)), parameter "version" of Long, parameter "saved_by" of type "username" (Login name of a KBase user account.), parameter "wsid" of type "ws_id" (The unique, permanent numerical ID of a workspace.), parameter "workspace" of type "ws_name" (A string used as a name for a workspace. Any string consisting of alphanumeric characters and "_", ".", or "-" that is not an integer is acceptable. The name may optionally be prefixed with the workspace owner's user name and a colon, e.g. kbasetest:my_workspace.), parameter "chsum" of String, parameter "size" of Long, parameter "meta" of type "usermeta" (User provided metadata about an object. Arbitrary key-value pairs provided by the user.) -> mapping from String to String """ # ctx is the context object # return variables are: returnVal #BEGIN save_one_genome genome_interface = GenomeInterface(self.cfg) returnVal = genome_interface.save_one_genome(params) #END save_one_genome # At some point might do deeper type checking... if not isinstance(returnVal, dict): raise ValueError('Method save_one_genome return value ' + 'returnVal is not type dict as required.') # return the results return [returnVal]
def save_one_genome(self, ctx, params): """ :param params: instance of type "SaveOneGenomeParams" -> structure: parameter "workspace" of String, parameter "name" of String, parameter "data" of type "Genome" (Genome object holds much of the data relevant for a genome in KBase Genome publications should be papers about the genome Should the Genome object contain a list of contig_ids too? Source: allowed entries RefSeq, Ensembl, Phytozome, RAST, Prokka, User_upload #allowed entries RefSeq, Ensembl, Phytozome, RAST, Prokka, User_upload controlled vocabulary managed by API Domain is a controlled vocabulary Warnings : mostly controlled vocab but also allow for unstructured Genome_tiers : controlled vocabulary (based on ap input and API checked) Allowed values: #Representative, Reference, ExternalDB, User Examples Tiers: All phytozome - Representative and ExternalDB Phytozome flagship genomes - Reference, Representative and ExternalDB Ensembl - Representative and ExternalDB RefSeq Reference - Reference, Representative and ExternalDB RefSeq Representative - Representative and ExternalDB RefSeq Latest or All Assemblies folder - ExternalDB User Data - User tagged Example Sources: RefSeq, Ensembl, Phytozome, Microcosm, User, RAST, Prokka, (other annotators) @optional warnings contig_lengths contig_ids source_id taxonomy publications @optional ontology_events ontologies_present non_coding_features mrnas @optional genbank_handle_ref gff_handle_ref external_source_origination_date @optional release original_source_file_name notes quality_scores suspect assembly_ref @metadata ws gc_content as GC content @metadata ws taxonomy as Taxonomy @metadata ws md5 as MD5 @metadata ws dna_size as Size @metadata ws genetic_code as Genetic code @metadata ws domain as Domain @metadata ws source_id as Source ID @metadata ws source as Source @metadata ws scientific_name as Name @metadata ws length(features) as Number of Protein Encoding Genes @metadata ws length(cdss) as Number of CDS @metadata ws assembly_ref as Assembly Object @metadata ws num_contigs as Number contigs @metadata ws length(warnings) as Number of Genome Level Warnings @metadata ws suspect as Suspect Genome) -> structure: parameter "id" of type "Genome_id" (KBase genome ID @id kb), parameter "scientific_name" of String, parameter "domain" of String, parameter "warnings" of list of String, parameter "genome_tiers" of list of String, parameter "feature_counts" of mapping from String to Long, parameter "genetic_code" of Long, parameter "dna_size" of Long, parameter "num_contigs" of Long, parameter "molecule_type" of String, parameter "contig_lengths" of list of Long, parameter "contig_ids" of list of String, parameter "source" of String, parameter "source_id" of type "source_id" (Reference to a source_id @id external), parameter "md5" of String, parameter "taxonomy" of String, parameter "gc_content" of Double, parameter "publications" of list of type "publication" (Structure for a publication (float pubmedid string source (ex. Pubmed) string title string web address string publication year string authors string journal)) -> tuple of size 7: parameter "pubmedid" of Double, parameter "source" of String, parameter "title" of String, parameter "url" of String, parameter "year" of String, parameter "authors" of String, parameter "journal" of String, parameter "ontology_events" of list of type "Ontology_event" (@optional ontology_ref method_version eco) -> structure: parameter "id" of String, parameter "ontology_ref" of type "Ontology_ref" (Reference to a ontology object @id ws KBaseOntology.OntologyDictionary), parameter "method" of String, parameter "method_version" of String, parameter "timestamp" of String, parameter "eco" of String, parameter "ontologies_present" of mapping from String to mapping from String to String, parameter "features" of list of type "Feature" (Structure for a single CDS encoding ???gene??? of a genome ONLY PUT GENES THAT HAVE A CORRESPONDING CDS IN THIS ARRAY NOTE: Sequence is optional. Ideally we can keep it in here, but Recognize due to space constraints another solution may be needed. We may want to add additional fields for other CDM functions (e.g., atomic regulons, coexpressed fids, co_occurring fids,...) protein_translation_length and protein_translation are for longest coded protein (representative protein for splice variants) NOTE: New Aliases field definitely breaks compatibility. As Does Function. flags are flag fields in GenBank format. This will be a controlled vocabulary. Initially Acceptable values are pseudo, ribosomal_slippage, and trans_splicing Md5 is the md5 of dna_sequence. @optional functions ontology_terms note protein_translation mrnas flags warnings @optional inference_data dna_sequence aliases db_xrefs children functional_descriptions) -> structure: parameter "id" of type "Feature_id" (KBase Feature ID @id external), parameter "location" of list of tuple of size 4: type "Contig_id" (ContigSet contig ID @id external), Long, String, Long, parameter "functions" of list of String, parameter "functional_descriptions" of list of String, parameter "ontology_terms" of mapping from String to mapping from String to list of Long, parameter "note" of String, parameter "md5" of String, parameter "protein_translation" of String, parameter "protein_translation_length" of Long, parameter "cdss" of list of String, parameter "mrnas" of list of String, parameter "children" of list of String, parameter "flags" of list of String, parameter "warnings" of list of String, parameter "inference_data" of list of type "InferenceInfo" (category;#Maybe a controlled vocabulary type;#Maybe a controlled vocabulary) -> structure: parameter "category" of String, parameter "type" of String, parameter "evidence" of String, parameter "dna_sequence" of String, parameter "dna_sequence_length" of Long, parameter "aliases" of list of tuple of size 2: parameter "fieldname" of String, parameter "alias" of String, parameter "db_xrefs" of list of tuple of size 2: parameter "db_source" of String, parameter "db_identifier" of String, parameter "non_coding_features" of list of type "NonCodingFeature" (Structure for a single feature that is NOT one of the following: Protein encoding gene (gene that has a corresponding CDS) mRNA CDS Note pseudo-genes and Non protein encoding genes are put into this flags are flag fields in GenBank format. This will be a controlled vocabulary. Initially Acceptable values are pseudo, ribosomal_slippage, and trans_splicing Md5 is the md5 of dna_sequence. @optional functions ontology_terms note flags warnings functional_descriptions @optional inference_data dna_sequence aliases db_xrefs children parent_gene) -> structure: parameter "id" of type "Feature_id" (KBase Feature ID @id external), parameter "location" of list of tuple of size 4: type "Contig_id" (ContigSet contig ID @id external), Long, String, Long, parameter "type" of String, parameter "functions" of list of String, parameter "functional_descriptions" of list of String, parameter "ontology_terms" of mapping from String to mapping from String to list of Long, parameter "note" of String, parameter "md5" of String, parameter "parent_gene" of String, parameter "children" of list of String, parameter "flags" of list of String, parameter "warnings" of list of String, parameter "inference_data" of list of type "InferenceInfo" (category;#Maybe a controlled vocabulary type;#Maybe a controlled vocabulary) -> structure: parameter "category" of String, parameter "type" of String, parameter "evidence" of String, parameter "dna_sequence" of String, parameter "dna_sequence_length" of Long, parameter "aliases" of list of tuple of size 2: parameter "fieldname" of String, parameter "alias" of String, parameter "db_xrefs" of list of tuple of size 2: parameter "db_source" of String, parameter "db_identifier" of String, parameter "cdss" of list of type "CDS" (Structure for a single feature CDS flags are flag fields in GenBank format. This will be a controlled vocabulary. Initially Acceptable values are pseudo, ribosomal_slippage, and trans_splicing Md5 is the md5 of dna_sequence. @optional parent_gene parent_mrna functions ontology_terms note flags warnings @optional inference_data dna_sequence aliases db_xrefs functional_descriptions) -> structure: parameter "id" of type "cds_id" (KBase CDS ID @id external), parameter "location" of list of tuple of size 4: type "Contig_id" (ContigSet contig ID @id external), Long, String, Long, parameter "md5" of String, parameter "protein_md5" of String, parameter "parent_gene" of type "Feature_id" (KBase Feature ID @id external), parameter "parent_mrna" of type "mrna_id" (KBase mRNA ID @id external), parameter "note" of String, parameter "functions" of list of String, parameter "functional_descriptions" of list of String, parameter "ontology_terms" of mapping from String to mapping from String to list of Long, parameter "flags" of list of String, parameter "warnings" of list of String, parameter "inference_data" of list of type "InferenceInfo" (category;#Maybe a controlled vocabulary type;#Maybe a controlled vocabulary) -> structure: parameter "category" of String, parameter "type" of String, parameter "evidence" of String, parameter "protein_translation" of String, parameter "protein_translation_length" of Long, parameter "aliases" of list of tuple of size 2: parameter "fieldname" of String, parameter "alias" of String, parameter "db_xrefs" of list of tuple of size 2: parameter "db_source" of String, parameter "db_identifier" of String, parameter "dna_sequence" of String, parameter "dna_sequence_length" of Long, parameter "mrnas" of list of type "mRNA" (Structure for a single feature mRNA flags are flag fields in GenBank format. This will be a controlled vocabulary. Initially Acceptable values are pseudo, ribosomal_slippage, and trans_splicing Md5 is the md5 of dna_sequence. @optional parent_gene cds functions ontology_terms note flags warnings @optional inference_data dna_sequence aliases db_xrefs functional_descriptions) -> structure: parameter "id" of type "mrna_id" (KBase mRNA ID @id external), parameter "location" of list of tuple of size 4: type "Contig_id" (ContigSet contig ID @id external), Long, String, Long, parameter "md5" of String, parameter "parent_gene" of type "Feature_id" (KBase Feature ID @id external), parameter "cds" of type "cds_id" (KBase CDS ID @id external), parameter "dna_sequence" of String, parameter "dna_sequence_length" of Long, parameter "note" of String, parameter "functions" of list of String, parameter "functional_descriptions" of list of String, parameter "ontology_terms" of mapping from String to mapping from String to list of Long, parameter "flags" of list of String, parameter "warnings" of list of String, parameter "inference_data" of list of type "InferenceInfo" (category;#Maybe a controlled vocabulary type;#Maybe a controlled vocabulary) -> structure: parameter "category" of String, parameter "type" of String, parameter "evidence" of String, parameter "aliases" of list of tuple of size 2: parameter "fieldname" of String, parameter "alias" of String, parameter "db_xrefs" of list of tuple of size 2: parameter "db_source" of String, parameter "db_identifier" of String, parameter "assembly_ref" of type "Assembly_ref" (Reference to an Assembly object in the workspace @id ws KBaseGenomeAnnotations.Assembly), parameter "taxon_ref" of type "Taxon_ref" (Reference to a taxon object @id ws KBaseGenomeAnnotations.Taxon), parameter "genbank_handle_ref" of type "genbank_handle_ref" (Reference to a handle to the Genbank file on shock @id handle), parameter "gff_handle_ref" of type "gff_handle_ref" (Reference to a handle to the GFF file on shock @id handle), parameter "external_source_origination_date" of String, parameter "release" of String, parameter "original_source_file_name" of String, parameter "notes" of String, parameter "quality_scores" of list of type "GenomeQualityScore" (Score_interpretation : fraction_complete - controlled vocabulary managed by API @optional method_report_ref method_version) -> structure: parameter "method" of String, parameter "method_report_ref" of type "Method_report_ref" (Reference to a report object @id ws KBaseReport.Report), parameter "method_version" of String, parameter "score" of String, parameter "score_interpretation" of String, parameter "timestamp" of String, parameter "suspect" of type "Bool", parameter "hidden" of type "boolean" (A boolean - 0 for false, 1 for true. @range (0, 1)), parameter "upgrade" of type "boolean" (A boolean - 0 for false, 1 for true. @range (0, 1)) :returns: instance of type "SaveGenomeResult" -> structure: parameter "info" of type "object_info" (Information about an object, including user provided metadata. obj_id objid - the numerical id of the object. obj_name name - the name of the object. type_string type - the type of the object. timestamp save_date - the save date of the object. obj_ver ver - the version of the object. username saved_by - the user that saved or copied the object. ws_id wsid - the workspace containing the object. ws_name workspace - the workspace containing the object. string chsum - the md5 checksum of the object. int size - the size of the object in bytes. usermeta meta - arbitrary user-supplied metadata about the object.) -> tuple of size 11: parameter "objid" of type "obj_id" (The unique, permanent numerical ID of an object.), parameter "name" of type "obj_name" (A string used as a name for an object. Any string consisting of alphanumeric characters and the characters |._- that is not an integer is acceptable.), parameter "type" of type "type_string" (A type string. Specifies the type and its version in a single string in the format [module].[typename]-[major].[minor]: module - a string. The module name of the typespec containing the type. typename - a string. The name of the type as assigned by the typedef statement. major - an integer. The major version of the type. A change in the major version implies the type has changed in a non-backwards compatible way. minor - an integer. The minor version of the type. A change in the minor version implies that the type has changed in a way that is backwards compatible with previous type definitions. In many cases, the major and minor versions are optional, and if not provided the most recent version will be used. Example: MyModule.MyType-3.1), parameter "save_date" of type "timestamp" (A time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z (representing the UTC timezone) or the difference in time to UTC in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC time)), parameter "version" of Long, parameter "saved_by" of type "username" (Login name of a KBase user account.), parameter "wsid" of type "ws_id" (The unique, permanent numerical ID of a workspace.), parameter "workspace" of type "ws_name" (A string used as a name for a workspace. Any string consisting of alphanumeric characters and "_", ".", or "-" that is not an integer is acceptable. The name may optionally be prefixed with the workspace owner's user name and a colon, e.g. kbasetest:my_workspace.), parameter "chsum" of String, parameter "size" of Long, parameter "meta" of type "usermeta" (User provided metadata about an object. Arbitrary key-value pairs provided by the user.) -> mapping from String to String """ # ctx is the context object # return variables are: returnVal #BEGIN save_one_genome genome_interface = GenomeInterface(self.cfg) returnVal = genome_interface.save_one_genome(params) #END save_one_genome # At some point might do deeper type checking... if not isinstance(returnVal, dict): raise ValueError('Method save_one_genome return value ' + 'returnVal is not type dict as required.') # return the results return [returnVal]
class FastaGFFToGenome: def __init__(self, config): self.cfg = config self.au = AssemblyUtil(config.callbackURL) self.dfu = DataFileUtil(self.cfg.callbackURL) self.gi = GenomeInterface(self.cfg) self.taxon_wsname = self.cfg.raw['taxon-workspace-name'] self.time_string = str( datetime.datetime.fromtimestamp( time.time()).strftime('%Y_%m_%d_%H_%M_%S')) yml_text = open('/kb/module/kbase.yml').read() self.version = re.search("module-version:\n\W+(.+)\n", yml_text).group(1) self.ont_mappings = load_ontology_mappings('/kb/module/data') self.code_table = 11 self.skip_types = ('exon', 'five_prime_UTR', 'three_prime_UTR', 'start_codon', 'stop_codon', 'region', 'chromosome', 'scaffold') self.spoof_gene_count = 0 self.is_phytozome = False self.strict = True self.generate_genes = False self.warnings = [] self.feature_dict = collections.OrderedDict() self.cdss = set() self.ontologies_present = collections.defaultdict(dict) self.ontology_events = list() self.skiped_features = collections.Counter() self.feature_counts = collections.Counter() def warn(self, message): self.warnings.append(message) def generate_genome_json(self, params): # 1) validate parameters self._validate_import_file_params(params) self.code_table = params.get('genetic_code', 11) # 2) construct the input directory staging area input_directory = os.path.join(self.cfg.sharedFolder, 'fast_gff_upload_' + str(uuid.uuid4())) os.makedirs(input_directory) file_paths = self._stage_input(params, input_directory) # 3) extract out the parameters params = self._set_parsed_params(params) if params.get('generate_missing_genes'): self.generate_genes = True # 4) do the upload genome = self._gen_genome_json( input_fasta_file=file_paths["fasta_file"], input_gff_file=file_paths["gff_file"], workspace_name=params['workspace_name'], core_genome_name=params['genome_name'], scientific_name=params['scientific_name'], source=params['source'], source_id=params['source_id'], release=params['release'], ) if params.get('genetic_code'): genome["genetic_code"] = params['genetic_code'] return genome, input_directory def import_file(self, params): genome, input_directory = self.generate_genome_json(params) json.dump(genome, open( "{}/{}.json".format(self.cfg.sharedFolder, genome['id']), 'w'), indent=4) result = self.gi.save_one_genome({ 'workspace': params['workspace_name'], 'name': params['genome_name'], 'data': genome, "meta": params.get('metadata', {}), }) report_string = 'A genome with {} contigs and the following feature ' \ 'types was imported: {}'\ .format(len(genome['contig_ids']), "\n".join( [k + ": " + str(v) for k, v in genome['feature_counts'].items()])) log(report_string) # 5) clear the temp directory shutil.rmtree(input_directory) # 6) return the result info = result['info'] details = { 'genome_ref': str(info[6]) + '/' + str(info[0]) + '/' + str(info[4]), 'genome_info': info } return details def _gen_genome_json(self, input_gff_file=None, input_fasta_file=None, workspace_name=None, core_genome_name=None, scientific_name="unknown_taxon", source=None, source_id=None, release=None): # reading in GFF file features_by_contig = self._retrieve_gff_file(input_gff_file) contig_ids = set() # parse feature information fasta_contigs = Bio.SeqIO.parse(input_fasta_file, "fasta") for contig in fasta_contigs: molecule_type = str(contig.seq.alphabet).replace( 'IUPACAmbiguous', '').strip('()') contig_ids.add(contig.id) for feature in features_by_contig.get(contig.id, []): self._transform_feature(contig, feature) for cid in set(features_by_contig.keys()) - contig_ids: self.warn("Sequence name {} does not match a sequence id in the " "FASTA file. {} features will not be imported.".format( cid, len(features_by_contig[cid]))) if self.strict: raise ValueError( "Every feature sequence id must match a fasta sequence id") self._process_cdss() # save assembly file assembly_ref = self.au.save_assembly_from_fasta({ 'file': { 'path': input_fasta_file }, 'workspace_name': workspace_name, 'assembly_name': core_genome_name + ".assembly" }) assembly_data = self.dfu.get_objects({ 'object_refs': [assembly_ref], 'ignore_errors': 0 })['data'][0]['data'] # generate genome info genome = self._gen_genome_info(core_genome_name, scientific_name, assembly_ref, source, source_id, assembly_data, input_gff_file, molecule_type) genome['release'] = release if self.spoof_gene_count > 0: genome['warnings'] = genome.get('warnings', []) + \ [warnings['spoofed_genome'].format(self.spoof_gene_count)] genome['suspect'] = 1 return genome @staticmethod def _location(in_feature): in_feature['strand'] = in_feature['strand'].replace( "-1", "-").translate(strand_table) if in_feature['strand'] == '+': start = in_feature['start'] elif in_feature['strand'] == '-': start = in_feature['end'] else: raise ValueError('Invalid feature strand: {}'.format( in_feature['strand'])) return [ in_feature['contig'], start, in_feature['strand'], in_feature['end'] - in_feature['start'] + 1 ] @staticmethod def _validate_import_file_params(params): """ validate_import_file_params: validates params passed to FastaGFFToGenome.import_file method """ # check for required parameters for p in ['workspace_name', 'genome_name', 'fasta_file', 'gff_file']: if p not in params: raise ValueError( '"{}" parameter is required, but missing'.format(p)) # one and only one of 'path', or 'shock_id' is required for key in ('fasta_file', 'gff_file'): file = params[key] if not isinstance(file, dict): raise ValueError( 'Required "{}" field must be a map/dict'.format(key)) n_valid_fields = 0 if 'path' in file and file['path'] is not None: n_valid_fields += 1 if 'shock_id' in file and file['shock_id'] is not None: n_valid_fields += 1 if 'ftp_url' in file and file['ftp_url'] is not None: n_valid_fields += 1 raise ValueError( 'FTP link is currently not supported for FastaGFFToGenome') if n_valid_fields < 1: error_msg = 'Required "{}" field must include one source: '.format( key) error_msg += 'path | shock_id' raise ValueError(error_msg) if n_valid_fields > 1: error_msg = 'Required "{}" field has too many sources specified: '.format( key) error_msg += str(list(file.keys())) raise ValueError(error_msg) if params.get('genetic_code'): if not (isinstance(params['genetic_code'], int) and 0 < params['genetic_code'] < 32): raise ValueError( "Invalid genetic code specified: {}".format(params)) def _set_parsed_params(self, params): log('Setting params') default_params = { 'taxon_wsname': self.cfg.raw['taxon-workspace-name'], 'scientific_name': 'unknown_taxon', 'taxon_reference': None, 'source': 'User', 'release': None, 'metadata': {}, 'source_id': 'unknown', } default_params.update(params) log(json.dumps(default_params, indent=1)) return default_params def _stage_input(self, params, input_directory): """ stage_input: Setup the input_directory by fetching the files and uncompressing if needed """ file_paths = dict() for key in ('fasta_file', 'gff_file'): file = params[key] file_path = None if 'path' in file and file['path'] is not None: local_file_path = file['path'] file_path = os.path.join(input_directory, os.path.basename(local_file_path)) log('Moving file from {} to {}'.format(local_file_path, file_path)) shutil.copy2(local_file_path, file_path) if 'shock_id' in file and file['shock_id'] is not None: # handle shock file log('Downloading file from SHOCK node: {}-{}'.format( self.cfg.sharedFolder, file['shock_id'])) sys.stdout.flush() file_name = self.dfu.shock_to_file({ 'file_path': input_directory, 'shock_id': file['shock_id'] })['node_file_name'] file_path = os.path.join(input_directory, file_name) # extract the file if it is compressed if file_path is not None: log("staged input file =" + file_path) sys.stdout.flush() dfUtil_result = self.dfu.unpack_file({'file_path': file_path}) file_paths[key] = dfUtil_result['file_path'] else: raise ValueError( 'No valid files could be extracted based on the input') return file_paths def _retrieve_gff_file(self, input_gff_file): """ _retrieve_gff_file: retrieve info from gff_file """ log("Reading GFF file") feature_list = collections.defaultdict(list) is_patric = 0 gff_file_handle = open(input_gff_file) current_line = gff_file_handle.readline() line_count = 0 while (current_line != ''): current_line = current_line.strip() if (current_line.isspace() or current_line == "" or current_line.startswith("#")): pass else: #Split line (contig_id, source_id, feature_type, start, end, score, strand, phase, attributes) = current_line.split('\t') #Checking to see if Phytozome if "phytozome" in source_id.lower(): self.is_phytozome = True #Checking to see if Phytozome if "PATRIC" in source_id: is_patric = True #PATRIC prepends their contig ids with some gibberish if is_patric and "|" in contig_id: contig_id = contig_id.split("|", 1)[1] #Populating basic feature object ftr = { 'contig': contig_id, 'source': source_id, 'type': feature_type, 'start': int(start), 'end': int(end), 'score': score, 'strand': strand, 'phase': phase, 'attributes': collections.defaultdict(list) } #Populating with attribute key-value pair #This is where the feature id is from for attribute in attributes.split(";"): attribute = attribute.strip() #Sometimes empty string if not attribute: continue #Use of 1 to limit split as '=' character can also be made available later #Sometimes lack of "=", assume spaces instead if ("=" in attribute): key, value = attribute.split("=", 1) ftr['attributes'][key.lower()].append( parse.unquote(value.strip('"'))) elif (" " in attribute): key, value = attribute.split(" ", 1) ftr['attributes'][key.lower()].append( parse.unquote(value.strip('"'))) else: pass #log("Warning: attribute "+attribute+" cannot be separated into key,value pair") ftr['attributes']['raw'] = attributes if "id" in ftr['attributes']: ftr['ID'] = ftr['attributes']['id'][0] if "parent" in ftr['attributes']: ftr['Parent'] = ftr['attributes']['parent'][0] feature_list[contig_id].append(ftr) current_line = gff_file_handle.readline() gff_file_handle.close() #Some GFF/GTF files don't use "ID" so we go through the possibilities feature_list = self._add_missing_identifiers(feature_list) #Most bacterial files have only CDSs #In order to work with prokaryotic and eukaryotic gene structure synonymously #Here we add feature dictionaries representing the parent gene and mRNAs #feature_list = self._add_missing_parents(feature_list) #Phytozome has the annoying habit of editing their identifiers so we fix them if self.is_phytozome: self._update_phytozome_features(feature_list) #All identifiers need to be checked so that they follow the same general rules #Rules are listed within the function itself feature_list = self._update_identifiers(feature_list) return feature_list def _add_missing_identifiers(self, feature_list): log("Adding missing identifiers") #General rule is to iterate through a range of possibilities if "ID" is missing for contig in feature_list: for i, feat in enumerate(feature_list[contig]): if "ID" not in feature_list[contig][i]: for key in ("transcriptid", "proteinid", "pacid", "parent", "name", 'transcript_id'): if key in feature_list[contig][i]['attributes']: feature_list[contig][i]['ID'] = feature_list[ contig][i]['attributes'][key][0] break if feat['type'] not in self.skip_types: self.feature_counts[feat['type']] += 1 #If the process fails, throw an error if "ID" not in feature_list[contig][i]: feat['ID'] = "{}_{}".format( feat['type'], self.feature_counts[feat['type']]) #log("Warning: Could find unique ID to utilize in GFF attributes: {}. " # "ID '{}' has been assigned".format(feat['attributes'], feat['ID'])) return feature_list def _add_missing_parents(self, feature_list): #General rules is if CDS or RNA missing parent, add them for contig in feature_list: ftrs = feature_list[contig] new_ftrs = [] for i in range(len(ftrs)): if ftrs[i]["type"] in self.skip_types: continue if ("Parent" not in ftrs[i]): #Assuming parent doesn't exist at all, so create de novo instead of trying to find it if ("RNA" in ftrs[i]["type"] or "CDS" in ftrs[i]["type"]): new_gene_ftr = copy.deepcopy(ftrs[i]) new_gene_ftr["type"] = "gene" ftrs[i]["Parent"] = new_gene_ftr["ID"] new_ftrs.append(new_gene_ftr) if ("CDS" in ftrs[i]["type"]): new_rna_ftr = copy.deepcopy(ftrs[i]) new_rna_ftr["type"] = "mRNA" new_ftrs.append(new_rna_ftr) ftrs[i]["Parent"] = new_rna_ftr["ID"] new_ftrs.append(ftrs[i]) feature_list[contig] = new_ftrs return feature_list @staticmethod def _update_phytozome_features(feature_list): #General rule is to use the "Name" field where possible #And update parent attribute correspondingly for contig in feature_list: feature_position_dict = {} for i in range(len(feature_list[contig])): #Maintain old_id for reference #Sometimes ID isn't available, so use PACid old_id = None for key in ("id", "pacid"): if (key in feature_list[contig][i]['attributes']): old_id = feature_list[contig][i]['attributes'][key][0] break if (old_id is None): #This should be an error #log("Cannot find unique ID, PACid, or pacid in GFF " # "attributes: " + feature_list[contig][i][contig]) continue #Retain old_id feature_position_dict[old_id] = i # Clip off the increment on CDS IDs so fragments of the same # CDS share the same ID if "CDS" in feature_list[contig][i]["ID"]: feature_list[contig][i]["ID"] = feature_list[contig][i][ "ID"].rsplit('.', 1)[0] #In Phytozome, gene and mRNA have "Name" field, CDS do not if ("name" in feature_list[contig][i]['attributes']): feature_list[contig][i]["ID"] = feature_list[contig][i][ 'attributes']['name'][0] if ("Parent" in feature_list[contig][i]): #Update Parent to match new ID of parent ftr feature_list[contig][i]["Parent"] = feature_list[contig][ feature_position_dict[feature_list[contig][i] ["Parent"]]]["ID"] return feature_list def _update_identifiers(self, feature_list): #General rules: #1) Genes keep identifier #2) RNAs keep identifier only if its different from gene, otherwise append ".mRNA" #3) CDS always uses RNA identifier with ".CDS" appended mRNA_parent_dict = dict() for contig in feature_list: for ftr in feature_list[contig]: if ftr["type"] in self.skip_types: continue if ("Parent" in ftr): #Retain old_id of parents old_id = ftr["ID"] if (ftr["ID"] == ftr["Parent"] or "CDS" in ftr["type"]): ftr["ID"] = ftr["Parent"] + "." + ftr["type"] #link old to new ids for mRNA to use with CDS if ("RNA" in ftr["type"]): mRNA_parent_dict[old_id] = ftr["ID"] return feature_list def _check_location_order(self, locations): """If order looks good return None. If out of order return warning If on multiple strands return warning""" strand = None last_start = 0 for location in locations: if strand == None: strand = location[2] elif strand != location[2]: return warnings["both_strand_coordinates"] if strand == "-": locations = reversed(locations) for location in locations: if last_start > location[1]: return warnings["out_of_order"] else: last_start = location[1] return None def _create_ontology_event(self, ontology_type): """Creates the ontology_event if necessary Returns the index of the ontology event back.""" if ontology_type not in self.ont_mappings: raise ValueError( "{} is not a supported ontology".format(ontology_type)) if "event_index" not in self.ont_mappings[ontology_type]: self.ont_mappings[ontology_type]['event_index'] = len( self.ontology_events) if ontology_type == "GO": ontology_ref = "KBaseOntology/gene_ontology" elif ontology_type == "PO": ontology_ref = "KBaseOntology/plant_ontology" else: ontology_ref = f"KBaseOntology/{ontology_type.lower()}_ontology" self.ontology_events.append({ "method": "GenomeFileUtils Genbank uploader from annotations", "method_version": self.version, "timestamp": self.time_string, "id": ontology_type, "ontology_ref": ontology_ref }) return self.ont_mappings[ontology_type]['event_index'] def _get_ontology_db_xrefs(self, feature): """Splits the ontology info from the other db_xrefs""" ontology = collections.defaultdict(dict) db_xrefs = [] # these are keys are formatted strangely and require special parsing for key in ("go_process", "go_function", "go_component"): ontology_event_index = self._create_ontology_event("GO") for term in feature.get(key, []): sp = term.split(" - ") ontology['GO'][sp[0]] = [ontology_event_index] self.ontologies_present['GO'][ sp[0]] = self.ont_mappings['GO'].get(sp[0], '') # CATH terms are not distinct from EC numbers so myst be found by key for term in feature.get('cath_funfam', []) + feature.get('cath', []): for ref in term.split(','): ontology['CATH'][ref] = [self._create_ontology_event("CATH")] self.ontologies_present['CATH'][ref] = self.ont_mappings[ 'CATH'].get(ref, '') search_keys = [ 'ontology_term', 'db_xref', 'dbxref', 'product_source', 'tigrfam', 'pfam', 'cog', 'go', 'po', 'ko' ] ont_terms = [] # flatten out into list of values for key in search_keys: if key in feature: ont_terms += [x for y in feature[key] for x in y.split(',')] for ref in ont_terms: if ref.startswith('GO:'): ontology['GO'][ref] = [self._create_ontology_event("GO")] self.ontologies_present['GO'][ref] = self.ont_mappings[ 'GO'].get(ref, '') elif ref.startswith('PO:'): ontology['PO'][ref] = [self._create_ontology_event("PO")] self.ontologies_present['PO'][ref] = self.ont_mappings[ 'PO'].get(ref, '') elif ref.startswith('KO:'): ontology['KO'][ref] = [self._create_ontology_event("KO")] self.ontologies_present['KO'][ref] = self.ont_mappings[ 'KO'].get(ref, '') elif ref.startswith('COG'): ontology['COG'][ref] = [self._create_ontology_event("COG")] self.ontologies_present['COG'][ref] = self.ont_mappings[ 'COG'].get(ref, '') elif ref.startswith('PF'): ontology['PFAM'][ref] = [self._create_ontology_event("PFAM")] self.ontologies_present['PFAM'][ref] = self.ont_mappings[ 'PFAM'].get(ref, '') elif ref.startswith('TIGR'): ontology['TIGRFAM'][ref] = [ self._create_ontology_event("TIGRFAM") ] self.ontologies_present['TIGRFAM'][ref] = self.ont_mappings[ 'TIGRFAM'].get(ref, '') else: db_xrefs.append(tuple(ref.split(":", 1))) return dict(ontology), db_xrefs def _transform_feature(self, contig, in_feature): """Converts a feature from the gff ftr format into the appropriate format for a genome object """ def _aliases(feat): keys = ('locus_tag', 'old_locus_tag', 'protein_id', 'transcript_id', 'gene', 'ec_number', 'gene_synonym') alias_list = [] for key in keys: if key in feat['attributes']: alias_list.extend([(key, val) for val in feat['attributes'][key]]) return alias_list if in_feature['start'] < 1 or in_feature['end'] > len(contig): self.warn("Feature with invalid location for specified " "contig: " + str(in_feature)) if self.strict: raise ValueError( "Features must be completely contained within the Contig in the " "Fasta file. Feature: " + str(in_feature)) return feat_seq = contig.seq[in_feature['start'] - 1:in_feature['end']].upper() if in_feature['strand'] in {'-', '-1'}: feat_seq = feat_seq.reverse_complement() # if the feature ID is duplicated (CDS or transpliced gene) we only # need to update the location and dna_sequence if in_feature.get('ID') in self.feature_dict: existing = self.feature_dict[in_feature['ID']] existing['location'].append(self._location(in_feature)) existing['dna_sequence'] = existing.get('dna_sequence', '') + str(feat_seq) existing['dna_sequence_length'] = len(existing['dna_sequence']) return # The following is common to all the feature types out_feat = { "id": in_feature.get('ID'), "type": in_feature['type'], "location": [self._location(in_feature)], "dna_sequence": str(feat_seq), "dna_sequence_length": len(feat_seq), "md5": hashlib.md5(str(feat_seq).encode('utf8')).hexdigest(), } # add optional fields if 'note' in in_feature['attributes']: out_feat['note'] = in_feature['attributes']["note"][0] ont, db_xrefs = self._get_ontology_db_xrefs(in_feature['attributes']) if ont: out_feat['ontology_terms'] = ont aliases = _aliases(in_feature) if aliases: out_feat['aliases'] = aliases if db_xrefs: out_feat['db_xrefs'] = db_xrefs if 'product' in in_feature['attributes']: out_feat['functions'] = in_feature['attributes']["product"] if 'product_name' in in_feature['attributes']: if "functions" in out_feat: out_feat['functions'].extend( in_feature['attributes']["product_name"]) else: out_feat['functions'] = in_feature['attributes'][ "product_name"] if 'function' in in_feature['attributes']: out_feat['functional_descriptions'] = in_feature['attributes'][ "function"] if 'inference' in in_feature['attributes']: GenomeUtils.parse_inferences(in_feature['attributes']['inference']) if 'trans-splicing' in in_feature['attributes'].get('exception', []): out_feat['flags'] = out_feat.get('flags', []) + ['trans_splicing'] if 'pseudo' in in_feature['attributes'].get('exception', []): out_feat['flags'] = out_feat.get('flags', []) + ['pseudo'] if 'ribosomal-slippage' in in_feature['attributes'].get( 'exception', []): out_feat['flags'] = out_feat.get('flags', []) + ['ribosomal_slippage'] parent_id = in_feature.get('Parent', '') if parent_id and parent_id not in self.feature_dict: raise ValueError( "Parent ID: {} was not found in feature ID list.".format( parent_id)) # if the feature is a exon or UTR, it will only be used to update the # location and sequence of it's parent, we add the info to it parent # feature but not the feature dict if in_feature['type'] in self.skip_types: if parent_id and in_feature['type'] in { 'exon', 'five_prime_UTR', 'three_prime_UTR' }: parent = self.feature_dict[parent_id] if in_feature['type'] not in parent: parent[in_feature['type']] = [] parent[in_feature['type']].append(out_feat) return # add type specific features elif 'gene' in in_feature['type']: out_feat['protein_translation_length'] = 0 out_feat['cdss'] = [] elif in_feature['type'] == 'CDS': if parent_id: parent = self.feature_dict[parent_id] if 'cdss' in parent: # parent must be a gene if not is_parent(parent, out_feat): parent["warnings"] = parent.get('warnings', []) + [ warnings[ "genes_CDS_child_fails_location_validation"]. format(out_feat["id"]) ] out_feat["warnings"] = out_feat.get('warnings', []) + [ warnings[ "CDS_fail_child_of_gene_coordinate_validation"] .format(parent_id) ] parent['cdss'].append(in_feature['ID']) out_feat['parent_gene'] = parent_id else: # parent must be mRNA if not is_parent(parent, out_feat): parent["warnings"] = parent.get('warnings', []) + [ warnings["mRNA_fail_parent_coordinate_validation"]. format(out_feat["id"]) ] out_feat["warnings"] = out_feat.get('warnings', []) + [ warnings[ "CDS_fail_child_of_mRNA_coordinate_validation"] .format(parent_id) ] parent['cds'] = in_feature['ID'] out_feat['parent_mrna'] = parent_id parent_gene = self.feature_dict[parent['parent_gene']] parent_gene['cdss'].append(in_feature['ID']) out_feat['parent_gene'] = parent['parent_gene'] # keep track of CDSs for post processing self.cdss.add(out_feat['id']) elif in_feature['type'] == 'mRNA': if parent_id: parent = self.feature_dict[parent_id] if 'mrnas' not in parent: parent['mrnas'] = [] if 'cdss' in parent: # parent must be a gene parent['mrnas'].append(in_feature['ID']) out_feat['parent_gene'] = parent_id if not is_parent(parent, out_feat): parent["warnings"] = parent.get('warnings', []) + [ warnings["genes_mRNA_child_fails_location_validation"]. format(out_feat["id"]) ] out_feat["warnings"] = out_feat.get('warnings', []) + [ warnings["mRNAs_parent_gene_fails_location_validation"] .format(parent_id) ] else: out_feat["type"] = in_feature['type'] # this prevents big misc_features from blowing up the genome size if out_feat['dna_sequence_length'] > MAX_MISC_FEATURE_SIZE: del out_feat['dna_sequence'] if parent_id: parent = self.feature_dict[parent_id] if 'children' not in parent: parent['children'] = [] parent['children'].append(out_feat['id']) out_feat['parent_gene'] = parent_id if not is_parent(parent, out_feat): parent["warnings"] = parent.get('warnings', []) + [ warnings[ "generic_parents_child_fails_location_validation"]. format(out_feat["id"]) ] out_feat["warnings"] = out_feat.get('warnings', []) + [ warnings[ "generic_childs_parent_fails_location_validation"]. format(parent_id) ] self.feature_dict[out_feat['id']] = out_feat def _process_cdss(self): """Because CDSs can have multiple fragments, it's necessary to go back over them to calculate a final protein sequence""" for cds_id in self.cdss: cds = self.feature_dict[cds_id] try: prot_seq = str( Seq(cds['dna_sequence']).translate(self.code_table, cds=True).strip("*")) except TranslationError as e: cds['warnings'] = cds.get('warnings', []) + [str(e)] prot_seq = "" cds.update({ "protein_translation": prot_seq, "protein_md5": hashlib.md5(prot_seq.encode('utf8')).hexdigest(), "protein_translation_length": len(prot_seq), }) if 'parent_gene' in cds: parent_gene = self.feature_dict[cds['parent_gene']] # no propigation for now propagate_cds_props_to_gene(cds, parent_gene) elif self.generate_genes: spoof = copy.copy(cds) spoof['type'] = 'gene' spoof['id'] = cds['id'] + "_gene" spoof['cdss'] = [cds['id']] spoof['warnings'] = [ warnings['spoofed_gene'].format(cds['id']) ] self.feature_dict[spoof['id']] = spoof cds['parent_gene'] = spoof['id'] self.spoof_gene_count += 1 else: raise ValueError(warnings['no_spoof']) self.feature_dict[cds['id']] = cds def _update_from_exons(self, feature): """This function updates the sequence and location of a feature based on it's UTRs, CDSs and exon information""" # note that start and end here are in direction of translation def start(loc): return loc[0][1] def end(loc): if loc[-1][2] == "+": return loc[-1][1] + loc[-1][3] + 1 else: return loc[-1][1] - loc[-1][3] - 1 if 'exon' in feature: # update the feature with the exon locations and sequences feature['location'] = [x['location'][0] for x in feature['exon']] feature['dna_sequence'] = "".join(x['dna_sequence'] for x in feature['exon']) feature['dna_sequence_length'] = len(feature['dna_sequence']) # construct feature location from utrs and cdss if present elif 'cds' in feature: cds = [copy.deepcopy(self.feature_dict[feature['cds']])] locs = [] seq = "" for frag in feature.get('five_prime_UTR', []) + cds + \ feature.get('three_prime_UTR', []): # merge into last location if adjacent if locs and abs(end(locs) - start(frag['location'])) == 1: # extend the location length by the length of the first # location in the fragment first = frag['location'].pop(0) locs[-1][3] += first[3] locs.extend(frag['location']) seq += frag['dna_sequence'] feature['location'] = locs feature['dna_sequence'] = seq feature['dna_sequence_length'] = len(seq) # remove these properties as they are no longer needed for x in ['five_prime_UTR', 'three_prime_UTR', 'exon']: feature.pop(x, None) else: ValueError('Feature {} must contain either exon or cds data to ' 'construct an accurate location and sequence'.format( feature['id'])) def _gen_genome_info(self, core_genome_name, scientific_name, assembly_ref, source, source_id, assembly, input_gff_file, molecule_type): """ _gen_genome_info: generate genome info """ genome = dict() genome["id"] = core_genome_name genome["scientific_name"] = scientific_name genome["assembly_ref"] = assembly_ref genome['molecule_type'] = molecule_type genome["features"] = [] genome["cdss"] = [] genome["mrnas"] = [] genome['non_coding_features'] = [] genome["gc_content"] = assembly["gc_content"] genome["dna_size"] = assembly["dna_size"] genome['md5'] = assembly['md5'] genome['contig_ids'], genome['contig_lengths'] = zip( *[(k, v['length']) for k, v in assembly['contigs'].items()]) genome['num_contigs'] = len(assembly['contigs']) genome['ontologies_present'] = dict(self.ontologies_present) genome['ontology_events'] = self.ontology_events genome['taxonomy'], genome['taxon_ref'], genome['domain'], \ genome["genetic_code"] = self.gi.retrieve_taxon(self.taxon_wsname, genome['scientific_name']) genome['source'], genome['genome_tiers'] = self.gi.determine_tier( source) genome['source_id'] = source_id # Phytozome gff files are not compatible with the RNASeq Pipeline # so it's better to build from the object than cache the file if self.is_phytozome: gff_file_to_shock = self.dfu.file_to_shock({ 'file_path': input_gff_file, 'make_handle': 1, 'pack': "gzip" }) genome['gff_handle_ref'] = gff_file_to_shock['handle']['hid'] for feature in self.feature_dict.values(): self.feature_counts[feature['type']] += 1 if 'exon' in feature or feature['type'] == 'mRNA': self._update_from_exons(feature) # Test if location order is in order. is_transpliced = "flags" in feature and "trans_splicing" in feature[ "flags"] if not is_transpliced and len(feature["location"]) > 1: # Check the order only if not trans_spliced and has more than 1 location. location_warning = self._check_location_order( feature["location"]) if location_warning is not None: feature["warnings"] = feature.get('warnings', []) + [location_warning] contig_len = genome["contig_lengths"][genome["contig_ids"].index( feature["location"][0][0])] feature = check_full_contig_length_or_multi_strand_feature( feature, is_transpliced, contig_len, self.skip_types) # sort features into their respective arrays if feature['type'] == 'CDS': del feature['type'] genome['cdss'].append(feature) elif feature['type'] == 'mRNA': del feature['type'] genome['mrnas'].append(feature) elif feature['type'] == 'gene': # remove duplicates that may arise from CDS info propagation for key in ('functions', 'aliases', 'db_xrefs'): if key in feature: feature[key] = list(set(feature[key])) if feature['cdss']: del feature['type'] self.feature_counts["protein_encoding_gene"] += 1 genome['features'].append(feature) else: feature.pop('mrnas', None) feature.pop('cdss', None) feature.pop('protein_translation_length', None) self.feature_counts["non_coding_features"] += 1 genome['non_coding_features'].append(feature) else: genome['non_coding_features'].append(feature) if self.warnings: genome['warnings'] = self.warnings genome['feature_counts'] = dict(self.feature_counts) return genome
class FastaGFFToGenome: def __init__(self, config): self.cfg = config self.au = AssemblyUtil(config.callbackURL) self.dfu = DataFileUtil(self.cfg.callbackURL) self.gi = GenomeInterface(self.cfg) self.taxon_wsname = self.cfg.raw['taxon-workspace-name'] self.time_string = str( datetime.datetime.fromtimestamp( time.time()).strftime('%Y_%m_%d_%H_%M_%S')) yml_text = open('/kb/module/kbase.yml').read() mod_match = re.search(r'module-version:\n\W+(.+)\n', yml_text) if mod_match: self.version = mod_match.group(1) else: self.version = None self.ont_mappings = load_ontology_mappings('/kb/module/data') self.code_table = 11 self.skip_types = ('exon', 'five_prime_UTR', 'three_prime_UTR', 'start_codon', 'stop_codon', 'region', 'chromosome', 'scaffold') self.spoof_gene_count = 0 self.is_phytozome = False self.is_metagenome = False self.strict = True self.generate_genes = False self.warnings = [] # type: list self.feature_dict = collections.OrderedDict() # type: dict self.cdss = set() # type: set self.ontologies_present = collections.defaultdict(dict) # type: dict self.ontology_events = list() # type: list self.skiped_features = collections.Counter( ) # type: collections.Counter self.feature_counts = collections.Counter( ) # type: collections.Counter self.re_api_url = config.re_api_url def warn(self, message): self.warnings.append(message) def generate_genome_json(self, params): # 1) validate parameters self._validate_import_file_params(params) self.code_table = params.get('genetic_code', 11) # 2) construct the input directory staging area input_directory = os.path.join(self.cfg.sharedFolder, 'fast_gff_upload_' + str(uuid.uuid4())) os.makedirs(input_directory) file_paths = self._stage_input(params, input_directory) # 3) extract out the parameters params = self._set_parsed_params(params) if params.get('generate_missing_genes'): self.generate_genes = True # 4) do the upload genome = self._gen_genome_json(params, file_paths["gff_file"], file_paths["fasta_file"]) return genome, input_directory def import_file(self, params): self.is_metagenome = params.get('is_metagenome', False) if self.is_metagenome: ws_datatype = "KBaseMetagenomes.AnnotatedMetagenomeAssembly" else: ws_datatype = "KBaseGenomes.Genome" genome, input_directory = self.generate_genome_json(params) json.dump(genome, open(f"{self.cfg.sharedFolder}/{genome['id']}.json", 'w'), indent=4) result = self.gi.save_one_genome({ 'workspace': params['workspace_name'], 'name': params['genome_name'], 'data': genome, "meta": params.get('metadata', {}), 'workspace_datatype': ws_datatype, }) feature_types = "\n".join( [f"{k}: {v}" for k, v in genome['feature_counts'].items()]) report_string = ( f"A genome with {len(genome['contig_ids'])} contigs and the following feature " f"types was imported: \n{feature_types}") # XXX report_string is unused except for this log logging.info(report_string) # 5) clear the temp directory shutil.rmtree(input_directory) # 6) return the result info = result['info'] prefix = '' if self.is_metagenome: prefix = 'meta' details = { prefix + 'genome_ref': f'{info[6]}/{info[0]}/{info[4]}', prefix + 'genome_info': info } return details def _gen_genome_json(self, params, input_gff_file, input_fasta_file): # reading in GFF file features_by_contig = self._retrieve_gff_file(input_gff_file) contig_ids = set() # parse feature information fasta_contigs = Bio.SeqIO.parse(input_fasta_file, "fasta") for contig in fasta_contigs: molecule_type = str(contig.seq.alphabet).replace( 'IUPACAmbiguous', '').strip('()') contig_ids.add(contig.id) for feature in features_by_contig.get(contig.id, []): self._transform_feature(contig, feature) for cid in set(features_by_contig.keys()) - contig_ids: self.warn( f"Sequence name {cid} does not match a sequence id in the FASTA file." f"{len(features_by_contig[cid])} features will not be imported." ) if self.strict: raise ValueError( "Every feature sequence id must match a fasta sequence id") prot_fasta_path = f"{self.cfg.sharedFolder}/{params['genome_name']}_protein.fasta" # if is a metagenome, the following function writes a protein fasta self._process_cdss(prot_fasta_path) # save assembly file ''' Metagenome Changes: if we want to pass more stuff to AssemblyUtil, do here. TODO: add flag to save_assembly_from_fasta ''' if self.is_metagenome: genome_type = "metagenome" else: genome_type = params.get('genome_type', 'isolate') if params.get('existing_assembly_ref'): assembly_ref = params['existing_assembly_ref'] ret = self.dfu.get_objects({'object_refs': [assembly_ref]})['data'][0] assembly_obj_type = ret['info'][2].split('-')[0] valid_assembly_types = [ "KBaseGenomeAnnotations.Assembly", "KBaseGenomes.ContigSet" ] if assembly_obj_type not in valid_assembly_types: raise ValueError( f"{assembly_ref} is not a reference to an assembly") assembly_data = ret['data'] # should do more thorough check of sequences. if not validate_lists_have_same_elements( assembly_data['contigs'].keys(), contig_ids): raise ValueError( f"provided assembly with ref {assembly_ref} does not " "have matching contig ids to provided input fasta.") logging.info(f"Using supplied assembly: {assembly_ref}") else: assembly_ref = self.au.save_assembly_from_fasta({ 'file': { 'path': input_fasta_file }, 'workspace_name': params['workspace_name'], 'assembly_name': params['genome_name'] + ".assembly", 'type': genome_type, }) assembly_data = self.dfu.get_objects({ 'object_refs': [assembly_ref], 'ignore_errors': 0 })['data'][0]['data'] # generate genome info genome = self._gen_genome_info(assembly_ref, assembly_data, input_gff_file, molecule_type, prot_fasta_path, params) if self.spoof_gene_count > 0: self.warn(warnings['spoofed_genome'].format(self.spoof_gene_count)) genome['suspect'] = 1 if self.warnings: genome['warnings'] = self.warnings return genome @staticmethod def _location(in_feature): in_feature['strand'] = in_feature['strand'].replace( "-1", "-").translate(strand_table) if in_feature['strand'] == '+': start = in_feature['start'] elif in_feature['strand'] == '-': start = in_feature['end'] else: raise ValueError('Invalid feature strand: {}'.format( in_feature['strand'])) return [ in_feature['contig'], start, in_feature['strand'], in_feature['end'] - in_feature['start'] + 1 ] @staticmethod def _validate_import_file_params(params): """ validate_import_file_params: validates params passed to FastaGFFToGenome.import_file method """ # check for required parameters for p in ['workspace_name', 'genome_name', 'fasta_file', 'gff_file']: if p not in params: raise ValueError(f'"{p}" parameter is required, but missing') # one and only one of 'path', or 'shock_id' is required for key in ('fasta_file', 'gff_file'): file = params[key] if not isinstance(file, dict): raise ValueError(f'Required "{key}" field must be a map/dict') sources = ('path', 'shock_id') n_valid_fields = sum(1 for f in sources if file.get(f)) print(f"inputs: {n_valid_fields}") if n_valid_fields < 1: raise ValueError( f'Required "{key}" field must include one source: ' f'{", ".join(sources)}') if n_valid_fields > 1: raise ValueError( f'Required "{key}" field has too many sources specified: ' f'{", ".join(file.keys())}') if params.get('genetic_code'): if not (isinstance(params['genetic_code'], int) and 0 < params['genetic_code'] < 32): raise ValueError( "Invalid genetic code specified: {}".format(params)) def _set_parsed_params(self, params): logging.info('Setting params') default_params = { 'taxon_wsname': self.cfg.raw['taxon-workspace-name'], 'scientific_name': 'unknown_taxon', 'source': 'User', 'release': None, 'metadata': {}, 'source_id': 'unknown', } default_params.update(params) logging.info(json.dumps(default_params, indent=1)) return default_params def _stage_input(self, params, input_directory): """ stage_input: Setup the input_directory by fetching the files and uncompressing if needed """ file_paths = dict() for key in ('fasta_file', 'gff_file'): file = params[key] file_path = None ''' below seems like weird if statement ''' if file.get('path') is not None: local_file_path = file['path'] file_path = os.path.join(input_directory, os.path.basename(local_file_path)) logging.info( f'Moving file from {local_file_path} to {file_path}') # Metagenome Updates # not sure if we have to be careful about moving the objects # around if os.path.isfile(local_file_path): shutil.copy2(local_file_path, file_path) else: raise FileNotFoundError( f"Input {key} file {local_file_path} not found") err_msg = "Shutil copy unsucessful" elif file.get('shock_id') is not None: # handle shock file logging.info(f'Downloading file from SHOCK node: ' f'{self.cfg.sharedFolder}-{file["shock_id"]}') sys.stdout.flush() file_name = self.dfu.shock_to_file({ 'file_path': input_directory, 'shock_id': file['shock_id'] })['node_file_name'] file_path = os.path.join(input_directory, file_name) err_msg = "Shock retrieval" # extract the file if it is compressed ''' Metagenome Changes: may have to make check here to see if the the file is too big for working dir. ''' if file_path is not None: logging.info("staged input file =" + file_path) sys.stdout.flush() if not os.path.isfile(file_path): raise FileNotFoundError(f"{file_path} not a file") dfUtil_result = self.dfu.unpack_file({'file_path': file_path}) file_paths[key] = dfUtil_result['file_path'] err_msg = "DataFielUtil 'unpack_file' function call" else: raise ValueError( 'No valid files could be extracted based on the input') if not os.path.isfile(file_path): raise ValueError(f"{err_msg} for {key} file to {file_path}") return file_paths def _retrieve_gff_file(self, input_gff_file): """ _retrieve_gff_file: retrieve info from gff_file """ logging.info("Reading GFF file") feature_list = collections.defaultdict(list) # type: dict is_patric = 0 ''' Metagenome Changes: the lines below iterate through the entire gff input file, which for a Metagenome may be an issue. ! Only a problem if there are space limits on processing in this request ''' for current_line in open(input_gff_file): if current_line.isspace( ) or current_line == "" or current_line.startswith("#"): continue # Split line try: (contig_id, source_id, feature_type, start, end, score, strand, phase, attributes) = current_line.split('\t') except ValueError: raise ValueError(f"unable to parse {current_line}") ''' Do Metagenomes need this phytozome/PATRIC stuff??''' # Checking to see if Phytozome if "phytozome" in source_id.lower(): self.is_phytozome = True # Checking to see if Phytozome if "PATRIC" in source_id: is_patric = True # PATRIC prepends their contig ids with some gibberish if is_patric and "|" in contig_id: contig_id = contig_id.split("|", 1)[1] # Populating basic feature object ftr: dict = { 'contig': contig_id, 'source': source_id, 'type': feature_type, 'start': int(start), 'end': int(end), 'score': score, 'strand': strand, 'phase': phase, 'attributes': collections.defaultdict(list) } # Populating with attribute key-value pair # This is where the feature id is from for attribute in attributes.split(";"): attribute = attribute.strip() # Sometimes empty string if not attribute: continue # Use of 1 to limit split as '=' character can also be made available later # Sometimes lack of "=", assume spaces instead if "=" in attribute: key, value = attribute.split("=", 1) elif " " in attribute: key, value = attribute.split(" ", 1) else: logging.debug(f'Unable to parse {attribute}') continue ftr['attributes'][make_snake_case(key)].append( parse.unquote(value.strip('"'))) ftr['attributes']['raw'] = attributes if "id" in ftr['attributes']: ftr['ID'] = ftr['attributes']['id'][0] if "parent" in ftr['attributes']: ftr['Parent'] = ftr['attributes']['parent'][0] feature_list[contig_id].append(ftr) # Some GFF/GTF files don't use "ID" so we go through the possibilities feature_list = self._add_missing_identifiers(feature_list) # Most bacterial files have only CDSs # In order to work with prokaryotic and eukaryotic gene structure synonymously # Here we add feature dictionaries representing the parent gene and mRNAs # feature_list = self._add_missing_parents(feature_list) # Phytozome has the annoying habit of editing their identifiers so we fix them if self.is_phytozome: self._update_phytozome_features(feature_list) # All identifiers need to be checked so that they follow the same general rules # Rules are listed within the function itself feature_list = self._update_identifiers(feature_list) return feature_list def _add_missing_identifiers(self, feature_list): logging.info("Adding missing identifiers") # General rule is to iterate through a range of possibilities if "ID" is missing for contig in feature_list: for i, feat in enumerate(feature_list[contig]): if "ID" not in feature_list[contig][i]: # all of the following are not guaranteed to be unique ID's # for key in ("transcriptid", "proteinid", "pacid", # "parent", "name", 'transcript_id'): for key in ("protein_id", "name", "pacid", "parent"): if key in feature_list[contig][i]['attributes']: feature_list[contig][i]['ID'] = feature_list[ contig][i]['attributes'][key][0] break if feat['type'] not in self.skip_types: self.feature_counts[feat['type']] += 1 # If the process fails, throw an error if "ID" not in feature_list[contig][i]: feat[ 'ID'] = f"{feat['type']}_{self.feature_counts[feat['type']]}" return feature_list def _add_missing_parents(self, feature_list): # General rules is if CDS or RNA missing parent, add them for contig in feature_list: ftrs = feature_list[contig] new_ftrs = [] for i in range(len(ftrs)): if ftrs[i]["type"] in self.skip_types: continue if "Parent" not in ftrs[i]: # Assuming parent doesn't exist at all, so create de novo instead of trying to find it if "RNA" in ftrs[i]["type"] or "CDS" in ftrs[i]["type"]: new_gene_ftr = copy.deepcopy(ftrs[i]) new_gene_ftr["type"] = "gene" ftrs[i]["Parent"] = new_gene_ftr["ID"] new_ftrs.append(new_gene_ftr) if "CDS" in ftrs[i]["type"]: new_rna_ftr = copy.deepcopy(ftrs[i]) new_rna_ftr["type"] = "mRNA" new_ftrs.append(new_rna_ftr) ftrs[i]["Parent"] = new_rna_ftr["ID"] new_ftrs.append(ftrs[i]) feature_list[contig] = new_ftrs return feature_list @staticmethod def _update_phytozome_features(feature_list): # General rule is to use the "Name" field where possible # And update parent attribute correspondingly for contig in feature_list: feature_position_dict = {} for i in range(len(feature_list[contig])): # Maintain old_id for reference # Sometimes ID isn't available, so use PACid old_id = None for key in ("id", "pacid"): if key in feature_list[contig][i]['attributes']: old_id = feature_list[contig][i]['attributes'][key][0] break if old_id is None: continue # Retain old_id feature_position_dict[old_id] = i # Clip off the increment on CDS IDs so fragments of the same # CDS share the same ID if "CDS" in feature_list[contig][i]["ID"]: feature_list[contig][i]["ID"] = feature_list[contig][i][ "ID"].rsplit('.', 1)[0] # In Phytozome, gene and mRNA have "Name" field, CDS do not if "name" in feature_list[contig][i]['attributes']: feature_list[contig][i]["ID"] = feature_list[contig][i][ 'attributes']['name'][0] if "Parent" in feature_list[contig][i]: # Update Parent to match new ID of parent ftr feature_list[contig][i]["Parent"] = feature_list[contig][ feature_position_dict[feature_list[contig][i] ["Parent"]]]["ID"] return feature_list def _update_identifiers(self, feature_list): # General rules: # 1) Genes keep identifier # 2) RNAs keep identifier only if its different from gene, otherwise append ".mRNA" # 3) CDS always uses RNA identifier with ".CDS" appended mRNA_parent_dict = dict() for contig in feature_list: for ftr in feature_list[contig]: if ftr["type"] in self.skip_types: continue if "Parent" in ftr: # Retain old_id of parents old_id = ftr["ID"] if ftr["ID"] == ftr["Parent"] or "CDS" in ftr["type"]: ftr["ID"] = ftr["Parent"] + "." + ftr["type"] # link old to new ids for mRNA to use with CDS if "RNA" in ftr["type"]: mRNA_parent_dict[old_id] = ftr["ID"] return feature_list def _check_location_order(self, locations): """If order looks good return None. If out of order return warning If on multiple strands return warning""" strand = None last_start = 0 for location in locations: if strand is None: strand = location[2] elif strand != location[2]: return warnings["both_strand_coordinates"] if strand == "-": locations = reversed(locations) for location in locations: if last_start > location[1]: return warnings["out_of_order"] else: last_start = location[1] return None def _create_ontology_event(self, ontology_type): """Creates the ontology_event if necessary Returns the index of the ontology event back.""" if ontology_type not in self.ont_mappings: raise ValueError( "{} is not a supported ontology".format(ontology_type)) if "event_index" not in self.ont_mappings[ontology_type]: self.ont_mappings[ontology_type]['event_index'] = len( self.ontology_events) if ontology_type == "GO": ontology_ref = "KBaseOntology/gene_ontology" elif ontology_type == "PO": ontology_ref = "KBaseOntology/plant_ontology" else: ontology_ref = f"KBaseOntology/{ontology_type.lower()}_ontology" self.ontology_events.append({ "method": "GenomeFileUtils Genbank uploader from annotations", "method_version": self.version, "timestamp": self.time_string, "id": ontology_type, "ontology_ref": ontology_ref }) return self.ont_mappings[ontology_type]['event_index'] def _get_ontology_db_xrefs(self, feature): """Splits the ontology info from the other db_xrefs""" ontology = collections.defaultdict(dict) # type: dict db_xrefs = [] # these are keys are formatted strangely and require special parsing for key in ("go_process", "go_function", "go_component"): ontology_event_index = self._create_ontology_event("GO") for term in feature.get(key, []): sp = term.split(" - ") ontology['GO'][sp[0]] = [ontology_event_index] self.ontologies_present['GO'][ sp[0]] = self.ont_mappings['GO'].get(sp[0], '') # CATH terms are not distinct from EC numbers so myst be found by key for term in feature.get('cath_funfam', []) + feature.get('cath', []): for ref in term.split(','): ontology['CATH'][ref] = [self._create_ontology_event("CATH")] self.ontologies_present['CATH'][ref] = self.ont_mappings[ 'CATH'].get(ref, '') search_keys = [ 'ontology_term', 'db_xref', 'dbxref', 'product_source', 'tigrfam', 'pfam', 'cog', 'go', 'po', 'ko' ] ont_terms = [] # type: list # flatten out into list of values for key in search_keys: if key in feature: ont_terms += [x for y in feature[key] for x in y.split(',')] for ref in ont_terms: if ref.startswith('GO:'): ontology['GO'][ref] = [self._create_ontology_event("GO")] self.ontologies_present['GO'][ref] = self.ont_mappings[ 'GO'].get(ref, '') elif ref.startswith('PO:'): ontology['PO'][ref] = [self._create_ontology_event("PO")] self.ontologies_present['PO'][ref] = self.ont_mappings[ 'PO'].get(ref, '') elif ref.startswith('KO:'): ontology['KO'][ref] = [self._create_ontology_event("KO")] self.ontologies_present['KO'][ref] = self.ont_mappings[ 'KO'].get(ref, '') elif ref.startswith('COG'): ontology['COG'][ref] = [self._create_ontology_event("COG")] self.ontologies_present['COG'][ref] = self.ont_mappings[ 'COG'].get(ref, '') elif ref.startswith('PF'): ontology['PFAM'][ref] = [self._create_ontology_event("PFAM")] self.ontologies_present['PFAM'][ref] = self.ont_mappings[ 'PFAM'].get(ref, '') elif ref.startswith('TIGR'): ontology['TIGRFAM'][ref] = [ self._create_ontology_event("TIGRFAM") ] self.ontologies_present['TIGRFAM'][ref] = self.ont_mappings[ 'TIGRFAM'].get(ref, '') elif ":" not in ref: db_xrefs.append(tuple(["Unknown_Source", ref])) else: db_xrefs.append(tuple(ref.split(":", 1))) return dict(ontology), db_xrefs ''' Metagenome Changes: okay looks like this might be the real meat of it ''' def _transform_feature(self, contig, in_feature): """Converts a feature from the gff ftr format into the appropriate format for a genome object """ def _aliases(feat): keys = ('locus_tag', 'old_locus_tag', 'protein_id', 'transcript_id', 'gene', 'ec_number', 'gene_synonym') alias_list = [] for key in keys: if key in feat['attributes']: alias_list.extend([(key, val) for val in feat['attributes'][key]]) return alias_list if in_feature['start'] < 1 or in_feature['end'] > len(contig): self.warn( f"Feature with invalid location for specified contig: {in_feature}" ) if self.strict: raise ValueError( "Features must be completely contained within the Contig in the " f"Fasta file. Feature: in_feature") return feat_seq = contig.seq[in_feature['start'] - 1:in_feature['end']].upper() if in_feature['strand'] in {'-', '-1'}: feat_seq = feat_seq.reverse_complement() # if the feature ID is duplicated (CDS or transpliced gene) we only # need to update the location and dna_sequence if in_feature.get('ID') in self.feature_dict: existing = self.feature_dict[in_feature['ID']] existing['location'].append(self._location(in_feature)) existing['dna_sequence'] = existing.get('dna_sequence', '') + str(feat_seq) existing['dna_sequence_length'] = len(existing['dna_sequence']) return # The following is common to all the feature types out_feat = { "id": in_feature.get('ID'), "type": in_feature['type'], "location": [self._location(in_feature)], "dna_sequence": str(feat_seq), "dna_sequence_length": len(feat_seq), "md5": hashlib.md5(str(feat_seq).encode('utf8')).hexdigest(), "warnings": [], "flags": [], } # add optional fields if 'note' in in_feature['attributes']: out_feat['note'] = in_feature['attributes']["note"][0] ont, db_xrefs = self._get_ontology_db_xrefs(in_feature['attributes']) if ont: out_feat['ontology_terms'] = ont aliases = _aliases(in_feature) if aliases: out_feat['aliases'] = aliases if db_xrefs: out_feat['db_xrefs'] = db_xrefs if 'product' in in_feature['attributes']: out_feat['functions'] = in_feature['attributes']["product"] if 'product_name' in in_feature['attributes']: if "functions" in out_feat: out_feat['functions'].extend( in_feature['attributes']["product_name"]) else: out_feat['functions'] = in_feature['attributes'][ "product_name"] if 'function' in in_feature['attributes']: out_feat['functional_descriptions'] = in_feature['attributes'][ "function"] if 'inference' in in_feature['attributes']: GenomeUtils.parse_inferences(in_feature['attributes']['inference']) if 'trans-splicing' in in_feature['attributes'].get('exception', []): out_feat['flags'].append('trans_splicing') if 'pseudo' in in_feature['attributes'].get('exception', []): out_feat['flags'].append('pseudo') if 'ribosomal-slippage' in in_feature['attributes'].get( 'exception', []): out_feat['flags'].append('ribosomal_slippage') parent_id = in_feature.get('Parent', '') if parent_id and parent_id not in self.feature_dict: raise ValueError( f"Parent ID: {parent_id} was not found in feature ID list.") # if the feature is a exon or UTR, it will only be used to update the # location and sequence of it's parent, we add the info to it parent # feature but not the feature dict if in_feature['type'] in self.skip_types: if parent_id and in_feature['type'] in { 'exon', 'five_prime_UTR', 'three_prime_UTR' }: parent = self.feature_dict[parent_id] if in_feature['type'] not in parent: parent[in_feature['type']] = [] parent[in_feature['type']].append(out_feat) return # add type specific features elif 'gene' in in_feature['type']: out_feat['protein_translation_length'] = 0 out_feat['cdss'] = [] elif in_feature['type'] == 'CDS': if parent_id: parent = self.feature_dict[parent_id] if 'cdss' in parent: # parent must be a gene if not is_parent(parent, out_feat): parent["warnings"] = parent.get('warnings', []) + [ warnings[ "genes_CDS_child_fails_location_validation"]. format(out_feat["id"]) ] out_feat["warnings"].append(warnings[ "CDS_fail_child_of_gene_coordinate_validation"]. format(parent_id)) parent['cdss'].append(in_feature['ID']) out_feat['parent_gene'] = parent_id else: # parent must be mRNA if not is_parent(parent, out_feat): parent["warnings"] = parent.get('warnings', []) + [ warnings["mRNA_fail_parent_coordinate_validation"]. format(out_feat["id"]) ] out_feat["warnings"].append(warnings[ "CDS_fail_child_of_mRNA_coordinate_validation"]. format(parent_id)) parent['cds'] = in_feature['ID'] out_feat['parent_mrna'] = parent_id parent_gene = self.feature_dict[parent['parent_gene']] parent_gene['cdss'].append(in_feature['ID']) out_feat['parent_gene'] = parent['parent_gene'] # keep track of CDSs for post processing self.cdss.add(out_feat['id']) elif in_feature['type'] == 'mRNA': if parent_id: parent = self.feature_dict[parent_id] if 'mrnas' not in parent: parent['mrnas'] = [] if 'cdss' in parent: # parent must be a gene parent['mrnas'].append(in_feature['ID']) out_feat['parent_gene'] = parent_id if not is_parent(parent, out_feat): parent["warnings"] = parent.get('warnings', []) + [ warnings["genes_mRNA_child_fails_location_validation"]. format(out_feat["id"]) ] out_feat["warnings"].append( warnings["mRNAs_parent_gene_fails_location_validation"] .format(parent_id)) else: out_feat["type"] = in_feature['type'] # this prevents big misc_features from blowing up the genome size if out_feat['dna_sequence_length'] > MAX_MISC_FEATURE_SIZE: del out_feat['dna_sequence'] if parent_id: parent = self.feature_dict[parent_id] if 'children' not in parent: parent['children'] = [] parent['children'].append(out_feat['id']) out_feat['parent_gene'] = parent_id if not is_parent(parent, out_feat): parent["warnings"] = parent.get('warnings', []) + [ warnings[ "generic_parents_child_fails_location_validation"]. format(out_feat["id"]) ] out_feat["warnings"].append(warnings[ "generic_childs_parent_fails_location_validation"]. format(parent_id)) # cleanup empty optional arrays for key in ['warnings', 'flags']: if not out_feat[key]: del out_feat[key] self.feature_dict[out_feat['id']] = out_feat def _process_cdss(self, prot_fasta_path): """Because CDSs can have multiple fragments, it's necessary to go back over them to calculate a final protein sequence""" if self.is_metagenome: prot_fasta = {} # type: dict untranslatable_prot = set() for cds_id in self.cdss: cds = self.feature_dict[cds_id] try: prot_seq = str( Seq(cds['dna_sequence']).translate(self.code_table, cds=True).strip("*")) except TranslationError as e: cds['warnings'] = cds.get('warnings', []) + [str(e)] # NOTE: we may need a different way of handling this for metagenomes. prot_seq = "" if self.is_metagenome: untranslatable_prot.add(cds_id) if self.is_metagenome: if prot_seq != "": protein_id = "" if cds.get("aliases"): aliases = cds['aliases'] for key, val in aliases: if key == "protein_id": protein_id = val if not protein_id: protein_id = cds['id'] # assign to some default else: # log a warning here? pass # TODO: update header to reflect what we actually want people # to see. if protein_id in prot_fasta: prot_fasta[protein_id][0] += "|" + cds['id'] else: fasta_seq_data = ">" + protein_id + " cds_ids:" + cds[ 'id'] prot_fasta[protein_id] = [fasta_seq_data, prot_seq] else: pass else: cds.update({ "protein_translation": prot_seq, "protein_md5": hashlib.md5(prot_seq.encode('utf8')).hexdigest(), "protein_translation_length": len(prot_seq), }) if 'parent_gene' in cds: parent_gene = self.feature_dict[cds['parent_gene']] # no propigation for now propagate_cds_props_to_gene(cds, parent_gene, self.is_metagenome) elif self.generate_genes: spoof = copy.copy(cds) spoof['type'] = 'gene' spoof['id'] = cds['id'] + "_gene" spoof['cdss'] = [cds['id']] spoof['warnings'] = [ warnings['spoofed_gene'].format(cds['id']) ] self.feature_dict[spoof['id']] = spoof cds['parent_gene'] = spoof['id'] self.spoof_gene_count += 1 else: raise ValueError(warnings['no_spoof']) self.feature_dict[cds['id']] = cds if self.is_metagenome: with open(prot_fasta_path, 'w') as fid: for key, line in prot_fasta.items(): fid.write('\n'.join(line)) # do something with 'untranslatable_prot' def _update_from_exons(self, feature): """This function updates the sequence and location of a feature based on it's UTRs, CDSs and exon information""" # note that start and end here are in direction of translation def start(loc): return loc[0][1] def end(loc): if loc[-1][2] == "+": return loc[-1][1] + loc[-1][3] + 1 else: return loc[-1][1] - loc[-1][3] - 1 if 'exon' in feature: # update the feature with the exon locations and sequences feature['location'] = [x['location'][0] for x in feature['exon']] feature['dna_sequence'] = "".join(x['dna_sequence'] for x in feature['exon']) feature['dna_sequence_length'] = len(feature['dna_sequence']) # construct feature location from utrs and cdss if present elif 'cds' in feature: cds = [copy.deepcopy(self.feature_dict[feature['cds']])] locs = [] # type: list seq = "" for frag in feature.get('five_prime_UTR', []) + cds + \ feature.get('three_prime_UTR', []): # merge into last location if adjacent if locs and abs(end(locs) - start(frag['location'])) == 1: # extend the location length by the length of the first # location in the fragment first = frag['location'].pop(0) locs[-1][3] += first[3] locs.extend(frag['location']) seq += frag['dna_sequence'] feature['location'] = locs feature['dna_sequence'] = seq feature['dna_sequence_length'] = len(seq) # remove these properties as they are no longer needed for x in ['five_prime_UTR', 'three_prime_UTR', 'exon']: feature.pop(x, None) else: ValueError( 'Feature {feature["id"]} must contain either exon or cds data to ' 'construct an accurate location and sequence') def _gen_genome_info(self, assembly_ref, assembly, input_gff_file, molecule_type, prot_fasta_path, params): """ _gen_genome_info: generate genome info Here is the meat of the saving operation. Genome Fields: features: protein encoding genes cdss: mrnas: mrna sequences non_coding_features: everything that doesn't fall into 'features', 'cdss', 'mrnas' """ features = [] cdss = [] mrnas = [] non_coding_features = [] genome = { "id": params.get('genome_name'), "scientific_name": params.get('scientific_name', "Unknown"), "assembly_ref": assembly_ref, 'molecule_type': molecule_type, "gc_content": assembly["gc_content"], "dna_size": assembly["dna_size"], 'md5': assembly['md5'], 'num_contigs': len(assembly['contigs']), 'ontologies_present': dict(self.ontologies_present), 'ontology_events': self.ontology_events, } if self.is_metagenome: metagenome_fields = [ ("publications", []), ("external_source_origination_date", None), ("original_source_file_name", None), ("notes", None), # NOTE: in the future environment should use an ontology. ("environment", None), ] # type: list for field, default in metagenome_fields: genome[field] = params.get(field, default) # save protein fasta to shock prot_to_shock = self.dfu.file_to_shock({ 'file_path': prot_fasta_path, 'make_handle': 1, 'pack': 'gzip' }) genome['protein_handle_ref'] = prot_to_shock['handle']['hid'] genome['contig_ids'], genome['contig_lengths'] = zip( *[(k, v['length']) for k, v in assembly['contigs'].items()]) if self.is_metagenome: genome['source'], _ = self.gi.determine_tier(params.get('source')) else: genome['source'], genome['genome_tiers'] = self.gi.determine_tier( params.get('source')) # Set taxonomy-related fields in the genome data if params.get('taxon_id'): GenomeUtils.set_taxon_data(int(params['taxon_id']), self.re_api_url, genome) else: GenomeUtils.set_default_taxon_data(genome) # handle optional fields for key in ('release', 'genetic_code', 'genome_type', 'source_id'): if params.get(key): genome[key] = params[key] # Phytozome gff files are not compatible with the RNASeq Pipeline # so it's better to build from the object than cache the file if self.is_phytozome or self.is_metagenome: gff_file_to_shock = self.dfu.file_to_shock({ 'file_path': input_gff_file, 'make_handle': 1, 'pack': "gzip" }) genome['gff_handle_ref'] = gff_file_to_shock['handle']['hid'] for feature in self.feature_dict.values(): self.feature_counts[feature['type']] += 1 if 'exon' in feature or feature['type'] == 'mRNA': self._update_from_exons(feature) # Test if location order is in order. is_transpliced = "flags" in feature and "trans_splicing" in feature[ "flags"] if not is_transpliced and len(feature["location"]) > 1: # Check the order only if not trans_spliced and has more than 1 location. location_warning = self._check_location_order( feature["location"]) if location_warning is not None: feature["warnings"] = feature.get('warnings', []) + [location_warning] contig_len = genome["contig_lengths"][genome["contig_ids"].index( feature["location"][0][0])] feature = check_full_contig_length_or_multi_strand_feature( feature, is_transpliced, contig_len, self.skip_types) # sort features into their respective arrays if feature['type'] == 'CDS': if not self.is_metagenome: del feature['type'] cdss.append(feature) elif feature['type'] == 'mRNA': if not self.is_metagenome: del feature['type'] mrnas.append(feature) elif feature['type'] == 'gene': # remove duplicates that may arise from CDS info propagation for key in ('functions', 'aliases', 'db_xrefs'): if key in feature: feature[key] = list(set(feature[key])) if feature['cdss']: if not self.is_metagenome: del feature['type'] self.feature_counts["protein_encoding_gene"] += 1 features.append(feature) else: feature.pop('mrnas', None) feature.pop('cdss', None) feature.pop('protein_translation_length', None) self.feature_counts["non_coding_gene"] += 1 non_coding_features.append(feature) else: non_coding_features.append(feature) # if input is metagenome, save features, cdss, non_coding_features, and # mrnas to shock if self.is_metagenome: # TODO: make this section more efficient by editing the above. metagenome_features = features + cdss + mrnas + non_coding_features genome['num_features'] = len(metagenome_features) genome_name = params['genome_name'] json_file_path = f'{self.cfg.sharedFolder}/{genome_name}_features.json' # save to json files first with open(json_file_path, 'w') as fid: json.dump(metagenome_features, fid) # write json to shock json_to_shock = self.dfu.file_to_shock({ 'file_path': json_file_path, 'make_handle': 1, 'pack': 'gzip' }) self.feature_counts["non_coding_features"] = len( non_coding_features) genome['features_handle_ref'] = json_to_shock['handle']['hid'] # remove json file to avoid disk overload os.remove(json_file_path) # delete python objects to reduce overhead del metagenome_features del features, cdss, mrnas, non_coding_features else: # TODO determine whether we want to deepcopy here instead of reference. genome['features'] = features genome['cdss'] = cdss genome['mrnas'] = mrnas genome['non_coding_features'] = non_coding_features self.feature_counts["non_coding_features"] = len( genome['non_coding_features']) if self.warnings: genome['warnings'] = self.warnings genome['feature_counts'] = dict(self.feature_counts) return genome