def _load_go_terms(self, go_terms, feature_id, analysis_id, go_db_id, skip_missing):
    """
    Attach a list of GO terms to a feature and to an analysis.

    Each entry of ``go_terms`` must look like ``<db>:<accession>``. The
    matching cvterm id is resolved with ``self.ci.get_cvterm_id`` and then
    handed to ``_add_feat_cvterm_with_id`` and ``_add_analysis_feature``.

    :param go_terms: iterable of GO identifiers
    :param feature_id: id of the feature to annotate
    :param analysis_id: id of the analysis the annotation belongs to
    :param go_db_id: accepted for interface compatibility; not used here
    :param skip_missing: when true, unknown terms are warned about and
        skipped instead of raising

    :raises Exception: when an identifier does not split into exactly two
        parts on ``:`` (the session is rolled back first), or when a term
        is unknown and ``skip_missing`` is false
    """
    for go_id in go_terms:
        pieces = go_id.split(':')
        if len(pieces) != 2:
            self.session.rollback()
            raise Exception("Cannot parse GO term {}".format(go_id))
        db_name, accession = pieces

        try:
            cvterm_id = self.ci.get_cvterm_id(accession, db_name)
        except chado.RecordNotFoundError:
            cvterm_id = None

        if not cvterm_id:
            if not skip_missing:
                raise Exception('Could not find term with name "%s"' % accession)
            warn('Could not find term with name "%s", skipping it', accession)
            continue

        # Link the feature to the GO term (the helper is expected to skip
        # links that already exist)
        self._add_feat_cvterm_with_id(feature_id, cvterm_id)

        # Record the term for this feature within the analysis
        self._add_analysis_feature(feature_id, analysis_id, cvterm_id, accession)
def _parse_interpro_xml5(self, analysis_id, organism_id, xml, parse_go, re_name, query_type, skip_missing):
    """
    Walk an InterProScan 5 XML tree and load the results of each entry.

    For every ``xref`` child of every top-level element, the feature is
    looked up first by the ``id`` attribute and, if that raises
    ``RecordNotFoundError``, by the ``name`` attribute. The entry is then
    passed to ``_add_analysis_feature_ipr``, parsed with
    ``_parse_feature_xml``, and its IPR (and optionally GO) terms loaded.

    :param analysis_id: id of the analysis the results belong to
    :param organism_id: id of the organism owning the features
    :param xml: iterable root element of the parsed XML
    :param parse_go: load GO terms too (needs a db named "GO")
    :param re_name: optional regex forwarded to ``_match_feature``
    :param query_type: feature type forwarded to ``_match_feature``
    :param skip_missing: skip unknown features/terms instead of raising

    :return: number of top-level elements iterated
    """
    # Only look the GO db up (and only complain about its absence) when GO
    # loading was actually requested.
    go_db_id = False
    if parse_go:
        res = self.session.query(self.model.db).filter_by(name="GO")
        if res.count():
            go_db_id = res.one().db_id
        else:
            warn("Goterm loading was requested but the GO schema is not installed in chado, skipping")

    total_count = 0

    for entity in xml:
        total_count += 1
        for child in entity:
            if child.tag != "xref":
                continue

            seq_id = child.get('id')
            try:
                feature_id = self._match_feature(seq_id, re_name, query_type, organism_id, skip_missing=False)  # we need to have an exception if it fails
            except RecordNotFoundError:
                # Fall back on the name attribute, this time honouring skip_missing
                seq_name = child.get('name', "")
                feature_id = self._match_feature(seq_name, re_name, query_type, organism_id, skip_missing)

            if skip_missing and feature_id is None:
                continue

            analysisfeature_id = self._add_analysis_feature_ipr(feature_id, analysis_id, entity)
            if not analysisfeature_id:
                continue

            ipr_array = self._parse_feature_xml(entity, feature_id)
            ipr_terms = ipr_array["iprterms"]
            self._load_ipr_terms(ipr_terms, feature_id, analysis_id, skip_missing)

            if parse_go and go_db_id:
                self._load_go_terms(ipr_array["goterms"], feature_id, analysis_id, go_db_id, skip_missing)

    return total_count
def _load_ipr_terms(self, ipr_terms, feature_id, analysis_id, skip_missing):
    """
    Create (if needed) InterPro cvterms and attach them to a feature.

    Entries whose ``ipr_name`` is empty or equal to ``'noIPR'`` are ignored.
    Created cvterm ids are remembered in ``self._interpro_cache`` keyed by
    the IPR accession.

    :param ipr_terms: dict mapping an IPR accession to a dict holding at
        least the ``ipr_name`` and ``ipr_desc`` keys
    :param feature_id: id of the feature to annotate
    :param analysis_id: id of the analysis the annotation belongs to
    :param skip_missing: when true, terms for which no cvterm id could be
        obtained are warned about and skipped instead of raising

    :raises Exception: when no cvterm id could be obtained and
        ``skip_missing`` is false
    """
    for ipr_id, ipr_term in ipr_terms.items():
        ipr_name = ipr_term["ipr_name"]
        if not ipr_name or ipr_name == 'noIPR':
            continue

        # currently there is no InterPro Ontology OBO file so we can't
        # load the IPR terms that way, we need to just add them
        # as we encounter them. If the term already exists
        # we do not want to update it.
        # Check using IPRnumber (in case ipr_name changed at some point in time)
        if ipr_id in self._interpro_cache:
            cvterm_id = self._interpro_cache[ipr_id]
        else:
            cvterm_id = self.ci.create_cvterm(ipr_name, 'INTERPRO', 'INTERPRO', term_definition=ipr_term['ipr_desc'], accession=ipr_id)
            if not cvterm_id:
                if skip_missing:
                    warn('Could not find cvterm %s %s, skipping it', ipr_id, ipr_name)
                    continue
                # Both values must go through the % operator as one tuple;
                # formatting with ipr_id alone raises TypeError.
                raise Exception('Could not find cvterm %s %s' % (ipr_id, ipr_name))
            self._interpro_cache[ipr_id] = cvterm_id

        # Insert IPR terms into the feature_cvterm table
        # the default pub_id of 1 (NULL) is used. if the cvterm already exists then just skip adding it
        self._add_feat_cvterm_with_id(feature_id, cvterm_id)

        # Insert IPR terms into the analysisfeatureprop table but only if it
        # doesn't already exist
        self._add_analysis_feature(feature_id, analysis_id, cvterm_id, ipr_id)
def _match_feature(self, feature_id, re_name, query_type, organism_id, skip_missing=False):
    """
    Resolve a feature name from an input file into a cached feature id.

    :param feature_id: feature name as found in the input
    :param re_name: optional regex; when it matches, its first capturing
        group replaces the name before the lookup
    :param query_type: feature type, resolved with
        ``self.ci.get_cvterm_id(query_type, 'sequence')``
    :param organism_id: organism id, part of the cache key
    :param skip_missing: return ``None`` (after a warning) instead of
        raising when the feature is not in ``self._feature_cache``

    :return: the ``feature_id`` stored in the cache entry, or ``None``
    :raises RecordNotFoundError: feature unknown and ``skip_missing`` false
    """
    type_id = self.ci.get_cvterm_id(query_type, 'sequence')

    wanted = feature_id
    if re_name:
        captured = re.search(re_name, wanted)
        if captured:
            wanted = captured.group(1)

    key = (wanted, organism_id, type_id)
    if key in self._feature_cache:
        return self._feature_cache[key]['feature_id']

    if not skip_missing:
        raise RecordNotFoundError('Could not find feature with name "%s"' % wanted)

    warn('Could not find feature with name "%s", skipping it', wanted)
    return None
def cli(ctx, url=None, api_key=None, admin=False, **kwds):
    """Help initialize global configuration (in home directory)
    """
    # Interactive flow: refuse to run when a configuration file exists,
    # otherwise prompt for the connection settings until a ChadoInstance can
    # be built (or the user chooses to go on anyway), then write the file.
    click.echo("""Welcome to Chado's Chakin! (茶巾)""")
    if os.path.exists(config.global_config_path()):
        info(
            "Your chakin configuration already exists. Please edit it instead: %s"
            % config.global_config_path())
        return 0

    while True:
        # Check environment. Keys match the placeholders of CONFIG_TEMPLATE;
        # the dict literal is evaluated in order, so prompts appear in order.
        answers = {
            'dbhost': click.prompt("PGHOST"),
            'dbname': click.prompt("PGDATABASE"),
            'dbuser': click.prompt("PGUSER"),
            'dbpass': click.prompt("PGPASS", hide_input=True),
            'dbport': click.prompt("PGPORT"),
            'schema': click.prompt("PGSCHEMA"),
        }
        info("Testing connection...")
        try:
            # Only built to check that it can be built; kept bound until the
            # function returns.
            probe = ChadoInstance(
                dbhost=answers['dbhost'],
                dbname=answers['dbname'],
                dbuser=answers['dbuser'],
                dbpass=answers['dbpass'],
                dbport=answers['dbport'],
                dbschema=answers['schema'])
            # We do a connection test during startup.
            info("Ok! Everything looks good.")
            break
        except Exception as e:
            warn(
                "Error, we could not access the configuration data for your instance: %s",
                e)
            reply = click.prompt(
                "Continue despite inability to contact this instance? [y/n]")
            if reply in ('Y', 'y'):
                break

    config_path = config.global_config_path()
    if os.path.exists(config_path):
        warn("File %s already exists, refusing to overwrite." % config_path)
        return -1

    with open(config_path, "w") as f:
        f.write(CONFIG_TEMPLATE % answers)
        info(SUCCESS_MESSAGE)
def _parse_interpro_xml4(self, analysis_id, organism_id, xml, interpro_file, parse_go, re_name, query_type, skip_missing):
    """
    Walk an InterProScan 4 XML tree and load the results of each protein.

    :param analysis_id: id of the analysis the results belong to
    :param organism_id: id of the organism owning the features
    :param xml: root element of the parsed XML; its tag must start with
        ``EBIInterProScanResults`` or ``interpro_matches``
    :param interpro_file: accepted for interface compatibility; not used here
    :param parse_go: load GO terms too (needs a db named "GO")
    :param re_name: optional regex forwarded to ``_match_feature``
    :param query_type: feature type forwarded to ``_match_feature``
    :param skip_missing: skip unknown features/terms instead of raising

    :return: number of protein elements iterated
    :raises Exception: when the root tag is none of the supported ones
    """
    # Only look the GO db up (and only complain about its absence) when GO
    # loading was actually requested.
    go_db_id = False
    if parse_go:
        res = self.session.query(self.model.db).filter_by(name="GO")
        if res.count():
            go_db_id = res.one().db_id
        else:
            warn(
                "Goterm loading was requested but the GO schema is not installed in chado, skipping"
            )

    # If there is an EBI header then we need to skip that
    # and set our proteins array to be the second element of the array. This
    # occurs if results were generated with the online InterProScan tool.
    # if the XML starts in with the results then this happens when InterProScan
    # is used command-line and we can just use the object as is
    if re.search("^EBIInterProScanResults", xml.tag):
        proteins = xml[1]
    elif re.search("^interpro_matches", xml.tag):
        proteins = xml
    else:
        # Fail with a readable message rather than an unbound local further down
        raise Exception("Unexpected root element '{}' in InterProScan XML".format(xml.tag))

    total_count = 0
    for protein in proteins:
        total_count += 1

        # match the protein id with the feature name
        seqid = protein.get('id')

        # Remove _ORF from the sequence name; ids without that suffix are
        # used unchanged
        orf_match = re.search(r'^(.+)_\d+_ORF\d+.*', seqid)
        if orf_match:
            seqid = orf_match.group(1)

        # match the name of the feature in the XML file to a feature in Chado
        feature_id = self._match_feature(seqid, re_name, query_type, organism_id, skip_missing)
        if not feature_id:
            continue

        # Create an entry in the analysisfeature table and add the XML for this feature
        # to the analysisfeatureprop table
        analysisfeature_id = self._add_analysis_feature_ipr(feature_id, analysis_id, protein)
        if not analysisfeature_id:
            continue

        # parse the xml
        ipr_array = self._parse_feature_xml(protein, feature_id)
        ipr_terms = ipr_array['iprterms']

        # Add IPR terms
        self._load_ipr_terms(ipr_terms, feature_id, analysis_id, skip_missing)

        if parse_go and go_db_id:
            self._load_go_terms(ipr_array["goterms"], feature_id, analysis_id, go_db_id, skip_missing)

    return total_count
def _create_biomaterial(self, biomaterial_name, organism_id, analysis_id=None, biosourceprovider_id=None, dbxref_id=None, description=None):
    """
    Create a biomaterial, or update the one already stored under that name.

    When a biomaterial with this name exists, any of ``description``,
    ``dbxref_id`` and ``biosourceprovider_id`` that is not provided keeps
    the value currently stored. When no description is available but the
    analysis is found, a default description mentioning the analysis name
    is generated.

    :param biomaterial_name: name of the biomaterial
    :param organism_id: stored as ``taxon_id`` on a newly created biomaterial
    :param analysis_id: optional analysis used only to build a default
        description
    :param biosourceprovider_id: optional biosource provider id
    :param dbxref_id: optional dbxref id
    :param description: optional description

    :return: id of the created or updated biomaterial
    """
    # Check if biomaterial exist
    res_biomaterial = self.session.query(self.model.biomaterial).filter_by(name=biomaterial_name)
    biomaterial_id = ""
    if res_biomaterial.count():
        existing = res_biomaterial.one()
        biomaterial_id = existing.biomaterial_id
        # Do not update if not set and existing in DB
        if not description:
            description = existing.description
        if not dbxref_id:
            dbxref_id = existing.dbxref_id
        if not biosourceprovider_id:
            # Keep the stored provider; without this assignment the update
            # below would overwrite it with None
            biosourceprovider_id = existing.biosourceprovider_id

    analysis_name = ""
    if analysis_id:
        res_analysis = self.session.query(self.model.analysis).filter_by(analysis_id=analysis_id)
        if res_analysis.count():
            analysis_name = res_analysis.one().name
        else:
            warn("Analysis not found: will ignore")

    if not description and analysis_name:
        description = 'This biomaterial: ' + biomaterial_name + ', was created for the analysis: ' + analysis_name

    if not biomaterial_id:
        biomat = self.model.biomaterial()
        biomat.name = biomaterial_name
        biomat.description = description
        biomat.taxon_id = organism_id
        biomat.biosourceprovider_id = biosourceprovider_id
        biomat.dbxref_id = dbxref_id
        self.session.add(biomat)
        # Flush and refresh so that the generated biomaterial_id can be read
        self.session.flush()
        self.session.refresh(biomat)
        biomaterial_id = biomat.biomaterial_id
    else:
        self.session.query(self.model.biomaterial).filter_by(biomaterial_id=biomaterial_id).update({
            'description': description,
            'biosourceprovider_id': biosourceprovider_id,
            'dbxref_id': dbxref_id
        })

    return biomaterial_id
def cli(ctx, url=None, api_key=None, admin=False, **kwds):
    """Help initialize global configuration (in home directory)
    """
    # Steps: bail out when a configuration file is already there, ask for the
    # PostgreSQL settings until they are accepted, then write the file.
    click.echo("""Welcome to Chado's Chakin! (茶巾)""")
    if os.path.exists(config.global_config_path()):
        info("Your chakin configuration already exists. Please edit it instead: %s" % config.global_config_path())
        return 0

    settings_accepted = False
    while not settings_accepted:
        # Check environment
        host = click.prompt("PGHOST")
        database = click.prompt("PGDATABASE")
        user = click.prompt("PGUSER")
        password = click.prompt("PGPASS", hide_input=True)
        port = click.prompt("PGPORT")
        schema_name = click.prompt("PGSCHEMA")
        info("Testing connection...")
        try:
            # Only built to check that it can be built; kept bound until the
            # function returns.
            checked_instance = ChadoInstance(dbhost=host, dbname=database, dbuser=user, dbpass=password, dbport=port, dbschema=schema_name)
            # We do a connection test during startup.
            info("Ok! Everything looks good.")
            settings_accepted = True
        except Exception as e:
            warn("Error, we could not access the configuration data for your instance: %s", e)
            go_on = click.prompt("Continue despite inability to contact this instance? [y/n]")
            # Anything but an explicit yes asks for the settings again
            settings_accepted = go_on in ('Y', 'y')

    config_path = config.global_config_path()
    if os.path.exists(config_path):
        warn("File %s already exists, refusing to overwrite." % config_path)
        return -1

    rendered = CONFIG_TEMPLATE % {
        'dbhost': host,
        'dbname': database,
        'dbuser': user,
        'dbpass': password,
        'dbport': port,
        'schema': schema_name,
    }
    with open(config_path, "w") as f:
        f.write(rendered)
        info(SUCCESS_MESSAGE)
def _add_target(self, feat, target_str):
    """
    Add a featureloc for a GFF ``Target`` attribute value.

    ``target_str`` is split on single spaces and must hold a landmark name,
    a start, an end and optionally a strand (``+`` or ``-``). Anything else
    is reported with a warning and ignored.

    :param feat: feature the location is added for; also the key used to
        read ``self._featureloc_cache`` when computing the rank
    :param target_str: raw value of the Target attribute

    :return: None
    """
    target = target_str.split(' ')
    if len(target) not in (3, 4):
        warn('Malformed Target value: {}, skipping'.format(target_str))
        return

    strand = 1
    if len(target) == 4:
        if target[3] == '+':
            strand = 1
        elif target[3] == '-':
            strand = -1
        else:
            warn('Malformed Target value (bad strand): {}, skipping'.format(target_str))
            return

    landmark_str = target[0]

    # Non-numeric coordinates are one more kind of malformed value: report
    # and skip them like the others instead of letting ValueError escape.
    try:
        start = int(target[1])
        end = int(target[2])
    except ValueError:
        warn('Malformed Target value (bad coordinates): {}, skipping'.format(target_str))
        return

    # The rank is the number of locations already cached for this feature
    rank = 0
    if feat in self._featureloc_cache:
        rank = len(self._featureloc_cache[feat])

    # Cache keys are indexable and start with the feature name; take the
    # first entry whose name matches the landmark
    landmark = next(
        (self._feature_cache[key]['feature_id'] for key in self._feature_cache if key[0] == landmark_str),
        None,
    )

    if landmark is None:
        warn('Malformed Target value (unknown target): {}, skipping'.format(target_str))
        return

    self._do_add_featureloc(landmark, feat, rank, start, end, strand)
def go(self, input, organism_id, analysis_id, query_type='polypeptide', match_on_name=False, name_column=2, go_column=5, re_name=None, skip_missing=False):
    """
    Load GO annotation from a tabular file, in the same way as does the tripal_analysis_go module

    :type input: str
    :param input: Path to the input tabular file to load

    :type organism_id: int
    :param organism_id: Organism ID

    :type analysis_id: int
    :param analysis_id: Analysis ID

    :type query_type: str
    :param query_type: The feature type (e.g. \'gene\', \'mRNA\', 'polypeptide', \'contig\') of the query. It must be a valid Sequence Ontology term.

    :type match_on_name: bool
    :param match_on_name: Match features using their name instead of their uniquename

    :type name_column: int
    :param name_column: Column containing the feature identifiers (2, 3, 10 or 11; default=2).

    :type go_column: int
    :param go_column: Column containing the GO id (default=5).

    :type re_name: str
    :param re_name: Regular expression to extract the feature name from the input file (first capturing group will be used).

    :type skip_missing: bool
    :param skip_missing: Skip lines with unknown features or GO id instead of aborting everything.

    :rtype: dict
    :return: Number of inserted GO terms
    """

    if analysis_id and len(self.ci.analysis.get_analyses(analysis_id=analysis_id)) != 1:
        raise Exception("Could not find analysis with id '{}'".format(analysis_id))

    if len(self.ci.organism.get_organisms(organism_id=organism_id)) != 1:
        raise Exception("Could not find organism with id '{}'".format(organism_id))

    seqterm = self.ci.get_cvterm_id(query_type, 'sequence')

    # Cache all possibly existing features
    self._reset_cache()
    self._init_feature_cache(organism_id, seqterm, match_on_name)

    # Cache analysisfeature content for given analysis_id
    self._init_analysisfeature_cache(analysis_id)

    self._init_featcvterm_cache()

    # Cache all existing cvterms from GO cv
    db = 'GO'
    self.ci._preload_dbxref2cvterms(db)

    count_ins = 0

    # Parse the tab file
    with open(input) as in_gaf:
        rd = csv.reader(in_gaf, delimiter=str("\t"))
        for row in rd:
            if not row:
                # csv.reader yields an empty list for a blank line
                continue

            if row[0] and row[0][0] in ('!', '#'):
                # skip header
                continue

            term = row[go_column - 1]
            term_sp = term.split(':')
            if len(term_sp) != 2:
                raise Exception('Malformed term "%s"' % term)
            term_db, term_acc = term_sp

            feat_id = row[name_column - 1]
            feat_id = self._match_feature(feat_id, re_name, query_type, organism_id, skip_missing)
            if skip_missing and feat_id is None:
                continue

            try:
                term_id = self.ci.get_cvterm_id(term_acc, term_db)
            except chado.RecordNotFoundError:
                term_id = None

            if not term_id:
                if skip_missing:
                    warn('Could not find term with name "%s", skipping it', term_acc)
                    continue
                else:
                    raise Exception('Could not find term with name "%s"' % term_acc)

            # Add feature<->cvterm association
            self._add_feat_cvterm_with_id(feat_id, term_id)

            # Associate the feature to the analysis
            self._add_analysis_feature(feat_id, analysis_id, term_id, term)

            # Counts every processed line, whether or not the helpers above
            # had anything new to store
            count_ins += 1

    self.session.commit()

    self._reset_cache()

    return {'inserted': count_ins}
def _load_gff_feature_with_children(self, rec, f, analysis_id, organism_id, re_protein_capture, re_protein, protein_id_attr, parent=None, no_seq_compute=False):
    """
    Load one GFF feature and, recursively, all of its sub-features.

    Features whose type is listed in ``self._blacklisted_cvterms`` are
    skipped with a warning. For an mRNA, the transcript sequence is built
    from its exons (or, failing that, its CDS) when the landmark sequence is
    usable, and a polypeptide feature spanning the CDS is created when the
    mRNA has CDS children but no polypeptide child.

    :param rec: record holding the landmark sequence in ``rec.seq``
    :param f: the feature to load (``type``, ``qualifiers``,
        ``sub_features``, ``strand`` and ``location`` are read)
    :param analysis_id: analysis id forwarded to ``_add_feature_with_attr``
    :param organism_id: organism id forwarded to ``_add_feature_with_attr``
    :param re_protein_capture: regex applied to the mRNA uniquename when
        ``re_protein`` is set
    :param re_protein: replacement string producing the polypeptide
        uniquename
    :param protein_id_attr: name of a qualifier holding the polypeptide
        uniquename, read on the mRNA first and then on its CDS
    :param parent: feature_id of the parent feature, if any
    :param no_seq_compute: do not compute mRNA/polypeptide sequences

    :return: None
    """
    # Be tolerant for proteins (shameless hard coding)
    if f.type == 'protein':
        f.type = 'polypeptide'

    if f.type in self._blacklisted_cvterms:
        # Name the feature in the warning as soon as it has at least one ID
        if 'ID' in f.qualifiers and f.qualifiers['ID']:
            warn("WARNING: skipping feature %s of unknown type %s" % (f.qualifiers['ID'][0], f.type))
        else:
            warn("WARNING: skipping feature of unknown type %s" % (f.type))
        return

    full_transcript_seq = None
    if f.type == 'mRNA':
        seq_exons = []
        seq_cds = []
        min_cds = None
        max_cds = None

        detected_protein_id = None
        if protein_id_attr:
            if protein_id_attr in f.qualifiers and f.qualifiers[protein_id_attr]:
                detected_protein_id = f.qualifiers[protein_id_attr][0]

        # To compute mRNA and polypeptide
        for subrna in f.sub_features:
            if subrna.type == 'CDS':
                seq_cds.append(rec.seq[subrna.location.nofuzzy_start:subrna.location.nofuzzy_end])
                if min_cds is None or subrna.location.start < min_cds:
                    min_cds = subrna.location.start
                if max_cds is None or subrna.location.end > max_cds:
                    max_cds = subrna.location.end
                if protein_id_attr and not detected_protein_id:
                    if protein_id_attr in subrna.qualifiers and subrna.qualifiers[protein_id_attr]:
                        detected_protein_id = subrna.qualifiers[protein_id_attr][0]
            if subrna.type == 'exon':
                seq_exons.append(rec.seq[subrna.location.nofuzzy_start:subrna.location.nofuzzy_end])

        # A sequence starting with "??????????" is treated as a placeholder
        # and not used
        if not no_seq_compute and len(rec.seq) > 0 and str(rec.seq)[0:10] != "??????????":
            if seq_exons:
                full_transcript_seq = reduce(operator.add, seq_exons)
            elif seq_cds:
                full_transcript_seq = reduce(operator.add, seq_cds)
            # An mRNA without exon nor CDS has no sequence to reverse
            if full_transcript_seq is not None and f.strand == -1:
                full_transcript_seq = full_transcript_seq.reverse_complement()

    if full_transcript_seq is not None:
        added_feat = self._add_feature_with_attr(rec, f, analysis_id, organism_id, residues=str(full_transcript_seq), parent=parent)
    else:
        added_feat = self._add_feature_with_attr(rec, f, analysis_id, organism_id, parent=parent)

    mrna_has_polypeptide = False
    for subf in f.sub_features:
        self._load_gff_feature_with_children(rec, subf, analysis_id, organism_id, re_protein_capture, re_protein, protein_id_attr, parent=added_feat['feature_id'], no_seq_compute=no_seq_compute)
        if f.type == 'mRNA':
            mrna_has_polypeptide = mrna_has_polypeptide or (subf.type == 'polypeptide')

    # Create a polypeptide feature
    if f.type == 'mRNA' and not mrna_has_polypeptide and min_cds is not None and max_cds is not None:
        if re_protein:
            pep_uname = re.sub(re_protein_capture, re_protein, added_feat['uniquename'])
        elif detected_protein_id:
            pep_uname = detected_protein_id
        else:
            pep_uname = added_feat['uniquename'] + '-protein'
        polypeptide = SeqFeature(FeatureLocation(min_cds, max_cds), type="polypeptide", strand=f.location.strand, qualifiers={'ID': [pep_uname], 'Name': [added_feat['name']]})
        # subrna is the last sub-feature visited by the loop above (there is
        # at least one, since a CDS was found)
        if 'source' in subrna.qualifiers:
            polypeptide.qualifiers['source'] = subrna.qualifiers['source']

        protein_seq = None
        if not no_seq_compute and len(rec.seq) > 0 and str(rec.seq)[0:10] != "??????????":
            full_cds_seq = reduce(operator.add, seq_cds)
            if f.strand == -1:
                full_cds_seq = full_cds_seq.reverse_complement()
            protein_seq = str(full_cds_seq.translate())

        self._add_feature_with_attr(rec, polypeptide, analysis_id, organism_id, residues=protein_seq, parent=added_feat['feature_id'], parent_rel='derives_from')
def load_gff(self, gff, analysis_id, organism_id, landmark_type=None, re_protein=None, re_protein_capture="^(.*?)$", fasta=None, no_seq_compute=False, quiet=False, add_only=False, protein_id_attr=None):
    """
    Load features from a gff file

    :type gff: str
    :param gff: Path to the GFF file to load

    :type analysis_id: int
    :param analysis_id: Analysis ID

    :type organism_id: int
    :param organism_id: Organism ID

    :type landmark_type: str
    :param landmark_type: Type of the landmarks (will speed up loading if provided, e.g. contig, should be a term of the Sequence ontology)

    :type re_protein: str
    :param re_protein: Replacement string for the protein name using capturing groups defined by --re_protein_capture

    :type re_protein_capture: str
    :param re_protein_capture: Regular expression to capture groups in mRNA name to use in --re_protein (e.g. "^(.*?)-R([A-Z]+)$", default="^(.*?)$")

    :type protein_id_attr: str
    :param protein_id_attr: Attribute containing the protein uniquename. It is searched at the mRNA level, and if not found at CDS level.

    :type fasta: str
    :param fasta: Path to a Fasta containing sequences for some features. When creating a feature, if its sequence is in this fasta file it will be loaded. Otherwise for mRNA and polypeptides it will be computed from the genome sequence (if available), otherwise it will be left empty.

    :type no_seq_compute: bool
    :param no_seq_compute: Disable the computation of mRNA and polypeptides sequences based on genome sequence and positions.

    :type quiet: bool
    :param quiet: Hide progress information

    :type add_only: bool
    :param add_only: Use this flag if you're not updating existing features, but just adding new features to the selected analysis and organism. It will speedup loading, and reduce memory usage, but might produce errors in case of already existing feature.

    :rtype: dict
    :return: Number of top-level features loaded
    """

    if len(self.ci.analysis.get_analyses(analysis_id=analysis_id)) != 1:
        raise Exception("Could not find analysis with id '{}'".format(analysis_id))

    if len(self.ci.organism.get_organisms(organism_id=organism_id)) != 1:
        raise Exception("Could not find organism with id '{}'".format(organism_id))

    if protein_id_attr and re_protein:
        raise Exception("--protein_id_attr and --re_protein cannot be used at the same time.")

    self.cache_existing = not add_only

    # Get possible landmarks
    landmarks = self.session.query(self.model.feature.name, self.model.feature.uniquename, self.model.feature.feature_id, self.model.feature.type_id, self.model.feature.organism_id) \
        .filter_by(organism_id=organism_id)
    if landmark_type:
        # Filter by landmark type if provided (else we look for all features)
        landmark_type_id = self.ci.get_cvterm_id(landmark_type, 'sequence')
        landmarks = landmarks.filter(self.model.feature.type_id == landmark_type_id)

    # Maps a landmark name or uniquename to the list of matching feature ids
    self._landmark_cache = {}
    for lm in landmarks:
        # There may be multiple landmarks with the same name
        ids_for_name = self._landmark_cache.setdefault(lm.name, [])
        if lm.feature_id not in ids_for_name:
            ids_for_name.append(lm.feature_id)

        # Also look for uniquename
        ids_for_uniquename = self._landmark_cache.setdefault(lm.uniquename, [])
        if lm.feature_id not in ids_for_uniquename:
            ids_for_uniquename.append(lm.feature_id)

    examiner = GFF.GFFExaminer()
    # The context manager closes the file even if the examiner raises
    with open(gff) as gff_handle:
        gff_limits = examiner.available_limits(gff_handle)

    # Check that we have all the cvterms in the db
    self._blacklisted_cvterms = []
    for feat_type in gff_limits['gff_type']:
        type_to_check = feat_type[0]
        # Be tolerant for proteins (shameless hard coding)
        if type_to_check == 'protein':
            type_to_check = 'polypeptide'

        # Will raise an exception if not present + keep value in cache
        try:
            self.ci.get_cvterm_id(type_to_check, 'sequence', True)
        except chado.RecordNotFoundError:
            if type_to_check not in self._blacklisted_cvterms:
                warn("WARNING: will skip features of unknown type: %s", type_to_check)
                self._blacklisted_cvterms.append(type_to_check)

    # Read optional fasta file
    self._fasta_sequence_cache = {}
    if fasta:
        for record in SeqIO.parse(fasta, "fasta"):
            self._fasta_sequence_cache[record.id] = str(record.seq)

    # Check that all landmarks are there
    for seq_id in gff_limits['gff_id']:
        seq_id = seq_id[0]
        if seq_id not in self._landmark_cache:
            if landmark_type:
                # Landmark does not exist yet, but we know how to create it
                lm = SeqFeature(FeatureLocation(0, 1), type=landmark_type, qualifiers={'ID': [seq_id], 'Name': [seq_id]})
                if seq_id in self._fasta_sequence_cache:
                    added_feat = self._add_feature_with_attr(None, lm, analysis_id, organism_id, have_loc=False, residues=self._fasta_sequence_cache[seq_id])
                else:
                    added_feat = self._add_feature_with_attr(None, lm, analysis_id, organism_id, have_loc=False)
                self._landmark_cache[seq_id] = [added_feat['feature_id']]
            else:
                raise Exception("Could not find landmark named '{}', add --landmark_type to create it".format(seq_id))
        elif len(self._landmark_cache[seq_id]) > 1:
            raise Exception("Found {} landmarks with same name '{}'".format(len(self._landmark_cache[seq_id]), seq_id))

    count_ins = 0

    for rec in GFF.parse(gff):

        # Preload landmark seq to compute some seqs on it
        # We compare to ????... as the gff parser will populate rec.seq with a fake sequence based on the size from "sequence-region" header
        if not no_seq_compute:
            if rec.id in self._fasta_sequence_cache:
                rec.seq = Seq.Seq(self._fasta_sequence_cache[rec.id])
                del self._fasta_sequence_cache[rec.id]  # Save a little memory
            elif len(rec.seq) == 0 or str(rec.seq)[0:10] == "??????????":
                seq_res = self.session.query(self.model.feature.residues) \
                    .filter(self.model.feature.uniquename == rec.id)

                if landmark_type:
                    seq_res = seq_res.filter(self.model.feature.type_id == landmark_type_id)

                seq_res = seq_res.all()

                # Only use the stored residues when the match is unambiguous
                if len(seq_res) == 1 and seq_res[0].residues:
                    rec.seq = Seq.Seq(seq_res[0].residues)

        # Set a custom attr to store the chado feature_id
        rec._chado_feature_id = self._landmark_cache[rec.id][0]
        if not quiet:
            print("Loading features on {}".format(rec.id))

        for f in rec.features:

            self._load_gff_feature_with_children(rec, f, analysis_id, organism_id, re_protein_capture, re_protein, protein_id_attr, no_seq_compute=no_seq_compute)
            count_ins += 1

            if not quiet:
                print("Inserted feature #{}".format(count_ins))

    self._update_rel_ranks()

    self.session.commit()

    self._reset_cache()

    return {'inserted': count_ins}