def get_best_record_key(records):
    """Pick the "best" filtered BLAST hit and return its identifying keys.

    "Best" means the highest-bitscore hit whose protein has the most
    confident SCCS entry.  Records are visited in descending bitscore order;
    the first protein whose best SCCS confidence is exactly 1.0 wins
    immediately, otherwise the record with the highest confidence seen wins
    (ties keep the higher-bitscore record because only a strictly greater
    confidence replaces the current best).

    NOTE(review): another definition of get_best_record_key with the same
    logic is visible in this source; if both live in one module the later
    one shadows the earlier -- consider keeping only one.

    Args:
        records: non-empty list of LocalHumanFiltered objects (attributes
            read: query_id, hit_id, hit_protein_id, hit_experiment_id,
            hit_evalue, hit_bitscore).  The list is sorted IN PLACE.

    Returns:
        A 4-tuple.  On a confidence-1.0 match:
        (query_id, protein.sequence_key, protein.id, protein.experiment_key).
        Otherwise, for the best record found:
        (query_id, int(hit_id), int(hit_protein_id), int(hit_experiment_id)).

    Raises:
        Exception: if records is empty, if a record's protein cannot be
            fetched, or if no record yields an SCCS confidence above 0.0.
    """
    if not records:
        raise Exception("Given records list contains no records")

    # DEBUG: dump every candidate hit before selection.
    for rec in records:
        print("\tHit: {0}, {1}, {2}, eval: {3} bitscore {4}".format(
            rec.hit_id, rec.hit_protein_id, rec.hit_experiment_id,
            rec.hit_evalue, rec.hit_bitscore))

    session = ScopedSession()

    # Highest BLAST bitscore first, so earlier records win confidence ties.
    records.sort(key=lambda record: record.hit_bitscore, reverse=True)

    best_sccs_conf = 0.0
    best_record = None
    for record in records:
        # Get protein ORM object matching record
        protein = session.query(Protein).get(record.hit_protein_id)
        if not protein:
            raise Exception(
                "No protein fetched for record protein ID {0}".format(
                    record.hit_protein_id))

        try:
            prot_best_sccs = _get_best_sccs(protein)
        except Exception as e:
            # Best effort: report the failure and move on to the next hit.
            print(e)
            continue

        confidence = float(prot_best_sccs.confidence)
        if confidence == 1.0:
            # Cannot do better than full confidence: stop searching.
            return (record.query_id, protein.sequence_key, protein.id,
                    protein.experiment_key)
        elif confidence > best_sccs_conf:
            best_sccs_conf = confidence
            best_record = record

    if best_record is None:
        raise Exception("Could not find a best protein sequence")

    hit_id, hit_protein_id, hit_experiment_id = map(
        int, (best_record.hit_id, best_record.hit_protein_id,
              best_record.hit_experiment_id))
    # Report the query id of the chosen record, not of whichever record the
    # loop happened to visit last.
    return (best_record.query_id, hit_id, hit_protein_id, hit_experiment_id)
def get_best_homolog(records):
    """Return the hit with the best BLAST score and best structural coverage.

    Records are sorted in place by ``hit_bitscore`` (highest first) and
    scanned in that order.  A protein whose ``structure_score`` is exactly
    1.0 is selected immediately; otherwise the protein with the highest
    score strictly above 0.0 is selected (ties keep the higher-bitscore
    record).  Records whose protein cannot be fetched, or whose scoring
    raises LookupError, are reported on stdout and skipped.

    Args:
        records: non-empty list of Filtered objects (attributes read:
            query_id, hit_id, hit_protein_id, hit_experiment_id, hit_evalue,
            hit_bitscore).  The list is sorted IN PLACE.

    Returns:
        A HomologComparisonProtein built from the chosen record, the ids
        returned by ``get_structure_ids`` for its protein, and the number of
        that protein's domains.

    Raises:
        LookupError: if records is empty or no protein scores above 0.0.
    """
    if not records:
        raise LookupError("Given records list contains no records")

    session = ScopedSession()

    # Sort records by blast score (record.hit_bitscore), highest first
    records.sort(key=lambda record: record.hit_bitscore, reverse=True)

    best_struct_score = 0.0
    best_record = None
    best_protein = None
    for record in records:
        # Get protein ORM object matching record
        protein = session.query(Protein).get(record.hit_protein_id)
        if not protein:
            print("No protein fetched for query {0} protein ID {1}. Skipping protein..".format(
                record.query_id, record.hit_protein_id))
            continue

        try:
            protein_score = structure_score(protein)
        except LookupError as err:
            print("Exception {0}. Skipping protein {1}..".format(err, protein.id))
            continue

        if protein_score == 1.0:
            # A score of 1.0 cannot be beaten: take this hit and stop.
            best_record = record
            best_protein = protein
            break
        elif protein_score > best_struct_score:
            best_struct_score = protein_score
            best_record = record
            best_protein = protein

    if best_record is None:
        raise LookupError("Could not find a protein with structure")

    # Get structure list for best protein
    hit_structures = get_structure_ids(best_protein)

    # HomologComparisonProtein prototype:
    # (self, query_id, hit_id, hit_protein, hit_experiment, evalue, bitscore,
    #  structures, num_domains)
    return HomologComparisonProtein(
        best_record.query_id,
        best_record.hit_id,
        best_record.hit_protein_id,
        best_record.hit_experiment_id,
        best_record.hit_evalue,
        best_record.hit_bitscore,
        hit_structures,
        len(best_protein.domains),
    )
def get_best_record_key(records):
    """Pick the "best" filtered BLAST hit and return its identifying keys.

    "Best" means the highest-bitscore hit whose protein has the most
    confident SCCS entry.  Records are visited in descending bitscore order;
    the first protein whose best SCCS confidence is exactly 1.0 wins
    immediately, otherwise the record with the highest confidence seen wins
    (ties keep the higher-bitscore record because only a strictly greater
    confidence replaces the current best).

    NOTE(review): another definition of get_best_record_key with the same
    logic is visible in this source; if both live in one module the later
    one shadows the earlier -- consider keeping only one.

    Args:
        records: non-empty list of LocalHumanFiltered objects (attributes
            read: query_id, hit_id, hit_protein_id, hit_experiment_id,
            hit_evalue, hit_bitscore).  The list is sorted IN PLACE.

    Returns:
        A 4-tuple.  On a confidence-1.0 match:
        (query_id, protein.sequence_key, protein.id, protein.experiment_key).
        Otherwise, for the best record found:
        (query_id, int(hit_id), int(hit_protein_id), int(hit_experiment_id)).

    Raises:
        Exception: if records is empty, if a record's protein cannot be
            fetched, or if no record yields an SCCS confidence above 0.0.
    """
    if not records:
        raise Exception("Given records list contains no records")

    # DEBUG: dump every candidate hit before selection.
    for rec in records:
        print("\tHit: {0}, {1}, {2}, eval: {3} bitscore {4}".format(
            rec.hit_id, rec.hit_protein_id, rec.hit_experiment_id,
            rec.hit_evalue, rec.hit_bitscore))

    session = ScopedSession()

    # Highest BLAST bitscore first, so earlier records win confidence ties.
    records.sort(key=lambda record: record.hit_bitscore, reverse=True)

    best_sccs_conf = 0.0
    best_record = None
    for record in records:
        # Get protein ORM object matching record
        protein = session.query(Protein).get(record.hit_protein_id)
        if not protein:
            raise Exception(
                "No protein fetched for record protein ID {0}".format(
                    record.hit_protein_id))

        try:
            prot_best_sccs = _get_best_sccs(protein)
        except Exception as e:
            # Best effort: report the failure and move on to the next hit.
            print(e)
            continue

        confidence = float(prot_best_sccs.confidence)
        if confidence == 1.0:
            # Cannot do better than full confidence: stop searching.
            return (record.query_id, protein.sequence_key, protein.id,
                    protein.experiment_key)
        elif confidence > best_sccs_conf:
            best_sccs_conf = confidence
            best_record = record

    if best_record is None:
        raise Exception("Could not find a best protein sequence")

    hit_id, hit_protein_id, hit_experiment_id = map(
        int, (best_record.hit_id, best_record.hit_protein_id,
              best_record.hit_experiment_id))
    # Report the query id of the chosen record, not of whichever record the
    # loop happened to visit last.
    return (best_record.query_id, hit_id, hit_protein_id, hit_experiment_id)
def _get_pdb_struct(domain,): # Returns a triple (pdb id, chain, structure ORM object) for a PDB (/known struct) domain session = ScopedSession() # Check for objects needed to query for pdb struct. if domain.parent_id == None or domain.sccs == None: raise Exception("Domain {0} has no sccs record (cannot find PDB)".format(domain.id)) # Query to get the PDBSeqRes record matching the domain, return structure info psr = session.query(PDBSeqRes).filter_by(sequence_key=domain.parent_id[3:], chain=domain.sccs.chain).first() if not psr: raise Exception( "No PDB record found for domain {0} (seq {1}, chain {2})".format( domain.id, domain.parent_id[3:], domain.sccs.chain ) ) elif psr.structure == None: raise Exception("Domain {0} PDBSeqRes entry has no structure".format(domain.id)) return (psr.pdb.pdbId, psr.chain, psr.structure)
def db_add(
    domain_id,
    astral_id,
    astral_sid,
    domain_start,
    domain_stop,
    astral_start,
    astral_stop,
    pdb_id,
    chain,
    overlap,
    session=None,
):
    """Build a DomainAstralOverlap ORM object and flush it to the database.

    Args:
        domain_id, astral_id, astral_sid, domain_start, domain_stop, pdb_id,
        chain, overlap: passed straight through to DomainAstralOverlap.
        astral_start, astral_stop: passed as ints only when BOTH are given;
            if either is None, neither is passed to the constructor.
        session: session used for the push; when falsy a ScopedSession is
            created (and a notice is printed).

    Returns:
        The DomainAstralOverlap object.  It is returned even when the flush
        hit an IntegrityError and was rolled back.

    Raises:
        Any non-IntegrityError exception raised by ``session.flush()`` is
        reported on stdout and re-raised.
    """
    # If session not given, create scoped session for the DB push
    if not session:
        print("db_add:: No session provided. Creating scoped session")
        session = ScopedSession()

    fields = dict(
        domain_id=domain_id,
        astral_id=astral_id,
        astral_sid=astral_sid,
        domain_start=domain_start,
        domain_stop=domain_stop,
        pdb_id=pdb_id,
        chain=chain,
        overlap=overlap,
    )
    # The astral range is only stored when both ends are known.
    if astral_start is not None and astral_stop is not None:
        fields["astral_start"] = int(astral_start)
        fields["astral_stop"] = int(astral_stop)

    dtoa_obj = DomainAstralOverlap(**fields)
    session.add(dtoa_obj)
    try:
        session.flush()
    except IntegrityError:
        # Duplicate row: undo the failed flush and carry on.
        print("DomainToAstral {0} object already in database. Skipping".format(dtoa_obj))
        session.rollback()
    except Exception:
        print("Error in pushing DomainToAstral object {0} to database".format(dtoa_obj))
        raise
    return dtoa_obj
def pairwise_max_sum(self, source_list, target_list):
    """Sum, over source structures, the best comparison z-score against any target.

    For every non-None source structure, each non-None target structure is
    looked up in AstralComparison, first as (prediction=source,
    experiment=target) and, if that finds nothing, with the two roles
    swapped.  The largest ``zscore`` found for that source is added to the
    total.  The per-source maximum starts at 0.0, so a source with no
    comparison rows (or only non-positive z-scores) contributes nothing.

    ``self`` is not used by this method.

    Args:
        source_list: iterable of source structure identifiers; None entries
            are skipped.
        target_list: iterable of target structure identifiers; None entries
            are skipped.

    Returns:
        float: the sum of per-source maximum z-scores.
    """
    session = ScopedSession()

    def _fetch_comparison(prediction, experiment):
        # First AstralComparison row for this orientation, or None.
        return session.query(AstralComparison).filter_by(
            prediction=prediction, experiment=experiment).first()

    total = 0.0
    # Compare each source structure to all target structures
    for source_struct in source_list:
        if source_struct is None:
            continue

        best_for_source = 0.0
        for target_struct in target_list:
            if target_struct is None:
                continue

            # Try source->target first; fall back to target->source.
            comparison = (_fetch_comparison(source_struct, target_struct)
                          or _fetch_comparison(target_struct, source_struct))
            if not comparison:
                continue

            zscore = float(comparison.zscore)
            if zscore > best_for_source:
                best_for_source = zscore

        total += best_for_source
    return total
def output_db(list):
    """Store HomologComparisonProteinPair objects as HomologStructAllVAll rows.

    Each pair becomes one row (source_id, target_id, score) that is added to
    a ScopedSession and flushed individually, so one duplicate does not
    prevent the remaining pairs from being written.

    Args:
        list: iterable of pair objects exposing ``source.query_id``,
            ``target.query_id`` and ``similarity``.

    Raises:
        Any exception other than IntegrityError raised by the flush is
        reported on stdout and re-raised; IntegrityError is reported, rolled
        back, and the pair is skipped.
    """
    session = ScopedSession()
    for pair in list:
        hsa_obj = HomologStructAllVAll(
            source_id=pair.source.query_id,
            target_id=pair.target.query_id,
            score=pair.similarity,
        )
        session.add(hsa_obj)
        try:
            session.flush()
        except IntegrityError:
            print("Object {0} already in database. Rolling back and skipping..".format(hsa_obj))
            session.rollback()
        except:
            # Everything else is logged and then re-raised unchanged.
            print("Could not add protein comparison object {0} to DB".format(hsa_obj))
            raise
def _setup_db(self, ): if not self.session: from hpf.hddb.db import ScopedSession self.session = ScopedSession()
class HpfHHPredResultFile(HHPredResultFile):
    """HHPredResultFile bound to the HPF database.

    Requires an hhpred version key and parses ``sequence_key`` out of the
    file's ``query`` field.

    Exposes a ``dbo`` property: the matching hpf.hddb.db ORM object, fetched
    on first access, or None when this result file is not in the DB yet.
    ``db_store`` and ``db_store_hits`` push the file and its hits to the DB.
    A session is kept on the instance to avoid creating one per call.
    """

    def __init__(self, version_key=None, *args, **kwargs):
        """Initialise the parent parser, then record version and sequence keys.

        Raises:
            Exception: if ``version_key`` is missing or falsy.
        """
        if not version_key:
            raise Exception("Hpf HHPredResultFile must be provided with a version key. See hhpred_version DB table")
        super(HpfHHPredResultFile, self).__init__(*args, **kwargs)
        self.version_key = version_key
        self.sequence_key = self._parse_seqkey(self.query)
        self.session = None
        self._dbo = None

    def _get_dbo(self, ):
        # While the cached value is falsy, every access queries the DB again.
        if not self._dbo:
            self._dbo = self._fetch_dbo()
        return self._dbo

    def _set_dbo(self, value):
        self._dbo = value

    dbo = property(_get_dbo, _set_dbo)

    def _parse_seqkey(self, str):
        """Parse the HPF sequence key out of an HHPred 'query' string.

        Expected form:
        'hpf|<seqkey>|<proteinkey>|<experimentkey>|<experiment>'.
        Returns the second '|'-separated field as an int.
        """
        fields = str.split('|')
        return int(fields[1])

    def _setup_db(self, ):
        """Create ``self.session`` (a ScopedSession) if it is not set yet."""
        if self.session:
            return
        from hpf.hddb.db import ScopedSession
        self.session = ScopedSession()

    def _fetch_dbo(self, ):
        """Look up an existing DB entry with this sequence key and version key.

        Returns the first matching hpf.hddb.db.HHPredResultFile row, or None
        when there is no match.
        """
        from hpf.hddb.db import HHPredResultFile as HHPRF
        if not self.session:
            self._setup_db()
        matches = self.session.query(HHPRF).filter_by(
            sequence_key=self.sequence_key,
            version_key=self.version_key,
        )
        return matches.first()

    def db_store(self, ):
        """Push this result file to the HPF database and return its ORM object.

        If the file is already represented in the DB, a notice is printed and
        the existing object is returned.  Otherwise an ORM object is built via
        hpf.hddb.db.HHPredRFFactory, pushed with push_to_db, and stored as
        ``dbo``.
        """
        if not self.session:
            self._setup_db()

        if self.dbo:
            print("HHPredResultFile already represented in DB: {0}. Returning...".format(self.dbo))
            return self.dbo

        from hpf.hddb.db import HHPredRFFactory, push_to_db
        hrf_dbo = HHPredRFFactory().create(self, debug=self.debug)
        push_to_db(self.session, hrf_dbo, exception_str="Failed to add {0} to the DB".format(hrf_dbo))
        self.dbo = hrf_dbo
        return self.dbo

    def db_store_hits(self, ):
        """Push every hit in ``self.hits`` to the DB via each hit's ``db_store``.

        The result file itself is stored first if it is not in the DB yet.

        Returns:
            List with the return value of ``hit.db_store`` for each hit.

        Raises:
            Warning: if there are no hits, or if the number of stored hits
                differs from the number parsed.
        """
        if not self.hits:
            raise Warning("No hit objects to push to DB")
        if not self.dbo:
            self.db_store()
        if not self.session:
            self._setup_db()

        rf_key = self.dbo.id
        hit_dbos = [hit.db_store(self.session, rf_key) for hit in self.hits]

        if self.debug:
            print("Hits parsed: {0}, Hits in DB: {1}".format(len(self.hits), len(hit_dbos)))
        # NOTE(review): hit_dbos gets exactly one entry per element of
        # self.hits, so this check looks like it can only fire if self.hits
        # is mutated while hits are being stored -- confirm intent.
        if len(self.hits) != len(hit_dbos):
            raise Warning("Number of hits parsed from file does not match number added to DB")
        return hit_dbos