def _store_cluster_centers(self, cluster_centers, sequence_key, convergence_key):
    """Persist cluster centers and their structures to the HPF DB.

    For every center a Structure row (structure_type "decoy") holding the
    center's atom record is pushed first (hpf.structure); a RosettaCluster row
    (hpf.rosetta_cluster) pointing at that structure and at the given
    convergence entry is pushed second.

    Parameters:
      cluster_centers - iterable of cluster center objects; each must provide
                        get_atom_record(), pdb_file, index, size and rank
      sequence_key    - sequence key stored on every Structure row
      convergence_key - convergence key stored on every RosettaCluster row

    Returns a dict of the pushed RosettaCluster DBOs, {center.index => RosettaCluster obj}
    Raises Exception when a center yields no atom record (None or empty string)
    """
    from hpf.hddb.db import Structure, RosettaCluster
    self._check_session()

    stored = {}
    for center in cluster_centers:
        record = center.get_atom_record()
        if record in (None, ""):
            raise Exception("Failed to get atom record for center {0}".format(center))

        # The structure goes in first: its DB id is needed to link the cluster row
        structure_dbo = push_to_db(
            self.session,
            Structure(
                sequence_key=sequence_key,
                structure_type="decoy",
                comment=center.pdb_file,
                text=record,
            ),
            exception_str="Failed to add Structure for center {0} to DB".format(center),
        )

        cluster_dbo = RosettaCluster(
            index=center.index,
            size=center.size,
            rank=center.rank,
            convergence_key=convergence_key,
            structure_key=structure_dbo.id,
        )
        push_to_db(
            self.session,
            cluster_dbo,
            exception_str="Failed to add RosettaCluster for center {0} to DB".format(center),
        )

        stored[center.index] = cluster_dbo
    return stored
def astral_to_domain(experiment_id, threshold=0.5, dbstore=False):
    """Compute astral-to-domain overlap for the known-type domains of an experiment.

    Fetches all domains of type 'psiblast' or 'fold_recognition' that belong to
    proteins of the given experiment, computes the overlap between each of the
    domain's astrals and the domain's parent region, prints a summary and
    optionally stores the overlaps in the hpf DB (table astral_domain_overlap,
    ORM: AstralDomainOverlap).

    Parameters:
      experiment_id - experiment key used to select proteins (and their domains)
      threshold     - minimum overlap ratio (converted with float()) an overlap
                      needs in order to be stored
      dbstore       - when True, overlaps >= threshold are pushed to the DB

    Astrals for which the overlap calculation raises ValueError are reported and
    skipped; any other error is reported and re-raised. The DB session opened
    here is closed before returning, also when an error propagates.
    """
    from hpf.hddb.db import Session, push_to_db, AstralDomainOverlap, Protein, Domain
    from hpf.structure_comparison.overlap import overlap
    from hpf.structure_comparison.astral_util import get_astrals, get_astral_startstop, parse_astral_chain

    session = Session()
    try:
        domains = (
            session.query(Domain)
            .join(Protein)
            .filter(Protein.experiment_key == experiment_id)
            .filter(Domain.domain_type.in_(['psiblast', 'fold_recognition']))
            .all()
        )
        print("Considering {0} domains to compute astral overlap for".format(len(domains)))

        # For each domain, get representative astrals, calculate overlap, and store (optional)
        missing_astral = 0
        for domain in domains:
            domain_pdb_start = domain.region.parent_start
            domain_pdb_stop = domain.region.parent_stop

            astrals = get_astrals(domain, session)
            if not astrals:
                missing_astral += 1
                continue

            for astral in astrals:
                try:
                    (astral_start, astral_stop) = get_astral_startstop(astral)
                    overlap_ratio = overlap(astral_start, astral_stop, domain_pdb_start, domain_pdb_stop)
                except ValueError:
                    print("Negative overlap for domain {0} ({1}-{2}), Astral {3} (PDB {4}{5})".format(
                        domain.id, domain_pdb_start, domain_pdb_stop, astral.sid, astral.pdbid, astral.chain))
                    print("Ignoring, moving to next astral..")
                    continue
                except Exception:
                    # Report which pair failed, then let the error propagate unchanged
                    print("Error calculating overlap for domain {0} ({1}-{2}), Astral {3} (PDB {4}{5})".format(
                        domain.id, domain_pdb_start, domain_pdb_stop, astral.sid, astral.pdbid, astral.chain))
                    raise

                if dbstore and overlap_ratio >= float(threshold):
                    chain = parse_astral_chain(astral.chain)
                    atod_dbo = AstralDomainOverlap(
                        astral_id=astral.id,
                        astral_sid=astral.sid,
                        domain_id=domain.id,
                        astral_start=astral_start,
                        astral_stop=astral_stop,
                        domain_start=domain_pdb_start,
                        domain_stop=domain_pdb_stop,
                        pdb_id=astral.pdbid,
                        chain=chain,
                        overlap=overlap_ratio,
                    )
                    push_to_db(
                        session,
                        atod_dbo,
                        exception_str="Error in pushing AstralDomainOverlap {0} to DB".format(atod_dbo),
                        raise_on_duplicate=False,
                    )

        print("Calculating astral to domain overlap for experiment {0} complete".format(experiment_id))
        print("{0} of {1} known-structure domains had no astral entries".format(missing_astral, len(domains)))
    finally:
        # Release the DB connection whether or not the run succeeded
        session.close()
def db_store(self, session, resultfile_key):
    """Create an HHPredHit ORM object for this hit and push it to the HPF DB.

    Convenience method meant to be driven by a parent HHPredResultFile object
    that already owns a session.

    Parameters:
      session        - pre-existing DB session used for the push (and the lookup)
      resultfile_key - result file key handed to HHPredHitFactory.create

    Returns the pushed HHPredHit DBO. If the push raises IntegrityError, the hit
    is looked up by (resultfile_key, number) and the first match is returned
    instead.
    """
    from sqlalchemy.exc import IntegrityError
    from hpf.hddb.db import HHPredHitFactory, HHPredHit as HHPH, push_to_db

    hit_dbo = HHPredHitFactory().create(self, resultfile_key=resultfile_key)
    try:
        push_to_db(
            session,
            hit_dbo,
            exception_str="Failed to add HHPredHit {0} to the DB".format(hit_dbo),
        )
    except IntegrityError:
        print("HHpredHit {0} already exists in DB. Returning pre-existing ORM object".format(hit_dbo))
        existing = session.query(HHPH).filter_by(
            resultfile_key=hit_dbo.resultfile_key,
            number=hit_dbo.number,
        )
        return existing.first()
    return hit_dbo
def db_store(self):
    """Push this HHPredResultFile to the HPF database (via hpf.hddb.db.HHPredRFFactory).

    Flow: make sure a session exists (self._setup_db() when self.session is
    unset); if self.dbo is already set, report it and return it untouched;
    otherwise build the ORM object, push it, and remember it in self.dbo.

    Returns the ORM object held in self.dbo
    """
    if not self.session:
        self._setup_db()

    # Already stored: nothing to push
    if self.dbo:
        print("HHPredResultFile already represented in DB: {0}. Returning...".format(self.dbo))
        return self.dbo

    from hpf.hddb.db import HHPredRFFactory, push_to_db
    hrf_dbo = HHPredRFFactory().create(self, debug=self.debug)
    push_to_db(self.session, hrf_dbo, exception_str="Failed to add {0} to the DB".format(hrf_dbo))
    self.dbo = hrf_dbo
    return self.dbo
def translate_foldables(dir, update_db=True):
    """Translate non-standard amino acid codes in the foldable fasta files of a directory.

    Every file listed in `dir` is parsed; when its sequence matches the
    module-level `nonstandard_pattern`, each key of the module-level
    `non_standard` mapping is replaced by its value and the fasta file is
    rewritten with the translated sequence.

    Parameters:
      dir       - directory whose entries are treated as foldable fasta files
      update_db - when True, the translated sequence is added to hpf.sequence
                  (or looked up by sha1 if it already exists) and the foldable
                  record (hpf.filesystemOutfile) with the file's prediction
                  code is re-linked to that sequence's id. When False, the
                  rewritten fasta carries the literal id "None".

    Files whose prediction code cannot be parsed (IOError) are reported and skipped.
    Raises Exception when update_db is True and no foldable record exists for a code.
    """
    from hashlib import sha1
    from hpf.hddb.reexport_foldables import write_fasta
    from hpf.hddb.db import push_to_db, Session, Sequence, FilesystemOutfile

    print("Translating foldable fastas found in dir {0}".format(dir))
    # NOTE(review): os.listdir yields bare names, which are parsed and reopened
    # as given (i.e. relative to the current working directory, not joined
    # with `dir`) -- confirm callers run from inside `dir`.
    for filename in os.listdir(dir):
        try:
            code = parse_code_from_file(filename)
        except IOError as e:
            print("{0}. Ignoring file..".format(e))
            # Skip the file as announced; without this, `code` would be
            # undefined or left over from the previous file
            continue

        sequence_key, sequence = parse_foldable_file(filename)

        # Only sequences containing a non-standard AA code need translating
        if not re.search(nonstandard_pattern, sequence):
            continue

        print("{0} contains nonstandard AA codes. Translating".format(filename))
        print("\tOriginal  : {0}".format(sequence))
        for nsaa, standard in non_standard.items():
            sequence = sequence.replace(nsaa, standard)
        print("\tTranslated: {0}".format(sequence))

        translated_seq_id = "None"
        if update_db:
            # Add new sequence to DB (push_to_db returns None if the sequence is already there)
            print("Adding translated sequence to the DB")
            digest = sha1(sequence).hexdigest()
            session = Session()
            seq_dbo = push_to_db(
                session,
                Sequence(sequence=sequence, sha1=digest),
                exception_str="Pushing sequence for code {0} failed".format(code),
                raise_on_duplicate=False,
            )
            if not seq_dbo:
                seq_dbo = session.query(Sequence).filter_by(sha1=digest).first()

            # Get foldable record and change seq key to the translated sequence's id
            print("Updating foldable record from old ({0}) to new ({1}) sequence key".format(sequence_key, seq_dbo.id))
            foldable_dbo = session.query(FilesystemOutfile).filter_by(prediction_code=code).first()
            if not foldable_dbo:
                raise Exception("Foldable record not found in DB for code {0}".format(code))
            foldable_dbo.sequence_key = seq_dbo.id
            session.flush()
            translated_seq_id = seq_dbo.id

        print("Writing translated foldable to file {0}".format(filename))
        with open(filename, 'w') as handle:
            write_fasta(handle, translated_seq_id, len(sequence), sequence)

    print("Translating foldables complete")
def mcm_setup(letter_code, version, comment):
    """Create McmRun bookkeeping rows for every foldable record of a batch.

    Takes a foldable record batch prediction code (two lower-case letters, eg
    'ax'), a version (an arbitrary identifier for keeping track of MCM runs),
    and a comment to store in the hpf.mcmRun row's comment field. One McmRun
    row is pushed per foldable record whose prediction code starts with
    letter_code. See mcmrun_driver.py for the actual MCM fetch and run
    functionality.

    Parameters:
      letter_code - two lower-case letters identifying the foldable batch
      version     - version value stored on (and checked against) McmRun rows
      comment     - comment stored on every created McmRun row

    Raises Exception when letter_code is not exactly two lower-case letters, or
    when McmRun rows with the same code prefix and version already exist.
    """
    # Validate before touching the DB so a bad code fails fast.
    # The pattern is anchored: exactly two lower-case letters, nothing after.
    if not re.match(r"[a-z]{2}$", letter_code):
        raise Exception("Code {0} does not match code format (two lower-case letters)".format(letter_code))

    from datetime import datetime
    from hpf.hddb.db import Session, McmRun, FilesystemOutfile as FoldableRecord, push_to_db
    session = Session()

    code_prefix = "{0}%".format(letter_code)

    # Check for pre-existing McmRun records with same code and version
    existing = (
        session.query(McmRun)
        .filter(McmRun.prediction_code.like(code_prefix))
        .filter(McmRun.version == version)
        .first()
    )
    if existing:
        raise Exception(
            "Code {0}, version {1} already exists in DB. Use a different version or clean DB".format(letter_code, version)
        )

    # Get all foldables, and create McmRun objects from them and passed in params
    foldables = session.query(FoldableRecord).filter(FoldableRecord.prediction_code.like(code_prefix)).all()
    insert_date = datetime.now()  # one shared timestamp for the whole batch
    mcmrun_count = 0
    for fold_rec in foldables:
        new_mcmrun = McmRun(
            prediction_code=fold_rec.prediction_code,
            sequence_key=fold_rec.sequence_key,
            inserted=insert_date,
            version=version,
            comment=comment,
        )
        push_to_db(
            session,
            new_mcmrun,
            exception_str="Could not push McmRun code {0} to DB".format(fold_rec.prediction_code),
        )
        mcmrun_count += 1
    print("McmRun setup: {0} McmRun objects pushed to DB".format(mcmrun_count))
def _store_mcm_data(self, mcm_scores, sequence_key, outfile_key, convergence_key, centers_dict):
    """Link MCM scores and their Mammoth results to DB keys and push both to the DB.

    For each score, a Mammoth ORM object is created from score.mammoth, tagged
    with the prediction (cluster center) and experiment (astral) structure
    keys, and pushed. The score itself is then tagged with the sequence,
    outfile, convergence and structure keys and pushed as well. Generally used
    to store the top 5 MCM scores.

    Parameters:
      mcm_scores      - list of hpf.hddb.db.McmData objects to store in DB; each
                        must carry a `mammoth` attribute with `prediction` and
                        `experiment` names
      sequence_key    - ID of sequence from which MCM scores come
      outfile_key     - ID of the foldable record (FilesystemOutfile) for the sequence
      convergence_key - ID of the convergence info (RosettaConvergence)
      centers_dict    - dict {CC index => obj} where obj has an instance variable
                        'structure_key' (EG: RosettaCluster DBO)

    The cluster center index is the integer found after the first 6 characters
    of the prediction name's first dot-separated field; the astral structure
    key is the integer value of the experiment name's first dot-separated field.
    """
    from hpf.hddb.db import MammothFactory
    factory = MammothFactory()
    self._check_session()

    failure_template = "Failed to add {0} to DB for sequence {1}, index {2}"
    for score in mcm_scores:
        mammoth = score.mammoth

        # Recover the linking keys from the mammoth file names
        prediction_stem = mammoth.prediction.split(".")[0]
        cc_index = int(prediction_stem[6:])
        structure_key = centers_dict[cc_index].structure_key
        experiment_stem = score.mammoth.experiment.split(".")[0]
        astral_structure_key = int(experiment_stem)

        # Mammoth row first
        mammoth_dbo = factory.create(score.mammoth)
        mammoth_dbo.p_structure_key = structure_key
        mammoth_dbo.e_structure_key = astral_structure_key
        push_to_db(
            self.session,
            mammoth_dbo,
            exception_str=failure_template.format(mammoth_dbo, sequence_key, cc_index),
        )

        # Then the MCM score itself, carrying all linking IDs
        score.sequence_key = sequence_key
        score.outfile_key = outfile_key
        score.convergence_key = convergence_key
        score.structure_key = structure_key
        score.astral_structure_key = astral_structure_key
        push_to_db(
            self.session,
            score,
            exception_str=failure_template.format(score, sequence_key, cc_index),
        )
def _store_convergence(self, convergence, foldable_key, decoy_file):
    """Store cluster convergence values in hpf.rosetta_convergence (ORM: RosettaConvergence).

    The new row is linked to hpf.filesystemOutfile through foldable_key.

    Parameters:
      convergence  - object providing radius1, size1, radius2, size2 and
                     total_decoys (hpf.mcm.cluster.RobettaConvergence)
      foldable_key - the ID of the hpf.filesystemOutfile entry for this code
      decoy_file   - the filename of the decoy file given to the clusterer

    Returns whatever push_to_db returns for the new RosettaConvergence object
    """
    from hpf.hddb.db import RosettaConvergence
    self._check_session()

    convergence_dbo = RosettaConvergence(
        outfile_key=foldable_key,
        target=decoy_file,
        radius1=convergence.radius1,
        size1=convergence.size1,
        radius2=convergence.radius2,
        size2=convergence.size2,
        total_decoys=convergence.total_decoys,
    )
    failure_msg = "Failed to add RosettaConvergence (outfile_key: {0}) to DB".format(convergence_dbo.outfile_key)
    return push_to_db(self.session, convergence_dbo, exception_str=failure_msg)
def _push_supergroup_groups(session, new_supergroup_key, source_supergroup_key, structure_keys):
    """Push one MammothGroup per structure key under the new supergroup.

    The group key is the source supergroup key followed by the last two
    characters of the structure key (both as strings).

    Returns the set of group keys that were used.
    """
    from hpf.hddb.db import MammothGroup, push_to_db

    group_keys = set()
    for structure_key in structure_keys:
        #kdrew: take last two digits of structure_key and that is its group_key
        group_key = str(source_supergroup_key) + str(structure_key)[-2:]
        group_keys.add(group_key)

        #kdrew: insert into MammothGroup
        mg_dbo = MammothGroup(
            supergroup_key=new_supergroup_key,
            group_key=group_key,
            structure_key=structure_key,
        )
        print(mg_dbo)
        push_to_db(session, mg_dbo, raise_on_duplicate=False)
    return group_keys


def mammothrun_setup_multi_supergroup(supergroup_key1, supergroup_key2, new_supergroup_name, new_supergroup_comment, version, comment):
    """Set up MammothRun rows comparing every group of one supergroup against every group of another.

    A new MammothSupergroup is created. The structures of both existing
    supergroups are re-registered under it as MammothGroup rows (group key =
    source supergroup key + last two characters of the structure key), and one
    MammothRun row is pushed for every (group of supergroup 1, group of
    supergroup 2) pair.

    Parameters:
      supergroup_key1, supergroup_key2 - keys of the existing supergroups to combine
      new_supergroup_name              - name of the MammothSupergroup to create
      new_supergroup_comment           - comment of the MammothSupergroup to create
      version                          - version stored on every MammothRun row
      comment                          - comment stored on every MammothRun row

    The supergroup push uses raise_on_duplicate=True; group and run pushes use
    raise_on_duplicate=False.
    """
    from hpf.hddb.db import Session, MammothRun, MammothGroup, MammothSupergroup, push_to_db
    session = Session()

    msg_dbo = MammothSupergroup(name=new_supergroup_name, comment=new_supergroup_comment)
    push_to_db(session, msg_dbo, raise_on_duplicate=True)
    print(msg_dbo)
    new_supergroup_key = msg_dbo.id

    # Read the structure keys of both source supergroups before inserting anything new
    groups1 = session.query(MammothGroup).filter(MammothGroup.supergroup_key == supergroup_key1).all()
    sg1_struct_keys = [entry.structure_key for entry in groups1]
    groups2 = session.query(MammothGroup).filter(MammothGroup.supergroup_key == supergroup_key2).all()
    sg2_struct_keys = [entry.structure_key for entry in groups2]

    sg1_group_keys = _push_supergroup_groups(session, new_supergroup_key, supergroup_key1, sg1_struct_keys)
    sg2_group_keys = _push_supergroup_groups(session, new_supergroup_key, supergroup_key2, sg2_struct_keys)

    # One run per cross-supergroup pair of groups
    for group_key1 in sg1_group_keys:
        for group_key2 in sg2_group_keys:
            mr_dbo = MammothRun(
                supergroup_key=new_supergroup_key,
                group_key1=group_key1,
                group_key2=group_key2,
                version=version,
                comment=comment,
            )
            print(mr_dbo)
            push_to_db(session, mr_dbo, raise_on_duplicate=False)
def mammothrun_setup(structure_key_file, supergroup_name, supergroup_comment, version, comment):
    """Set up an all-vs-all set of MammothRun rows for the structures listed in a file.

    A new MammothSupergroup is created. Every line of structure_key_file that
    parses as an integer is registered as a MammothGroup row whose group key is
    the last two characters of the structure key. MammothRun rows are then
    pushed for every group against itself and for every unordered pair of
    distinct groups.

    Parameters:
      structure_key_file - path of a text file with one structure key per line;
                           lines that are not integers are ignored
      supergroup_name    - name of the MammothSupergroup to create
      supergroup_comment - comment of the MammothSupergroup to create
      version            - version stored on every MammothRun row
      comment            - comment stored on every MammothRun row

    NOTE: no check for pre-existing MammothRun rows of the same supergroup and
    version is performed (a previous check was disabled). The supergroup push
    uses raise_on_duplicate=True; group and run pushes use raise_on_duplicate=False.
    """
    from hpf.hddb.db import Session, MammothRun, MammothGroup, MammothSupergroup, push_to_db
    session = Session()

    msg_dbo = MammothSupergroup(name=supergroup_name, comment=supergroup_comment)
    push_to_db(session, msg_dbo, raise_on_duplicate=True)
    print(msg_dbo)
    supergroup_key = msg_dbo.id

    # Collect the integer-looking lines; keys are kept as the stripped strings
    struct_keys = []
    with open(structure_key_file) as handle:
        for line in handle:
            candidate = line.rstrip()
            try:
                int(candidate)
            except ValueError:
                continue
            struct_keys.append(candidate)

    group_keys = set()
    for structure_key in struct_keys:
        #kdrew: take last two digits of structure_key and that is its group_key
        group_key = str(structure_key)[-2:]
        group_keys.add(group_key)

        #kdrew: insert into MammothGroup
        mg_dbo = MammothGroup(
            supergroup_key=supergroup_key,
            group_key=group_key,
            structure_key=structure_key,
        )
        print(mg_dbo)
        push_to_db(session, mg_dbo, raise_on_duplicate=False)

    # Each group against itself
    for group_key in group_keys:
        mrs_dbo = MammothRun(
            supergroup_key=supergroup_key,
            group_key1=group_key,
            group_key2=group_key,
            version=version,
            comment=comment,
        )
        print(mrs_dbo)
        push_to_db(session, mrs_dbo, raise_on_duplicate=False)

    # Each unordered pair of distinct groups
    # (`it` is a module-level name; presumably itertools -- confirm)
    for (group_key1, group_key2) in it.combinations(group_keys, 2):
        mr_dbo = MammothRun(
            supergroup_key=supergroup_key,
            group_key1=group_key1,
            group_key2=group_key2,
            version=version,
            comment=comment,
        )
        print(mr_dbo)
        push_to_db(session, mr_dbo, raise_on_duplicate=False)
def structure_mammoth(prediction_id, experiment_id, ptype, etype, base_dir=None, dbstore=True, version=1, debug=False, cleanup=True, table_destination="default", fake_mammoth=False):
    """Run (or look up) a structure-to-structure mammoth comparison.

    Takes the hpf DB structure IDs of two protein structures. When dbstore is
    True, first checks for a pre-computed result (via _sm_query, then
    _mammoth_query); a hit in the structure-mammoth table is returned as is,
    a hit in hpf.mammoth is copied into the destination table and returned.
    Otherwise both structures are written as PDB files to a work directory
    p<prediction_id>_e<experiment_id>_mammoth under base_dir, mammoth is run on
    them and the result is optionally stored in the DB.

    Parameters:
      prediction_id, experiment_id - hpf DB structure IDs to compare
      ptype, etype       - prediction/experiment type stored on the result row
      base_dir           - directory to create the work dir in (default: cwd)
      dbstore            - check the DB for previous results and store new ones
      version            - version checked for and stored on the result row. Defaults to 1
      debug              - print details about DB hits
      cleanup            - remove the work directory after a successful run
      table_destination  - key into StructureMammoth_class_lookup selecting the ORM class
      fake_mammoth       - when no DB result was returned, skip the real run and
                           return an unsaved placeholder with zscore 100000.0

    Returns the structure-mammoth ORM object (found, copied, placeholder or new).
    Raises Exception if base_dir is not a directory or a structure is not in the DB.
    Side effect: on a real run the process working directory ends up at base_dir,
    also when the run fails.
    """
    from hpf.pdb.mammoth import Mammoth, MammothAlignment, MammothCL
    from hpf.hddb.db import Session, Structure, Mammoth as MammothORM, StructureMammoth, StructureMammoth1186, StructureMammoth_class_lookup, push_to_db

    # NOTE(review): `session` is not created in this function (the Session()
    # call here was commented out); it is presumably a module-level session,
    # which the final session.close() then closes -- confirm.

    if dbstore:
        # Check StructureMammoth (hpf.structure_mammoth) for structure IDs
        previous = _sm_query(prediction_id, experiment_id, version)
        if previous:
            if debug:
                print("Previous version of Struct->Struct Mammoth exists: {0}".format(previous))
            return previous

        # Check Mammoth (hpf.mammoth) for structure IDs - if exists, transfer values, do not rerun Mammoth
        mammoth_dbo = _mammoth_query(prediction_id, experiment_id)
        if mammoth_dbo:
            if debug:
                # Single message (the original emitted the two halves on one line)
                print("Struct->Struct comparison exists in Mammoth table (hpf.mammoth): {0} Adding to StructureMammoth table and returning".format(mammoth_dbo))
            SM = StructureMammoth_class_lookup[table_destination]
            sm_dbo = SM(
                prediction_id=mammoth_dbo.p_structure_key,
                experiment_id=mammoth_dbo.e_structure_key,
                prediction_type=ptype,
                experiment_type=etype,
                ini_psi=mammoth_dbo.ini_psi,
                ini_rms=mammoth_dbo.ini_rms,
                end_psi=mammoth_dbo.end_psi,
                end_rms=mammoth_dbo.end_rms,
                zscore=mammoth_dbo.zscore,
                evalue=mammoth_dbo.evalue,
                version=version,
            )
            push_to_db(session, sm_dbo, raise_on_duplicate=False)
            if debug:
                print("Added: {0}".format(sm_dbo))
            return sm_dbo

    if fake_mammoth:
        # MANUAL BREAK: if no results in the DB, return a large score to
        # indicate that a real mammoth run is necessary
        SM = StructureMammoth_class_lookup[table_destination]
        return SM(
            prediction_id=prediction_id,
            experiment_id=experiment_id,
            prediction_type=ptype,
            experiment_type=etype,
            ini_psi=0.0,
            ini_rms=0.0,
            end_psi=0.0,
            end_rms=0.0,
            zscore=100000.0,
            evalue=0.0,
            version=version,
        )

    # Check input workdir
    if not base_dir:
        base_dir = os.getcwd()
    if not os.path.isdir(base_dir):
        raise Exception("Work dir '{0}' is not a valid directory".format(base_dir))
    # Absolute path: a relative base_dir would stop resolving once the
    # working directory moves into the work dir below
    base_dir = os.path.abspath(base_dir)

    # Get prediction and experiment struct DBOs
    p_structure = session.query(Structure).get(prediction_id)
    if not p_structure:
        raise Exception("Prediction structure ({0}) not found in DB".format(prediction_id,))
    e_structure = session.query(Structure).get(experiment_id)
    if not e_structure:
        raise Exception("Experiment structure ({0}) not found in DB".format(experiment_id,))

    # Set up working environment
    work_dir = os.path.join(base_dir, "p{0}_e{1}_mammoth".format(prediction_id, experiment_id))
    os.mkdir(work_dir)
    try:
        os.chdir(work_dir)
        prediction_file = "p{0}.pdb".format(prediction_id)
        experiment_file = "e{0}.pdb".format(experiment_id)
        write_struct_file(prediction_file, p_structure)
        write_struct_file(experiment_file, e_structure)

        # Run mammoth (via hpf.pdb.mammoth Mammoth), parse results
        mammoth_file = "p{0}_e{1}.mammoth".format(prediction_id, experiment_id)
        mcl = MammothCL(experiment_file, prediction_file, cwd=work_dir, output=mammoth_file)
        m = Mammoth(mcl, parse=True)
        mscore = m.run()
    finally:
        # Never leave the process stranded inside the work dir
        os.chdir(base_dir)

    # (Optional) store structure-structure mammoth in DB
    SM = StructureMammoth_class_lookup[table_destination]
    sm_dbo = SM(
        prediction_id=prediction_id,
        experiment_id=experiment_id,
        prediction_type=ptype,
        experiment_type=etype,
        ini_psi=mscore.ini_psi,
        ini_rms=mscore.ini_rms,
        end_psi=mscore.end_psi,
        end_rms=mscore.end_rms,
        zscore=mscore.zscore,
        evalue=mscore.evalue,
        version=version,
    )
    if dbstore:
        push_to_db(session, sm_dbo, raise_on_duplicate=False)

    # Complete and (optional) cleanup
    print("Mammothing {0} against {1} complete (zscore {2})".format(prediction_id, experiment_id, mscore.zscore))
    if cleanup:
        for entry in os.listdir(work_dir):
            os.remove(os.path.join(work_dir, entry))
        # rmdir, not removedirs: only the work dir itself may go, never an
        # empty base_dir or its parents
        os.rmdir(work_dir)
    session.close()
    return sm_dbo
previous = _sm_query(ac.source_astral.structure_key, ac.comparison_astral.structure_key, VERSION, session) if previous: print "Mammoth structure {0} -> {1} already exists in DB".format(ac.source_astral.structure_key, ac.comparison_astral.structure_key) continue # Make StructureMammoth object and push to DB sm_dbo = StructureMammoth(prediction_id=ac.source_astral.structure_key, experiment_id=ac.comparison_astral.structure_key, prediction_type='astral', experiment_type='astral', ini_psi=ac.ini_psi, ini_rms=ac.ini_rms, end_psi=ac.end_psi, end_rms=ac.end_rms, zscore=ac.zscore, evalue=ac.evalue, version=VERSION ) push_to_db(session, sm_dbo, raise_on_duplicate=False) added += 1 START += PIECE_SIZE STOP += PIECE_SIZE with open("astralmammoth2structuremammoth_workingon.txt", 'w') as handle: handle.write("AstralMammoth to StructureMammoth conversion: Working on piece {0} to {1}\n".format(START, STOP)) print "Translated {0} entries from astral_mammoth to structure_mammoth".format(added) print "Complete"