def populate_seqlength(sdict): from hpf.hddb.db import Session, Sequence session = Session() for seq_id in sdict.keys(): seq_obj = session.query(Sequence).get(seq_id) sdict[seq_id].sequence_length = len(seq_obj.sequence) print "Getting sequence length for all sequences complete"
def setUp(self,): session = Session() # family_id = 19187 # protein_id = 1151960 # domain_id = 1302435 family_id = 18883 protein_id = 1063014 domain_id = 1212855 self.family = session.query(Family).get(family_id) self.protein = session.query(Protein).get(protein_id) self.domain = session.query(Domain).get(domain_id) self.ps_sites = get_possel_sites(self.family) (self.pdb_id, self.pdb_chain, self.struct) = get_domain_struct(self.domain) print "Testing on Family {0}, Protein {1}, Domain {2}, Structure {3}, PDB {4}{5}".format( family_id, protein_id, domain_id, self.struct.id, self.pdb_id, self.pdb_chain ) print "Sites of +Sel for family: ", self.ps_sites self.pdb_struct = BioPDBStruct(self.pdb_id, self.pdb_chain, debug=True) self.cluster_id_str = "Family {0}, Protein {1}, Domain {2}, Structure {3}".format( family_id, protein_id, domain_id, self.struct.id )
def astral_to_domain(experiment_id, threshold=0.5, dbstore=False): """ Fetches all known-type domains for givein experiment ID. Computes astral overlap to domains, prints and optionally stores in hpf DB (table astral_domain_overlap) Will only store overlaps >= threshold parameter """ from hpf.hddb.db import Session, push_to_db, AstralDomainOverlap, Protein, Domain from hpf.structure_comparison.overlap import overlap from hpf.structure_comparison.astral_util import get_astrals, get_astral_startstop, parse_astral_chain # Create session and get all domains session = Session() domains = session.query(Domain).join(Protein).filter(Protein.experiment_key==experiment_id).filter(Domain.domain_type.in_(['psiblast','fold_recognition'])).all() print "Considering {0} domains to compute astral overlap for".format(len(domains)) # For each domain, get representative astrals, calculate overlap, and store (optional) missing_astral = 0 for domain in domains: domain_pdb_start = domain.region.parent_start domain_pdb_stop = domain.region.parent_stop astrals = get_astrals(domain, session) if not astrals: #print "No astrals found for domain {0}".format(domain.id) missing_astral += 1 continue for astral in astrals: try: (astral_start, astral_stop) = get_astral_startstop(astral) overlap_ratio = overlap(astral_start, astral_stop, domain_pdb_start, domain_pdb_stop) except ValueError: print "Negative overlap for domain {0} ({1}-{2}), Astral {3} (PDB {4}{5})".format( domain.id, domain_pdb_start, domain_pdb_stop, astral.sid, astral.pdbid, astral.chain) print "Ignoring, moving to next astral.." 
continue except: print "Error calculating overlap for domain {0} ({1}-{2}), Astral {3} (PBD {4}{5})".format( domain.id, domain_pdb_start, domain_pdb_stop, astral.sid, astral.pdbid, astral.chain) raise if dbstore and overlap_ratio >= float(threshold): chain = parse_astral_chain(astral.chain) atod_dbo = AstralDomainOverlap(astral_id=astral.id, astral_sid=astral.sid, domain_id=domain.id, astral_start=astral_start, astral_stop=astral_stop, domain_start=domain_pdb_start, domain_stop=domain_pdb_stop, pdb_id=astral.pdbid, chain=chain, overlap=overlap_ratio, ) push_to_db(session, atod_dbo, exception_str="Error in pushing AstralDomainOverlap {0} to DB".format(atod_dbo), raise_on_duplicate=False) #print "Domain {0} ({1}-{2}), Astral {3} (PDB {4}{5}), Astral overlap {6}".format(domain.id, domain_pdb_start, domain_pdb_stop, astral.sid, astral.pdbid, astral.chain, overlap_ratio) print "Calculating astral to domain overlap for experiment {0} complete".format(experiment_id) print "{0} of {1} known-structure domains had no astral entries".format(missing_astral, len(domains))
class AlignmentToPDBMapper():
    # Maps a family alignment onto a pdb structure.
    # NOTE: intended as the single, simple entry point in place of the other
    # mappers in this module. KEEP IT SIMPLE.
    # UNIT TESTS in pdb/tests/alignment_pdbmapper_test.py
    # Three public maps are exposed:
    #   alignment_pdbseq_map  - alignment col => pdb seqres
    #   pdbseq_pdbatom_map    - pdb seqres    => pdb atom
    #   alignment_pdbatom_map - alignment col => pdb atom

    def __init__(self, family, protein, domain, debug=False):
        self.DEBUG = debug
        self.session = Session()
        self.family = family
        self.protein = protein
        self.domain = domain
        self.alignment = self.family.alignments[0]

        # Find the PDB chain sequence: the sequence key is domain.parent_id
        # with its first three characters dropped; the lookup is narrowed to
        # the sccs chain when the domain has an sccs record.
        seqres_query = self.session.query(PDBSeqRes).filter_by(sequence_key=self.domain.parent_id[3:])
        if self.domain.sccs:
            seqres_query = seqres_query.filter_by(chain=self.domain.sccs.chain)
        self.pdbseqres = seqres_query.first()
        self.pdbid = self.pdbseqres.pdb.pdbId+self.pdbseqres.chain

        # alignment column -> pdb seq res
        self.alignment_pdbseq_map = PDBDomainAlignmentMapper(
            self.protein, self.domain, self.alignment.alignment, self.pdbseqres, inverse=True)

        # pdb seq res -> pdb atom res
        self.pdbseq_pdbatom_map = PDBAtomSeqResMapper(self.pdbseqres, inverse=False)

        # alignment column -> pdb atom res (composition of the two maps above)
        self.alignment_pdbatom_map = AlignmentToPDBAtomMapper(
            self.alignment_pdbseq_map, self.pdbseq_pdbatom_map)
def xml(id):
    """Write the feature XML document for one family.

    id -- primary key of the Family to export.

    Output goes to ``<family id>.xml`` in the current directory, or to
    ``<family id>.xml.gz`` (gzip-compressed) when the runtime GZIP option is set.
    """
    from hpf.hddb.db import Session, Family
    session = Session()
    # The outer try guarantees the session is released even when the family
    # lookup or opening the output file fails.
    try:
        family = session.query(Family).get(id)
        filename = "%i.xml" % family.id
        if runtime().opt(GZIP):
            import gzip
            filename = "%s.gz" % filename
            handle = gzip.open(filename,"w")
        else:
            handle = open(filename,"w")
        try:
            # The generator is created lazily so it writes to the handle opened above.
            doc = FamilyFeatureBuilder(
                lambda: DefaultXMLGenerator(handle,pretty=True),
                lambda handler: StructureFeatureProvider(handler),
                lambda handler: ColumnFeatureProvider(handler),
                lambda handler: IeaFeatureProvider(handler),
                lambda handler: SelectionFeatureProvider(handler)
            )
            doc.buildDocument(family)
        finally:
            handle.close()
    finally:
        session.close()
def __init__(self, protein, domain, alignment, pdbseqres=None, **kwargs):
    """
    @type protein: hpf.hddb.db.Protein
    @type domain: hpf.hddb.db.Domain
    @type alignment: Bio.Align.Generic
    @param pdbseqres: (Optional) Specify a PDB chain's sequence. When omitted,
        the PDBSeqRes is fetched from the DB by sequence_key (parsed from
        domain.parent_id) and, if the domain has an sccs record, by chain.
    """
    super(PDBDomainAlignmentMapper,self).__init__(
        protein, domain, alignment, **kwargs)
    self._chain = pdbseqres
    # If no pdbseqres given, fetch PDBSeqRes ORM object from db by sequence_key (a parsed domain.parent_id).
    if self._chain is None:
        from hpf.hddb.db import Session, PDBSeqRes
        # Keep the instance under its own name rather than rebinding `Session`.
        session = Session()
        parent_id = int(self._domain.parent_id[3:])
        if domain.sccs:
            self._chain = session.query(PDBSeqRes).filter_by(sequence_key=parent_id, chain=domain.sccs.chain).first()
        else:
            self._chain = session.query(PDBSeqRes).filter(PDBSeqRes.sequence_key==parent_id).first()
    self._pdbid = self._chain.pdb.pdbId+self._chain.chain
    self._seed = self._seed_alignment()
def merge(self):
    """Import this family into the DB, creating whatever is missing.

    Looks the family up by name. When absent, the family, alignment and tree
    are created; otherwise the existing alignment and tree are reused. Codeml
    results are imported only if the first alignment's tree has none yet.
    The session is committed on success and always closed.
    """
    from hpf.hddb.db import Session, Family
    self.session = Session()
    try:
        self.family = self.session.query(Family).filter(
            Family.name == self.familyName).first()
        if not self.family:
            runtime().debug("Creating family", self.familyName)
            self._family()
            self._alignment()
            self._tree()
        else:
            self.alignment = self.family.alignment
            self.tree = self.alignment.tree
            runtime().debug("Found family", self.family.id)

        if not self.family.alignments[0].tree.codeml:
            runtime().debug("Importing codeml")
            self._codeml()
        else:
            runtime().debug("Already found codeml",
                            self.family.alignments[0].tree.codeml.id)

        # Commit only once every step has succeeded.
        self.session.commit()
    finally:
        # Always release the session; on failure nothing was committed.
        self.session.close()
def __getitem__(self, key):
    """Return the protein_key of the SequenceAc whose ac equals the AMNH id mapped from key.

    Returns None when no SequenceAc matches. Raises whatever
    ``self._oid_amnh[key]`` raises for an unknown key.
    """
    from hpf.hddb.db import Session, SequenceAc
    amnh = self._oid_amnh[key]
    db = Session()
    try:
        match = db.query(SequenceAc).filter(SequenceAc.ac==amnh).first()
        if not match:
            return None
        return match.protein_key
    finally:
        db.close()
def setUp(self, ):
    """Load family 19187 / protein 1171456 / domain 1307995 and build both mappers under test."""
    session = Session()
    self.fam = session.query(Family).get(19187)
    self.prot = session.query(Protein).get(1171456)
    self.dom = session.query(Domain).get(1307995)
    self.alignment = self.fam.alignment.alignment
    self.psr = session.query(PDBSeqRes).filter_by(sequence_key=self.dom.parent_id[3:]).first()
    self.ptpdba = ProteinToPDBAtom(self.prot,self.dom,self.alignment, pdbseqres=self.psr, inverse=False)
    # AlignmentToPDBMapper dereferences family.alignments and domain.parent_id,
    # so it must be given the ORM objects themselves, not their integer ids.
    self.pdbmap = AlignmentToPDBMapper(self.fam, self.prot, self.dom, debug=True)
def main(): from hpf.hddb.db import Session, Sequence, Astral session = Session() astral_records = session.query(Astral) with open(args.outfile, 'w') as handle: for astral in astral_records: print "{0}\r".format(astral.sccs), handle.write(">{0}|{1}{2}|{3}\n".format(astral.sccs, astral.pdbid, astral.chain, astral.sequence_key)) handle.write("{0}\n".format(astral.sequence.sequence)) print "Exporting SCOP sequences to file {0} complete".format(args.outfile)
def ids_from_db(experiment_key=1177):
    # Retrieves a list of local (hpf db) sequence IDs for the sequences you want to calculate enrichment on.
    # These must be the same ID as in the ID field of pfam/interpro results.
    #
    # experiment_key -- experiment whose proteins are listed. The default, 1177,
    #                   is the humanrna protein set (773 list).
    # Returns one sequence_key per protein (duplicates are not removed).
    session = Session()
    try:
        proteins = session.query(Protein).filter_by(experiment_key=experiment_key).all()
        return [protein.sequence_key for protein in proteins]
    finally:
        session.close()
def setUp(self, ):
    """Replace the fixture ids with their ORM objects and build the mapper under test."""
    # The attributes hold the ids first and are then overwritten by the objects.
    self.family, self.protein, self.domain = 19187, 1171456, 1307995

    db = Session()
    self.family = db.query(Family).get(self.family)
    self.protein = db.query(Protein).get(self.protein)
    self.domain = db.query(Domain).get(self.domain)

    self.pdbmap = AlignmentToPDBMapper(
        self.family,
        self.protein,
        self.domain,
        debug=True,
    )
def setUp(self, ):
    """Fetch the fixture family, protein and domain by id and build the mapper under test."""
    self.family_id, self.protein_id, self.domain_id = 18883, 1063014, 1212855

    db = Session()
    self.family = db.query(Family).get(self.family_id)
    self.protein = db.query(Protein).get(self.protein_id)
    self.domain = db.query(Domain).get(self.domain_id)

    mapper_args = (self.family, self.protein, self.domain)
    self.pdbmap = AlignmentToPDBMapper(*mapper_args, debug=True)
def init_dict(): # Creates a returns a dict of the form sequence id => Signaprint obj. Initially, signaprint object is empty from hpf.hddb.db import Session, Protein from sqlalchemy import distinct session = Session() mouse_seqs = session.query(distinct(Protein.sequence_key)).filter_by(experiment_key=1171) signaprint_dict = dict() for (seq,) in mouse_seqs: signaprint_dict[seq] = Signaprint(sequence_id=seq) print "Dict of {0} sequence => Signaprints created".format(len(signaprint_dict)) return signaprint_dict
def _make_seqfile(self, ):
    """Write the sequence ``self.sequence_id`` to ``self.sequence_file`` as a one-record FASTA.

    Raises Exception when no Sequence with that id exists in the DB.
    """
    from hpf.hddb.db import Session, Sequence
    session = Session()
    try:
        sequence = session.query(Sequence).get(self.sequence_id)
        if not sequence:
            raise Exception("Getting sequence object from database failed")
        # `with` closes the file even if a write fails.
        with open(self.sequence_file, 'w') as outhandle:
            outhandle.write(">hpf_seqid|{0}\n".format(sequence.id))
            outhandle.write("{0}\n".format(sequence.sequence))
    finally:
        # Previously the session leaked whenever the lookup failed.
        session.close()
def index(records):
    """
    Renames, in place, each record's id from its OID name to the HPF sequence_key.

    Each record.id is mapped to an AMNH accession via oid_amnh(), and the
    SequenceAc with that ac supplies the sequence_key (stored as a string).
    Raises if an id is missing from the mapping, or if the SequenceAc lookup
    does not return exactly one row.
    """
    from hpf.hddb.db import Session, SequenceAc
    mapping = oid_amnh()
    session = Session()
    try:
        for record in records:
            amnh = mapping[record.id]
            sequence_ac = session.query(SequenceAc).filter(SequenceAc.ac==amnh).one()
            record.id = str(sequence_ac.sequence_key)
    finally:
        session.close()
def tasks(experiment_id): print "Interpro driver::tasks:: Getting tasks (sequences) for experiment {0}".format(experiment_id) session = Session() proteins = session.query(Protein.sequence_key).filter_by(experiment_key=experiment_id) experiment_seqs = list() for protein in proteins: experiment_seqs.append(protein[0]) if proteins.count() != len(experiment_seqs): raise Exception("Number of sequences extracted from experiment {0} does not match number of proteins".format(experiment_id)) unique_seqs = list(set(experiment_seqs)) print "{0} tasks (unique sequences) from {1} sequences retrieved".format(len(unique_seqs), len(experiment_seqs)) return unique_seqs
def translate_foldables(dir, update_db=True): """Translates foldable fasta files in the manner described above. If update_db is true, adds new sequences translated to hpf.sequence table and updates hpf.filesystemOutfile (foldable records) to link to the new sequence key. """ from hashlib import sha1 from hpf.hddb.reexport_foldables import write_fasta from hpf.hddb.db import push_to_db, Session, Sequence, FilesystemOutfile print "Translating foldable fastas found in dir {0}".format(dir) files = os.listdir(dir) for file in files: try: code = parse_code_from_file(file) except IOError as e: print "{0}. Ignoring file..".format(e) sequence_key, sequence = parse_foldable_file(file) # If the sequence contains a non-standard AA code, translate nonstandard to normal codes if re.search(nonstandard_pattern, sequence): print "{0} contains nonstandard AA codes. Translating".format(file) print "\tOriginal : {0}".format(sequence) translated_seq_id = "None" for nsaa in non_standard.keys(): sequence = sequence.replace(nsaa, non_standard[nsaa]) print "\tTranslated: {0}".format(sequence) if update_db: # Add new sequence to DB (push_ will return None if seq_dbo is already in DB) print "Adding translated sequence to the DB" session = Session() seq_dbo = Sequence(sequence=sequence, sha1=sha1(sequence).hexdigest()) seq_dbo = push_to_db(session, seq_dbo, exception_str="Pushing sequence for code {0} failed".format(code), raise_on_duplicate=False) if not seq_dbo: seq_dbo = session.query(Sequence).filter_by(sha1=sha1(sequence).hexdigest()).first() # Get foldable record and change seq key to new translated seq's id print "Updating foldable record from old ({0}) to new ({1}) sequence key".format(sequence_key, seq_dbo.id) foldable_dbo = session.query(FilesystemOutfile).filter_by(prediction_code=code).first() if not foldable_dbo: raise Exception("Foldable record not found in DB for code {0}".format(code)) foldable_dbo.sequence_key = seq_dbo.id session.flush() translated_seq_id = seq_dbo.id print 
"Writing translated foldable to file {0}".format(file) with open(file, 'w') as handle: write_fasta(handle, translated_seq_id, len(sequence), sequence) print "Translating foldables complete"
def _get_decoy_struct(domain, probability_cutoff=0.8): # Returns a DB Structure ORM object global session if session == None: session = Session() # Get decoy with highest MCM value that meets probability cutoff mcm = session.query(McmData).filter_by(outfile_key=domain.outfile_key).order_by('probability desc').first() if not mcm: print "Domain {0} has no decoy structures".format(self.domain.id) return None elif mcm.probability < probability_cutoff: print "Domain {0} structure (prob: {1}) does not meet cutoff {2}".format(domain.id, mcm.probability, probability_cutoff) return None return mcm.structure
def __init__(self, sequence_key, nr_db, ginzu_version="4", dir=None, autorun=True, dbstore=True, debug=True):
    """Attributes set here:
        self.prediction - the value returned by self.run() when autorun is true
                          (the PsipredPrediction from Psipred32(...).run()), else None
        self.dbo        - If dbstore=True, the Psipred ORM object (DataBaseObject)
                          from the HPF DB; None until set in run
    Raises Exception when no Sequence with sequence_key exists in the DB.
    """
    from hpf.hddb.db import Session, Sequence

    # Plain configuration
    self.sequence_key = sequence_key
    self.ginzu_version = ginzu_version
    self.dbstore = dbstore
    self.debug = debug

    # Paths: the NR location is normalised; the work dir defaults to the cwd.
    # kdrew: no isfile() check on nr_db because "nr" is not a file but a location.
    self.nr_db = os.path.abspath(os.path.expanduser(nr_db))
    self.dir = dir or os.getcwd()

    # Look the sequence up, failing early when the key is unknown
    self.session = Session()
    self.sequence = self.session.query(Sequence).get(self.sequence_key)
    if not self.sequence:
        raise Exception("No sequence with key {0} exists in DB".format(self.sequence_key))

    # Working file names, all derived from the sequence key
    key = self.sequence_key
    self.fasta_file = "{0}.fasta".format(key)
    self.chkpt_file = "{0}.chk".format(key)
    self.psipred_file = "{0}.psipred".format(key)

    # Set in run (dbo optionally)
    self.prediction = None
    self.dbo = None
    if autorun:
        self.prediction = self.run()
def merge(self):
    """Merge this family's files into the DB.

    The family is looked up by name. A missing family is created together
    with its alignment and tree; an existing one has its alignment and tree
    reused. Codeml is imported only when the first alignment's tree has no
    codeml yet. Commits on success; the session is closed in every case.
    """
    from hpf.hddb.db import Session, Family
    self.session = Session()
    try:
        self.family = self.session.query(Family).filter(Family.name == self.familyName).first()
        if not self.family:
            runtime().debug("Creating family", self.familyName)
            self._family()
            self._alignment()
            self._tree()
        else:
            self.alignment = self.family.alignment
            self.tree = self.alignment.tree
            runtime().debug("Found family", self.family.id)

        if not self.family.alignments[0].tree.codeml:
            runtime().debug("Importing codeml")
            self._codeml()
        else:
            runtime().debug("Already found codeml", self.family.alignments[0].tree.codeml.id)

        # Commit the session only after every import step succeeded.
        self.session.commit()
    finally:
        # Close even when an import step raised, so the session is not leaked.
        self.session.close()
def cluster_driver(family_id): session = Session() family = session.query(Family).get(family_id) if family == None: raise Exception("Family {0} could not be fetched from the database".format(family_id)) # Get sites for family (+Sel and TODO: firedb) ps_sites = get_possel_sites(family) # DEBUG print "Start cluster analysis for family {0}".format(family.id) print "Family {0} sites of +sel: ".format(family.id), ps_sites # Get repr protein from family (first one) protein = family.proteins[0] # Attempt clustering on structures for all domains in protein for domain in protein.domains: try: (pdb_id, pdb_chain, struct) = get_domain_struct(domain) except Exception as e: print e print "Domain {0} (Family {1}, Protein {2}) has no valid structure. Skipping..".format( domain.id, family.id, protein.id ) continue # Get domain-specific sites domain_sites = get_domain_sites(family, protein, domain, ps_sites) # DEBUG print "Family {0}, Protein {1}, Domain {2} local sites: ".format(family.id, protein.id, domain.id), domain_sites # Create BioPDBStruct object to call cluster analysis on pdb_struct = BioPDBStruct(pdb_id, pdb_chain, debug=True) cluster_id_str = "Family {0}, Protein {1}, Domain {2}, Structure {3}".format( family.id, protein.id, domain.id, struct.id ) try: pdb_struct.cluster_analysis( domain_sites, sample_size=samples, store_file=results_file, tag=cluster_id_str, report=True ) except Exception as e: print e print "Can not complete clustering analysis on domain {0}. Skipping..".format(domain.id) continue # DEBUG print "Clustering for domain/structs in Family {0} (Protein {1}) complete".format(family.id, protein.id)
def tasks():
    # Task -> a family id to process clustering on.
    # Returns the ids of all families with manually_curated == 0
    # (a GENERAL version would get a list of proteins).
    # Raises Exception when no such family exists.
    session = Session()
    try:
        # Only the id column is needed, so whole Family rows are not loaded.
        # TEST: add .limit(5) before .all() to shorten a test run.
        rows = session.query(Family.id).filter(Family.manually_curated == 0).all()
    finally:
        session.close()
    family_ids = [row[0] for row in rows]
    if not family_ids:
        raise Exception("No families to cluster were retrieved from the db")
    # TEST - manually populate family test set
    # family_ids = [19187, 18883]
    return family_ids
def setUp(self, ):
    """Load the stored reference prediction and produce a fresh one with HPFPsipredWrap."""
    from hpf.pdb.psipred import HPFPsipredWrap
    db = Session()
    self.sequence_key = 8560575
    self.ginzu_version = 4

    sequence_dbo = db.query(Sequence).get(self.sequence_key)
    self.sequence = sequence_dbo.sequence

    reference_dbo = db.query(PsipredORM).filter_by(
        sequence_key=self.sequence_key,
        ginzu_version=self.ginzu_version,
    ).first()
    self.reference_pred = reference_dbo.prediction

    self.db = BASEPATH + "small_db"

    # Run HPFPsipredWrap to get the new SS string (dbstore is off for the test)
    wrapper = HPFPsipredWrap(
        sequence_key=self.sequence_key,
        nr_db=self.db,
        ginzu_version=self.ginzu_version,
        autorun=True,
        dbstore=False,
    )
    self.created_pred = wrapper.get_prediction_string()
def _get_pdb_struct(domain, ): # Returns a DB Structure ORM object global session if session == None: session = Session() # Check for objects needed to query for pdb struct. if domain.parent_id == None or domain.sccs == None: print "Domain {0} has no sccs record (cannot find PDB). Returning None..".format(domain.id) return None # Query to get the PDBSeqRes record mathing the domain, return structure from PDBSeqRes. psr = session.query(PDBSeqRes).filter_by(sequence_key=domain.parent_id[3:], chain=domain.sccs.chain).first() if not psr: print "No PDB record found for domain {0} (seq {1}, chain {2}). Returning None.".format(domain.id, domain.parent_id[3:], domain.sccs.chain) return None elif psr.structure == None: print "Domain {0} PDBSeqRes entry has no structure. Returning None..".format(domain.id) return psr.structure
def export_fasta(outhandle, experiment): """ outhandle - a filehandle for writing sequences to experiment - the hpf.experiment DB id to fetch sequences from """ print "Exporting all sequences from Experiment {0}".format(experiment) session = Session() proteins = session.query(Protein).filter_by(experiment_key=experiment) num_seqs = proteins.count() i = 0 for protein in proteins: outhandle.write(">hpf|{0}|{1}|{2}|{3} ({4})\n".format(protein.sequence_key, protein.id, protein.experiment_key, protein.experiment.name, protein.experiment.short_name)) outhandle.write("{0}\n".format(protein.sequence.sequence)) sys.stdout.write("{0} of {1}\r".format(i, num_seqs)) i += 1 print "{0} sequences from experiment {1} exported".format(i, experiment)
def __enter__(self):
    """Prepare the export: work dir, DB session, and both output files with their headers.

    Creates a temporary directory when self.dir is unset. Returns self.
    """
    if not self.dir:
        self.dir = mkdtemp()
    from hpf.hddb.db import Session
    self.session = Session()

    self.list_handle = open(self.mammoth_list,"w")
    self.info_handle = open(self.info_file,"w")

    # Header lines: the tab-separated column names for the info file, and a
    # two-line preamble naming the directory for the MAMMOTH list.
    column_header = "\t".join(McmDBExporter.columns)
    self.info_handle.write(column_header + "\n")
    list_header = "MAMMOTH List\n%s" % self.dir
    self.list_handle.write(list_header + "\n")
    return self
def populate_mgi(sdict): from hpf.hddb.db import Session, SequenceAc session = Session() mgi_count = 0 for seq in sdict.keys(): acs = session.query(SequenceAc).join(SequenceAc.protein).filter_by(sequence_key=seq, experiment_key=1171).all() mgi_ids = set() for ac in acs: if ac.ac2 == "None": continue mgi_ids.add(ac.ac2) if len(mgi_ids) == 0: sdict[seq].mgi = "None" elif len(mgi_ids) == 1: sdict[seq].mgi = list(mgi_ids)[0] mgi_count += 1 else: sdict[seq].mgi = list(mgi_ids) mgi_count += 1 print "{0} sequences assigned an MGI ID".format(mgi_count)
def main(experiment, outfile): from hpf.hddb.db import Session, Protein session = Session() gn_pattern = r"GN=(?P<gene_name>\S+)" handle = open(outfile, 'w') proteins = session.query(Protein).filter_by(experiment_key=experiment) for p in proteins: gn_found = re.search(gn_pattern, p.ac.description) gene_name = gn_found.group('gene_name') if gn_found else "None" #if p.ac.ac2: # handle.write("{0}\t{1}\t{2}\n".format(p.sequence_key, p.ac.ac2, gene_name)) #else: handle.write("{0}\t{1}\t{2}\n".format(p.sequence_key, p.ac.ac, gene_name)) sys.stdout.write("protein: {0} {1} \r".format(p.sequence_key, gene_name)) handle.close() print "Map complete"
def _db_setup(self, ): print "FastaFile db_setup" # Create a session via hpf.hddb.db Session (a Session object from sessionmaker, sqlalchemy). self.session = Session() print "Connected to database: {0}".format(engine.url) # Query the DB session for the given experiment table (if none, no experiment of that ID). self.experiment = self.session.query(Experiment).filter(Experiment.id == self.experiment_id).first() if self.experiment == None: raise ValueError("Experiment {0} does not exist in the database {1}.".format(self.experiment_id, engine.url))
def main(prediction_code): print "Running Rosetta ab init setup for code {0}".format(prediction_code) # Set up DB and query for foldable and domain records print "Opening session and querying DB for domain and foldable" session = Session() domain = session.query(Domain).filter_by(ibm_prediction_code=prediction_code).first() foldable = session.query(Foldable).filter_by(prediction_code=prediction_code).first() if not (domain and foldable): raise Exception("No domain or foldable record (or both) could be found for code {0}".format(prediction_code)) # Check/make work dir if not os.path.isdir(BASE_DIR): os.mkdir(WORK_DIR) # Check NR if not os.path.isfile(NR_DB): raise Exception("Given NR database {0} not valid".format(NR_DB)) # Make and change to working dir working_dir = os.path.join(WORK_DIR, prediction_code) os.mkdir(working_dir) print "Created working directory {0}".format(working_dir) os.chdir(working_dir) # Create foldable FASTA file fasta_file = os.path.join(working_dir, "{0}.fasta".format(prediction_code)) with open(fasta_file, 'w') as handle: handle.write(">domain:{0}|code:{1}|foldable_seq_key:{2}\n".format(domain.id, foldable.prediction_code, foldable.sequence_key)) handle.write("{0}\n".format(foldable.sequence.sequence)) print "Created foldable record fasta file {0}".format(fasta_file) # Currently DB does not hold psipred ss2-format files (damn). Run psipred, ignore return pred (only need file) print "Running Psipred on foldable sequence via HPFPsipredWrap..." HPFPsipredWrap(sequence_key=foldable.sequence_key, nr_db=NR_DB, ginzu_version=4, dir=working_dir, autorun=True, dbstore=True, debug=True) # Complete and exit print "Setup for Rosetta ab initio complete. Clean up files as necessary" print "(NOTE: Psipred ss2 outfile (vformat) is the <seqkey>.psipred.2 file)"
def xml(id):
    """Build the feature XML document of the family with primary key ``id``.

    The document is written to ``<family id>.xml`` in the current directory.
    With the runtime GZIP option set it is gzip-compressed and named
    ``<family id>.xml.gz`` instead.
    """
    from hpf.hddb.db import Session, Family
    session = Session()
    try:
        # Inside the try so a missing family or an unwritable file cannot
        # leak the session.
        family = session.query(Family).get(id)
        filename = "%i.xml" % family.id
        if runtime().opt(GZIP):
            import gzip
            filename = "%s.gz" % filename
            handle = gzip.open(filename, "w")
        else:
            handle = open(filename, "w")
        try:
            # One generator factory plus one provider factory per feature kind.
            doc = FamilyFeatureBuilder(
                lambda: DefaultXMLGenerator(handle, pretty=True),
                lambda handler: StructureFeatureProvider(handler),
                lambda handler: ColumnFeatureProvider(handler),
                lambda handler: IeaFeatureProvider(handler),
                lambda handler: SelectionFeatureProvider(handler))
            doc.buildDocument(family)
        finally:
            handle.close()
    finally:
        session.close()
class OIDImporter(object):
    """
    Import a set of OID files into the database: a family with its alignment
    (plus column/sequence culling logs), its tree (plus diagnostic characters)
    and, optionally, codeml positive-selection results.

    Call merge() to run the import.
    """

    def __init__(self, familyName, alignFile, alignColcullLog, alignSeqcullLog,
                 treeFile, treeDiagCharsFile, codemlFile=None,
                 alignFormat="fasta", oid_key=None):
        self.familyName = familyName
        self.treeFile = treeFile
        self.treeDiagCharsFile = treeDiagCharsFile
        self.alignFile = alignFile
        self.alignColcullLog = alignColcullLog
        self.alignSeqcullLog = alignSeqcullLog
        self.codemlFile = codemlFile
        self.alignFormat = alignFormat
        self.oid_key = oid_key

    def merge(self):
        """Run the import: create the family/alignment/tree if the family is
        new, import codeml if the tree has none, then commit.
        The session is closed whether or not the import succeeds."""
        from hpf.hddb.db import Session, Family
        self.session = Session()
        try:
            self.family = self.session.query(Family).filter(
                Family.name == self.familyName).first()
            if not self.family:
                runtime().debug("Creating family", self.familyName)
                self._family()
                self._alignment()
                self._tree()
            else:
                self.alignment = self.family.alignment
                self.tree = self.alignment.tree
                runtime().debug("Found family", self.family.id)

            if not self.family.alignments[0].tree.codeml:
                runtime().debug("Importing codeml")
                self._codeml()
            else:
                runtime().debug("Already found codeml",
                                self.family.alignments[0].tree.codeml.id)

            # Commit only after every step succeeded.
            self.session.commit()
        finally:
            # Do not leave the session open when a step raises.
            self.session.close()

    def _index(self, name):
        """Return the numeric part of an OID name as a string.

        Takes the text after the last '#', drops one leading 'N' if present,
        and asserts that what remains is all digits.
        """
        n = name.split("#")[-1]
        if n.startswith("N"):
            n = n[1:]
        assert n.isdigit()
        return n

    def _tree(self):
        """Load the tree file, store it with its nodes, then store the diagnostic characters."""
        session = self.session
        from Bio.Nexus.Trees import Tree as NewickTree
        from hpf.hddb.db import Tree, TreeNodeFactory, TreeFactory
        from hpf.amnh.oid import DiagCharsParser

        # Read through a context manager so the handle is closed.
        with open(self.treeFile) as handle:
            tree_str = handle.read()
        self.nexus = NewickTree(tree_str)

        # Rename all the taxa.
        for terminal_id in self.nexus.get_terminals():
            node = self.nexus.node(terminal_id)
            node.data.taxon = self._index(node.data.taxon)

        # Create the DB object
        self.tree = Tree(alignment_key=self.alignment.id,
                         text=self.nexus.to_string(plain=False, plain_newick=True),
                         filename=self.treeFile)
        session.add(self.tree)
        session.flush()

        # Now add in the node references
        self.nexus.name = self.tree.id
        assert self.tree.id is not None
        runtime().debug("Added tree", self.tree)
        nodes = list(TreeNodeFactory().create(self.nexus))
        for node in nodes:
            node.ancestor_node = node.ancestor.id if node.ancestor else None
            # This should add the new object into the session
            self.tree.nodes.append(node)
            # Flush per node so each node.ancestor.id above is populated
            # before its children are processed.
            session.flush()

        runtime().debug("Appended", len(nodes), "tree nodes")
        session.flush()

        # Now import the diagnostic characters and reference the nodes.
        biotree = TreeFactory(name_func=lambda node: str(node.id)).create(
            self.tree.nodes, self.tree.id)
        parser = DiagCharsParser(biotree)
        runtime().debug(self.treeDiagCharsFile)
        with open(self.treeDiagCharsFile) as handle:
            diagchars = list(parser.parse(handle))
            runtime().debug("DiagChars", len(diagchars))
            for d in diagchars:
                session.add(d)
        session.flush()

    def _codeml(self):
        """Import codeml models and their sites; does nothing when no codeml file was given."""
        if not self.codemlFile:
            return
        assert self.family.id is not None
        assert self.tree.id is not None

        # We need to convert the columns to the original alignment indices
        mapper = CulledColumnMapper(self.alignment, self.alignment.culled_columns)
        parser = PositiveSelectionParser()
        models = list(parser.parse(self.codemlFile))
        runtime().debug("Found", len(models), "models")
        for i, model in enumerate(models):
            model.tree_key = self.tree.id
            self.session.add(model)
            # Flush so model.id is available for the sites below.
            self.session.flush()
            ps = list(model.ps)
            runtime().debug("Found", len(ps), "sites in model", model.model)
            for j, site in enumerate(ps):
                site.codeml_key = model.id
                # Indices in CodeML start at 1, convert to 0 and then map
                orig = site.column
                site.column = mapper[site.column - 1]
                runtime().debug("column", orig, "mapped to", site.column, site.probability)
                try:
                    self.session.add(site)
                except Exception:
                    # Log which site failed, then let the error propagate.
                    runtime().debug(i, ":", j, " failure on column", orig,
                                    "mapped to", site.column, site.probability)
                    raise
            runtime().debug("Finished with model")
            self.session.flush()

        runtime().debug("finished import codeml")

    def _alignment(self):
        """Store the alignment, its member proteins, and both culling logs."""
        session = self.session
        from Bio import AlignIO
        from hpf.hddb.db import (Alignment, AlignmentProtein, FamilyProtein,
                                 AlignmentColcull, AlignmentSeqcull)

        # Read the alignment
        with open(self.alignFile) as handle:
            align = AlignIO.read(handle, self.alignFormat)

        # Rename 'id' with the correct protein key
        for record in align:
            record.id = self._index(record.id)

        # Write to a text buffer and create the DB object
        text = StringIO()
        AlignIO.write([align], text, self.alignFormat)
        self.alignment = Alignment(family_key=self.family.id,
                                   format=self.alignFormat,
                                   filename=self.alignFile,
                                   text=text.getvalue())
        # Add to session and flush
        session.add(self.alignment)
        session.flush()

        # Flip through the proteins in the alignment and add the records.
        for record in align:
            protein_key = record.id
            assert protein_key != 0 and protein_key is not None, protein_key
            runtime().debug("protein: ", protein_key)
            s = AlignmentProtein(alignment_key=self.alignment.id,
                                 protein_key=protein_key,
                                 sequence=str(record.seq))
            session.add(s)
            session.flush()

            # There may exist multiple alignments, but the definition
            # of membership in the family is done here.
            fs = FamilyProtein(family_key=self.family.id,
                               protein_key=protein_key,
                               seed=True)
            session.merge(fs)

        # Now read the column culling log. Indices start at 0 here.
        with open(self.alignColcullLog) as handle:
            for line in handle:
                column, gap, taxa, ratio = line.split()
                col = AlignmentColcull(alignment_key=self.alignment.id,
                                       column=column,
                                       gap_percentage=ratio)
                session.merge(col)

        # Sequence culling log; lines look like: rice#1182215 0.712765957446808
        with open(self.alignSeqcullLog) as handle:
            for line in handle:
                parts = line.split()
                seq, score = parts
                seq = self._index(seq)
                if not seq.isdigit():
                    # Guard kept from the original; it used `assert false`,
                    # which raised NameError instead of the intended assertion.
                    raise AssertionError("%s SEQ: %s" % (parts, seq))
                cul = AlignmentSeqcull(alignment_key=self.alignment.id,
                                       protein_key=seq,
                                       score=score)
                # Persist the record like the column culls above; previously
                # it was built and then discarded.
                session.merge(cul)
        session.flush()

    def _family(self):
        """Create and flush a new Family row named self.familyName (experiment_key 0)."""
        session = self.session
        from hpf.hddb.db import Family
        self.family = Family(name=self.familyName, experiment_key=0)
        session.add(self.family)
        session.flush()
def tasks():
    """Return the ids of every Family whose manually_curated flag equals 0."""
    from hpf.hddb.db import Session, Family
    db = Session()
    id_query = db.query(Family.id).filter(Family.manually_curated == 0)
    rows = id_query.all()
    db.close()

    # Each row is a one-column result; keep just the id.
    family_ids = []
    for row in rows:
        family_ids.append(row[0])
    return family_ids