Ejemplo n.º 1
0
def populate_seqlength(sdict):
    """Set ``sequence_length`` on every value of ``sdict``.

    sdict - dict keyed by Sequence primary key. Each value gets its
            ``sequence_length`` attribute set to the length of the matching
            Sequence record's ``sequence``.

    One Sequence lookup is issued per key. The DB session is closed on exit,
    including when a lookup fails.
    """
    from hpf.hddb.db import Session, Sequence
    session = Session()
    try:
        for seq_id, entry in sdict.items():
            seq_obj = session.query(Sequence).get(seq_id)
            # If the lookup yields None, the attribute access below raises
            # AttributeError, exactly as before.
            entry.sequence_length = len(seq_obj.sequence)
    finally:
        session.close()
    print("Getting sequence length for all sequences complete")
Ejemplo n.º 2
0
    def setUp(self,):
        """Load the Family/Protein/Domain fixture and derive the test inputs.

        Sets self.family, self.protein, self.domain, self.ps_sites,
        self.pdb_id, self.pdb_chain, self.struct, self.pdb_struct and
        self.cluster_id_str.
        """
        # Alternative fixture kept for reference:
        #   family 19187, protein 1151960, domain 1302435
        family_id, protein_id, domain_id = 18883, 1063014, 1212855

        session = Session()
        self.family = session.query(Family).get(family_id)
        self.protein = session.query(Protein).get(protein_id)
        self.domain = session.query(Domain).get(domain_id)

        self.ps_sites = get_possel_sites(self.family)
        self.pdb_id, self.pdb_chain, self.struct = get_domain_struct(self.domain)

        print("Testing on Family {0}, Protein {1}, Domain {2}, Structure {3}, PDB {4}{5}".format(
            family_id, protein_id, domain_id, self.struct.id, self.pdb_id, self.pdb_chain))
        # Joined with a single space to match the two-item print statement
        print(" ".join(["Sites of +Sel for family: ", str(self.ps_sites)]))

        self.pdb_struct = BioPDBStruct(self.pdb_id, self.pdb_chain, debug=True)
        self.cluster_id_str = "Family {0}, Protein {1}, Domain {2}, Structure {3}".format(
            family_id, protein_id, domain_id, self.struct.id)
Ejemplo n.º 3
0
def astral_to_domain(experiment_id, threshold=0.5, dbstore=False):
    """
    Fetches all known-type domains for the given experiment ID. Computes astral overlap to
    domains and optionally stores it in the hpf DB (table astral_domain_overlap).
    Will only store overlaps >= threshold parameter.

    experiment_id - value matched against Protein.experiment_key
    threshold     - minimum overlap ratio to store (converted with float())
    dbstore       - when true, qualifying overlaps are pushed to the DB

    Domains with no astral entries are counted and reported at the end.
    A ValueError from the overlap calculation is reported and that astral is
    skipped; any other error is reported and re-raised. The DB session is
    closed on exit.
    """
    from hpf.hddb.db import Session, push_to_db,  AstralDomainOverlap, Protein, Domain
    from hpf.structure_comparison.overlap import overlap
    from hpf.structure_comparison.astral_util import get_astrals, get_astral_startstop, parse_astral_chain

    # The threshold only matters when storing; convert it once, not per astral
    min_overlap = float(threshold) if dbstore else None

    session = Session()
    try:
        # Get all psiblast / fold_recognition domains of the experiment
        domains = session.query(Domain).join(Protein).filter(Protein.experiment_key==experiment_id).filter(Domain.domain_type.in_(['psiblast','fold_recognition'])).all()
        print("Considering {0} domains to compute astral overlap for".format(len(domains)))

        # For each domain, get representative astrals, calculate overlap, and store (optional)
        missing_astral = 0
        for domain in domains:
            domain_pdb_start = domain.region.parent_start
            domain_pdb_stop  = domain.region.parent_stop

            astrals = get_astrals(domain, session)
            if not astrals:
                missing_astral += 1
                continue

            for astral in astrals:
                try:
                    (astral_start, astral_stop) = get_astral_startstop(astral)
                    overlap_ratio = overlap(astral_start, astral_stop, domain_pdb_start, domain_pdb_stop)
                except ValueError:
                    print("Negative overlap for domain {0} ({1}-{2}), Astral {3} (PDB {4}{5})".format(
                            domain.id, domain_pdb_start, domain_pdb_stop, astral.sid, astral.pdbid, astral.chain))
                    print("Ignoring, moving to next astral..")
                    continue
                except Exception:
                    # Report which pair failed, then let the error propagate
                    print("Error calculating overlap for domain {0} ({1}-{2}), Astral {3} (PDB {4}{5})".format(
                            domain.id, domain_pdb_start, domain_pdb_stop, astral.sid, astral.pdbid, astral.chain))
                    raise

                if not (dbstore and overlap_ratio >= min_overlap):
                    continue

                chain = parse_astral_chain(astral.chain)
                atod_dbo = AstralDomainOverlap(astral_id=astral.id,
                                               astral_sid=astral.sid,
                                               domain_id=domain.id,
                                               astral_start=astral_start,
                                               astral_stop=astral_stop,
                                               domain_start=domain_pdb_start,
                                               domain_stop=domain_pdb_stop,
                                               pdb_id=astral.pdbid,
                                               chain=chain,
                                               overlap=overlap_ratio,
                                              )
                push_to_db(session, atod_dbo, exception_str="Error in pushing AstralDomainOverlap {0} to DB".format(atod_dbo), raise_on_duplicate=False)
    finally:
        session.close()

    print("Calculating astral to domain overlap for experiment {0} complete".format(experiment_id))
    print("{0} of {1} known-structure domains had no astral entries".format(missing_astral, len(domains)))
Ejemplo n.º 4
0
class AlignmentToPDBMapper():
    # Builds the mapping between a family alignment and a PDB structure.
    # NOTE: This is an abstraction of all of the confused and undocumented mappers below.
    # Use this, because it is clear and because it works. KEEP IT SIMPLE.
    # UNIT TESTS in pdb/tests/alignment_pdbmapper_test.py
    # Three public maps are exposed:
    #   alignment_pdbseq_map  - alignment col => pdb seqres
    #   pdbseq_pdbatom_map    - pdb seqres    => pdb atom
    #   alignment_pdbatom_map - alignment col => pdb atom

    def __init__(self, family, protein, domain, debug=False):
        self.DEBUG = debug
        self.session = Session()
        self.family, self.protein, self.domain = family, protein, domain
        self.alignment = self.family.alignments[0]

        # The PDBSeqRes row is keyed by parent_id minus its first three
        # characters; the chain is added to the filter only when the domain
        # has an sccs record.
        criteria = {"sequence_key": self.domain.parent_id[3:]}
        if self.domain.sccs:
            criteria["chain"] = self.domain.sccs.chain
        self.pdbseqres = self.session.query(PDBSeqRes).filter_by(**criteria).first()
        self.pdbid = self.pdbseqres.pdb.pdbId + self.pdbseqres.chain

        # alignment column -> pdb seq res
        self.alignment_pdbseq_map = PDBDomainAlignmentMapper(
            self.protein, self.domain, self.alignment.alignment, self.pdbseqres, inverse=True)

        # pdb seq res -> pdb atom res
        self.pdbseq_pdbatom_map = PDBAtomSeqResMapper(self.pdbseqres, inverse=False)

        # alignment column -> pdb atom res (composition of the two maps above)
        self.alignment_pdbatom_map = AlignmentToPDBAtomMapper(
            self.alignment_pdbseq_map, self.pdbseq_pdbatom_map)
Ejemplo n.º 5
0
def xml(id):
    """Write the feature XML document for one family.

    id - primary key used to fetch the Family.

    The output file is named after the family id with an ".xml" suffix. When
    the runtime GZIP option is set the file is gzip-compressed and ".gz" is
    appended to the name. The output handle and the DB session are both
    released even when opening the file or building the document fails.
    """
    from hpf.hddb.db import Session, Family
    session = Session()
    try:
        family = session.query(Family).get(id)
        filename = "%i.xml" % family.id

        if runtime().opt(GZIP):
            import gzip
            filename = "%s.gz" % filename
            handle = gzip.open(filename,"w")
        else:
            handle = open(filename,"w")

        try:
            doc = FamilyFeatureBuilder(
                lambda: DefaultXMLGenerator(handle,pretty=True),
                lambda handler: StructureFeatureProvider(handler),
                lambda handler: ColumnFeatureProvider(handler),
                lambda handler: IeaFeatureProvider(handler),
                lambda handler: SelectionFeatureProvider(handler)
                )
            doc.buildDocument(family)
        finally:
            handle.close()
    finally:
        session.close()
Ejemplo n.º 6
0
 def __init__(self, 
          protein, 
          domain, 
          alignment, 
          pdbseqres=None, 
          **kwargs):
     """
     @type protein: hpf.hddb.db.Protein
     @type domain: hpf.hddb.db.Domain
     @type alignment: Bio.Align.Generic
     @param pdbseqres: (Optional) Specify a PDB chain's sequence. When omitted,
         the PDBSeqRes record is looked up from the domain's parent_id (and
         from the sccs chain when the domain has an sccs record).
     @param kwargs: passed through unchanged to the parent constructor.
     """
     super(PDBDomainAlignmentMapper,self).__init__( protein, domain, alignment, **kwargs)
     self._chain = pdbseqres

     # If no pdbseqres given, fetch PDBSeqRes ORM object from db by sequence_key (a parsed domain.parent_id).
     if self._chain is None:
         from hpf.hddb.db import Session,PDBSeqRes
         # Lowercase local so the Session class is not shadowed by its instance.
         # The session stays open because self._chain is read again just below.
         session = Session()
         parent_id = int(self._domain.parent_id[3:])
         if domain.sccs:
             self._chain = session.query(PDBSeqRes).filter_by(sequence_key=parent_id, chain=domain.sccs.chain).first()
         else:
             self._chain = session.query(PDBSeqRes).filter(PDBSeqRes.sequence_key==parent_id).first()
     self._pdbid = self._chain.pdb.pdbId+self._chain.chain
     self._seed = self._seed_alignment()
Ejemplo n.º 7
0
    def merge(self):
        """Create or load the family named self.familyName and import codeml if missing.

        When no Family with that name exists, the family, alignment and tree
        are created through self._family(), self._alignment() and self._tree().
        Otherwise self.alignment and self.tree are taken from the stored family.
        Codeml is imported through self._codeml() unless the first alignment's
        tree already has one. The session is committed on success and closed
        in every case, including when one of the steps raises.
        """
        from hpf.hddb.db import Session, Family
        self.session = Session()
        try:
            self.family = self.session.query(Family).filter(
                Family.name == self.familyName).first()
            if not self.family:
                runtime().debug("Creating family", self.familyName)
                self._family()
                self._alignment()
                self._tree()
            else:
                self.alignment = self.family.alignment
                self.tree = self.alignment.tree
                runtime().debug("Found family", self.family.id)

            if not self.family.alignments[0].tree.codeml:
                runtime().debug("Importing codeml")
                self._codeml()
            else:
                runtime().debug("Already found codeml",
                                self.family.alignments[0].tree.codeml.id)

            # Commit only once every step above has succeeded
            self.session.commit()
        finally:
            # Always release the session, even after a failed import step
            self.session.close()
Ejemplo n.º 8
0
Archivo: oid.py Proyecto: bsmithers/hpf
 def __getitem__(self, key):
     """Return the protein_key of the SequenceAc matching ``key``, or None.

     ``key`` is first translated through self._oid_amnh; the resulting
     accession is matched against SequenceAc.ac. The session opened for the
     lookup is always closed.
     """
     from hpf.hddb.db import Session, SequenceAc
     accession = self._oid_amnh[key]
     db = Session()
     try:
         match = db.query(SequenceAc).filter(SequenceAc.ac==accession).first()
         if not match:
             return None
         return match.protein_key
     finally:
         db.close()
Ejemplo n.º 9
0
    def setUp(self, ):
        """Fetch the fixture rows and build the two mappers used by the tests."""
        family_id, protein_id, domain_id = 19187, 1171456, 1307995

        session = Session()
        self.fam  = session.query(Family).get(family_id)
        self.prot = session.query(Protein).get(protein_id)
        self.dom  = session.query(Domain).get(domain_id)
        self.alignment = self.fam.alignment.alignment

        # The PDBSeqRes sequence key is parent_id minus its first three characters
        seqres_key = self.dom.parent_id[3:]
        self.psr  = session.query(PDBSeqRes).filter_by(sequence_key=seqres_key).first()

        self.ptpdba = ProteinToPDBAtom(
            self.prot, self.dom, self.alignment, pdbseqres=self.psr, inverse=False)
        # NOTE(review): primary keys, not ORM objects, are passed here - confirm
        # that is what the AlignmentToPDBMapper constructor expects.
        self.pdbmap = AlignmentToPDBMapper(
            self.fam.id, self.prot.id, self.dom.id, debug=True)
Ejemplo n.º 10
0
def main():
    """Export every Astral record's sequence to the file ``args.outfile``.

    Reads the module-level ``args`` for the output path. Each record is
    written as a header line built from sccs, pdbid, chain and sequence_key,
    followed by a line with the sequence. The DB session is closed on exit.
    """
    import sys
    from hpf.hddb.db import Session, Astral
    session = Session()
    try:
        with open(args.outfile, 'w') as handle:
            for astral in session.query(Astral):
                # Progress indicator; the carriage return keeps it on one console line
                sys.stdout.write("{0}\r".format(astral.sccs))
                handle.write(">{0}|{1}{2}|{3}\n".format(astral.sccs, astral.pdbid, astral.chain, astral.sequence_key))
                handle.write("{0}\n".format(astral.sequence.sequence))
    finally:
        session.close()
    print("Exporting SCOP sequences to file {0} complete".format(args.outfile))
Ejemplo n.º 11
0
def ids_from_db(experiment_key=1177):
    """Return the sequence keys of all proteins in one experiment.

    Retrieves a list of local (hpf db) sequence IDs for the sequences you want
    to calculate enrichment on. These must be the same ID as in the ID field
    of pfam/interpro results.

    experiment_key - experiment to read proteins from. Defaults to 1177, the
                     value previously hard-coded here (humanrna proteins,
                     773 list).

    The DB session is closed before returning.
    """
    session = Session()
    try:
        proteins = session.query(Protein).filter_by(experiment_key=experiment_key).all()
        return [protein.sequence_key for protein in proteins]
    finally:
        session.close()
Ejemplo n.º 12
0
    def setUp(self, ):
        """Replace the fixture IDs with their ORM rows and build the mapper."""
        # The attributes hold the IDs first and are then overwritten with the rows
        self.family, self.protein, self.domain = 19187, 1171456, 1307995

        db = Session()
        self.family = db.query(Family).get(self.family)
        self.protein = db.query(Protein).get(self.protein)
        self.domain = db.query(Domain).get(self.domain)

        self.pdbmap = AlignmentToPDBMapper(
            self.family, self.protein, self.domain, debug=True)
Ejemplo n.º 13
0
    def setUp(self, ):
        """Fetch the fixture Family/Protein/Domain rows and build the mapper."""
        self.family_id, self.protein_id, self.domain_id = 18883, 1063014, 1212855

        db = Session()
        self.family = db.query(Family).get(self.family_id)
        self.protein = db.query(Protein).get(self.protein_id)
        self.domain = db.query(Domain).get(self.domain_id)

        self.pdbmap = AlignmentToPDBMapper(
            self.family, self.protein, self.domain, debug=True)
Ejemplo n.º 14
0
def init_dict(experiment_key=1171):
    """Create and return a dict of the form sequence id => Signaprint object.

    One entry is made per distinct Protein.sequence_key in the experiment.
    Each Signaprint is constructed with only its sequence_id set.

    experiment_key - experiment to read proteins from. Defaults to 1171, the
                     value previously hard-coded here.

    The DB session is closed once the dict has been built.
    """
    from hpf.hddb.db import Session, Protein
    from sqlalchemy import distinct
    session = Session()
    try:
        seq_rows = session.query(distinct(Protein.sequence_key)).filter_by(experiment_key=experiment_key)
        signaprint_dict = dict()
        # Each row is a one-element tuple holding the sequence key
        for (seq,) in seq_rows:
            signaprint_dict[seq] = Signaprint(sequence_id=seq)
    finally:
        session.close()
    print("Dict of {0} sequence => Signaprints created".format(len(signaprint_dict)))
    return signaprint_dict
Ejemplo n.º 15
0
 def _make_seqfile(self, ):
     """Write the sequence with key self.sequence_id to self.sequence_file.

     The file gets a ">hpf_seqid|ID" header line followed by the sequence.
     Raises Exception when the sequence is not found. The DB session and the
     output file are closed even when a step fails.
     """
     from hpf.hddb.db import Session, Sequence
     session=Session()
     try:
         sequence = session.query(Sequence).get(self.sequence_id)
         if not sequence:
             raise Exception("Getting sequence object from database failed")
         with open(self.sequence_file, 'w') as outhandle:
             outhandle.write(">hpf_seqid|{0}\n".format(sequence.id))
             outhandle.write("{0}\n".format(sequence.sequence))
     finally:
         session.close()
Ejemplo n.º 16
0
def ids_from_db(experiment_key=1177):
    """Return the sequence keys of all proteins in one experiment.

    Retrieves a list of local (hpf db) sequence IDs for the sequences you want
    to calculate enrichment on. These must be the same ID as in the ID field
    of pfam/interpro results.

    experiment_key - experiment to read proteins from. Defaults to 1177, the
                     value previously hard-coded here (humanrna proteins,
                     773 list).

    The DB session is closed before returning.
    """
    session = Session()
    try:
        proteins = session.query(Protein).filter_by(experiment_key=experiment_key).all()
        return [protein.sequence_key for protein in proteins]
    finally:
        session.close()
Ejemplo n.º 17
0
Archivo: oid.py Proyecto: bsmithers/hpf
def index(records):
    """
    Renames record IDs in place, from OID name to HPF sequence_key.

    records - iterable of objects with an ``id`` attribute. Each id is looked
              up in the mapping returned by oid_amnh(), and the resulting
              accession is matched against SequenceAc.ac.

    Each record's id is replaced by the string form of the matching
    sequence_key. The lookup uses ``.one()``, so an accession that does not
    match exactly one row raises. The DB session is closed on exit.
    """
    from hpf.hddb.db import Session, SequenceAc
    mapping = oid_amnh()
    session = Session()
    try:
        for record in records:
            amnh = mapping[record.id]
            seq_ac = session.query(SequenceAc).filter(SequenceAc.ac==amnh).one()
            record.id = str(seq_ac.sequence_key)
    finally:
        session.close()
Ejemplo n.º 18
0
def tasks(experiment_id):
    """Return the unique sequence keys of all proteins in an experiment.

    experiment_id - value matched against Protein.experiment_key.

    Raises Exception when the number of rows read differs from the query's
    own count. The DB session is closed on exit.
    """
    print("Interpro driver::tasks:: Getting tasks (sequences) for experiment {0}".format(experiment_id))
    session = Session()
    try:
        proteins = session.query(Protein.sequence_key).filter_by(experiment_key=experiment_id)
        # Each row is a one-element tuple holding the sequence key
        experiment_seqs = [row[0] for row in proteins]
        # Sanity check against a separate COUNT of the same query
        if proteins.count() != len(experiment_seqs):
            raise Exception("Number of sequences extracted from experiment {0} does not match number of proteins".format(experiment_id))
    finally:
        session.close()
    unique_seqs = list(set(experiment_seqs))
    print("{0} tasks (unique sequences) from {1} sequences retrieved".format(len(unique_seqs), len(experiment_seqs)))
    return unique_seqs
Ejemplo n.º 19
0
def translate_foldables(dir, update_db=True):
    """Translates foldable fasta files in the manner described above. If update_db is true,
    adds new sequences translated to hpf.sequence table and updates hpf.filesystemOutfile 
    (foldable records) to link to the new sequence key.

    dir       - directory whose entries are examined
    update_db - when false, files are rewritten with the sequence id "None"
                and the DB is not touched

    A file whose prediction code cannot be parsed (IOError) is reported and
    skipped. Files without non-standard AA codes are left untouched. Raises
    Exception when no foldable record exists for a file's code.
    """
    from hashlib import sha1
    from hpf.hddb.reexport_foldables import write_fasta
    from hpf.hddb.db import push_to_db, Session, Sequence, FilesystemOutfile

    print("Translating foldable fastas found in dir {0}".format(dir))

    # NOTE(review): os.listdir() yields bare file names, so the parse/open calls
    # below resolve against the current working directory - confirm callers
    # run this from inside `dir`.
    for file in os.listdir(dir):
        try:
            code = parse_code_from_file(file)
        except IOError as e:
            print("{0}. Ignoring file..".format(e))
            # Really skip the file: otherwise `code` is undefined, or still
            # holds the previous file's value.
            continue
        sequence_key, sequence = parse_foldable_file(file)

        # Only sequences containing a non-standard AA code need translating
        if not re.search(nonstandard_pattern, sequence):
            continue

        print("{0} contains nonstandard AA codes. Translating".format(file))
        print("\tOriginal  : {0}".format(sequence))

        for nsaa, standard in non_standard.items():
            sequence = sequence.replace(nsaa, standard)

        print("\tTranslated: {0}".format(sequence))

        translated_seq_id = "None"
        if update_db:
            # Add new sequence to DB (push_ will return None if seq_dbo is already in DB)
            print("Adding translated sequence to the DB")
            digest = sha1(sequence).hexdigest()
            session = Session()
            try:
                seq_dbo = Sequence(sequence=sequence, sha1=digest)
                seq_dbo = push_to_db(session, seq_dbo, exception_str="Pushing sequence for code {0} failed".format(code), raise_on_duplicate=False)
                if not seq_dbo:
                    seq_dbo = session.query(Sequence).filter_by(sha1=digest).first()

                # Get foldable record and change seq key to new translated seq's id
                print("Updating foldable record from old ({0}) to new ({1}) sequence key".format(sequence_key, seq_dbo.id))
                foldable_dbo = session.query(FilesystemOutfile).filter_by(prediction_code=code).first()
                if not foldable_dbo:
                    raise Exception("Foldable record not found in DB for code {0}".format(code))
                foldable_dbo.sequence_key = seq_dbo.id
                # NOTE(review): only flush() is called here, as before - confirm
                # the session configuration makes this update permanent.
                session.flush()

                translated_seq_id = seq_dbo.id
            finally:
                session.close()

        print("Writing translated foldable to file {0}".format(file))
        with open(file, 'w') as handle:
            write_fasta(handle, translated_seq_id, len(sequence), sequence)
    print("Translating foldables complete")
Ejemplo n.º 20
0
def _get_decoy_struct(domain, probability_cutoff=0.8):
    """Return the structure of the domain's highest-probability decoy, or None.

    domain             - object providing ``outfile_key`` and ``id``
    probability_cutoff - minimum McmData probability required

    Uses the module-level ``session``, creating it on first use. Returns None
    (after printing why) when the domain has no McmData rows or when the best
    row's probability is below the cutoff.
    """
    global session
    if session is None:
        session = Session()

    # Get decoy with highest MCM value that meets probability cutoff
    mcm = session.query(McmData).filter_by(outfile_key=domain.outfile_key).order_by('probability desc').first()
    if not mcm:
        # This is a plain function, so the domain comes from the argument (there is no self)
        print("Domain {0} has no decoy structures".format(domain.id))
        return None
    if mcm.probability < probability_cutoff:
        print("Domain {0} structure (prob: {1}) does not meet cutoff {2}".format(domain.id, mcm.probability, probability_cutoff))
        return None
    return mcm.structure
Ejemplo n.º 21
0
    def __init__(self, sequence_key, nr_db, ginzu_version="4", dir=None, autorun=True, dbstore=True, debug=True):
        """Look up the sequence, derive the working file names and optionally run.

        Attributes of note:
        self.prediction - result of self.run() when autorun is true, else None
        self.dbo        - None here; per the original notes, the Psipred ORM
                          object from the HPF DB when dbstore=True (set in run)

        Raises Exception when no Sequence with the given key exists.
        """
        from hpf.hddb.db import Session, Sequence

        self.sequence_key = sequence_key
        # nr_db is normalised to an absolute path with "~" expanded
        self.nr_db = os.path.abspath(os.path.expanduser(nr_db))
        self.ginzu_version = ginzu_version
        # Fall back to the current working directory when no dir is given
        self.dir = dir or os.getcwd()
        self.dbstore = dbstore
        self.debug = debug

        # kdrew: no isfile() check on nr_db, because "nr" is not a file but a location

        self.session = Session()
        self.sequence = self.session.query(Sequence).get(self.sequence_key)
        if not self.sequence:
            raise Exception("No sequence with key {0} exists in DB".format(self.sequence_key))

        # Working file names are all derived from the sequence key
        key = self.sequence_key
        self.fasta_file = "{0}.fasta".format(key)
        self.chkpt_file = "{0}.chk".format(key)
        self.psipred_file = "{0}.psipred".format(key)

        # Both must exist before run() is called; run sets them (dbo optionally)
        self.prediction = None
        self.dbo = None

        if autorun:
            self.prediction = self.run()
Ejemplo n.º 22
0
Archivo: oid.py Proyecto: bsmithers/hpf
    def merge(self):
        """Create or load the family named self.familyName and import codeml if missing.

        When no Family with that name exists, the family, alignment and tree
        are created through self._family(), self._alignment() and self._tree().
        Otherwise self.alignment and self.tree are taken from the stored family.
        Codeml is imported through self._codeml() unless the first alignment's
        tree already has one. The session is committed on success and closed
        in every case, including when one of the steps raises.
        """
        from hpf.hddb.db import Session, Family

        self.session = Session()
        try:
            self.family = self.session.query(Family).filter(Family.name == self.familyName).first()
            if not self.family:
                runtime().debug("Creating family", self.familyName)
                self._family()
                self._alignment()
                self._tree()
            else:
                self.alignment = self.family.alignment
                self.tree = self.alignment.tree
                runtime().debug("Found family", self.family.id)

            if not self.family.alignments[0].tree.codeml:
                runtime().debug("Importing codeml")
                self._codeml()
            else:
                runtime().debug("Already found codeml", self.family.alignments[0].tree.codeml.id)

            # Commit only once every step above has succeeded
            self.session.commit()
        finally:
            # Always release the session, even after a failed import step
            self.session.close()
Ejemplo n.º 23
0
def cluster_driver(family_id):
    """Run cluster analysis on each domain structure of a family's first protein.

    family_id - primary key used to fetch the Family.

    Raises Exception when the family cannot be fetched. Domains without a
    valid structure, and domains whose cluster analysis fails, are reported
    and skipped. Reads the module-level ``samples`` and ``results_file``.
    The DB session is closed on exit.
    """
    session = Session()
    try:
        family = session.query(Family).get(family_id)
        if family is None:
            raise Exception("Family {0} could not be fetched from the database".format(family_id))

        # Get sites for family (+Sel and TODO: firedb)
        ps_sites = get_possel_sites(family)
        # DEBUG
        print("Start cluster analysis for family {0}".format(family.id))
        print(" ".join(["Family {0} sites of +sel: ".format(family.id), str(ps_sites)]))

        # Get repr protein from family (first one)
        protein = family.proteins[0]

        # Attempt clustering on structures for all domains in protein
        for domain in protein.domains:
            try:
                (pdb_id, pdb_chain, struct) = get_domain_struct(domain)
            except Exception as e:
                print(e)
                print("Domain {0} (Family {1}, Protein {2}) has no valid structure. Skipping..".format(
                    domain.id, family.id, protein.id
                ))
                continue

            # Get domain-specific sites
            domain_sites = get_domain_sites(family, protein, domain, ps_sites)
            # DEBUG
            print(" ".join([
                "Family {0}, Protein {1}, Domain {2} local sites: ".format(family.id, protein.id, domain.id),
                str(domain_sites),
            ]))

            # Create BioPDBStruct object to call cluster analysis on
            pdb_struct = BioPDBStruct(pdb_id, pdb_chain, debug=True)
            cluster_id_str = "Family {0}, Protein {1}, Domain {2}, Structure {3}".format(
                family.id, protein.id, domain.id, struct.id
            )
            try:
                pdb_struct.cluster_analysis(
                    domain_sites, sample_size=samples, store_file=results_file, tag=cluster_id_str, report=True
                )
            except Exception as e:
                # Best effort: report the failure and move on to the next domain
                print(e)
                print("Can not complete clustering analysis on domain {0}. Skipping..".format(domain.id))

        # DEBUG
        print("Clustering for domain/structs in Family {0} (Protein {1}) complete".format(family.id, protein.id))
    finally:
        session.close()
Ejemplo n.º 24
0
def tasks():
    """Return the ids of all families to process clustering on.

    Task -> a family id to process clustering on. The list holds the id of
    every Family with manually_curated=0 (a GENERAL version would get a list
    of proteins).

    Raises Exception when no families are found. The DB session is closed
    before returning.
    """
    session = Session()
    try:
        # TEST: to limit the query for testing, add .limit(5) before .all()
        families = session.query(Family).filter_by(manually_curated=0).all()
        family_ids = [fam.id for fam in families]
    finally:
        session.close()

    if not family_ids:
        raise Exception("No families to cluster were retrieved from the db")

    # TEST - manually populate family test set
    # family_ids = [19187, 18883]

    return family_ids
Ejemplo n.º 25
0
    def setUp(self, ):
        """Load the reference prediction and create a fresh one to compare with it."""
        from hpf.pdb.psipred import HPFPsipredWrap
        db_session = Session()
        self.sequence_key  = 8560575
        self.ginzu_version = 4
        self.sequence = db_session.query(Sequence).get(self.sequence_key).sequence

        # Stored prediction for the same sequence and ginzu version
        reference = db_session.query(PsipredORM).filter_by(
            sequence_key=self.sequence_key, ginzu_version=self.ginzu_version).first()
        self.reference_pred = reference.prediction

        self.db = BASEPATH + "small_db"

        # Run HPFPsipredWrap to get the new SS string
        wrapper = HPFPsipredWrap(sequence_key=self.sequence_key,
                                 nr_db=self.db,
                                 ginzu_version=self.ginzu_version,
                                 autorun=True,
                                 dbstore=False)
        self.created_pred = wrapper.get_prediction_string()
Ejemplo n.º 26
0
def _get_pdb_struct(domain, ):
    """Return the structure of the PDBSeqRes record matching the domain, or None.

    Uses the module-level ``session``, creating it on first use. Returns None
    (after printing why) when the domain has no parent_id or sccs record,
    when no PDBSeqRes row matches, or when the matching row has no structure.
    """
    global session
    if session is None:
        session = Session()

    # Check for objects needed to query for pdb struct.
    if domain.parent_id is None or domain.sccs is None:
        print("Domain {0} has no sccs record (cannot find PDB). Returning None..".format(domain.id))
        return None

    # Query to get the PDBSeqRes record matching the domain, return structure from PDBSeqRes.
    seq_key = domain.parent_id[3:]
    chain = domain.sccs.chain
    psr = session.query(PDBSeqRes).filter_by(sequence_key=seq_key, chain=chain).first()
    if not psr:
        print("No PDB record found for domain {0} (seq {1}, chain {2}). Returning None.".format(domain.id, seq_key, chain))
        return None
    if psr.structure is None:
        # Falls through: the None structure itself is what gets returned
        print("Domain {0} PDBSeqRes entry has no structure. Returning None..".format(domain.id))
    return psr.structure
Ejemplo n.º 27
0
def export_fasta(outhandle, experiment):
    """
    Write every protein sequence of an experiment to ``outhandle``.

    outhandle     - a filehandle for writing sequences to
    experiment    - the hpf.experiment DB id to fetch sequences from

    Each protein is written as a ">hpf|..." header line followed by its
    sequence. A progress counter is written to stdout. The caller keeps
    ownership of outhandle; the DB session is closed on exit.
    """
    print("Exporting all sequences from Experiment {0}".format(experiment))

    session = Session()
    try:
        proteins = session.query(Protein).filter_by(experiment_key=experiment)
        num_seqs = proteins.count()

        i = 0
        for protein in proteins:
            outhandle.write(">hpf|{0}|{1}|{2}|{3} ({4})\n".format(protein.sequence_key, protein.id, protein.experiment_key, protein.experiment.name, protein.experiment.short_name))
            outhandle.write("{0}\n".format(protein.sequence.sequence))
            # Progress shows the number of sequences completed before this one
            sys.stdout.write("{0}     of {1}\r".format(i, num_seqs))
            i += 1
    finally:
        session.close()

    print("{0} sequences from experiment {1} exported".format(i, experiment))
Ejemplo n.º 28
0
 def __enter__(self):
     """Open the DB session and both output files, write their headers, return self.

     A temporary directory is created when self.dir is not set.
     """
     if not self.dir:
         self.dir = mkdtemp()
     from hpf.hddb.db import Session
     self.session = Session()
     self.list_handle = open(self.mammoth_list,"w")
     self.info_handle = open(self.info_file,"w")

     # Header row for the info file: tab-separated column names
     info_header = "\t".join(McmDBExporter.columns)
     self.info_handle.write(info_header + "\n")
     # The list file starts with a title line followed by the working directory
     list_header = "MAMMOTH List\n%s" % self.dir
     self.list_handle.write(list_header + "\n")
     return self
Ejemplo n.º 29
0
def populate_mgi(sdict):
    """Set ``mgi`` on every value of ``sdict`` from its SequenceAc records.

    sdict - dict keyed by sequence key. Each value gets ``mgi`` set to the
            string "None" when no id is found, to the single id when exactly
            one distinct id is found, or to a list of ids otherwise.

    Ids are the ``ac2`` values of SequenceAc rows joined to proteins with
    that sequence key in experiment 1171; ac2 values equal to the string
    "None" are ignored. The DB session is closed on exit.
    """
    from hpf.hddb.db import Session, SequenceAc
    session = Session()
    mgi_count = 0
    try:
        for seq, entry in sdict.items():
            acs = session.query(SequenceAc).join(SequenceAc.protein).filter_by(sequence_key=seq, experiment_key=1171).all()
            mgi_ids = set(ac.ac2 for ac in acs if ac.ac2 != "None")
            if not mgi_ids:
                entry.mgi = "None"
                continue
            mgi_count += 1
            ids = list(mgi_ids)
            # A single id is stored bare; several are stored as a list
            entry.mgi = ids[0] if len(ids) == 1 else ids
    finally:
        session.close()
    print("{0} sequences assigned an MGI ID".format(mgi_count))
Ejemplo n.º 30
0
def main(experiment, outfile):
    """Write a tab-separated sequence key / accession / gene name map.

    experiment - value matched against Protein.experiment_key
    outfile    - path of the file to write (overwritten)

    The gene name is the text after "GN=" in the protein's accession
    description, or the string "None" when there is no such tag. The output
    file and the DB session are closed even when a step fails.
    """
    from hpf.hddb.db import Session, Protein
    session = Session()

    gn_pattern = r"GN=(?P<gene_name>\S+)"

    try:
        with open(outfile, 'w') as handle:
            proteins = session.query(Protein).filter_by(experiment_key=experiment)

            for p in proteins:
                gn_found = re.search(gn_pattern, p.ac.description)
                gene_name = gn_found.group('gene_name') if gn_found else "None"
                # The primary accession (ac) is written; ac2 is not used
                handle.write("{0}\t{1}\t{2}\n".format(p.sequence_key, p.ac.ac, gene_name))
                sys.stdout.write("protein: {0}    {1}               \r".format(p.sequence_key, gene_name))
    finally:
        session.close()
    print("Map complete")
Ejemplo n.º 31
0
 def _db_setup(self, ):
     """Open a DB session and load the Experiment row for self.experiment_id.

     Sets self.session and self.experiment. Raises ValueError when no
     experiment with that id exists.
     """
     print("FastaFile db_setup")

     # Create a session via hpf.hddb.db Session (a Session object from sessionmaker, sqlalchemy).
     self.session = Session()
     print("Connected to database: {0}".format(engine.url))

     # Query the DB session for the given experiment table (if none, no experiment of that ID).
     self.experiment = self.session.query(Experiment).filter(Experiment.id == self.experiment_id).first()
     if self.experiment is None:
         raise ValueError("Experiment {0} does not exist in the database {1}.".format(self.experiment_id, engine.url))
Ejemplo n.º 32
0
def main(prediction_code):
    """Prepare a working directory for a Rosetta ab initio run on one prediction code.

    Looks up the Domain and Foldable records for the code, creates the
    per-code directory under WORK_DIR, changes into it, writes the foldable
    sequence there as a FASTA file and runs HPFPsipredWrap on the foldable's
    sequence key.

    Raises Exception when either record is missing or NR_DB is not a file.
    os.mkdir raises when the per-code directory already exists. The DB
    session is closed on exit.
    """
    print("Running Rosetta ab init setup for code {0}".format(prediction_code))

    # Set up DB and query for foldable and domain records
    print("Opening session and querying DB for domain and foldable")
    session = Session()
    try:
        domain = session.query(Domain).filter_by(ibm_prediction_code=prediction_code).first()
        foldable = session.query(Foldable).filter_by(prediction_code=prediction_code).first()
        if not (domain and foldable):
            raise Exception("No domain or foldable record (or both) could be found for code {0}".format(prediction_code))

        # Check/make work dir. The test has to be on WORK_DIR itself: it used
        # to check BASE_DIR while creating WORK_DIR.
        if not os.path.isdir(WORK_DIR):
            os.makedirs(WORK_DIR)

        # Check NR
        if not os.path.isfile(NR_DB):
            raise Exception("Given NR database {0} not valid".format(NR_DB))

        # Make and change to working dir
        working_dir = os.path.join(WORK_DIR, prediction_code)
        os.mkdir(working_dir)
        print("Created working directory {0}".format(working_dir))
        os.chdir(working_dir)

        # Create foldable FASTA file
        fasta_file = os.path.join(working_dir, "{0}.fasta".format(prediction_code))
        with open(fasta_file, 'w') as handle:
            handle.write(">domain:{0}|code:{1}|foldable_seq_key:{2}\n".format(domain.id, foldable.prediction_code, foldable.sequence_key))
            handle.write("{0}\n".format(foldable.sequence.sequence))
        print("Created foldable record fasta file {0}".format(fasta_file))

        # Currently DB does not hold psipred ss2-format files. Run psipred, ignore return pred (only need file)
        print("Running Psipred on foldable sequence via HPFPsipredWrap...")
        HPFPsipredWrap(sequence_key=foldable.sequence_key, nr_db=NR_DB, ginzu_version=4, dir=working_dir, autorun=True, dbstore=True, debug=True)
    finally:
        session.close()

    # Complete and exit
    print("Setup for Rosetta ab initio complete. Clean up files as necessary")
    print("(NOTE: Psipred ss2 outfile (vformat) is the <seqkey>.psipred.2 file)")
Ejemplo n.º 33
0
def xml(id):
    """Write the feature XML document for one family.

    id - primary key used to fetch the Family.

    The output file is named after the family id with an ".xml" suffix. When
    the runtime GZIP option is set the file is gzip-compressed and ".gz" is
    appended to the name. The output handle and the DB session are both
    released even when opening the file or building the document fails.
    """
    from hpf.hddb.db import Session, Family
    session = Session()
    try:
        family = session.query(Family).get(id)
        filename = "%i.xml" % family.id

        if runtime().opt(GZIP):
            import gzip
            filename = "%s.gz" % filename
            handle = gzip.open(filename, "w")
        else:
            handle = open(filename, "w")

        try:
            doc = FamilyFeatureBuilder(
                lambda: DefaultXMLGenerator(handle, pretty=True),
                lambda handler: StructureFeatureProvider(handler),
                lambda handler: ColumnFeatureProvider(handler),
                lambda handler: IeaFeatureProvider(handler),
                lambda handler: SelectionFeatureProvider(handler))
            doc.buildDocument(family)
        finally:
            handle.close()
    finally:
        session.close()
Ejemplo n.º 34
0
class OIDImporter(object):
    """
    Import a set of OID files into the database.

    The constructor only records file locations and options; all database
    work happens in merge().

    Parameters (all stored verbatim as attributes of the same name):
      familyName        -- name used to look up / create the Family row
      alignFile         -- alignment file, read with Bio.AlignIO
      alignColcullLog   -- column culling log (four whitespace-separated
                           fields per line; first and last are used)
      alignSeqcullLog   -- sequence culling log ("<name> <score>" per line)
      treeFile          -- tree file, parsed with Bio.Nexus.Trees.Tree
      treeDiagCharsFile -- diagnostic characters file (DiagCharsParser)
      codemlFile        -- optional codeml output; when None the codeml
                           import step is a no-op
      alignFormat       -- Bio.AlignIO format name (default "fasta")
      oid_key           -- stored, but not used by any method in this class
    """
    def __init__(self,
                 familyName,
                 alignFile,
                 alignColcullLog,
                 alignSeqcullLog,
                 treeFile,
                 treeDiagCharsFile,
                 codemlFile=None,
                 alignFormat="fasta",
                 oid_key=None):
        self.familyName = familyName
        self.treeFile = treeFile
        self.treeDiagCharsFile = treeDiagCharsFile
        self.alignFile = alignFile
        self.alignColcullLog = alignColcullLog
        self.alignSeqcullLog = alignSeqcullLog
        self.codemlFile = codemlFile
        self.alignFormat = alignFormat
        self.oid_key = oid_key

    def merge(self):
        """
        Run the import inside a fresh DB session.

        If no Family named `familyName` exists, the family, its alignment
        and its tree are created from the input files; otherwise the
        existing alignment and tree are reused. The codeml import then runs
        unless the family's first alignment's tree already has a codeml
        record. The session is committed on success and always closed,
        including when a step raises (in which case nothing is committed).
        """
        from hpf.hddb.db import Session, Family
        self.session = Session()
        try:
            self.family = self.session.query(Family).filter(
                Family.name == self.familyName).first()
            if not self.family:
                runtime().debug("Creating family", self.familyName)
                self._family()
                self._alignment()
                self._tree()
            else:
                self.alignment = self.family.alignment
                self.tree = self.alignment.tree
                runtime().debug("Found family", self.family.id)

            if not self.family.alignments[0].tree.codeml:
                runtime().debug("Importing codeml")
                self._codeml()
            else:
                runtime().debug("Already found codeml",
                                self.family.alignments[0].tree.codeml.id)

            # Commit only once every step has succeeded.
            self.session.commit()
        finally:
            # Release the session even when an import step failed.
            self.session.close()

    def _index(self, name):
        """
        Extract the numeric key from a taxon / record name.

        Takes the text after the last "#" (the whole name if there is none),
        drops one leading "N" if present, and returns the remaining digits
        as a string, e.g. "rice#1182215" -> "1182215". Asserts that the
        result consists only of digits.
        """
        n = name.split("#")[-1]
        if n.startswith("N"):
            n = n[1:]
        assert n.isdigit()
        return n

    def _tree(self):
        """
        Load `treeFile`, store it as a Tree row with its nodes, and import
        the diagnostic characters from `treeDiagCharsFile`.

        Requires self.session and self.alignment to be set. Sets self.nexus
        (the parsed Bio tree) and self.tree (the DB object).
        """
        session = self.session

        from Bio.Nexus.Trees import Tree as NewickTree
        from hpf.hddb.db import Tree, TreeNodeFactory, TreeFactory
        from hpf.amnh.oid import DiagCharsParser

        # Read via a context manager so the file handle is not leaked.
        with open(self.treeFile) as handle:
            tree_str = handle.read()
        self.nexus = NewickTree(tree_str)

        # Rename all the taxa to their numeric keys.
        for id in self.nexus.get_terminals():
            node = self.nexus.node(id)
            node.data.taxon = self._index(node.data.taxon)

        # Create the DB object
        self.tree = Tree(alignment_key=self.alignment.id,
                         text=self.nexus.to_string(plain=False,
                                                   plain_newick=True),
                         filename=self.treeFile)
        session.add(self.tree)
        session.flush()

        # Now add in the node references
        self.nexus.name = self.tree.id
        assert self.tree.id is not None
        runtime().debug("Added tree", self.tree)
        nodes = list(TreeNodeFactory().create(self.nexus))
        for node in nodes:
            node.ancestor_node = node.ancestor.id if node.ancestor else None
            # Appending to the relationship should add the node to the session.
            self.tree.nodes.append(node)
            # Flushed per node; presumably so that each node has its id
            # before a descendant reads `node.ancestor.id` -- TODO confirm.
            session.flush()

        runtime().debug("Appended", len(nodes), "tree nodes")
        session.flush()

        # Now import the diagnostic characters and reference the nodes.
        biotree = TreeFactory(name_func=lambda node: str(node.id)).create(
            self.tree.nodes, self.tree.id)
        parser = DiagCharsParser(biotree)
        runtime().debug(self.treeDiagCharsFile)
        with open(self.treeDiagCharsFile) as handle:
            diagchars = list(parser.parse(handle))
            runtime().debug("DiagChars", len(diagchars))
            for d in diagchars:
                session.add(d)
        session.flush()

    def _codeml(self):
        """
        Import positive-selection models and sites from `codemlFile`.

        Returns immediately when no codeml file was given. Otherwise each
        parsed model is attached to self.tree and flushed, and each of its
        sites is attached to the model with its column mapped back to the
        original alignment index.
        """
        if not self.codemlFile:
            return
        assert self.family.id is not None
        assert self.tree.id is not None

        # We need to convert the columns to the original alignment indices
        mapper = CulledColumnMapper(self.alignment,
                                    self.alignment.culled_columns)
        parser = PositiveSelectionParser()
        models = list(parser.parse(self.codemlFile))
        runtime().debug("Found", len(models), "models")
        for i, model in enumerate(models):
            model.tree_key = self.tree.id
            self.session.add(model)
            # Flush so model.id is available for the sites below.
            self.session.flush()
            ps = list(model.ps)
            runtime().debug("Found", len(ps), "sites in model", model.model)
            for j, site in enumerate(ps):
                site.codeml_key = model.id
                # Indices in CodeML start at 1, convert to 0 and then map
                orig = site.column
                site.column = mapper[site.column - 1]
                runtime().debug("column", orig, "mapped to", site.column,
                                site.probability)
                try:
                    self.session.add(site)
                except Exception:
                    # Log which model/site failed, then propagate unchanged.
                    runtime().debug(i, ":", j, " failure on column", orig,
                                    "mapped to", site.column, site.probability)
                    raise
            runtime().debug("Finished with model")
            self.session.flush()

        # NOTE: a disabled (commented-out) import path based on a CodeML
        # record and LRTParser used to follow here; it was dead code.
        runtime().debug("finished import codeml")

    def _alignment(self):
        """
        Load `alignFile` and store the alignment, its member proteins and
        the column / sequence culling logs.

        Requires self.session and self.family to be set. Sets
        self.alignment (the DB object).
        """
        session = self.session

        from Bio import AlignIO
        from hpf.hddb.db import Alignment, AlignmentProtein, FamilyProtein
        from hpf.hddb.db import AlignmentColcull, AlignmentSeqcull

        # Read the alignment
        with open(self.alignFile) as handle:
            align = AlignIO.read(handle, self.alignFormat)
        # Rename 'id' with the correct protein key
        for record in align:
            record.id = self._index(record.id)
        # Write to a text buffer and create the DB object
        text = StringIO()
        AlignIO.write([align], text, self.alignFormat)
        self.alignment = Alignment(family_key=self.family.id,
                                   format=self.alignFormat,
                                   filename=self.alignFile,
                                   text=text.getvalue())
        # Add to session and flush
        session.add(self.alignment)
        session.flush()

        # Flip through the proteins in the alignment and add
        # the records.
        for record in align:
            protein_key = record.id
            assert protein_key != 0 and protein_key is not None, protein_key
            runtime().debug("protein: ", protein_key)
            s = AlignmentProtein(alignment_key=self.alignment.id,
                                 protein_key=protein_key,
                                 sequence=str(record.seq))
            session.add(s)
            session.flush()

            # There may exist multiple alignments, but the definition
            # of membership in the family is done here.
            fs = FamilyProtein(family_key=self.family.id,
                               protein_key=protein_key,
                               seed=True)
            session.merge(fs)

        # Now read the column culling log.  Indices start at 0 here.
        with open(self.alignColcullLog) as handle:
            for line in handle:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                # Only the first and last fields are stored.
                column, gap, taxa, ratio = line.split()
                col = AlignmentColcull(alignment_key=self.alignment.id,
                                       column=column,
                                       gap_percentage=ratio)
                session.merge(col)

        # Sequence culling log, one "<name> <score>" pair per line, e.g.
        #   rice#1182215    0.712765957446808
        with open(self.alignSeqcullLog) as handle:
            for line in handle:
                if not line.strip():
                    continue
                parts = line.split()
                seq, score = parts
                seq = self._index(seq)
                if not seq.isdigit():
                    # Only reachable when asserts are stripped (-O), since
                    # _index already asserts this. Raise explicitly; the old
                    # `assert false` referenced an undefined name.
                    raise AssertionError(
                        "{0} SEQ: {1}".format(parts, seq))
                cul = AlignmentSeqcull(alignment_key=self.alignment.id,
                                       protein_key=seq,
                                       score=score)
                # Previously the record was built but never handed to the
                # session; merged here to mirror the column-cull loop above.
                session.merge(cul)
        session.flush()

    def _family(self):
        """
        Create and flush a new Family row named `familyName` with
        experiment_key 0, storing it on self.family.
        """
        session = self.session
        from hpf.hddb.db import Family
        self.family = Family(name=self.familyName, experiment_key=0)
        session.add(self.family)
        session.flush()
Ejemplo n.º 35
0
def tasks():
    """
    Return the ids of all families whose `manually_curated` column is 0.

    Opens its own DB session and always closes it, including when the
    query raises. The result is a plain list of id values, one per row.
    """
    from hpf.hddb.db import Session, Family
    session = Session()
    try:
        rows = session.query(Family.id).filter(
            Family.manually_curated == 0).all()
    finally:
        session.close()
    # Each row is a one-element result tuple; unwrap it to the bare id.
    return [row[0] for row in rows]