Beispiel #1
0
def get_best_record_key(records):
    """Pick the "best" filtered BLAST record and return its identifying keys.

    Records are visited in order of decreasing bitscore. The first record
    whose best SCCS entry has confidence exactly 1.0 wins immediately;
    otherwise the record with the highest SCCS confidence (strictly greater
    than 0.0) wins, ties going to the higher bitscore.

    Args:
        records: non-empty list of LocalHumanFiltered objects (attributes
            used: query_id, hit_id, hit_protein_id, hit_experiment_id,
            hit_evalue, hit_bitscore). NOTE: the list is sorted in place.

    Returns:
        A 4-tuple. On a confidence-1.0 hit it is
        (query_id, protein.sequence_key, protein.id, protein.experiment_key);
        otherwise it is (query_id, hit_id, hit_protein_id, hit_experiment_id)
        of the chosen record, the last three coerced to int.
        NOTE(review): the two shapes presumably carry the same keys
        (hit_id == sequence key) -- confirm against the schema.

    Raises:
        Exception: if records is empty, if a record's protein cannot be
            fetched, or if no record yields an SCCS confidence above 0.0.
    """
    if not records:
        raise Exception("Given records list contains no records")

    # DEBUG: dump every candidate hit before ranking
    for rec in records:
        print("\tHit: {0}, {1}, {2}, eval: {3} bitscore {4}".format(
            rec.hit_id, rec.hit_protein_id, rec.hit_experiment_id,
            rec.hit_evalue, rec.hit_bitscore))

    session = ScopedSession()

    # Sort records by blast score (record.hit_bitscore), highest first
    records.sort(key=lambda record: record.hit_bitscore, reverse=True)

    best_sccs_conf = 0.0
    best_record = None
    for record in records:
        # Get protein ORM object matching record
        protein = session.query(Protein).get(record.hit_protein_id)
        if not protein:
            raise Exception(
                "No protein fetched for record protein ID {0}".format(
                    record.hit_protein_id))
        try:
            prot_best_sccs = _get_best_sccs(protein)
        except Exception as e:
            # Best effort: a protein without a usable SCCS entry is skipped
            print(e)
            continue

        confidence = float(prot_best_sccs.confidence)
        if confidence == 1.0:
            # Cannot do better than full confidence on the best remaining hit
            return (record.query_id, protein.sequence_key, protein.id,
                    protein.experiment_key)
        if confidence > best_sccs_conf:
            best_sccs_conf = confidence
            best_record = record

    if best_record is None:
        raise Exception("Could not find a best protein sequence")

    hit_id, hit_protein_id, hit_experiment_id = map(
        int, (best_record.hit_id, best_record.hit_protein_id,
              best_record.hit_experiment_id))
    # Use the chosen record's query id, not the leftover loop variable
    # (which is the lowest-bitscore record after the loop finishes).
    return (best_record.query_id, hit_id, hit_protein_id, hit_experiment_id)
Beispiel #2
0
def get_best_homolog(records):
    """Select the homolog with the best blast score and structural coverage.

    Candidates are visited from highest to lowest bitscore. A candidate
    whose structure score is exactly 1.0 is chosen at once; otherwise the
    candidate with the highest structure score above 0.0 is chosen. Records
    whose protein cannot be fetched, or whose scoring raises LookupError,
    are reported and skipped.

    Takes a list of Filtered objects (sorted in place) and returns a
    HomologComparisonProtein built from the chosen record.

    Raises LookupError if the list is empty or no candidate qualifies.
    """
    if not records:
        raise LookupError("Given records list contains no records")

    session = ScopedSession()

    # Highest blast bitscore first
    records.sort(key=lambda rec: rec.hit_bitscore, reverse=True)

    top_score = 0.0
    chosen = None  # (record, protein) pair of the current winner
    for candidate in records:
        hit_protein = session.query(Protein).get(candidate.hit_protein_id)
        if not hit_protein:
            print("No protein fetched for query {0} protein ID {1}. Skipping protein..".format(
                candidate.query_id, candidate.hit_protein_id))
            continue

        try:
            score = structure_score(hit_protein)
        except LookupError as err:
            print("Exception {0}. Skipping protein {1}..".format(err, hit_protein.id))
            continue

        if score == 1.0:
            # Full score on the best remaining blast hit: stop searching
            chosen = (candidate, hit_protein)
            break
        if score > top_score:
            top_score = score
            chosen = (candidate, hit_protein)

    if chosen is None:
        raise LookupError("Could not find a protein with structure")
    winner, winner_protein = chosen

    # Structure list for the winning protein
    hit_structures = get_structure_ids(winner_protein)

    # HomologComparisonProtein prototype:
    # (self, query_id, hit_id, hit_protein, hit_experiment, evalue, bitscore, structures, num_domains)
    return HomologComparisonProtein(
        winner.query_id,
        winner.hit_id,
        winner.hit_protein_id,
        winner.hit_experiment_id,
        winner.hit_evalue,
        winner.hit_bitscore,
        hit_structures,
        len(winner_protein.domains),
    )
def get_best_record_key(records):
    """Return identifying keys for the "best" filtered BLAST record.

    "Best" means: among records ordered by decreasing bitscore, the first
    one whose best SCCS entry has confidence exactly 1.0; failing that, the
    one with the highest SCCS confidence strictly above 0.0 (ties resolved
    in favour of the higher bitscore).

    Args:
        records: non-empty list of LocalHumanFiltered objects (attributes
            used: query_id, hit_id, hit_protein_id, hit_experiment_id,
            hit_evalue, hit_bitscore). NOTE: the list is sorted in place.

    Returns:
        A 4-tuple. For a confidence-1.0 hit:
        (query_id, protein.sequence_key, protein.id, protein.experiment_key).
        Otherwise: (query_id, hit_id, hit_protein_id, hit_experiment_id) of
        the chosen record, with the last three converted to int.
        NOTE(review): both shapes presumably hold the same kind of keys
        (hit_id == sequence key) -- confirm against the schema.

    Raises:
        Exception: if records is empty, if the protein for a record cannot
            be fetched, or if no record has an SCCS confidence above 0.0.
    """
    if not records:
        raise Exception("Given records list contains no records")

    # DEBUG: list all candidate hits
    for rec in records:
        print("\tHit: {0}, {1}, {2}, eval: {3} bitscore {4}".format(
            rec.hit_id, rec.hit_protein_id, rec.hit_experiment_id, rec.hit_evalue, rec.hit_bitscore
        ))

    session = ScopedSession()

    # Sort records by blast score (record.hit_bitscore), highest first
    records.sort(key=lambda record: record.hit_bitscore, reverse=True)

    best_sccs_conf = 0.0
    best_record = None
    for record in records:
        # Get protein ORM object matching record
        protein = session.query(Protein).get(record.hit_protein_id)
        if not protein:
            raise Exception("No protein fetched for record protein ID {0}".format(record.hit_protein_id))
        try:
            prot_best_sccs = _get_best_sccs(protein)
        except Exception as e:
            # Best effort: skip proteins for which no SCCS entry can be found
            print(e)
            continue

        confidence = float(prot_best_sccs.confidence)
        if confidence == 1.0:
            # Full confidence on the best remaining blast hit: done
            return (record.query_id, protein.sequence_key, protein.id, protein.experiment_key)
        if confidence > best_sccs_conf:
            best_sccs_conf = confidence
            best_record = record

    if best_record is None:
        raise Exception("Could not find a best protein sequence")

    hit_id, hit_protein_id, hit_experiment_id = map(
        int, (best_record.hit_id, best_record.hit_protein_id, best_record.hit_experiment_id)
    )
    # Report the query id of the record actually chosen; after the loop
    # `record` is merely the last (lowest-bitscore) record visited.
    return (best_record.query_id, hit_id, hit_protein_id, hit_experiment_id)
Beispiel #4
0
def _get_pdb_struct(domain,):
    """Look up the PDB structure behind a known-structure domain.

    Returns a triple (pdb id, chain, structure ORM object) taken from the
    PDBSeqRes row matching the domain.

    Raises Exception when the domain lacks a parent_id or sccs record, when
    no PDBSeqRes row matches, or when the matching row has no structure.
    """
    session = ScopedSession()

    # Both attributes are required to build the PDBSeqRes query.
    if domain.parent_id is None or domain.sccs is None:
        raise Exception("Domain {0} has no sccs record (cannot find PDB)".format(domain.id))

    # The sequence key is parent_id minus its first three characters
    # (presumably a type prefix -- TODO confirm the parent_id format).
    seq_key = domain.parent_id[3:]
    chain_id = domain.sccs.chain

    seqres = session.query(PDBSeqRes).filter_by(sequence_key=seq_key, chain=chain_id).first()
    if not seqres:
        raise Exception(
            "No PDB record found for domain {0} (seq {1}, chain {2})".format(domain.id, seq_key, chain_id)
        )
    if seqres.structure is None:
        raise Exception("Domain {0} PDBSeqRes entry has no structure".format(domain.id))

    return (seqres.pdb.pdbId, seqres.chain, seqres.structure)
Beispiel #5
0
def db_add(
    domain_id,
    astral_id,
    astral_sid,
    domain_start,
    domain_stop,
    astral_start,
    astral_stop,
    pdb_id,
    chain,
    overlap,
    session=None,
):
    """Build a DomainAstralOverlap ORM object and flush it to the database.

    When no session is supplied, a scoped session is created for the push.
    astral_start / astral_stop are stored (as ints) only when both are
    given; if either is None, neither is passed to the ORM constructor.

    A duplicate row (IntegrityError on flush) is reported and rolled back
    rather than raised; any other flush error is reported and re-raised.

    Returns the DomainAstralOverlap object (also after a rolled-back
    duplicate).
    """
    if not session:
        print("db_add:: No session provided. Creating scoped session")
        session = ScopedSession()

    # Fields that are always present on the overlap record
    fields = dict(
        domain_id=domain_id,
        astral_id=astral_id,
        astral_sid=astral_sid,
        domain_start=domain_start,
        domain_stop=domain_stop,
        pdb_id=pdb_id,
        chain=chain,
        overlap=overlap,
    )
    # The astral range is all-or-nothing
    if astral_start is not None and astral_stop is not None:
        fields["astral_start"] = int(astral_start)
        fields["astral_stop"] = int(astral_stop)

    dtoa_obj = DomainAstralOverlap(**fields)
    session.add(dtoa_obj)
    try:
        session.flush()
    except IntegrityError:
        print("DomainToAstral {0} object already in database. Skipping".format(dtoa_obj))
        session.rollback()
    except Exception:
        print("Error in pushing DomainToAstral object {0} to database".format(dtoa_obj))
        raise
    return dtoa_obj
    def pairwise_max_sum(self, source_list, target_list):
        """Sum, over source structures, the best comparison z-score against any target.

        For every non-None source structure, the AstralComparison table is
        searched for a row pairing it with each non-None target structure
        (in either prediction/experiment orientation). The largest z-score
        found for that source (floor 0.0) is added to the running total.
        None entries and pairs without a stored comparison add nothing.

        Returns the total as a float.
        """
        session = ScopedSession()

        def fetch_comparison(prediction, experiment):
            # First AstralComparison row for this exact orientation, or None
            return session.query(AstralComparison).filter_by(prediction=prediction, experiment=experiment).first()

        total = 0.0
        for src in source_list:
            if src is None:
                continue

            best_for_src = 0.0
            for tgt in target_list:
                if tgt is None:
                    continue

                # Try (source, target) first, then the swapped orientation
                comparison = fetch_comparison(src, tgt) or fetch_comparison(tgt, src)
                if not comparison:
                    continue

                zscore = float(comparison.zscore)
                if zscore > best_for_src:
                    best_for_src = zscore

            total += best_for_src
        return total
def output_db(list):
    """Store HomologComparisonProteinPair objects as HomologStructAllVAll rows.

    Each pair becomes one row (source query id, target query id, similarity
    score) that is added to a scoped session and flushed straight away.
    A pair already in the database (IntegrityError) is reported, rolled
    back and skipped; any other failure is reported and re-raised.

    The parameter name `list` shadows the builtin but is kept so existing
    callers are unaffected.
    """
    session = ScopedSession()
    for pair in list:
        row = HomologStructAllVAll(
            source_id=pair.source.query_id,
            target_id=pair.target.query_id,
            score=pair.similarity,
        )
        session.add(row)
        try:
            session.flush()
        except IntegrityError:
            print("Object {0} already in database. Rolling back and skipping..".format(row))
            session.rollback()
        except:
            # Report, then let the original error propagate unchanged
            print("Could not add protein comparison object {0} to DB".format(row))
            raise
Beispiel #8
0
 def _setup_db(self, ):
     if not self.session:
         from hpf.hddb.db import ScopedSession
         self.session = ScopedSession()
Beispiel #9
0
class HpfHHPredResultFile(HHPredResultFile):
    """HHPredResultFile bound to the HPF database.

    On top of the parent class this:
      * requires an hhpred version key and parses sequence_key out of the
        parsed 'query' field;
      * exposes 'dbo', a property holding the matching hpf.hddb.db ORM
        object (None while the result file is not represented in the DB);
      * offers 'db_store' / 'db_store_hits' to push the file and its hits;
      * keeps one session on the instance to avoid creating it repeatedly.
    """

    def __init__(self, version_key=None, *args, **kwargs):
        # version_key is mandatory even though it has a default
        if not version_key:
            raise Exception("Hpf HHPredResultFile must be provided with a version key. See hhpred_version DB table")
        super(HpfHHPredResultFile, self).__init__(*args, **kwargs)
        self.version_key = version_key
        # self.query is read after the parent initializer has run
        self.sequence_key = self._parse_seqkey(self.query)
        self.session = None
        self._dbo = None

    def _get_dbo(self):
        # Serve the cached ORM object; otherwise (re)query the DB each time
        if self._dbo:
            return self._dbo
        self._dbo = self._fetch_dbo()
        return self._dbo

    def _set_dbo(self, value):
        self._dbo = value

    dbo = property(_get_dbo, _set_dbo)

    def _parse_seqkey(self, str):
        """Extract the HPF sequence key from an HHPred 'query' string.

        Expected form: 'hpf|<seqkey>|<proteinkey>|<experimentkey>|<experiment>'.
        Returns the second '|'-separated field as an int.
        """
        fields = str.split('|')
        return int(fields[1])

    def _setup_db(self):
        """Create and cache a scoped DB session if the instance has none."""
        if self.session:
            return
        from hpf.hddb.db import ScopedSession
        self.session = ScopedSession()

    def _fetch_dbo(self):
        """Look for an existing HHPRF row with this sequence key and version.

        Returns the first matching ORM object, or None when there is none.
        """
        from hpf.hddb.db import HHPredResultFile as HHPRF
        if not self.session:
            self._setup_db()
        matches = self.session.query(HHPRF).filter_by(
            sequence_key=self.sequence_key,
            version_key=self.version_key,
        )
        return matches.first()

    def db_store(self):
        """Push this result file to the HPF database (via hpf.hddb.db.HHPredRFFactory).

        If the file is already represented in the DB, nothing is written.
        Otherwise an ORM object is created, pushed, and cached as 'dbo'.
        Returns the ORM object in both cases.
        """
        if not self.session:
            self._setup_db()

        if self.dbo:
            print("HHPredResultFile already represented in DB: {0}. Returning...".format(self.dbo))
            return self.dbo

        from hpf.hddb.db import HHPredRFFactory, push_to_db
        hrf_dbo = HHPredRFFactory().create(self, debug=self.debug)
        push_to_db(self.session, hrf_dbo, exception_str="Failed to add {0} to the DB".format(hrf_dbo))
        self.dbo = hrf_dbo
        return self.dbo

    def db_store_hits(self):
        """Push every Hit in self.hits to the DB through its own db_store().

        The result file itself is pushed first if it is not yet in the DB.
        Raises Warning if there are no hits, or if the number of stored
        results differs from the number of parsed hits.
        Returns the list of values returned by the hits' db_store() calls.
        """
        if not self.hits:
            raise Warning("No hit objects to push to DB")
        if not self.dbo:
            self.db_store()
        if not self.session:
            self._setup_db()

        rf_key = self.dbo.id
        hit_dbos = [hit.db_store(self.session, rf_key) for hit in self.hits]

        if self.debug:
            print("Hits parsed: {0}, Hits in DB: {1}".format(len(self.hits), len(hit_dbos)))
        if len(self.hits) != len(hit_dbos):
            raise Warning("Number of hits parsed from file does not match number added to DB")
        return hit_dbos