コード例 #1
0
def astral_to_domain(experiment_id, threshold=0.5, dbstore=False):
    """
    Compute the astral overlap for every known-type domain of an experiment.

    Fetches all 'psiblast' and 'fold_recognition' domains whose protein belongs
    to the given experiment, looks up the astrals for each domain and computes
    the overlap between the astral range and the domain's parent region.
    Progress and summary counts are printed to stdout.

    Parameters:
        experiment_id   -   experiment key used to select proteins (and so domains)
        threshold       -   minimum overlap ratio required to store an overlap
                            (converted with float(); only used when dbstore is true)
        dbstore         -   when true, overlaps >= threshold are pushed to the hpf DB
                            as AstralDomainOverlap rows; when false nothing is stored
    Return:
        None
    Raises:
        Re-raises any non-ValueError exception hit while computing an overlap,
        after printing which domain/astral pair triggered it.
    """
    from hpf.hddb.db import Session, push_to_db, AstralDomainOverlap, Protein, Domain
    from hpf.structure_comparison.overlap import overlap
    from hpf.structure_comparison.astral_util import get_astrals, get_astral_startstop, parse_astral_chain

    # Convert the cutoff once instead of once per astral; skipped when nothing is stored
    min_overlap = float(threshold) if dbstore else None

    # Create session and get all domains
    session = Session()
    domains = (session.query(Domain)
                      .join(Protein)
                      .filter(Protein.experiment_key == experiment_id)
                      .filter(Domain.domain_type.in_(['psiblast', 'fold_recognition']))
                      .all())
    # NOTE: single-argument print(...) emits the same text under the Python 2 print statement
    print("Considering {0} domains to compute astral overlap for".format(len(domains)))

    # For each domain, get representative astrals, calculate overlap, and store (optional)
    missing_astral = 0
    for domain in domains:
        domain_pdb_start = domain.region.parent_start
        domain_pdb_stop = domain.region.parent_stop

        astrals = get_astrals(domain, session)
        if not astrals:
            missing_astral += 1
            continue

        for astral in astrals:
            try:
                (astral_start, astral_stop) = get_astral_startstop(astral)
                overlap_ratio = overlap(astral_start, astral_stop, domain_pdb_start, domain_pdb_stop)
            except ValueError:
                # Reported as a negative overlap; this astral is skipped, the rest are still processed
                print("Negative overlap for domain {0} ({1}-{2}), Astral {3} (PDB {4}{5})".format(
                        domain.id, domain_pdb_start, domain_pdb_stop, astral.sid, astral.pdbid, astral.chain))
                print("Ignoring, moving to next astral..")
                continue
            except Exception:
                # Say which pair failed, then let the original error propagate.
                # (Exception rather than a bare except, so Ctrl-C / SystemExit are not
                # reported as an overlap calculation error.)
                print("Error calculating overlap for domain {0} ({1}-{2}), Astral {3} (PDB {4}{5})".format(
                        domain.id, domain_pdb_start, domain_pdb_stop, astral.sid, astral.pdbid, astral.chain))
                raise

            if dbstore and overlap_ratio >= min_overlap:
                chain = parse_astral_chain(astral.chain)
                atod_dbo = AstralDomainOverlap(astral_id=astral.id,
                                               astral_sid=astral.sid,
                                               domain_id=domain.id,
                                               astral_start=astral_start,
                                               astral_stop=astral_stop,
                                               domain_start=domain_pdb_start,
                                               domain_stop=domain_pdb_stop,
                                               pdb_id=astral.pdbid,
                                               chain=chain,
                                               overlap=overlap_ratio,
                                              )
                # raise_on_duplicate=False: presumably an already-stored overlap is tolerated
                # rather than raised -- confirm against push_to_db
                push_to_db(session, atod_dbo,
                           exception_str="Error in pushing AstralDomainOverlap {0} to DB".format(atod_dbo),
                           raise_on_duplicate=False)

    print("Calculating astral to domain overlap for experiment {0} complete".format(experiment_id))
    print("{0} of {1} known-structure domains had no astral entries".format(missing_astral, len(domains)))
コード例 #2
0
def _astral_structure_keys(domain):
    """
    Pick the structure keys of the best-scoring set of astrals for a domain.

    Parameters:
        domain  -   domain object with a non-empty astral_domain_overlap collection
    Return:
        [int]   -   list of structure keys; [] if the graph is too large or no path is found
    """
    # Remove astrals with overlap less than cutoff (float() so the column type does not matter)
    thresholded_astral_overlaps = [a for a in domain.astral_domain_overlap
                                   if float(a.overlap) >= ASTRAL2DOMAIN_OVERLAP_CUTOFF]

    # Remove astrals with same structure key (just in case); the last one seen is kept
    astral_struct_dict = dict()
    for a in thresholded_astral_overlaps:
        astral_struct_dict[a.astral.structure_key] = a
    clean_astral_overlaps = list(astral_struct_dict.values())

    # Zero or one candidate: nothing to combine, so no graph is needed
    if len(clean_astral_overlaps) < 2:
        return [a.astral.structure_key for a in clean_astral_overlaps]

    # Remove astrals with the same astral_start/astral_stop pair as one already kept
    seen_start_stop = set()
    nonredun_astral_overlaps = []
    for a in clean_astral_overlaps:
        start_stop = (a.astral_start, a.astral_stop)
        if start_stop not in seen_start_stop:
            seen_start_stop.add(start_stop)
            nonredun_astral_overlaps.append(a)
    clean_astral_overlaps = nonredun_astral_overlaps

    # Imported here (after the early return) so the simple cases do not need networkx
    import warnings
    import networkx
    from itertools import combinations
    from hpf.graph import max_score_path, path_score
    from hpf.structure_comparison.overlap import overlap

    # Build up graph of thresholded astrals.
    ## Node ID is astral structure key
    ## Score is overlap ratio * astral length (astral_stop - astral_start + 1)
    dag = networkx.DiGraph()
    for a in clean_astral_overlaps:
        dag.add_node(a.astral.structure_key, score=(a.overlap * (a.astral_stop - a.astral_start + 1)))

    ## Build up graph edges. Two astrals are connected when their mutual overlap is
    ## below ASTRAL2ASTRAL_OVERLAP_CUTOFF, i.e. they are (nearly) non-overlapping
    for (a, b) in combinations(clean_astral_overlaps, 2):
        a2a_overlap = overlap(a.astral_start, a.astral_stop, b.astral_start, b.astral_stop)
        if a2a_overlap < ASTRAL2ASTRAL_OVERLAP_CUTOFF:
            dag.add_edge(a.astral.structure_key, b.astral.structure_key)

    # Check for overlarge graph before starting the path search
    if len(dag.edges()) > 50:
        warnings.warn("Too many redundant nodes in graph for domain {0}, skipping..".format(domain))
        return []

    # Find the max scoring path starting from each node, keeping the overall best
    ## NOTE: a path is a list of nodes, and a node is just an astral structure key
    max_score = -1
    max_path = None
    for node in dag.nodes():
        path = max_score_path(dag, node)
        score = path_score(dag, path)
        if score > max_score:
            max_score = score
            max_path = path
    return max_path if max_path else []


def _mcm_structure_keys(mcmdata, limit=5):
    """
    Return the structure keys of the most probable, de-duplicated MCM entries.

    Parameters:
        mcmdata -   iterable of objects with structure_key and probability attributes
        limit   -   maximum number of keys returned (highest probability first)
    Return:
        [int]   -   list of structure keys, at most `limit` long
    """
    # Remove MCM entries with the same struct key, keeping that with the better probability.
    # Probabilities are compared as floats so de-duplication agrees with the sort below.
    # On a tie the first entry seen is kept.
    mcm_dict = dict()
    for m in mcmdata:
        if m.structure_key not in mcm_dict:
            mcm_dict[m.structure_key] = m
        elif float(m.probability) > float(mcm_dict[m.structure_key].probability):
            mcm_dict[m.structure_key] = m

    # Sort the unique MCM entries by probability (high to low), keeping the highest `limit`
    mcms = sorted(mcm_dict.values(), key=lambda k: float(k.probability), reverse=True)[:limit]
    return [m.structure_key for m in mcms]


def structure_representation(domain):
    """
    Choose the structures that represent a domain, based on its domain type.

    "psiblast" / "fold_recognition" domains are represented by astrals taken from
    domain.astral_domain_overlap; "pfam" / "msa" / "unassigned" domains by the five
    most probable entries of domain.mcmdata. Any other domain type, or a domain
    with no astral overlaps / MCM data, yields an empty list.

    Parameters:
        domain  -   hpf.hddb.db.Domain object
    Return:
        [int]   -   list of hpf.hddb.Structure keys (IDs)
    """
    if domain.domain_type in ("psiblast", "fold_recognition"):
        if domain.astral_domain_overlap:
            return _astral_structure_keys(domain)
    elif domain.domain_type in ("pfam", "msa", "unassigned"):
        if domain.mcmdata:
            return _mcm_structure_keys(domain.mcmdata)
    return []