def harvest_elegiable_donor_sites(self,projected_donors={},forced_codingblock_ends={},next=None,
    store_all_projected_sites=False,
    allow_phase_shift=False,
    enlarge_5p_boundary_by=None, # in AA coordinates
    enlarge_3p_boundary_by=None, # in AA coordinates
    ALIGNED_DONOR_MAX_TRIPLET_DISTANCE=None,
    MIN_DONOR_PSSM_SCORE=None,ALLOW_NON_CANONICAL_DONOR=False,
    NON_CANONICAL_MIN_DONOR_PSSM_SCORE=None ):
    """
    Harvest elegiable donor sites from this CodingBlockGraph into a DonorSiteCollectionGraph

    Per organism the considered splice site range is determined, the Orf is
    scanned for donor sites, and projected sites plus scanned sites that fall
    inside the range are added as nodes ( organism, orf id, position ).
    After that, for every pacbporf in self.pacbps, the sites of both organisms
    are compared pairwise and an edge is added for each pair that passes the
    phase and distance checks.

    @type  projected_donors: dict
    @param projected_donors: organism -> iterable of projected site objects
            (each with a ``pos`` attribute); only read, never modified

    @type  forced_codingblock_ends: dict
    @param forced_codingblock_ends: organism -> site object (with a ``pos``
            attribute) that is added as the node for that organism; no splice
            sites are scanned for organisms listed here; only read

    @type  next: CodingBlockGraph, LowSimilarityRegionCodingBlockGraph or None
    @param next: the next graph (checked by class name); when it is a
            LowSimilarityRegionCodingBlockGraph only the forced ends are added

    @type  store_all_projected_sites: Boolean
    @param store_all_projected_sites: when True, projected sites are added
            even when they fall outside the considered splice site range

    @type  allow_phase_shift: Boolean
    @param allow_phase_shift: when True, two donor sites of different phase
            may still be linked if their aligned distance is at most
            MAX_SPLICE_SITE_PHASE_SHIFT_NT_DISTANCE and both PSSM scores are
            at least MIN_DONOR_SITE_PHASE_SHIFT_PSSM_SCORE

    @type  enlarge_5p_boundary_by: integer or None
    @param enlarge_5p_boundary_by: AA count by which the lower range boundary
            is moved upstream (times 3 in nt coordinates)

    @type  enlarge_3p_boundary_by: integer or None
    @param enlarge_3p_boundary_by: AA count by which the upper range boundary
            is moved downstream (times 3 in nt coordinates)

    @type  ALIGNED_DONOR_MAX_TRIPLET_DISTANCE: integer
    @param ALIGNED_DONOR_MAX_TRIPLET_DISTANCE: maximal distance, in triplets,
            between two aligned sites; it is multiplied by 3 in every distance
            check, so leaving it at None fails as soon as 2 sites are compared

    @param MIN_DONOR_PSSM_SCORE: handed to the Orf splice site scan and stored
            on the returned graph as MIN_PSSM_SCORE
    @param ALLOW_NON_CANONICAL_DONOR: handed to the Orf splice site scan
    @param NON_CANONICAL_MIN_DONOR_PSSM_SCORE: handed to the Orf splice site scan

    @rtype:  DonorSiteCollectionGraph
    @return: DonorSiteCollectionGraph instance filled with nodes and edges

    @raise InproperlyAppliedArgument: when next is given but is neither a
            CodingBlockGraph nor a LowSimilarityRegionCodingBlockGraph
    @raise TypeError: when a compared site object is not a SpliceDonor,
            ProjectedSpliceDonor or CodingBlockEnd
    """

    if next and next.__class__.__name__ not in ["CodingBlockGraph","LowSimilarityRegionCodingBlockGraph"]:
        message = "next must be a CodingBlock graph object, not a %s" % next.__class__.__name__
        raise InproperlyAppliedArgument(message)

    # update minimal pssm score to stg collection object
    stg = DonorSiteCollectionGraph()
    stg.MIN_PSSM_SCORE = MIN_DONOR_PSSM_SCORE
    stg.ALIGNED_SITE_AA_OFFSET = ALIGNED_DONOR_MAX_TRIPLET_DISTANCE

    # the class of `next` does not change while looping; resolve it once
    next_is_lowsimilarity = next.__class__.__name__ == "LowSimilarityRegionCodingBlockGraph"

    # First, process each individual organism.
    # (A) obtain elegiable splice site range
    # (B) scan for splice sites
    # (C) add the projected sites to the graph
    # (D) add splice sites to the stg collection graph
    for org in self.organism_set():
        # take the first (and only) orf of this organism
        theorf = self.get_orfs_of_graph(organism=org)[0]
        has_forced_end = org in forced_codingblock_ends

        if has_forced_end:
            # the node that represents this forced site
            cbgEnd = forced_codingblock_ends[org]
            cbgEndNode = ( org,theorf.id,cbgEnd.pos )
            # add to the collection graph; the considered range is still set
            # below, but no splice sites are scanned for this organism
            stg.add_node_and_object(cbgEndNode,cbgEnd)

        if next_is_lowsimilarity:
            # all `donor` boundaries are hard-set;
            # no splice_site_range or actual site prediction needed
            continue

        ########################################################################
        ### get the considered splice site range
        ########################################################################

        # calculate considered splice site range based on EOF Orf object
        # take theorf.endPY + 2 (two) !, because EOF Orf is the start of the
        # STOP codon. Example:
        # ... tca TAG tac gtc ...
        # ... tca                   EOF Orf
        #         TAG               STOP codon
        #     ..a taG Tac gt.       perfect DONOR Site; PSSM-score ~7.7
        # Only the nt coordinates are used further on; the aa coordinates
        # returned by the helper functions are ignored.
        (_min_aa_pos, min_nt_pos) = self.minimal_eligable_donor_site_position(org)
        max_nt_pos = theorf.endPY+2

        if next and org in next.organism_set():
            (_next_max_aa_pos, next_max_nt_pos) = self.maximal_eligable_donor_site_position(org,nextcbg=next)
        else:
            (_next_max_aa_pos, next_max_nt_pos) = self.maximal_eligable_donor_site_position(org)
        if next_max_nt_pos < max_nt_pos:
            # the maximal elegiable position lies before the end of the orf;
            # use it as the upper boundary of the range
            max_nt_pos = next_max_nt_pos

        # boundaries are enlarged in AA coordinates, so times 3 for nt
        if enlarge_5p_boundary_by:
            min_nt_pos = min_nt_pos - (enlarge_5p_boundary_by*3)
        if enlarge_3p_boundary_by:
            max_nt_pos = max_nt_pos + (enlarge_3p_boundary_by*3)

        # set range to stg Collection objects
        stg.set_consideredsplicesiterange(org,min_nt_pos,max_nt_pos)

        if has_forced_end:
            # ready with this organism, no splice site setting!
            continue

        ########################################################################
        ### obtain splice sites for current collection
        ########################################################################

        # scan for splice sites
        theorf.scan_orf_for_pssm_splice_sites(splicetype="donor",
                min_pssm_score=MIN_DONOR_PSSM_SCORE,allow_non_canonical=ALLOW_NON_CANONICAL_DONOR,
                non_canonical_min_pssm_score=NON_CANONICAL_MIN_DONOR_PSSM_SCORE,
                forced=True)

        # first, add the projected splicesites (they overrule true sites)
        if org in projected_donors:
            for projsite in projected_donors[org]:
                # check if we can ignore this site
                if not store_all_projected_sites:
                    if projsite.pos < min_nt_pos: continue
                    if max_nt_pos and projsite.pos > max_nt_pos: continue
                # create and add this projected site!
                projNode = ( org,theorf.id,projsite.pos )
                stg.add_node_and_object(projNode,projsite)

        # add the splice sites to the graph
        for dsq in theorf._donor_sites:
            # check if we can ignore this site
            if dsq.pos < min_nt_pos: continue
            if max_nt_pos and dsq.pos > max_nt_pos: continue

            # the node that represents this site
            dsqNode = ( org,theorf.id,dsq.pos )

            # check if this splice site is not already added as a projected site
            if dsqNode not in stg.get_nodes():
                stg.add_node_and_object(dsqNode,dsq)


    # now loop over all aligned combinations of organisms
    for ( (a,b,c,d),(g1,o1),(g2,o2) ), pacbporf in self.pacbps.iteritems():
        # only proces this combination if both organisms have splice sites!
        if g1 not in stg.organism_set(): continue
        if g2 not in stg.organism_set(): continue

        # now loop over all donor sites in Query and Sbjct
        # and align them in a graph; an edge is added if 2 sites
        # are less then ``ALIGNED_DONOR_MAX_TRIPLET_DISTANCE*3`` apart from each other
        for dsq in stg.get_organism_objects(g1):
            # the node that represents this site
            dsqNode  = ( g1,o1,dsq.pos )
            dsqClass = dsq.__class__.__name__

            for dss in stg.get_organism_objects(g2):
                # the node that represents this site
                dssNode  = ( g2,o2,dss.pos )
                dssClass = dss.__class__.__name__

                if 'CodingBlockEnd' in [ dsqClass,dssClass ]:
                    if dsqClass == dssClass:
                        # both CodingBlockEnd objects
                        dist = 0
                    else:
                        # calculate the distance in aligned nt positions
                        dist = pacbporf.get_distance_aligned_nucleotide_positions(
                                query = dsq.pos, sbjct = dss.pos
                                )

                    # check for the distance constrain
                    if dist > ALIGNED_DONOR_MAX_TRIPLET_DISTANCE*3: continue

                else:
                    # Both Donor sites; check for phase compatibility
                    same_phase = dsq.phase == dss.phase
                    if not same_phase and not allow_phase_shift: continue

                    # calculate the distance in aligned nt positions
                    dist = pacbporf.get_distance_aligned_nucleotide_positions(
                            query = dsq.pos, sbjct = dss.pos
                            )

                    if not same_phase:
                        # a potential splice site phase shift; only accepted
                        # when the sites are close by and both score high enough
                        phase_shift_ok = (
                            dist <= MAX_SPLICE_SITE_PHASE_SHIFT_NT_DISTANCE and
                            min([dsq.pssm_score, dss.pssm_score ]) >= MIN_DONOR_SITE_PHASE_SHIFT_PSSM_SCORE
                            )
                        if not phase_shift_ok: continue

                    # check for the distance constrain
                    if dist > ALIGNED_DONOR_MAX_TRIPLET_DISTANCE*3: continue

                # calculate binary entropies from Query
                if dsqClass == 'SpliceDonor':
                    dsqPositionPos, phaseQ = pacbporf.dnaposition_query(dsq.pos,forced_return=True)
                    entropyQ = pacbporf.alignment_entropy(dsqPositionPos,method='donor')
                elif dsqClass == 'ProjectedSpliceDonor':
                    entropyQ = dsq.entropy
                elif dsqClass == 'CodingBlockEnd':
                    entropyQ = 1.0
                else:
                    # string exceptions are not supported; raise a real one
                    raise TypeError(
                        "NOT in [ SpliceDonor, ProjectedSpliceDonor, CodingBlockEnd ]: %s" % dsqClass
                        )

                # calculate binary entropies from Sbjct
                if dssClass == 'SpliceDonor':
                    # NOTE(review): the sbjct position is converted with
                    # dnaposition_query, exactly like the query site above;
                    # confirm whether a sbjct-specific conversion was intended
                    dssPositionPos, phaseS = pacbporf.dnaposition_query(dss.pos,forced_return=True)
                    entropyS = pacbporf.alignment_entropy(dssPositionPos,method='donor')
                elif dssClass == 'ProjectedSpliceDonor':
                    entropyS = dss.entropy
                elif dssClass == 'CodingBlockEnd':
                    entropyS = 1.0
                else:
                    raise TypeError(
                        "NOT in [ SpliceDonor, ProjectedSpliceDonor, CodingBlockEnd ]: %s" % dssClass
                        )

                # if here, then we have an aligned splice site!
                # calculate weight from distance, add edge and binary entropy values
                # (entropies are stored for both directions of the edge)
                wt = 1.0 / ( 1.0 + float(dist/3) )
                stg.add_edge(dsqNode,dssNode,wt=wt)
                stg._edge_binary_entropies[(dsqNode,dssNode)] = (entropyQ,entropyS)
                stg._edge_binary_entropies[(dssNode,dsqNode)] = (entropyS,entropyQ)

    # return filled splicesitecollection graph
    return stg
# Example #2
# 0
def ExonCollectionGraph2DonorSiteCollectionGraph(gra):
    """
    Build a DonorSiteCollectionGraph from the donors of an ExonCollectionGraph

    Every node of the ECG contributes one node ( node[0], node[1], site
    position ). A donor of class CodingBlockEnd is first converted into a
    projected donor (using the donor phase of the ECG); any other donor
    object is stored as it is. All cross combinations of the resulting nodes
    are linked with an edge of weight 1.0 and entropy 1.0.

    @attention: only in use when ECG is NOT a FinalExon ECG

    @type  gra: ExonCollectionGraph
    @param gra: graph whose node objects carry a ``donor`` attribute

    @rtype:  DonorSiteCollectionGraph
    @return: DonorSiteCollectionGraph instance to be placed in the CBG
    """
    dsc_graph = DonorSiteCollectionGraph()
    dsc_graph.ALIGNED_SITE_AA_OFFSET = 10
    dsc_graph.MIN_PSSM_SCORE = -0.0

    for ecg_node in gra.get_nodes():
        donor = gra._node_object[ecg_node].donor

        # decide which object represents this donor in the new graph
        if donor.__class__.__name__ == 'CodingBlockEnd':
            site = CodingBlockEnd2ProjectedSpliceDonor(donor,phase=gra.donor_phase())
        else:
            site = donor

        site_node = ( ecg_node[0], ecg_node[1], site.pos )
        dsc_graph.add_node_and_object(site_node,site)
        # the pssm score is always taken from the original donor object
        dsc_graph._node_pssm[site_node] = donor.pssm_score

    # connect all cross combinations of nodes
    for first_node,second_node in dsc_graph.pairwisecrosscombinations_node():
        dsc_graph.add_edge(first_node,second_node,wt=1.0,entropy=1.0)

    return dsc_graph
# Example #3
# 0
def harvest_elegiable_donor_sites(self,projected_donors={},forced_codingblock_ends={},next=None,
    store_all_projected_sites=False,
    allow_phase_shift=False,
    enlarge_5p_boundary_by=None, # in AA coordinates
    enlarge_3p_boundary_by=None, # in AA coordinates
    ALIGNED_DONOR_MAX_TRIPLET_DISTANCE=None,
    MIN_DONOR_PSSM_SCORE=None,ALLOW_NON_CANONICAL_DONOR=False,
    NON_CANONICAL_MIN_DONOR_PSSM_SCORE=None ):
    """
    Harvest elegiable donor sites from this CodingBlockGraph into a DonorSiteCollectionGraph

    Per organism the considered splice site range is determined, the Orf is
    scanned for donor sites, and projected sites plus scanned sites that fall
    inside the range are added as nodes ( organism, orf id, position ).
    After that, for every pacbporf in self.pacbps, the sites of both organisms
    are compared pairwise and an edge is added for each pair that passes the
    phase and distance checks.

    @type  projected_donors: dict
    @param projected_donors: organism -> iterable of projected site objects
            (each with a ``pos`` attribute); only read, never modified

    @type  forced_codingblock_ends: dict
    @param forced_codingblock_ends: organism -> site object (with a ``pos``
            attribute) that is added as the node for that organism; no splice
            sites are scanned for organisms listed here; only read

    @type  next: CodingBlockGraph, LowSimilarityRegionCodingBlockGraph or None
    @param next: the next graph (checked by class name); when it is a
            LowSimilarityRegionCodingBlockGraph only the forced ends are added

    @type  store_all_projected_sites: Boolean
    @param store_all_projected_sites: when True, projected sites are added
            even when they fall outside the considered splice site range

    @type  allow_phase_shift: Boolean
    @param allow_phase_shift: when True, two donor sites of different phase
            may still be linked if their aligned distance is at most
            MAX_SPLICE_SITE_PHASE_SHIFT_NT_DISTANCE and both PSSM scores are
            at least MIN_DONOR_SITE_PHASE_SHIFT_PSSM_SCORE

    @type  enlarge_5p_boundary_by: integer or None
    @param enlarge_5p_boundary_by: AA count by which the lower range boundary
            is moved upstream (times 3 in nt coordinates)

    @type  enlarge_3p_boundary_by: integer or None
    @param enlarge_3p_boundary_by: AA count by which the upper range boundary
            is moved downstream (times 3 in nt coordinates)

    @type  ALIGNED_DONOR_MAX_TRIPLET_DISTANCE: integer
    @param ALIGNED_DONOR_MAX_TRIPLET_DISTANCE: maximal distance, in triplets,
            between two aligned sites; it is multiplied by 3 in every distance
            check, so leaving it at None fails as soon as 2 sites are compared

    @param MIN_DONOR_PSSM_SCORE: handed to the Orf splice site scan and stored
            on the returned graph as MIN_PSSM_SCORE
    @param ALLOW_NON_CANONICAL_DONOR: handed to the Orf splice site scan
    @param NON_CANONICAL_MIN_DONOR_PSSM_SCORE: handed to the Orf splice site scan

    @rtype:  DonorSiteCollectionGraph
    @return: DonorSiteCollectionGraph instance filled with nodes and edges

    @raise InproperlyAppliedArgument: when next is given but is neither a
            CodingBlockGraph nor a LowSimilarityRegionCodingBlockGraph
    @raise TypeError: when a compared site object is not a SpliceDonor,
            ProjectedSpliceDonor or CodingBlockEnd
    """

    if next and next.__class__.__name__ not in ["CodingBlockGraph","LowSimilarityRegionCodingBlockGraph"]:
        message = "next must be a CodingBlock graph object, not a %s" % next.__class__.__name__
        raise InproperlyAppliedArgument(message)

    # update minimal pssm score to stg collection object
    stg = DonorSiteCollectionGraph()
    stg.MIN_PSSM_SCORE = MIN_DONOR_PSSM_SCORE
    stg.ALIGNED_SITE_AA_OFFSET = ALIGNED_DONOR_MAX_TRIPLET_DISTANCE

    # the class of `next` does not change while looping; resolve it once
    next_is_lowsimilarity = next.__class__.__name__ == "LowSimilarityRegionCodingBlockGraph"

    # First, process each individual organism.
    # (A) obtain elegiable splice site range
    # (B) scan for splice sites
    # (C) add the projected sites to the graph
    # (D) add splice sites to the stg collection graph
    for org in self.organism_set():
        # take the first (and only) orf of this organism
        theorf = self.get_orfs_of_graph(organism=org)[0]
        has_forced_end = org in forced_codingblock_ends

        if has_forced_end:
            # the node that represents this forced site
            cbgEnd = forced_codingblock_ends[org]
            cbgEndNode = ( org,theorf.id,cbgEnd.pos )
            # add to the collection graph; the considered range is still set
            # below, but no splice sites are scanned for this organism
            stg.add_node_and_object(cbgEndNode,cbgEnd)

        if next_is_lowsimilarity:
            # all `donor` boundaries are hard-set;
            # no splice_site_range or actual site prediction needed
            continue

        ########################################################################
        ### get the considered splice site range
        ########################################################################

        # calculate considered splice site range based on EOF Orf object
        # take theorf.endPY + 2 (two) !, because EOF Orf is the start of the
        # STOP codon. Example:
        # ... tca TAG tac gtc ...
        # ... tca                   EOF Orf
        #         TAG               STOP codon
        #     ..a taG Tac gt.       perfect DONOR Site; PSSM-score ~7.7
        # Only the nt coordinates are used further on; the aa coordinates
        # returned by the helper functions are ignored.
        (_min_aa_pos, min_nt_pos) = self.minimal_eligable_donor_site_position(org)
        max_nt_pos = theorf.endPY+2

        if next and org in next.organism_set():
            (_next_max_aa_pos, next_max_nt_pos) = self.maximal_eligable_donor_site_position(org,nextcbg=next)
        else:
            (_next_max_aa_pos, next_max_nt_pos) = self.maximal_eligable_donor_site_position(org)
        if next_max_nt_pos < max_nt_pos:
            # the maximal elegiable position lies before the end of the orf;
            # use it as the upper boundary of the range
            max_nt_pos = next_max_nt_pos

        # boundaries are enlarged in AA coordinates, so times 3 for nt
        if enlarge_5p_boundary_by:
            min_nt_pos = min_nt_pos - (enlarge_5p_boundary_by*3)
        if enlarge_3p_boundary_by:
            max_nt_pos = max_nt_pos + (enlarge_3p_boundary_by*3)

        # set range to stg Collection objects
        stg.set_consideredsplicesiterange(org,min_nt_pos,max_nt_pos)

        if has_forced_end:
            # ready with this organism, no splice site setting!
            continue

        ########################################################################
        ### obtain splice sites for current collection
        ########################################################################

        # scan for splice sites
        theorf.scan_orf_for_pssm_splice_sites(splicetype="donor",
                min_pssm_score=MIN_DONOR_PSSM_SCORE,allow_non_canonical=ALLOW_NON_CANONICAL_DONOR,
                non_canonical_min_pssm_score=NON_CANONICAL_MIN_DONOR_PSSM_SCORE,
                forced=True)

        # first, add the projected splicesites (they overrule true sites)
        if org in projected_donors:
            for projsite in projected_donors[org]:
                # check if we can ignore this site
                if not store_all_projected_sites:
                    if projsite.pos < min_nt_pos: continue
                    if max_nt_pos and projsite.pos > max_nt_pos: continue
                # create and add this projected site!
                projNode = ( org,theorf.id,projsite.pos )
                stg.add_node_and_object(projNode,projsite)

        # add the splice sites to the graph
        for dsq in theorf._donor_sites:
            # check if we can ignore this site
            if dsq.pos < min_nt_pos: continue
            if max_nt_pos and dsq.pos > max_nt_pos: continue

            # the node that represents this site
            dsqNode = ( org,theorf.id,dsq.pos )

            # check if this splice site is not already added as a projected site
            if dsqNode not in stg.get_nodes():
                stg.add_node_and_object(dsqNode,dsq)


    # now loop over all aligned combinations of organisms
    for ( (a,b,c,d),(g1,o1),(g2,o2) ), pacbporf in self.pacbps.iteritems():
        # only proces this combination if both organisms have splice sites!
        if g1 not in stg.organism_set(): continue
        if g2 not in stg.organism_set(): continue

        # now loop over all donor sites in Query and Sbjct
        # and align them in a graph; an edge is added if 2 sites
        # are less then ``ALIGNED_DONOR_MAX_TRIPLET_DISTANCE*3`` apart from each other
        for dsq in stg.get_organism_objects(g1):
            # the node that represents this site
            dsqNode  = ( g1,o1,dsq.pos )
            dsqClass = dsq.__class__.__name__

            for dss in stg.get_organism_objects(g2):
                # the node that represents this site
                dssNode  = ( g2,o2,dss.pos )
                dssClass = dss.__class__.__name__

                if 'CodingBlockEnd' in [ dsqClass,dssClass ]:
                    if dsqClass == dssClass:
                        # both CodingBlockEnd objects
                        dist = 0
                    else:
                        # calculate the distance in aligned nt positions
                        dist = pacbporf.get_distance_aligned_nucleotide_positions(
                                query = dsq.pos, sbjct = dss.pos
                                )

                    # check for the distance constrain
                    if dist > ALIGNED_DONOR_MAX_TRIPLET_DISTANCE*3: continue

                else:
                    # Both Donor sites; check for phase compatibility
                    same_phase = dsq.phase == dss.phase
                    if not same_phase and not allow_phase_shift: continue

                    # calculate the distance in aligned nt positions
                    dist = pacbporf.get_distance_aligned_nucleotide_positions(
                            query = dsq.pos, sbjct = dss.pos
                            )

                    if not same_phase:
                        # a potential splice site phase shift; only accepted
                        # when the sites are close by and both score high enough
                        phase_shift_ok = (
                            dist <= MAX_SPLICE_SITE_PHASE_SHIFT_NT_DISTANCE and
                            min([dsq.pssm_score, dss.pssm_score ]) >= MIN_DONOR_SITE_PHASE_SHIFT_PSSM_SCORE
                            )
                        if not phase_shift_ok: continue

                    # check for the distance constrain
                    if dist > ALIGNED_DONOR_MAX_TRIPLET_DISTANCE*3: continue

                # calculate binary entropies from Query
                if dsqClass == 'SpliceDonor':
                    dsqPositionPos, phaseQ = pacbporf.dnaposition_query(dsq.pos,forced_return=True)
                    entropyQ = pacbporf.alignment_entropy(dsqPositionPos,method='donor')
                elif dsqClass == 'ProjectedSpliceDonor':
                    entropyQ = dsq.entropy
                elif dsqClass == 'CodingBlockEnd':
                    entropyQ = 1.0
                else:
                    # string exceptions are not supported; raise a real one
                    raise TypeError(
                        "NOT in [ SpliceDonor, ProjectedSpliceDonor, CodingBlockEnd ]: %s" % dsqClass
                        )

                # calculate binary entropies from Sbjct
                if dssClass == 'SpliceDonor':
                    # NOTE(review): the sbjct position is converted with
                    # dnaposition_query, exactly like the query site above;
                    # confirm whether a sbjct-specific conversion was intended
                    dssPositionPos, phaseS = pacbporf.dnaposition_query(dss.pos,forced_return=True)
                    entropyS = pacbporf.alignment_entropy(dssPositionPos,method='donor')
                elif dssClass == 'ProjectedSpliceDonor':
                    entropyS = dss.entropy
                elif dssClass == 'CodingBlockEnd':
                    entropyS = 1.0
                else:
                    raise TypeError(
                        "NOT in [ SpliceDonor, ProjectedSpliceDonor, CodingBlockEnd ]: %s" % dssClass
                        )

                # if here, then we have an aligned splice site!
                # calculate weight from distance, add edge and binary entropy values
                # (entropies are stored for both directions of the edge)
                wt = 1.0 / ( 1.0 + float(dist/3) )
                stg.add_edge(dsqNode,dssNode,wt=wt)
                stg._edge_binary_entropies[(dsqNode,dssNode)] = (entropyQ,entropyS)
                stg._edge_binary_entropies[(dssNode,dsqNode)] = (entropyS,entropyQ)

    # return filled splicesitecollection graph
    return stg