def harvest_elegiable_acceptor_sites(self,projected_acceptors={},forced_codingblock_ends={},prev=None,
    store_all_projected_sites=False,
    allow_phase_shift=False,
    enlarge_5p_boundary_by=None, # in AA coordinates
    enlarge_3p_boundary_by=None, # in AA coordinates
    ALIGNED_ACCEPTOR_MAX_TRIPLET_DISTANCE=None,
    MIN_ACCEPTOR_PSSM_SCORE=None,ALLOW_NON_CANONICAL_ACCEPTOR=None,
    NON_CANONICAL_MIN_ACCEPTOR_PSSM_SCORE=None ):
    """
    Harvest eligible acceptor sites from this CodingBlockGraph into an
    AcceptorSiteCollectionGraph.

    Per organism, the considered acceptor site range is determined, the
    organism's first Orf is scanned for acceptor sites and the sites inside
    that range are added as nodes ``(organism, orf id, site position)``.
    Subsequently, for each entry in ``self.pacbps``, sites of both organisms
    are connected by an edge when they are aligned closely enough.

    @type  projected_acceptors: dict
    @param projected_acceptors: organism -> iterable of projected acceptor site
            objects; these are added before the scanned sites, so a scanned
            site on an identical node is not added anymore. Only read.

    @type  forced_codingblock_ends: dict
    @param forced_codingblock_ends: organism -> site object that is added as
            the only node of that organism (no sites are scanned for it).
            Only read.

    @param prev: previous CodingBlockGraph or
            LowSimilarityRegionCodingBlockGraph, or None. In case of a
            LowSimilarityRegionCodingBlockGraph only forced sites are added.

    @type  store_all_projected_sites: Boolean
    @param store_all_projected_sites: when True, projected sites outside the
            considered range are stored too

    @type  allow_phase_shift: Boolean
    @param allow_phase_shift: when True, acceptor sites of different phase may
            be connected (subject to the module-level
            MAX_SPLICE_SITE_PHASE_SHIFT_NT_DISTANCE and
            MIN_ACCEP_SITE_PHASE_SHIFT_PSSM_SCORE thresholds)

    @param enlarge_5p_boundary_by: AA positions to extend the range on 5' side
    @param enlarge_3p_boundary_by: AA positions to extend the range on 3' side

    @param ALIGNED_ACCEPTOR_MAX_TRIPLET_DISTANCE: maximal distance (in
            triplets) between 2 sites to be connected; it is multiplied by 3
            in each distance check, so it must be a number (not the default
            None) as soon as such a check is reached.

    @param MIN_ACCEPTOR_PSSM_SCORE: passed to the Orf's splice site scan and
            stored as ``MIN_PSSM_SCORE`` on the returned graph
    @param ALLOW_NON_CANONICAL_ACCEPTOR: passed to the Orf's splice site scan
    @param NON_CANONICAL_MIN_ACCEPTOR_PSSM_SCORE: passed to the Orf's splice
            site scan

    @raise InproperlyAppliedArgument: ``prev`` is of an unexpected class
    @raise TypeError: a collected site is not a SpliceAcceptor,
            ProjectedSpliceAcceptor or CodingBlockStart

    @rtype:  AcceptorSiteCollectionGraph
    @return: AcceptorSiteCollectionGraph instance filled with sites and edges
    """
    if prev and prev.__class__.__name__ not in ["CodingBlockGraph","LowSimilarityRegionCodingBlockGraph"]:
        message = "prev must be a CodingBlock graph object, not a %s" % prev.__class__.__name__
        # call form of raise; valid in both Python 2 and Python 3
        raise InproperlyAppliedArgument(message)

    def _binary_entropy(site,pacbporf):
        """ Return the binary entropy value belonging to this site """
        siteclass = site.__class__.__name__
        if siteclass == 'SpliceAcceptor':
            # NOTE(review): this mapping is used for Query as well as Sbjct
            # sites, exactly as in the previous code (which called
            # dnaposition_query for both). Confirm that this is intended.
            alignedpos, _phase = pacbporf.dnaposition_query(site.pos,forced_return=True)
            return pacbporf.alignment_entropy(alignedpos,method='acceptor')
        if siteclass == 'ProjectedSpliceAcceptor':
            return site.entropy
        if siteclass == 'CodingBlockStart':
            return 1.0
        # a string can not be raised as an exception; raise a real one
        raise TypeError("NOT a SpliceAcceptor or a ProjectedSpliceAcceptor: %s" % siteclass)

    # update minimal pssm score to stg collection object
    stg = AcceptorSiteCollectionGraph()
    stg.MIN_PSSM_SCORE = MIN_ACCEPTOR_PSSM_SCORE
    stg.ALIGNED_SITE_AA_OFFSET = ALIGNED_ACCEPTOR_MAX_TRIPLET_DISTANCE

    # when prev is a lsrCBG, all `acceptor` boundaries are hard-set
    prev_is_lsrcbg = prev.__class__.__name__ == "LowSimilarityRegionCodingBlockGraph"

    # First, process each individual organism.
    # (A) obtain eligible splice site range
    # (B) scan for splice sites
    # (C) add the projected sites to the graph
    # (D) add splice sites to the stg collection graph
    for org in self.organism_set():
        # take the first (and only) orf of this organism
        theorf = self.get_orfs_of_graph(organism=org)[0]

        has_forced_site = org in forced_codingblock_ends
        if has_forced_site:
            # add the forced site to the collection graph; the considered
            # range is still set for this organism further below
            cbgSta = forced_codingblock_ends[org]
            stg.add_node_and_object(( org,theorf.id,cbgSta.pos ),cbgSta)

        if prev_is_lsrcbg:
            # no splice_site_range or actual site prediction needed
            continue

        ########################################################################
        ### get the considered splice site range (in nt coordinates)
        ########################################################################

        _, max_nt_pos = self.maximal_eligable_acceptor_site_position(org)
        min_nt_pos = theorf.startPY-2

        if prev and org in prev.organism_set():
            _, next_min_nt_pos = self.minimal_eligable_acceptor_site_position(org,prevcbg=prev)
        else:
            _, next_min_nt_pos = self.minimal_eligable_acceptor_site_position(org)
        if next_min_nt_pos > min_nt_pos:
            # minimal range falls within the orf's start point
            min_nt_pos = next_min_nt_pos
        if enlarge_5p_boundary_by:
            min_nt_pos = min_nt_pos - (enlarge_5p_boundary_by*3)
        if enlarge_3p_boundary_by:
            max_nt_pos = max_nt_pos + (enlarge_3p_boundary_by*3)

        # set range to stg Collection objects
        stg.set_consideredsplicesiterange(org,min_nt_pos,max_nt_pos)

        if has_forced_site:
            # ready with this organism, no splice site setting!
            continue

        ########################################################################
        ### obtain splice sites for current collection
        ########################################################################

        # scan for splice sites
        theorf.scan_orf_for_pssm_splice_sites(splicetype="acceptor",
                min_pssm_score=MIN_ACCEPTOR_PSSM_SCORE,allow_non_canonical=ALLOW_NON_CANONICAL_ACCEPTOR,
                non_canonical_min_pssm_score=NON_CANONICAL_MIN_ACCEPTOR_PSSM_SCORE)

        # first, add the projected splicesites (they overrule true sites)
        if org in projected_acceptors:
            for projsite in projected_acceptors[org]:
                # check if we can ignore this site
                if not store_all_projected_sites:
                    if projsite.pos < min_nt_pos: continue
                    if max_nt_pos and projsite.pos > max_nt_pos: continue
                # create and add this projected site!
                stg.add_node_and_object(( org,theorf.id,projsite.pos ),projsite)

        # add the splice sites to the graph
        for asq in theorf._acceptor_sites:
            # check if we can ignore this site
            if asq.pos < min_nt_pos: continue
            if max_nt_pos and asq.pos > max_nt_pos: continue

            # the node that represents this site
            asqNode = ( org,theorf.id,asq.pos )

            # check if this splice site is not already added as a projected site
            if asqNode not in stg.get_nodes():
                stg.add_node_and_object(asqNode,asq)


    # now loop over all aligned combinations of organisms
    # (the first element of the pacbps key is not used here)
    for ( (a,b,c,d),(g1,o1),(g2,o2) ), pacbporf in self.pacbps.iteritems():
        # only process this combination if both organisms have splice sites!
        if g1 not in stg.organism_set(): continue
        if g2 not in stg.organism_set(): continue

        # now loop over all acceptor sites in Query and Sbjct
        # and align them in a graph; an edge is added if 2 sites
        # are at most ``ALIGNED_ACCEPTOR_MAX_TRIPLET_DISTANCE*3`` nt apart
        for asq in stg.get_organism_objects(g1):
            # the node that represents this site
            asqNode = ( g1,o1,asq.pos )
            asqClass = asq.__class__.__name__

            for ass in stg.get_organism_objects(g2):
                # the node that represents this site
                assNode = ( g2,o2,ass.pos )
                assClass = ass.__class__.__name__

                if 'CodingBlockStart' in [ asqClass,assClass ]:
                    if asqClass == assClass:
                        # both CodingBlockStart objects
                        dist = 0
                    else:
                        # calculate the distance in aligned nt positions
                        dist = pacbporf.get_distance_aligned_nucleotide_positions(
                                query = asq.pos, sbjct = ass.pos
                                )

                        # check for the distance constrain
                        if dist > ALIGNED_ACCEPTOR_MAX_TRIPLET_DISTANCE*3: continue

                else:
                    # Both Acceptor sites; check for phase compatibility
                    phases_differ = asq.phase != ass.phase
                    if phases_differ and not allow_phase_shift: continue

                    # calculate the distance in aligned nt positions
                    dist = pacbporf.get_distance_aligned_nucleotide_positions(
                            query = asq.pos, sbjct = ass.pos
                            )

                    if phases_differ:
                        # a potential splice site phase shift (only reached
                        # when allow_phase_shift is set); both sites must be
                        # close together and both must have a high pssm score
                        if not ( dist <= MAX_SPLICE_SITE_PHASE_SHIFT_NT_DISTANCE and\
                        min([ asq.pssm_score, ass.pssm_score ]) >= MIN_ACCEP_SITE_PHASE_SHIFT_PSSM_SCORE ):
                            continue

                    # check for the distance constrain (applied to sites of
                    # uniform phase and to phase shifted sites alike)
                    if dist > ALIGNED_ACCEPTOR_MAX_TRIPLET_DISTANCE*3: continue

                # calculate binary entropies from Query and Sbjct
                entropyQ = _binary_entropy(asq,pacbporf)
                entropyS = _binary_entropy(ass,pacbporf)

                # if here, then we have an aligned splice site!
                # calculate weight from distance, add edge and binary entropy values
                # NOTE(review): ``dist/3`` is left untouched; for an integer
                # dist this is floor division under Python 2 only.
                wt = 1.0 / ( 1.0 + float(dist/3) )
                stg.add_edge(asqNode,assNode,wt=wt)
                stg._edge_binary_entropies[(asqNode,assNode)] = (entropyQ,entropyS)
                stg._edge_binary_entropies[(assNode,asqNode)] = (entropyS,entropyQ)

    # return filled splicesitecollection graph
    return stg
Example #2
0
def ExonCollectionGraph2AcceptorSiteCollectionGraph(gra):
    """
    Build an AcceptorSiteCollectionGraph from the acceptors of an ECG

    Each node of the ECG yields a single node ``(node[0], node[1], position)``
    in the new graph. An acceptor of class CodingBlockStart is first converted
    into a projected splice acceptor; any other acceptor is stored as it is.
    All pairwise cross combinations of the new nodes are connected by an edge
    of weight 1.0 with binary entropies (1.0,1.0).

    @attention: only in use when ECG is NOT a FirstExon ECG

    @rtype:  AcceptorSiteCollectionGraph
    @return: AcceptorSiteCollectionGraph instance to be placed in the CBG
    """
    ascg = AcceptorSiteCollectionGraph()
    ascg.ALIGNED_SITE_AA_OFFSET = 10
    ascg.MIN_PSSM_SCORE = -0.0

    for ecgnode in gra.get_nodes():
        acceptor = gra._node_object[ecgnode].acceptor
        if acceptor.__class__.__name__ == 'CodingBlockStart':
            # store a ProjectedSpliceSite in stead of the CodingBlockStart
            phase = gra.acceptor_phase()
            site = CodingBlockStart2ProjectedSpliceAcceptor(acceptor,phase=phase)
        else:
            site = acceptor
        sitenode = ( ecgnode[0], ecgnode[1], site.pos )
        ascg.add_node_and_object(sitenode,site)
        # the pssm score is always taken from the original acceptor object
        ascg._node_pssm[sitenode] = acceptor.pssm_score

    # fully connect the graph; entropies are identical in both orientations
    for nodeQ,nodeS in ascg.pairwisecrosscombinations_node():
        ascg.add_edge(nodeQ,nodeS,wt=1.0)
        ascg._edge_binary_entropies[(nodeQ,nodeS)] = (1.0,1.0)
        ascg._edge_binary_entropies[(nodeS,nodeQ)] = (1.0,1.0)

    return ascg
Example #3
0
def harvest_elegiable_acceptor_sites(self,projected_acceptors={},forced_codingblock_ends={},prev=None,
    store_all_projected_sites=False,
    allow_phase_shift=False,
    enlarge_5p_boundary_by=None, # in AA coordinates
    enlarge_3p_boundary_by=None, # in AA coordinates
    ALIGNED_ACCEPTOR_MAX_TRIPLET_DISTANCE=None,
    MIN_ACCEPTOR_PSSM_SCORE=None,ALLOW_NON_CANONICAL_ACCEPTOR=None,
    NON_CANONICAL_MIN_ACCEPTOR_PSSM_SCORE=None ):
    """
    Harvest eligible acceptor sites from this CodingBlockGraph into an
    AcceptorSiteCollectionGraph.

    Per organism, the considered acceptor site range is determined, the
    organism's first Orf is scanned for acceptor sites and the sites inside
    that range are added as nodes ``(organism, orf id, site position)``.
    Subsequently, for each entry in ``self.pacbps``, sites of both organisms
    are connected by an edge when they are aligned closely enough.

    @type  projected_acceptors: dict
    @param projected_acceptors: organism -> iterable of projected acceptor site
            objects; these are added before the scanned sites, so a scanned
            site on an identical node is not added anymore. Only read.

    @type  forced_codingblock_ends: dict
    @param forced_codingblock_ends: organism -> site object that is added as
            the only node of that organism (no sites are scanned for it).
            Only read.

    @param prev: previous CodingBlockGraph or
            LowSimilarityRegionCodingBlockGraph, or None. In case of a
            LowSimilarityRegionCodingBlockGraph only forced sites are added.

    @type  store_all_projected_sites: Boolean
    @param store_all_projected_sites: when True, projected sites outside the
            considered range are stored too

    @type  allow_phase_shift: Boolean
    @param allow_phase_shift: when True, acceptor sites of different phase may
            be connected (subject to the module-level
            MAX_SPLICE_SITE_PHASE_SHIFT_NT_DISTANCE and
            MIN_ACCEP_SITE_PHASE_SHIFT_PSSM_SCORE thresholds)

    @param enlarge_5p_boundary_by: AA positions to extend the range on 5' side
    @param enlarge_3p_boundary_by: AA positions to extend the range on 3' side

    @param ALIGNED_ACCEPTOR_MAX_TRIPLET_DISTANCE: maximal distance (in
            triplets) between 2 sites to be connected; it is multiplied by 3
            in each distance check, so it must be a number (not the default
            None) as soon as such a check is reached.

    @param MIN_ACCEPTOR_PSSM_SCORE: passed to the Orf's splice site scan and
            stored as ``MIN_PSSM_SCORE`` on the returned graph
    @param ALLOW_NON_CANONICAL_ACCEPTOR: passed to the Orf's splice site scan
    @param NON_CANONICAL_MIN_ACCEPTOR_PSSM_SCORE: passed to the Orf's splice
            site scan

    @raise InproperlyAppliedArgument: ``prev`` is of an unexpected class
    @raise TypeError: a collected site is not a SpliceAcceptor,
            ProjectedSpliceAcceptor or CodingBlockStart

    @rtype:  AcceptorSiteCollectionGraph
    @return: AcceptorSiteCollectionGraph instance filled with sites and edges
    """
    if prev and prev.__class__.__name__ not in ["CodingBlockGraph","LowSimilarityRegionCodingBlockGraph"]:
        message = "prev must be a CodingBlock graph object, not a %s" % prev.__class__.__name__
        # call form of raise; valid in both Python 2 and Python 3
        raise InproperlyAppliedArgument(message)

    def _binary_entropy(site,pacbporf):
        """ Return the binary entropy value belonging to this site """
        siteclass = site.__class__.__name__
        if siteclass == 'SpliceAcceptor':
            # NOTE(review): this mapping is used for Query as well as Sbjct
            # sites, exactly as in the previous code (which called
            # dnaposition_query for both). Confirm that this is intended.
            alignedpos, _phase = pacbporf.dnaposition_query(site.pos,forced_return=True)
            return pacbporf.alignment_entropy(alignedpos,method='acceptor')
        if siteclass == 'ProjectedSpliceAcceptor':
            return site.entropy
        if siteclass == 'CodingBlockStart':
            return 1.0
        # a string can not be raised as an exception; raise a real one
        raise TypeError("NOT a SpliceAcceptor or a ProjectedSpliceAcceptor: %s" % siteclass)

    # update minimal pssm score to stg collection object
    stg = AcceptorSiteCollectionGraph()
    stg.MIN_PSSM_SCORE = MIN_ACCEPTOR_PSSM_SCORE
    stg.ALIGNED_SITE_AA_OFFSET = ALIGNED_ACCEPTOR_MAX_TRIPLET_DISTANCE

    # when prev is a lsrCBG, all `acceptor` boundaries are hard-set
    prev_is_lsrcbg = prev.__class__.__name__ == "LowSimilarityRegionCodingBlockGraph"

    # First, process each individual organism.
    # (A) obtain eligible splice site range
    # (B) scan for splice sites
    # (C) add the projected sites to the graph
    # (D) add splice sites to the stg collection graph
    for org in self.organism_set():
        # take the first (and only) orf of this organism
        theorf = self.get_orfs_of_graph(organism=org)[0]

        has_forced_site = org in forced_codingblock_ends
        if has_forced_site:
            # add the forced site to the collection graph; the considered
            # range is still set for this organism further below
            cbgSta = forced_codingblock_ends[org]
            stg.add_node_and_object(( org,theorf.id,cbgSta.pos ),cbgSta)

        if prev_is_lsrcbg:
            # no splice_site_range or actual site prediction needed
            continue

        ########################################################################
        ### get the considered splice site range (in nt coordinates)
        ########################################################################

        _, max_nt_pos = self.maximal_eligable_acceptor_site_position(org)
        min_nt_pos = theorf.startPY-2

        if prev and org in prev.organism_set():
            _, next_min_nt_pos = self.minimal_eligable_acceptor_site_position(org,prevcbg=prev)
        else:
            _, next_min_nt_pos = self.minimal_eligable_acceptor_site_position(org)
        if next_min_nt_pos > min_nt_pos:
            # minimal range falls within the orf's start point
            min_nt_pos = next_min_nt_pos
        if enlarge_5p_boundary_by:
            min_nt_pos = min_nt_pos - (enlarge_5p_boundary_by*3)
        if enlarge_3p_boundary_by:
            max_nt_pos = max_nt_pos + (enlarge_3p_boundary_by*3)

        # set range to stg Collection objects
        stg.set_consideredsplicesiterange(org,min_nt_pos,max_nt_pos)

        if has_forced_site:
            # ready with this organism, no splice site setting!
            continue

        ########################################################################
        ### obtain splice sites for current collection
        ########################################################################

        # scan for splice sites
        theorf.scan_orf_for_pssm_splice_sites(splicetype="acceptor",
                min_pssm_score=MIN_ACCEPTOR_PSSM_SCORE,allow_non_canonical=ALLOW_NON_CANONICAL_ACCEPTOR,
                non_canonical_min_pssm_score=NON_CANONICAL_MIN_ACCEPTOR_PSSM_SCORE)

        # first, add the projected splicesites (they overrule true sites)
        if org in projected_acceptors:
            for projsite in projected_acceptors[org]:
                # check if we can ignore this site
                if not store_all_projected_sites:
                    if projsite.pos < min_nt_pos: continue
                    if max_nt_pos and projsite.pos > max_nt_pos: continue
                # create and add this projected site!
                stg.add_node_and_object(( org,theorf.id,projsite.pos ),projsite)

        # add the splice sites to the graph
        for asq in theorf._acceptor_sites:
            # check if we can ignore this site
            if asq.pos < min_nt_pos: continue
            if max_nt_pos and asq.pos > max_nt_pos: continue

            # the node that represents this site
            asqNode = ( org,theorf.id,asq.pos )

            # check if this splice site is not already added as a projected site
            if asqNode not in stg.get_nodes():
                stg.add_node_and_object(asqNode,asq)


    # now loop over all aligned combinations of organisms
    # (the first element of the pacbps key is not used here)
    for ( (a,b,c,d),(g1,o1),(g2,o2) ), pacbporf in self.pacbps.iteritems():
        # only process this combination if both organisms have splice sites!
        if g1 not in stg.organism_set(): continue
        if g2 not in stg.organism_set(): continue

        # now loop over all acceptor sites in Query and Sbjct
        # and align them in a graph; an edge is added if 2 sites
        # are at most ``ALIGNED_ACCEPTOR_MAX_TRIPLET_DISTANCE*3`` nt apart
        for asq in stg.get_organism_objects(g1):
            # the node that represents this site
            asqNode = ( g1,o1,asq.pos )
            asqClass = asq.__class__.__name__

            for ass in stg.get_organism_objects(g2):
                # the node that represents this site
                assNode = ( g2,o2,ass.pos )
                assClass = ass.__class__.__name__

                if 'CodingBlockStart' in [ asqClass,assClass ]:
                    if asqClass == assClass:
                        # both CodingBlockStart objects
                        dist = 0
                    else:
                        # calculate the distance in aligned nt positions
                        dist = pacbporf.get_distance_aligned_nucleotide_positions(
                                query = asq.pos, sbjct = ass.pos
                                )

                        # check for the distance constrain
                        if dist > ALIGNED_ACCEPTOR_MAX_TRIPLET_DISTANCE*3: continue

                else:
                    # Both Acceptor sites; check for phase compatibility
                    phases_differ = asq.phase != ass.phase
                    if phases_differ and not allow_phase_shift: continue

                    # calculate the distance in aligned nt positions
                    dist = pacbporf.get_distance_aligned_nucleotide_positions(
                            query = asq.pos, sbjct = ass.pos
                            )

                    if phases_differ:
                        # a potential splice site phase shift (only reached
                        # when allow_phase_shift is set); both sites must be
                        # close together and both must have a high pssm score
                        if not ( dist <= MAX_SPLICE_SITE_PHASE_SHIFT_NT_DISTANCE and\
                        min([ asq.pssm_score, ass.pssm_score ]) >= MIN_ACCEP_SITE_PHASE_SHIFT_PSSM_SCORE ):
                            continue

                    # check for the distance constrain (applied to sites of
                    # uniform phase and to phase shifted sites alike)
                    if dist > ALIGNED_ACCEPTOR_MAX_TRIPLET_DISTANCE*3: continue

                # calculate binary entropies from Query and Sbjct
                entropyQ = _binary_entropy(asq,pacbporf)
                entropyS = _binary_entropy(ass,pacbporf)

                # if here, then we have an aligned splice site!
                # calculate weight from distance, add edge and binary entropy values
                # NOTE(review): ``dist/3`` is left untouched; for an integer
                # dist this is floor division under Python 2 only.
                wt = 1.0 / ( 1.0 + float(dist/3) )
                stg.add_edge(asqNode,assNode,wt=wt)
                stg._edge_binary_entropies[(asqNode,assNode)] = (entropyQ,entropyS)
                stg._edge_binary_entropies[(assNode,asqNode)] = (entropyS,entropyQ)

    # return filled splicesitecollection graph
    return stg