Example #1
0
    def create_intermediary_lsrcbgs(self,verbose=False):
        """
        Create lsrCBGs in between CBGs with identical sets of nodes

        @attention: function can be called as often as desired

        @rtype:  Boolean
        @return: True or False weather or not lsrCBGs are created
        """
        # return status boolean weather or not a lsrCBG is added
        RETURN_STATUS_LSRCBG_IS_ADDED = False

        # loop BACKWARDS over the CBGs in case of an insert
        for i in range(len(self)-1,0,-1):
            # get combinations of 2 neighbouring CBGs
            (firstCBG,secondCBG) = self.codingblockgraphs[i-1:i+1]
            # ignore if one of them IS_IGNORED, lsrCBG or SPLITTED
            if firstCBG.IS_IGNORED:  continue
            if secondCBG.IS_IGNORED: continue
            if firstCBG._short_name  == "lsrCBG": continue
            if secondCBG._short_name == "lsrCBG": continue
            if firstCBG.IS_3P_SPLITTED:  continue
            if secondCBG.IS_5P_SPLITTED: continue
            # ignore if not all mutual nodes
            if firstCBG.node_set().symmetric_difference(secondCBG.get_nodes()):
                # reset possible bogus IS_SPLITTED variable settings
                # in CBGS. Can be instantiated by deletion of CBGs
                firstCBG.IS_3P_SPLITTED = False
                secondCBG.IS_5P_SPLITTED = False
                if not firstCBG.IS_5P_SPLITTED:
                    firstCBG.IS_SPLITTED = False
                if not secondCBG.IS_3P_SPLITTED:
                    secondCBG.IS_SPLITTED = False
                continue

            # If this point is reached, firstCBG and secondCBG are CBGs with
            # exactly the same nodes
            # create intermediate lsrCBG
            lsrCBG = create_intermediate_lowsimilarity_region(firstCBG,secondCBG)

            # check if this lsrCBG got any nodes (lsromsr not added!)
            if lsrCBG.get_nodes():
                ################################################################
                if verbose:
                    print lsrCBG
                    print "potential inframe intron:",
                    print lsrCBG.potentially_contains_inframe_intron()
                ################################################################
                # update the status of CBG firstCBG and secondCBG
                firstCBG.IS_SPLITTED     = True
                firstCBG.IS_3P_SPLITTED  = True
                secondCBG.IS_SPLITTED    = True
                secondCBG.IS_5P_SPLITTED = True
                # insert the LowSimilarityRegionCodingBlockGraph
                # at the proper position
                self.codingblockgraphs.insert(i,lsrCBG)
                RETURN_STATUS_LSRCBG_IS_ADDED = True

        # return the status weather or not a lsrCBG is added
        return RETURN_STATUS_LSRCBG_IS_ADDED
Example #2
0
def cbg_cexpander_inframe_intron_search(self,
        min_total_pssm_score = MIN_TOTAL_PSSM_INFRAME_INTRON,
        min_intron_nt_length = MIN_INTRON_NT_LENGTH,
        verbose=False):
        """
        @type  self: CodingBlockGraph
        @param self: CodingBlockGraph instance

        @type  min_total_pssm_score: float
        @param min_total_pssm_score: MIN_TOTAL_PSSM_INFRAME_INTRON

        @type  min_intron_nt_length: integer
        @param min_intron_nt_length: MIN_INTRON_NT_LENGTH

        @type  verbose: Boolean
        @param verbose: print status/debugging messages to STDOUT

        @rtype:  list or False
        @return: list with new (sub)CBGs or False when not splitted
        """
        ########################################################################
        if verbose:
            stw = StopWatch(name="cexpCbgIfIntron")
            stw.start()
        ########################################################################

        # return variable; list of splitted CBGs.
        return_cbg_list = [ self ]

        # create cexpander multiplealignment blocks
        cbgMA = lib_cexpander.cexpander2multiplealignment(self._cexpander,
                verbose=verbose)

        # In freak-accident cases (one in thousends of times), cexpander produces
        # unequal amount of 1's in the binarystrings. This is theoretically impossible.
        # Problem is worked on; in the meanwhile, cexpander2multiplealignment returns
        # False in these cases. Catch this here by quiting current 
        # cbg_cexpander_inframe_intron_search() function call and return False
        TODO=True
        if not cbgMA: return False

        ########################################################################
        if verbose:
            print stw.lap()
            blockscnt = len( cbgMA[ cbgMA.keys()[0] ] )
            print self
            print "BLOCKS:", blockscnt, self._cexpander.binarystring,
            print self._cexpander.projected_on
            for org in cbgMA.keys():
                print org, "\t", 
                for blockid in range(0,blockscnt):
                    if cbgMA[org][blockid].count("1") >= 1:
                        print len(cbgMA[org][blockid]), 
                    else:
                        print cbgMA[org][blockid], 
                print ""
        ########################################################################

        # loop over the aligned cexpander blocks and check the 
        # non-uniformly aligned blocks for length variation
        blockscnt  = len( cbgMA[ cbgMA.keys()[0] ] )
        oricbgomsr = self.overall_minimal_spanning_range()

        for blockid in range(0,blockscnt):
            # obtain non-uniformly aligned AA lengths for this block
            lengths = {}
            for org in cbgMA.keys():
                lengths[org] = cbgMA[org][blockid].count("0")
            # skip the uniformly aligned blocks
            if list(Set(lengths.values())) == [0]: continue
            ####################################################################
            if verbose: print stw.lap(), "lengths:", lengths
            ####################################################################

            # obtain coordinates for this area
            lsrcoords = {}
            for org in cbgMA.keys():
                node = self.node_by_organism(org)
                coordSta = min(oricbgomsr[node])
                # make summation of length of preceeding (non)aligned blocks
                for i in range(0,blockid):
                    coordSta += cbgMA[org][i].count("1") +\
                                cbgMA[org][i].count("0")
                # end coord is start coord + length of current block
                coordEnd = coordSta + lengths[org]
                lsrcoords[org] = ( coordSta, coordEnd )

            ####################################################################
            if verbose: print stw.lap(), "lsrcoords:", lsrcoords
            ####################################################################

            # translate AA lengths to NT lengths
            for k in lengths.keys(): lengths[k] = lengths[k]*3

            # check lenght discrepancy and assign putative inframe introns
            putative_inframe_intron_orgs =\
                _length_discrepancy_to_potential_inframe_introns(lengths)

            if not putative_inframe_intron_orgs:
                # no length discrepancy that can represent an inframe intron
                continue

            # organisms/genes for which an inframe intron can be an improvement
            # data dictionary. Keys: 'max_nt_length', 'min_nt_length', 
            # 'min_donor_pos', 'max_acceptor_pos', 'min_total_pssm'
            inframe_intron_criteria = {}

            # find putative inframe introns in assigned genes/organisms
            putative_inframe_introns = {}
            for org in putative_inframe_intron_orgs:
                # assign inframe intron criteria for this organism
                inframe_intron_criteria[org] = {
                    'min_nt_length'     : min_intron_nt_length,
                    'min_total_pssm'    : min_total_pssm_score,
                    'min_donor_pos'     : (min(lsrcoords[org]) - 5) * 3,
                    'max_acceptor_pos'  : (max(lsrcoords[org]) + 5) * 3,
                    }

                # search for potential introns that can be responsible for this event
                theorf = self.get_orfs_of_graph(organism=org)[0]
                introns = pacb.connecting.merge_orfs_with_intron( theorf,theorf,
                            min_intron_nt_length=min_intron_nt_length
                            )

                ################################################################
                if verbose: print "introns:", org, len(introns), "raw"
                ################################################################

                # filter introns for all outside the OMSR, to short, to long,
                # total pssm_score etc
                introns = _filter_putative_inframe_intron_list(
                        introns,org,inframe_intron_criteria)
                putative_inframe_introns[org] = introns
                ################################################################
                if verbose: print "introns:", org, len(introns), "filtered"
                ################################################################

            # check if all putative_inframe_intron_orgs have indeed introns
            # and check if all have at least a single intron phase in common
            if 0 in [ len(ill) for ill in putative_inframe_introns.values() ]:
                # no introns in one or more organisms/genes -> continue
                continue
            if len( putative_inframe_introns )> 1:
                # do phase check in all organisms/genes
                phases = Set([0,1,2])
                for org, intronlist in putative_inframe_introns.iteritems():
                    thisphases = Set([ intron.phase for intron in intronlist ])
                    phases.intersection_update(thisphases)
                if len(phases) == 0:
                    ################################################################
                    if verbose: print "no mutual phase -> no cbgIF.is_optimal()"
                    ################################################################
                    # no mutual phase -> no cbgIF.is_optimal() possible lateron
                    continue
            else:
                pass

            # if an intron in at least a single organism is still there,
            # then split the involved pacbps in the `original` cbgL, the last
            # added CBG element in the return_cbg_list, and make a (virtual)
            # deepcopy of a novel cbgL. Both CBGs have actually the SAME pacbps!
            cbgR = self.deepcopy()
            cbgL = self.deepcopy()

            # loop over the organisms/genes with inframe introns split
            # the Pacbps of these orgs in both to-become L and R CBGs 
            inframe_intron_orgs = putative_inframe_introns.keys()
            for org in inframe_intron_orgs:
                ################################################################
                if verbose:
                    print "splitting PACBPs for org:", org
                    print "L", cbgL
                    print "R", cbgL
                ################################################################
                node = self.node_by_organism(org)
                replacementsL = {}
                replacementsR = {}
                for (key,node1,node2), pacbporf in cbgL.pacbps.iteritems():
                    if node in [node1,node2]:
                        # get the pacbp of this pacbporf and split it!
                        pacbp = pacb.conversion.pacbporf2pacbp(pacbporf)
                        org1 = self.organism_by_node(node1)
                        org2 = self.organism_by_node(node2)

                        if org1 in putative_inframe_introns.keys() and\
                        org2 in putative_inframe_introns.keys() and\
                        inframe_intron_orgs.index(org) > 0:
                            # already splitted; both orgs are inframe introns!
                            continue

                        # make split coordinates relative
                        splitL = lsrcoords[org1][0] - pacbp.query_start
                        splitR = lsrcoords[org1][1] - pacbp.query_start

                        pacbpL = pacb.splitting.split_pacb_on_coordinates(
                            pacbp,(splitL,splitL),returnside='left')
                        pacbpR = pacb.splitting.split_pacb_on_coordinates(
                            pacbp,(splitR,splitR),returnside='rigth')

                        # check if both cbgL and cbgR make sence
                        # if not -> return False!
                        if not pacbpL: return False
                        if not pacbpR: return False

                        ########################################################
                        if verbose:
                            print "#", node1, node2, lsrcoords[org1], 
                            print "L:", splitL, "R:", splitR
                            print pacbp
                            print pacbpL
                            print pacbpR
                        ########################################################

                        # pacbpL -> extented pacbporfL -> store to replacementsL
                        newpacbporfL = pacb.conversion.pacbp2pacbporf(pacbpL,
                                       pacbporf.orfQ,pacbporf.orfS)
                        newpacbporfL.extend_pacbporf_after_stops()
                        replacementsL[(key,node1,node2)] = newpacbporfL

                        # pacbpR -> extented pacbporfR -> store to replacementsR
                        newpacbporfR = pacb.conversion.pacbp2pacbporf(pacbpR,
                                       pacbporf.orfQ,pacbporf.orfS)
                        newpacbporfR.extend_pacbporf_after_stops()
                        replacementsR[(key,node1,node2)] = newpacbporfR


                # do the pacbporf replacements in both CBGs
                statusL = _update_cbg_with_pacbporf_replacements(
                            cbgL,replacementsL)
                statusR = _update_cbg_with_pacbporf_replacements(
                            cbgR,replacementsR)

                # check if both cbgL and cbgR make sence
                if not statusL or not statusR:
                    # return unchanged cbg status -> False
                    return False
                    


            # Verify the interface between cbgL and cbgR.
            # Most likely, the sites are nicely alignable.
            cbgIF = CodingBlockGraphInterface(cbgL,cbgR)
            cbgIF.force_intron_in_organisms( putative_inframe_introns.keys() )
            cbgIF.allow_intron_in_organisms( putative_inframe_introns.keys() )
            cbgIF.harvest_splice_sites()
            cbgIF.find_conserved_splice_sites()

            ####################################################################
            if verbose:
                print cbgL
                print cbgIF
                print cbgR
                cbgIF.interfaceproperties()
            ####################################################################
            # check the properties of the CBGinterface
            if cbgIF.optimalitycheck().count(True) >= 2:
                # yes; is_compatible and donor and/or acceptor is optimal
                cbgL._CBGinterface3p = cbgIF
                cbgR._CBGinterface5p = cbgIF
                cbgL.copy_5pcbginterface_from_othercbg(self)
                cbgR.copy_3pcbginterface_from_othercbg(self)
                return_cbg_list = [ cbgL, cbgR ]
                ################################################################
                if verbose: print "INFRAME INTRON CONFIRMED!!"
                ################################################################
            else:
                # no compatible interface... although intron(s) was/were found!
                # (at least) two options are now open:
                # 1. enforce the intron(s) and create cbgIF with _forced_ends
                # 2. ignore the intron(s) and create an intermediate lsrCBG

                # 1. is `tricky`. First, how sure is this inframe intron,
                # what type of criteria do we assume etc etc.
                # second, how to create a coorect cbgIF? It must be an
                # IS_SPLITTED interface, of which the boundaries might fall
                # outside the OMSR's of the CBGs.

                # 2. ignore the intron(s) and create an intermediate lsrCBG
                lsrCBG = create_intermediate_lowsimilarity_region(cbgL,cbgR)
                prepare_lsrcbg_and_cbg_for_gsg_insertion(cbgL,lsrCBG)
                prepare_lsrcbg_and_cbg_for_gsg_insertion(lsrCBG,cbgR)
                cbgL.copy_5pcbginterface_from_othercbg(self)
                cbgR.copy_3pcbginterface_from_othercbg(self)
                return_cbg_list = [ cbgL, lsrCBG, cbgR ]
                ################################################################
                if verbose:
                    print "no INFRAME INTRON -> lsrCBG"
                    print cbgL
                    print " ", lsrCBG._CBGinterface5p
                    print " ", lsrCBG
                    print " ", lsrCBG._CBGinterface3p
                    print cbgR
                    self.printmultiplealignment()
                    print cbgL
                    cbgL.printmultiplealignment()
                    print cbgR
                    cbgR.printmultiplealignment()
                ################################################################

        # EOF this function.
        # return False if this CBG remained intact, list of splits when splitted
        if len(return_cbg_list) == 1:
            return False
        else:
            return return_cbg_list
Example #3
0
    def check_lsrcbgs_for_inframe_introns(self,verbose=False):
        """
        Check the lsrCBGs in the GSG and see if these regions can better be explained by an inframe intron
        """
        INFRAME_INTRONS_PREDICTED = 0
        LSR_RECREATED             = 0
        for cbgpos in range(len(self)-1,-1,-1):
            cbg = self.codingblockgraphs[cbgpos]
            if cbg.__class__.__name__ != 'LowSimilarityRegionCodingBlockGraph':
                continue
            # do the inframe intron analyses on a lsrCBG
            inframeintrons = cbg.potentially_contains_inframe_intron(verbose=verbose)
            # aparantly it seems possible to create one or more introns in the lsrCBG
            if inframeintrons:
                # get the bordering CBGs
                prev = self.codingblockgraphs[cbgpos-1]
                next = self.codingblockgraphs[cbgpos+1]
                # make CBGInterface between prev and next;
                # reset the _IS_SPLITTED tags!
                prev._splicedonorgraph = None
                prev._CBGinterface3p   = None
                prev._forced_3p_ends   = {}
                prev.IS_3P_SPLITTED    = False
                prev.IS_SPLITTED       = prev.IS_5P_SPLITTED
                next._spliceacceptorgraph = None
                next._CBGinterface5p   = None
                next._forced_5p_ends   = {}
                next.IS_5P_SPLITTED    = False
                next.IS_SPLITTED       = next.IS_3P_SPLITTED
        
                # create an actual CBGInterface of both CBGs around the lsrCBG
                cbgIF = CodingBlockGraphInterface(prev,next)
                if verbose: print cbgIF
                # re-harvest splice sites; store ALL the intron-projected sites
                cbgIF.harvest_splice_sites(allow_phase_shift=False,store_all_projected_sites=True)
                if verbose: print cbgIF
                # now remove all non-projected splice-sites in organisms that
                # are not reported to have a potential inframe intron
                cbgIF.allow_intron_in_organisms(inframeintrons)
                cbgIF.find_conserved_splice_sites()
                if verbose:
                    print cbgIF
                    print "compatible:", cbgIF.is_compatible(), "optimal:", cbgIF.is_optimal()
                    print cbgIF._optimal_aligned_donor
                    print cbgIF._optimal_aligned_acceptor
                # yes, this is what we expect; a compatible CBGInterface!
                # this very likely represents an inframe intron!
                if cbgIF.is_compatible():
                    # remove the lsrCBG from the GSG
                    lsrCBG = self.codingblockgraphs.pop(cbgpos)
                    # set the CBGInterface object in next and prev CBG
                    prev._CBGinterface3p = cbgIF
                    next._CBGinterface5p = cbgIF
                    # increase the counter of number of inframe introns predicted
                    INFRAME_INTRONS_PREDICTED+=1
                    ############################################################
                    if verbose: print "INFRAME INTRON PREDICTED!!"
                    ############################################################

                else:
                    # nope, this does not seem like a proper inframe intron
                    # reset the CBGs and the lsrCBG objects as they were!

                    # If this point is reached, `first` and `second` are CBGs with exactly the same nodes
                    # create intermediate lsrCBG
                    prev.IS_SPLITTED    = True
                    prev.IS_3P_SPLITTED = True
                    next.IS_SPLITTED    = True
                    next.IS_5P_SPLITTED = True
                    lsrCBG = create_intermediate_lowsimilarity_region(prev,next)
                    self.codingblockgraphs[cbgpos]   = lsrCBG

                    # recreate the CBGInterfaces (I)
                    cbgIFa = CodingBlockGraphInterface(prev,lsrCBG)
                    cbgIFa.harvest_splice_sites()
                    cbgIFa.find_conserved_splice_sites()
                    # set the interface object to the CBGs in GSG
                    prev._CBGinterface3p   = cbgIFa
                    lsrCBG._CBGinterface5p = cbgIFa

                    # recreate the CBGInterfaces (II)
                    cbgIFb = CodingBlockGraphInterface(lsrCBG,next)
                    cbgIFb.harvest_splice_sites()
                    cbgIFb.find_conserved_splice_sites()
                    # set the interface object to the CBGs in GSG
                    lsrCBG._CBGinterface3p = cbgIFb
                    next._CBGinterface5p   = cbgIFb

                    ############################################################
                    if verbose: print "NO COMPATIBLE SITE!"
                    ############################################################

                    ###for org in inframeintrons:
                    ###    print org, "NO COMPATIBLE SITES FOUND!"
                    ###    print prev
                    ###    print cbgIF
                    ###    print next
                    ###    theorf = next.get_orfs_of_graph(organism = org )[0]
                    ###    print theorf
                    ###    theorf.printproteinanddna()
                    ###    for donor in theorf._donor_sites: print donor
                    ###    for acceptor in theorf._acceptor_sites: print acceptor
        
        # return number of found inframe introns
        return INFRAME_INTRONS_PREDICTED
Example #4
0
    def search_for_lowsimilarity_regions(self,aligned_intron_min_aa_length=ALIGNED_INTRON_MIN_AA_LENGTH,verbose=False):
        """
        Search CBGs in genestructure for lowsimilarity regions
        """

        ################################################################
        if verbose:
            stw = StopWatch(name='lsrCBGsearch')
            stw.start()
        ################################################################

        # Loop reversed through genestructure to make sure that once
        # a CBG is splitted, the positions of the remainder of the
        # list stay intact.
        for posinGSG in range(len(self)-1,-1,-1):
            sg = self.codingblockgraphs[posinGSG]
            # skip IGNORED, lsrCBG and CBGs that are incomplete (still await HMM completion) 
            if sg.IS_IGNORED: continue
            if sg.__class__.__name__ == 'LowSimilarityRegionCodingBlockGraph': continue
            if sg.node_count() < self.EXACT_SG_NODE_COUNT: continue

            if verbose: print stw.lap(), posinGSG, "start"

            # check for potential aligned intron
            if sg.potentially_contains_aligned_intron(window_aa_size=aligned_intron_min_aa_length):
                ########################################################
                if verbose:
                    print stw.lap(), posinGSG, "found"
                    for k,v in sg.getomsrproteinsequences().iteritems():
                        print ">%s\n%s\n" % (k,v)
                    print "ABOUT TO SPLIT:", sg
                    print sg._cexpander.binarystring,
                    print sg._cexpander.projected_on
                    sg.printmultiplealignment()
                    for k,pacbp in sg.pacbps.iteritems(): print k, pacbp
                ########################################################
                # now actually split by inframe intron
                res = sg.split_codingblock_by_inframe_intron()
                if len(res) == 1:
                    # no inframe intron found here
                    pass
                else:
                    # prepare the CBGs for insertion 
                    for pos in range(0,len(res)):
                        splittedCBG = res[pos]
                        splittedCBG.extend_pacbporfs(self.input)
                        splittedCBG.update_edge_weights_by_minimal_spanning_range()
                        splittedCBG.IS_SPLITTED = True
                        if pos > 0:
                            splittedCBG.IS_5P_SPLITTED = True
                            splittedCBG.IS_FIRST = False
                        if pos < len(res)-1:
                            splittedCBG.IS_3P_SPLITTED = True
                            splittedCBG.IS_LAST = False
                        # (re)create the cache for the splitted CBGs
                        splittedCBG.create_cache()
                        ################################################
                        if verbose:
                            print stw.lap(), posinGSG, "done!"
                            print "SUCCESFULLY SPLITTED:", splittedCBG
                            splittedCBG.printmultiplealignment()
                            print splittedCBG._cexpander.binarystring, 
                            print splittedCBG._cexpander.projected_on
                            print splittedCBG._omsr
                            for trf in splittedCBG._cexpander._transferblocks:
                                print trf.binarystring, trf.projected_on
                            for k,v in splittedCBG._cexpander.inputsequences.iteritems():
                                print v,"\t",k
                            for _org,orflist in splittedCBG.get_orfs_of_graph().iteritems():
                                print orflist[0], _org
                            for pacbp in splittedCBG.pacbps.values():
                                print pacbp
                                pacbp.print_protein(_linesize=100)
                        ################################################

                    # create lsrCBGs and cbgIFs between them by looping in reversed
                    # order over all pairs of CBGs (because lsrCBG insertion in list)
                    for pos in range(len(res)-2,-1,-1):
                        cbgL,cbgR = res[pos:pos+2]
                        lsrCBG = create_intermediate_lowsimilarity_region(cbgL,cbgR)
                        res.insert(pos+1,lsrCBG)
                        # create cbgIF between the CBGs and the lsrCBG
                        # just create -> cbgIF with lsrCBG is immediately is_optimal()
                        cbgIFa = CodingBlockGraphInterface(cbgL,lsrCBG)
                        cbgIFb = CodingBlockGraphInterface(lsrCBG,cbgR)
                        # set cbgIF objects to the CBGs and the lsrCBG
                        cbgL._CBGinterface3p   = cbgIFa
                        lsrCBG._CBGinterface5p = cbgIFa
                        lsrCBG._CBGinterface3p = cbgIFb
                        cbgR._CBGinterface5p   = cbgIFb

                    # update the first and last CBG in this list with the
                    # cbgIFs of the parental CBG (variable sg)
                    res[0]._CBGinterface5p =  sg._CBGinterface5p
                    res[-1]._CBGinterface3p = sg._CBGinterface3p
                    # update the original IS_FIRST/IS_LAST status
                    res[0].IS_FIRST = sg.IS_FIRST
                    res[-1].IS_LAST = sg.IS_LAST

                    # and set splittedCBGs to genestructure
                    # by replacing the existing CBG (variable sg) on the
                    # position posinGSG with the list op splitted CBGs
                    self.codingblockgraphs.__setslice__(posinGSG,posinGSG+1,res)

            else:
                # nope, no potential inframe intron; just append
                ###print sg.total_weight(), False
                pass
Example #5
0
    def gsg_cexpander_enlarge_lsrcbgs(self,verbose=False):
        """
        """
        lsr_coords_changed = 0

        for pos in range(1,len(self)-1):
            if self.codingblockgraphs[pos].__class__.__name__ !=\
            'LowSimilarityRegionCodingBlockGraph':
                continue
            # get previous and next CBG
            prevCBG = self.codingblockgraphs[pos-1]
            nextCBG = self.codingblockgraphs[pos+1]

            # obtain current CBG data for logging when something fails
            strreprPrevCbg = str(prevCBG)
            strreprLsrCbg  = str(self.codingblockgraphs[pos])
            strreprNextCbg = str(nextCBG)

            # deepcoy Pacbps in case cexpander omsr border gaps
            # operations mingles the CBG(s)
            bckp_prevcbg_pacbps = deepcopy(prevCBG.pacbps)
            bckp_nextcbg_pacbps = deepcopy(nextCBG.pacbps)

            try:
                # optimize the CBGs around the lsrCBG with cexpander data
                statusP = lib_cexpander.cexpander_checkCBG4omsrbordergaps(
                        prevCBG, omit5pside = True )
                statusN = lib_cexpander.cexpander_checkCBG4omsrbordergaps(
                        nextCBG, omit3pside = True )
                if statusP or statusN:
                    # if one or both CBGs changed -> new lsrCBG
                    if statusP: prevCBG.create_cache()
                    if statusN: nextCBG.create_cache()
                    newLsrCBG = create_intermediate_lowsimilarity_region(
                            prevCBG,nextCBG)
                    prepare_lsrcbg_and_cbg_for_gsg_insertion(prevCBG,newLsrCBG)
                    prepare_lsrcbg_and_cbg_for_gsg_insertion(newLsrCBG,nextCBG)
                    self.codingblockgraphs[pos] = newLsrCBG
                    ############################################################
                    if verbose:
                        print "gsg_cexpander_enlarge_lsrcbgs WAS:"
                        print strreprPrevCbg
                        print strreprLsrCbg
                        print strreprNextCbg
                        print "gsg_cexpander_enlarge_lsrcbgs IS:"
                        print prevCBG
                        print newLsrCBG
                        print nextCBG
                    ############################################################
                    lsr_coords_changed += 1

            except NoOverallMinimalSpanningRange:
                # NoOverallMinimalSpanningRange Exception;
                # that is - normally - the signal for deleting this CBG.
                # However, here it is a SEVERE problem. The CBG is 'lost' due to
                # the cexpander optimization. This will result in a later crash
                ########################################################################
                if verbose:
                    print "SeriousWarning: CBG lost due to gsg_cexpander_enlarge_lsrcbgs"
                    print "NoOverallMinimalSpanningRange"
                    print strreprPrevCbg
                    print strreprLsrCbg
                    print strreprNextCbg
                ########################################################################
                # Restore CBGs and lsrCBG in state as before this operation
                prevCBG.pacbps = bckp_prevcbg_pacbps
                prevCBG.create_cache()
                nextCBG.pacbps = bckp_nextcbg_pacbps
                nextCBG.create_cache()
                restoredLsrCBG = create_intermediate_lowsimilarity_region(
                            prevCBG,nextCBG)
                prepare_lsrcbg_and_cbg_for_gsg_insertion(prevCBG,restoredLsrCBG)
                prepare_lsrcbg_and_cbg_for_gsg_insertion(restoredLsrCBG,nextCBG)
                self.codingblockgraphs[pos] = restoredLsrCBG

            except lib_cexpander.ZeroUniformlyAlignedPositions:
                # due to optimization, the multiple alignment collapsed
                # that is - normally - the signal for deleting this CBG.
                # However, here it is a SEVERE problem. The CBG is 'lost' due to
                # the cexpander optimization. This will result in a later crash
                ########################################################################
                if verbose:
                    print "SeriousWarning: CBG lost due to gsg_cexpander_enlarge_lsrcbgs"
                    print "lib_cexpander.ZeroUniformlyAlignedPositions"
                    print strreprPrevCbg
                    print strreprLsrCbg
                    print strreprNextCbg
                ########################################################################
                # Restore CBGs and lsrCBG in state as before this operation
                prevCBG.pacbps = bckp_prevcbg_pacbps
                prevCBG.create_cache()
                nextCBG.pacbps = bckp_nextcbg_pacbps
                nextCBG.create_cache()
                restoredLsrCBG = create_intermediate_lowsimilarity_region(
                            prevCBG,nextCBG)
                prepare_lsrcbg_and_cbg_for_gsg_insertion(prevCBG,restoredLsrCBG)
                prepare_lsrcbg_and_cbg_for_gsg_insertion(restoredLsrCBG,nextCBG)
                self.codingblockgraphs[pos] = restoredLsrCBG

            except:
                # unexpected exception -> raise!
                raise "UnExpectedException in checkCBGs4omsrbordergaps"

        # return the counter how much lsrCBGs are changed
        return lsr_coords_changed
Example #6
0
def _place_cbg_in_partialgsg(cbglist,
                             partialGSG,
                             optimizetinyexoninterface=True,
                             omit_conditional_addition=False,
                             verbose=False):
    """
    @type  cbglist: [] 
    @param cbglist: list with CodingBlockGraphs that might be palced in the (partial)GSG

    @type  partialGSG: GeneStructureOfCodingBlockGraphs
    @param partialGSG: partial GeneStructureOfCodingBlockGraphs in which the CBGs are tried to be inserted into

    @rtype:  Boolean 
    @return: are any CBGs from cbglist placed into partialGSG?
    """
    # import function here to prevent circular import
    # TODO: make correct import!
    from genestructure_intermediatecbg import intermediateCBG_node_comparison

    placed_in_partialGSG = []
    curgsglen = len(partialGSG)
    while cbglist:
        for i in range(0, len(cbglist)):
            cbg = cbglist[i]
            # placeability check in the GSG with function's settings
            # for topological check or not
            placeability = partialGSG.add_codingblock(
                cbg,
                only_try_adding=True,
                omit_conditional_addition=omit_conditional_addition)
            ############################################################
            if verbose: print i, cbg, placeability
            ############################################################
            if not placeability: continue

            # place in the partialGSG and find the inserted position
            added = partialGSG.add_codingblock(
                cbg, omit_conditional_addition=omit_conditional_addition)
            cbgposingsg = _cbg_position_in_gsg(cbg, partialGSG)

            # do intermediateCBG_node_comparison() in the insert position
            if cbgposingsg > 0 and cbgposingsg < len(partialGSG) - 1:
                prevCBG = partialGSG.codingblockgraphs[cbgposingsg - 1]
                nextCBG = partialGSG.codingblockgraphs[cbgposingsg + 1]
                if False == intermediateCBG_node_comparison(
                        prevCBG, cbg, nextCBG):
                    # erroneou CBG insert -> continue
                    continue

            # replace proper pacbporfs from the parents
            if cbgposingsg > 0:
                prevCBG = partialGSG.codingblockgraphs[cbgposingsg - 1]
                replacements1 = partialGSG.codingblockgraphs[
                    cbgposingsg]._recrute_pacbporfs_from_parental_cbg(
                        prevCBG, verbose=verbose)
            if cbgposingsg < len(partialGSG) - 1:
                nextCBG = partialGSG.codingblockgraphs[cbgposingsg + 1]
                replacements2 = partialGSG.codingblockgraphs[
                    cbgposingsg]._recrute_pacbporfs_from_parental_cbg(
                        nextCBG, verbose=verbose)

            # create cbgIFs
            created = partialGSG.create_cbginterfaces()

            # check if one of the direct neighbouring CBGs has
            # the same set of nodes -> signal for a lsrCBG
            cbginterface_isa_lsrcbg = False
            cbginterface_isa_lsrcbg_asses_cbgIFa = None
            cbginterface_isa_lsrcbg_asses_cbgIFb = None
            lsrCBG = None
            if cbgposingsg > 0 and len(cbg.node_set().intersection(
                    partialGSG.codingblockgraphs[
                        cbgposingsg - 1].node_set())) == cbg.node_count():
                # left/5p of cbg is a CBG in the partialGSG with identical node set
                cbginterface_isa_lsrcbg = True
                cbginterface_isa_lsrcbg_asses_cbgIFa = False
                cbginterface_isa_lsrcbg_asses_cbgIFb = True
                lsrCBG = codingblock_splitting.create_intermediate_lowsimilarity_region(
                    partialGSG.codingblockgraphs[cbgposingsg - 1], cbg)
            if cbgposingsg < len(partialGSG) - 1 and len(
                    cbg.node_set().intersection(partialGSG.codingblockgraphs[
                        cbgposingsg + 1].node_set())) == cbg.node_count():
                # right/3p of cbg is a CBG in the partialGSG with identical node set
                cbginterface_isa_lsrcbg = True
                cbginterface_isa_lsrcbg_asses_cbgIFa = True
                cbginterface_isa_lsrcbg_asses_cbgIFb = False
                lsrCBG = codingblock_splitting.create_intermediate_lowsimilarity_region(
                    cbg, partialGSG.codingblockgraphs[cbgposingsg + 1])

            # assess the created CBGinterfaces
            cbgIFa = partialGSG.codingblockgraphs[cbgposingsg]._CBGinterface5p
            cbgIFb = partialGSG.codingblockgraphs[cbgposingsg]._CBGinterface3p
            if cbgIFa:
                if optimizetinyexoninterface:
                    cbgIFa.optimizetinyexoninterface()
                cbgIFaCheck = cbgIFa.optimalitycheck()
            else:
                cbgIFaCheck = [None, None, None]
            if cbgIFb:
                if optimizetinyexoninterface:
                    cbgIFb.optimizetinyexoninterface()
                cbgIFbCheck = cbgIFb.optimalitycheck()
            else:
                cbgIFbCheck = [None, None, None]

            # check if this freshly placed CBG makes sense to place in the partialGSG
            if cbginterface_isa_lsrcbg and lsrCBG:
                is_lsrcbg_addable = True
                if cbgIFa and cbginterface_isa_lsrcbg_asses_cbgIFa and\
                cbgIFaCheck.count(True) < 2:
                    is_lsrcbg_addable = False
                if cbgIFb and cbginterface_isa_lsrcbg_asses_cbgIFb and\
                cbgIFbCheck.count(True) < 2:
                    is_lsrcbg_addable = False
                if is_lsrcbg_addable:
                    # addable in the partialGSG; add the lsrCBG too
                    added = partialGSG.add_codingblock(
                        lsrCBG, omit_conditional_addition=True)
                    # create cbgInterfaces for the novel added lsrCBG
                    partialGSG.create_cbginterfaces()
                    placed_in_partialGSG.append(i)
                    # continue trying adding the next CBG
                    continue
                else:
                    # nope, not addable; pass here and solve
                    # removal of this (lsr)CBG lateron
                    pass

            elif cbginterface_isa_lsrcbg and not lsrCBG:
                is_lsrcbg_addable = True
                if cbgIFa and cbginterface_isa_lsrcbg_asses_cbgIFa and\
                cbgIFaCheck.count(True) < 2:
                    is_lsrcbg_addable = False
                if cbgIFb and cbginterface_isa_lsrcbg_asses_cbgIFb and\
                cbgIFbCheck.count(True) < 2:
                    is_lsrcbg_addable = False
                if is_lsrcbg_addable:
                    # addable in the partialGSG as a tight fit to existing CBGs
                    # without a lsrCBG. Weird & rare case but can happen!
                    # re-create cbgInterface for the novel added CBG because
                    # it is not recognized asa splitted interface yet
                    partialGSG.codingblockgraphs[
                        cbgposingsg]._CBGinterface5p = None
                    partialGSG.codingblockgraphs[
                        cbgposingsg]._CBGinterface3p = None
                    if cbgposingsg > 0:
                        partialGSG.codingblockgraphs[cbgposingsg -
                                                     1]._CBGinterface3p = None
                    if cbgposingsg < len(partialGSG) - 1:
                        partialGSG.codingblockgraphs[cbgposingsg +
                                                     1]._CBGinterface5p = None
                    # now recreate cbgInterfaces
                    partialGSG.create_cbginterfaces()
                    placed_in_partialGSG.append(i)
                    # continue trying adding the next CBG
                    continue
                else:
                    # nope, not addable; pass here and solve
                    # removal of this (lsr)CBG lateron
                    pass

            elif cbgIFa and cbgIFb:
                if cbgIFaCheck.count(True) >= 2 or cbgIFbCheck.count(
                        True) >= 2:
                    ############################################################
                    if verbose:
                        print "PLACEDab\n", cbgIFa, "\n", cbg, "\n", cbgIFb
                    ############################################################
                    # succesfully placed; leave in place
                    placed_in_partialGSG.append(i)
                    # continue trying adding the next CBG
                    continue

            elif cbgIFa:
                if cbgIFaCheck.count(True) >= 2:
                    ############################################################
                    if verbose: print "PLACEDa\n", cbgIFa, "\n", cbg
                    ############################################################
                    # succesfully placed; leave in place
                    placed_in_partialGSG.append(i)
                    # continue trying adding the next CBG
                    continue

            elif cbgIFb:
                if cbgIFbCheck.count(True) >= 2:
                    ############################################################
                    if verbose: print "PLACEDb\n", cbg, "\n", cbgIFb
                    ############################################################
                    # succesfully placed; leave in place
                    placed_in_partialGSG.append(i)
                    # continue trying adding the next CBG
                    continue

            else:
                # what else!?
                raise "No cbgIFs at all in partialGSG %s for cbg %s" % (
                    partialGSG, cbg)

            ############################################################
            if verbose:
                print i, "NOTPLACABLE!",
                print cbg
                print "cbgIFa:", cbgIFa
                print "cbgIFb:", cbgIFb
            ############################################################

            # Remove the falsely placed CBG and recreate original cbgIFs
            partialGSG.codingblockgraphs.pop(cbgposingsg)
            created = partialGSG.create_cbginterfaces()
            # done with trying to add this CBGS. Do next...

        if placed_in_partialGSG:
            # remove the CBGs that are placed in the partialGSG
            _remove_placed_cbgs_from_list(placed_in_partialGSG, cbglist)
            # reset placed_in_partialGSG to empty list
            placed_in_partialGSG = []
        else:
            # no cbgs placed in the GSG -> break the while loop
            break

    # check if a CBG was added
    if len(partialGSG) > curgsglen:
        return True
    else:
        return False
def _place_cbg_in_partialgsg(cbglist,partialGSG,
    optimizetinyexoninterface=True,
    omit_conditional_addition=False,
    verbose=False):
    """
    @type  cbglist: [] 
    @param cbglist: list with CodingBlockGraphs that might be palced in the (partial)GSG

    @type  partialGSG: GeneStructureOfCodingBlockGraphs
    @param partialGSG: partial GeneStructureOfCodingBlockGraphs in which the CBGs are tried to be inserted into

    @rtype:  Boolean 
    @return: are any CBGs from cbglist placed into partialGSG?
    """
    # import function here to prevent circular import
    # TODO: make correct import!
    from genestructure_intermediatecbg import intermediateCBG_node_comparison

    placed_in_partialGSG = []
    curgsglen = len(partialGSG)
    while cbglist:
        for i in range(0,len(cbglist)):
            cbg = cbglist[i]
            # placeability check in the GSG with function's settings
            # for topological check or not
            placeability = partialGSG.add_codingblock(cbg,
                    only_try_adding=True,
                    omit_conditional_addition=omit_conditional_addition
                    )
            ############################################################
            if verbose: print i, cbg, placeability
            ############################################################
            if not placeability: continue

            # place in the partialGSG and find the inserted position
            added = partialGSG.add_codingblock(cbg,omit_conditional_addition=omit_conditional_addition)
            cbgposingsg = _cbg_position_in_gsg(cbg,partialGSG)

            # do intermediateCBG_node_comparison() in the insert position
            if cbgposingsg > 0 and cbgposingsg < len(partialGSG)-1:
                prevCBG = partialGSG.codingblockgraphs[cbgposingsg-1]
                nextCBG = partialGSG.codingblockgraphs[cbgposingsg+1]
                if False == intermediateCBG_node_comparison(prevCBG,cbg,nextCBG):
                    # erroneou CBG insert -> continue
                    continue

            # replace proper pacbporfs from the parents
            if cbgposingsg > 0:
                prevCBG = partialGSG.codingblockgraphs[cbgposingsg-1]
                replacements1 = partialGSG.codingblockgraphs[cbgposingsg]._recrute_pacbporfs_from_parental_cbg(prevCBG,verbose=verbose)
            if cbgposingsg < len(partialGSG)-1:
                nextCBG = partialGSG.codingblockgraphs[cbgposingsg+1]
                replacements2 = partialGSG.codingblockgraphs[cbgposingsg]._recrute_pacbporfs_from_parental_cbg(nextCBG,verbose=verbose)

            # create cbgIFs
            created = partialGSG.create_cbginterfaces()

            # check if one of the direct neighbouring CBGs has
            # the same set of nodes -> signal for a lsrCBG
            cbginterface_isa_lsrcbg = False
            cbginterface_isa_lsrcbg_asses_cbgIFa = None 
            cbginterface_isa_lsrcbg_asses_cbgIFb = None
            lsrCBG = None
            if cbgposingsg > 0 and len(cbg.node_set().intersection(
            partialGSG.codingblockgraphs[cbgposingsg-1].node_set())) == cbg.node_count():
                # left/5p of cbg is a CBG in the partialGSG with identical node set
                cbginterface_isa_lsrcbg = True
                cbginterface_isa_lsrcbg_asses_cbgIFa = False
                cbginterface_isa_lsrcbg_asses_cbgIFb = True 
                lsrCBG = codingblock_splitting.create_intermediate_lowsimilarity_region(
                        partialGSG.codingblockgraphs[cbgposingsg-1], cbg )
            if cbgposingsg < len(partialGSG)-1 and len(cbg.node_set().intersection(
            partialGSG.codingblockgraphs[cbgposingsg+1].node_set())) == cbg.node_count():
                # right/3p of cbg is a CBG in the partialGSG with identical node set
                cbginterface_isa_lsrcbg = True
                cbginterface_isa_lsrcbg_asses_cbgIFa = True 
                cbginterface_isa_lsrcbg_asses_cbgIFb = False 
                lsrCBG = codingblock_splitting.create_intermediate_lowsimilarity_region(
                        cbg, partialGSG.codingblockgraphs[cbgposingsg+1] )

            # assess the created CBGinterfaces
            cbgIFa = partialGSG.codingblockgraphs[cbgposingsg]._CBGinterface5p
            cbgIFb = partialGSG.codingblockgraphs[cbgposingsg]._CBGinterface3p
            if cbgIFa:
                if optimizetinyexoninterface: cbgIFa.optimizetinyexoninterface()
                cbgIFaCheck = cbgIFa.optimalitycheck()
            else:
                cbgIFaCheck = [ None, None, None ]
            if cbgIFb:
                if optimizetinyexoninterface: cbgIFb.optimizetinyexoninterface()
                cbgIFbCheck = cbgIFb.optimalitycheck()
            else:
                cbgIFbCheck = [ None, None, None ]


            # check if this freshly placed CBG makes sense to place in the partialGSG
            if cbginterface_isa_lsrcbg and lsrCBG:
                is_lsrcbg_addable = True
                if cbgIFa and cbginterface_isa_lsrcbg_asses_cbgIFa and\
                cbgIFaCheck.count(True) < 2:
                    is_lsrcbg_addable = False
                if cbgIFb and cbginterface_isa_lsrcbg_asses_cbgIFb and\
                cbgIFbCheck.count(True) < 2:
                    is_lsrcbg_addable = False
                if is_lsrcbg_addable:
                    # addable in the partialGSG; add the lsrCBG too
                    added = partialGSG.add_codingblock(lsrCBG,omit_conditional_addition=True)
                    # create cbgInterfaces for the novel added lsrCBG
                    partialGSG.create_cbginterfaces()
                    placed_in_partialGSG.append(i)
                    # continue trying adding the next CBG
                    continue
                else:
                    # nope, not addable; pass here and solve
                    # removal of this (lsr)CBG lateron
                    pass

            elif cbginterface_isa_lsrcbg and not lsrCBG:
                is_lsrcbg_addable = True
                if cbgIFa and cbginterface_isa_lsrcbg_asses_cbgIFa and\
                cbgIFaCheck.count(True) < 2:
                    is_lsrcbg_addable = False
                if cbgIFb and cbginterface_isa_lsrcbg_asses_cbgIFb and\
                cbgIFbCheck.count(True) < 2:
                    is_lsrcbg_addable = False
                if is_lsrcbg_addable:
                    # addable in the partialGSG as a tight fit to existing CBGs
                    # without a lsrCBG. Weird & rare case but can happen!
                    # re-create cbgInterface for the novel added CBG because
                    # it is not recognized asa splitted interface yet
                    partialGSG.codingblockgraphs[cbgposingsg]._CBGinterface5p = None
                    partialGSG.codingblockgraphs[cbgposingsg]._CBGinterface3p = None
                    if cbgposingsg > 0:
                        partialGSG.codingblockgraphs[cbgposingsg-1]._CBGinterface3p = None
                    if cbgposingsg < len(partialGSG)-1: 
                        partialGSG.codingblockgraphs[cbgposingsg+1]._CBGinterface5p = None
                    # now recreate cbgInterfaces
                    partialGSG.create_cbginterfaces()
                    placed_in_partialGSG.append(i)
                    # continue trying adding the next CBG
                    continue
                else:
                    # nope, not addable; pass here and solve
                    # removal of this (lsr)CBG lateron
                    pass

            elif cbgIFa and cbgIFb:
                if cbgIFaCheck.count(True) >= 2 or cbgIFbCheck.count(True) >= 2:
                    ############################################################
                    if verbose: print "PLACEDab\n", cbgIFa, "\n", cbg, "\n", cbgIFb
                    ############################################################
                    # succesfully placed; leave in place
                    placed_in_partialGSG.append(i)
                    # continue trying adding the next CBG
                    continue

            elif cbgIFa:
                if cbgIFaCheck.count(True) >= 2:
                    ############################################################
                    if verbose: print "PLACEDa\n", cbgIFa, "\n", cbg
                    ############################################################
                    # succesfully placed; leave in place
                    placed_in_partialGSG.append(i)
                    # continue trying adding the next CBG
                    continue

            elif cbgIFb:
                if cbgIFbCheck.count(True) >= 2:
                    ############################################################
                    if verbose: print "PLACEDb\n", cbg, "\n", cbgIFb
                    ############################################################
                    # succesfully placed; leave in place
                    placed_in_partialGSG.append(i)
                    # continue trying adding the next CBG
                    continue

            else:
                # what else!?
                raise "No cbgIFs at all in partialGSG %s for cbg %s" % ( partialGSG, cbg )


            ############################################################
            if verbose:
                print i, "NOTPLACABLE!",
                print cbg
                print "cbgIFa:", cbgIFa
                print "cbgIFb:", cbgIFb
            ############################################################

            # Remove the falsely placed CBG and recreate original cbgIFs
            partialGSG.codingblockgraphs.pop(cbgposingsg)
            created = partialGSG.create_cbginterfaces()
            # done with trying to add this CBGS. Do next...

        if placed_in_partialGSG:
            # remove the CBGs that are placed in the partialGSG
            _remove_placed_cbgs_from_list(placed_in_partialGSG,cbglist)
            # reset placed_in_partialGSG to empty list
            placed_in_partialGSG = []
        else:
            # no cbgs placed in the GSG -> break the while loop
            break

    # check if a CBG was added 
    if len(partialGSG) > curgsglen:
        return True
    else:
        return False