def create_intermediary_lsrcbgs(self, verbose=False):
    """
    Insert lsrCBGs in between neighbouring CBGs that have identical node sets

    All pairs of adjacent CBGs are visited, the last pair first, so that an
    insert never shifts a pair that still has to be visited. Pairs in which
    one of the CBGs is ignored, is a lsrCBG itself, or is already flagged as
    splitted on the side that faces the other CBG are left untouched.

    @attention: function can be called as often as desired

    @type  verbose: Boolean
    @param verbose: print each inserted lsrCBG to STDOUT

    @rtype:  Boolean
    @return: True when at least one lsrCBG was inserted, otherwise False
    """
    lsrcbg_inserted = False

    # positions len-1 .. 1; position `pos` is the 3' member of the pair
    for pos in reversed(range(1, len(self))):
        (cbg5p, cbg3p) = self.codingblockgraphs[pos - 1:pos + 1]

        # pairs that are not eligible for a lsrCBG in between
        if (cbg5p.IS_IGNORED or cbg3p.IS_IGNORED or
                cbg5p._short_name == "lsrCBG" or
                cbg3p._short_name == "lsrCBG" or
                cbg5p.IS_3P_SPLITTED or cbg3p.IS_5P_SPLITTED):
            continue

        if cbg5p.node_set().symmetric_difference(cbg3p.get_nodes()):
            # Node sets differ -> no lsrCBG possible. Clean up IS_SPLITTED
            # settings that may be left over from a deletion of CBGs.
            # NOTE(review): the guard above already skipped every pair in
            # which one of the next two flags is set, so these assignments
            # only normalise values that are false already; a left-over
            # True is never cleared here. Confirm whether that is intended.
            cbg5p.IS_3P_SPLITTED = False
            cbg3p.IS_5P_SPLITTED = False
            if not cbg5p.IS_5P_SPLITTED:
                cbg5p.IS_SPLITTED = False
            if not cbg3p.IS_3P_SPLITTED:
                cbg3p.IS_SPLITTED = False
            continue

        # both CBGs have exactly the same nodes: build the region in between
        lsrCBG = create_intermediate_lowsimilarity_region(cbg5p, cbg3p)
        if not lsrCBG.get_nodes():
            # an empty lsrCBG is not added
            continue

        ################################################################
        if verbose:
            print(lsrCBG)
            print("%s %s" % ("potential inframe intron:",
                lsrCBG.potentially_contains_inframe_intron()))
        ################################################################

        # flag both CBGs as splitted on the side facing the lsrCBG
        cbg5p.IS_SPLITTED = True
        cbg5p.IS_3P_SPLITTED = True
        cbg3p.IS_SPLITTED = True
        cbg3p.IS_5P_SPLITTED = True

        # place the lsrCBG in between both CBGs
        self.codingblockgraphs.insert(pos, lsrCBG)
        lsrcbg_inserted = True

    return lsrcbg_inserted
def cbg_cexpander_inframe_intron_search(self, min_total_pssm_score = MIN_TOTAL_PSSM_INFRAME_INTRON, min_intron_nt_length = MIN_INTRON_NT_LENGTH, verbose=False): """ @type self: CodingBlockGraph @param self: CodingBlockGraph instance @type min_total_pssm_score: float @param min_total_pssm_score: MIN_TOTAL_PSSM_INFRAME_INTRON @type min_intron_nt_length: integer @param min_intron_nt_length: MIN_INTRON_NT_LENGTH @type verbose: Boolean @param verbose: print status/debugging messages to STDOUT @rtype: list or False @return: list with new (sub)CBGs or False when not splitted """ ######################################################################## if verbose: stw = StopWatch(name="cexpCbgIfIntron") stw.start() ######################################################################## # return variable; list of splitted CBGs. return_cbg_list = [ self ] # create cexpander multiplealignment blocks cbgMA = lib_cexpander.cexpander2multiplealignment(self._cexpander, verbose=verbose) # In freak-accident cases (one in thousends of times), cexpander produces # unequal amount of 1's in the binarystrings. This is theoretically impossible. # Problem is worked on; in the meanwhile, cexpander2multiplealignment returns # False in these cases. 
Catch this here by quiting current # cbg_cexpander_inframe_intron_search() function call and return False TODO=True if not cbgMA: return False ######################################################################## if verbose: print stw.lap() blockscnt = len( cbgMA[ cbgMA.keys()[0] ] ) print self print "BLOCKS:", blockscnt, self._cexpander.binarystring, print self._cexpander.projected_on for org in cbgMA.keys(): print org, "\t", for blockid in range(0,blockscnt): if cbgMA[org][blockid].count("1") >= 1: print len(cbgMA[org][blockid]), else: print cbgMA[org][blockid], print "" ######################################################################## # loop over the aligned cexpander blocks and check the # non-uniformly aligned blocks for length variation blockscnt = len( cbgMA[ cbgMA.keys()[0] ] ) oricbgomsr = self.overall_minimal_spanning_range() for blockid in range(0,blockscnt): # obtain non-uniformly aligned AA lengths for this block lengths = {} for org in cbgMA.keys(): lengths[org] = cbgMA[org][blockid].count("0") # skip the uniformly aligned blocks if list(Set(lengths.values())) == [0]: continue #################################################################### if verbose: print stw.lap(), "lengths:", lengths #################################################################### # obtain coordinates for this area lsrcoords = {} for org in cbgMA.keys(): node = self.node_by_organism(org) coordSta = min(oricbgomsr[node]) # make summation of length of preceeding (non)aligned blocks for i in range(0,blockid): coordSta += cbgMA[org][i].count("1") +\ cbgMA[org][i].count("0") # end coord is start coord + length of current block coordEnd = coordSta + lengths[org] lsrcoords[org] = ( coordSta, coordEnd ) #################################################################### if verbose: print stw.lap(), "lsrcoords:", lsrcoords #################################################################### # translate AA lengths to NT lengths for k in lengths.keys(): lengths[k] = 
lengths[k]*3 # check lenght discrepancy and assign putative inframe introns putative_inframe_intron_orgs =\ _length_discrepancy_to_potential_inframe_introns(lengths) if not putative_inframe_intron_orgs: # no length discrepancy that can represent an inframe intron continue # organisms/genes for which an inframe intron can be an improvement # data dictionary. Keys: 'max_nt_length', 'min_nt_length', # 'min_donor_pos', 'max_acceptor_pos', 'min_total_pssm' inframe_intron_criteria = {} # find putative inframe introns in assigned genes/organisms putative_inframe_introns = {} for org in putative_inframe_intron_orgs: # assign inframe intron criteria for this organism inframe_intron_criteria[org] = { 'min_nt_length' : min_intron_nt_length, 'min_total_pssm' : min_total_pssm_score, 'min_donor_pos' : (min(lsrcoords[org]) - 5) * 3, 'max_acceptor_pos' : (max(lsrcoords[org]) + 5) * 3, } # search for potential introns that can be responsible for this event theorf = self.get_orfs_of_graph(organism=org)[0] introns = pacb.connecting.merge_orfs_with_intron( theorf,theorf, min_intron_nt_length=min_intron_nt_length ) ################################################################ if verbose: print "introns:", org, len(introns), "raw" ################################################################ # filter introns for all outside the OMSR, to short, to long, # total pssm_score etc introns = _filter_putative_inframe_intron_list( introns,org,inframe_intron_criteria) putative_inframe_introns[org] = introns ################################################################ if verbose: print "introns:", org, len(introns), "filtered" ################################################################ # check if all putative_inframe_intron_orgs have indeed introns # and check if all have at least a single intron phase in common if 0 in [ len(ill) for ill in putative_inframe_introns.values() ]: # no introns in one or more organisms/genes -> continue continue if len( putative_inframe_introns )> 1: # 
do phase check in all organisms/genes phases = Set([0,1,2]) for org, intronlist in putative_inframe_introns.iteritems(): thisphases = Set([ intron.phase for intron in intronlist ]) phases.intersection_update(thisphases) if len(phases) == 0: ################################################################ if verbose: print "no mutual phase -> no cbgIF.is_optimal()" ################################################################ # no mutual phase -> no cbgIF.is_optimal() possible lateron continue else: pass # if an intron in at least a single organism is still there, # then split the involved pacbps in the `original` cbgL, the last # added CBG element in the return_cbg_list, and make a (virtual) # deepcopy of a novel cbgL. Both CBGs have actually the SAME pacbps! cbgR = self.deepcopy() cbgL = self.deepcopy() # loop over the organisms/genes with inframe introns split # the Pacbps of these orgs in both to-become L and R CBGs inframe_intron_orgs = putative_inframe_introns.keys() for org in inframe_intron_orgs: ################################################################ if verbose: print "splitting PACBPs for org:", org print "L", cbgL print "R", cbgL ################################################################ node = self.node_by_organism(org) replacementsL = {} replacementsR = {} for (key,node1,node2), pacbporf in cbgL.pacbps.iteritems(): if node in [node1,node2]: # get the pacbp of this pacbporf and split it! pacbp = pacb.conversion.pacbporf2pacbp(pacbporf) org1 = self.organism_by_node(node1) org2 = self.organism_by_node(node2) if org1 in putative_inframe_introns.keys() and\ org2 in putative_inframe_introns.keys() and\ inframe_intron_orgs.index(org) > 0: # already splitted; both orgs are inframe introns! 
continue # make split coordinates relative splitL = lsrcoords[org1][0] - pacbp.query_start splitR = lsrcoords[org1][1] - pacbp.query_start pacbpL = pacb.splitting.split_pacb_on_coordinates( pacbp,(splitL,splitL),returnside='left') pacbpR = pacb.splitting.split_pacb_on_coordinates( pacbp,(splitR,splitR),returnside='rigth') # check if both cbgL and cbgR make sence # if not -> return False! if not pacbpL: return False if not pacbpR: return False ######################################################## if verbose: print "#", node1, node2, lsrcoords[org1], print "L:", splitL, "R:", splitR print pacbp print pacbpL print pacbpR ######################################################## # pacbpL -> extented pacbporfL -> store to replacementsL newpacbporfL = pacb.conversion.pacbp2pacbporf(pacbpL, pacbporf.orfQ,pacbporf.orfS) newpacbporfL.extend_pacbporf_after_stops() replacementsL[(key,node1,node2)] = newpacbporfL # pacbpR -> extented pacbporfR -> store to replacementsR newpacbporfR = pacb.conversion.pacbp2pacbporf(pacbpR, pacbporf.orfQ,pacbporf.orfS) newpacbporfR.extend_pacbporf_after_stops() replacementsR[(key,node1,node2)] = newpacbporfR # do the pacbporf replacements in both CBGs statusL = _update_cbg_with_pacbporf_replacements( cbgL,replacementsL) statusR = _update_cbg_with_pacbporf_replacements( cbgR,replacementsR) # check if both cbgL and cbgR make sence if not statusL or not statusR: # return unchanged cbg status -> False return False # Verify the interface between cbgL and cbgR. # Most likely, the sites are nicely alignable. 
cbgIF = CodingBlockGraphInterface(cbgL,cbgR) cbgIF.force_intron_in_organisms( putative_inframe_introns.keys() ) cbgIF.allow_intron_in_organisms( putative_inframe_introns.keys() ) cbgIF.harvest_splice_sites() cbgIF.find_conserved_splice_sites() #################################################################### if verbose: print cbgL print cbgIF print cbgR cbgIF.interfaceproperties() #################################################################### # check the properties of the CBGinterface if cbgIF.optimalitycheck().count(True) >= 2: # yes; is_compatible and donor and/or acceptor is optimal cbgL._CBGinterface3p = cbgIF cbgR._CBGinterface5p = cbgIF cbgL.copy_5pcbginterface_from_othercbg(self) cbgR.copy_3pcbginterface_from_othercbg(self) return_cbg_list = [ cbgL, cbgR ] ################################################################ if verbose: print "INFRAME INTRON CONFIRMED!!" ################################################################ else: # no compatible interface... although intron(s) was/were found! # (at least) two options are now open: # 1. enforce the intron(s) and create cbgIF with _forced_ends # 2. ignore the intron(s) and create an intermediate lsrCBG # 1. is `tricky`. First, how sure is this inframe intron, # what type of criteria do we assume etc etc. # second, how to create a coorect cbgIF? It must be an # IS_SPLITTED interface, of which the boundaries might fall # outside the OMSR's of the CBGs. # 2. 
ignore the intron(s) and create an intermediate lsrCBG lsrCBG = create_intermediate_lowsimilarity_region(cbgL,cbgR) prepare_lsrcbg_and_cbg_for_gsg_insertion(cbgL,lsrCBG) prepare_lsrcbg_and_cbg_for_gsg_insertion(lsrCBG,cbgR) cbgL.copy_5pcbginterface_from_othercbg(self) cbgR.copy_3pcbginterface_from_othercbg(self) return_cbg_list = [ cbgL, lsrCBG, cbgR ] ################################################################ if verbose: print "no INFRAME INTRON -> lsrCBG" print cbgL print " ", lsrCBG._CBGinterface5p print " ", lsrCBG print " ", lsrCBG._CBGinterface3p print cbgR self.printmultiplealignment() print cbgL cbgL.printmultiplealignment() print cbgR cbgR.printmultiplealignment() ################################################################ # EOF this function. # return False if this CBG remained intact, list of splits when splitted if len(return_cbg_list) == 1: return False else: return return_cbg_list
def check_lsrcbgs_for_inframe_introns(self, verbose=False):
    """
    Check the lsrCBGs in the GSG and see if these regions can better
    be explained by an inframe intron

    For every lsrCBG that reports potential inframe intron(s), a
    CBGInterface is created directly between its two neighbouring CBGs.
    When that interface is compatible, the lsrCBG is removed from the GSG
    and the interface is stored on both neighbours. Otherwise a fresh lsrCBG
    plus its two interfaces are created to replace the old lsrCBG.

    @attention: a lsrCBG on the first or last position of the GSG has no
                CBG on both sides and is therefore skipped

    @type  verbose: Boolean
    @param verbose: print status/debugging messages to STDOUT

    @rtype:  integer
    @return: number of lsrCBGs replaced by a (predicted) inframe intron
    """
    INFRAME_INTRONS_PREDICTED = 0

    # loop backwards: removal of a lsrCBG does not shift unvisited positions
    for cbgpos in range(len(self)-1,-1,-1):
        cbg = self.codingblockgraphs[cbgpos]
        if cbg.__class__.__name__ != 'LowSimilarityRegionCodingBlockGraph':
            continue

        # do the inframe intron analyses on a lsrCBG
        inframeintrons = cbg.potentially_contains_inframe_intron(
                verbose=verbose)
        if not inframeintrons:
            continue

        # A lsrCBG needs a CBG on both sides. Without this check, position 0
        # would take the LAST CBG as its 5' neighbour (index -1) and the
        # final position would raise an IndexError.
        if cbgpos == 0 or cbgpos == len(self)-1:
            continue

        # apparently it seems possible to create one or more introns
        # in the lsrCBG; get the bordering CBGs
        prevCBG = self.codingblockgraphs[cbgpos-1]
        nextCBG = self.codingblockgraphs[cbgpos+1]

        # make CBGInterface between prevCBG and nextCBG;
        # reset the _IS_SPLITTED tags!
        prevCBG._splicedonorgraph = None
        prevCBG._CBGinterface3p = None
        prevCBG._forced_3p_ends = {}
        prevCBG.IS_3P_SPLITTED = False
        prevCBG.IS_SPLITTED = prevCBG.IS_5P_SPLITTED
        nextCBG._spliceacceptorgraph = None
        nextCBG._CBGinterface5p = None
        nextCBG._forced_5p_ends = {}
        nextCBG.IS_5P_SPLITTED = False
        nextCBG.IS_SPLITTED = nextCBG.IS_3P_SPLITTED

        # create an actual CBGInterface of both CBGs around the lsrCBG
        cbgIF = CodingBlockGraphInterface(prevCBG,nextCBG)
        if verbose: print(cbgIF)

        # re-harvest splice sites; store ALL the intron-projected sites
        cbgIF.harvest_splice_sites(allow_phase_shift=False,
                store_all_projected_sites=True)
        if verbose: print(cbgIF)

        # now remove all non-projected splice-sites in organisms that
        # are not reported to have a potential inframe intron
        cbgIF.allow_intron_in_organisms(inframeintrons)
        cbgIF.find_conserved_splice_sites()

        ################################################################
        if verbose:
            print(cbgIF)
            print(" ".join([ str(item) for item in (
                "compatible:", cbgIF.is_compatible(),
                "optimal:", cbgIF.is_optimal() ) ]))
            print(cbgIF._optimal_aligned_donor)
            print(cbgIF._optimal_aligned_acceptor)
        ################################################################

        if cbgIF.is_compatible():
            # yes, this is what we expect; a compatible CBGInterface!
            # this very likely represents an inframe intron!
            # remove the lsrCBG from the GSG
            self.codingblockgraphs.pop(cbgpos)
            # set the CBGInterface object in next and prev CBG
            prevCBG._CBGinterface3p = cbgIF
            nextCBG._CBGinterface5p = cbgIF
            # increase the counter of number of inframe introns predicted
            INFRAME_INTRONS_PREDICTED+=1
            ############################################################
            if verbose: print("INFRAME INTRON PREDICTED!!")
            ############################################################
        else:
            # nope, this does not seem like a proper inframe intron.
            # Set the IS_SPLITTED tags again and create a new lsrCBG
            # on the position of the old one
            prevCBG.IS_SPLITTED = True
            prevCBG.IS_3P_SPLITTED = True
            nextCBG.IS_SPLITTED = True
            nextCBG.IS_5P_SPLITTED = True
            lsrCBG = create_intermediate_lowsimilarity_region(
                    prevCBG,nextCBG)
            self.codingblockgraphs[cbgpos] = lsrCBG

            # recreate the CBGInterfaces (I)
            cbgIFa = CodingBlockGraphInterface(prevCBG,lsrCBG)
            cbgIFa.harvest_splice_sites()
            cbgIFa.find_conserved_splice_sites()
            # set the interface object to the CBGs in GSG
            prevCBG._CBGinterface3p = cbgIFa
            lsrCBG._CBGinterface5p = cbgIFa

            # recreate the CBGInterfaces (II)
            cbgIFb = CodingBlockGraphInterface(lsrCBG,nextCBG)
            cbgIFb.harvest_splice_sites()
            cbgIFb.find_conserved_splice_sites()
            # set the interface object to the CBGs in GSG
            lsrCBG._CBGinterface3p = cbgIFb
            nextCBG._CBGinterface5p = cbgIFb

            ############################################################
            if verbose: print("NO COMPATIBLE SITE!")
            ############################################################

    # return number of found inframe introns
    return INFRAME_INTRONS_PREDICTED
def search_for_lowsimilarity_regions(self,aligned_intron_min_aa_length=ALIGNED_INTRON_MIN_AA_LENGTH,verbose=False): """ Search CBGs in genestructure for lowsimilarity regions """ ################################################################ if verbose: stw = StopWatch(name='lsrCBGsearch') stw.start() ################################################################ # Loop reversed through genestructure to make sure that once # a CBG is splitted, the positions of the remainder of the # list stay intact. for posinGSG in range(len(self)-1,-1,-1): sg = self.codingblockgraphs[posinGSG] # skip IGNORED, lsrCBG and CBGs that are incomplete (still await HMM completion) if sg.IS_IGNORED: continue if sg.__class__.__name__ == 'LowSimilarityRegionCodingBlockGraph': continue if sg.node_count() < self.EXACT_SG_NODE_COUNT: continue if verbose: print stw.lap(), posinGSG, "start" # check for potential aligned intron if sg.potentially_contains_aligned_intron(window_aa_size=aligned_intron_min_aa_length): ######################################################## if verbose: print stw.lap(), posinGSG, "found" for k,v in sg.getomsrproteinsequences().iteritems(): print ">%s\n%s\n" % (k,v) print "ABOUT TO SPLIT:", sg print sg._cexpander.binarystring, print sg._cexpander.projected_on sg.printmultiplealignment() for k,pacbp in sg.pacbps.iteritems(): print k, pacbp ######################################################## # now actually split by inframe intron res = sg.split_codingblock_by_inframe_intron() if len(res) == 1: # no inframe intron found here pass else: # prepare the CBGs for insertion for pos in range(0,len(res)): splittedCBG = res[pos] splittedCBG.extend_pacbporfs(self.input) splittedCBG.update_edge_weights_by_minimal_spanning_range() splittedCBG.IS_SPLITTED = True if pos > 0: splittedCBG.IS_5P_SPLITTED = True splittedCBG.IS_FIRST = False if pos < len(res)-1: splittedCBG.IS_3P_SPLITTED = True splittedCBG.IS_LAST = False # (re)create the cache for the splitted CBGs 
splittedCBG.create_cache() ################################################ if verbose: print stw.lap(), posinGSG, "done!" print "SUCCESFULLY SPLITTED:", splittedCBG splittedCBG.printmultiplealignment() print splittedCBG._cexpander.binarystring, print splittedCBG._cexpander.projected_on print splittedCBG._omsr for trf in splittedCBG._cexpander._transferblocks: print trf.binarystring, trf.projected_on for k,v in splittedCBG._cexpander.inputsequences.iteritems(): print v,"\t",k for _org,orflist in splittedCBG.get_orfs_of_graph().iteritems(): print orflist[0], _org for pacbp in splittedCBG.pacbps.values(): print pacbp pacbp.print_protein(_linesize=100) ################################################ # create lsrCBGs and cbgIFs between them by looping in reversed # order over all pairs of CBGs (because lsrCBG insertion in list) for pos in range(len(res)-2,-1,-1): cbgL,cbgR = res[pos:pos+2] lsrCBG = create_intermediate_lowsimilarity_region(cbgL,cbgR) res.insert(pos+1,lsrCBG) # create cbgIF between the CBGs and the lsrCBG # just create -> cbgIF with lsrCBG is immediately is_optimal() cbgIFa = CodingBlockGraphInterface(cbgL,lsrCBG) cbgIFb = CodingBlockGraphInterface(lsrCBG,cbgR) # set cbgIF objects to the CBGs and the lsrCBG cbgL._CBGinterface3p = cbgIFa lsrCBG._CBGinterface5p = cbgIFa lsrCBG._CBGinterface3p = cbgIFb cbgR._CBGinterface5p = cbgIFb # update the first and last CBG in this list with the # cbgIFs of the parental CBG (variable sg) res[0]._CBGinterface5p = sg._CBGinterface5p res[-1]._CBGinterface3p = sg._CBGinterface3p # update the original IS_FIRST/IS_LAST status res[0].IS_FIRST = sg.IS_FIRST res[-1].IS_LAST = sg.IS_LAST # and set splittedCBGs to genestructure # by replacing the existing CBG (variable sg) on the # position posinGSG with the list op splitted CBGs self.codingblockgraphs.__setslice__(posinGSG,posinGSG+1,res) else: # nope, no potential inframe intron; just append ###print sg.total_weight(), False pass
def gsg_cexpander_enlarge_lsrcbgs(self, verbose=False):
    """
    Optimize the CBGs around each lsrCBG with cexpander data and recreate
    the lsrCBG when (one of) its neighbouring CBGs changed

    For every lsrCBG that is not on the first or last position of the GSG,
    cexpander_checkCBG4omsrbordergaps() is applied on the side of the
    previous and next CBG that faces the lsrCBG. When this fails with
    NoOverallMinimalSpanningRange or ZeroUniformlyAlignedPositions, the
    pacbps of both CBGs are restored from a backup and the lsrCBG is
    recreated from the restored CBGs.

    @attention: any other exception is NOT handled here; it propagates
                unchanged to the caller

    @type  verbose: Boolean
    @param verbose: print status/debugging messages to STDOUT

    @rtype:  integer
    @return: number of lsrCBGs that are replaced after a changed
             neighbouring CBG
    """
    lsr_coords_changed = 0
    for pos in range(1,len(self)-1):
        if self.codingblockgraphs[pos].__class__.__name__ !=\
        'LowSimilarityRegionCodingBlockGraph':
            continue

        # get previous and next CBG
        prevCBG = self.codingblockgraphs[pos-1]
        nextCBG = self.codingblockgraphs[pos+1]

        # obtain current CBG data for logging when something fails
        strreprPrevCbg = str(prevCBG)
        strreprLsrCbg  = str(self.codingblockgraphs[pos])
        strreprNextCbg = str(nextCBG)

        # deepcopy Pacbps in case cexpander omsr border gaps
        # operations mingles the CBG(s)
        bckp_prevcbg_pacbps = deepcopy(prevCBG.pacbps)
        bckp_nextcbg_pacbps = deepcopy(nextCBG.pacbps)

        # name of the exception that signals a lost CBG (None -> no failure)
        failure = None
        try:
            # optimize the CBGs around the lsrCBG with cexpander data
            statusP = lib_cexpander.cexpander_checkCBG4omsrbordergaps(
                        prevCBG, omit5pside = True )
            statusN = lib_cexpander.cexpander_checkCBG4omsrbordergaps(
                        nextCBG, omit3pside = True )

            if statusP or statusN:
                # if one or both CBGs changed -> new lsrCBG
                if statusP: prevCBG.create_cache()
                if statusN: nextCBG.create_cache()
                newLsrCBG = create_intermediate_lowsimilarity_region(
                        prevCBG,nextCBG)
                prepare_lsrcbg_and_cbg_for_gsg_insertion(prevCBG,newLsrCBG)
                prepare_lsrcbg_and_cbg_for_gsg_insertion(newLsrCBG,nextCBG)
                self.codingblockgraphs[pos] = newLsrCBG
                ############################################################
                if verbose:
                    print("gsg_cexpander_enlarge_lsrcbgs WAS:")
                    print(strreprPrevCbg)
                    print(strreprLsrCbg)
                    print(strreprNextCbg)
                    print("gsg_cexpander_enlarge_lsrcbgs IS:")
                    print(prevCBG)
                    print(newLsrCBG)
                    print(nextCBG)
                ############################################################
                lsr_coords_changed += 1

        except NoOverallMinimalSpanningRange:
            # that is - normally - the signal for deleting this CBG.
            # However, here it is a SEVERE problem. The CBG is 'lost' due to
            # the cexpander optimization and must be restored.
            failure = "NoOverallMinimalSpanningRange"
        except lib_cexpander.ZeroUniformlyAlignedPositions:
            # due to optimization, the multiple alignment collapsed;
            # same severity and same remedy as above
            failure = "lib_cexpander.ZeroUniformlyAlignedPositions"
        # No catch-all clause: the former bare `except:` raised a string,
        # which hides the real error and is a TypeError itself since
        # Python 2.6. An unexpected exception now reaches the caller as is.

        if failure is None:
            continue

        ####################################################################
        if verbose:
            print("SeriousWarning: CBG lost due to gsg_cexpander_enlarge_lsrcbgs")
            print(failure)
            print(strreprPrevCbg)
            print(strreprLsrCbg)
            print(strreprNextCbg)
        ####################################################################

        # Restore CBGs and lsrCBG in state as before this operation
        prevCBG.pacbps = bckp_prevcbg_pacbps
        prevCBG.create_cache()
        nextCBG.pacbps = bckp_nextcbg_pacbps
        nextCBG.create_cache()
        restoredLsrCBG = create_intermediate_lowsimilarity_region(
                prevCBG,nextCBG)
        prepare_lsrcbg_and_cbg_for_gsg_insertion(prevCBG,restoredLsrCBG)
        prepare_lsrcbg_and_cbg_for_gsg_insertion(restoredLsrCBG,nextCBG)
        self.codingblockgraphs[pos] = restoredLsrCBG

    # return the counter how much lsrCBGs are changed
    return lsr_coords_changed
def _place_cbg_in_partialgsg(cbglist, partialGSG, optimizetinyexoninterface=True, omit_conditional_addition=False, verbose=False): """ @type cbglist: [] @param cbglist: list with CodingBlockGraphs that might be palced in the (partial)GSG @type partialGSG: GeneStructureOfCodingBlockGraphs @param partialGSG: partial GeneStructureOfCodingBlockGraphs in which the CBGs are tried to be inserted into @rtype: Boolean @return: are any CBGs from cbglist placed into partialGSG? """ # import function here to prevent circular import # TODO: make correct import! from genestructure_intermediatecbg import intermediateCBG_node_comparison placed_in_partialGSG = [] curgsglen = len(partialGSG) while cbglist: for i in range(0, len(cbglist)): cbg = cbglist[i] # placeability check in the GSG with function's settings # for topological check or not placeability = partialGSG.add_codingblock( cbg, only_try_adding=True, omit_conditional_addition=omit_conditional_addition) ############################################################ if verbose: print i, cbg, placeability ############################################################ if not placeability: continue # place in the partialGSG and find the inserted position added = partialGSG.add_codingblock( cbg, omit_conditional_addition=omit_conditional_addition) cbgposingsg = _cbg_position_in_gsg(cbg, partialGSG) # do intermediateCBG_node_comparison() in the insert position if cbgposingsg > 0 and cbgposingsg < len(partialGSG) - 1: prevCBG = partialGSG.codingblockgraphs[cbgposingsg - 1] nextCBG = partialGSG.codingblockgraphs[cbgposingsg + 1] if False == intermediateCBG_node_comparison( prevCBG, cbg, nextCBG): # erroneou CBG insert -> continue continue # replace proper pacbporfs from the parents if cbgposingsg > 0: prevCBG = partialGSG.codingblockgraphs[cbgposingsg - 1] replacements1 = partialGSG.codingblockgraphs[ cbgposingsg]._recrute_pacbporfs_from_parental_cbg( prevCBG, verbose=verbose) if cbgposingsg < len(partialGSG) - 1: nextCBG = 
partialGSG.codingblockgraphs[cbgposingsg + 1] replacements2 = partialGSG.codingblockgraphs[ cbgposingsg]._recrute_pacbporfs_from_parental_cbg( nextCBG, verbose=verbose) # create cbgIFs created = partialGSG.create_cbginterfaces() # check if one of the direct neighbouring CBGs has # the same set of nodes -> signal for a lsrCBG cbginterface_isa_lsrcbg = False cbginterface_isa_lsrcbg_asses_cbgIFa = None cbginterface_isa_lsrcbg_asses_cbgIFb = None lsrCBG = None if cbgposingsg > 0 and len(cbg.node_set().intersection( partialGSG.codingblockgraphs[ cbgposingsg - 1].node_set())) == cbg.node_count(): # left/5p of cbg is a CBG in the partialGSG with identical node set cbginterface_isa_lsrcbg = True cbginterface_isa_lsrcbg_asses_cbgIFa = False cbginterface_isa_lsrcbg_asses_cbgIFb = True lsrCBG = codingblock_splitting.create_intermediate_lowsimilarity_region( partialGSG.codingblockgraphs[cbgposingsg - 1], cbg) if cbgposingsg < len(partialGSG) - 1 and len( cbg.node_set().intersection(partialGSG.codingblockgraphs[ cbgposingsg + 1].node_set())) == cbg.node_count(): # right/3p of cbg is a CBG in the partialGSG with identical node set cbginterface_isa_lsrcbg = True cbginterface_isa_lsrcbg_asses_cbgIFa = True cbginterface_isa_lsrcbg_asses_cbgIFb = False lsrCBG = codingblock_splitting.create_intermediate_lowsimilarity_region( cbg, partialGSG.codingblockgraphs[cbgposingsg + 1]) # assess the created CBGinterfaces cbgIFa = partialGSG.codingblockgraphs[cbgposingsg]._CBGinterface5p cbgIFb = partialGSG.codingblockgraphs[cbgposingsg]._CBGinterface3p if cbgIFa: if optimizetinyexoninterface: cbgIFa.optimizetinyexoninterface() cbgIFaCheck = cbgIFa.optimalitycheck() else: cbgIFaCheck = [None, None, None] if cbgIFb: if optimizetinyexoninterface: cbgIFb.optimizetinyexoninterface() cbgIFbCheck = cbgIFb.optimalitycheck() else: cbgIFbCheck = [None, None, None] # check if this freshly placed CBG makes sense to place in the partialGSG if cbginterface_isa_lsrcbg and lsrCBG: is_lsrcbg_addable = True if 
cbgIFa and cbginterface_isa_lsrcbg_asses_cbgIFa and\ cbgIFaCheck.count(True) < 2: is_lsrcbg_addable = False if cbgIFb and cbginterface_isa_lsrcbg_asses_cbgIFb and\ cbgIFbCheck.count(True) < 2: is_lsrcbg_addable = False if is_lsrcbg_addable: # addable in the partialGSG; add the lsrCBG too added = partialGSG.add_codingblock( lsrCBG, omit_conditional_addition=True) # create cbgInterfaces for the novel added lsrCBG partialGSG.create_cbginterfaces() placed_in_partialGSG.append(i) # continue trying adding the next CBG continue else: # nope, not addable; pass here and solve # removal of this (lsr)CBG lateron pass elif cbginterface_isa_lsrcbg and not lsrCBG: is_lsrcbg_addable = True if cbgIFa and cbginterface_isa_lsrcbg_asses_cbgIFa and\ cbgIFaCheck.count(True) < 2: is_lsrcbg_addable = False if cbgIFb and cbginterface_isa_lsrcbg_asses_cbgIFb and\ cbgIFbCheck.count(True) < 2: is_lsrcbg_addable = False if is_lsrcbg_addable: # addable in the partialGSG as a tight fit to existing CBGs # without a lsrCBG. Weird & rare case but can happen! 
# re-create cbgInterface for the novel added CBG because # it is not recognized asa splitted interface yet partialGSG.codingblockgraphs[ cbgposingsg]._CBGinterface5p = None partialGSG.codingblockgraphs[ cbgposingsg]._CBGinterface3p = None if cbgposingsg > 0: partialGSG.codingblockgraphs[cbgposingsg - 1]._CBGinterface3p = None if cbgposingsg < len(partialGSG) - 1: partialGSG.codingblockgraphs[cbgposingsg + 1]._CBGinterface5p = None # now recreate cbgInterfaces partialGSG.create_cbginterfaces() placed_in_partialGSG.append(i) # continue trying adding the next CBG continue else: # nope, not addable; pass here and solve # removal of this (lsr)CBG lateron pass elif cbgIFa and cbgIFb: if cbgIFaCheck.count(True) >= 2 or cbgIFbCheck.count( True) >= 2: ############################################################ if verbose: print "PLACEDab\n", cbgIFa, "\n", cbg, "\n", cbgIFb ############################################################ # succesfully placed; leave in place placed_in_partialGSG.append(i) # continue trying adding the next CBG continue elif cbgIFa: if cbgIFaCheck.count(True) >= 2: ############################################################ if verbose: print "PLACEDa\n", cbgIFa, "\n", cbg ############################################################ # succesfully placed; leave in place placed_in_partialGSG.append(i) # continue trying adding the next CBG continue elif cbgIFb: if cbgIFbCheck.count(True) >= 2: ############################################################ if verbose: print "PLACEDb\n", cbg, "\n", cbgIFb ############################################################ # succesfully placed; leave in place placed_in_partialGSG.append(i) # continue trying adding the next CBG continue else: # what else!? 
raise "No cbgIFs at all in partialGSG %s for cbg %s" % ( partialGSG, cbg) ############################################################ if verbose: print i, "NOTPLACABLE!", print cbg print "cbgIFa:", cbgIFa print "cbgIFb:", cbgIFb ############################################################ # Remove the falsely placed CBG and recreate original cbgIFs partialGSG.codingblockgraphs.pop(cbgposingsg) created = partialGSG.create_cbginterfaces() # done with trying to add this CBGS. Do next... if placed_in_partialGSG: # remove the CBGs that are placed in the partialGSG _remove_placed_cbgs_from_list(placed_in_partialGSG, cbglist) # reset placed_in_partialGSG to empty list placed_in_partialGSG = [] else: # no cbgs placed in the GSG -> break the while loop break # check if a CBG was added if len(partialGSG) > curgsglen: return True else: return False
def _place_cbg_in_partialgsg(cbglist,partialGSG, optimizetinyexoninterface=True, omit_conditional_addition=False, verbose=False): """ @type cbglist: [] @param cbglist: list with CodingBlockGraphs that might be palced in the (partial)GSG @type partialGSG: GeneStructureOfCodingBlockGraphs @param partialGSG: partial GeneStructureOfCodingBlockGraphs in which the CBGs are tried to be inserted into @rtype: Boolean @return: are any CBGs from cbglist placed into partialGSG? """ # import function here to prevent circular import # TODO: make correct import! from genestructure_intermediatecbg import intermediateCBG_node_comparison placed_in_partialGSG = [] curgsglen = len(partialGSG) while cbglist: for i in range(0,len(cbglist)): cbg = cbglist[i] # placeability check in the GSG with function's settings # for topological check or not placeability = partialGSG.add_codingblock(cbg, only_try_adding=True, omit_conditional_addition=omit_conditional_addition ) ############################################################ if verbose: print i, cbg, placeability ############################################################ if not placeability: continue # place in the partialGSG and find the inserted position added = partialGSG.add_codingblock(cbg,omit_conditional_addition=omit_conditional_addition) cbgposingsg = _cbg_position_in_gsg(cbg,partialGSG) # do intermediateCBG_node_comparison() in the insert position if cbgposingsg > 0 and cbgposingsg < len(partialGSG)-1: prevCBG = partialGSG.codingblockgraphs[cbgposingsg-1] nextCBG = partialGSG.codingblockgraphs[cbgposingsg+1] if False == intermediateCBG_node_comparison(prevCBG,cbg,nextCBG): # erroneou CBG insert -> continue continue # replace proper pacbporfs from the parents if cbgposingsg > 0: prevCBG = partialGSG.codingblockgraphs[cbgposingsg-1] replacements1 = partialGSG.codingblockgraphs[cbgposingsg]._recrute_pacbporfs_from_parental_cbg(prevCBG,verbose=verbose) if cbgposingsg < len(partialGSG)-1: nextCBG = 
partialGSG.codingblockgraphs[cbgposingsg+1] replacements2 = partialGSG.codingblockgraphs[cbgposingsg]._recrute_pacbporfs_from_parental_cbg(nextCBG,verbose=verbose) # create cbgIFs created = partialGSG.create_cbginterfaces() # check if one of the direct neighbouring CBGs has # the same set of nodes -> signal for a lsrCBG cbginterface_isa_lsrcbg = False cbginterface_isa_lsrcbg_asses_cbgIFa = None cbginterface_isa_lsrcbg_asses_cbgIFb = None lsrCBG = None if cbgposingsg > 0 and len(cbg.node_set().intersection( partialGSG.codingblockgraphs[cbgposingsg-1].node_set())) == cbg.node_count(): # left/5p of cbg is a CBG in the partialGSG with identical node set cbginterface_isa_lsrcbg = True cbginterface_isa_lsrcbg_asses_cbgIFa = False cbginterface_isa_lsrcbg_asses_cbgIFb = True lsrCBG = codingblock_splitting.create_intermediate_lowsimilarity_region( partialGSG.codingblockgraphs[cbgposingsg-1], cbg ) if cbgposingsg < len(partialGSG)-1 and len(cbg.node_set().intersection( partialGSG.codingblockgraphs[cbgposingsg+1].node_set())) == cbg.node_count(): # right/3p of cbg is a CBG in the partialGSG with identical node set cbginterface_isa_lsrcbg = True cbginterface_isa_lsrcbg_asses_cbgIFa = True cbginterface_isa_lsrcbg_asses_cbgIFb = False lsrCBG = codingblock_splitting.create_intermediate_lowsimilarity_region( cbg, partialGSG.codingblockgraphs[cbgposingsg+1] ) # assess the created CBGinterfaces cbgIFa = partialGSG.codingblockgraphs[cbgposingsg]._CBGinterface5p cbgIFb = partialGSG.codingblockgraphs[cbgposingsg]._CBGinterface3p if cbgIFa: if optimizetinyexoninterface: cbgIFa.optimizetinyexoninterface() cbgIFaCheck = cbgIFa.optimalitycheck() else: cbgIFaCheck = [ None, None, None ] if cbgIFb: if optimizetinyexoninterface: cbgIFb.optimizetinyexoninterface() cbgIFbCheck = cbgIFb.optimalitycheck() else: cbgIFbCheck = [ None, None, None ] # check if this freshly placed CBG makes sense to place in the partialGSG if cbginterface_isa_lsrcbg and lsrCBG: is_lsrcbg_addable = True if cbgIFa and 
cbginterface_isa_lsrcbg_asses_cbgIFa and\ cbgIFaCheck.count(True) < 2: is_lsrcbg_addable = False if cbgIFb and cbginterface_isa_lsrcbg_asses_cbgIFb and\ cbgIFbCheck.count(True) < 2: is_lsrcbg_addable = False if is_lsrcbg_addable: # addable in the partialGSG; add the lsrCBG too added = partialGSG.add_codingblock(lsrCBG,omit_conditional_addition=True) # create cbgInterfaces for the novel added lsrCBG partialGSG.create_cbginterfaces() placed_in_partialGSG.append(i) # continue trying adding the next CBG continue else: # nope, not addable; pass here and solve # removal of this (lsr)CBG lateron pass elif cbginterface_isa_lsrcbg and not lsrCBG: is_lsrcbg_addable = True if cbgIFa and cbginterface_isa_lsrcbg_asses_cbgIFa and\ cbgIFaCheck.count(True) < 2: is_lsrcbg_addable = False if cbgIFb and cbginterface_isa_lsrcbg_asses_cbgIFb and\ cbgIFbCheck.count(True) < 2: is_lsrcbg_addable = False if is_lsrcbg_addable: # addable in the partialGSG as a tight fit to existing CBGs # without a lsrCBG. Weird & rare case but can happen! 
# re-create cbgInterface for the novel added CBG because # it is not recognized asa splitted interface yet partialGSG.codingblockgraphs[cbgposingsg]._CBGinterface5p = None partialGSG.codingblockgraphs[cbgposingsg]._CBGinterface3p = None if cbgposingsg > 0: partialGSG.codingblockgraphs[cbgposingsg-1]._CBGinterface3p = None if cbgposingsg < len(partialGSG)-1: partialGSG.codingblockgraphs[cbgposingsg+1]._CBGinterface5p = None # now recreate cbgInterfaces partialGSG.create_cbginterfaces() placed_in_partialGSG.append(i) # continue trying adding the next CBG continue else: # nope, not addable; pass here and solve # removal of this (lsr)CBG lateron pass elif cbgIFa and cbgIFb: if cbgIFaCheck.count(True) >= 2 or cbgIFbCheck.count(True) >= 2: ############################################################ if verbose: print "PLACEDab\n", cbgIFa, "\n", cbg, "\n", cbgIFb ############################################################ # succesfully placed; leave in place placed_in_partialGSG.append(i) # continue trying adding the next CBG continue elif cbgIFa: if cbgIFaCheck.count(True) >= 2: ############################################################ if verbose: print "PLACEDa\n", cbgIFa, "\n", cbg ############################################################ # succesfully placed; leave in place placed_in_partialGSG.append(i) # continue trying adding the next CBG continue elif cbgIFb: if cbgIFbCheck.count(True) >= 2: ############################################################ if verbose: print "PLACEDb\n", cbg, "\n", cbgIFb ############################################################ # succesfully placed; leave in place placed_in_partialGSG.append(i) # continue trying adding the next CBG continue else: # what else!? 
raise "No cbgIFs at all in partialGSG %s for cbg %s" % ( partialGSG, cbg ) ############################################################ if verbose: print i, "NOTPLACABLE!", print cbg print "cbgIFa:", cbgIFa print "cbgIFb:", cbgIFb ############################################################ # Remove the falsely placed CBG and recreate original cbgIFs partialGSG.codingblockgraphs.pop(cbgposingsg) created = partialGSG.create_cbginterfaces() # done with trying to add this CBGS. Do next... if placed_in_partialGSG: # remove the CBGs that are placed in the partialGSG _remove_placed_cbgs_from_list(placed_in_partialGSG,cbglist) # reset placed_in_partialGSG to empty list placed_in_partialGSG = [] else: # no cbgs placed in the GSG -> break the while loop break # check if a CBG was added if len(partialGSG) > curgsglen: return True else: return False