def add_codingblock(self, new, log_debug=False, only_try_adding=False, max_cbg_gtg_topo_dif=None, max_cbg_gtg_abs_dif=None, min_cbg_gtg_id_ratio=None, min_tcode_omsr=None, omit_conditional_addition=False): """ (Try to) add a CodingBlockGraph to the genestructure @type new: CodingBlockGraph @param new: CodingBlockGraph object to be added to the genestructure @type only_try_adding: Boolean @param only_try_adding: only try to add the CBG and return succes status @type omit_conditional_addition: Boolean @param omit_conditional_addition: omit all CBG quality checks; just verify if the CBG is placeable based on the OMSR coordinates @type max_cbg_gtg_topo_dif: float (or None) @param max_cbg_gtg_topo_dif: @type max_cbg_gtg_abs_dif: float (or None) @param max_cbg_gtg_abs_dif: @type max_cbg_gtg_id_ratio: float (or None) @param max_cbg_gtg_id_ratio: @type min_tcode_omsr: float (or None) @param min_tcode_omsr: @rtype: Boolean @return: True or False, weather or not adding was succesfull """ verbose = log_debug # log_debug must be replaced by verbose.... # update edge weights by overall minimal spanning range if not only_try_adding: new.update_edge_weights_by_minimal_spanning_range() # check for difference with GeneTreeGraph if new.__class__.__name__ == 'CodingBlockGraph' and\ not omit_conditional_addition and\ self.genetree() and len(self) >= 1: newgtg = new.genetree() if new.node_count() == self.EXACT_SG_NODE_COUNT: gtg = self.genetree() else: # new to-be-placed graph misses certain organism node(s) completegtg = self.genetree() gtg = deepcopy(completegtg) for missingorg in completegtg.organism_set().difference( new.organism_set()): gtg.del_node(missingorg) # calculate identity, topological and absolute GTG differences cbg_gtg_topo_dif = gtg.graphalignmentdifference(newgtg) cbg_gtg_abs_dif = gtg.absolutegraphalignmentdifference(newgtg) cbg_gtg_id_ratio = newgtg.identity() / gtg.identity() #################################################################### if verbose: print "cbg2gsg", new #################################################################### # check the identity ratio if min_cbg_gtg_id_ratio: threshold_min_cbg_gtg_id_ratio = MIN_GTG_ID_RATIO_FUNCTION( min_cbg_gtg_id_ratio, gtg, new) ################################################################ if verbose: print "CUSTOM", threshold_min_cbg_gtg_id_ratio, print min_cbg_gtg_id_ratio ################################################################ else: threshold_min_cbg_gtg_id_ratio = MIN_GTG_ID_RATIO_FUNCTION( self.MIN_CBG_GTG_ID_RATIO, gtg, new) ################################################################ if verbose: print "NORMAL", threshold_min_cbg_gtg_id_ratio, print self.MIN_CBG_GTG_ID_RATIO ################################################################ if cbg_gtg_id_ratio < threshold_min_cbg_gtg_id_ratio: ################################################################ if verbose: print "rejected on ID ratio", threshold_min_cbg_gtg_id_ratio, print ">", cbg_gtg_id_ratio ################################################################ return False else: pass # check the relative topological difference if max_cbg_gtg_topo_dif: threshold_max_cbg_gtg_topo_dif = MAX_GTG_TOPO_DIF_FUNCTION( max_cbg_gtg_topo_dif, gtg, new) ################################################################ if verbose: print "CUSTOM", threshold_max_cbg_gtg_topo_dif, print max_cbg_gtg_topo_dif, gtg.identity(), new.omsrlength( ) ################################################################ else: threshold_max_cbg_gtg_topo_dif = MAX_GTG_TOPO_DIF_FUNCTION( self.MAX_CBG_GTG_TOPO_DIF, gtg, new) ################################################################ if verbose: print "NORMAL", threshold_max_cbg_gtg_topo_dif, print self.MAX_CBG_GTG_TOPO_DIF, gtg.identity( ), new.omsrlength() ################################################################ if cbg_gtg_id_ratio >= 1.10: # ignore TOPO_DIF check when newgtg.id% >> gtg.id% pass elif cbg_gtg_topo_dif > threshold_max_cbg_gtg_topo_dif: ################################################################ if verbose: print "rejected on TOPO_DIF", cbg_gtg_topo_dif, print ">", threshold_max_cbg_gtg_topo_dif ################################################################ return False else: pass # check the absolute topological difference if max_cbg_gtg_abs_dif: threshold_max_cbg_gtg_abs_dif = MAX_GTG_ABS_DIF_FUNCTION( max_cbg_gtg_abs_dif, gtg, new) ################################################################ if verbose: print "CUSTOM", threshold_max_cbg_gtg_abs_dif, print max_cbg_gtg_abs_dif, gtg.identity(), new.omsrlength() ################################################################ else: threshold_max_cbg_gtg_abs_dif = MAX_GTG_ABS_DIF_FUNCTION( self.MAX_CBG_GTG_ABS_DIF, gtg, new) ################################################################ if verbose: print "NORMAL", threshold_max_cbg_gtg_abs_dif, print self.MAX_CBG_GTG_ABS_DIF ################################################################ if cbg_gtg_id_ratio >= 1.10: # ignore TOPO_DIF check when newgtg.id% >> gtg.id% pass elif cbg_gtg_abs_dif > threshold_max_cbg_gtg_abs_dif: ################################################################ if verbose: print "rejected on ABS_DIF", threshold_max_cbg_gtg_abs_dif, print "<", cbg_gtg_abs_dif ################################################################ return False else: pass # check the Tcode score if min_tcode_omsr > new.msr_tcode_score(): ################################################################ if verbose: print "rejected on MIN_TCODE_OMSR", min_tcode_omsr, ">", print new.msr_tcode_score() ################################################################ return False else: pass else: # probably omit_conditional_addition==True pass # check if exactly this one is already in the genestructure if self.is_codingblockgraph_already_in_genestructure(new): #################################################################### if verbose: print "already in genestructure!!" #################################################################### return False # if LowSimilarityRegionCodingBlockGraph, do a OMSR check if new.__class__.__name__ == 'LowSimilarityRegionCodingBlockGraph': # get the OMSR of the complete GeneStructure omsrGSG = self.overall_minimal_spanning_range() for node, omsr in new._omsr.iteritems(): org = new.organism_by_node(node) if omsrGSG[org].intersection(omsr): ############################################################ if verbose: print "ALREADY IN GENESTRUCTURE!", "\n", omsr print omsrGSG[org].intersection(omsr) ############################################################ return False else: ################################################################ if verbose: print "lsrCBG NOT in genestructure's OMSR" ################################################################ # new lsrCBG! just continue with this function pass for pos in range(0, len(self)): cbg = self.codingblockgraphs[pos] # do not compare position towards a LowSimilarityRegionCodingBlockGraph if cbg.__class__.__name__ == 'LowSimilarityRegionCodingBlockGraph': continue # get relative positioning data of 2 CBGs (absPosCbg, absPosNew, binPosCbg, binPosNew, orfIdent, posRel, posBin) = relatively_positioned_towards(cbg, new) if verbose: print "eval cbg pos", pos, "binarytuples:", binPosCbg, binPosNew # Check the positioning in binaryCbgPositioning # Required positioning of new codingblock `new` is cbg-->new if binPosCbg == (1, 0, 0) and binPosNew == (0, 0, 1): # The order is new-->cbg; continue continue elif new.__class__.__name__ == 'LowSimilarityRegionCodingBlockGraph' and new.node_set( ).difference(cbg.node_set()): # binPos comparison assumes correct position, but it is not because node intersection # not all nodes in lsrCBG 'new' are shared in next CBG 'cbg' -> ignore here! continue elif new.__class__.__name__ == 'LowSimilarityRegionCodingBlockGraph' and binPosCbg == ( 1, 0, 0) and binPosNew == (0, 0, 0): # Final check -> is the lsrCBG directly ajacent to the cbg? # In case a single orf set is split up in several CBGs (and several lsrCBGs), # insertion can give problems when not verifying the distance distances = cbg.distance_between_codingblocks(new) if list(Set(distances.values())) == [0]: # yes, this is the position where we want to insert pass else: # nope, not the correct, diretly adjacent position if verbose: print "# NOT ADDING HERE a lsrCBG????" print cbg print new print distances # continue to the next cbg position continue # if only_try_adding, return a succesfull True! if only_try_adding: return True if verbose: print new, binPosCbg, binPosNew, "inserting on pos %s+1" % pos # Add into the ordered GeneStructure! tobeadded = new tobeadded.create_cache() self.codingblockgraphs.insert(pos + 1, tobeadded) # succesfull insert; return True return True elif binPosCbg == (0, 0, 1) and binPosNew == (1, 0, 0): # The order is cbg-->new; this is the required location! if new.__class__.__name__ == 'LowSimilarityRegionCodingBlockGraph': # check if directly neighboring this cbg distances = cbg.distance_between_codingblocks(new) print new print distances # if only_try_adding, return a succesfull True! if only_try_adding: return True # Make deepcopy of this CBG in orde to prevent abnormalities # when splitting/merging in lateron function #tobeadded = deepcopy(new) tobeadded = new tobeadded.create_cache() self.codingblockgraphs.insert(pos, tobeadded) if verbose: print "ADDING:", pos, new print binPosCbg, binPosNew # succesfull insert; return True return True elif binPosCbg == (0, 1, 0) and binPosNew == (0, 1, 0): # Exactly this codingblock exists already in the genestructure return False elif binPosCbg in [(1, 0, 0), (1, 1, 0)] and binPosNew in [(0, 0, 1), (0, 1, 1)]: # Check for: binPosCbg ~= (1,?,0) and binPosNew ~= (0,?,1) # Some overlap, and the order is new-->cbg # So, potential accepted new CBG 1 position AFTER this one! # Check if the overlap is compatible with Orf order for i in range(0, len(posBin)): (a1, a2, a3), (b1, b2, b3) = posRel[i] (Ba1, Ba2, Ba3), (Bb1, Bb2, Bb3) = posBin[i] identicalorfs = orfIdent[i] if _is_compatible_overlap((a1, a2, a3), (b1, b2, b3), (Ba1, Ba2, Ba3), (Bb1, Bb2, Bb3), identicalorfs): pass else: if verbose: print "ABOUT TO BREAK THE FORLOOP I:" print(a1, a2, a3), (b1, b2, b3) print(Ba1, Ba2, Ba3), (Bb1, Bb2, Bb3) cbg_overlap_ratio = float(a2) / float(a1 + a2 + a3) new_overlap_ratio = float(b2) / float(b1 + b2 + b3) overlap_ratio = max( [cbg_overlap_ratio, new_overlap_ratio]) print overlap_ratio # No! incorrect positioning -> break break else: # EOF forloop nicely reached; compatible new codingblock # The order is cbg-->new; this is the required location! # check how it is positioned towards the other; NO OVERLAP ALOWED HERE! if pos < len(self) - 1: next = self.codingblockgraphs[pos + 1] (absPosNext, absPosNewNext, binPosNext, binPosNewNext, orfIdentNext, posRelNext, posBinNext) = relatively_positioned_towards( new, next) # now check binPosNext & binPosNewNext: should be (1, 0, 0) & (0, 0, 1) # That means -> no overlap with the next CBG if binPosNext == (1, 0, 0) and binPosNewNext == (0, 0, 1): # no overlap -> ready to store! pass else: # there is overlap! Do not allow addition of this CBG ########################################################### if verbose: print "OVERLAP WITH NEXT CBG!:", pos, new print binPosNext, binPosNewNext ########################################################### return False else: # no further CBGs in GSG, so no check possible (and no overlap possible ;-) pass # if only_try_adding, return a succesfull True! if only_try_adding: return True # Make deepcopy of this CBG in orde to prevent abnormalities # when splitting/merging in lateron function #tobeadded = deepcopy(new) tobeadded = new tobeadded.create_cache() self.codingblockgraphs.insert(pos + 1, tobeadded) if verbose: print "ADDING:", pos + 1, new print binPosCbg, binPosNew # succesfull insert; return True return True # forloop broken -> incompatible new codingblock return False elif binPosCbg in [(0, 0, 1), (0, 1, 1)] and binPosNew in [(1, 0, 0), (1, 1, 0)]: # Check for: binPosCbg ~= (0,?,1) and binPosNew ~= (1,?,0) # The order is new-->cbg; this is the required location! # Check if the overlap is compatible with Orf order for i in range(0, len(posBin)): (a1, a2, a3), (b1, b2, b3) = posRel[i] (Ba1, Ba2, Ba3), (Bb1, Bb2, Bb3) = posBin[i] identicalorfs = orfIdent[i] if _is_compatible_overlap((a1, a2, a3), (b1, b2, b3), (Ba1, Ba2, Ba3), (Bb1, Bb2, Bb3), identicalorfs): pass else: if verbose: print "ABOUT TO BREAK THE FORLOOP II:" print(a1, a2, a3), (b1, b2, b3) print(Ba1, Ba2, Ba3), (Bb1, Bb2, Bb3) cbg_overlap_ratio = float(a2) / float(a1 + a2 + a3) new_overlap_ratio = float(b2) / float(b1 + b2 + b3) overlap_ratio = max( [cbg_overlap_ratio, new_overlap_ratio]) print overlap_ratio # No! incorrect positioning -> break break else: # EOF forloop nicely reached; compatible new codingblock # The order is new-->cbg; this is the required location! # if only_try_adding, return a succesfull True! if only_try_adding: return True # Make deepcopy of this CBG in orde to prevent abnormalities # when splitting/merging in lateron function #tobeadded = deepcopy(new) tobeadded = new tobeadded.create_cache() self.codingblockgraphs.insert(pos, tobeadded) # succesfull insert; return True return True # forloop broken -> incompatible new codingblock return False else: # A more messy positioning (overlaps/repetitive etc.) if verbose: print "WEIRD BINARY SUMMED ORDER!", absPosCbg, absPosNew, binPosCbg, binPosNew print cbg for i in range(0, len(posBin)): print posRel[i], posBin[i], orfIdent[i] # Reject this new codingblockgrap return False else: # If eof for loop is reached, append to the end # of current genestructure # if only_try_adding, return a succesfull True! if only_try_adding: return True # Make deepcopy of this CBG in orde to prevent abnormalities # when splitting/merging in lateron function #tobeadded = deepcopy(new) tobeadded = new tobeadded.create_cache() self.codingblockgraphs.append(tobeadded) # check if this is the first added cbg to GeneStructure object if len(self.codingblockgraphs) == 1: # cache the genetreegraph object gtg = self.set_genetree() # set threshold values as a function of gtg self.initialize_first_added_cbg() # succesfull insert; return True return True
def remove_overlapping_cbgs(self,verbose=False, ignore_is_optimal_cbgif=True, ignore_is_compatible_cbgif=False, cbg_max_alowed_overlap_aa_length=CBG_MAX_ALOWED_GSGREMOVAL_OVERLAP_AA_LENGTH, cbg_max_alowed_overlap_ratio=CBG_MAX_ALOWED_GSGREMOVAL_OVERLAP_RATIO): """ Remove overlapping CBGs in the GSG @type ignore_is_optimal_cbgif: Boolean @param ignore_is_optimal_cbgif: if True, leave is_optimal() cbgIFs intact @type ignore_is_compatible_cbgif: Boolean @param ignore_is_compatible_cbgif: if True, leave is_compatible() cbgIFs intact @type cbg_max_alowed_aa_length: integer @param cbg_max_alowed_aa_length: maximal overlap between CBGs in AA's @type cbg_max_alowed_overlap_ratio: float @param cbg_max_alowed_overlap_ratio: ratio between overlap and omsr (AA) @type verbose: Boolean @param verbose: print status/debugging messages to STDOUT @rtype: Integer @return: Number of CBGs that are removed from the GSG """ removed = True removed_cnt = 0 while removed: removed = False for pos in range(1,len(self)): # get concerned CBGs (cbg1,cbg2) = self.codingblockgraphs[pos-1:pos+1] if cbg1.__class__.__name__ == 'LowSimilarityRegionCodingBlockGraph': continue if cbg2.__class__.__name__ == 'LowSimilarityRegionCodingBlockGraph': continue prevCBG = None nextCBG = None if pos-2 >= 0: prevCBG = self.codingblockgraphs[pos-2] if pos+1 < len(self): nextCBG = self.codingblockgraphs[pos+1] # check cbgIF between these CBGs if cbg1._CBGinterface3p: if ignore_is_optimal_cbgif and\ cbg1._CBGinterface3p.is_optimal(): continue if ignore_is_compatible_cbgif and\ cbg1._CBGinterface3p.is_compatible(): print "COMPATBLE!!!" continue # get overlap data between these 2 CBGs ( absPosCbg1, absPosCbg2, binPosCbg1, binPosCbg2, orfIdent, posRel, posBin ) = codingblock_ordering.relatively_positioned_towards(cbg1,cbg2) # evaluate the overlap data if binPosCbg1 != (1, 0, 0) or binPosCbg2 != (0, 0, 1): distances = cbg1.distance_between_codingblocks(cbg2) overlaps = [ min([0,value]) for value in distances.values() ] # remove all non-overlaps while 0 in overlaps: overlaps.remove(0) # now sum them and calculate an average summed = abs(sum(overlaps)) if summed: average = float(summed) / len(overlaps) else: average = 0.0 ############################################################ #if verbose: # print cbg1 # print cbg2 # print pos-1, pos+1, # print absPosCbg1, absPosCbg2, # print binPosCbg1, binPosCbg2 # print distances, summed, average, # print cbg1.total_weight(), cbg2.total_weight() ############################################################ # check if the overlap ratio is not to large ratioCbg1 = ( average / cbg1.omsrlength() ) > cbg_max_alowed_overlap_ratio ratioCbg2 = ( average / cbg2.omsrlength() ) > cbg_max_alowed_overlap_ratio remove_pos = None if average > cbg_max_alowed_overlap_aa_length and ratioCbg1 and ratioCbg2: # hmmm both have a high overlap ratio -> remove the lowest scoring if cbg1.total_weight() < cbg2.total_weight(): remove_pos = pos-1 else: remove_pos = pos elif average > cbg_max_alowed_overlap_aa_length and ratioCbg1: remove_pos = pos-1 elif average > cbg_max_alowed_overlap_aa_length and ratioCbg2: remove_pos = pos elif ratioCbg1 and ratioCbg2: # hmmm both have a high overlap ratio -> remove the lowest scoring if cbg1.total_weight() < cbg2.total_weight(): remove_pos = pos-1 else: remove_pos = pos elif ratioCbg1: remove_pos = pos-1 elif ratioCbg2: remove_pos = pos else: # omit removal! pass if remove_pos != None: # final check: if node_count() not identical -> # then remove the one that has lowest number of nodes! if cbg1.node_count() > cbg2.node_count(): remove_pos = pos elif cbg1.node_count() < cbg2.node_count(): remove_pos = pos-1 else: # identical node_count -> stick to the oppointed one! pass # get positional pointer to CBG with which the to-be-deleted # CBG is overlapping with (for verbose logginf) if remove_pos == pos: overlapping_cbg_pos = pos-1 theCBGif = cbg2._CBGinterface5p # final check: does to-be-deleted CBG # fill a gap in the genestructure scaffold? if _is_intermediate_overlapping_cbg_a_gsg_scaffold_enrichment( self,cbg1,cbg2,nextCBG): # this is a perfect example of a scaffold enrichment, # caused by a small exon in >= 1 genes, compared to # continious exons in >= 1 other genes. ####################################### if verbose: print "SCAFFOLD ENRICHMENT" print cbg1 print cbg2 print nextCBG,"NEXT" ####################################### continue else: overlapping_cbg_pos = remove_pos theCBGif = cbg1._CBGinterface3p # final check: does to-be-deleted CBG # fill a gap in the genestructure scaffold? if _is_intermediate_overlapping_cbg_a_gsg_scaffold_enrichment( self,prevCBG,cbg1,cbg2): # this is a perfect example of a scaffold enrichment, # caused by a small exon in >= 1 genes, compared to # continious exons in >= 1 other genes. ####################################### if verbose: print "SCAFFOLD ENRICHMENT" print prevCBG,"PREV" print cbg1 print cbg2 ####################################### continue # remove this codingblock! deletedCBG = self.remove_cbg_by_pos(remove_pos) ############################################################ if verbose: print "REMOVED!", pos, overlapping_cbg_pos, remove_pos print deletedCBG deletedCBG.printmultiplealignment() for (key,n1,n2),pacbp in deletedCBG.pacbps.iteritems(): print pacbp,n1,n2 print "IN VIOLENCE WITH:" print self.codingblockgraphs[overlapping_cbg_pos] self.codingblockgraphs[overlapping_cbg_pos].printmultiplealignment() for (key,n1,n2),pacbp in\ self.codingblockgraphs[overlapping_cbg_pos].pacbps.iteritems(): print pacbp,n1,n2 if theCBGif: print "INTERFACE" print theCBGif print theCBGif._interface_is_intron ############################################################ # set removed variable to True for the next iteration! removed = True removed_cnt += 1 break else: pass ############################################################ #if verbose: print "overlap oke...." ############################################################ # return the number of removed CBGs from the GSG return removed_cnt
def add_codingblock(self,new,log_debug=False,only_try_adding=False, max_cbg_gtg_topo_dif=None, max_cbg_gtg_abs_dif=None, min_cbg_gtg_id_ratio=None, min_tcode_omsr=None, omit_conditional_addition=False ): """ (Try to) add a CodingBlockGraph to the genestructure @type new: CodingBlockGraph @param new: CodingBlockGraph object to be added to the genestructure @type only_try_adding: Boolean @param only_try_adding: only try to add the CBG and return succes status @type omit_conditional_addition: Boolean @param omit_conditional_addition: omit all CBG quality checks; just verify if the CBG is placeable based on the OMSR coordinates @type max_cbg_gtg_topo_dif: float (or None) @param max_cbg_gtg_topo_dif: @type max_cbg_gtg_abs_dif: float (or None) @param max_cbg_gtg_abs_dif: @type max_cbg_gtg_id_ratio: float (or None) @param max_cbg_gtg_id_ratio: @type min_tcode_omsr: float (or None) @param min_tcode_omsr: @rtype: Boolean @return: True or False, weather or not adding was succesfull """ verbose = log_debug # log_debug must be replaced by verbose.... # update edge weights by overall minimal spanning range if not only_try_adding: new.update_edge_weights_by_minimal_spanning_range() # check for difference with GeneTreeGraph if new.__class__.__name__ == 'CodingBlockGraph' and\ not omit_conditional_addition and\ self.genetree() and len(self) >= 1: newgtg = new.genetree() if new.node_count() == self.EXACT_SG_NODE_COUNT: gtg = self.genetree() else: # new to-be-placed graph misses certain organism node(s) completegtg = self.genetree() gtg = deepcopy(completegtg) for missingorg in completegtg.organism_set().difference(new.organism_set()): gtg.del_node(missingorg) # calculate identity, topological and absolute GTG differences cbg_gtg_topo_dif = gtg.graphalignmentdifference( newgtg ) cbg_gtg_abs_dif = gtg.absolutegraphalignmentdifference( newgtg ) cbg_gtg_id_ratio = newgtg.identity() / gtg.identity() #################################################################### if verbose: print "cbg2gsg", new #################################################################### # check the identity ratio if min_cbg_gtg_id_ratio: threshold_min_cbg_gtg_id_ratio = MIN_GTG_ID_RATIO_FUNCTION(min_cbg_gtg_id_ratio,gtg,new) ################################################################ if verbose: print "CUSTOM", threshold_min_cbg_gtg_id_ratio, print min_cbg_gtg_id_ratio ################################################################ else: threshold_min_cbg_gtg_id_ratio = MIN_GTG_ID_RATIO_FUNCTION(self.MIN_CBG_GTG_ID_RATIO,gtg,new) ################################################################ if verbose: print "NORMAL", threshold_min_cbg_gtg_id_ratio, print self.MIN_CBG_GTG_ID_RATIO ################################################################ if cbg_gtg_id_ratio < threshold_min_cbg_gtg_id_ratio: ################################################################ if verbose: print "rejected on ID ratio", threshold_min_cbg_gtg_id_ratio, print ">", cbg_gtg_id_ratio ################################################################ return False else: pass # check the relative topological difference if max_cbg_gtg_topo_dif: threshold_max_cbg_gtg_topo_dif = MAX_GTG_TOPO_DIF_FUNCTION(max_cbg_gtg_topo_dif,gtg,new) ################################################################ if verbose: print "CUSTOM", threshold_max_cbg_gtg_topo_dif, print max_cbg_gtg_topo_dif, gtg.identity(), new.omsrlength() ################################################################ else: threshold_max_cbg_gtg_topo_dif = MAX_GTG_TOPO_DIF_FUNCTION(self.MAX_CBG_GTG_TOPO_DIF,gtg,new) ################################################################ if verbose: print "NORMAL", threshold_max_cbg_gtg_topo_dif, print self.MAX_CBG_GTG_TOPO_DIF, gtg.identity(), new.omsrlength() ################################################################ if cbg_gtg_id_ratio >= 1.10: # ignore TOPO_DIF check when newgtg.id% >> gtg.id% pass elif cbg_gtg_topo_dif > threshold_max_cbg_gtg_topo_dif: ################################################################ if verbose: print "rejected on TOPO_DIF", cbg_gtg_topo_dif, print ">", threshold_max_cbg_gtg_topo_dif ################################################################ return False else: pass # check the absolute topological difference if max_cbg_gtg_abs_dif: threshold_max_cbg_gtg_abs_dif = MAX_GTG_ABS_DIF_FUNCTION(max_cbg_gtg_abs_dif,gtg,new) ################################################################ if verbose: print "CUSTOM", threshold_max_cbg_gtg_abs_dif, print max_cbg_gtg_abs_dif, gtg.identity(), new.omsrlength() ################################################################ else: threshold_max_cbg_gtg_abs_dif = MAX_GTG_ABS_DIF_FUNCTION(self.MAX_CBG_GTG_ABS_DIF,gtg,new) ################################################################ if verbose: print "NORMAL", threshold_max_cbg_gtg_abs_dif, print self.MAX_CBG_GTG_ABS_DIF ################################################################ if cbg_gtg_id_ratio >= 1.10: # ignore TOPO_DIF check when newgtg.id% >> gtg.id% pass elif cbg_gtg_abs_dif > threshold_max_cbg_gtg_abs_dif: ################################################################ if verbose: print "rejected on ABS_DIF", threshold_max_cbg_gtg_abs_dif, print "<", cbg_gtg_abs_dif ################################################################ return False else: pass # check the Tcode score if min_tcode_omsr > new.msr_tcode_score(): ################################################################ if verbose: print "rejected on MIN_TCODE_OMSR", min_tcode_omsr, ">", print new.msr_tcode_score() ################################################################ return False else: pass else: # probably omit_conditional_addition==True pass # check if exactly this one is already in the genestructure if self.is_codingblockgraph_already_in_genestructure(new): #################################################################### if verbose: print "already in genestructure!!" #################################################################### return False # if LowSimilarityRegionCodingBlockGraph, do a OMSR check if new.__class__.__name__ == 'LowSimilarityRegionCodingBlockGraph': # get the OMSR of the complete GeneStructure omsrGSG = self.overall_minimal_spanning_range() for node,omsr in new._omsr.iteritems(): org = new.organism_by_node(node) if omsrGSG[org].intersection(omsr): ############################################################ if verbose: print "ALREADY IN GENESTRUCTURE!", "\n", omsr print omsrGSG[org].intersection(omsr) ############################################################ return False else: ################################################################ if verbose: print "lsrCBG NOT in genestructure's OMSR" ################################################################ # new lsrCBG! just continue with this function pass for pos in range(0,len(self)): cbg = self.codingblockgraphs[pos] # do not compare position towards a LowSimilarityRegionCodingBlockGraph if cbg.__class__.__name__ == 'LowSimilarityRegionCodingBlockGraph': continue # get relative positioning data of 2 CBGs ( absPosCbg, absPosNew, binPosCbg, binPosNew, orfIdent, posRel, posBin) = relatively_positioned_towards(cbg,new) if verbose: print "eval cbg pos", pos, "binarytuples:", binPosCbg, binPosNew # Check the positioning in binaryCbgPositioning # Required positioning of new codingblock `new` is cbg-->new if binPosCbg == (1,0,0) and binPosNew == (0,0,1): # The order is new-->cbg; continue continue elif new.__class__.__name__ == 'LowSimilarityRegionCodingBlockGraph' and new.node_set().difference(cbg.node_set()): # binPos comparison assumes correct position, but it is not because node intersection # not all nodes in lsrCBG 'new' are shared in next CBG 'cbg' -> ignore here! continue elif new.__class__.__name__ == 'LowSimilarityRegionCodingBlockGraph' and binPosCbg == (1,0,0) and binPosNew == (0,0,0): # Final check -> is the lsrCBG directly ajacent to the cbg? # In case a single orf set is split up in several CBGs (and several lsrCBGs), # insertion can give problems when not verifying the distance distances = cbg.distance_between_codingblocks(new) if list(Set(distances.values())) == [0]: # yes, this is the position where we want to insert pass else: # nope, not the correct, diretly adjacent position if verbose: print "# NOT ADDING HERE a lsrCBG????" print cbg print new print distances # continue to the next cbg position continue # if only_try_adding, return a succesfull True! if only_try_adding: return True if verbose: print new, binPosCbg, binPosNew, "inserting on pos %s+1" % pos # Add into the ordered GeneStructure! tobeadded = new tobeadded.create_cache() self.codingblockgraphs.insert( pos+1, tobeadded ) # succesfull insert; return True return True elif binPosCbg == (0,0,1) and binPosNew == (1,0,0): # The order is cbg-->new; this is the required location! if new.__class__.__name__ == 'LowSimilarityRegionCodingBlockGraph': # check if directly neighboring this cbg distances = cbg.distance_between_codingblocks(new) print new print distances # if only_try_adding, return a succesfull True! if only_try_adding: return True # Make deepcopy of this CBG in orde to prevent abnormalities # when splitting/merging in lateron function #tobeadded = deepcopy(new) tobeadded = new tobeadded.create_cache() self.codingblockgraphs.insert( pos, tobeadded ) if verbose: print "ADDING:", pos, new print binPosCbg, binPosNew # succesfull insert; return True return True elif binPosCbg == (0,1,0) and binPosNew == (0,1,0): # Exactly this codingblock exists already in the genestructure return False elif binPosCbg in [(1,0,0),(1,1,0)] and binPosNew in [(0,0,1),(0,1,1)]: # Check for: binPosCbg ~= (1,?,0) and binPosNew ~= (0,?,1) # Some overlap, and the order is new-->cbg # So, potential accepted new CBG 1 position AFTER this one! # Check if the overlap is compatible with Orf order for i in range(0,len(posBin)): (a1,a2,a3), (b1,b2,b3) = posRel[i] (Ba1,Ba2,Ba3), (Bb1,Bb2,Bb3) = posBin[i] identicalorfs = orfIdent[i] if _is_compatible_overlap( (a1,a2,a3), (b1,b2,b3), (Ba1,Ba2,Ba3), (Bb1,Bb2,Bb3), identicalorfs ): pass else: if verbose: print "ABOUT TO BREAK THE FORLOOP I:" print (a1,a2,a3), (b1,b2,b3) print (Ba1,Ba2,Ba3), (Bb1,Bb2,Bb3) cbg_overlap_ratio = float(a2) / float(a1+a2+a3) new_overlap_ratio = float(b2) / float(b1+b2+b3) overlap_ratio = max([cbg_overlap_ratio, new_overlap_ratio]) print overlap_ratio # No! incorrect positioning -> break break else: # EOF forloop nicely reached; compatible new codingblock # The order is cbg-->new; this is the required location! # check how it is positioned towards the other; NO OVERLAP ALOWED HERE! if pos < len(self)-1: next = self.codingblockgraphs[pos+1] ( absPosNext, absPosNewNext, binPosNext, binPosNewNext, orfIdentNext, posRelNext, posBinNext ) = relatively_positioned_towards(new,next) # now check binPosNext & binPosNewNext: should be (1, 0, 0) & (0, 0, 1) # That means -> no overlap with the next CBG if binPosNext == (1, 0, 0) and binPosNewNext == (0, 0, 1): # no overlap -> ready to store! pass else: # there is overlap! Do not allow addition of this CBG ########################################################### if verbose: print "OVERLAP WITH NEXT CBG!:", pos, new print binPosNext, binPosNewNext ########################################################### return False else: # no further CBGs in GSG, so no check possible (and no overlap possible ;-) pass # if only_try_adding, return a succesfull True! if only_try_adding: return True # Make deepcopy of this CBG in orde to prevent abnormalities # when splitting/merging in lateron function #tobeadded = deepcopy(new) tobeadded = new tobeadded.create_cache() self.codingblockgraphs.insert( pos+1, tobeadded ) if verbose: print "ADDING:", pos+1, new print binPosCbg, binPosNew # succesfull insert; return True return True # forloop broken -> incompatible new codingblock return False elif binPosCbg in [(0,0,1),(0,1,1)] and binPosNew in [(1,0,0),(1,1,0)]: # Check for: binPosCbg ~= (0,?,1) and binPosNew ~= (1,?,0) # The order is new-->cbg; this is the required location! # Check if the overlap is compatible with Orf order for i in range(0,len(posBin)): (a1,a2,a3), (b1,b2,b3) = posRel[i] (Ba1,Ba2,Ba3), (Bb1,Bb2,Bb3) = posBin[i] identicalorfs = orfIdent[i] if _is_compatible_overlap( (a1,a2,a3), (b1,b2,b3), (Ba1,Ba2,Ba3), (Bb1,Bb2,Bb3), identicalorfs ): pass else: if verbose: print "ABOUT TO BREAK THE FORLOOP II:" print (a1,a2,a3), (b1,b2,b3) print (Ba1,Ba2,Ba3), (Bb1,Bb2,Bb3) cbg_overlap_ratio = float(a2) / float(a1+a2+a3) new_overlap_ratio = float(b2) / float(b1+b2+b3) overlap_ratio = max([cbg_overlap_ratio, new_overlap_ratio]) print overlap_ratio # No! incorrect positioning -> break break else: # EOF forloop nicely reached; compatible new codingblock # The order is new-->cbg; this is the required location! # if only_try_adding, return a succesfull True! if only_try_adding: return True # Make deepcopy of this CBG in orde to prevent abnormalities # when splitting/merging in lateron function #tobeadded = deepcopy(new) tobeadded = new tobeadded.create_cache() self.codingblockgraphs.insert( pos, tobeadded ) # succesfull insert; return True return True # forloop broken -> incompatible new codingblock return False else: # A more messy positioning (overlaps/repetitive etc.) if verbose: print "WEIRD BINARY SUMMED ORDER!", absPosCbg, absPosNew, binPosCbg, binPosNew print cbg for i in range(0,len(posBin)): print posRel[i], posBin[i], orfIdent[i] # Reject this new codingblockgrap return False else: # If eof for loop is reached, append to the end # of current genestructure # if only_try_adding, return a succesfull True! if only_try_adding: return True # Make deepcopy of this CBG in orde to prevent abnormalities # when splitting/merging in lateron function #tobeadded = deepcopy(new) tobeadded = new tobeadded.create_cache() self.codingblockgraphs.append( tobeadded ) # check if this is the first added cbg to GeneStructure object if len(self.codingblockgraphs) == 1: # cache the genetreegraph object gtg = self.set_genetree() # set threshold values as a function of gtg self.initialize_first_added_cbg() # succesfull insert; return True return True