def _is_intermediate_overlapping_cbg_a_gsg_scaffold_enrichment(gsg,
        cbgA,cbgB,cbgC,minimal_scaffold_aa_enrichment=5):
    """
    Does the intermediate cbgB enrich the scaffold formed by cbgA and cbgC?

    The overall minimal spanning range (OMSR) of the series
    [ cbgA, cbgB, cbgC ] is compared to the OMSR of [ cbgA, cbgC ].
    For each node shared by cbgA and cbgC, the OMSR lengths of the
    organism of that node are subtracted; a single organism that gains
    at least `minimal_scaffold_aa_enrichment` is enough.

    @type  gsg: GSG object (presumably GenestructureOfCodingBlockGraphs)
    @param gsg: provides `input`, `_GENETREE` and `organism_by_node()`

    @type  cbgA: CBG object (or None)
    @param cbgA: CBG in front of the intermediate CBG

    @type  cbgB: CBG object (or None)
    @param cbgB: the intermediate CBG that is evaluated

    @type  cbgC: CBG object (or None)
    @param cbgC: CBG behind the intermediate CBG

    @type  minimal_scaffold_aa_enrichment: integer
    @param minimal_scaffold_aa_enrichment: minimal difference in OMSR
            length (per organism) to call cbgB an enrichment

    @rtype:  Boolean
    @return: True when cbgB enriches the scaffold; False otherwise, and
             also False when an argument is missing/empty or when cbgA
             and cbgC do not share any node
    """
    # a missing argument (most likely cbgA or cbgC) -> no enrichment
    if not gsg or not cbgA or not cbgB or not cbgC:
        return False

    # without shared nodes cbgA and cbgC are no suitable scaffold
    if not cbgA.mutual_nodes(cbgC):
        return False

    # a single temporary GSG is used for both OMSR calculations
    from graph_genestructure import GenestructureOfCodingBlockGraphs
    scaffoldGSG = GenestructureOfCodingBlockGraphs(gsg.input)
    scaffoldGSG.codingblockgraphs = [ cbgA,cbgB,cbgC ]
    scaffoldGSG._GENETREE = gsg._GENETREE
    omsr_with_b = scaffoldGSG.overall_minimal_spanning_range()
    scaffoldGSG.codingblockgraphs = [ cbgA,cbgC ]
    omsr_without_b = scaffoldGSG.overall_minimal_spanning_range()

    # all shared nodes are evaluated; there is no early exit on purpose
    is_enriched = False
    for node in cbgA.mutual_nodes(cbgC):
        org = gsg.organism_by_node(node)
        gain = len(omsr_with_b[org]) - len(omsr_without_b[org])
        if gain >= minimal_scaffold_aa_enrichment:
            is_enriched = True
    return is_enriched
def split_final_cbg_on_spanningrange_difference(self,
        sprdif_min_aa_length=CBG_FINAL_SPRDIF_MIN_AA_LENGTH,
        sprdif_min_node_count=CBG_FINAL_SPRDIF_MIN_NODE_COUNT,
        sprdif_min_gtid_ratio=0.55,
        only_perform_if_stopcodon_tw_ratio_lte=CBG_FINAL_SPRDIF_ONLY_IF_STOP_TW_RATIO_LTE,
        only_preform_if_cbg_id_gte=CBG_FINAL_SPRDIF_ONLY_IF_CBG_ID_GTE
        ):
    """
    Try to add new CBG(s) behind the final CBG of the genestructure by
    splitting that final CBG on its `rigth` spanning range difference

    Outline:
      1. two gate checks on the current final CBG (stop codon graph
         weight ratio and genetree identity)
      2. a deepcopy of the final CBG is iteratively split
      3. each split (except the first one and lsrCBGs) is completed
         with cbghmmsearch2pacbpcollection and converted to accepted
         CBGs
      4. accepted CBGs are ordered and, when addable, added to this GSG
         (with an intermediate lsrCBG when the node sets are identical)

    @type  sprdif_min_aa_length: integer
    @param sprdif_min_aa_length: minimal length of the sprdif in aa's;
            also passed as `pacbp_min_length` to the hmm search step

    @type  sprdif_min_node_count: integer
    @param sprdif_min_node_count: passed on to
            has_rigth_spanningrange_difference() and to
            iteratively_split_codingblock_on_spanningrange_difference()

    @type  sprdif_min_gtid_ratio: float
    @param sprdif_min_gtid_ratio: minimal ratio between the genetree
            identity of a split and that of the current final CBG;
            a split below this ratio is skipped. A False-like value
            disables this check

    @type  only_perform_if_stopcodon_tw_ratio_lte: float
    @param only_perform_if_stopcodon_tw_ratio_lte: run function only when
            (stop codon graph total weight / self.EXACT_SG_EDGE_COUNT)
            of the final CBG <= threshold. A False-like value disables
            this check

    @type  only_preform_if_cbg_id_gte: float
    @param only_preform_if_cbg_id_gte: run function only when
            lastCBG.genetree().identity() >= threshold. A False-like
            value disables this check

    @rtype:  Boolean
    @return: True when at least one new CBG is reported as added,
             False otherwise (including all early exits)
    """
    # get the CBG that is labelled as IS_LAST=True
    current_last = self.get_final_cbg()

    # check if we are allowed to perform this function
    # for groups of genes with very low identity, this function
    # is more likely to decrease the result than to improve the result
    # make AlignedStopCodonGraph
    current_last.align_stop_codons()
    tw_current = current_last._stopcodongraph.total_weight()
    # NOTE(review): plain `/`; under Python 2 this is integer division
    # when both operands are ints, and it raises when
    # EXACT_SG_EDGE_COUNT is 0 -- confirm both can not happen
    ratio = tw_current / self.EXACT_SG_EDGE_COUNT

    # now check if it is allowed to enter the function: only_perform_if_...
    if only_perform_if_stopcodon_tw_ratio_lte and ratio > only_perform_if_stopcodon_tw_ratio_lte:
        return False
    if only_preform_if_cbg_id_gte and current_last.genetree().identity() < only_preform_if_cbg_id_gte:
        return False

    # check for a `rigth` sprdif of the requested size; if not => return False
    if not current_last.has_rigth_spanningrange_difference(
            sprdif_min_aa_length=sprdif_min_aa_length,
            sprdif_min_node_count=sprdif_min_node_count):
        # no `rigth` spanningrange difference -> done & return
        return False

    # make a deepcopy and clear cache of the one that will be processed;
    # `current_last` itself stays part of this GSG and is not split
    last = deepcopy(current_last)
    last.clear_cache()

    # iteratively split
    splits = last.iteratively_split_codingblock_on_spanningrange_difference(
            side='rigth',
            sprdif_min_aa_length=sprdif_min_aa_length,
            sprdif_min_node_count=sprdif_min_node_count,
            )

    # was the split successful?
    if len(splits) == 1:
        # no splits => done here!
        return False

    # when here process the sprdif CBGs
    # 1) cbghmmsearch2pacbpcollection
    # 2) pacbpCollection2acceptedcbgs
    all_accepted_cbgs = []
    # loop over the splits; except for the most left one (the input `last` CBG)
    for splittedCBG in splits[1:]:
        if splittedCBG.__class__.__name__ == 'LowSimilarityRegionCodingBlockGraph':
            continue
        # complete with cbghmmsearch2pacbpcollection
        # get ratio of the GTG of this CBG; `ratio` is reused here and no
        # longer holds the stop codon weight ratio from above
        # NOTE(review): raises when current_last.genetree().identity()
        # is 0 -- confirm this can not happen
        ratio = splittedCBG.genetree().identity() / current_last.genetree().identity()
        # if ratio is bad -> do not perform!
        if sprdif_min_gtid_ratio and ratio < sprdif_min_gtid_ratio:
            continue

        pacbpCollection = cbghmmsearch2pacbpcollection(splittedCBG,self.input,
                prev=last,
                pacbp_min_length=sprdif_min_aa_length,
                hmmsearch_num_hits=3
                )
        # get list of accepted CBGs
        accepted = conversion.pacbpCollection2AcceptedCodingBlockGraphs(pacbpCollection,prev=last)
        all_accepted_cbgs.extend( accepted )

    # if no accepted ones -> return False
    if not all_accepted_cbgs:
        return False

    # order graphs by total weight
    all_accepted_cbgs = ordering.order_graphlist_by_total_weight(all_accepted_cbgs)
    # and re-order on node occurrence: if a neighboring node is incorporated -> more likely!
    all_accepted_cbgs = ordering.reorder_cbgs_on_node_occurrence(all_accepted_cbgs,prev=last)

    # and now try to add the accepted cbgs into the genestructure
    # speedup the process by creating a tinyGSG object of only the last CBG
    # but, set the _GENETREE attribute to the genetree of the main GSG
    from graph_genestructure import GenestructureOfCodingBlockGraphs
    lastGSG = GenestructureOfCodingBlockGraphs(self.input)
    lastGSG.add_codingblock(current_last)
    lastGSG._GENETREE = self._GENETREE
    RETURN_STATUS_CBG_IS_ADDED = False

    for cbgL in all_accepted_cbgs:
        # only Ks CBG graphs are allowed here: the node count must be
        # equal to that of the current final CBG
        if cbgL.node_count() != current_last.node_count():
            continue

        # dry run (only_try_adding=True) on the tiny GSG first
        if lastGSG.add_codingblock(cbgL,only_try_adding=True,
                max_cbg_gtg_topo_dif=self.MAX_CBG_FINAL_SPRDIF_GTG_TOPO_DIF,
                max_cbg_gtg_abs_dif=self.MAX_CBG_FINAL_SPRDIF_GTG_ABS_DIF,
                min_cbg_gtg_id_ratio=self.MIN_CBG_FINAL_SPRDIF_GTG_ID_RATIO,
                ):
            # it is addable; prepare for final addition to the genestructure
            # (the status flags are changed BEFORE the actual addition)
            lsrCBG = None
            cbgL.IS_SPLITTED    = False
            cbgL.IS_5P_SPLITTED = False
            cbgL.IS_FIRST       = False
            cbgL.IS_LAST        = True
            current_last.IS_LAST = False
            # if identical nodes -> create a lsrCBG
            if not cbgL.node_set().difference(current_last.get_nodes()):
                current_last.IS_SPLITTED    = True
                current_last.IS_3P_SPLITTED = True
                cbgL.IS_SPLITTED            = True
                cbgL.IS_5P_SPLITTED         = True
                lsrCBG = graphAbgp.codingblock_splitting.create_intermediate_lowsimilarity_region(
                        current_last, cbgL )
                # an lsrCBG without nodes is discarded
                if not lsrCBG.node_count():
                    lsrCBG = None

            # now add the new last CBG, to the main GSG and to the tiny GSG
            # NOTE(review): the status of the main GSG (self) is
            # overwritten by the status of lastGSG on the next statement,
            # so the return value reflects lastGSG -- confirm intended
            status = self.add_codingblock(cbgL,
                    max_cbg_gtg_topo_dif=self.MAX_CBG_FINAL_SPRDIF_GTG_TOPO_DIF,
                    max_cbg_gtg_abs_dif=self.MAX_CBG_FINAL_SPRDIF_GTG_ABS_DIF,
                    min_cbg_gtg_id_ratio=self.MIN_CBG_FINAL_SPRDIF_GTG_ID_RATIO,
                    )
            status = lastGSG.add_codingblock(cbgL,
                    max_cbg_gtg_topo_dif=self.MAX_CBG_FINAL_SPRDIF_GTG_TOPO_DIF,
                    max_cbg_gtg_abs_dif=self.MAX_CBG_FINAL_SPRDIF_GTG_ABS_DIF,
                    min_cbg_gtg_id_ratio=self.MIN_CBG_FINAL_SPRDIF_GTG_ID_RATIO,
                    )

            # if added, update the return value (RETURN_STATUS_CBG_IS_ADDED)
            if status:
                RETURN_STATUS_CBG_IS_ADDED = True
                # unconditional status output to STDOUT
                print cbgL
                print cbgL.IS_5P_SPLITTED, cbgL.IS_SPLITTED, cbgL.IS_3P_SPLITTED
                # and add the intermediate lsrCBG when available
                if lsrCBG:
                    statusMainGSG = self.add_codingblock(lsrCBG)
                    statusLastGSG = lastGSG.add_codingblock(lsrCBG)
                    print "lsrCBG added:", statusMainGSG, statusLastGSG
        else:
            # not placeable in the genestructure
            pass

    # in exceptional cases, 2 CBGs can be added. In case the node_set() is identical,
    # yet another lsrCBG has to be created in between these 2 new CBGs
    # check this in the main GSG (NOT in the lastGSG; when a lsrCBG is added here,
    # splits are added to the surrounding CBGs. Because call-by-reference, these
    # splits are added to the main GSG (self) too, and adding the same lsrCBG
    # will fail (splitted CBGs are skipped!)
    if RETURN_STATUS_CBG_IS_ADDED:
        self.finalize_genestructure()
        if self.join_false_inframe_introns():
            print "EXTRA lsrCBG added!!"
        # recreate interfaces if there is a new one created
        self.create_cbginterfaces()

    # return the return status True|False
    return RETURN_STATUS_CBG_IS_ADDED
def replace_scaffold_breaking_cbgs(self,verbose=False):
    """
    (Try to) replace CBGs that break the GSG scaffold by other CBGs

    For each CBG (lsrCBGs and CBGs with fewer than EXACT_SG_NODE_COUNT
    nodes are skipped) the CBGs behind it are scanned for the first one
    that shares nodes with it (`mutual_nodes`). When that CBG is not the
    direct neighbour, the CBGs in between break the scaffold:

      - when omsr_distance_between_codingblocks() reports a distance of
        at most 1 for all of its entries, the intermediate CBGs are
        removed without further checks
      - otherwise create_large_intermediate_cbg_for_scaffold_gap() is
        run on a temporary GSG of only the 2 flanking CBGs. The CBGs it
        creates replace the intermediate ones when their summed total
        weight is higher

    @type  verbose: Boolean
    @param verbose: print status/debugging messages to STDOUT

    @rtype:  Boolean
    @return: Is any CBG replaced (or removed)?
    """
    # Boolean return value
    scaffold_breaking_cbg_replaced = False

    # Walk from back to front: a replacement only touches positions
    # behind `cbgpos`, so the positions still to be visited stay valid.
    for cbgpos in self.cbgpositerator(reversed=True)[1:]:
        cbg = self.codingblockgraphs[cbgpos]
        if cbg.__class__.__name__ == 'LowSimilarityRegionCodingBlockGraph':
            continue
        if cbg.node_count() < self.EXACT_SG_NODE_COUNT:
            continue

        # look behind this CBG for the first CBG with mutual nodes
        matchpos = None
        comparecbg = None
        for nextpos in range(cbgpos+1,len(self)):
            candidate = self.codingblockgraphs[nextpos]
            if cbg.mutual_nodes(candidate):
                matchpos, comparecbg = nextpos, candidate
                break

        if matchpos is None:
            # final CBG, or no mutual nodes at all -> continue
            continue
        if matchpos == cbgpos+1:
            # neighboring CBG has mutual nodes -> continue
            continue

        # this is what we are looking for: >= 1 intermediate CBG without
        # mutual nodes; get total_weights of these intermediate CBGs
        tws = [ self.codingblockgraphs[_pos].total_weight() for\
                _pos in range(cbgpos+1,matchpos) ]

        ################################################################
        if verbose:
            # [ False, ... True ]; one False per intermediate CBG
            identical_nodes = [False] * (matchpos-cbgpos-1) + [True]
            print("%s %s %s" % (cbgpos, matchpos, identical_nodes))
            print(cbg)
            print(tws)
            print(comparecbg)
        ################################################################

        # first, check if the CBGs are already partially overlapping
        # if so, get rid of the intermediate CBG(s)
        omsrdist = cbg.omsr_distance_between_codingblocks(comparecbg)
        if max(omsrdist.values()) <= 1:
            # yes, all organisms glue these CBGs perfectly together
            # just remove the intermediates without further checks
            cbg._CBGinterface3p = None
            comparecbg._CBGinterface5p = None
            # slice assignment instead of the deprecated list.__setslice__
            self.codingblockgraphs[cbgpos+1:matchpos+1] = [ comparecbg ]
            scaffold_breaking_cbg_replaced = True
            # go to the next cbg in the list
            continue

        # do a more elaborate check by trying to create a CBG
        # in this large_scaffold_gap
        from graph_genestructure import GenestructureOfCodingBlockGraphs
        partialGSG = GenestructureOfCodingBlockGraphs(self.input)
        partialGSG.codingblockgraphs = [ cbg, comparecbg ]
        partialGSG._GENETREE = self._GENETREE
        partialGSG.create_large_intermediate_cbg_for_scaffold_gap(
                sprdif_min_node_count = 2,
                cbg_min_node_count = self.EXACT_SG_NODE_COUNT,
                verbose = verbose )

        if len(partialGSG) == 2:
            ############################################################
            if verbose: print("NO scaffold CBGs found!")
            ############################################################
            continue

        # new CBG(s) are created; only use them when they outweigh
        # the scaffold-breaking CBG(s) that are currently in place
        new_tws = [ _cbg.total_weight() for _cbg in\
                partialGSG.codingblockgraphs[1:-1] ]
        do_replace = sum(new_tws) > sum(tws)
        if do_replace:
            # replace!
            self.codingblockgraphs[cbgpos+1:matchpos] =\
                    partialGSG.codingblockgraphs[1:-1]
            scaffold_breaking_cbg_replaced = True

        ################################################################
        if verbose:
            # reported here only; `new_tws` does not exist when no
            # scaffold CBGs were found
            if do_replace:
                print("REPLACING scaffold-breaking CBGs!!!")
            else:
                print("MAINTAINING scaffold-breaking CBGs!!!")
            for partcbg in partialGSG:
                print(partcbg)
        ################################################################

    # return if CBGs are removed
    return scaffold_breaking_cbg_replaced
def separate_ds_and_us_gsg(self):
    """
    Move all CBGs that have status IS_IGNORED out of this GSG into
    separate GSGs

    Three GSGs are filled:
      - dsGSG:  the IS_IGNORED CBGs in front of the first CBG that is
                not IS_IGNORED
      - usGSG:  the IS_IGNORED CBGs behind the last CBG that is not
                IS_IGNORED
      - etcGSG: all remaining IS_IGNORED CBGs; when every CBG is
                IS_IGNORED, all of them end up here
    CBGs placed in dsGSG and usGSG get IS_IGNORED reset to False; CBGs
    placed in etcGSG keep their IS_IGNORED status.

    @attention: LowSimilarityRegionCodingBlockGraph that are not required anymore are deleted!
    @attention: CodingBlockGraphInterface objects around removed CBGs are set to None!

    @rtype:  tuple
    @return: ( dsGSG, usGSG, etcGSG ); 3 GenestructureOfCodingBlockGraphs
             objects, each of which can be empty
    """
    # create empty GSGs to place IS_IGNORED CBGs in
    from graph_genestructure import GenestructureOfCodingBlockGraphs
    dsGSG  = GenestructureOfCodingBlockGraphs(self.input)
    usGSG  = GenestructureOfCodingBlockGraphs(self.input)
    etcGSG = GenestructureOfCodingBlockGraphs(self.input)

    # if no CBGs in GSG -> return all empty ones
    if len(self) == 0:
        return dsGSG, usGSG, etcGSG

    def _is_lsrcbg(cbg):
        # lsrCBGs are recognized by class name, as elsewhere in this file
        return cbg.__class__.__name__ == 'LowSimilarityRegionCodingBlockGraph'

    def _strip_exterior_lsrcbgs(gsg):
        # an lsrCBG at the exterior of a GSG is nonsense -> remove it.
        # The list is re-checked for emptiness before each lookup, because
        # it can become empty after the first removal.
        if gsg.codingblockgraphs and _is_lsrcbg(gsg.codingblockgraphs[0]):
            gsg.codingblockgraphs.pop(0)
        if gsg.codingblockgraphs and _is_lsrcbg(gsg.codingblockgraphs[-1]):
            gsg.codingblockgraphs.pop()

    # check if there is any LowSimilarityRegionCodingBlockGraph directly
    # next to a CBG that has status IS_IGNORED. If so, set the lsrCBG
    # status to IS_IGNORED too!
    for pos in range(0,len(self)):
        if not self.codingblockgraphs[pos].IS_IGNORED:
            continue
        if pos > 0 and _is_lsrcbg(self.codingblockgraphs[pos-1]):
            self.codingblockgraphs[pos-1].IS_IGNORED = True
        if pos < len(self)-1 and _is_lsrcbg(self.codingblockgraphs[pos+1]):
            self.codingblockgraphs[pos+1].IS_IGNORED = True

    # Separate a potential dsGSG from this main GSG: all IS_IGNORED CBGs
    # in front of the first CBG that is not IS_IGNORED
    for pos in range(0,len(self)):
        if self.codingblockgraphs[pos].IS_IGNORED:
            continue
        # first CBG that is not IS_IGNORED; when this is the very first
        # CBG (pos == 0) there is nothing to move
        for _delpos in range(0,pos):
            dsGSG.codingblockgraphs.append( self.codingblockgraphs.pop(0) )
        for cbg in dsGSG.codingblockgraphs:
            cbg.IS_IGNORED = False
        break

    # if dsGSG is not empty, fix cbgIFs in the GSG itself and check for
    # now non-sense lsrCBGs in the dsGSG
    if len(dsGSG) > 0:
        # wipe out the cbgIF object on the 5p side of the first CBG
        self.codingblockgraphs[0]._CBGinterface5p = None
        self.codingblockgraphs[0]._forced_5p_ends = {}
        _strip_exterior_lsrcbgs(dsGSG)

    # Separate a potential usGSG from this main GSG: all IS_IGNORED CBGs
    # behind the last CBG that is not IS_IGNORED
    for pos in range(len(self)-1,-1,-1):
        if self.codingblockgraphs[pos].IS_IGNORED:
            continue
        # last CBG that is not IS_IGNORED; when this is the very last
        # CBG there is nothing to move
        for _delpos in range(pos+1,len(self)):
            usGSG.codingblockgraphs.insert(0, self.codingblockgraphs.pop() )
        for cbg in usGSG.codingblockgraphs:
            cbg.IS_IGNORED = False
        break

    # if usGSG is not empty, fix cbgIFs in the GSG itself and check for
    # now non-sense lsrCBGs in the usGSG
    if len(usGSG) > 0:
        # wipe out the cbgIF object on the 3p side of the last CBG
        self.codingblockgraphs[len(self)-1]._CBGinterface3p = None
        self.codingblockgraphs[len(self)-1]._forced_3p_ends = {}
        _strip_exterior_lsrcbgs(usGSG)

    # check for intermediate IS_IGNORED CBGs in the main GSG and place in etcGSG
    for pos in range(len(self)-1,-1,-1):
        if not self.codingblockgraphs[pos].IS_IGNORED:
            continue
        removed_cbg = self.codingblockgraphs.pop(pos)
        if not _is_lsrcbg(removed_cbg):
            # only place non-lsrCBGs in the etcGSG
            # intermediate lsrCBGs that are IS_IGNORED are now nonsense
            etcGSG.codingblockgraphs.insert(0, removed_cbg )
        # wipe out potential cbgIF objects surrounding the removed CBG
        if pos > 0:
            self.codingblockgraphs[pos-1]._CBGinterface3p = None
            self.codingblockgraphs[pos-1]._forced_3p_ends = {}
        # after the pop(), position `pos` only exists when the removed
        # CBG was not the last one
        if pos < len(self):
            self.codingblockgraphs[pos]._CBGinterface5p = None
            self.codingblockgraphs[pos]._forced_5p_ends = {}

    # and return dsGSG, usGSG, etcGSG
    return dsGSG, usGSG, etcGSG