def create_cbginterfaces(self, ignore_optimal=True, ignore_compatible=True, allow_phase_shift=False, allow_non_canonical=False, optimizetinyexoninterface=False, verbose=False): """ (Re)create CBGInterface objects in between CBGs in this GSG @type ignore_optimal: Boolean @param ignore_optimal: Once a CBGInterface is optimal, do not recreate it @type ignore_compatible: Boolean @param ignore_compatible: Once a CBGInterface is compatible, do not recreate it @type allow_phase_shift: Boolean @param allow_phase_shift: (re)create CBGInterfaces allowing a phase shift of splice sites @type allow_non_canonical: Boolean @param allow_non_canonical: (re)create CBGInterfaces allowing non-canonical (donor) splice sites @type optimizetinyexoninterface: Boolean @param optimizetinyexoninterface: do a quick optimization of ths cbgIF (non-canonical, short suitable splice site range etc @type verbose: Boolean @param verbose: print status messages to STDOUT @rtype: Integer @return: number of CBGInterfaceobjects that is (re)created """ RECREATED_CNT = 0 for pos in range(1, len(self)): cbgD, cbgA = self[pos - 1], self[pos] CREATE_INTERFACE = False has_interface_donor = self.has_donor_cbginterface(cbgD) has_interface_acceptor = self.has_acceptor_cbginterface(cbgA) if has_interface_donor and has_interface_acceptor: # interface objects already exist; only (re)create when not ignore_optimal pass #if self.cbginterface_is_optimal_donor(cbgD) and self.cbginterface_is_optimal_acceptor(cbgA): # if not ignore_optimal: CREATE_INTERFACE = True #elif self.cbginterface_is_compatible_donor(cbgD) and self.cbginterface_is_compatible_acceptor(cbgA): # if not ignore_compatible: CREATE_INTERFACE = True #else: # CREATE_INTERFACE = True elif has_interface_donor: CREATE_INTERFACE = True elif has_interface_acceptor: CREATE_INTERFACE = True else: CREATE_INTERFACE = True if CREATE_INTERFACE: cbgIF = CodingBlockGraphInterface(cbgD, cbgA) cbgIF.harvest_splice_sites( allow_phase_shift=allow_phase_shift, 
allow_non_canonical=allow_non_canonical) cbgIF.find_conserved_splice_sites() if optimizetinyexoninterface: cbgIF.optimizetinyexoninterface() # and set the interface objects to the CBGs in GSG cbgD._CBGinterface3p = cbgIF cbgA._CBGinterface5p = cbgIF RECREATED_CNT += 1 if verbose: print cbgIF else: if verbose: print cbgD._CBGinterface3p, "EXISTING" # set current first & last CBG as IS_FIRST and IS_LAST if len(self): self.codingblockgraphs[0].IS_FIRST = True self.codingblockgraphs[-1].IS_LAST = True # return counter for how much CBGInterfaces are recreated return RECREATED_CNT
def check_lsrcbgs_for_inframe_introns(self,verbose=False): """ Check the lsrCBGs in the GSG and see if these regions can better be explained by an inframe intron """ INFRAME_INTRONS_PREDICTED = 0 LSR_RECREATED = 0 for cbgpos in range(len(self)-1,-1,-1): cbg = self.codingblockgraphs[cbgpos] if cbg.__class__.__name__ != 'LowSimilarityRegionCodingBlockGraph': continue # do the inframe intron analyses on a lsrCBG inframeintrons = cbg.potentially_contains_inframe_intron(verbose=verbose) # aparantly it seems possible to create one or more introns in the lsrCBG if inframeintrons: # get the bordering CBGs prev = self.codingblockgraphs[cbgpos-1] next = self.codingblockgraphs[cbgpos+1] # make CBGInterface between prev and next; # reset the _IS_SPLITTED tags! prev._splicedonorgraph = None prev._CBGinterface3p = None prev._forced_3p_ends = {} prev.IS_3P_SPLITTED = False prev.IS_SPLITTED = prev.IS_5P_SPLITTED next._spliceacceptorgraph = None next._CBGinterface5p = None next._forced_5p_ends = {} next.IS_5P_SPLITTED = False next.IS_SPLITTED = next.IS_3P_SPLITTED # create an actual CBGInterface of both CBGs around the lsrCBG cbgIF = CodingBlockGraphInterface(prev,next) if verbose: print cbgIF # re-harvest splice sites; store ALL the intron-projected sites cbgIF.harvest_splice_sites(allow_phase_shift=False,store_all_projected_sites=True) if verbose: print cbgIF # now remove all non-projected splice-sites in organisms that # are not reported to have a potential inframe intron cbgIF.allow_intron_in_organisms(inframeintrons) cbgIF.find_conserved_splice_sites() if verbose: print cbgIF print "compatible:", cbgIF.is_compatible(), "optimal:", cbgIF.is_optimal() print cbgIF._optimal_aligned_donor print cbgIF._optimal_aligned_acceptor # yes, this is what we expect; a compatible CBGInterface! # this very likely represents an inframe intron! 
if cbgIF.is_compatible(): # remove the lsrCBG from the GSG lsrCBG = self.codingblockgraphs.pop(cbgpos) # set the CBGInterface object in next and prev CBG prev._CBGinterface3p = cbgIF next._CBGinterface5p = cbgIF # increase the counter of number of inframe introns predicted INFRAME_INTRONS_PREDICTED+=1 ############################################################ if verbose: print "INFRAME INTRON PREDICTED!!" ############################################################ else: # nope, this does not seem like a proper inframe intron # reset the CBGs and the lsrCBG objects as they were! # If this point is reached, `first` and `second` are CBGs with exactly the same nodes # create intermediate lsrCBG prev.IS_SPLITTED = True prev.IS_3P_SPLITTED = True next.IS_SPLITTED = True next.IS_5P_SPLITTED = True lsrCBG = create_intermediate_lowsimilarity_region(prev,next) self.codingblockgraphs[cbgpos] = lsrCBG # recreate the CBGInterfaces (I) cbgIFa = CodingBlockGraphInterface(prev,lsrCBG) cbgIFa.harvest_splice_sites() cbgIFa.find_conserved_splice_sites() # set the interface object to the CBGs in GSG prev._CBGinterface3p = cbgIFa lsrCBG._CBGinterface5p = cbgIFa # recreate the CBGInterfaces (II) cbgIFb = CodingBlockGraphInterface(lsrCBG,next) cbgIFb.harvest_splice_sites() cbgIFb.find_conserved_splice_sites() # set the interface object to the CBGs in GSG lsrCBG._CBGinterface3p = cbgIFb next._CBGinterface5p = cbgIFb ############################################################ if verbose: print "NO COMPATIBLE SITE!" ############################################################ ###for org in inframeintrons: ### print org, "NO COMPATIBLE SITES FOUND!" ### print prev ### print cbgIF ### print next ### theorf = next.get_orfs_of_graph(organism = org )[0] ### print theorf ### theorf.printproteinanddna() ### for donor in theorf._donor_sites: print donor ### for acceptor in theorf._acceptor_sites: print acceptor # return number of found inframe introns return INFRAME_INTRONS_PREDICTED
def cbg_cexpander_inframe_intron_search(self, min_total_pssm_score = MIN_TOTAL_PSSM_INFRAME_INTRON, min_intron_nt_length = MIN_INTRON_NT_LENGTH, verbose=False): """ @type self: CodingBlockGraph @param self: CodingBlockGraph instance @type min_total_pssm_score: float @param min_total_pssm_score: MIN_TOTAL_PSSM_INFRAME_INTRON @type min_intron_nt_length: integer @param min_intron_nt_length: MIN_INTRON_NT_LENGTH @type verbose: Boolean @param verbose: print status/debugging messages to STDOUT @rtype: list or False @return: list with new (sub)CBGs or False when not splitted """ ######################################################################## if verbose: stw = StopWatch(name="cexpCbgIfIntron") stw.start() ######################################################################## # return variable; list of splitted CBGs. return_cbg_list = [ self ] # create cexpander multiplealignment blocks cbgMA = lib_cexpander.cexpander2multiplealignment(self._cexpander, verbose=verbose) # In freak-accident cases (one in thousends of times), cexpander produces # unequal amount of 1's in the binarystrings. This is theoretically impossible. # Problem is worked on; in the meanwhile, cexpander2multiplealignment returns # False in these cases. 
Catch this here by quiting current # cbg_cexpander_inframe_intron_search() function call and return False TODO=True if not cbgMA: return False ######################################################################## if verbose: print stw.lap() blockscnt = len( cbgMA[ cbgMA.keys()[0] ] ) print self print "BLOCKS:", blockscnt, self._cexpander.binarystring, print self._cexpander.projected_on for org in cbgMA.keys(): print org, "\t", for blockid in range(0,blockscnt): if cbgMA[org][blockid].count("1") >= 1: print len(cbgMA[org][blockid]), else: print cbgMA[org][blockid], print "" ######################################################################## # loop over the aligned cexpander blocks and check the # non-uniformly aligned blocks for length variation blockscnt = len( cbgMA[ cbgMA.keys()[0] ] ) oricbgomsr = self.overall_minimal_spanning_range() for blockid in range(0,blockscnt): # obtain non-uniformly aligned AA lengths for this block lengths = {} for org in cbgMA.keys(): lengths[org] = cbgMA[org][blockid].count("0") # skip the uniformly aligned blocks if list(Set(lengths.values())) == [0]: continue #################################################################### if verbose: print stw.lap(), "lengths:", lengths #################################################################### # obtain coordinates for this area lsrcoords = {} for org in cbgMA.keys(): node = self.node_by_organism(org) coordSta = min(oricbgomsr[node]) # make summation of length of preceeding (non)aligned blocks for i in range(0,blockid): coordSta += cbgMA[org][i].count("1") +\ cbgMA[org][i].count("0") # end coord is start coord + length of current block coordEnd = coordSta + lengths[org] lsrcoords[org] = ( coordSta, coordEnd ) #################################################################### if verbose: print stw.lap(), "lsrcoords:", lsrcoords #################################################################### # translate AA lengths to NT lengths for k in lengths.keys(): lengths[k] = 
lengths[k]*3 # check lenght discrepancy and assign putative inframe introns putative_inframe_intron_orgs =\ _length_discrepancy_to_potential_inframe_introns(lengths) if not putative_inframe_intron_orgs: # no length discrepancy that can represent an inframe intron continue # organisms/genes for which an inframe intron can be an improvement # data dictionary. Keys: 'max_nt_length', 'min_nt_length', # 'min_donor_pos', 'max_acceptor_pos', 'min_total_pssm' inframe_intron_criteria = {} # find putative inframe introns in assigned genes/organisms putative_inframe_introns = {} for org in putative_inframe_intron_orgs: # assign inframe intron criteria for this organism inframe_intron_criteria[org] = { 'min_nt_length' : min_intron_nt_length, 'min_total_pssm' : min_total_pssm_score, 'min_donor_pos' : (min(lsrcoords[org]) - 5) * 3, 'max_acceptor_pos' : (max(lsrcoords[org]) + 5) * 3, } # search for potential introns that can be responsible for this event theorf = self.get_orfs_of_graph(organism=org)[0] introns = pacb.connecting.merge_orfs_with_intron( theorf,theorf, min_intron_nt_length=min_intron_nt_length ) ################################################################ if verbose: print "introns:", org, len(introns), "raw" ################################################################ # filter introns for all outside the OMSR, to short, to long, # total pssm_score etc introns = _filter_putative_inframe_intron_list( introns,org,inframe_intron_criteria) putative_inframe_introns[org] = introns ################################################################ if verbose: print "introns:", org, len(introns), "filtered" ################################################################ # check if all putative_inframe_intron_orgs have indeed introns # and check if all have at least a single intron phase in common if 0 in [ len(ill) for ill in putative_inframe_introns.values() ]: # no introns in one or more organisms/genes -> continue continue if len( putative_inframe_introns )> 1: # 
do phase check in all organisms/genes phases = Set([0,1,2]) for org, intronlist in putative_inframe_introns.iteritems(): thisphases = Set([ intron.phase for intron in intronlist ]) phases.intersection_update(thisphases) if len(phases) == 0: ################################################################ if verbose: print "no mutual phase -> no cbgIF.is_optimal()" ################################################################ # no mutual phase -> no cbgIF.is_optimal() possible lateron continue else: pass # if an intron in at least a single organism is still there, # then split the involved pacbps in the `original` cbgL, the last # added CBG element in the return_cbg_list, and make a (virtual) # deepcopy of a novel cbgL. Both CBGs have actually the SAME pacbps! cbgR = self.deepcopy() cbgL = self.deepcopy() # loop over the organisms/genes with inframe introns split # the Pacbps of these orgs in both to-become L and R CBGs inframe_intron_orgs = putative_inframe_introns.keys() for org in inframe_intron_orgs: ################################################################ if verbose: print "splitting PACBPs for org:", org print "L", cbgL print "R", cbgL ################################################################ node = self.node_by_organism(org) replacementsL = {} replacementsR = {} for (key,node1,node2), pacbporf in cbgL.pacbps.iteritems(): if node in [node1,node2]: # get the pacbp of this pacbporf and split it! pacbp = pacb.conversion.pacbporf2pacbp(pacbporf) org1 = self.organism_by_node(node1) org2 = self.organism_by_node(node2) if org1 in putative_inframe_introns.keys() and\ org2 in putative_inframe_introns.keys() and\ inframe_intron_orgs.index(org) > 0: # already splitted; both orgs are inframe introns! 
continue # make split coordinates relative splitL = lsrcoords[org1][0] - pacbp.query_start splitR = lsrcoords[org1][1] - pacbp.query_start pacbpL = pacb.splitting.split_pacb_on_coordinates( pacbp,(splitL,splitL),returnside='left') pacbpR = pacb.splitting.split_pacb_on_coordinates( pacbp,(splitR,splitR),returnside='rigth') # check if both cbgL and cbgR make sence # if not -> return False! if not pacbpL: return False if not pacbpR: return False ######################################################## if verbose: print "#", node1, node2, lsrcoords[org1], print "L:", splitL, "R:", splitR print pacbp print pacbpL print pacbpR ######################################################## # pacbpL -> extented pacbporfL -> store to replacementsL newpacbporfL = pacb.conversion.pacbp2pacbporf(pacbpL, pacbporf.orfQ,pacbporf.orfS) newpacbporfL.extend_pacbporf_after_stops() replacementsL[(key,node1,node2)] = newpacbporfL # pacbpR -> extented pacbporfR -> store to replacementsR newpacbporfR = pacb.conversion.pacbp2pacbporf(pacbpR, pacbporf.orfQ,pacbporf.orfS) newpacbporfR.extend_pacbporf_after_stops() replacementsR[(key,node1,node2)] = newpacbporfR # do the pacbporf replacements in both CBGs statusL = _update_cbg_with_pacbporf_replacements( cbgL,replacementsL) statusR = _update_cbg_with_pacbporf_replacements( cbgR,replacementsR) # check if both cbgL and cbgR make sence if not statusL or not statusR: # return unchanged cbg status -> False return False # Verify the interface between cbgL and cbgR. # Most likely, the sites are nicely alignable. 
cbgIF = CodingBlockGraphInterface(cbgL,cbgR) cbgIF.force_intron_in_organisms( putative_inframe_introns.keys() ) cbgIF.allow_intron_in_organisms( putative_inframe_introns.keys() ) cbgIF.harvest_splice_sites() cbgIF.find_conserved_splice_sites() #################################################################### if verbose: print cbgL print cbgIF print cbgR cbgIF.interfaceproperties() #################################################################### # check the properties of the CBGinterface if cbgIF.optimalitycheck().count(True) >= 2: # yes; is_compatible and donor and/or acceptor is optimal cbgL._CBGinterface3p = cbgIF cbgR._CBGinterface5p = cbgIF cbgL.copy_5pcbginterface_from_othercbg(self) cbgR.copy_3pcbginterface_from_othercbg(self) return_cbg_list = [ cbgL, cbgR ] ################################################################ if verbose: print "INFRAME INTRON CONFIRMED!!" ################################################################ else: # no compatible interface... although intron(s) was/were found! # (at least) two options are now open: # 1. enforce the intron(s) and create cbgIF with _forced_ends # 2. ignore the intron(s) and create an intermediate lsrCBG # 1. is `tricky`. First, how sure is this inframe intron, # what type of criteria do we assume etc etc. # second, how to create a coorect cbgIF? It must be an # IS_SPLITTED interface, of which the boundaries might fall # outside the OMSR's of the CBGs. # 2. 
ignore the intron(s) and create an intermediate lsrCBG lsrCBG = create_intermediate_lowsimilarity_region(cbgL,cbgR) prepare_lsrcbg_and_cbg_for_gsg_insertion(cbgL,lsrCBG) prepare_lsrcbg_and_cbg_for_gsg_insertion(lsrCBG,cbgR) cbgL.copy_5pcbginterface_from_othercbg(self) cbgR.copy_3pcbginterface_from_othercbg(self) return_cbg_list = [ cbgL, lsrCBG, cbgR ] ################################################################ if verbose: print "no INFRAME INTRON -> lsrCBG" print cbgL print " ", lsrCBG._CBGinterface5p print " ", lsrCBG print " ", lsrCBG._CBGinterface3p print cbgR self.printmultiplealignment() print cbgL cbgL.printmultiplealignment() print cbgR cbgR.printmultiplealignment() ################################################################ # EOF this function. # return False if this CBG remained intact, list of splits when splitted if len(return_cbg_list) == 1: return False else: return return_cbg_list
def construct_final_tiny_cbg(self,
        max_exon_nt_length=SHORT_TAILINGEXON_MAX_NT_LENGTH,
        max_intron_nt_length=SHORT_TAILINGEXON_MAX_INTRON_NT_LENGTH,
        take_max_best_acceptors=SHORT_TAILINGEXON_TAKE_MAX_BEST_ACCEPTORS,
        take_max_best_ecgs=SHORT_TAILINGEXON_TAKE_MAX_BEST_ECGS,
        take_max_best_cbgs=SHORT_TAILINGEXON_TAKE_MAX_BEST_CBGS,
        maximal_current_stopcodongraph_average_weight=0.90,
        minimal_last_vs_new_identity_ratio=0.80,
        maximal_cexpander_cbg_tail_uniformity_aa_length=3,
        elegiable_donor_omsr_nt_offset=21,
        verbose=False):
    """
    Make a tiny final CBG by ``shooting tiny exons into the deep``

    Outline: (1) bail out when the current last CBG looks fine already,
    (2) collect candidate final exons per organism in an
    ExonCollectionGraph (ECG), (3) convert fully connected ECG subgraphs
    to candidate CBGs, (4) keep those with a compatible CBGInterface to
    the current last CBG, (5) accept the first candidate passing the
    criteria and insert it behind the current IS_LAST CBG.

    @type  max_exon_nt_length: integer
    @param max_exon_nt_length: passed to find_tailing_exon_on_orf() as
            max_tailingexon_nt_length; also the threshold for the
            remaining maxsr / orf length checks on the current Orf

    @type  max_intron_nt_length: integer
    @param max_intron_nt_length: width (nt) of the window behind the offset
            in which new Orfs may start; also passed to
            find_tailing_exon_on_orf() as max_tailingexon_intron_nt_length

    @type  take_max_best_acceptors: integer
    @param take_max_best_acceptors: number of exon objects (highest
            pssm_score first) that is kept per organism in the ECG

    @type  take_max_best_ecgs: integer
    @param take_max_best_ecgs: number of fully connected subgraphs that is
            converted into CBGs

    @type  take_max_best_cbgs: integer
    @param take_max_best_cbgs: number of candidate CBGs for which a
            CBGInterface with the current last CBG is checked

    @type  maximal_current_stopcodongraph_average_weight: float
    @param maximal_current_stopcodongraph_average_weight: when the
            stopcodongraph average weight of the current last CBG is
            equal or higher, no tiny exon is searched for

    @type  minimal_last_vs_new_identity_ratio: float
    @param minimal_last_vs_new_identity_ratio: minimal new/last genetree
            identity ratio tolerated when the new identity is not higher

    @type  maximal_cexpander_cbg_tail_uniformity_aa_length: integer
    @param maximal_cexpander_cbg_tail_uniformity_aa_length: length of the
            tail of each cexpander binarystring inspected for a "0"

    @type  elegiable_donor_omsr_nt_offset: integer
    @param elegiable_donor_omsr_nt_offset: number of nt subtracted from
            the OMSR end (in nt) to obtain the search offset

    @type  verbose: Boolean
    @param verbose: print status/debugging messages to STDOUT

    @rtype:  Boolean
    @return: True when a novel final CBG was selected, False otherwise
    """
    # get current last CBG
    last = self.get_final_cbg()

    # check if final tail of this CBG is uniformly alignable: no "0" in
    # the inspected tail of any of the transfer block binarystrings
    cxpdrOutput = cexpanderanalyses_omsr2orfend(last)
    IS_UNIFORMLY_ALIGNED = True
    for trf in cxpdrOutput._transferblocks:
        if trf.binarystring[-maximal_cexpander_cbg_tail_uniformity_aa_length:].count("0"):
            IS_UNIFORMLY_ALIGNED = False
            break

    ############################################################
    if verbose:
        print "Cexpander uniformaly aligned:",
        print maximal_cexpander_cbg_tail_uniformity_aa_length,
        print "->", IS_UNIFORMLY_ALIGNED
        print "omsr: ", last._cexpander.projected_on,
        print last._cexpander.binarystring
        trf = cxpdrOutput.get_transfer_of_projected_on(
                last._cexpander.projected_on)
        if trf and trf != True:
            print "omsr2orfend:", last._cexpander.projected_on,
            print trf.binarystring
    ############################################################

    if IS_UNIFORMLY_ALIGNED:
        # break out of this function. Chance of overpredicting
        # a final tiny exon is bigger than finding a True one!
        return False

    # check if the stopcodongraph is not (very) good already
    if last._stopcodongraph.average_weight() >=\
    maximal_current_stopcodongraph_average_weight:
        # break out of this function. Chance of overpredicting
        # a final tiny exon is bigger than finding a True existing one
        return False

    # start the timer (performance benchmark in verbose mode)
    stw = StopWatch(name='stwFinalECG')
    stw.start()

    # get FinalExons on eligible Orfs based on distance towards OMSR of
    # current last CBG and minimal acceptor site score
    omsr = last.overall_minimal_spanning_range()
    maxsr = last.maximal_spanning_range()
    ECG = ExonCollectionGraph()

    ################################################################
    if verbose:
        print "currentLAST", last
        print last._stopcodongraph
        print last._stopcodongraph.is_optimal()
        for org in last.organism_set():
            print org, last._stopcodongraph.is_optimal(organism=org)
        for organism in last.organism_set():
            node = last.node_by_organism(organism)
            theorf = last.get_orfs_of_graph(organism=organism)[0]
            print organism, "\t", node, "\t", max(omsr[node]), "\t",
            print max(maxsr[node]), theorf.endPY/3
    ################################################################

    for organism in last.organism_set():
        node = last.node_by_organism(organism)
        # calculate an offset for the acceptor position.
        # Variable elegiable_donor_omsr_nt_offset is needed to
        # enlarge the OMSR defined offset. When the OMSR is by chance
        # a few nt or aa larger than the actual exon length, the true
        # acceptor position can be erroneously abandoned.
        offset = max(omsr[node]) * 3 - elegiable_donor_omsr_nt_offset
        theorf = last.get_orfs_of_graph(organism=organism)[0]

        # check if this final orf itself can serve as a final extension
        remaining_orf_nt_length = (theorf.protein_endPY - max(omsr[node])) * 3
        remaining_maxsr_nt_length = (max(maxsr[node]) - max(omsr[node])) * 3
        remaining_maxsr_tostop_nt_length = (theorf.protein_endPY - max(maxsr[node])) * 3

        # NOTE(review): FIND_NEW_FINAL_ORFS is never set to False anymore
        # (both assignments below are commented out), so the
        # `if not FIND_NEW_FINAL_ORFS` check further down never triggers.
        FIND_NEW_FINAL_ORFS = True
        STORE_CURRENT_ORF_AS_FIOO = False
        if remaining_maxsr_nt_length >= max_exon_nt_length:
            # exceptionally large maxsr on right side of omsr;
            # store as FIOO (the orf extension search is NOT skipped)
            ### FIND_NEW_FINAL_ORFS = False
            # discarded 17/09/2009; when poor maxsr present, overruled!
            STORE_CURRENT_ORF_AS_FIOO = True
        elif remaining_maxsr_tostop_nt_length <= 18:
            # maxsr is at most 6 AA apart from stop on current orf
            #FIND_NEW_FINAL_ORFS = False
            STORE_CURRENT_ORF_AS_FIOO = True
        elif remaining_orf_nt_length < max_exon_nt_length:
            # final piece of unaligned sequence is a perfect HMM seed
            STORE_CURRENT_ORF_AS_FIOO = True
        else:
            pass

        if STORE_CURRENT_ORF_AS_FIOO:
            cbs = CodingBlockStart( theorf.aapos2dnapos( max(omsr[node]) ) )
            # set pssm_score to (very) high; this rewards
            # using the current Orf as the last Orf
            cbs.pssm_score = 20.0
            fioo = FinalExonOnOrf(cbs,theorf.endPY,theorf)
            # from here on `node` is the ECG node tuple, not the CBG node
            node = (organism,theorf.id,fioo.start,fioo.end)
            ECG.add_node_and_object(node,fioo)
            ################################################################
            if verbose:
                print organism,theorf.id,"self==potential last exon", remaining_orf_nt_length
                print organism, theorf.id, fioo, fioo.start,fioo.end, theorf.endPY
            ################################################################

        if not FIND_NEW_FINAL_ORFS:
            # quit here -> no orf extension of this CBG
            continue

        # get eligible (new) final orfs
        orflist = self.input[organism]['orfs'].get_elegiable_orfs(
                max_orf_start=offset+max_intron_nt_length,
                min_orf_end=offset )

        ################################################################
        if verbose: print organism, [ orf.id for orf in orflist ], "offset:", offset, offset/3
        ################################################################

        for orf in orflist:
            results = find_tailing_exon_on_orf(
                    theorf,orf,
                    current_donor_pos=offset,
                    max_tailingexon_nt_length=max_exon_nt_length,
                    max_tailingexon_intron_nt_length=max_intron_nt_length,
                    )
            # each result is an (exon,intron) pair; only the exon is stored
            for exon,intron in results:
                node = (organism,orf.id,exon.start,exon.end)
                if node not in ECG.get_nodes():
                    ECG.add_node_and_object(node,exon)
                    if verbose: print organism, node, exon

    if verbose: print stw.lap(), "Exon objects gathered", ECG.node_count()

    # now take only the best `take_max_best_acceptors`
    # because there can be quite some of them!
    for organism in ECG.organism_set():
        objects = ordering.order_list_by_attribute(
                ECG.get_organism_objects(organism),
                order_by='pssm_score', reversed=True )
        for obj in objects[take_max_best_acceptors:]:
            node = (organism,obj.orf.id,obj.start,obj.end)
            ECG.del_node(node)
            if verbose: print "deleted:", node, obj.orf.id, obj.pssm_score

    ########################################################################
    if verbose:
        print stw.lap(), ">take_max_best_acceptors DELETED"
        for organism in ECG.organism_set():
            for obj in ordering.order_list_by_attribute(
                    ECG.get_organism_objects(organism),
                    order_by='pssm_score', reversed=True ):
                print "remaining", organism, obj.orf.id, obj.length, obj
    ########################################################################

    # only continue if all organisms are represented in the ECG
    if last.organism_set_size() > ECG.organism_set_size():
        if verbose: print "To few organisms/genes present -> return False"
        return False

    # create edges in the ECG between compatible phases and
    # exon length, then make pacbps for these edges
    ECG.create_edges()
    ECG.make_pacbps_for_edges()
    if verbose: print stw.lap(), "edges + PACBPS created:", ECG.edge_count(), ECG.node_count(), len(ECG.pacbps)

    # search for complete graphs in this
    last_exon_graphs = ECG.find_fully_connected_subgraphs()

    ########################################################################
    if verbose:
        print stw.lap(), "duration of ECG.find_fully_connected_subgraphs()",
        print len(last_exon_graphs)
    ########################################################################

    # only continue if there is a perfectly aligned last exon graph;
    # only the first graph in the list is inspected for this
    if not (last_exon_graphs and last_exon_graphs[0].connectivitysaturation() == 1.0):
        ####################################################################
        if verbose: print "no perfect aligned last exon graph -> return False"
        ####################################################################
        return False

    # convert to CodingBlockGraphs
    new_last_cbgs = []
    for leg in last_exon_graphs[0:take_max_best_ecgs]:
        cbg = ExonCollectionGraph2CodingBlockGraph(leg,is_last=True,lastCBG=last)
        if cbg != False and cbg != None and cbg.organism_set_size() == last.organism_set_size():
            # create cache of CBG and do final check on quality
            cbg.create_cache()
            if (cbg.total_weight() < 0 or cbg.omsrlength() <= 10) and\
            cbg._cexpander.binarystring.find("1") == -1:
                # discard hardly alignable CBGs
                continue
            # if here, then append this cbg as a possible novel final CBG
            new_last_cbgs.append( cbg )
            ################################################################
            if verbose: print "LEGcbg", cbg
            ################################################################

    ########################################################################
    if verbose: print stw.lap(), "ECGs converted to CBGs", len(new_last_cbgs)
    ########################################################################

    if not new_last_cbgs:
        ####################################################################
        if verbose: print "no ecgs convertable to CBGs -> return False"
        ####################################################################
        return False

    # order by total weight, get the optimal CBG and its corresponding ECG
    new_last_cbgs = ordering.order_graphlist_by_total_weight(new_last_cbgs)
    theNewLastCbg = None
    cbgIF = None

    # check all interfaces between the novel final CBGs and the previous
    # CBG. The best interface is added to the GSG!
    cbgif_accepted_new_last_cbgs = []
    already_checked_node_sets = []
    for newcbg in new_last_cbgs[0:take_max_best_cbgs]:
        # detach the ECG from the CBG; it is only needed in this iteration
        lastExonGraph = newcbg._ExonCollectionGraph
        del( newcbg._ExonCollectionGraph )

        # check if it is not the extension of the current
        # last CBG (identical nodes)
        if len(last.node_set().symmetric_difference(newcbg.node_set())) == 0:
            if verbose: print "newCBG is the extention of current last CBG!!"
            continue

        # check if this combination of nodes (orfs) has not been tried already
        if newcbg.get_ordered_nodes() in already_checked_node_sets:
            ###############################################################
            if verbose:
                print "newCBG node set done earlier:",
                print newcbg.get_ordered_nodes()
            ###############################################################
            continue
        else:
            # append this set of nodes (as a list) to checklist
            already_checked_node_sets.append( newcbg.get_ordered_nodes() )

        # check if this new final tinyexon graph has a compatible interface
        # with the current last one
        cbgIF = CodingBlockGraphInterface(last,newcbg)
        cbgIF.harvest_splice_sites()
        # organisms whose exon starts with a SpliceAcceptor object
        # are the ones for which an intron is allowed in the interface
        distinct_orgs = []
        for node in lastExonGraph.get_nodes():
            exon = lastExonGraph.get_node_object(node)
            if exon.acceptor.__class__.__name__ == 'SpliceAcceptor':
                distinct_orgs.append( lastExonGraph.organism_by_node(node) )
        cbgIF.allow_intron_in_organisms(distinct_orgs)
        cbgIF.find_conserved_splice_sites()
        # do NOT optimize -> consumes a lot of time and is helpful
        # only in extreme cases...
        #cbgIF.optimize()

        if not cbgIF.is_compatible():
            ################################################################
            if verbose:
                print "newCBG not a is_compatible() cbgIF"
                print newcbg
            ################################################################
            continue

        # append to cbgif_accepted_new_last_cbgs; the tuple layout
        # ( optimality count, total weight, CBG ) defines the sort order
        newcbg._CBGinterface5p = cbgIF
        cbgif_accepted_new_last_cbgs.append(
                ( cbgIF.optimalitycheck().count(True), newcbg.total_weight(), newcbg ) )

    ########################################################################
    if verbose: print stw.lap(), "cbgIFs checked %s/%s" % (
            len(cbgif_accepted_new_last_cbgs),
            len(new_last_cbgs[0:take_max_best_cbgs]) )
    ########################################################################

    # now start by adding the highest scoring newcbg first
    cbgif_accepted_new_last_cbgs.sort()
    cbgif_accepted_new_last_cbgs.reverse()

    ########################################################################
    if verbose:
        print "candidate novel final CBGs:", len(cbgif_accepted_new_last_cbgs)
        for (true_cnt,totalwt,newcbg) in cbgif_accepted_new_last_cbgs:
            print true_cnt,totalwt,newcbg._CBGinterface5p
            print newcbg
    ########################################################################

    for (true_cnt,totalwt,newcbg) in cbgif_accepted_new_last_cbgs:
        # get the already created cbgIF from the newcbg graph
        cbgIF = newcbg._CBGinterface5p

        # now check 4 criteria (in this list order):
        # [0] cbgIF.is_optimal()     [1] >STG.totalweight
        # [2] >GTG.identity          [3] <=STG.stopcodon2omsrdistance
        criteria = []
        criteria.append( cbgIF.is_optimal() )
        criteria.append( newcbg._stopcodongraph.total_weight() > last._stopcodongraph.total_weight() )
        criteria.append( newcbg.genetree().identity() > last.genetree().identity() )
        criteria.append( newcbg._stopcodongraph.stopcodon2omsrdistance() <= last._stopcodongraph.stopcodon2omsrdistance() )

        ####################################################################
        if verbose:
            print "TRYING ADDITION of final newcbg", criteria
            print true_cnt,totalwt,newcbg._CBGinterface5p
            print newcbg
        ####################################################################

        # check if there is only a single different node/orf changed in the newcbg
        # this is recognized by a symmetric_difference of size 2
        # in this case, be very strict! This easily causes overprediction (FP) tiny exons
        if len(last.node_set().symmetric_difference(newcbg.node_set())) == 2:
            # check if 4 criteria are valid;
            # a single False results in not accepting this new last tiny cbg
            if False in criteria:
                if verbose: print "# NOVEL lastTinyExon discarded; single orf extension, criteria", criteria
                # continue -> no new tiny CBG
                continue

        # now start check the criteria.
        # if criteria[0] == True, means a fully is_optimal interface!
        # do not perform any additional check, just add!
        if criteria[0] == True:
            theNewLastCbg = newcbg
            break

        # total weight criterion -> new.tw() > last.tw()
        if criteria[1] == False:
            ##########################################################################
            if verbose:
                print "# NOVEL lastTinyExon discarded; to low total weight"
                print "#", newcbg._stopcodongraph
            ##########################################################################
            # continue -> no new tiny CBG
            continue

        # identity criterion -> allow a ratio i.s.o. new.id() > last.id()
        # this strict criterion (>) is applied for single-new-orf-CBGs
        if criteria[2] == False:
            ratio = newcbg.genetree().identity() / last.genetree().identity()
            if ratio < minimal_last_vs_new_identity_ratio:
                ######################################################################
                if verbose:
                    print "# NOVEL lastTinyExon discarded; to low identity"
                    print "#", newcbg._stopcodongraph, newcbg.genetree().identity()
                ######################################################################
                # continue -> no new tiny CBG
                continue

        # stop codon distance criterion -> new must not be further away
        if criteria[3] == False:
            ##########################################################################
            if verbose:
                print "# NOVEL lastTinyExon discarded; higher stopcodon2omsrdistance"
                print "#", newcbg._stopcodongraph
            ##########################################################################
            # continue -> no new tiny CBG
            continue

        # if this point is reached, a new tiny last CBG has been found!
        theNewLastCbg = newcbg
        # break out of the for loop; store into the genestructure
        break

    # all okay -> ready for inserting the new CBG
    if theNewLastCbg and verbose:
        ################################################################################
        print "NEW FINAL TINY EXON FOUND!!"
        print theNewLastCbg
        print cbgIF, cbgIF.is_optimal(), cbgIF.is_acceptable()
        print cbgIF._optimal_aligned_donor, cbgIF.donor_phase()
        print cbgIF._optimal_aligned_acceptor, cbgIF.acceptor_phase()
        ################################################################################

    # hard-insert into the genestructure
    # using add_codingblock is likely to cause problems
    # because of the tinyness of the CBG
    if theNewLastCbg:
        for pos in range(0,len(self)):
            if self.codingblockgraphs[pos].IS_IGNORED: continue
            if self.codingblockgraphs[pos].IS_LAST:
                thelast = self.codingblockgraphs[pos]
                thelast.IS_LAST = False
                # NOTE(review): `newcbg` is the loop variable of the
                # acceptance loop above. theNewLastCbg is only assigned
                # directly before a `break`, so both names refer to the
                # same object here; cbgIF likewise belongs to it.
                newcbg.IS_LAST = True
                self.codingblockgraphs.insert(pos+1,theNewLastCbg)
                # set the CBGInterface object in next and prev CBG
                self.codingblockgraphs[pos]._CBGinterface3p = cbgIF
                self.codingblockgraphs[pos+1]._CBGinterface5p = cbgIF
                # break out; end of this function
                break
        # done! return a True because newcbg is created.
        # NOTE(review): True is returned as well when no non-ignored
        # IS_LAST CBG was encountered, i.e. when nothing was inserted.
        return True
    else:
        # no newLastCbg found
        return False