def collection2alignedsites(self, edges=None, minimal_edges=2):
    """
    Convert the conserved sites of this collection into aligned
    translational start site graphs

    The result is stored in self.alignedsites, ordered by
    sort_by_cumulative_score(); nothing is returned.

    First the conserved sites at the requested number of `edges` are
    converted. Then, for each lower edge count down to (and including)
    `minimal_edges`, the last element of self.alignedsites is inspected:
    when it still is a TranslationalStartSiteCollectionGraph (i.e. a
    not yet aligned remainder), it is taken out of the list and its
    conserved sites at that lower edge count are converted and added.

    @type  edges: number (or None)
    @param edges: number of edges to start aligning with; the value is
                  first passed through self._handle_edges_argument()

    @type  minimal_edges: number
    @param minimal_edges: lowest number of edges that is still tried
    """
    def _align_conserved_sites(graph, edge_count):
        """ Convert all conserved sites of `graph` at `edge_count` edges """
        return [
            conversion.TranslationalStartSiteCollectionGraph2AlignedTranslationalStartSiteGraph(
                algsite, max_node_count=self.organism_set_size())
            for algsite in graph.find_conserved_sites(edges=edge_count)
        ]

    # handle edges argument (error check etc.)
    edges = self._handle_edges_argument(edges)

    # do the first basal alignment and order the sites on cumulative score
    # (a single sort suffices; sorting the sorted list again is a no-op)
    self.alignedsites = sort_by_cumulative_score(
        _align_conserved_sites(self, edges))

    # now keep on aligning the remaining fraction of non-aligned sites
    for current_edges in range(edges - 1, minimal_edges - 1, -1):
        if not self.alignedsites:
            continue
        # NOTE(review): this relies on the non-aligned remainder being the
        # last element after sorting -- confirm in sort_by_cumulative_score
        last_class = self.alignedsites[-1].__class__.__name__
        if last_class != 'TranslationalStartSiteCollectionGraph':
            continue
        # redo this non-aligned part with a lower number of edges
        gra = self.alignedsites.pop()
        self.alignedsites.extend(_align_conserved_sites(gra, current_edges))
        self.alignedsites = sort_by_cumulative_score(self.alignedsites)
def recombine_into_completegraphs(self, edges=None, verbose=False):
    """
    Create all possible ExonCollectionGraphs by organism node recombination

    One node per organism is picked in every possible combination; each
    combination is turned into a subgraph holding the edges that exist
    between those nodes in this graph. Nodes without any edge are removed
    from the subgraph, and subgraphs with fewer than `edges`+1 nodes or
    with fewer edges than a fully connected graph of `edges`+1 nodes are
    discarded.

    @type  edges: number
    @param edges: number of outgoing edges of a node in a FCG; when not
                  applied (None or 0), organism_set_size() - 1 is used

    @type  verbose: Boolean
    @param verbose: print the number of combinations, missing edges and
                    relevant combinations to STDOUT

    @rtype:  list
    @return: list with ExonCollectionGraph of the requested properties,
             ordered by sort_by_cumulative_score()
    """
    from codingblock_splitting import cross

    # number of edges of a node in a graph in which all organisms are present
    full_edges = self.organism_set_size() - 1

    # if edges is not applied get by definition from the OrganismGraph
    if not edges:
        edges = full_edges

    # currently, this function has only a hard-set max_missing_edges == 0
    max_missing_edges = 0
    # minimal edge count: fully connected graph of edges+1 nodes
    min_edge_count = sum(range(0, edges + 1)) - max_missing_edges

    # make a cross of all the node positions in the lists of alternatives
    allcombis = cross(
        [self.get_organism_nodes(org) for org in self.organism_set()])
    if verbose:
        print("combinations: %d" % len(allcombis))

    # gather a list of missing edges (between nodes of different organisms)
    missing_edges = [
        (node1, node2)
        for (node1, node2) in self.pairwisecrosscombinations_node()
        if self._organism_from_node(node1) != self._organism_from_node(node2)
        and not self.has_edge(node1, node2)
    ]
    if verbose:
        print("missing edges: %d" % len(missing_edges))

    # combinations that contain both nodes of a missing edge can never
    # become a complete graph; these are not relevant because
    # max_missing_edges == 0. Only applicable when all organisms must
    # be represented.
    if edges == full_edges:
        def _has_missing_edge(combi):
            """ Does `combi` contain both nodes of any missing edge? """
            members = set(combi)
            return any(node1 in members and node2 in members
                       for (node1, node2) in missing_edges)

        allcombis = [combi for combi in allcombis
                     if not _has_missing_edge(combi)]
    if verbose:
        print("relevant: %d" % len(allcombis))

    retlist = []
    for combi in allcombis:
        sg = ExonCollectionGraph()
        for node in combi:
            # add node & its exon object from the main graph to the subgraph
            sg.add_node_and_object(node, self.get_node_object(node))

        # create the edges in the subgraph
        for (node1, node2) in sg.pairwisecrosscombinations_node():
            if self.has_edge(node1, node2):
                sg.add_edge(node1, node2,
                            wt=self.get_edge_weight(node1, node2))

        # remove nodes that have zero edges
        sg.remove_low_connectivity_nodes(min_connectivity=1)

        # do not check nr. of nodes on organism_set_size,
        # but on variable `edges`!
        if sg.node_count() < edges + 1:
            continue
        if sg.edge_count() < min_edge_count:
            continue

        # if here -> accepted!
        if edges == full_edges:
            retlist.append(sg)
            continue

        # recombination with allowing missing organisms/nodes; that means
        # that there are duplicates in the subgraphs that come to this
        # point. A subgraph is regarded as already present when an earlier
        # accepted subgraph has no node that is absent from this one
        # (so an identical node set, or a subset of it).
        sg_nodes = sg.node_set()
        already_present = any(
            not alt.node_set().difference(sg_nodes) for alt in retlist)
        if not already_present:
            retlist.append(sg)

    # update the attributes dicts
    for sg in retlist:
        sg._update_after_changes()

    # and return a ordered/prioritized list
    return sort_by_cumulative_score(retlist)
def collection2alignedsites(self, edges=None, minimal_edges=2):
    """
    Convert the conserved sites of this collection into aligned splice
    site graphs

    The result is stored in self.alignedsites, ordered by
    sort_by_cumulative_score(); nothing is returned. Steps:
      1. convert the conserved sites at `edges` edges
      2. for each lower edge count down to `minimal_edges`, re-align the
         trailing element when it still is a (non-aligned) collection graph
      3. drop phase-shift graphs in which not all organisms are present
      4. merge incomplete aligned sites that share nodes and a phase

    @type  edges: number (or None)
    @param edges: number of edges to start aligning with; the value is
                  first passed through self._handle_edges_argument()

    @type  minimal_edges: number
    @param minimal_edges: lowest number of edges that is still tried
    """
    collection_classes = (
        'DonorSiteCollectionGraph',
        'AcceptorSiteCollectionGraph',
        'SpliceSiteCollectionGraph',
    )
    phaseshift_classes = (
        'AlignedDonorSiteWithPhaseShiftGraph',
        'AlignedAcceptorSiteWithPhaseShiftGraph',
        'AlignedSpliceSiteWithPhaseShiftGraph',
    )
    aligned_classes = (
        'AlignedDonorSiteGraph',
        'AlignedAcceptorSiteGraph',
        'AlignedSpliceSiteGraph',
    )

    def class_name(graph):
        """ Name of the class of `graph` """
        return graph.__class__.__name__

    def align_conserved_sites(graph, edge_count):
        """ Convert all conserved sites of `graph` at `edge_count` edges """
        return [
            conversion.SpliceSiteCollectionGraph2AlignedSpliceSiteGraph(
                conserved, max_node_count=self.organism_set_size())
            for conserved in graph.find_conserved_sites(edges=edge_count)
        ]

    def is_incomplete_aligned_site(graph):
        """ Plain aligned site in which not all organisms are present? """
        if graph.organism_set_size() == self.organism_set_size():
            return False
        return class_name(graph) in aligned_classes

    def absorb_partner(site, site_pos):
        """
        Merge the first suitable site located after `site_pos` into
        `site` and remove it from self.alignedsites; True when merged
        """
        for partner_pos in range(site_pos + 1, len(self.alignedsites)):
            partner = self.alignedsites[partner_pos]
            if not is_incomplete_aligned_site(partner):
                continue
            if site.phase() != partner.phase():
                continue
            if not graphPlus.comparison.mutual_nodes(site, partner):
                continue
            # the nodes only present in `partner` must belong to
            # organisms that `site` does not contain yet
            extra_nodes = partner.node_set().difference(site.get_nodes())
            extra_orgs = [partner._organism_from_node(node)
                          for node in extra_nodes]
            if site.organism_set().intersection(extra_orgs):
                continue
            # `partner` completes `site`: copy all its data into `site`
            site.nodes.update(partner.nodes)
            site.weights.update(partner.weights)
            site._node_pssm.update(partner._node_pssm)
            site._node_object.update(partner._node_object)
            site._edge_binary_entropies.update(
                partner._edge_binary_entropies)
            self.alignedsites.pop(partner_pos)
            return True
        return False

    # handle edges argument (error check etc.)
    edges = self._handle_edges_argument(edges)

    # step 1: the basal alignment
    self.alignedsites = sort_by_cumulative_score(
        align_conserved_sites(self, edges))

    # step 2: keep on aligning the remaining fraction of non-aligned sites
    for current_edges in range(edges - 1, minimal_edges - 1, -1):
        if not self.alignedsites:
            continue
        if class_name(self.alignedsites[-1]) not in collection_classes:
            continue
        remainder = self.alignedsites.pop()
        self.alignedsites.extend(
            align_conserved_sites(remainder, current_edges))
        self.alignedsites = sort_by_cumulative_score(self.alignedsites)

    # step 3: remove all phase-shift graphs that do not have all organisms
    # represented (in place, the list object itself is kept)
    self.alignedsites[:] = [
        graph for graph in self.alignedsites
        if not (class_name(graph) in phaseshift_classes and
                graph.organism_set_size() != self.organism_set_size())
    ]

    if not self.alignedsites:
        return

    # step 4: merge aligned sites that are separated due to possibly
    # erroneous alignments around (aligned) inframe introns.
    # Due to ALIGNED_DONOR_MAX_TRIPLET_DISTANCE, situations like the
    # following can occur. Suppose organisms A-E, E having an inframe intron
    # that is split in some Pacbps and aligned in others. Due to differences
    # in the location where BLAST places the gaps, an offset can arise,
    # resulting in 2 instead of 1 aligned site:
    # A(x)-B(x)-C(x)-D(x) and A(x)-B(x)-C(x)-E(x),
    # where A(x)-B(x)-C(x) are the same sites!
    pos = 0
    while pos < len(self.alignedsites):
        site = self.alignedsites[pos]
        if is_incomplete_aligned_site(site) and absorb_partner(site, pos):
            # the list got shorter and `site` changed: retry this position
            continue
        pos += 1

    # sites might have been merged -> resort the sites
    self.alignedsites = sort_by_cumulative_score(self.alignedsites)