def create_genetree_from_crossdata(crossdata):
    """
    (Try to) create the GeneTreeGraph from the crossdata dictionary

    Every gene that occurs in a key of crossdata becomes a node. For each
    (geneA,geneB) pair the entry stored under the highest key of its
    'accepted_pacbs' dict is taken as the best pacbp, and its identityscore
    is used as the weight of the edge between geneA and geneB.

    @type  crossdata: dict
    @param crossdata: crossdata <dict data structure>, keyed by (geneA,geneB)

    @rtype:  GeneTreeGraph
    @return: estimated! GeneTreeGraph; an EMPTY GeneTreeGraph is returned as
             soon as a single gene pair has no accepted pacbps at all

    @attention: NOTE(review): the highest key is presumably the pacbp with
                the highest bitscore -- confirm against the key layout of
                'accepted_pacbs'
    """
    # fill GeneTreeGraph with nodes
    GTG = GeneTreeGraph()
    for (geneA, geneB) in crossdata.keys():
        if geneA not in GTG.get_nodes():
            GTG.add_node(geneA)
        if geneB not in GTG.get_nodes():
            GTG.add_node(geneB)

    # fill GeneTreeGraph with edges
    for (geneA, geneB) in crossdata.keys():
        accepted_pacbs = crossdata[(geneA, geneB)]['accepted_pacbs']
        keys = accepted_pacbs.keys()
        if not keys:
            # no keys at all, meaning no pacbps between these 2 genes, so
            # the GTG can not be created yet; return an empty graph
            return GeneTreeGraph()
        # only the highest key is needed; max() avoids sorting the complete
        # key list in place (which fails on a Python 3 keys view)
        bestpacbp = accepted_pacbs[max(keys)]
        # store this edge to the GeneTreeGraph
        GTG.add_edge(geneA, geneB, bestpacbp.identityscore)

    # return the GeneTreeGraph
    return GTG
def detect_and_remove_synteny(inwpcbgs, PCG, GENE_IDENTIFIER_SET, verbose=True):
    """
    Detect syntenic inwpCBGs and move their pacbps from PCG into a new graph

    Syntenic inwpCBGs are searched for (with assign_syntenic_inwpcbgs) in:
      1. the given inwpcbgs themselves
      2. inwpCBGs rebuilt (with PCG2inwpCBGS) from sub-selections of PCG;
         each sub-selection is limited to the organisms that remain in a
         GeneTreeGraph of an input inwpCBG after its weakly connected
         nodes are removed

    NOTE(review): a function with this same name is defined a second time
    further down in this file; that later definition is the one that stays
    bound to the name after import.

    @type  inwpcbgs: list
    @param inwpcbgs: inwpCBG objects (presumably
                     InwardsPointingCodingBlockGraph instances -- confirm)

    @type  PCG: presumably PacbpCollectionGraph -- confirm
    @param PCG: graph with a .pacbps dict and a ._blastmatrix attribute;
                every pacbp key assigned as syntenic is handed to
                _delete_pacbp(PCG,key)

    @type  GENE_IDENTIFIER_SET: container that supports the `in` operator
    @param GENE_IDENTIFIER_SET: organism identifiers that are allowed as
                                (subject) nodes in the GeneTreeGraph

    @type  verbose: Boolean
    @param verbose: print the detected syntenic inwpCBGs to STDOUT

    @rtype:  PacbpCollectionGraph or False
    @return: PacbpCollectionGraph filled with the syntenic pacbps, or
             False when not a single syntenic inwpCBG was detected
    """
    # a node is deleted from the GeneTreeGraph while the lowest
    # observed-vs-expected weighted connectivity is below this ratio
    MIN_OBSERVED_VS_EXPECTED_RATIO = 0.20
    # organism combinations (ordered node lists) that were already analysed
    observed_organism_subcombis = []
    # all syntenic inwpCBGs found, in the main as well as in the sub inwpCBGs
    syntenic_subinwpcbgs = []
    # detect syntenic genes in MAIN inwpCBGs,
    # without taking strongest informants by GTG analyses
    syntenic_inwpcbgs = assign_syntenic_inwpcbgs(inwpcbgs)
    for syntinwpcbg in syntenic_inwpcbgs:
        syntenic_subinwpcbgs.append(syntinwpcbg)
    for inwpCBG in inwpcbgs:
        # omit inwpCBGs with annotated exons/orfs
        if inwpCBG.count_orfs_labeled_as_annotated_exon() >= 2:
            continue
        target = inwpCBG._get_target_organism()
        # make a (artificially fully connected) GeneTreeGraph
        gtg = GeneTreeGraph()
        gtg.add_node(target)
        # first pass: add the subject organisms as nodes; only orgS is
        # checked against GENE_IDENTIFIER_SET (the query organism is
        # presumably always the target -- confirm)
        for (pacbpkey, nodeQ, nodeS), pacbporf in inwpCBG.pacbps.iteritems():
            orgS = inwpCBG.organism_by_node(nodeS)
            if orgS not in GENE_IDENTIFIER_SET:
                continue
            gtg.add_node(orgS)
        # second pass: add the edges, weighted by the pacbporf bitscore
        for (pacbpkey, nodeQ, nodeS), pacbporf in inwpCBG.pacbps.iteritems():
            orgQ = inwpCBG.organism_by_node(nodeQ)
            orgS = inwpCBG.organism_by_node(nodeS)
            if orgS not in GENE_IDENTIFIER_SET:
                continue
            gtg.add_edge(orgQ, orgS, wt=pacbporf.bitscore)
            # make artificially missed edges between the informants
            for org in inwpCBG.organism_set():
                if org not in [orgQ, orgS] and org in GENE_IDENTIFIER_SET:
                    if gtg.has_edge( orgS, org ) and\
                            gtg.weights[(orgS, org)] > pacbporf.bitscore:
                        # existing edge has a higher weight: lower it to
                        # the bitscore of the current pacbporf
                        gtg.set_edge_weight(orgS, org, wt=pacbporf.bitscore)
                    else:
                        # NOTE(review): this branch is also taken when the
                        # edge exists with a weight <= bitscore; what
                        # add_edge does with an existing edge is not
                        # visible here -- confirm
                        gtg.add_edge(orgS, org, wt=pacbporf.bitscore)
        # omit (nearly) empty genetreegraphs
        if gtg.node_count() <= 1:
            continue
        # remove (much) weaker connected nodes as expected from the gtg
        while gtg.get_nodes() and MIN_OBSERVED_VS_EXPECTED_RATIO >\
                min( [ gtg.get_node_weighted_connectivity_observed_vs_expected(node) for node in gtg.get_nodes() ]):
            node = gtg.weakest_connected_node()
            gtg.del_node(node)
        # check if already tested before; present in observed_organism_subcombis
        if gtg.get_ordered_nodes() in observed_organism_subcombis:
            continue
        # store to already tested organism subcombinations
        observed_organism_subcombis.append(gtg.get_ordered_nodes())
        # create a subPCG of these organisms
        subPCG = PacbpCollectionGraph(crossdata={}, blastmatrix=PCG._blastmatrix)
        for (pacbpkey, nodeQ, nodeS), pacbporf in PCG.pacbps.iteritems():
            # nodes are (organism,orfid) tuples
            (orgQ, orfQid), (orgS, orfSid) = nodeQ, nodeS
            # keep only pacbps of which BOTH organisms are still in the gtg
            if orgQ not in gtg.get_nodes():
                continue
            if orgS not in gtg.get_nodes():
                continue
            subPCG.add_node(nodeQ)
            subPCG.add_node(nodeS)
            subPCG.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
            subPCG.pacbps[(pacbpkey, nodeQ, nodeS)] = pacbporf
        # make inwpCBGs of this subPCG
        subinwpcbgs = PCG2inwpCBGS(subPCG)
        # check if there are subinwpcbgs
        if not subinwpcbgs:
            continue
        ########################################################################
        #if verbose:
        #    print "subPCG organism set:", gtg.get_ordered_nodes()
        #    print_inwpcbgstructure(subinwpcbgs,gtg.get_ordered_nodes())
        ########################################################################
        # (disabled code, kept for reference)
        # create a subInwardsPointingCodingBlockGraph of these organisms
        #subinwpCBG = InwardsPointingCodingBlockGraph()
        #for (pacbpkey,nodeQ,nodeS), pacbporf in inwpCBG.pacbps.iteritems():
        #    (orgQ,orfQid),(orgS,orfSid) = nodeQ,nodeS
        #    if orgQ not in gtg.get_nodes(): continue
        #    if orgS not in gtg.get_nodes(): continue
        #    subinwpCBG.add_node(nodeQ)
        #    subinwpCBG.add_node(nodeS)
        #    subinwpCBG.add_edge(nodeQ,nodeS,wt=pacbporf.bitscore)
        #    subinwpCBG.pacbps[(pacbpkey,nodeQ,nodeS)] = pacbporf
        # detect syntenic genes in this subinwpcbgs
        syntenic_inwpcbgs = assign_syntenic_inwpcbgs(subinwpcbgs)
        for syntinwpcbg in syntenic_inwpcbgs:
            syntenic_subinwpcbgs.append(syntinwpcbg)
            ####################################################################
            if verbose:
                print "SYNTENIC!!", syntinwpcbg, syntinwpcbg.get_ordered_nodes()
                for subCBG in subinwpcbgs:
                    print "syntenic in:", subCBG, subCBG.get_ordered_nodes()
            ####################################################################
    # nothing syntenic detected at all: False is returned, not an empty graph
    if not syntenic_subinwpcbgs:
        return False
    # cleanup all inwpCBGs from the syntenic subInwpCBGs
    syntenic_pacbpkeys = []
    for syntinwpcbg in syntenic_subinwpcbgs:
        node_set = syntinwpcbg.node_set()
        for inwpCBG in inwpcbgs:
            # empty difference: every node of the syntenic inwpCBG is
            # present in this main inwpCBG
            if not node_set.difference(inwpCBG.node_set()):
                # here `pacbpkey` is the complete (pacbpkey,nodeQ,nodeS)
                # key of the pacbps dict; it is unpacked as such below
                for pacbpkey in inwpCBG.pacbps.keys():
                    if pacbpkey not in syntenic_pacbpkeys:
                        syntenic_pacbpkeys.append(pacbpkey)
    # place all syntenic_pacbpkeys and PacbPORFs in the syntenicPCG
    # and, at the same time, remove from the main PCG
    syntenicPCG = PacbpCollectionGraph(crossdata={}, blastmatrix=PCG._blastmatrix)
    for key in syntenic_pacbpkeys:
        (pacbpkey, nodeQ, nodeS) = key
        # keys were gathered from the inwpCBGs but are looked up in PCG;
        # presumably each of them is present in PCG.pacbps -- confirm
        pacbporf = PCG.pacbps[key]
        # add to syntenicPCG
        syntenicPCG.add_node(nodeQ)
        syntenicPCG.add_node(nodeS)
        syntenicPCG.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
        syntenicPCG.pacbps[(pacbpkey, nodeQ, nodeS)] = pacbporf
        # remove from main PCG
        _delete_pacbp(PCG, key)
    # return syntenicPCG
    return syntenicPCG
def detect_and_remove_synteny(inwpcbgs,PCG,GENE_IDENTIFIER_SET,verbose=True):
    """
    Detect syntenic inwpCBGs and move their pacbps from PCG into a new graph

    Syntenic inwpCBGs are searched for (with assign_syntenic_inwpcbgs) in:
      1. the given inwpcbgs themselves
      2. inwpCBGs rebuilt (with PCG2inwpCBGS) from sub-selections of PCG;
         each sub-selection is limited to the organisms that remain in a
         GeneTreeGraph of an input inwpCBG after its weakly connected
         nodes are removed

    NOTE(review): a function with this same name is already defined earlier
    in this file; this later definition is the one that stays bound to the
    name after import.

    @type  inwpcbgs: list
    @param inwpcbgs: inwpCBG objects (presumably
                     InwardsPointingCodingBlockGraph instances -- confirm)

    @type  PCG: presumably PacbpCollectionGraph -- confirm
    @param PCG: graph with a .pacbps dict and a ._blastmatrix attribute;
                every pacbp key assigned as syntenic is handed to
                _delete_pacbp(PCG,key)

    @type  GENE_IDENTIFIER_SET: container that supports the `in` operator
    @param GENE_IDENTIFIER_SET: organism identifiers that are allowed as
                                (subject) nodes in the GeneTreeGraph

    @type  verbose: Boolean
    @param verbose: print the detected syntenic inwpCBGs to STDOUT

    @rtype:  PacbpCollectionGraph or False
    @return: PacbpCollectionGraph filled with the syntenic pacbps, or
             False when not a single syntenic inwpCBG was detected
    """
    # a node is deleted from the GeneTreeGraph while the lowest
    # observed-vs-expected weighted connectivity is below this ratio
    MIN_OBSERVED_VS_EXPECTED_RATIO = 0.20
    # organism combinations (ordered node lists) that were already analysed
    observed_organism_subcombis = []
    # all syntenic inwpCBGs found, in the main as well as in the sub inwpCBGs
    syntenic_subinwpcbgs = []

    # detect syntenic genes in MAIN inwpCBGs,
    # without taking strongest informants by GTG analyses
    syntenic_inwpcbgs = assign_syntenic_inwpcbgs(inwpcbgs)
    for syntinwpcbg in syntenic_inwpcbgs:
        syntenic_subinwpcbgs.append(syntinwpcbg)

    for inwpCBG in inwpcbgs:
        # omit inwpCBGs with annotated exons/orfs
        if inwpCBG.count_orfs_labeled_as_annotated_exon() >= 2: continue
        target = inwpCBG._get_target_organism()

        # make a (artificially fully connected) GeneTreeGraph
        gtg = GeneTreeGraph()
        gtg.add_node(target)
        # first pass: add the subject organisms as nodes; only orgS is
        # checked against GENE_IDENTIFIER_SET (the query organism is
        # presumably always the target -- confirm)
        for (pacbpkey,nodeQ,nodeS),pacbporf in inwpCBG.pacbps.iteritems():
            orgS = inwpCBG.organism_by_node(nodeS)
            if orgS not in GENE_IDENTIFIER_SET: continue
            gtg.add_node(orgS)
        # second pass: add the edges, weighted by the pacbporf bitscore
        for (pacbpkey,nodeQ,nodeS),pacbporf in inwpCBG.pacbps.iteritems():
            orgQ = inwpCBG.organism_by_node(nodeQ)
            orgS = inwpCBG.organism_by_node(nodeS)
            if orgS not in GENE_IDENTIFIER_SET: continue
            gtg.add_edge( orgQ, orgS, wt = pacbporf.bitscore )
            # make artificially missed edges between the informants
            for org in inwpCBG.organism_set():
                if org not in [orgQ,orgS] and org in GENE_IDENTIFIER_SET:
                    if gtg.has_edge( orgS, org ) and\
                            gtg.weights[(orgS, org)] > pacbporf.bitscore:
                        # existing edge has a higher weight: lower it to
                        # the bitscore of the current pacbporf
                        gtg.set_edge_weight(orgS,org,wt = pacbporf.bitscore)
                    else:
                        # NOTE(review): this branch is also taken when the
                        # edge exists with a weight <= bitscore; what
                        # add_edge does with an existing edge is not
                        # visible here -- confirm
                        gtg.add_edge( orgS, org, wt = pacbporf.bitscore )

        # omit (nearly) empty genetreegraphs
        if gtg.node_count() <= 1: continue

        # remove (much) weaker connected nodes as expected from the gtg
        while gtg.get_nodes() and MIN_OBSERVED_VS_EXPECTED_RATIO >\
                min( [ gtg.get_node_weighted_connectivity_observed_vs_expected(node) for node in gtg.get_nodes() ]):
            node = gtg.weakest_connected_node()
            gtg.del_node(node)

        # check if already tested before; present in observed_organism_subcombis
        if gtg.get_ordered_nodes() in observed_organism_subcombis: continue

        # store to already tested organism subcombinations
        observed_organism_subcombis.append( gtg.get_ordered_nodes() )

        # create a subPCG of these organisms
        subPCG = PacbpCollectionGraph(crossdata={}, blastmatrix=PCG._blastmatrix)
        for (pacbpkey,nodeQ,nodeS), pacbporf in PCG.pacbps.iteritems():
            # nodes are (organism,orfid) tuples
            (orgQ,orfQid),(orgS,orfSid) = nodeQ,nodeS
            # keep only pacbps of which BOTH organisms are still in the gtg
            if orgQ not in gtg.get_nodes(): continue
            if orgS not in gtg.get_nodes(): continue
            subPCG.add_node(nodeQ)
            subPCG.add_node(nodeS)
            subPCG.add_edge(nodeQ,nodeS,wt=pacbporf.bitscore)
            subPCG.pacbps[(pacbpkey,nodeQ,nodeS)] = pacbporf

        # make inwpCBGs of this subPCG
        subinwpcbgs = PCG2inwpCBGS(subPCG)

        # check if there are subinwpcbgs
        if not subinwpcbgs: continue

        ########################################################################
        #if verbose:
        #    print "subPCG organism set:", gtg.get_ordered_nodes()
        #    print_inwpcbgstructure(subinwpcbgs,gtg.get_ordered_nodes())
        ########################################################################

        # (disabled code, kept for reference)
        # create a subInwardsPointingCodingBlockGraph of these organisms
        #subinwpCBG = InwardsPointingCodingBlockGraph()
        #for (pacbpkey,nodeQ,nodeS), pacbporf in inwpCBG.pacbps.iteritems():
        #    (orgQ,orfQid),(orgS,orfSid) = nodeQ,nodeS
        #    if orgQ not in gtg.get_nodes(): continue
        #    if orgS not in gtg.get_nodes(): continue
        #    subinwpCBG.add_node(nodeQ)
        #    subinwpCBG.add_node(nodeS)
        #    subinwpCBG.add_edge(nodeQ,nodeS,wt=pacbporf.bitscore)
        #    subinwpCBG.pacbps[(pacbpkey,nodeQ,nodeS)] = pacbporf

        # detect syntenic genes in this subinwpcbgs
        syntenic_inwpcbgs = assign_syntenic_inwpcbgs(subinwpcbgs)
        for syntinwpcbg in syntenic_inwpcbgs:
            syntenic_subinwpcbgs.append(syntinwpcbg)
            ####################################################################
            if verbose:
                print "SYNTENIC!!", syntinwpcbg, syntinwpcbg.get_ordered_nodes()
                for subCBG in subinwpcbgs:
                    print "syntenic in:", subCBG, subCBG.get_ordered_nodes()
            ####################################################################

    # nothing syntenic detected at all: False is returned, not an empty graph
    if not syntenic_subinwpcbgs:
        return False

    # cleanup all inwpCBGs from the syntenic subInwpCBGs
    syntenic_pacbpkeys = []
    for syntinwpcbg in syntenic_subinwpcbgs:
        node_set = syntinwpcbg.node_set()
        for inwpCBG in inwpcbgs:
            # empty difference: every node of the syntenic inwpCBG is
            # present in this main inwpCBG
            if not node_set.difference(inwpCBG.node_set()):
                # here `pacbpkey` is the complete (pacbpkey,nodeQ,nodeS)
                # key of the pacbps dict; it is unpacked as such below
                for pacbpkey in inwpCBG.pacbps.keys():
                    if pacbpkey not in syntenic_pacbpkeys:
                        syntenic_pacbpkeys.append(pacbpkey)

    # place all syntenic_pacbpkeys and PacbPORFs in the syntenicPCG
    # and, at the same time, remove from the main PCG
    syntenicPCG = PacbpCollectionGraph(crossdata={},blastmatrix=PCG._blastmatrix)
    for key in syntenic_pacbpkeys:
        (pacbpkey,nodeQ,nodeS) = key
        # keys were gathered from the inwpCBGs but are looked up in PCG;
        # presumably each of them is present in PCG.pacbps -- confirm
        pacbporf = PCG.pacbps[key]
        # add to syntenicPCG
        syntenicPCG.add_node(nodeQ)
        syntenicPCG.add_node(nodeS)
        syntenicPCG.add_edge(nodeQ,nodeS,wt=pacbporf.bitscore)
        syntenicPCG.pacbps[(pacbpkey,nodeQ,nodeS)] = pacbporf
        # remove from main PCG
        _delete_pacbp(PCG,key)

    # return syntenicPCG
    return syntenicPCG