def assign_gtgdiscrepancy_inwpcbgs(inwpcbgs, GTG, exclude_annotated=True,
                                   verbose=True):
    """
    Return the inwpCBGs that are flagged as discrepant with the given GTG.

    Arguments:
      inwpcbgs           list of inwpCBG objects; the target organism is
                         read from the first element
      GTG                reference GeneTreeGraph that the gtg of each tested
                         inwpCBG is compared with
      exclude_annotated  when True, an inwpCBG is only tested when it lies
                         outside the first..final window returned by
                         get_first_and_final_inwpcbg_pos(), or lies inside it
                         without a single Orf labeled as annotated exon.
                         Additionally, every inwpCBG whose target Orf id is
                         shared with an annotated inwpCBG of that window is
                         skipped.
      verbose            when True, print diagnostics to stdout

    An inwpCBG is a candidate when _relative_gtg_difference() is below
    NONGENE_GTG_MAX_DIFFERENCE. A candidate is flagged when it has at most
    2 nodes, or has no cexpander uniformly aligned positions, or when both
    its minsr/maxsr ratio and its gtg/GTG average weight ratio are below
    their NONGENE_GTG_MAX_* thresholds.

    Returns a (possibly empty) list of the flagged inwpCBG objects, in the
    order in which they occur in inwpcbgs.
    """
    # return empty list when no inwpcbgs applied
    if not inwpcbgs:
        return []

    # get target organism identifier
    target = inwpcbgs[0]._get_target_organism()

    if exclude_annotated:
        # get most likely first & final inwpCBG pointer in inwpcbgs list
        posFirst, posFinal = get_first_and_final_inwpcbg_pos(inwpcbgs)
        range_5p_test = range(0, posFirst)
        range_3p_test = range(posFinal + 1, len(inwpcbgs))
        # target Orf ids of the annotated inwpCBGs; these Orfs are protected
        protected_target_orfid_list = [
            inwpCBG.get_orfs_of_graph(organism=target)[0].id
            for inwpCBG in inwpcbgs[posFirst:posFinal + 1]
            if inwpCBG.count_orfs_labeled_as_annotated_exon() > 0
        ]
        if verbose:
            print("NOT-excluded: %s %s" % (range_5p_test, range_3p_test))
    else:
        range_5p_test = []
        range_3p_test = []
        protected_target_orfid_list = []

    # return list with inwpcbgs
    gtgdiscrepancy_inwpcbg_list = []

    # the average weight of the reference GTG does not depend on the
    # inwpCBG under test; obtain it once, and only when it is needed
    average_wt_GTG = None

    # detect UTR or nongene / noncoding inwpCBGS
    for pos, thisInwpCBG in enumerate(inwpcbgs):
        # inside the annotated gene structure only inwpCBGs without a single
        # Orf annotated as an exon are assessed for gtg difference
        if exclude_annotated and pos not in range_5p_test and \
                pos not in range_3p_test and \
                thisInwpCBG.count_orfs_labeled_as_annotated_exon() != 0:
            continue

        # ignore if the target's Orf is belonging to a `protected` Orf
        if protected_target_orfid_list and \
                thisInwpCBG.get_orfs_of_graph(organism=target)[0].id in \
                protected_target_orfid_list:
            continue

        # ignore inwpCBGs which are very likely (poor quality) SignalP alignments
        cntSP = float(thisInwpCBG.count_orfs_with_signalpeptides())
        if cntSP / (thisInwpCBG.count_genomic_informants() + 1) > 0.66 and \
                thisInwpCBG.get_signalp_score() > 0.75:
            continue

        # create its GeneTreeGraph
        gtg = pcg2gtg_by_identity(thisInwpCBG, target)

        # step 1. Do the gtg/GTG difference check
        difference = _relative_gtg_difference(gtg, GTG, target)
        if not difference < NONGENE_GTG_MAX_DIFFERENCE:
            continue

        # step 2. Do the CEXPANDER check
        if thisInwpCBG.node_count() <= 2:
            is_discrepant = True
        elif thisInwpCBG.get_cexpander_uniformly_aligned_count() == 0:
            is_discrepant = True
        else:
            # cexpander check is successful, GTGdifference claims the
            # alignment is bogus. Do a more elaborate check on some other
            # variables of thisInwpCBG

            # calculate the ratio between minsr & maxsr lengths
            # NOTE(review): a zero maxsr or a zero GTG average weight raises
            # ZeroDivisionError here, as it did before -- confirm that these
            # values cannot occur.
            node = thisInwpCBG.get_organism_nodes(target)[0]
            minsr = thisInwpCBG.minimal_spanning_range_sizes()[node]
            maxsr = thisInwpCBG.maximal_spanning_range_sizes()[node]
            msr_ratio = float(minsr) / float(maxsr)

            # calculate the ratio between average weights of gtg and GTG
            average_wt_gtg = _pairwise_gtg_average_weight(gtg, target)
            if average_wt_GTG is None:
                average_wt_GTG = _pairwise_gtg_average_weight(GTG, target)
            gtg_ratio = average_wt_gtg / average_wt_GTG

            is_discrepant = (msr_ratio < NONGENE_GTG_MAX_MSR_RATIO and
                             gtg_ratio < NONGENE_GTG_MAX_GTG_RATIO)

        if not is_discrepant:
            continue

        gtgdiscrepancy_inwpcbg_list.append(thisInwpCBG)
        if verbose:
            print("%s thisInwpCBG gtg2GTGdiff:: %1.3f < %1.3f %s" % (
                pos, difference, NONGENE_GTG_MAX_DIFFERENCE,
                thisInwpCBG.get_organism_nodes(target)[0]))

    # return the gtgdiscrepancy_inwpcbg_list
    return gtgdiscrepancy_inwpcbg_list
def assign_gtgdiscrepancy_inwpcbgs(inwpcbgs, GTG, exclude_annotated=True,
                                   verbose=True):
    """
    Return the inwpCBGs that are flagged as discrepant with the given GTG.

    Arguments:
      inwpcbgs           list of inwpCBG objects; the target organism is
                         read from the first element
      GTG                reference GeneTreeGraph that the gtg of each tested
                         inwpCBG is compared with
      exclude_annotated  when True, an inwpCBG is only tested when it lies
                         outside the first..final window returned by
                         get_first_and_final_inwpcbg_pos(), or lies inside it
                         without a single Orf labeled as annotated exon.
                         Additionally, every inwpCBG whose target Orf id is
                         shared with an annotated inwpCBG of that window is
                         skipped.
      verbose            when True, print diagnostics to stdout

    An inwpCBG is a candidate when _relative_gtg_difference() is below
    NONGENE_GTG_MAX_DIFFERENCE. A candidate is flagged when it has at most
    2 nodes, or has no cexpander uniformly aligned positions, or when both
    its minsr/maxsr ratio and its gtg/GTG average weight ratio are below
    their NONGENE_GTG_MAX_* thresholds.

    Returns a (possibly empty) list of the flagged inwpCBG objects, in the
    order in which they occur in inwpcbgs.
    """
    # return empty list when no inwpcbgs applied
    if not inwpcbgs:
        return []

    # get target organism identifier
    target = inwpcbgs[0]._get_target_organism()

    if exclude_annotated:
        # get most likely first & final inwpCBG pointer in inwpcbgs list
        posFirst, posFinal = get_first_and_final_inwpcbg_pos(inwpcbgs)
        range_5p_test = range(0, posFirst)
        range_3p_test = range(posFinal + 1, len(inwpcbgs))
        # target Orf ids of the annotated inwpCBGs; these Orfs are protected
        protected_target_orfid_list = [
            inwpCBG.get_orfs_of_graph(organism=target)[0].id
            for inwpCBG in inwpcbgs[posFirst:posFinal + 1]
            if inwpCBG.count_orfs_labeled_as_annotated_exon() > 0
        ]
        if verbose:
            print("NOT-excluded: %s %s" % (range_5p_test, range_3p_test))
    else:
        range_5p_test = []
        range_3p_test = []
        protected_target_orfid_list = []

    # return list with inwpcbgs
    gtgdiscrepancy_inwpcbg_list = []

    # the average weight of the reference GTG does not depend on the
    # inwpCBG under test; obtain it once, and only when it is needed
    average_wt_GTG = None

    # detect UTR or nongene / noncoding inwpCBGS
    for pos, thisInwpCBG in enumerate(inwpcbgs):
        # inside the annotated gene structure only inwpCBGs without a single
        # Orf annotated as an exon are assessed for gtg difference
        if exclude_annotated and pos not in range_5p_test and \
                pos not in range_3p_test and \
                thisInwpCBG.count_orfs_labeled_as_annotated_exon() != 0:
            continue

        # ignore if the target's Orf is belonging to a `protected` Orf
        if protected_target_orfid_list and \
                thisInwpCBG.get_orfs_of_graph(organism=target)[0].id in \
                protected_target_orfid_list:
            continue

        # ignore inwpCBGs which are very likely (poor quality) SignalP alignments
        cntSP = float(thisInwpCBG.count_orfs_with_signalpeptides())
        if cntSP / (thisInwpCBG.count_genomic_informants() + 1) > 0.66 and \
                thisInwpCBG.get_signalp_score() > 0.75:
            continue

        # create its GeneTreeGraph
        gtg = pcg2gtg_by_identity(thisInwpCBG, target)

        # step 1. Do the gtg/GTG difference check
        difference = _relative_gtg_difference(gtg, GTG, target)
        if not difference < NONGENE_GTG_MAX_DIFFERENCE:
            continue

        # step 2. Do the CEXPANDER check
        if thisInwpCBG.node_count() <= 2:
            is_discrepant = True
        elif thisInwpCBG.get_cexpander_uniformly_aligned_count() == 0:
            is_discrepant = True
        else:
            # cexpander check is successful, GTGdifference claims the
            # alignment is bogus. Do a more elaborate check on some other
            # variables of thisInwpCBG

            # calculate the ratio between minsr & maxsr lengths
            # NOTE(review): a zero maxsr or a zero GTG average weight raises
            # ZeroDivisionError here, as it did before -- confirm that these
            # values cannot occur.
            node = thisInwpCBG.get_organism_nodes(target)[0]
            minsr = thisInwpCBG.minimal_spanning_range_sizes()[node]
            maxsr = thisInwpCBG.maximal_spanning_range_sizes()[node]
            msr_ratio = float(minsr) / float(maxsr)

            # calculate the ratio between average weights of gtg and GTG
            average_wt_gtg = _pairwise_gtg_average_weight(gtg, target)
            if average_wt_GTG is None:
                average_wt_GTG = _pairwise_gtg_average_weight(GTG, target)
            gtg_ratio = average_wt_gtg / average_wt_GTG

            is_discrepant = (msr_ratio < NONGENE_GTG_MAX_MSR_RATIO and
                             gtg_ratio < NONGENE_GTG_MAX_GTG_RATIO)

        if not is_discrepant:
            continue

        gtgdiscrepancy_inwpcbg_list.append(thisInwpCBG)
        if verbose:
            print("%s thisInwpCBG gtg2GTGdiff:: %1.3f < %1.3f %s" % (
                pos, difference, NONGENE_GTG_MAX_DIFFERENCE,
                thisInwpCBG.get_organism_nodes(target)[0]))

    # return the gtgdiscrepancy_inwpcbg_list
    return gtgdiscrepancy_inwpcbg_list
def detect_and_remove_gtgdiscrepancy(inwpcbgs, PCG, GENE_IDENTIFIER_SET,
                                     verbose=True):
    """
    Move the PacbPORFs of GTG-discrepant inwpCBGs from PCG into a new graph.

    Arguments:
      inwpcbgs             list of inwpCBG objects; the target organism is
                           read from the first element
      PCG                  PacbpCollectionGraph; the selected PacbPORFs are
                           removed from it IN PLACE through _delete_pacbp()
      GENE_IDENTIFIER_SET  gene informant identifiers; passed as
                           identifier_list when building the GTGs and used
                           to size the reference GTG
      verbose              when True, print diagnostics to stdout (also
                           forwarded to assign_gtgdiscrepancy_inwpcbgs)

    Returns False when inwpcbgs is empty, when PCG is false or has no nodes,
    or when no inwpCBG is flagged. Otherwise returns a PacbpCollectionGraph
    holding the PacbPORFs that occur exclusively in flagged inwpCBGs, with
    edges weighted by the PacbPORF bitscore.
    """
    # if empty list or empty PCG provided: return False
    if not inwpcbgs or not PCG or PCG.node_count() == 0:
        return False

    # get target organism identifier
    target = inwpcbgs[0]._get_target_organism()

    # Make *the* GTG of the strongest X informant species
    # X depends on the maximum number of gene informants (GENE_IDENTIFIER_SET);
    # unigene informants are not taken into account here.
    # X is defined here by:
    # -- at least 3 informants (for very small number of informants)
    # -- optimally half of the total number of informants
    # -- at most 8 informants
    min_gtg_node_count = 3 + 1
    max_gtg_node_count = 8 + 1
    # `//` is the explicit spelling of the integer division done here
    gtg_size = min([(len(GENE_IDENTIFIER_SET) - 1) // 2, max_gtg_node_count])
    gtg_size = max([min_gtg_node_count, gtg_size])

    btGTG = pcg2gtg_by_bitscore(PCG, target, identifier_list=GENE_IDENTIFIER_SET)
    ntGTG = pcg2gtg_by_identity(PCG, target, identifier_list=GENE_IDENTIFIER_SET)

    # TEMP solution because OrganismGraph != OrganismStarGraph
    # make bitscore ordered list of (weight, informant), weakest first
    bitscore_ordered_nodes = sorted(
        (wt, iNode)
        for (tNode, iNode), wt in btGTG.weights.items()
        if tNode == target
    )

    # Remove the weakest informants until the GTG has the desired size.
    # btGTG.weakest_connected_node() is not used because
    # OrganismGraph != OrganismStarGraph, which in rare cases causes the
    # target node to be assigned as the weakest node.
    # The loop also stops when no candidate informants are left, which
    # formerly raised an IndexError on pop(0).
    while ntGTG.node_count() > gtg_size and bitscore_ordered_nodes:
        (wt, informant) = bitscore_ordered_nodes.pop(0)
        btGTG.del_node(informant)
        ntGTG.del_node(informant)
        if verbose:
            print("btGGT.weakest_connected_node() == %s %s" % (
                informant, btGTG.get_ordered_nodes()))

    if verbose:
        # one line: the ordered nodes followed by the identity weight of
        # each informant towards the target
        ordered_nodes = ntGTG.get_ordered_nodes()
        pieces = ["ntGTG:", str(ordered_nodes)]
        pieces.extend("%1.2f" % ntGTG.weights[(target, node)]
                      for node in ordered_nodes if node != target)
        # trailing empty piece keeps the historical trailing space
        pieces.append("")
        print(" ".join(pieces))

    # detect inwpCBGs which are probably the result of intron alignments
    gtgdiscrepancy_internal_inwpcbg_list = assign_internal_nongene_alignments(
        inwpcbgs, ntGTG)

    # detect inwpCBGs with strong discrepancy to this GTG; pass verbose on
    # so that verbose=False silences this step as well
    gtgdiscrepancy_inwpcbg_list = assign_gtgdiscrepancy_inwpcbgs(
        inwpcbgs, ntGTG, verbose=verbose)

    # merge both lists; inwpCBGs are identified by their str() representation
    if gtgdiscrepancy_internal_inwpcbg_list:
        if not gtgdiscrepancy_inwpcbg_list:
            gtgdiscrepancy_inwpcbg_list.extend(
                gtgdiscrepancy_internal_inwpcbg_list)
        else:
            seen_strs = set(str(gtgdiscrCBG)
                            for gtgdiscrCBG in gtgdiscrepancy_inwpcbg_list)
            for inwpcbg in gtgdiscrepancy_internal_inwpcbg_list:
                check_str = str(inwpcbg)
                if check_str not in seen_strs:
                    seen_strs.add(check_str)
                    gtgdiscrepancy_inwpcbg_list.append(inwpcbg)

    if not gtgdiscrepancy_inwpcbg_list:
        return False

    # get list of inwpCBGs that have NO discrepancy
    check_strs = set(str(discrinwpCBG)
                     for discrinwpCBG in gtgdiscrepancy_inwpcbg_list)
    correct_inwpcbg_list = [inwpcbg for inwpcbg in inwpcbgs
                            if str(inwpcbg) not in check_strs]

    # pacbp keys that occur in a non-removed inwpCBG must stay in the PCG
    retained_pacbpkeys = set()
    for inwp in correct_inwpcbg_list:
        retained_pacbpkeys.update(inwp.pacbps.keys())

    # get all pacbp keys belonging to gtgdiscrepancy inwpcbgs ONLY,
    # unique and in order of first occurrence
    gtgdiscrepancy_pacbpkeys = []
    stored_pacbpkeys = set()
    for discrinwpCBG in gtgdiscrepancy_inwpcbg_list:
        for pacbpkey in discrinwpCBG.pacbps.keys():
            if pacbpkey in retained_pacbpkeys or pacbpkey in stored_pacbpkeys:
                continue
            stored_pacbpkeys.add(pacbpkey)
            gtgdiscrepancy_pacbpkeys.append(pacbpkey)

    # place all gtgdiscrepancy_pacbpkeys and PacbPORFs in the gtgdiscrepancyPCG
    # and, at the same time, remove from the main PCG
    gtgdiscrepancyPCG = PacbpCollectionGraph(crossdata={},
                                             blastmatrix=PCG._blastmatrix)
    for key in gtgdiscrepancy_pacbpkeys:
        # membership is re-checked per key because _delete_pacbp() changes
        # PCG.pacbps while this loop runs
        if key not in PCG.pacbps.keys():
            # TODO why is this key not present in the PCG?
            # Continue here to avoid a KeyError. This PacbPORF was to be
            # deleted right here, so skipping it is not a disaster.
            continue
        (pacbpkey, nodeQ, nodeS) = key
        pacbporf = PCG.pacbps[key]
        # add to gtgdiscrepancyPCG
        gtgdiscrepancyPCG.add_node(nodeQ)
        gtgdiscrepancyPCG.add_node(nodeS)
        gtgdiscrepancyPCG.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
        gtgdiscrepancyPCG.pacbps[(pacbpkey, nodeQ, nodeS)] = pacbporf
        # remove from main PCG
        _delete_pacbp(PCG, key)

    # return gtgdiscrepancyPCG
    return gtgdiscrepancyPCG
def detect_and_remove_gtgdiscrepancy(inwpcbgs, PCG, GENE_IDENTIFIER_SET,
                                     verbose=True):
    """
    Move the PacbPORFs of GTG-discrepant inwpCBGs from PCG into a new graph.

    Arguments:
      inwpcbgs             list of inwpCBG objects; the target organism is
                           read from the first element
      PCG                  PacbpCollectionGraph; the selected PacbPORFs are
                           removed from it IN PLACE through _delete_pacbp()
      GENE_IDENTIFIER_SET  gene informant identifiers; passed as
                           identifier_list when building the GTGs and used
                           to size the reference GTG
      verbose              when True, print diagnostics to stdout (also
                           forwarded to assign_gtgdiscrepancy_inwpcbgs)

    Returns False when inwpcbgs is empty, when PCG is false or has no nodes,
    or when no inwpCBG is flagged. Otherwise returns a PacbpCollectionGraph
    holding the PacbPORFs that occur exclusively in flagged inwpCBGs, with
    edges weighted by the PacbPORF bitscore.
    """
    # if empty list or empty PCG provided: return False
    if not inwpcbgs or not PCG or PCG.node_count() == 0:
        return False

    # get target organism identifier
    target = inwpcbgs[0]._get_target_organism()

    # Make *the* GTG of the strongest X informant species
    # X depends on the maximum number of gene informants (GENE_IDENTIFIER_SET);
    # unigene informants are not taken into account here.
    # X is defined here by:
    # -- at least 3 informants (for very small number of informants)
    # -- optimally half of the total number of informants
    # -- at most 8 informants
    min_gtg_node_count = 3 + 1
    max_gtg_node_count = 8 + 1
    # `//` is the explicit spelling of the integer division done here
    gtg_size = min([(len(GENE_IDENTIFIER_SET) - 1) // 2, max_gtg_node_count])
    gtg_size = max([min_gtg_node_count, gtg_size])

    btGTG = pcg2gtg_by_bitscore(PCG, target, identifier_list=GENE_IDENTIFIER_SET)
    ntGTG = pcg2gtg_by_identity(PCG, target, identifier_list=GENE_IDENTIFIER_SET)

    # TEMP solution because OrganismGraph != OrganismStarGraph
    # make bitscore ordered list of (weight, informant), weakest first
    bitscore_ordered_nodes = sorted(
        (wt, iNode)
        for (tNode, iNode), wt in btGTG.weights.items()
        if tNode == target
    )

    # Remove the weakest informants until the GTG has the desired size.
    # btGTG.weakest_connected_node() is not used because
    # OrganismGraph != OrganismStarGraph, which in rare cases causes the
    # target node to be assigned as the weakest node.
    # The loop also stops when no candidate informants are left, which
    # formerly raised an IndexError on pop(0).
    while ntGTG.node_count() > gtg_size and bitscore_ordered_nodes:
        (wt, informant) = bitscore_ordered_nodes.pop(0)
        btGTG.del_node(informant)
        ntGTG.del_node(informant)
        if verbose:
            print("btGGT.weakest_connected_node() == %s %s" % (
                informant, btGTG.get_ordered_nodes()))

    if verbose:
        # one line: the ordered nodes followed by the identity weight of
        # each informant towards the target
        ordered_nodes = ntGTG.get_ordered_nodes()
        pieces = ["ntGTG:", str(ordered_nodes)]
        pieces.extend("%1.2f" % ntGTG.weights[(target, node)]
                      for node in ordered_nodes if node != target)
        # trailing empty piece keeps the historical trailing space
        pieces.append("")
        print(" ".join(pieces))

    # detect inwpCBGs which are probably the result of intron alignments
    gtgdiscrepancy_internal_inwpcbg_list = assign_internal_nongene_alignments(
        inwpcbgs, ntGTG)

    # detect inwpCBGs with strong discrepancy to this GTG; pass verbose on
    # so that verbose=False silences this step as well
    gtgdiscrepancy_inwpcbg_list = assign_gtgdiscrepancy_inwpcbgs(
        inwpcbgs, ntGTG, verbose=verbose)

    # merge both lists; inwpCBGs are identified by their str() representation
    if gtgdiscrepancy_internal_inwpcbg_list:
        if not gtgdiscrepancy_inwpcbg_list:
            gtgdiscrepancy_inwpcbg_list.extend(
                gtgdiscrepancy_internal_inwpcbg_list)
        else:
            seen_strs = set(str(gtgdiscrCBG)
                            for gtgdiscrCBG in gtgdiscrepancy_inwpcbg_list)
            for inwpcbg in gtgdiscrepancy_internal_inwpcbg_list:
                check_str = str(inwpcbg)
                if check_str not in seen_strs:
                    seen_strs.add(check_str)
                    gtgdiscrepancy_inwpcbg_list.append(inwpcbg)

    if not gtgdiscrepancy_inwpcbg_list:
        return False

    # get list of inwpCBGs that have NO discrepancy
    check_strs = set(str(discrinwpCBG)
                     for discrinwpCBG in gtgdiscrepancy_inwpcbg_list)
    correct_inwpcbg_list = [inwpcbg for inwpcbg in inwpcbgs
                            if str(inwpcbg) not in check_strs]

    # pacbp keys that occur in a non-removed inwpCBG must stay in the PCG
    retained_pacbpkeys = set()
    for inwp in correct_inwpcbg_list:
        retained_pacbpkeys.update(inwp.pacbps.keys())

    # get all pacbp keys belonging to gtgdiscrepancy inwpcbgs ONLY,
    # unique and in order of first occurrence
    gtgdiscrepancy_pacbpkeys = []
    stored_pacbpkeys = set()
    for discrinwpCBG in gtgdiscrepancy_inwpcbg_list:
        for pacbpkey in discrinwpCBG.pacbps.keys():
            if pacbpkey in retained_pacbpkeys or pacbpkey in stored_pacbpkeys:
                continue
            stored_pacbpkeys.add(pacbpkey)
            gtgdiscrepancy_pacbpkeys.append(pacbpkey)

    # place all gtgdiscrepancy_pacbpkeys and PacbPORFs in the gtgdiscrepancyPCG
    # and, at the same time, remove from the main PCG
    gtgdiscrepancyPCG = PacbpCollectionGraph(crossdata={},
                                             blastmatrix=PCG._blastmatrix)
    for key in gtgdiscrepancy_pacbpkeys:
        # membership is re-checked per key because _delete_pacbp() changes
        # PCG.pacbps while this loop runs
        if key not in PCG.pacbps.keys():
            # TODO why is this key not present in the PCG?
            # Continue here to avoid a KeyError. This PacbPORF was to be
            # deleted right here, so skipping it is not a disaster.
            continue
        (pacbpkey, nodeQ, nodeS) = key
        pacbporf = PCG.pacbps[key]
        # add to gtgdiscrepancyPCG
        gtgdiscrepancyPCG.add_node(nodeQ)
        gtgdiscrepancyPCG.add_node(nodeS)
        gtgdiscrepancyPCG.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
        gtgdiscrepancyPCG.pacbps[(pacbpkey, nodeQ, nodeS)] = pacbporf
        # remove from main PCG
        _delete_pacbp(PCG, key)

    # return gtgdiscrepancyPCG
    return gtgdiscrepancyPCG