def get_best_matching_domain(self, hmm_hits, observed_genes_per_domain, bin_name='UNKNOWN'):
    """Returns the best matching domain and other domain info using a random forest.

    The input dict `hmm_hits` has no role in predicting the best matching domain, but it is
    useful to print out some helpful messages when necessary.

    Parameters
    ==========
    hmm_hits : dict
        Nested dict that is accessed as `hmm_hits[domain][source]`. The innermost dict is read
        for the keys `num_genes_in_model_with_hits`, `percent_completion`, and
        `percent_redundancy`.
    observed_genes_per_domain
        Passed as is to `self.SCG_domain_predictor.predict_from_observed_genes_per_domain`.
    bin_name : str
        Only used in the header of the debug output.

    Returns
    =======
    tuple
        `(best_matching_domain, domain_probabilities, control_domains, info_text)`, where
        `info_text` is a human-readable explanation that went through `remove_spaces`. If the
        class was not initialized properly and the hits include a domain or source the domain
        predictor does not know about, no prediction is attempted and the tuple is
        `('', {}, {}, info_text)`.
    """

    # learn domains anvi'o hmm hits know about .. NOTE: the key problem here is that due to this line, anvi'o currently allows
    # a single SCG collection per domain. This can be changed, and we will think about that when it is a necessity. No need
    # to be rocket scientists before the need arises.
    domains_in_hmm_hits = sorted(d for d in hmm_hits if list(hmm_hits[d].values())[0]['num_genes_in_model_with_hits'])
    sources_in_hmm_hits = sorted({list(hmm_hits[s].keys())[0] for s in hmm_hits})

    # if this class is initialized improperly, it means there are SCG domains in the contigs database anvi'o does not
    # recognize. but for a given bin, all HMM hits may be coming only from domains anvi'o recognizes. in those cases
    # we can predict the domain nicely, and move on with our lives. if hmm hits for a given bin includes hits from the
    # mysterious hmm collection the user defined, then there is not much we can do. here we will test the presence of
    # any HMM hits with a domain we don't recognize, and act accordingly.
    if not self.initialized_properly:
        hits_contain_a_domain_missing_from_SCG_domain_predictor = bool(
            set(domains_in_hmm_hits) & set(self.domains_missing_in_SCG_domain_predictor))
        hits_contain_a_source_missing_from_SCG_domain_predictor = bool(
            set(sources_in_hmm_hits) & set(self.sources_missing_in_SCG_domain_predictor))

        if hits_contain_a_domain_missing_from_SCG_domain_predictor or hits_contain_a_source_missing_from_SCG_domain_predictor:
            info_text = "NO DOMAIN ESTIMATION BECAUSE THERE IS WEIRD STUFF GOING ON. Anvi'o is having hard time determining the domain for\
                         this particular genomic bin because it includes HMM hits coming from single-copy core gene collection anvi'o did not\
                         know about when the domain predictor was trained :/ This is not a very big deal as anvi'o will continue showing you\
                         the completion/redundancy estimates for every SCG collection you have in this contigs database, but it will not predict\
                         the proper domain for you."

            # clean up the continuation padding here as well, just like the regular return does
            return ('', {}, {}, remove_spaces(info_text))

    # learn domain predictions from anvi'o random forest
    domain_probabilities, actual_domains, control_domains = self.SCG_domain_predictor.predict_from_observed_genes_per_domain(observed_genes_per_domain)

    if anvio.DEBUG:
        self.run.warning(None, header="DOMAIN ESTIMTES FOR '%s'" % bin_name, lc='green')

        for domain in control_domains:
            self.run.info_single("Probability %s %.2f" % (domain.upper(), domain_probabilities[domain]), mc='cyan')

        for domain in actual_domains:
            source = self.SCG_domain_predictor.SCG_domain_to_source[domain]
            if domain in domains_in_hmm_hits:
                self.run.info_single("Domain '%8s' (probability: %.2f) C/R: %.2f/%.2f" % (domain,
                                                                                        domain_probabilities[domain],
                                                                                        hmm_hits[domain][source]['percent_completion'],
                                                                                        hmm_hits[domain][source]['percent_redundancy']), mc='green')
            else:
                self.run.info_single("Domain '%8s' (probabiity: %.2f) (HMMs were not run for this / had 0 hits)" % (domain,
                                                                                                                  domain_probabilities[domain]), mc='red')

    # figure out the best matching domain and its confidence by looking at the actual
    # domains first. `max` returns the first item among ties, just like taking the first
    # item of a stable, reverse-sorted list would.
    best_matching_domain, domain_matching_confidence = max(
        (item for item in domain_probabilities.items() if item[0] in actual_domains),
        key=lambda item: item[1])

    # if the confidence is less than 0.2, then we are in the world of noise.
    # pick the control domain that matches best:
    if domain_matching_confidence < 0.20:
        best_matching_domain, domain_matching_confidence = max(
            (item for item in domain_probabilities.items() if item[0] in control_domains),
            key=lambda item: item[1])

    # figure out the redundancy given the best matching domain for further filtering
    # down below. it stays None when there are no HMM hits for the best matching domain
    # (which is always the case for control domains).
    if best_matching_domain in domains_in_hmm_hits:
        source = self.SCG_domain_predictor.SCG_domain_to_source[best_matching_domain]
        best_matching_domain_redundancy = hmm_hits[best_matching_domain][source]['percent_redundancy']
    else:
        best_matching_domain_redundancy = None

    # put together the message that explains the prediction
    info_text = ''

    if best_matching_domain in control_domains:
        if best_matching_domain == "mixed":
            info_text = "ANVI'O IS GETTING MIXED DOMAIN SIGNAL. This often means that the set of contigs here probably are\
                         originating from a populations that belong to different domains of life. This happens when your\
                         genomic bin includes tremendous amount of contamination that spans through archaea to bacteria to\
                         who knows what. That's OK, but anvi'o is unable to offer a completion or redundancy estimate for this\
                         selection."
        elif best_matching_domain == "blank":
            info_text = "NO DOMAIN ESTIMATION BECAUSE THERE IS NO SIGNAL. So anvi'o is having hard time determining any domain\
                         for this set of contigs either because the number of contigs are very little, or there are no\
                         SCGs among the selected ones. This may happen if you are working with genomes that are\
                         extremely low completion, or alternatively coming from parts of life that are very \
                         understudied (such as viruses or plasmids, etc). If these do not apply to you, and you are\
                         sure your set of contigs represents a proper genome, then either anvi'o made a mistake, or you\
                         stumbled upon a graet story."
        else:
            info_text = "ANVI'O IS CONFUSED. Your best predicted domain for this set of contigs seem to be a 'control domain'\
                         yet the code does not recognize that. So this is a question for the programmers :/"
    else:
        # NOTE: the confidence we test here is the probability of the best matching *actual*
        # domain. using the maximum over all probabilities would also consider the control
        # domains, and a bin with, say, blank=0.6 and bacteria=0.3 would be announced as a
        # 'very high confidence' prediction.
        if domain_matching_confidence < 0.5 and domain_probabilities['mixed'] >= 0.25:
            info_text = "CRAP DOMAIN EST BECAUSE STUFF IS MIXED. Please note that anvi'o determined '%s' \
                         as the best matching domain for your contigs BUT actually the probability of these \
                         contigs to be coming from mixed domains is crazy high (%.2f). Which means, neither \
                         this domain prediction, nor the completion and redundancy estimates should mean much,\
                         and you should take a look at the entire list of C/R estimates from all domain SCGs :/\
                         The good news is that more refined the input contigs (such as you make more and more\
                         precise selections or provdide more and more refined genomes), this situation will\
                         likely correct itself." % (best_matching_domain, domain_probabilities['mixed'])
        elif domain_matching_confidence < 0.5 and domain_probabilities['mixed'] < 0.25:
            info_text = "CRAP DOMAIN EST BECAUSE WHO KNOWS WHY. Please note that anvi'o determined '%s' as the\
                         best matching domain for your contigs to predict the completion and redundancy\
                         estimates through sigle-copy core genes, however, since the confidence is as low\
                         as %.2f, you should take this estimate with a grain of salt. This low confidence\
                         is most likely due to a very small number of contigs to offer reliable estimates\
                         of domain." % (best_matching_domain, domain_matching_confidence)
        else:
            if best_matching_domain_redundancy is None:
                if best_matching_domain in hmm_data.scg_domain_to_source:
                    info_text = "HOUSTON, WE HAVE A PROBLEM. The very high confidence of domain prediction indicates that this set of contigs\
                                 contain enough signal to classify it as %(domain)s. HOWEVER, your contigs database says that there are no hits for\
                                 HMMs described in the collection that serves the domain %(domain)s. This is only possible if you haven't run\
                                 `anvi-run-hmms` for some domains. This problem should go away if you were to run that program on your contigs\
                                 database with the parameter `-I %(collection)s`. If you don't do anuthing it's OK, too. The only problem is that you will\
                                 not get any completion/redundancy estiamtes for domain %(domain)s." % {'domain': best_matching_domain,
                                                                                                        'collection': hmm_data.scg_domain_to_source[best_matching_domain]}
                else:
                    info_text = "HOUSTON, WE HAVE A VERY BIG PROBLEM. The very high confidence of domain prediction indicates that this set of contigs\
                                 contain enough signal to classify it as %(domain)s. HOWEVER, your contigs database says that there are no hits for\
                                 HMMs described in the collection that serves the domain %(domain)s, which is fine, BUT THEN your anvi'o installatio does not\
                                 seem to have a collection that can be used to estimate the completion of this domain. This is all very very confusing\
                                 and if you let a developer know, they will be happy to investigate how did you end up here :(" % {'domain': best_matching_domain}
            elif best_matching_domain_redundancy < 10:
                info_text = "GREAT DOMAIN EST & YOU'RE GOLDEN. The very high confidence of domain prediction indicates that this set of contigs\
                             are almost certainly coming from a population that belongs to %s. IN ADDITION, the low redundancy of SCGs\
                             do not predict any serious contamination. But please remember that this information does not mean there is NO\
                             contamination in your genome bin. If you want to take a more carful look, you can try `anvi-refine`." \
                                                                    % (best_matching_domain)
            elif 10 <= best_matching_domain_redundancy <= 100:
                info_text = "GREAT DOMAIN CONFIDENCE (YAY) BUT SOME SERIOUS REDUNDANCY (BOO). The very high confidence of domain prediction indicates that this\
                             set of contigs are almost certainly coming from a population that belongs to domain %s. BUT you almost certainly are\
                             looking at a composite genome. You should consider refining this particular collection of contigs to lower the\
                             redundancy." % (best_matching_domain)
            elif best_matching_domain_redundancy > 100:
                info_text = "GREAT DOMAIN CONFIDENCE (YAY) BUT A TON OF REDUNDANCY (NOPE). The very high confidence of random forest indicates\
                             that the very large fraction of this set of contigs are almost certainly coming from populations that belong to the\
                             domain %s. HOWEVER, the extremely high amount of redundancy of SCGs in domain %s suggests that you either are working\
                             with a set of contigs that are extremely composite, or you are looking basically an entire metagenome or something." \
                                                                    % (best_matching_domain, best_matching_domain)
            else:
                # none of the comparisons above held for the redundancy value
                info_text = "GREAT DOMAIN CONFIDENCE BUT ANVI'O MADE A CONFUSE. Your redundancy estimates are weird and anvi'o needs an adult :("

    if anvio.DEBUG:
        self.run.warning(info_text)

    return (best_matching_domain, domain_probabilities, control_domains, remove_spaces(info_text))
def get_best_matching_domain(self, d, observed_genes_per_domain):
    """Returns the best matching domain by using a random forest classifier.

    The input dict `d` has no role in predicting the best matching domain, but it is useful
    to print out some helpful messages when necessary.

    Parameters
    ==========
    d : dict
        Nested dict that is accessed as `d[domain][source]`. The innermost dict is read for
        the keys `percent_completion` and `percent_redundancy`.
    observed_genes_per_domain
        Passed as is to `self.SCG_comain_predictor.predict_from_observed_genes_per_domain`.

    Returns
    =======
    tuple
        `(best_matching_domain, domain_matching_confidence, info_text)`, where `info_text` is
        a human-readable explanation that went through `remove_spaces`. When the best
        confidence is below 0.15 while the probability of the blank domain is above 0.5,
        the first two items are `None` and `0.0`.
    """

    # learn everything from the random forest. the first item of `domain_predictions` is
    # used as the best prediction below (presumably the predictor returns them sorted by
    # decreasing confidence -- TODO confirm against the predictor).
    domain_predictions, prob_mixed_domains, prob_blank_domain = self.SCG_comain_predictor.predict_from_observed_genes_per_domain(observed_genes_per_domain)

    if anvio.DEBUG:
        self.run.warning(None, header="SCG DATA FOR C/R ESTIMTES", lc='green')
        self.run.info_single("Probability BLANK: %.2f" % (prob_blank_domain), mc='cyan')
        self.run.info_single("Probability COMPOSITE %.2f" % (prob_mixed_domains), mc='cyan')

    # completion and redundancy estimates for every domain the predictor reported
    domain_cr = {}
    for confidence, domain in domain_predictions:
        source = self.SCG_comain_predictor.SCG_domain_to_source[domain]
        domain_cr[domain] = {'percent_completion': d[domain][source]['percent_completion'],
                             'percent_redundancy': d[domain][source]['percent_redundancy']}

        if anvio.DEBUG:
            self.run.info_single("Domain '%8s' (probability: %.2f) C/R: %.2f/%.2f" % (domain,
                                                                                    confidence,
                                                                                    domain_cr[domain]['percent_completion'],
                                                                                    domain_cr[domain]['percent_redundancy']), mc='green')

    domain_matching_confidence, best_matching_domain = domain_predictions[0]

    # NOTE: the lookup must use the best matching domain. the loop variable `domain` still
    # holds the *last* prediction at this point, which is not the one we report.
    best_matching_domain_redundancy = domain_cr[best_matching_domain]['percent_redundancy']

    # if the probability of this domain to be blank is so high, and the best matching domain
    # confidence is so low, do not return any completion or best matching domain estimates.
    if domain_matching_confidence < 0.15 and prob_blank_domain > 0.5:
        domain_matching_confidence, best_matching_domain = 0.0, None
        best_matching_domain_redundancy = None

    info_text = ''
    max_confidence = max([t[0] for t in domain_predictions])

    if prob_blank_domain > 0.5:
        info_text = "CRAP DOMAIN EST BECAUSE NO SIGNAL. So anvi'o is having hard time determining any domain\
                     for this selection either because the number of contigs are very little, or there are no\
                     SCGs among the selected ones. This may happen if you are working with genomes that are\
                     extremely low completion, or alternatively coming from parts of life that are very \
                     understudied (such as viruses or plasmids, etc). If these do not apply to you, and you are\
                     sure your selection represents a proper genome, then either anvi'o made a mistake, or you\
                     stumbled upon a graet story."
    elif max_confidence < 0.6 and prob_mixed_domains >= 0.4:
        # `>=` so that a mixed probability of exactly 0.4 does not slip through to the
        # high-confidence messages below
        info_text = "CRAP DOMAIN EST BECAUSE STUFF IS MIXED. Please note that anvi'o determined '%s' \
                     as the best matching domain for your contigs BUT actually the probability of these \
                     contigs to be coming from mixed domains is crazy high (%.2f). Which means, neither \
                     this domain prediction, nor the completion and redundancy estimates should mean much,\
                     and you should take a look at the entire list of C/R estimates from all domain SCGs :/\
                     The good news is that more refined the input contigs (such as you make more and more\
                     precise selections or provdide more and more refined genomes), this situation will\
                     likely correct itself." % (best_matching_domain, prob_mixed_domains)
    elif max_confidence < 0.6 and prob_mixed_domains < 0.4:
        info_text = "CRAP DOMAIN EST BECAUSE WHO KNOWS WHY. Please note that anvi'o determined '%s' as the\
                     best matching domain for your contigs to predict the completion and redundancy\
                     estimates through sigle-copy core genes, however, since the confidence is as low\
                     as %.2f, you should take this estimate with a grain of salt. This low confidence\
                     is most likely due to a very small number of contigs to offer reliable estimates\
                     of domain." % (best_matching_domain, domain_matching_confidence)
    else:
        # the redundancy can not be None here: it is set to None only when the probability
        # of blank is above 0.5, and that case is handled by the first branch.
        if best_matching_domain_redundancy < 10:
            info_text = "GREAT DOMAIN EST. YOU'RE GOLDEN. The very high confidence of random forest indicates that this set of contigs\
                         are almost certainly coming from a population that belongs to domain %s. IN ADDITION, the low redundancy of SCGs\
                         do not predict any serious contamination. But please remember that this information does not mean there is NO\
                         contamination in your genome bin." \
                                                                % (best_matching_domain)
        elif 10 <= best_matching_domain_redundancy <= 100:
            # inclusive on both ends so that exactly 10 or 100 do not end up in the
            # 'weird' branch
            info_text = "GREAT DOMAIN EST. YOU'RE GOLDEN. The very high confidence of random forest indicates that this set of contigs\
                         are almost certainly coming from a population that belongs to domain %s. BUT you almost certainly are looking at\
                         a composite genome. You should consider refining this particular collection of contigs to lower the redundancy." \
                                                                % (best_matching_domain)
        elif best_matching_domain_redundancy > 100:
            info_text = "GREAT DOMAIN EST. YOU'RE GOLDEN. The very high confidence of random forest indicates that the very large fraction\
                         of this set of contigs are almost certainly coming from populations that belong to the domain %s. HOWEVER, the\
                         extremely high amount of redundancy of SCGs in domain %s suggests that you either are working with a set of contigs\
                         that are extremely composite, or you are looking basically an entire metagenome or something." \
                                                                % (best_matching_domain, best_matching_domain)
        else:
            # none of the comparisons above held for the redundancy value
            info_text = "GREAT DOMAIN EST. YOU'RE GOLDEN. But your redundancy estimates are weird :("

    if anvio.DEBUG:
        self.run.warning(info_text)

    return (best_matching_domain, domain_matching_confidence, remove_spaces(info_text))