def __init__(self, args, run=terminal.Run(), progress=terminal.Progress()):
    """Configure the SCG domain classifier for either training or prediction.

    The mode is taken from `self.mode`, which this method never assigns; it has
    to be set to 'train' or 'predict' before this initializer runs.

    Args:
        args: Namespace-like object whose `__dict__` is consulted for the keys
            `genomes_dir`, `output` and `classifier`. A missing key, or a falsy
            `args`, reads as None.
        run: Object stored as `self.run`; its `info()` method is used for
            reporting and it is handed to `RF` in predict mode.
        progress: Object stored as `self.progress` and handed to `RF` in
            predict mode.

    Raises:
        ConfigError: If the mode is missing or unknown; if `classifier` is given
            in train mode or `output` is given in predict mode; if the genomes
            directory or the output path is missing in train mode; if the default
            classifier file is not found in predict mode; or if the set of
            single-copy core gene sources is empty, has a single member, or
            contains two sources for the same domain.

    NOTE(review): the `filesnpaths` helpers called here presumably raise their
    own errors for unwritable or missing paths -- confirm against `filesnpaths`.
    """
    # NOTE: both defaults are evaluated once, when the function is defined, so
    # callers that do not pass their own objects share the same two instances.
    self.run = run
    self.progress = progress

    def A(x):
        # A missing key and a missing/empty `args` both read as None.
        return args.__dict__.get(x) if args else None

    # Fall back to None so that an unset mode reaches the ConfigError in the
    # final `else` below rather than dying with an AttributeError.
    mode = getattr(self, 'mode', None)

    if mode == 'train':
        if A('classifier'):
            raise ConfigError("You should not initialize the domain training class with a input classifier path (`args.classifier`).")

        # Validate the raw values BEFORE normalising them: os.path.abspath(None)
        # raises TypeError, and os.path.abspath('') silently turns into the
        # current working directory, so a check made afterwards never fires.
        genomes_dir = A('genomes_dir')
        if not genomes_dir:
            raise ConfigError("You must provide a genomes directory. Please read the help menu if you are not sure "
                              "how the contents of this directory should look like.")

        output_path = A('output')
        if not output_path:
            raise ConfigError("You must provide an output file path for the classifier (`args.output`).")

        self.genomes_dir = os.path.abspath(genomes_dir)
        self.classifier_output_path = os.path.abspath(output_path)

        filesnpaths.is_output_file_writable(self.classifier_output_path)
        filesnpaths.is_file_exists(self.genomes_dir)
    elif mode == 'predict':
        if A('output'):
            raise ConfigError("You should not initialize the domain prediction class with an output classifier path (`args.output`).")

        default_classifier_path = 'misc/SCGDOMAINCLASSIFIER.rf'
        user_classifier_path = A('classifier')

        if user_classifier_path:
            # An explicitly requested classifier is checked with the helper's
            # default behaviour (no `dont_raise`).
            self.input_classifier_path = user_classifier_path
            filesnpaths.is_file_exists(self.input_classifier_path)
        else:
            # No classifier requested: use the one located relative to the
            # `anvio.data` package, and explain how to generate it if absent.
            self.input_classifier_path = os.path.join(os.path.dirname(anvio.data.__file__), default_classifier_path)

            if not filesnpaths.is_file_exists(self.input_classifier_path, dont_raise=True):
                raise ConfigError(("Somehow, this anvi'o installation dose not seem to have a SCG domain classifier. This is one of "
                                   "those anvi'o things that should never happen. If you are an anvi'o user, please feel free to panic :( "
                                   "If you are an anvi'o developer, what you need to do is to follow the instructions in "
                                   "`anvi-script-gen-scg-domain-classifier` with a reasonable set of genomes and store the resulting "
                                   "classifier at the default anvi'o path of /blah/blah/anvio/data/%s.") % (default_classifier_path))

        self.rf = RF(self.input_classifier_path, r=self.run, p=self.progress)
        self.rf.initialize_classifier()
    else:
        raise ConfigError("Someone initialized the SCG domain classifier class without an explicit mode :(")

    # Every HMM source whose kind is 'singlecopy', the sorted list of their
    # domains, and a domain -> source lookup.
    self.SCG_sources = [source for source in hmm_data.sources if hmm_data.sources[source]['kind'] == 'singlecopy']
    self.SCG_domains = sorted(hmm_data.sources[source]['domain'] for source in self.SCG_sources)
    self.SCG_domain_to_source = {hmm_data.sources[source]['domain']: source for source in self.SCG_sources}

    if not self.SCG_sources:
        raise ConfigError("There is something wrong :( There is not even a single SCG source found. Usually "
                          "anvi'o comes with multiple of them :/")

    if len(self.SCG_sources) == 1:
        raise ConfigError("There is only a single SCG source in your anvi'o installation. It is OK if you are "
                          "being a hacker and playing with things, but there is no logic behind creating a "
                          "classifier with a single class.")

    # Two sources sharing a domain would collide in SCG_domain_to_source.
    if len(self.SCG_domains) != len(set(self.SCG_domains)):
        raise ConfigError("Something is wrong. For each domain, there must be a single sinlge-copy core gene "
                          "source.")

    self.data, self.labels, self.features = [], [], []

    # Features are the gene names of each source, sorted within a domain and
    # concatenated in sorted-domain order.
    for domain in self.SCG_domains:
        self.features.extend(sorted(hmm_data.sources[self.SCG_domain_to_source[domain]]['genes']))

    self.run.info('SCG domain classifier mode', self.mode)
    self.run.info("SCG domains found", ', '.join(self.SCG_domains))
    self.run.info("Num features", len(self.features))
def __init__(self, args, run=terminal.Run(), progress=terminal.Progress()):
    """Configure the SCG domain classifier for either training or prediction.

    The mode is taken from `self.mode`, which this method never assigns; it has
    to be set to 'train' or 'predict' before this initializer runs.

    Args:
        args: Namespace-like object whose `__dict__` is consulted for the keys
            `genomes_dir`, `output` and `classifier`. A missing key, or a falsy
            `args`, reads as None.
        run: Object stored as `self.run`; its `info()` method is used for
            reporting and it is handed to `RF` in predict mode.
        progress: Object stored as `self.progress` and handed to `RF` in
            predict mode.

    Raises:
        ConfigError: If the mode is missing or unknown; if `classifier` is given
            in train mode or `output` is given in predict mode; if the genomes
            directory or the output path is missing in train mode; if the default
            classifier file is not found in predict mode; or if the set of
            single-copy core gene sources is empty, has a single member, or
            contains two sources for the same domain.

    NOTE(review): the `filesnpaths` helpers called here presumably raise their
    own errors for unwritable or missing paths -- confirm against `filesnpaths`.
    """
    # NOTE: both defaults are evaluated once, when the function is defined, so
    # callers that do not pass their own objects share the same two instances.
    self.run = run
    self.progress = progress

    def A(x):
        # A missing key and a missing/empty `args` both read as None.
        return args.__dict__.get(x) if args else None

    # Fall back to None so that an unset mode reaches the ConfigError in the
    # final `else` below rather than dying with an AttributeError.
    mode = getattr(self, 'mode', None)

    if mode == 'train':
        if A('classifier'):
            raise ConfigError("You should not initialize the domain training class with a input classifier path (`args.classifier`).")

        # Validate the raw values BEFORE normalising them: os.path.abspath(None)
        # raises TypeError, and os.path.abspath('') silently turns into the
        # current working directory, so a check made afterwards never fires.
        genomes_dir = A('genomes_dir')
        if not genomes_dir:
            raise ConfigError("You must provide a genomes directory. Please read the help menu if you are not sure "
                              "how the contents of this directory should look like.")

        output_path = A('output')
        if not output_path:
            raise ConfigError("You must provide an output file path for the classifier (`args.output`).")

        self.genomes_dir = os.path.abspath(genomes_dir)
        self.classifier_output_path = os.path.abspath(output_path)

        filesnpaths.is_output_file_writable(self.classifier_output_path)
        filesnpaths.is_file_exists(self.genomes_dir)
    elif mode == 'predict':
        if A('output'):
            raise ConfigError("You should not initialize the domain prediction class with an output classifier path (`args.output`).")

        default_classifier_path = 'misc/SCGDOMAINCLASSIFIER.rf'
        user_classifier_path = A('classifier')

        if user_classifier_path:
            # An explicitly requested classifier is checked with the helper's
            # default behaviour (no `dont_raise`).
            self.input_classifier_path = user_classifier_path
            filesnpaths.is_file_exists(self.input_classifier_path)
        else:
            # No classifier requested: use the one located relative to the
            # `anvio.data` package, and explain how to generate it if absent.
            self.input_classifier_path = os.path.join(os.path.dirname(anvio.data.__file__), default_classifier_path)

            if not filesnpaths.is_file_exists(self.input_classifier_path, dont_raise=True):
                raise ConfigError(("Somehow, this anvi'o installation dose not seem to have a SCG domain classifier. This is one of "
                                   "those anvi'o things that should never happen. If you are an anvi'o user, please feel free to panic :( "
                                   "If you are an anvi'o developer, what you need to do is to follow the instructions in "
                                   "`anvi-script-gen-scg-domain-classifier` with a reasonable set of genomes and store the resulting "
                                   "classifier at the default anvi'o path of /blah/blah/anvio/data/%s.") % (default_classifier_path))

        self.rf = RF(self.input_classifier_path, r=self.run, p=self.progress)
        self.rf.initialize_classifier()
    else:
        raise ConfigError("Someone initialized the SCG domain classifier class without an explicit mode :(")

    # Every HMM source whose kind is 'singlecopy', the sorted list of their
    # domains, and a domain -> source lookup.
    self.SCG_sources = [source for source in hmm_data.sources if hmm_data.sources[source]['kind'] == 'singlecopy']
    self.SCG_domains = sorted(hmm_data.sources[source]['domain'] for source in self.SCG_sources)
    self.SCG_domain_to_source = {hmm_data.sources[source]['domain']: source for source in self.SCG_sources}

    if not self.SCG_sources:
        raise ConfigError("There is something wrong :( There is not even a single SCG source found. Usually "
                          "anvi'o comes with multiple of them :/")

    if len(self.SCG_sources) == 1:
        raise ConfigError("There is only a single SCG source in your anvi'o installation. It is OK if you are "
                          "being a hacker and playing with things, but there is no logic behind creating a "
                          "classifier with a single class.")

    # Two sources sharing a domain would collide in SCG_domain_to_source.
    if len(self.SCG_domains) != len(set(self.SCG_domains)):
        raise ConfigError("Something is wrong. For each domain, there must be a single sinlge-copy core gene "
                          "source.")

    self.data, self.labels, self.features = [], [], []

    # Features are the gene names of each source, sorted within a domain and
    # concatenated in sorted-domain order.
    for domain in self.SCG_domains:
        self.features.extend(sorted(hmm_data.sources[self.SCG_domain_to_source[domain]]['genes']))

    self.run.info('SCG domain classifier mode', self.mode)
    self.run.info("SCG domains found", ', '.join(self.SCG_domains))
    self.run.info("Num features", len(self.features))
def train(self):
    """Create an `RF` for the classifier output path and train it.

    The `RF` object is constructed with `self.classifier_output_path` plus the
    instance's `run` and `progress` objects, and its `train()` method is then
    called with `self.features`, `self.data` and `self.labels`, in that order.
    Nothing is returned and the `RF` object is not kept on the instance.

    NOTE(review): presumably `RF.train()` stores the trained classifier at the
    output path -- confirm against the `RF` implementation.
    """
    RF(
        self.classifier_output_path,
        r=self.run,
        p=self.progress,
    ).train(
        self.features,
        self.data,
        self.labels,
    )