Esempio n. 1
0
    def __init__(self, args, run=terminal.Run(), progress=terminal.Progress()):
        """Validate mode-specific arguments and collect SCG sources and features.

        This method reads `self.mode` but never assigns it, so the attribute
        must already exist on the instance when this runs (presumably set by
        a subclass before delegating here -- confirm against callers).

        In 'train' mode it stores absolute paths in `self.genomes_dir` and
        `self.classifier_output_path`. In 'predict' mode it resolves
        `self.input_classifier_path` (user-supplied or the default shipped
        under the `anvio.data` package directory), builds `self.rf`, and
        calls its `initialize_classifier()`.

        In both modes it then fills `self.SCG_sources`, `self.SCG_domains`,
        `self.SCG_domain_to_source`, and `self.features` from
        `hmm_data.sources`, and resets `self.data` / `self.labels` to empty
        lists.

        Args:
            args: Namespace-like object whose `__dict__` may carry
                `genomes_dir`, `output`, and `classifier`. May be None, in
                which case every lookup yields None.
            run: Reporter used for `run.info(...)` messages. The default
                instance is created once, when the function is defined.
            progress: Progress reporter handed to `RF`. The default instance
                is likewise created once at definition time.

        Raises:
            ConfigError: If an argument is supplied that does not belong to
                the current mode, a required train-mode argument is missing,
                the default classifier is absent, `self.mode` is neither
                'train' nor 'predict', or the SCG sources found are unusable
                (none, only one, or more than one per domain).
        """
        self.run = run
        self.progress = progress

        # Attributes absent from `args` (or `args` being falsy) resolve to None.
        A = lambda x: args.__dict__.get(x) if args else None

        if self.mode == 'train':
            if A('classifier'):
                raise ConfigError(
                    "You should not initialize the domain training class with a input classifier path (`args.classifier`)."
                )

            genomes_dir = A('genomes_dir')
            output_path = A('output')

            # Validate the raw values BEFORE os.path.abspath(): abspath(None)
            # raises TypeError and abspath('') resolves to the current working
            # directory, so a check done afterwards can never fire.
            if not genomes_dir:
                raise ConfigError(
                    "You must provide a genomes directory. Please read the help menu if you are not sure\
                                   how the contents of this directory should look like."
                )

            if not output_path:
                raise ConfigError(
                    "You must provide an output file path to store the classifier (`args.output`)."
                )

            self.genomes_dir = os.path.abspath(genomes_dir)
            self.classifier_output_path = os.path.abspath(output_path)

            filesnpaths.is_output_file_writable(self.classifier_output_path)
            filesnpaths.is_file_exists(self.genomes_dir)

        elif self.mode == 'predict':
            if A('output'):
                raise ConfigError(
                    "You should not initialize the domain prediction class with an output classifier path (`args.output`)."
                )

            default_classifier_path = 'misc/SCGDOMAINCLASSIFIER.rf'
            user_classifier_path = A('classifier')
            self.input_classifier_path = user_classifier_path or os.path.join(
                os.path.dirname(anvio.data.__file__), default_classifier_path)

            if user_classifier_path:
                # A user-supplied path is checked with the helper's default
                # (raising) behavior.
                filesnpaths.is_file_exists(self.input_classifier_path)
            elif not filesnpaths.is_file_exists(self.input_classifier_path,
                                                dont_raise=True):
                # The default classifier is missing: report it with a message
                # aimed at the installation rather than at the user's input.
                raise ConfigError(
                    "Somehow, this anvi'o installation dose not seem to have a SCG domain classifier. This is one of\
                                       those anvi'o things that should never happen. If you are an anvi'o user, please feel free to panic :(\
                                       If you are an anvi'o developer, what you need to do is to follow the instructions in \
                                       `anvi-script-gen-scg-domain-classifier` with a reasonable set of genomes and store the resulting\
                                       classifier at the default anvi'o path of /blah/blah/anvio/data/%s."
                    % (default_classifier_path))

            self.rf = RF(self.input_classifier_path,
                         r=self.run,
                         p=self.progress)
            self.rf.initialize_classifier()

        else:
            raise ConfigError(
                "Someone initialized the SCG domain classifier class without an explicit mode :("
            )

        # Only HMM sources flagged as 'singlecopy' take part in classification.
        self.SCG_sources = [
            source for source in hmm_data.sources
            if hmm_data.sources[source]['kind'] == 'singlecopy'
        ]
        self.SCG_domains = sorted(hmm_data.sources[source]['domain']
                                  for source in self.SCG_sources)
        self.SCG_domain_to_source = {
            hmm_data.sources[source]['domain']: source
            for source in self.SCG_sources
        }

        if not self.SCG_sources:
            raise ConfigError(
                "There is something wrong :( There is not even a single SCG source found. Usually\
                               anvi'o comes with multiple of them :/")

        if len(self.SCG_sources) == 1:
            raise ConfigError(
                "There is only a single SCG source in your anvi'o installation. It is OK if you are\
                               being a hacker and playing with things, but there is no logic behind creating a\
                               classifier with a single class.")

        # Duplicate domain names mean two sources claim the same domain.
        if len(self.SCG_domains) != len(set(self.SCG_domains)):
            raise ConfigError(
                "Something is wrong. For each domain, there must be a single sinlge-copy core gene\
                               source.")

        self.data, self.labels, self.features = [], [], []

        # Features are the gene names of every SCG source, taken domain by
        # domain in sorted domain order and sorted within each domain.
        for domain in self.SCG_domains:
            self.features.extend(
                sorted(hmm_data.sources[self.SCG_domain_to_source[domain]]
                       ['genes']))

        self.run.info('SCG domain classifier mode', self.mode)
        self.run.info("SCG domains found", ', '.join(self.SCG_domains))
        self.run.info("Num features", len(self.features))
Esempio n. 2
0
    def __init__(self, args, run=terminal.Run(), progress=terminal.Progress()):
        """Check the arguments for the current mode and gather SCG sources and features.

        `self.mode` is read here but not assigned, so it has to be present on
        the instance beforehand (presumably set by a subclass -- confirm
        against callers).

        'train' mode stores absolute paths in `self.genomes_dir` and
        `self.classifier_output_path`. 'predict' mode resolves
        `self.input_classifier_path` (from `args.classifier`, or the default
        located under the `anvio.data` package directory), creates `self.rf`,
        and calls its `initialize_classifier()`.

        Afterwards, for both modes, `self.SCG_sources`, `self.SCG_domains`,
        `self.SCG_domain_to_source`, and `self.features` are derived from
        `hmm_data.sources`; `self.data` and `self.labels` start out empty.

        Args:
            args: Namespace-like object; `genomes_dir`, `output`, and
                `classifier` are looked up in its `__dict__`. May be None,
                in which case all lookups yield None.
            run: Reporter used for the `run.info(...)` messages. The default
                instance is built once, at function definition time.
            progress: Progress reporter passed on to `RF`. The default
                instance is also built once at definition time.

        Raises:
            ConfigError: When an argument that belongs to the other mode is
                given, a required train-mode argument is missing, the default
                classifier file is not found, the mode is unknown, or the SCG
                sources are unusable (zero, exactly one, or duplicated
                domains).
        """
        self.run = run
        self.progress = progress

        # Missing attributes (or a falsy `args`) resolve to None.
        A = lambda x: args.__dict__.get(x) if args else None

        if self.mode == 'train':
            if A('classifier'):
                raise ConfigError("You should not initialize the domain training class with a input classifier path (`args.classifier`).")

            genomes_dir = A('genomes_dir')
            output_path = A('output')

            # These checks must precede os.path.abspath(): abspath(None) raises
            # TypeError and abspath('') turns into the current working
            # directory, which would make a later emptiness check unreachable.
            if not genomes_dir:
                raise ConfigError("You must provide a genomes directory. Please read the help menu if you are not sure\
                                   how the contents of this directory should look like.")

            if not output_path:
                raise ConfigError("You must provide an output file path to store the classifier (`args.output`).")

            self.genomes_dir = os.path.abspath(genomes_dir)
            self.classifier_output_path = os.path.abspath(output_path)

            filesnpaths.is_output_file_writable(self.classifier_output_path)
            filesnpaths.is_file_exists(self.genomes_dir)

        elif self.mode == 'predict':
            if A('output'):
                raise ConfigError("You should not initialize the domain prediction class with an output classifier path (`args.output`).")

            default_classifier_path = 'misc/SCGDOMAINCLASSIFIER.rf'
            user_classifier_path = A('classifier')
            self.input_classifier_path = user_classifier_path or os.path.join(os.path.dirname(anvio.data.__file__), default_classifier_path)

            if user_classifier_path:
                # User-supplied path: rely on the helper's default (raising) behavior.
                filesnpaths.is_file_exists(self.input_classifier_path)
            elif not filesnpaths.is_file_exists(self.input_classifier_path, dont_raise=True):
                # Default classifier missing: explain it as an installation problem.
                raise ConfigError("Somehow, this anvi'o installation dose not seem to have a SCG domain classifier. This is one of\
                                       those anvi'o things that should never happen. If you are an anvi'o user, please feel free to panic :(\
                                       If you are an anvi'o developer, what you need to do is to follow the instructions in \
                                       `anvi-script-gen-scg-domain-classifier` with a reasonable set of genomes and store the resulting\
                                       classifier at the default anvi'o path of /blah/blah/anvio/data/%s." % (default_classifier_path))

            self.rf = RF(self.input_classifier_path, r=self.run, p=self.progress)
            self.rf.initialize_classifier()

        else:
            raise ConfigError("Someone initialized the SCG domain classifier class without an explicit mode :(")

        # Only HMM sources whose 'kind' is 'singlecopy' are considered.
        self.SCG_sources = [source for source in hmm_data.sources if hmm_data.sources[source]['kind'] == 'singlecopy']
        self.SCG_domains = sorted(hmm_data.sources[source]['domain'] for source in self.SCG_sources)
        self.SCG_domain_to_source = {hmm_data.sources[source]['domain']: source for source in self.SCG_sources}

        if not self.SCG_sources:
            raise ConfigError("There is something wrong :( There is not even a single SCG source found. Usually\
                               anvi'o comes with multiple of them :/")

        if len(self.SCG_sources) == 1:
            raise ConfigError("There is only a single SCG source in your anvi'o installation. It is OK if you are\
                               being a hacker and playing with things, but there is no logic behind creating a\
                               classifier with a single class.")

        # A repeated domain name means more than one source maps to that domain.
        if len(self.SCG_domains) != len(set(self.SCG_domains)):
            raise ConfigError("Something is wrong. For each domain, there must be a single sinlge-copy core gene\
                               source.")

        self.data, self.labels, self.features = [], [], []

        # Feature order: domains in sorted order, gene names sorted within each domain.
        for domain in self.SCG_domains:
            self.features.extend(sorted(hmm_data.sources[self.SCG_domain_to_source[domain]]['genes']))

        self.run.info('SCG domain classifier mode', self.mode)
        self.run.info("SCG domains found", ', '.join(self.SCG_domains))
        self.run.info("Num features", len(self.features))
Esempio n. 3
0
 def train(self):
     """Build an `RF` for the classifier output path and train it.

     The `RF` object receives `self.classifier_output_path` together with
     this instance's `run` and `progress` reporters, and its `train()` is
     called with `self.features`, `self.data`, and `self.labels`. Nothing
     is returned; whatever `RF.train()` returns is discarded.
     """
     RF(
         self.classifier_output_path,
         r=self.run,
         p=self.progress,
     ).train(self.features, self.data, self.labels)
Esempio n. 4
0
 def train(self):
     """Train a classifier from the features, data, and labels on this instance.

     An `RF` object is created with `self.classifier_output_path` and the
     instance's `run` / `progress` reporters, then its `train()` method is
     called. This method returns None.
     """
     reporters = {'r': self.run, 'p': self.progress}
     forest = RF(self.classifier_output_path, **reporters)

     # Presumably RF.train() stores the result at the output path given
     # above -- verify against the RF implementation.
     forest.train(self.features, self.data, self.labels)