def _stop_training(self):
        """ Do the optimization step and define final parameter choice
        
        This is the main method of this node!
        
        .. todo:: Allow also parallelization over nominal_ranges! 
        """
        self._log("Starting optimization Process.")
        self.runs = [10 * self.run_number + run for run in range(self.runs)]
        original_flow_template = copy.copy(self.flow_template)
        # Fill in validation parameters in the template
        self.flow_template = NodeChainFactory.replace_parameters_in_node_chain(
            original_flow_template, self.validation_parameter_settings)
        if self.nom_rng is None:
            self.prepare_optimization()
            self.best_parametrization, self.best_performance = \
                self.get_best_parametrization()
            self.performance_dict[self.p2key(self.best_parametrization)] = \
                (self.best_performance, self.best_parametrization)
        else:
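            # expand the nominal ranges into a grid of concrete parameter
            # settings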
            nom_grid = self.search_grid(self.nom_rng)
            iterations = 0
            search_history = []
            # copy flow_template since it has to be re-instantiated for
            # every nom_par
            flow_template = copy.copy(self.flow_template)
            for nom_par in nom_grid:
                # for getting the best parameterization,
                # the class attribute flow_template must be overwritten
                self.flow_template = \
                    NodeChainFactory.replace_parameters_in_node_chain(
                        flow_template, nom_par)
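                # run the actual optimization for this nominal setting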
                self.prepare_optimization()
                parametrization, performance = self.get_best_parametrization()
                self.performance_dict[self.p2key(nom_par)] = (performance, 
                                                              parametrization)
                iterations += self.iterations
                search_history.append((nom_par, self.search_history))
                # reinitialize optimization parameters
                self.re_init()
            # restore the overwritten flow template for further usage
            self.flow_template = flow_template
            self.iterations = iterations
            self.search_history = sorted(
                search_history, key=lambda t: t[1][-1]["best_performance"])
            best_key = max(sorted(self.performance_dict.items()),
                           key=lambda t: t[1])[0]
            self.best_performance, self.best_parametrization = \
                self.performance_dict[best_key]
            self.best_parametrization.update(dict(best_key))
        # once the best parameter dict has been determined, it is logged and
        # the chosen parameters are used for training on the whole data set,
        # independent of the chosen optimization algorithm
        self._log("Using parameterization %s with optimal performance %s for " \
                  "metric %s." % (self.best_parametrization,
                                  self.best_performance, self.metric))
        # Fill in the final parameters in the flow template
        self.flow_template = NodeChainFactory.replace_parameters_in_node_chain(
            original_flow_template, self.final_training_parameter_settings)
        best_flow_template = self.flow_template
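        # replace the splitter node so the final flow trains on all data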
        best_flow_template[1] = {'node': 'All_Train_Splitter'}
        # delete the last node; it is not needed for the final training flow
        best_flow_template.pop(-1)
        self.flow = self.generate_subflow(best_flow_template, 
                                          self.best_parametrization, NodeChain)
        self.flow[-1].set_run_number(self.run_number)
        self.flow[0].set_generator(self.train_instances)
        self.flow.train()
        self._log("Training of optimal flow finished")

        # delete training instances that would be stored to disk if this node
        # is saved
        del self.train_instances
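
For reference, here is a minimal standalone sketch of the selection logic used above. It assumes p2key turns a parameter dict into a hashable, sorted tuple of items; the names and values are hypothetical and only illustrate how performance_dict, best_key, and the final update fit together (the real pySPACE helpers may differ).

def p2key(parameters):
    """Turn a parameter dict into a hashable, deterministic key."""
    return tuple(sorted(parameters.items()))

performance_dict = {}
# hypothetical results for two nominal parameter settings
performance_dict[p2key({"kernel": "linear"})] = (0.81, {"C": 1.0})
performance_dict[p2key({"kernel": "rbf"})] = (0.87, {"C": 10.0})

# pre-sorting the items keeps the choice deterministic, as in the node above
best_key = max(sorted(performance_dict.items()), key=lambda t: t[1])[0]
best_performance, best_parametrization = performance_dict[best_key]
# merge the winning nominal parameters into the numeric ones
best_parametrization.update(dict(best_key))
print(best_parametrization, best_performance)
# -> {'C': 10.0, 'kernel': 'rbf'} 0.87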