Example #1
    def si(self) -> FeatureEngineer:
        """
        Run swarm intelligence for optimizing semi-continuous, continuous and (one-hot-encoded) categorical feature engineering

        :return: FeatureEngineer
            FeatureEngineer object containing the whole feature engineering, metadata included
        """
        if self.engineer_continuous:
            if self.generate_continuous:
                self._generate_continuous_features()
            if len(self.feature_engineer.get_predictors()) >= 4:
                if self.train_continuous_critic:
                    self._feature_critic()
                self._pre_define_max_features(feature_type='continuous', scale=True if self.user_defined_max_features <= 1 else False)
                self._evolve_feature_learning_ai(feature_type='continuous', evolutionary_algorithm='si')
                self._feature_learning(feature_type='continuous', evolutionary_algorithm='si')
            else:
                Log(write=False, env='dev').log(msg='Not enough continuous or ordinal features to efficiently run reinforcement feature learning framework')
        if self.engineer_categorical:
            if self.generate_categorical:
                self._generate_categorical_features()
            if self.save_temp_data:
                if self.output_path is None:
                    Log(write=False, level='info').log(msg='No output path found for writing temporary data for applying one-hot merging')
                else:
                    self.feature_engineer.save(file_path='{}feature_learning.p'.format(self.output_path),
                                               cls_obj=True,
                                               overwrite=True,
                                               create_dir=False
                                               )
                    del self.feature_engineer
                    self.feature_engineer: FeatureEngineer = FeatureEngineer(feature_engineer_file_path='{}feature_learning.p'.format(self.output_path))
                    _continuous_features: List[str] = self.feature_engineer.get_feature_types().get('continuous')
                    self.feature_engineer.clean(markers=dict(features=_continuous_features))
                    _remaining_non_categorical_features: List[str] = self.feature_engineer.get_feature_types().get('date')  # + self.feature_engineer.get_feature_types().get('text')
                    self.feature_engineer.clean(markers=dict(features=_remaining_non_categorical_features))
                    if len(self.feature_engineer.get_predictors()) >= 4:
                        if self.train_categorical_critic:
                            self._feature_critic()
                        self._pre_define_max_features(feature_type='categorical', scale=True if self.user_defined_max_features <= 1 else False)
                        self._evolve_feature_learning_ai(feature_type='categorical', evolutionary_algorithm='si')
                        self._feature_learning(feature_type='categorical', evolutionary_algorithm='si')
                        self.feature_engineer.merge_engineer(feature_engineer_file_path='{}feature_learning.p'.format(self.output_path))
                    else:
                        Log(write=False, env='dev').log(msg='Not enough categorical features to efficiently run reinforcement feature learning framework')
            else:
                if len(self.feature_engineer.get_predictors()) >= 4:
                    if self.train_categorical_critic:
                        self._feature_critic()
                    self._pre_define_max_features(feature_type='categorical',
                                                  scale=True if self.user_defined_max_features <= 1 else False)
                    self._evolve_feature_learning_ai(feature_type='categorical', evolutionary_algorithm='si')
                    self._feature_learning(feature_type='categorical', evolutionary_algorithm='si')
                else:
                    Log(write=False, env='dev').log(msg='Not enough categorical features to efficiently run reinforcement feature learning framework')
        if len(self.evolved_features) > 0:
            self.feature_engineer.set_predictors(features=list(set(self.evolved_features)), exclude_original_data=False)
        return self.feature_engineer
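
The si() method above runs two independent optimization passes: one over the (semi-)continuous predictors and one over the one-hot-encoded categorical predictors. As a rough illustration of that split (a minimal pandas sketch, not taken from the library; the column names are made up):

import pandas as pd

# toy frame with continuous and nominal columns (illustrative only)
df = pd.DataFrame({
    'age': [23, 45, 31, 52],
    'income': [40.5, 82.3, 55.0, 91.2],
    'segment': ['a', 'b', 'a', 'c']
})

continuous = df.select_dtypes(include='number')    # -> continuous branch
categorical = df.select_dtypes(exclude='number')
one_hot = pd.get_dummies(categorical)              # -> categorical (one-hot) branch

print(continuous.columns.tolist())   # ['age', 'income']
print(one_hot.columns.tolist())      # ['segment_a', 'segment_b', 'segment_c']
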
Example #2
    def correlation(self,
                    meth: str = 'pearson',
                    min_obs: int = 1) -> pd.DataFrame:
        """

        Calculate correlation coefficients

        :param meth: String containing the method to be used as correlation coefficient
                        -> pearson: Marginal Correlation based on Pearson's r
                        -> kendall: Rank Correlation based on Kendall
                        -> spearman: Rank Correlation based on Spearman
                        -> partial: Partial Correlation
        :param min_obs: Integer indicating the minimum number of valid observations
        :return: Pandas DataFrame containing the correlation matrix
        """
        if meth in ['pearson', 'kendall', 'spearman']:
            _cor: pd.DataFrame = self.data_set[self.features].corr(
                method=meth, min_periods=min_obs)
        elif meth == 'partial':
            if self.data_set.shape[0] - self.data_set.isnull().astype(
                    dtype=int).sum().sum() > 0:
                # treat the features (columns) as variables when estimating the covariance matrix
                _cov: np.ndarray = np.cov(
                    m=self.data_set[self.features].dropna(), rowvar=False)
                try:
                    assert np.linalg.det(_cov) > np.finfo(np.float32).eps
                    _inv_var_cov: np.ndarray = np.linalg.inv(_cov)
                except AssertionError:
                    _inv_var_cov: np.ndarray = np.linalg.pinv(_cov)
                    #warnings.warn('The inverse of the variance-covariance matrix '
                    #              'was calculated using the Moore-Penrose generalized '
                    #              'matrix inversion, due to its determinant being at '
                    #              'or very close to zero.')
                _std: np.ndarray = np.sqrt(np.diag(_inv_var_cov))
                _cov2cor: np.ndarray = _inv_var_cov / np.outer(_std, _std)
                _cor: pd.DataFrame = pd.DataFrame(
                    data=np.nan_to_num(x=_cov2cor, copy=True) * -1,
                    columns=self.features,
                    index=self.features)
            else:
                _cor: pd.DataFrame = pd.DataFrame()
                Log(write=False, level='info').log(
                    msg='Cannot calculate partial correlation coefficients because of the high missing data rate'
                )
        else:
            raise HappyLearningUtilsException(
                'Method for calculating correlation coefficient ({}) not supported'
                .format(meth))
        return _cor
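
In the 'partial' branch above, the _cov2cor step turns the inverse of the variance-covariance matrix (the precision matrix) into partial correlation coefficients: each entry is normalised by the square roots of the corresponding diagonal elements and the sign of the off-diagonal entries is flipped. A self-contained NumPy sketch of that conversion on random data (illustrative only, not part of the class):

import numpy as np

rng = np.random.default_rng(42)
x = rng.normal(size=(200, 3))               # 200 observations, 3 features

cov = np.cov(x, rowvar=False)               # features as variables
precision = np.linalg.inv(cov)              # fall back to np.linalg.pinv if singular
d = np.sqrt(np.diag(precision))
partial_corr = -precision / np.outer(d, d)  # rho_ij = -P_ij / sqrt(P_ii * P_jj)
np.fill_diagonal(partial_corr, 1.0)         # diagonal is 1 by convention

print(np.round(partial_corr, 3))
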
Example #3
    def _feature_learning(self, feature_type: str, evolutionary_algorithm: str):
        """
        Run reinforcement feature learning based on feature types (categorical or continuous)

        :param feature_type: str
            Name of the feature type to engineer
                -> continuous: (semi-) continuous features
                -> categorical: categorical (nominal) features

        :param evolutionary_algorithm: str
            Name of the reinforced evolutionary algorithm
                -> ga: Genetic Algorithm
                -> si: Swarm Intelligence
        """
        Log(write=False, level='error').log(msg='Start feature engineering using {} features'.format('continuous original' if feature_type == 'continuous' else 'categorical one-hot-encoded'))
        if self.kwargs.get('mutation_prob') is None:
            self.kwargs.update(dict(mutation_prob=0.5))
        if self.kwargs.get('adjustment_prob') is None:
            self.kwargs.update(dict(adjustment_prob=0.5))
        if self.kwargs.get('max_generations') is None:
            self.kwargs.update(dict(max_generations=5))
        if self.kwargs.get('max_adjustments') is None:
            self.kwargs.update(dict(max_adjustments=5))
        if self.kwargs.get('parents_ratio') is None:
            self.kwargs.update(dict(parents_ratio=0.5))
        if evolutionary_algorithm == 'ga':
            _feature_learning_evolution: GeneticAlgorithm = GeneticAlgorithm(mode='feature_engineer',
                                                                             feature_engineer=self.feature_engineer,
                                                                             df=self.feature_engineer.get_data(),
                                                                             target=self.feature_engineer.get_target(),
                                                                             features=self.feature_engineer.get_predictors(),
                                                                             re_split_data=False if self.kwargs.get('re_split_data') is None else self.kwargs.get('re_split_data'),
                                                                             re_sample_cases=False if self.kwargs.get('re_sample_cases') is None else self.kwargs.get('re_sample_cases'),
                                                                             re_sample_features=False,
                                                                             max_features=self.max_features,
                                                                             labels=self.kwargs.get('labels'),
                                                                             models=[self.categorical_learning.get('model_name')] if feature_type == 'categorical' else [self.continuous_learning.get('model_name')],
                                                                             model_params=self.categorical_learning.get('param') if feature_type == 'categorical' else self.continuous_learning.get('param'),
                                                                             burn_in_generations=-1,
                                                                             warm_start=False,
                                                                             max_generations=self.kwargs.get('max_generations'),
                                                                             pop_size=64 if self.kwargs.get('pop_size') is None else self.kwargs.get('pop_size'),
                                                                             mutation_rate=0.1,
                                                                             mutation_prob=self.kwargs.get('mutation_prob'),
                                                                             parents_ratio=self.kwargs.get('parents_ratio'),
                                                                             early_stopping=0,
                                                                             convergence=False,
                                                                             timer_in_seconds=43200 if self.kwargs.get('timer_in_secondes') is None else self.kwargs.get('timer_in_secondes'),
                                                                             force_target_type=self.force_target_type,
                                                                             plot=False if self.kwargs.get('plot') is None else self.kwargs.get('plot'),
                                                                             output_file_path=self.kwargs.get('output_file_path'),
                                                                             multi_threading=False if self.kwargs.get('multi_threading') is None else self.kwargs.get('multi_threading'),
                                                                             multi_processing=False if self.kwargs.get('multi_processing') is None else self.kwargs.get('multi_processing'),
                                                                             log=False if self.kwargs.get('log') is None else self.kwargs.get('log'),
                                                                             verbose=0 if self.kwargs.get('verbose') is None else self.kwargs.get('verbose')
                                                                             )
        elif evolutionary_algorithm == 'si':
            _feature_learning_evolution: SwarmIntelligence = SwarmIntelligence(mode='feature_engineer',
                                                                               feature_engineer=self.feature_engineer,
                                                                               df=self.feature_engineer.get_data(),
                                                                               target=self.feature_engineer.get_target(),
                                                                               features=self.feature_engineer.get_predictors(),
                                                                               re_split_data=False if self.kwargs.get('re_split_data') is None else self.kwargs.get('re_split_data'),
                                                                               re_sample_cases=False if self.kwargs.get('re_sample_cases') is None else self.kwargs.get('re_sample_cases'),
                                                                               re_sample_features=False,
                                                                               max_features=self.max_features,
                                                                               labels=self.kwargs.get('labels'),
                                                                               models=[self.categorical_learning.get('model_name')] if feature_type == 'categorical' else [self.continuous_learning.get('model_name')],
                                                                               model_params=self.categorical_learning.get('param') if feature_type == 'categorical' else self.continuous_learning.get('param'),
                                                                               burn_in_adjustments=-1,
                                                                               warm_start=False,
                                                                               max_adjustments=self.kwargs.get('max_adjustments'),
                                                                               pop_size=64 if self.kwargs.get('pop_size') is None else self.kwargs.get('pop_size'),
                                                                               adjustment_rate=0.1,
                                                                               adjustment_prob=self.kwargs.get('adjustment_prob'),
                                                                               early_stopping=0,
                                                                               convergence=False,
                                                                               timer_in_seconds=43200 if self.kwargs.get('timer_in_secondes') is None else self.kwargs.get('timer_in_secondes'),
                                                                               force_target_type=self.force_target_type,
                                                                               plot=False if self.kwargs.get('plot') is None else self.kwargs.get('plot'),
                                                                               output_file_path=self.kwargs.get('output_file_path'),
                                                                               multi_threading=False if self.kwargs.get('multi_threading') is None else self.kwargs.get('multi_threading'),
                                                                               multi_processing=False if self.kwargs.get('multi_processing') is None else self.kwargs.get('multi_processing'),
                                                                               log=False if self.kwargs.get('log') is None else self.kwargs.get('log'),
                                                                               verbose=0 if self.kwargs.get('verbose') is None else self.kwargs.get('verbose')
                                                                               )
        else:
            raise FeatureLearningException('Reinforced evolutionary algorithm ({}) not supported'.format(evolutionary_algorithm))
        _feature_learning_evolution.optimize()
        self.evolved_features.extend(_feature_learning_evolution.evolved_features)
        self.feature_engineer = _feature_learning_evolution.feature_engineer
        if evolutionary_algorithm == 'ga':
            _generated_features: List[str] = _feature_learning_evolution.mutated_features.get('child')
        else:
            _generated_features: List[str] = _feature_learning_evolution.adjusted_features.get('to')
        Log(write=False, level='error').log(msg='Generated {} engineered features'.format(len(_generated_features)))
        if self.keep_fittest_only:
            Log(write=False, level='error').log(msg='Selected {} fittest features'.format(len(_feature_learning_evolution.evolved_features)))
            _erase: Dict[str, List[str]] = dict(features=list(set(_generated_features).difference(_feature_learning_evolution.evolved_features)))
            if len(_erase.get('features')) > 0:
                self.feature_engineer.clean(markers=_erase)
        del _feature_learning_evolution
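
The block at the top of _feature_learning() fills missing hyperparameters one "if ... is None" check at a time. A roughly equivalent and more compact formulation uses dict.setdefault; this is a sketch, not how the library itself does it, and the behaviour only differs when a key is explicitly passed as None (the original overwrites it, setdefault keeps it):

# default hyperparameters mirroring the ones set at the top of _feature_learning()
defaults = dict(mutation_prob=0.5,
                adjustment_prob=0.5,
                max_generations=5,
                max_adjustments=5,
                parents_ratio=0.5)

kwargs = dict(max_generations=10)        # user-supplied settings
for key, value in defaults.items():
    kwargs.setdefault(key, value)        # fill only the missing keys

print(kwargs['max_generations'])         # 10 (user value kept)
print(kwargs['mutation_prob'])           # 0.5 (default filled in)
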
Example #4
    def _evolve_feature_learning_ai(self, feature_type: str, evolutionary_algorithm: str):
        """
        Evolve ai for feature learning using the given reinforced evolutionary algorithm (genetic algorithm or swarm intelligence)

        :param feature_type: str
            Name of the feature type to engineer
                -> continuous: (semi-) continuous features
                -> categorical: categorical (nominal) features

        :param evolutionary_algorithm: str
            Name of the reinforced evolutionary algorithm
                -> ga: Genetic Algorithm
                -> si: Swarm Intelligence
        """
        if feature_type == 'continuous':
            Log(write=False, level='info').log(msg='Evolve feature learning ai for engineering (semi-) continuous features ...')
        else:
            Log(write=False, level='info').log(msg='Evolve feature learning ai for engineering categorical (one-hot encoded) features ...')
        if evolutionary_algorithm == 'ga':
            _feature_learner: GeneticAlgorithm = GeneticAlgorithm(mode='model',
                                                                  target=self.feature_engineer.get_target(),
                                                                  features=self.feature_engineer.get_predictors(),
                                                                  re_split_data=False if self.kwargs.get('re_split_data') is None else self.kwargs.get('re_split_data'),
                                                                  re_sample_cases=False if self.kwargs.get('re_sample_cases') is None else self.kwargs.get('re_sample_cases'),
                                                                  re_sample_features=True,
                                                                  max_features=self.max_features,
                                                                  labels=self.kwargs.get('labels'),
                                                                  models=['cat'] if self.kwargs.get('models') is None else self.kwargs.get('models'),
                                                                  model_params=None,
                                                                  burn_in_generations=-1 if self.kwargs.get('burn_in_generations') is None else self.kwargs.get('burn_in_generations'),
                                                                  warm_start=True if self.kwargs.get('warm_start') is None else self.kwargs.get('warm_start'),
                                                                  max_generations=2 if self.kwargs.get('max_generations_ai') is None else self.kwargs.get('max_generations_ai'),
                                                                  pop_size=64 if self.kwargs.get('pop_size') is None else self.kwargs.get('pop_size'),
                                                                  mutation_rate=0.1 if self.kwargs.get('mutation_rate') is None else self.kwargs.get('mutation_rate'),
                                                                  mutation_prob=0.85 if self.kwargs.get('mutation_prob') is None else self.kwargs.get('mutation_prob'),
                                                                  parents_ratio=0.5 if self.kwargs.get('parents_ratio') is None else self.kwargs.get('parents_ratio'),
                                                                  early_stopping=0 if self.kwargs.get('early_stopping') is None else self.kwargs.get('early_stopping'),
                                                                  convergence=False if self.kwargs.get('convergence') is None else self.kwargs.get('convergence'),
                                                                  timer_in_seconds=43200 if self.kwargs.get('timer_in_secondes') is None else self.kwargs.get('timer_in_secondes'),
                                                                  force_target_type=self.force_target_type,
                                                                  plot=False if self.kwargs.get('plot') is None else self.kwargs.get('plot'),
                                                                  output_file_path=self.kwargs.get('output_file_path'),
                                                                  multi_threading=False if self.kwargs.get('multi_threading') is None else self.kwargs.get('multi_threading'),
                                                                  multi_processing=False if self.kwargs.get('multi_processing') is None else self.kwargs.get('multi_processing'),
                                                                  log=False if self.kwargs.get('log') is None else self.kwargs.get('log'),
                                                                  verbose=0 if self.kwargs.get('verbose') is None else self.kwargs.get('verbose'),
                                                                  feature_engineer=self.feature_engineer
                                                                  )
        elif evolutionary_algorithm == 'si':
            _feature_learner: SwarmIntelligence = SwarmIntelligence(mode='model',
                                                                    target=self.feature_engineer.get_target(),
                                                                    features=self.feature_engineer.get_predictors(),
                                                                    re_split_data=False if self.kwargs.get('re_split_data') is None else self.kwargs.get('re_split_data'),
                                                                    re_sample_cases=False if self.kwargs.get('re_sample_cases') is None else self.kwargs.get('re_sample_cases'),
                                                                    re_sample_features=True,
                                                                    max_features=self.max_features,
                                                                    labels=self.kwargs.get('labels'),
                                                                    models=['cat'] if self.kwargs.get('models') is None else self.kwargs.get('models'),
                                                                    model_params=None,
                                                                    burn_in_adjustments=-1 if self.kwargs.get('burn_in_adjustments') is None else self.kwargs.get('burn_in_adjustments'),
                                                                    warm_start=True if self.kwargs.get('warm_start') is None else self.kwargs.get('warm_start'),
                                                                    max_adjustments=2 if self.kwargs.get('max_adjustments_ai') is None else self.kwargs.get('max_adjustments_ai'),
                                                                    pop_size=64 if self.kwargs.get('pop_size') is None else self.kwargs.get('pop_size'),
                                                                    adjustment_rate=0.1 if self.kwargs.get('adjustment_rate') is None else self.kwargs.get('adjustment_rate'),
                                                                    adjustment_prob=0.85 if self.kwargs.get('adjustment_prob') is None else self.kwargs.get('adjustment_prob'),
                                                                    early_stopping=0 if self.kwargs.get('early_stopping') is None else self.kwargs.get('early_stopping'),
                                                                    convergence=False if self.kwargs.get('convergence') is None else self.kwargs.get('convergence'),
                                                                    timer_in_seconds=43200 if self.kwargs.get('timer_in_secondes') is None else self.kwargs.get('timer_in_secondes'),
                                                                    force_target_type=self.force_target_type,
                                                                    plot=False if self.kwargs.get('plot') is None else self.kwargs.get('plot'),
                                                                    output_file_path=self.kwargs.get('output_file_path'),
                                                                    multi_threading=False if self.kwargs.get('multi_threading') is None else self.kwargs.get('multi_threading'),
                                                                    multi_processing=False if self.kwargs.get('multi_processing') is None else self.kwargs.get('multi_processing'),
                                                                    log=False if self.kwargs.get('log') is None else self.kwargs.get('log'),
                                                                    verbose=0 if self.kwargs.get('verbose') is None else self.kwargs.get('verbose'),
                                                                    feature_engineer=self.feature_engineer
                                                                    )
        else:
            raise FeatureLearningException('Reinforced evolutionary algorithm ({}) not supported'.format(evolutionary_algorithm))
        _feature_learner.optimize()
        if feature_type == 'categorical':
            self.categorical_learning = _feature_learner.evolution
        else:
            self.continuous_learning = _feature_learner.evolution
        Log(write=False, level='error').log(msg='Feature learning ai evolved -> {}'.format(_feature_learner.evolution.get('model_name')))
Example #5
    def mice(self, rubin_gelman_convergence: bool = False) -> dd.DataFrame:
        """
        Run multiple imputation by chained equations (MICE)

        :param rubin_gelman_convergence: bool
            Run the process until the Gelman-Rubin convergence test passes

        :return: dd.DataFrame
            Fully imputed data set
        """
        # Step 1: Initial imputation
        self.df = self.df.fillna(0)
        _std: dict = {ft: self.df[ft].std() for ft in self.imp_sequence}
        _pool_std: dict = {}
        for i in range(0, self.n_iter, 1):
            Log(write=False, env='dev').log(msg='Iteration: {}'.format(i))
            for imp in self.imp_sequence:
                Log(write=False,
                    env='dev').log(msg='Imputation of: {}'.format(imp))
                # Step 2: Re-impute missing values for imputing feature
                #self.df.loc[self.nan_idx.get(imp), imp] = np.nan
                if i + 1 > self.n_burn_in_iter:
                    for m in range(0, self.n_chains, 1):
                        # Step 3: Train machine learning algorithm and run prediction for each chain
                        if imp in self.feature_types.get('categorical'):
                            _pred = Classification(
                                clf_params=dict(n_estimators=50)
                            ).extreme_gradient_boosting_tree().fit(
                                X=self.df[self.predictors[imp]],
                                y=self.df[imp]).predict(
                                    data=self.df[self.predictors[imp]])
                        elif imp in self.feature_types.get('continuous'):
                            _pred = Regression(
                                reg_params=dict(n_estimators=50)
                            ).extreme_gradient_boosting_tree().fit(
                                X=self.df[self.predictors[imp]],
                                y=self.df[imp]).predict(
                                    data=self.df[self.predictors[imp]])
                        elif imp in self.feature_types.get('date'):
                            _pred = Regression(
                                reg_params=dict(n_estimators=50)
                            ).extreme_gradient_boosting_tree().fit(
                                X=self.df[self.predictors[imp]],
                                y=self.df[imp]).predict(
                                    data=self.df[self.predictors[imp]])
                        else:
                            raise MultipleImputationException(
                                'Data type of feature "{}" not supported for imputation'
                                .format(imp))
                        self.chains.get(m).update({
                            imp:
                            pd.DataFrame(
                                data=_pred,
                                columns=['pred']).loc[self.nan_idx.get(imp),
                                                      'pred'].values.tolist()
                        })
                        # Step 4: Impute missing values with predictions
                        self.df.loc[self.nan_idx.get(imp),
                                    imp] = self.chains[m].get(imp)
                        if i + 1 == self.n_iter:
                            _pool_std.update({
                                m:
                                dict(std=self.df[imp].std(),
                                     diff=_std.get(imp) - self.df[imp].std())
                            })
                else:
                    # Step 3: Train machine learning algorithm and run prediction for each chain
                    if imp in self.feature_types.get('categorical'):
                        _pred = Classification(
                        ).extreme_gradient_boosting_tree().fit(
                            X=self.df[self.predictors[imp]].compute(),
                            y=self.df[imp].compute()).predict(
                                data=self.df[self.predictors[imp]].compute())
                    elif imp in self.feature_types.get('continuous'):
                        _pred = Regression().extreme_gradient_boosting_tree(
                        ).fit(X=self.df[self.predictors[imp]].compute(),
                              y=self.df[imp].compute()).predict(
                                  data=self.df[self.predictors[imp]].compute())
                    elif imp in self.feature_types.get('date'):
                        _pred = Regression().extreme_gradient_boosting_tree(
                        ).fit(X=self.df[self.predictors[imp]].compute(),
                              y=self.df[imp].compute()).predict(
                                  data=self.df[self.predictors[imp]].compute())
                    else:
                        raise MultipleImputationException(
                            'Data type of feature "{}" not supported for imputation'
                            .format(imp))
                    # Step 4: Impute missing values with predictions
                    self.df.loc[self.nan_idx.get(imp), imp] = pd.DataFrame(
                        data=_pred,
                        columns=['pred']).loc[self.nan_idx.get(imp),
                                              'pred'].values.tolist()
        # Step 5: Evaluate imputed chains
        if self.pool_eval_meth == 'std':
            _diff: List[float] = [
                abs(_pool_std[s].get('diff')) for s in _pool_std.keys()
            ]
            _best_set: int = _diff.index(max(_diff))
            Log(write=False,
                env='dev').log(msg='Best Set: {}'.format(_best_set))
            for ft in self.imp_sequence:
                self.df.loc[self.nan_idx.get(ft),
                            ft] = self.chains[_best_set].get(ft)
        else:
            raise MultipleImputationException(
                'Evaluation method ({}) for pooling multiple imputed data sets not supported'
                .format(self.pool_eval_meth))
        return self.df
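
Stripped of the library's Classification/Regression wrappers and the multi-chain pooling, the core of mice() is a chained-equation loop: start from a crude initial fill, then repeatedly re-estimate every incomplete column from the remaining columns and overwrite only the originally missing cells. A minimal self-contained sketch of that loop using ordinary least squares on random data (illustrative only):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(100, 3)), columns=['a', 'b', 'c'])
df.loc[rng.choice(100, 15, replace=False), 'b'] = np.nan
df.loc[rng.choice(100, 10, replace=False), 'c'] = np.nan

nan_idx = {col: df.index[df[col].isna()] for col in df.columns if df[col].isna().any()}
df_imp = df.fillna(df.mean())                            # step 1: initial imputation

for _ in range(5):                                       # chained iterations
    for col, idx in nan_idx.items():
        predictors = [c for c in df_imp.columns if c != col]
        observed = df_imp.index.difference(idx)
        # step 2/3: fit on the observed rows, predict the originally missing ones
        x_obs = np.c_[np.ones(len(observed)), df_imp.loc[observed, predictors]]
        beta, *_ = np.linalg.lstsq(x_obs, df_imp.loc[observed, col], rcond=None)
        x_mis = np.c_[np.ones(len(idx)), df_imp.loc[idx, predictors]]
        df_imp.loc[idx, col] = x_mis @ beta              # step 4: impute the predictions

print(int(df_imp.isna().sum().sum()))                    # 0 -> fully imputed
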
Example #6
 def play(self) -> dict:
     """
     Play unreal tournament to extract the fittest or most important players based on the concept of shapley values
     """
     Log(write=False, level='info').log(
         msg='Start penalty with {} players...'.format(self.n_features))
     _game_scores: List[float] = []
     _permutation_space: int = self.init_pairs
     _pair_size_factor: float = self.max_iter * self.pair_size_factor
     for i in range(0, self.max_iter + self.init_games, 1):
         if i == self.init_games:
             Log(write=False, level='info').log(
                 msg='Start feature tournament with {} players ...'.format(
                     self.n_features))
             self.tournament = True
         elif i > self.init_games:
             _pair_size: int = _permutation_space + int(_pair_size_factor)
             if self.n_features >= _pair_size:
                 _permutation_space = _pair_size
                 #_permutation_space = int(_permutation_space + (_permutation_space * self.pair_size_factor))
         else:
             if i == 0:
                 _permutation_space = self.init_pairs
         self._permutation(n=_permutation_space)
         _pool: ThreadPool = ThreadPool(
             processes=len(self.pairs)) if self.multi_threading else None
         for g in range(0, self.games, 1):
             Log(write=False, level='info').log(
                 msg='Iteration {} - Game {} ~ {} players each game'.format(
                     i + 1, g + 1, _permutation_space))
             if self.multi_threading:
                 self.threads.update(
                     {g: _pool.apply_async(func=self._game, args=[i])})
             else:
                 self._game(iteration=i)
             if i < self.init_games:
                 break
             self._permutation(n=_permutation_space)
         for thread in self.threads.keys():
             self.threads.get(thread).get()
         if i + 1 == self.init_games:
             _shapley_matrix: pd.DataFrame = pd.DataFrame(
                 data=self.shapley_additive_explanation['sum'],
                 index=['score']).transpose()
             _sorted_shapley_matrix = _shapley_matrix.sort_values(
                 by='score', axis=0, ascending=False, inplace=False)
             _all_features: int = _sorted_shapley_matrix.shape[0]
             _sorted_shapley_matrix = _sorted_shapley_matrix.loc[
                 _sorted_shapley_matrix['score'] > 0, :]
             if _sorted_shapley_matrix.shape[0] == 0:
                 raise FeatureTournamentException(
                     'No feature scored higher than 0 during penalty phase')
             _n_features: int = _sorted_shapley_matrix.shape[0]
             Log(write=False, level='info').log(
                 msg='Excluded {} features with score 0'.format(
                     _all_features - _n_features))
             _exclude_features: int = int(_n_features * self.penalty_factor)
             self.features = _sorted_shapley_matrix.index.values.tolist(
             )[0:(_n_features - _exclude_features)]
             self.n_features = len(self.features)
             Log(write=False, level='info').log(
                 msg='Excluded {} lowest scored features from tournament'.
                 format(_exclude_features))
         if i + 1 == self.max_iter + self.init_games:
             _shapley_values: dict = {}
             for sv in self.shapley_additive_explanation['game'].keys():
                 _shapley_values.update({
                     sv:
                     self.shapley_additive_explanation['sum'][sv] /
                     len(self.shapley_additive_explanation['game'][sv])
                 })
             self.shapley_additive_explanation.update(
                 {'total': _shapley_values})
         if self.n_features <= (self.pair_size_factor * _permutation_space):
             if i + 1 == self.max_iter:
                 break
     return self.shapley_additive_explanation
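
The final aggregation in play() (the block guarded by i + 1 == self.max_iter + self.init_games) turns the accumulated per-game contributions into averaged Shapley-style scores: each feature's summed score is divided by the number of games it took part in and stored under the 'total' key. A tiny sketch of that step with made-up numbers:

# toy tournament results (values invented for illustration)
shapley_additive_explanation = {
    'sum': {'age': 12.0, 'income': 3.0, 'segment_a': 0.5},
    'game': {'age': [4.0, 5.0, 3.0], 'income': [1.0, 2.0], 'segment_a': [0.5]},
}

total = {feature: shapley_additive_explanation['sum'][feature] /
                  len(shapley_additive_explanation['game'][feature])
         for feature in shapley_additive_explanation['game']}
shapley_additive_explanation['total'] = total

print(total)   # {'age': 4.0, 'income': 1.5, 'segment_a': 0.5}
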
Example #7
 def _evolve_feature_tournament_ai(self):
     """
     Evolve ai for feature tournament using genetic algorithm
     """
     Log(write=False,
         level='info').log(msg='Evolve feature tournament ai ...')
     _feature_tournament_ai_learning: GeneticAlgorithm = GeneticAlgorithm(
         mode='model',
         df=self.df,
         target=self.target,
         features=self.features,
         re_split_data=False if self.kwargs.get('re_split_data') is None
         else self.kwargs.get('re_split_data'),
         re_sample_cases=False if self.kwargs.get('re_sample_cases') is None
         else self.kwargs.get('re_sample_cases'),
         re_sample_features=True,
         max_features=self.n_features,
         labels=self.kwargs.get('labels'),
         models=['cat'] if self.models is None else self.models,
         model_params=None,
         burn_in_generations=-1
         if self.kwargs.get('burn_in_generations') is None else
         self.kwargs.get('burn_in_generations'),
         warm_start=True if self.kwargs.get('warm_start') is None else
         self.kwargs.get('warm_start'),
         max_generations=2 if self.kwargs.get('max_generations_ai') is None
         else self.kwargs.get('max_generations_ai'),
         pop_size=64 if self.kwargs.get('pop_size') is None else
         self.kwargs.get('pop_size'),
         mutation_rate=0.1 if self.kwargs.get('mutation_rate') is None else
         self.kwargs.get('mutation_rate'),
         mutation_prob=0.5 if self.kwargs.get('mutation_prob') is None else
         self.kwargs.get('mutation_prob'),
         parents_ratio=0.5 if self.kwargs.get('parents_ratio') is None else
         self.kwargs.get('parents_ratio'),
         early_stopping=0 if self.kwargs.get('early_stopping') is None else
         self.kwargs.get('early_stopping'),
         convergence=False if self.kwargs.get('convergence') is None else
         self.kwargs.get('convergence'),
         timer_in_seconds=10000
         if self.kwargs.get('timer_in_secondes') is None else
         self.kwargs.get('timer_in_secondes'),
         force_target_type=self.force_target_type,
         plot=False
         if self.kwargs.get('plot') is None else self.kwargs.get('plot'),
         output_file_path=self.kwargs.get('output_file_path'),
         multi_threading=False if self.kwargs.get('multi_threading') is None
         else self.kwargs.get('multi_threading'),
         multi_processing=False
         if self.kwargs.get('multi_processing') is None else
         self.kwargs.get('multi_processing'),
         log=False
         if self.kwargs.get('log') is None else self.kwargs.get('log'),
         verbose=0 if self.kwargs.get('verbose') is None else
         self.kwargs.get('verbose'),
         **self.kwargs)
     _feature_tournament_ai_learning.optimize()
     self.feature_tournament_ai = _feature_tournament_ai_learning.evolution
     Log(write=False, level='error').log(
         msg='Feature tournament ai evolved -> {}'.format(
             self.feature_tournament_ai.get('model_name')))