Example #1
 def test_get_feature_types(self):
     self.assertDictEqual(d1={'continuous': ['C', 'G', 'H'],
                              'categorical': ['A', 'B', 'F', 'I', 'J', 'K'],
                              'ordinal': [],
                              'date': ['D'],
                              'text': ['E']
                              },
                          d2=EasyExploreUtils().get_feature_types(df=DATA_SET,
                                                                  features=list(DATA_SET.keys()),
                                                                  dtypes=DATA_SET.dtypes.tolist()
                                                                  )
                          )
Example #2
 def test_get_pairs(self):
     self.assertListEqual(list1=[tuple(['A', 'B']), tuple(['A', 'C']), tuple(['B', 'C'])],
                          list2=EasyExploreUtils().get_pairs(features=['A', 'B', 'C'], max_features_each_pair=2)
                          )
Example #3
 def test_get_duplicates(self):
     self.assertDictEqual(d1=dict(cases=[], features=['K']),
                          d2=EasyExploreUtils().get_duplicates(df=DATA_SET, cases=True, features=True)
                          )
Example #4
 def test_convert_jupyter(self):
     EasyExploreUtils().convert_jupyter(notebook_name=os.path.join(OUTPUT_PATH, 'test_notebook.ipynb'), to='html')
     self.assertTrue(expr=os.path.isfile(os.path.join(OUTPUT_PATH, 'test_notebook.ipynb')))
Example #5
 def test_check_dtypes(self):
     self.assertDictEqual(d1={'B': 'int', 'D': 'date', 'F': 'int', 'I': 'int', 'J': 'int', 'K': 'int'},
                          d2=EasyExploreUtils().check_dtypes(df=DATA_SET, date_edges=None).get('conversion')
                          )
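As a hedged follow-up to Example #5: a minimal sketch of how the `conversion` mapping returned by `check_dtypes` could be applied to the data. It assumes the return structure asserted above (feature name mapped to a target dtype name) and reuses `DATA_SET` and `EasyExploreUtils` from the test module; the loop itself is not part of the original tests.

import pandas as pd

# Sketch (assumption): apply the dtype conversions suggested by check_dtypes
_conversion: dict = EasyExploreUtils().check_dtypes(df=DATA_SET, date_edges=None).get('conversion')
for _feature, _target_dtype in _conversion.items():
    if _target_dtype == 'date':
        DATA_SET[_feature] = pd.to_datetime(DATA_SET[_feature], errors='coerce')
    elif _target_dtype == 'int':
        # Nullable integer dtype keeps missing values intact
        DATA_SET[_feature] = DATA_SET[_feature].astype('Int64')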
Example #6
    def __init__(self,
                 df: Union[dd.DataFrame, pd.DataFrame],
                 n_chains: int = 3,
                 n_iter: int = 15,
                 n_burn_in_iter: int = 3,
                 ml_meth: dict = None,
                 predictors: dict = None,
                 imp_sequence: List[str] = None,
                 cor_threshold_for_predictors: float = None,
                 pool_eval_meth: str = 'std',
                 impute_hard_missing: bool = False,
                 soft_missing_values: list = None):
        """
        :param df: Pandas or dask DataFrame
            Data set

        :param n_chains: int
            Number of Markov chains

        :param n_iter: int
            Number of iterations

        :param n_burn_in_iter: int
            Number of burn-in iterations (warm start)

        :param ml_meth: dict
            Name of the supervised machine learning algorithm to use for each data type

        :param predictors: dict
            Pre-defined predictors for each feature imputation

        :param imp_sequence: List[str]
            Pre-defined sequence of features to impute

        :param cor_threshold_for_predictors: float
            Correlation threshold for selecting the predictors of each feature to impute

        :param pool_eval_meth: str
            Method for evaluating and pooling the chains ('std', 'var', 'aic', 'bic')

        :param impute_hard_missing: bool
            Whether to impute hard missing values or not

        :param soft_missing_values: list
            Values to be interpreted as soft missing values
        """
        if isinstance(df, pd.DataFrame):
            self.df: dd.DataFrame = dd.from_pandas(data=df, npartitions=4)
        elif isinstance(df, dd.DataFrame):
            self.df: dd.DataFrame = df
        self.feature_types: dict = EasyExploreUtils().get_feature_types(
            df=self.df,
            features=list(self.df.columns),
            dtypes=self.df.dtypes.tolist())
        self.n_chains: int = 3 if n_chains <= 1 else n_chains
        self.chains: dict = {m: {} for m in range(0, self.n_chains, 1)}
        self.n_burn_in_iter: int = 3 if n_burn_in_iter <= 0 else n_burn_in_iter
        self.n_iter: int = (15 if n_iter <= 1 else n_iter) + self.n_burn_in_iter
        self.data_types: List[str] = ['cat', 'cont', 'date']
        _encoder = LabelEncoder()
        for ft in self.df.columns:
            if str(self.df[ft].dtype).find('object') >= 0:
                self.df[ft] = self.df[ft].fillna('NaN')
                #self.df.loc[self.df[ft].isnull().compute(), ft] = 'NaN'
                self.df[ft] = dd.from_array(x=_encoder.fit_transform(
                    y=self.df[ft].values))
        self.ml_meth: dict = ml_meth
        if self.ml_meth is not None:
            for meth in self.ml_meth:
                if meth.find('cat') >= 0:
                    pass
        else:
            self.ml_meth = dict(cat='xgb', cont='xgb', date='xgb')
        self.predictors: dict = predictors
        self.impute_hard_missing: bool = impute_hard_missing
        self.mis_freq: dict = MissingDataAnalysis(
            df=self.df, other_mis=soft_missing_values).freq_nan_by_features()
        self.nan_idx: dict = MissingDataAnalysis(
            df=self.df,
            other_mis=soft_missing_values).get_nan_idx_by_features()
        self.imp_sequence: List[str] = [] if imp_sequence is None else imp_sequence
        if len(self.imp_sequence) == 0:
            # self.imp_sequence = [mis_freq[0] for mis_freq in sorted(self.mis_freq.items(), key=lambda x: x[1], reverse=False)]
            for mis_freq in sorted(self.mis_freq.items(),
                                   key=lambda x: x[1],
                                   reverse=False):
                if mis_freq[1] > 0:
                    self.imp_sequence.append(mis_freq[0])
        if self.predictors is None:
            self.predictors = {}
            if cor_threshold_for_predictors is None:
                for ft in self.mis_freq.keys():
                    self.predictors.update({
                        ft:
                        list(set(list(self.df.columns)).difference([ft]))
                    })
            else:
                if (cor_threshold_for_predictors > 0.0) and (cor_threshold_for_predictors < 1.0):
                    _cor: pd.DataFrame = StatsUtils(
                        data=self.df,
                        features=list(self.df.columns)).correlation()
                    for ft in self.df.columns:
                        self.predictors.update({
                            ft:
                            _cor.loc[_cor[ft] >= cor_threshold_for_predictors,
                                     ft].index.values.tolist()
                        })
                        if len(self.predictors[ft]) == 0:
                            raise MultipleImputationException(
                                'No predictors found to impute feature "{}" based on given correlation threshold (>={})'
                                .format(ft, cor_threshold_for_predictors))
                else:
                    for ft in self.df.columns:
                        self.predictors.update({
                            ft:
                            list(set(list(self.df.columns)).difference([ft]))
                        })
        if pool_eval_meth not in ['std', 'var', 'aic', 'bic']:
            raise MultipleImputationException(
                'Method for pooling chain evaluation ({}) not supported'.
                format(pool_eval_meth))
        self.pool_eval_meth: str = pool_eval_meth
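A minimal usage sketch for the constructor above. Only the `__init__` is shown in this example, so the class name `MultipleImputation`, its import path, and the toy data frame are assumptions rather than part of the original source:

import numpy as np
import pandas as pd

# Toy data (assumption): soft missing values are encoded as -9
_df = pd.DataFrame({'age': [23, 41, np.nan, 35, 29],
                    'income': [52000, -9, 61000, np.nan, 48000],
                    'gender': ['m', 'f', 'f', None, 'm']})

# Assumed class name wrapping the __init__ shown above
_mi = MultipleImputation(df=_df,
                         n_chains=3,
                         n_iter=15,
                         n_burn_in_iter=3,
                         ml_meth=dict(cat='xgb', cont='xgb', date='xgb'),
                         pool_eval_meth='std',
                         soft_missing_values=[-9])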
Example #7
    def supervised(self,
                   models: List[str] = None,
                   feature_selector: str = 'shapley',
                   top_features: float = 0.5,
                   optimizer: str = 'ga',
                   force_target_type: str = None,
                   train: bool = True,
                   train_size: float = 0.8,
                   random: bool = True,
                   stratification: bool = False,
                   clf_eval_metric: str = 'auc',
                   reg_eval_metric: str = 'rmse_norm',
                   save_train_test_data: bool = True,
                   save_ga: bool = True,
                   **kwargs
                   ):
        """
        Run supervised machine learning models

        :param models: List[str]
            Name of the supervised machine learning models to use

        :param feature_selector: str
            Feature selection method:
                -> shapley: Shapley Value based on the FeatureTournament framework

        :param top_features: float
            Proportion of top features to select

        :param optimizer: str
            Model optimizer method:
                -> ga: Genetic Algorithm
                -> None: Develop model manually using pre-defined parameter config without optimization

        :param force_target_type: str
            Name of the target type to force (useful if target type is ordinal)
                -> reg: define target type as regression instead of multi classification
                -> clf_multi: define target type as multi classification instead of regression

        :param train: bool
            Whether to train the supervised machine learning models or to use them for prediction

        :param train_size: float
            Proportion of cases in the training data set

        :param random: bool
            Whether to sample randomly or by index

        :param stratification: bool
            Whether to stratify train and test data sets

        :param clf_eval_metric: str
            Name of the metric used to evaluate classification models

        :param reg_eval_metric: str
            Name of the metric used to evaluate regression models

        :param save_train_test_data: bool
            Whether to save train-test data split or not

        :param save_ga: bool
            Whether to save the "Genetic" object or not

        :param kwargs: dict
            Keyword arguments for the classes FeatureSelector / DataExporter / Genetic / MLSampler / DataVisualizer
        """
        self.force_target_type = force_target_type
        if train:
            _train_size: float = train_size if (train_size > 0) and (train_size < 1) else 0.8
            if self.feature_generator:
                self.feature_engineer = FeatureLearning(feature_engineer=self.feature_engineer,
                                                        target=self.feature_engineer.get_target(),
                                                        force_target_type=force_target_type,
                                                        max_features=0,
                                                        keep_fittest_only=True if kwargs.get('keep_fittest_only') is None else kwargs.get('keep_fittest_only'),
                                                        train_continuous_critic=False if kwargs.get('train_continuous_critic') is None else kwargs.get('train_continuous_critic'),
                                                        train_categorical_critic=False if kwargs.get('train_categorical_critic') is None else kwargs.get('train_categorical_critic'),
                                                        engineer_time_disparity=True if kwargs.get('engineer_time_disparity') is None else kwargs.get('engineer_time_disparity'),
                                                        engineer_categorical=False if kwargs.get('engineer_categorical') is None else kwargs.get('engineer_categorical'),
                                                        output_path=self.output_path,
                                                        **self.kwargs
                                                        ).ga()
            else:
                self.feature_engineer.set_predictors(exclude_original_data=False)
            if feature_selector is not None:
                _imp_features: dict = FeatureSelector(df=self.feature_engineer.get_training_data(output='df_dask'),
                                                      target=self.feature_engineer.get_target(),
                                                      features=self.feature_engineer.get_predictors(),
                                                      force_target_type=force_target_type,
                                                      aggregate_feature_imp=self.feature_engineer.get_processing()['features']['raw'],
                                                      visualize_all_scores=self.plot if kwargs.get('visualize_all_scores') is None else kwargs.get('visualize_all_scores'),
                                                      visualize_variant_scores=self.plot if kwargs.get('visualize_variant_scores') is None else kwargs.get('visualize_variant_scores'),
                                                      visualize_core_feature_scores=self.plot if kwargs.get('visualize_core_feature_scores') is None else kwargs.get('visualize_core_feature_scores'),
                                                      path=self.output_path
                                                      ).get_imp_features(meth=feature_selector,
                                                                         imp_threshold=0.001 if kwargs.get('imp_threshold') is None else kwargs.get('imp_threshold')
                                                                         )
                _ratio: float = top_features if (top_features > 0) and (top_features <= 1) else 0.5
                _top_n_features: int = round(self.feature_engineer.get_n_predictors() * _ratio)
                self.feature_engineer.set_predictors(features=_imp_features.get('imp_features')[0:_top_n_features],
                                                     exclude_original_data=False
                                                     )
                if self.output_path is not None or kwargs.get('file_path') is not None:
                    DataExporter(obj=_imp_features,
                                 file_path='{}feature_importance.pkl'.format(self.output_path) if kwargs.get('file_path') is None else kwargs.get('file_path'),
                                 create_dir=True if kwargs.get('create_dir') is None else kwargs.get('create_dir'),
                                 overwrite=False if kwargs.get('overwrite') is None else kwargs.get('overwrite')
                                 ).file()
            if optimizer == 'ga':
                _ga = GeneticAlgorithm(mode='model',
                                       df=self.feature_engineer.get_training_data(),
                                       target=self.feature_engineer.get_target(),
                                       force_target_type=force_target_type,
                                       features=self.feature_engineer.get_predictors(),
                                       stratify=stratification,
                                       labels=None if kwargs.get('labels') is None else kwargs.get('labels'),
                                       models=models,
                                       burn_in_generations=10 if kwargs.get('burn_in_generations') is None else kwargs.get('burn_in_generations'),
                                       max_generations=25 if kwargs.get('max_generations') is None else kwargs.get('max_generations'),
                                       pop_size=64 if kwargs.get('pop_size') is None else kwargs.get('pop_size'),
                                       mutation_rate=0.1 if kwargs.get('mutation_rate') is None else kwargs.get('mutation_rate'),
                                       mutation_prob=0.15 if kwargs.get('mutation_prob') is None else kwargs.get('mutation_prob'),
                                       parents_ratio=0.5 if kwargs.get('parents_ratio') is None else kwargs.get('parents_ratio'),
                                       early_stopping=0 if kwargs.get('early_stopping') is None else kwargs.get('early_stopping'),
                                       convergence=False if kwargs.get('convergence') is None else kwargs.get('convergence'),
                                       convergence_measure='median' if kwargs.get('convergence_measure') is None else kwargs.get('convergence_measure'),
                                       timer_in_seconds=43200 if kwargs.get('timer_in_seconds') is None else kwargs.get('timer_in_seconds'),
                                       plot=self.plot,
                                       output_file_path=self.output_path
                                       )
                _ga.optimize()
                if save_train_test_data:
                    DataExporter(obj=_ga.data_set,
                                 file_path='{}train_test_data.pkl'.format(self.output_path),
                                 create_dir=True if kwargs.get('create_dir') is None else kwargs.get('create_dir'),
                                 overwrite=False if kwargs.get('overwrite') is None else kwargs.get('overwrite')
                                 ).file()
                if save_ga:
                    _ga.save_evolution(ga=True, model=False)
            else:
                _model_eval_plot: dict = {}
                _data_set: dict = MLSampler(df=self.feature_engineer.get_data(),
                                            target=self.feature_engineer.get_target(),
                                            features=self.feature_engineer.get_predictors(),
                                            train_size=_train_size,
                                            random_sample=random,
                                            stratification=stratification
                                            ).train_test_sampling(validation_split=0.1 if kwargs.get('validation_split') is None else kwargs.get('validation_split'))
                if save_train_test_data:
                    DataExporter(obj=_data_set,
                                 file_path='{}train_test_data.pkl'.format(self.output_path),
                                 create_dir=True if kwargs.get('create_dir') is None else kwargs.get('create_dir'),
                                 overwrite=False if kwargs.get('overwrite') is None else kwargs.get('overwrite')
                                 ).file()
                for model in models:
                    if HappyLearningUtils().get_ml_type(values=self.feature_engineer.get_target_values()) == 'reg':
                        _model = ModelGeneratorReg(model_name=model, reg_params=None).generate_model()
                        _model.train(x=_data_set.get('x_train').values,
                                     y=_data_set.get('y_train').values,
                                     validation=dict(x_val=_data_set.get('x_val').values,
                                                     y_val=_data_set.get('y_val').values
                                                     )
                                     )
                        _pred: np.array = _model.predict(x=_data_set.get('x_test').values)
                        _model.eval(obs=_data_set.get('y_test').values, pred=_pred, eval_metric=[reg_eval_metric])
                        _perc_table: pd.DataFrame = EasyExploreUtils().get_perc_eval(pred=_pred,
                                                                                     obs=_data_set.get('y_test').values.tolist(),
                                                                                     aggregation='median',
                                                                                     percentiles=10
                                                                                     )
                        _min_table: pd.DataFrame = EasyExploreUtils().get_perc_eval(pred=_pred,
                                                                                    obs=_data_set.get('y_test').values.tolist(),
                                                                                    aggregation='min',
                                                                                    percentiles=10
                                                                                    )
                        _max_table: pd.DataFrame = EasyExploreUtils().get_perc_eval(pred=_pred,
                                                                                    obs=_data_set.get('y_test').values.tolist(),
                                                                                    aggregation='max',
                                                                                    percentiles=10
                                                                                    )
                        _multi: dict = {'bar_obs': dict(y=_perc_table['obs'].values,
                                                        name='obs',
                                                        error_y=dict(type='data',
                                                                     array=_max_table['obs'].values - _min_table[
                                                                         'obs'].values)
                                                        ),
                                        'bar_preds': dict(y=_perc_table['preds'].values,
                                                          name='pred',
                                                          error_y=dict(type='data',
                                                                       array=_max_table['preds'].values - _min_table[
                                                                           'preds'].values)
                                                          )
                                        }
                        _model_eval_df: pd.DataFrame = pd.DataFrame(data={'obs': _data_set.get('y_test').values, 'preds': _pred})
                        _model_eval_df['abs_diff'] = _model_eval_df['obs'] - _model_eval_df['preds']
                        _model_eval_df['rel_diff'] = _model_eval_df['obs'] / _model_eval_df['preds']
                        # TODO: Add train & test error to plot
                        _model_eval_plot.update({'Prediction vs. Observation (Value Based)': dict(data=_model_eval_df,
                                                                                                  features=['obs', 'preds'],
                                                                                                  plot_type='joint',
                                                                                                  render=True,
                                                                                                  file_path='{}prediction_scatter_{}.html'.format(self.output_path, model)
                                                                                                  ),
                                                 'Prediction vs. Observation (Range Based)': dict(data=_model_eval_df,
                                                                                                  features=['obs', 'preds', 'abs_diff', 'rel_diff'],
                                                                                                  plot_type='parcoords',
                                                                                                  render=True,
                                                                                                  file_path='{}prediction_coords_{}.html'.format(self.output_path, model)
                                                                                                  ),
                                                 'Prediction vs. Observation (Percentile Based)': dict(data=_perc_table,
                                                                                                       plot_type='multi',
                                                                                                       render=True,
                                                                                                       file_path='{}prediction_percentiles_{}.html'.format(self.output_path, model),
                                                                                                       kwargs=dict(layout=dict(barmode='group',
                                                                                                                               xaxis=dict(tickmode='array',
                                                                                                                                          tickvals=[p for p in range(0, 10, 1)],
                                                                                                                                          ticktext=[str(label) for label in _perc_table['obs'].values.tolist()]
                                                                                                                                          )
                                                                                                                               ),
                                                                                                                   multi=_multi
                                                                                                                   )
                                                                                                       )
                                                 })
                    else:
                        _model = ModelGeneratorClf(model_name=model, clf_params={}).generate_model()
                        _model.train(x=_data_set.get('x_train').values,
                                     y=_data_set.get('y_train').values,
                                     validation=dict(x_val=_data_set.get('x_val').values,
                                                     y_val=_data_set.get('y_val').values
                                                     )
                                     )
                        _pred: np.array = _model.predict(x=_data_set.get('x_test').values)
                        _model.eval(obs=_data_set.get('y_test').values, pred=_pred, eval_metric=[clf_eval_metric])
                        _confusion_matrix: pd.DataFrame = EvalClf(obs=_data_set.get('y_test').values.tolist(),
                                                                  pred=_pred,
                                                                  probability=True
                                                                  ).confusion(normalize='true')
                        _model_eval_plot.update({'Confusion Matrix': dict(data=_confusion_matrix,
                                                                          plot_type='heat',
                                                                          kwargs={'layout': {'xaxis': {'title': 'Observation'},
                                                                                             'yaxis': {'title': 'Prediction'}
                                                                                             },
                                                                                  'text': _confusion_matrix.values.tolist()
                                                                                  }
                                                                          )
                                                 })
                    if self.output_path is not None:
                        DataExporter(obj=_model.model,
                                     file_path='{}model_{}'.format(self.output_path, model),
                                     create_dir=True,
                                     overwrite=False
                                     ).file()
        else:
            raise NotImplementedError('Prediction method not implemented yet')
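A minimal call sketch for the method above. The object `ml_pipeline` stands for an instance of the (not shown) class that defines `supervised`; its construction, the model identifier 'xgb', and the extra keyword argument `max_generations` (forwarded to the genetic algorithm in the body above) are assumptions based solely on the signature and code shown:

# Hypothetical call; only the keyword names are taken from the signature above
ml_pipeline.supervised(models=['xgb'],
                       feature_selector='shapley',
                       top_features=0.5,
                       optimizer='ga',
                       train=True,
                       train_size=0.8,
                       stratification=False,
                       clf_eval_metric='auc',
                       reg_eval_metric='rmse_norm',
                       save_train_test_data=True,
                       save_ga=True,
                       max_generations=10)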