def __init__(self, num_features, **kwargs):
    """Gated ensemble: stack the classifiers in
    ``constants.CLASSIFIERS_FOR_ENSEMBLE`` and combine their outputs
    through a trainable meta-layer (`mlens` ``SuperLearner``).
    """
    super(GatedEnsembleClassifier, self).__init__()

    kwargs = {**constants.GATED_ENSEMBLE_PARAMS, **kwargs}

    self.num_features = num_features
    self.num_folds = kwargs.pop('folds', 2)
    self.meta_layer = kwargs.pop('meta_layer')

    estimators = []
    for clf in constants.CLASSIFIERS_FOR_ENSEMBLE:
        model = utils.init_model(
            clf, num_features=self.num_features, **kwargs
        )
        estimators.append((clf, model.kernel))

    self.kernel = SuperLearner(verbose=2, n_jobs=1, folds=self.num_folds)

    # use as output the probability of a given class
    # (not just the class itself)
    self.kernel.add(estimators, proba=True)
    self.kernel.add_meta(
        utils.init_model(
            self.meta_layer, len(estimators) * self.num_folds, **kwargs
        ).kernel,
        proba=True,
    )
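# Illustrative sketch, not part of soweego: the same mlens SuperLearner pattern
# built above, with plain scikit-learn estimators standing in for the models
# returned by utils.init_model. The estimator choices and the synthetic data
# are assumptions for demonstration only.
from mlens.ensemble import SuperLearner
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

X, y = make_classification(n_samples=500, n_features=20, random_state=0)

ensemble = SuperLearner(verbose=2, n_jobs=1, folds=2)
# Base layer: each estimator contributes class probabilities, not labels
ensemble.add(
    [
        ('naive_bayes', GaussianNB()),
        ('logistic_regression', LogisticRegression(max_iter=1000)),
        ('random_forest', RandomForestClassifier(n_estimators=50)),
    ],
    proba=True,
)
# Meta layer: learns how to combine the base-layer probabilities
ensemble.add_meta(LogisticRegression(max_iter=1000), proba=True)

ensemble.fit(X, y)
# with proba=True on the meta layer, the output is expected to be
# class probabilities rather than labels
output = ensemble.predict(X)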
def _single_k_fold(classifier, catalog, entity, k, dir_io, **kwargs):
    """Run a single stratified k-fold evaluation: gather the predictions
    over every held-out fold, then compute one global performance figure.
    """
    predictions, test_set = None, []

    dataset, positive_samples_index = train.build_training_set(
        catalog, entity, dir_io
    )
    k_fold, binary_target_variables = utils.prepare_stratified_k_fold(
        k, dataset, positive_samples_index
    )

    for train_index, test_index in k_fold.split(
        dataset, binary_target_variables
    ):
        training, test = dataset.iloc[train_index], dataset.iloc[test_index]
        test_set.append(test)

        model = utils.init_model(classifier, dataset.shape[1], **kwargs)
        model.fit(training, positive_samples_index & training.index)

        preds = model.predict(test)
        K.clear_session()  # Free memory

        # Accumulate the predictions of every fold
        if predictions is None:
            predictions = preds
        else:
            predictions |= preds

    test_set = concat(test_set)

    return (
        predictions,
        _compute_performance(
            positive_samples_index & test_set.index,
            predictions,
            len(test_set),
        ),
    )
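# Illustrative sketch, not part of soweego: the same "single" k-fold strategy
# as _single_k_fold, i.e. gather out-of-fold predictions from every split and
# compute one global score at the end, here with scikit-learn on synthetic
# data. The classifier, data, and metrics are assumptions for demonstration.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import StratifiedKFold

X, y = make_classification(n_samples=400, n_features=10, random_state=0)
k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

all_preds, all_truth = [], []
for train_index, test_index in k_fold.split(X, y):
    model = LogisticRegression(max_iter=1000)
    model.fit(X[train_index], y[train_index])
    all_preds.append(model.predict(X[test_index]))
    all_truth.append(y[test_index])

# One performance figure over the union of all held-out folds
precision, recall, f1, _ = precision_recall_fscore_support(
    np.concatenate(all_truth), np.concatenate(all_preds), average='binary'
)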
def __init__(self, num_features, **kwargs):
    """Stacked ensemble: two layers of the classifiers in
    ``constants.CLASSIFIERS_FOR_ENSEMBLE``, combined by a meta-layer
    (`mlens` ``SuperLearner``).
    """
    super(StackedEnsembleClassifier, self).__init__()

    kwargs = {**constants.STACKED_ENSEMBLE_PARAMS, **kwargs}

    self.num_features = num_features
    self.num_folds = kwargs.pop('folds', 2)
    self.meta_layer = kwargs.pop('meta_layer')

    def init_estimators(num_features):
        estimators = []
        for clf in constants.CLASSIFIERS_FOR_ENSEMBLE:
            model = utils.init_model(
                clf, num_features=num_features, **kwargs
            )
            estimators.append((clf, model.kernel))
        return estimators

    self.kernel = SuperLearner(verbose=2, n_jobs=1, folds=self.num_folds)

    # First layer of base classifiers
    l1_estimators = init_estimators(self.num_features)
    self.kernel.add(l1_estimators, proba=True)

    # Second layer, fed with the first layer's probability outputs
    l2_estimators = init_estimators(len(l1_estimators) * self.num_folds)
    self.kernel.add(l2_estimators, proba=True)

    # Meta-layer on top
    self.kernel.add_meta(
        utils.init_model(
            self.meta_layer, len(l2_estimators) * self.num_folds, **kwargs
        ).kernel,
        proba=True,
    )
def _train(classifier, feature_vectors, positive_samples_index, **kwargs):
    model = utils.init_model(classifier, feature_vectors.shape[1], **kwargs)

    LOGGER.info('Training a %s ...', classifier)
    model.fit(feature_vectors, positive_samples_index)
    LOGGER.info('Training done')

    return model
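# Illustrative sketch, not part of soweego: the shape of the inputs _train
# expects. feature_vectors is a DataFrame of comparison features indexed by
# candidate (QID, target catalog ID) pairs, and positive_samples_index is the
# subset of that index known to be true matches. The identifiers, feature
# names, and the 'naive_bayes' classifier name below are made up for
# demonstration only.
import pandas as pd

pairs = pd.MultiIndex.from_tuples(
    [('Q1', 'tgt1'), ('Q2', 'tgt2'), ('Q3', 'tgt9')],
    names=['qid', 'tid'],
)
feature_vectors = pd.DataFrame(
    {'name_similarity': [0.9, 0.2, 0.8], 'date_match': [1.0, 0.0, 1.0]},
    index=pairs,
)
# Only the pairs that are known matches
positive_samples_index = pd.MultiIndex.from_tuples(
    [('Q1', 'tgt1'), ('Q3', 'tgt9')], names=['qid', 'tid']
)
# _train('naive_bayes', feature_vectors, positive_samples_index) would then
# fit a model on these vectors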
def _nested_k_fold_with_grid_search(
    classifier, param_grid, catalog, entity, k, scoring, dir_io, **kwargs
):
    """Nested cross-validation: an inner stratified k-fold drives the grid
    search, an outer one yields the test scores. The best model of every
    outer fold is dumped to disk.
    """
    dataset, positive_samples_index = train.build_training_set(
        catalog, entity, dir_io
    )
    model = utils.init_model(classifier, dataset.shape[1], **kwargs).kernel

    inner_k_fold, target = utils.prepare_stratified_k_fold(
        k, dataset, positive_samples_index
    )
    outer_k_fold = StratifiedKFold(n_splits=k, shuffle=True, random_state=1269)

    grid_search = GridSearchCV(
        model,
        param_grid,
        scoring=scoring,
        n_jobs=-1,
        cv=inner_k_fold,
        verbose=1,
    )

    result = []
    dataset = dataset.to_numpy()

    for fold, (train_index, test_index) in enumerate(
        outer_k_fold.split(dataset, target), 1
    ):
        # Run grid search
        grid_search.fit(dataset[train_index], target[train_index])

        # Let grid search compute the test score
        test_score = grid_search.score(
            dataset[test_index], target[test_index]
        )

        # No reason to keep trained models in memory.
        # We will instead just dump them to a file and keep the path
        best_model = grid_search.best_estimator_
        model_path = os.path.join(
            dir_io,
            constants.LINKER_NESTED_CV_BEST_MODEL.format(
                catalog, entity, classifier, fold
            ),
        )
        joblib.dump(best_model, model_path)

        LOGGER.info("Best model for fold %d dumped to '%s'", fold, model_path)

        # Grid search best score is the train score
        result.append(
            {
                f'train_{scoring}': grid_search.best_score_,
                f'test_{scoring}': test_score,
                'best_model': model_path,
                'params': grid_search.best_params_,
            }
        )

    return result
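# Illustrative sketch, not part of soweego: the nested cross-validation
# pattern used above (inner CV for hyperparameter tuning, outer CV for an
# unbiased test score), reduced to plain scikit-learn on synthetic data.
# The classifier and parameter grid are assumptions for demonstration only.
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.svm import SVC

X, y = make_classification(n_samples=300, n_features=10, random_state=0)

inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)
outer_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=2)

grid_search = GridSearchCV(
    SVC(), {'C': [0.1, 1, 10]}, scoring='f1', cv=inner_cv, n_jobs=-1
)

scores = []
for fold, (train_index, test_index) in enumerate(outer_cv.split(X, y), 1):
    # Inner CV: pick the best hyperparameters on the outer training split
    grid_search.fit(X[train_index], y[train_index])
    # Outer CV: score the refitted best model on the held-out split
    test_score = grid_search.score(X[test_index], y[test_index])
    scores.append(
        {
            'fold': fold,
            'train_f1': grid_search.best_score_,
            'test_f1': test_score,
            'params': grid_search.best_params_,
        }
    )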
def __init__(self, num_features, **kwargs):
    super(VotingClassifier, self).__init__()

    kwargs = {**constants.VOTING_CLASSIFIER_PARAMS, **kwargs}
    voting = kwargs.pop('voting')
    self.num_features = num_features

    estimators = []
    for clf in constants.CLASSIFIERS_FOR_ENSEMBLE:
        model = utils.init_model(clf, num_features=num_features, **kwargs)
        estimators.append((clf, model.kernel))

    # use as kernel the VotingClassifier coming from sklearn
    self.kernel = SKVotingClassifier(
        estimators=estimators, voting=voting, n_jobs=None
    )
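# Illustrative sketch, not part of soweego: the scikit-learn VotingClassifier
# wrapped above (imported there as SKVotingClassifier), with stand-in
# estimators. Soft voting averages the estimators' predicted probabilities;
# hard voting takes a majority vote on the predicted labels. Estimators and
# data are assumptions for demonstration only.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

X, y = make_classification(n_samples=300, n_features=10, random_state=0)

voter = VotingClassifier(
    estimators=[
        ('naive_bayes', GaussianNB()),
        ('logistic_regression', LogisticRegression(max_iter=1000)),
        ('random_forest', RandomForestClassifier(n_estimators=50)),
    ],
    voting='soft',
    n_jobs=None,
)
voter.fit(X, y)
probabilities = voter.predict_proba(X)  # available because voting='soft'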
def _grid_search(
    k: int,
    feature_vectors: pd.DataFrame,
    positive_samples_index: pd.MultiIndex,
    classifier: str,
    **kwargs,
) -> Dict:
    k_fold, target = utils.prepare_stratified_k_fold(
        k, feature_vectors, positive_samples_index
    )
    model = utils.init_model(classifier, feature_vectors.shape[1], **kwargs)

    grid_search = GridSearchCV(
        model.kernel,
        constants.PARAMETER_GRIDS[classifier],
        scoring='f1',
        n_jobs=-1,
        cv=k_fold,
    )
    grid_search.fit(feature_vectors.to_numpy(), target)

    return grid_search.best_params_
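# Illustrative sketch, not part of soweego: what an entry of
# constants.PARAMETER_GRIDS consumed by _grid_search could look like.
# The grid below is a hypothetical example for a random forest,
# not soweego's actual configuration.
HYPOTHETICAL_PARAMETER_GRIDS = {
    'random_forest': {
        'n_estimators': [100, 300, 500],
        'max_depth': [None, 10, 50],
        'min_samples_leaf': [1, 5],
    },
}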
def _average_k_fold(classifier, catalog, entity, k, dir_io, **kwargs):
    """Run a stratified k-fold evaluation and average the per-fold
    precision, recall, and F-score.
    """
    predictions, precisions, recalls, f_scores = None, [], [], []

    dataset, positive_samples_index = train.build_training_set(
        catalog, entity, dir_io
    )
    k_fold, binary_target_variables = utils.prepare_stratified_k_fold(
        k, dataset, positive_samples_index
    )

    for train_index, test_index in k_fold.split(
        dataset, binary_target_variables
    ):
        training, test = dataset.iloc[train_index], dataset.iloc[test_index]

        model = utils.init_model(classifier, dataset.shape[1], **kwargs)
        model.fit(training, positive_samples_index & training.index)

        preds = model.predict(test)
        K.clear_session()  # Free memory

        p, r, f, _ = _compute_performance(
            positive_samples_index & test.index, preds, len(test)
        )

        if predictions is None:
            predictions = preds
        else:
            predictions |= preds

        precisions.append(p)
        recalls.append(r)
        f_scores.append(f)

    return (
        predictions,
        mean(precisions),
        std(precisions),
        mean(recalls),
        std(recalls),
        mean(f_scores),
        std(f_scores),
    )
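# Illustrative sketch, not part of soweego: the same "average" strategy as
# _average_k_fold, i.e. score each fold separately and report mean and
# standard deviation per metric, here via scikit-learn's cross_validate on
# synthetic data. Classifier and data are assumptions for demonstration only.
from numpy import mean, std
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_validate

X, y = make_classification(n_samples=400, n_features=10, random_state=0)

fold_scores = cross_validate(
    LogisticRegression(max_iter=1000),
    X,
    y,
    scoring=['precision', 'recall', 'f1'],
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=0),
)
summary = {
    metric: (
        mean(fold_scores[f'test_{metric}']),
        std(fold_scores[f'test_{metric}']),
    )
    for metric in ('precision', 'recall', 'f1')
}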