def test_pipeline_raise_set_params_error():
    """Check the message raised by ``Pipeline.set_params`` for bad parameters."""
    pipeline = Pipeline([('cls', LinearRegression())])

    # Template of the message expected for an unknown parameter name.
    template = ('Invalid parameter %s for estimator %s. '
                'Check the list of available parameters '
                'with `estimator.get_params().keys()`.')

    # Unknown parameter set directly on the pipeline.
    top_level_message = template % ('fake', 'Pipeline')
    assert_raise_message(ValueError, top_level_message,
                         pipeline.set_params, fake='nope')

    # Unknown nested parameter: the message is formatted with the pipeline
    # object itself rather than with its class name.
    nested_message = template % ("fake", pipeline)
    assert_raise_message(ValueError, nested_message,
                         pipeline.set_params, fake__estimator='nope')
def Create_XGBoost_Model():
    """Build an unfitted grid search over an under-sampling + XGBoost pipeline.

    The pipeline runs a ``RandomUnderSampler`` followed by an
    ``XGBClassifier``. The search covers learning rate, number of estimators
    and maximum depth, using a shuffled ``KFold`` with the module-level
    ``N_split`` number of folds.

    Returns:
        The configured (not yet fitted) ``GridSearchCV`` object.
    """
    estimator = Pipeline([
        ('under', RandomUnderSampler()),
        ('xgbclassifier', xgb.XGBClassifier()),
    ])
    folds = KFold(n_splits=N_split, shuffle=True, random_state=4)

    # Keys are prefixed with the pipeline step name of the classifier.
    search_space = {
        'xgbclassifier__learning_rate': [0.1, 0.05, 0.01],
        'xgbclassifier__n_estimators': [10, 20, 30, *range(45, 100, 5)],
        'xgbclassifier__max_depth': [2 ** exponent for exponent in range(1, 7)],
    }

    return GridSearchCV(
        estimator=estimator,
        param_grid=search_space,
        cv=folds,
        n_jobs=-1,
        verbose=4,
    )
def __init__(self, model_file: str = None) -> None:
    """Create the text-classification pipeline stored on ``self.pipeline``.

    The pipeline counts tokens, applies TF-IDF weighting, over-samples with
    SMOTE and finishes with a multinomial logistic regression.

    Args:
        model_file: Accepted for the caller's convenience; this initializer
            does not read it.
    """
    super().__init__()

    # scikit-learn and imbalanced-learn are imported here rather than at
    # module level.
    from imblearn.over_sampling import SMOTE
    from imblearn.pipeline import Pipeline
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    from sklearn.linear_model import LogisticRegression

    classifier = LogisticRegression(
        solver='newton-cg',
        multi_class='multinomial',
        random_state=42,
        max_iter=100,
    )
    stages = [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('smote', SMOTE()),
        ('clf', classifier),
    ]
    self.pipeline = Pipeline(stages)
def balanceSampling(X_tr, y_train, up_ratio=1, dn_ratio=1):
    """Rebalance a training set with SMOTE followed by random under-sampling.

    Parameters
    ----------
    X_tr : training features handed to ``fit_resample``
    y_train : training labels handed to ``fit_resample``
    up_ratio : ``sampling_strategy`` passed to ``SMOTE``
    dn_ratio : ``sampling_strategy`` passed to ``RandomUnderSampler``

    Returns
    -------
    tuple
        The resampled features and labels.
    """
    resampler = Pipeline(steps=[
        ('over', SMOTE(sampling_strategy=up_ratio)),
        ('under', RandomUnderSampler(sampling_strategy=dn_ratio)),
    ])
    X_train_sm, y_train_sm = resampler.fit_resample(X_tr, y_train)

    # Report the resulting shapes so the effect of the ratios is visible.
    print(X_train_sm.shape, y_train_sm.shape)
    return X_train_sm, y_train_sm
def _train(X, y, save_model):
    """Grid-search a bag-of-words + TF-IDF + SGD classifier and optionally save it.

    A 10-fold stratified, shuffled split is used to score every parameter
    combination with micro-averaged F1. The best parameters and score are
    printed.

    Args:
        X: Input documents, passed to the ``CountVectorizer`` step.
        y: Target labels.
        save_model: When truthy, the best estimator is pickled to the path
            built by ``Utilities.construct_filepath`` from the module-level
            ``out_dir``, ``category`` and ``label``.
    """
    from imblearn.over_sampling import SMOTE

    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    # Pipeline items
    cv = CountVectorizer()  # bag of words
    tfidf = TfidfTransformer(use_idf=True)  # TF-IDF weighting
    # NOTE: the up-sampler is built but not part of the pipeline below; it is
    # only needed when an ('upsampling', upsampling) step is inserted before
    # the classifier.
    upsampling = SMOTE(k_neighbors=9)
    svm = SGDClassifier(penalty='l2', loss='modified_huber')

    parameters = {
        'cv__stop_words': ('english', None),  # remove English stop words or not
        'cv__max_df': (0.8, 0.9, 0.85, 0.95),  # ignore terms above this document frequency
        'svm__alpha': (1e-3, 1e-4),  # regularization strength
        'svm__max_iter': (5000, 10000),  # maximum number of iterations
        'svm__tol': (1e-4, 1e-3, 1e-2),  # stopping tolerance
    }

    # Training
    text_clf_svm = Pipeline([('cv', cv), ('tfidf', tfidf), ('svm', svm)])
    gs_clf = GridSearchCV(text_clf_svm, parameters, n_jobs=-1,
                          cv=skf.split(X, y), scoring='f1_micro')
    gs_clf = gs_clf.fit(X, y)

    # print() calls (not Python 2 print statements) so the module parses
    # under Python 3; the emitted text is unchanged.
    print("Best Parameter : ", gs_clf.best_params_)
    print("F1 Score : ", gs_clf.best_score_)
    print("============================================")

    # Saving model
    if save_model:
        saving_path = Utilities.construct_filepath(out_dir, [category, label], ".model")
        # The context manager guarantees the file is flushed and closed.
        with open(saving_path, 'wb') as model_file:
            pickle.dump(gs_clf.best_estimator_, model_file)
def confusion_matrix(data, target, category, clf, class_names, title):
    """Fit ``clf`` behind a robust scaler and save its normalized confusion matrix.

    The last 10% of the samples (no shuffling) are held out as the test set.
    The plot is written as an EPS file under ``../results/plots/``.

    Args:
        data (numpy.ndarray): Data samples
        target (numpy.ndarray): Data labels (target variable values)
        category (str): Type of prediction being made; used in the output
            file name.
        clf (object): Classifier for which to plot the confusion matrix.
        class_names (list): Labels shown on the plot axes
        title (str): Plot title; lower-cased with spaces replaced by
            underscores to form part of the output file name.
    """
    # Hold out the tail of the data for evaluation.
    data_train, data_test, target_train, target_test = train_test_split(
        data, target, shuffle=False, test_size=0.1)

    # Wrap the supplied classifier so it is always fed scaled features.
    clf_eval = Pipeline([('scaling', RobustScaler()), ('clf', clf)])
    clf_eval.fit(data_train, target_train)

    np.set_printoptions(precision=2)

    # Rows are normalized over the true labels.
    disp = metrics.plot_confusion_matrix(
        clf_eval,
        data_test,
        target_test,
        display_labels=class_names,
        cmap=plt.cm.Blues,
        normalize='true',
        xticks_rotation='vertical',
    )

    # UNCOMMENT TO SET TITLE.
    # disp.ax_.set_title("Normalized Confusion Matrix - " + title)
    disp.figure_.set_size_inches(9.0, 9.0, forward=True)
    plt.tight_layout()

    title_slug = title.lower().replace(' ', '_')
    plt.savefig('../results/plots/cfm_' + category + '_' + title_slug + '.eps')

    # Release the figure so later plots start from a clean state.
    plt.clf()
    plt.close()
def createPipeline(model, oversampler_type, *args, **kwargs):
    """Assemble an over-sampling + (optional feature selection) + classifier pipeline.

    Args:
        model: One of "LogReg", "kNN", "SVM", "DT", "RF", "GB", "XGBoost",
            "LightGBM".
        oversampler_type: One of "SMOTE", "SVMSMOTE", "RandomOverSampler".
        *args: Accepted but not used.
        **kwargs: Only ``feature_selector`` is read; when present it is
            inserted as the "fs" step between sampling and the classifier.

    Returns:
        A ``Pipeline`` with steps "sampling", optionally "fs", and "clf".

    Raises:
        ValueError: If ``oversampler_type`` or ``model`` is not supported.
    """
    # The over-sampler is validated before the model.
    if oversampler_type == "SMOTE":
        sampler_cls = SMOTE
    elif oversampler_type == "SVMSMOTE":
        sampler_cls = SVMSMOTE
    elif oversampler_type == "RandomOverSampler":
        sampler_cls = RandomOverSampler
    else:
        raise ValueError("RAPIDS pipeline only supports 'SMOTE', 'SVMSMOTE' and 'RandomOverSampler' oversampling methods.")
    oversampler = sampler_cls(sampling_strategy="minority", random_state=0)

    # Each classifier library is imported only when that model is requested.
    if model == "LogReg":
        from sklearn.linear_model import LogisticRegression
        estimator = LogisticRegression(random_state=0)
    elif model == "kNN":
        from sklearn.neighbors import KNeighborsClassifier
        estimator = KNeighborsClassifier()
    elif model == "SVM":
        from sklearn.svm import SVC
        estimator = SVC(random_state=0, probability=True)
    elif model == "DT":
        from sklearn.tree import DecisionTreeClassifier
        estimator = DecisionTreeClassifier(random_state=0)
    elif model == "RF":
        from sklearn.ensemble import RandomForestClassifier
        estimator = RandomForestClassifier(random_state=0)
    elif model == "GB":
        from sklearn.ensemble import GradientBoostingClassifier
        estimator = GradientBoostingClassifier(random_state=0)
    elif model == "XGBoost":
        from xgboost import XGBClassifier
        estimator = XGBClassifier(random_state=0, n_jobs=6)
    elif model == "LightGBM":
        from lightgbm import LGBMClassifier
        estimator = LGBMClassifier(objective="binary", random_state=0, n_jobs=6)
    else:
        raise ValueError("RAPIDS pipeline only supports LogReg, kNN, SVM, DT, RF, GB, XGBoost, and LightGBM algorithms for classification problems.")

    steps = [("sampling", oversampler)]
    if "feature_selector" in kwargs:
        steps.append(("fs", kwargs["feature_selector"]))
    steps.append(("clf", estimator))

    return Pipeline(steps)
def model_training(self):
    """Train a SMOTE + CatBoost pipeline and persist the fitted model bundle.

    Training data is read through ``self.data.read_data(train=True)`` and
    prepared by a fresh ``PreProcessing`` instance. After fitting, the
    classifier step is extracted from the pipeline and dumped to
    ``../output/modelo.pkl`` together with the preprocessing object and its
    feature names.

    Returns:
        dict: ``{'model': fitted classifier, 'preprocessing': PreProcessing
        instance, 'columns': pre.feature_names}``.
    """
    pre = PreProcessing()

    print('Reading data')
    df = self.data.read_data(train=True)

    print('Starting training')
    X_train, y_train = pre.preprocess(df, train=True)

    print('Starting training model')
    # The classifier is created once, inside the pipeline, so that the
    # over-sampling is applied as part of fitting.
    steps = [('over', SMOTE()), ('model', CatBoostClassifier())]
    pipeline = Pipeline(steps=steps)
    pipeline.fit(X_train, y_train)

    # Only the fitted classifier step is kept in the bundle; the sampler
    # step is not stored.
    modelo = pipeline['model']
    model = {
        'model': modelo,
        'preprocessing': pre,
        'columns': pre.feature_names
    }
    print(model)
    dump(model, '../output/modelo.pkl')
    return model
def _validate_estimator(self, default=DecisionTreeClassifier()):
    """Validate ``n_estimators`` and build the ``base_estimator_`` pipeline.

    ``base_estimator_`` is set to a pipeline made of a ``RandomUnderSampler``
    (configured from ``sampling_strategy``, ``replacement`` and ``ratio``)
    followed by a clone of ``self.base_estimator``, or of ``default`` when no
    base estimator was given.

    Raises:
        ValueError: If ``n_estimators`` is not an integer or is not positive.
    """
    n_estimators = self.n_estimators
    if not isinstance(n_estimators, (numbers.Integral, np.integer)):
        raise ValueError(
            "n_estimators must be an integer, got {0}.".format(type(n_estimators)))
    if n_estimators <= 0:
        raise ValueError(
            "n_estimators must be greater than zero, got {0}.".format(n_estimators))

    # Always work on a clone so neither the user's estimator nor the shared
    # default instance is modified.
    prototype = self.base_estimator if self.base_estimator is not None else default
    classifier = clone(prototype)

    sampler = RandomUnderSampler(
        sampling_strategy=self.sampling_strategy,
        replacement=self.replacement,
        ratio=self.ratio,
    )
    self.base_estimator_ = Pipeline([('sampler', sampler),
                                     ('classifier', classifier)])
def ensemble_pipe(self, pipes):
    """Wrap several pipes in a single voting-ensemble pipeline.

    Each pipe becomes a named estimator (``p0``, ``p1``, ...) of a
    ``VotingRegressor`` when ``self.model_obj == 'reg'`` or of a
    ``VotingClassifier`` (default voting settings) when
    ``self.model_obj == 'class'``.

    Args:
        pipes (list): Pipes to combine in the voting ensemble.

    Returns:
        Pipeline: Pipeline with a single ``'ensemble'`` step holding the
        voting estimator.

    Raises:
        ValueError: If ``self.model_obj`` is neither ``'reg'`` nor ``'class'``.
    """
    # Fail with a clear message instead of hitting an unbound local below.
    if self.model_obj not in ('reg', 'class'):
        raise ValueError(
            f"Unsupported model_obj {self.model_obj!r}; expected 'reg' or 'class'.")

    ests = [(f'p{i}', p) for i, p in enumerate(pipes)]

    if self.model_obj == 'reg':
        ensemble = VotingRegressor(estimators=ests)
    else:
        ensemble = VotingClassifier(estimators=ests)

    return Pipeline([('ensemble', ensemble)])
def under_sample_with_SMOTE(X, y):
    '''
    Rebalance the data: SMOTE over-sampling (strategy 0.1) followed by
    random under-sampling (strategy 0.5). The class counts are printed
    before and after resampling.
    :param X: features
    :param y: labels
    :return: the resampled (X, y)
    '''
    # class distribution before resampling
    print(collections.Counter(y))

    resampler = Pipeline(steps=[
        ('o', SMOTE(sampling_strategy=0.1)),
        ('u', RandomUnderSampler(sampling_strategy=0.5)),
    ])
    X, y = resampler.fit_resample(X, y)

    # class distribution after resampling
    print(collections.Counter(y))
    return X, y
def test_pipeline_sample_transform():
    # Smoke test: a pipeline with a sampler placed between two transformers
    # can be fitted and then used to transform the data.
    X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                               n_informative=3, n_redundant=1, flip_y=0,
                               n_features=20, n_clusters_per_class=1,
                               n_samples=5000, random_state=0)

    stages = [
        ('pca', PCA()),
        ('rus', RandomUnderSampler(random_state=0)),
        ('pca2', PCA()),
    ]
    fitted = Pipeline(stages).fit(X, y)
    fitted.transform(X)
def Create_XGBoost_Model():
    """Build an unfitted, ROC-AUC-scored grid search over a SMOTE + XGBoost pipeline.

    The pipeline runs ``SMOTE`` followed by a binary-logistic
    ``XGBClassifier``. The search covers learning rate, number of estimators
    and maximum depth, using a shuffled 5-fold ``KFold``.

    Returns:
        The configured (not yet fitted) ``GridSearchCV`` object.
    """
    classifier = xgb.XGBClassifier(objective='binary:logistic',
                                   eval_metric='logloss')
    estimator = Pipeline([('sample', SMOTE()), ('xgbclassifier', classifier)])
    folds = KFold(n_splits=5, shuffle=True, random_state=4)

    # Keys are prefixed with the pipeline step name of the classifier.
    search_space = {
        'xgbclassifier__learning_rate': [0.1, 0.05, 0.01],
        'xgbclassifier__n_estimators': [10, 20, 30, *range(45, 100, 5)],
        'xgbclassifier__max_depth': [2 ** exponent for exponent in range(1, 7)],
    }

    return GridSearchCV(
        estimator=estimator,
        param_grid=search_space,
        scoring='roc_auc',
        cv=folds,
        n_jobs=-1,
        verbose=4,
    )
def _build_pipeline(self): """ Built the classifier pipeline. Returns: clf: (Pipeline) Pipeline. """ # assign appropriate classifier if self.classifier.lower() == 'dummyclassifier': clf = DummyClassifier(strategy='most_frequent') elif self.classifier.lower() == 'decisiontreeclassifier': clf = DecisionTreeClassifier( **self.parameters if not self.mode == 'grid' else {}) elif self.classifier.lower() == 'gaussiannb': clf = GaussianNB() elif self.classifier.lower() == 'multinomialnb': clf = MultinomialNB() elif self.classifier.lower() == 'svc': clf = SVC(probability=True, **self.parameters) elif self.classifier.lower() == 'adaboostclassifier': clf = AdaBoostClassifier( **self.parameters if not self.mode == 'grid' else {}) elif self.classifier.lower() == 'randomforestclassifier': clf = RandomForestClassifier( n_jobs=-1, **self.parameters if not self.mode == 'grid' else {}) elif self.classifier.lower() == 'mlpclassifier': clf = MLPClassifier( max_iter=3000, **self.parameters if not self.mode == 'grid' else {}) else: raise ValueError('Invalid classifier: {}'.format(self.classifier)) log.info('Selected classifier: %s', self.classifier) log.debug('Classifier info: %s', clf) # SMOTE over-sample smote = SMOTE(sampling_strategy='minority') clf = Pipeline([('SMOTE', smote), (self.classifier, clf)]) return clf
def check_oversamplers_classifiers(oversamplers, classifiers, n_runs, random_state):
    """Extract estimators and parameters grids.

    Every (oversampler, classifier, run) combination yields one named
    ``Pipeline`` and one parameter grid. Grid keys are prefixed with the
    estimator name, and each grid also carries ``est_name`` and
    ``random_state`` entries.

    Returns a dict with keys ``'estimators'`` and ``'param_grids'``.
    """
    def prefixed_grid(entry):
        # An entry is (name, estimator[, param_grid]); the grid is optional.
        if len(entry) > 2:
            return {('%s__%s' % (entry[0], par)): val
                    for par, val in entry[2].items()}
        return {}

    run_ids = range(n_runs)

    # Extract estimators
    sampler_pairs = [smpl[0:2] for smpl in oversamplers]
    classifier_pairs = [clf[0:2] for clf in classifiers]
    estimators = []
    for (smpl_name, smpl), (clf_name, clf), run_id in product(
            sampler_pairs, classifier_pairs, run_ids):
        label = '%s|%s_%s' % (smpl_name, clf_name, run_id)
        estimators.append(
            (label, Pipeline([(smpl_name, smpl), (clf_name, clf)])))

    # Extract parameters grids, iterated in the same product order as the
    # estimators so they can be zipped together below.
    sampler_grids = [prefixed_grid(smpl) for smpl in oversamplers]
    classifier_grids = [prefixed_grid(clf) for clf in classifiers]
    grid_combinations = product(sampler_grids, classifier_grids, run_ids)

    seeds = check_random_states(random_state, len(estimators))
    est_names, _ = zip(*estimators)

    param_grids = []
    for (sampler_grid, classifier_grid, run_id), seed, est_name in zip(
            grid_combinations, seeds, est_names):
        merged = dict(sampler_grid)
        merged.update(classifier_grid)
        grid = {('%s__%s' % (est_name, par)): val
                for par, val in merged.items()}
        grid['est_name'] = [est_name]
        grid['random_state'] = [seed]
        param_grids.append(grid)

    return {'estimators': estimators, 'param_grids': param_grids}
def hyper_paramytize_optimization(f):
    """Grid-search a SMOTE + XGBoost pipeline on the module-level X, y and log to ``f``.

    Writes the class-ratio estimate, the parameter grid, the fitted search
    object, the best score/parameters and every evaluated configuration to
    the file-like object ``f``.
    """
    def emit(*values):
        # All reporting goes to the supplied file object.
        print(*values, file=f)

    emit("model with no experience with Smote STSRCOM")
    emit("--------------------------------------------------------------------")

    # estimate scale_pos_weight value from the class counts
    counter = Counter(y)
    estimate = counter[0] / counter[1]
    emit('Estimate: %.3f' % estimate)
    emit(counter[0])
    emit(counter[1])

    model = XGBClassifier(objective='binary:logistic', eval_metric='logloss')
    # Built but not used by the SMOTE pipeline below; only needed for an
    # under-sampling variant of the pipeline.
    random = RandomUnderSampler(sampling_strategy=0.33)

    # define grid
    # Candidate scale_pos_weight values; not part of the grid searched here.
    weights = [1, 10, 25, 50, 75, 99, 100, 1000]
    param_grid = {
        'xgbclassifier__max_depth': [1, 2, 3, 5, 8, 10, 14, 18],
        'xgbclassifier__learning_rate': [0.1, 0.05, 0.01],
        'xgbclassifier__n_estimators': range(60, 220, 40),
    }
    emit(param_grid)

    # define evaluation procedure and grid search
    cv = StratifiedKFold(n_splits=10)
    pipeline = Pipeline([('sample', SMOTE()), ('xgbclassifier', model)])
    grid = GridSearchCV(estimator=pipeline, param_grid=param_grid,
                        n_jobs=-1, cv=cv, scoring='roc_auc')

    # execute the grid search
    grid_result = grid.fit(X, y)

    # report the best configuration
    emit(grid_result)
    emit("Best: %f using %s" % (grid_result.best_score_,
                                grid_result.best_params_))

    # report all configurations
    results = grid_result.cv_results_
    rows = zip(results['mean_test_score'],
               results['std_test_score'],
               results['params'])
    for mean, stdev, param in rows:
        emit("%f (%f) with: %r" % (mean, stdev, param))
def SMOTE_Analysis(k, o, u):
    """Score a decision tree on the module-level X, y after SMOTE + under-sampling.

    The data is resampled once with ``SMOTE(sampling_strategy=o,
    k_neighbors=k)`` followed by ``RandomUnderSampler(sampling_strategy=u)``.
    A ``DecisionTreeClassifier`` is then evaluated on the resampled data with
    repeated stratified 10-fold cross-validation (3 repeats) using ROC AUC,
    and the mean score is printed.

    Args:
        k: ``k_neighbors`` for SMOTE.
        o: Over-sampling ``sampling_strategy``.
        u: Under-sampling ``sampling_strategy``.

    Returns:
        ``[k, o, u]`` when the evaluation ran, or ``""`` when any step raised.
        A ``RuntimeWarning`` describing the failure is emitted in the latter
        case.
    """
    import warnings

    try:
        model = DecisionTreeClassifier()
        over = SMOTE(sampling_strategy=o, k_neighbors=k, random_state=2)
        under = RandomUnderSampler(sampling_strategy=u)
        steps = [('over', over), ('under', under)]
        pipeline = Pipeline(steps=steps)
        Xn, yn = pipeline.fit_resample(X, y.ravel())
        cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
        scores = cross_val_score(model, Xn, yn, scoring='roc_auc', cv=cv,
                                 n_jobs=-1)
        score = np.mean(scores)
        print("k={}, over={}, under={}, Mean ROC AUC: {:.3f}".format(
            k, o, u, score))
        return [k, o, u]
    except Exception as e:
        # Best-effort: an invalid parameter combination must not abort the
        # caller, but the reason is surfaced instead of silently dropped.
        warnings.warn(
            "SMOTE_Analysis failed for k={}, over={}, under={}: {!r}".format(
                k, o, u, e),
            RuntimeWarning)
        return ""
def tune_model_hyperparameters(self):
    """Grid-search the forest size of a SMOTE + random-forest pipeline.

    The search is fitted on ``self._X_pca_train`` / ``self._y_train`` with a
    shuffled 10-fold stratified split, then the refit best model predicts
    ``self._X_pca_test``.

    Returns:
        tuple: The predictions and the confusion matrix against
        ``self._y_test``.
    """
    pipeline = Pipeline([('resample', SMOTE()),
                         ('model', RandomForestClassifier())])
    search = GridSearchCV(
        estimator=pipeline,
        param_grid={'model__n_estimators': [50, 100, 200]},
        cv=StratifiedKFold(n_splits=10, shuffle=True),
        refit=True,
    )
    search.fit(self._X_pca_train, self._y_train)

    # ``search.best_estimator_`` holds the refit pipeline, should its fitted
    # steps be needed.
    prediction = search.predict(self._X_pca_test)
    return prediction, confusion_matrix(self._y_test, prediction)
def resampling(self, oversample_ratio=0.3, minority_num=368, majority_num=10000, minority_label='1.0', majority_label='0.0'):
    """Under-sample to fixed class counts, then SMOTE, on ``self.X`` / ``self.y``.

    Args:
        oversample_ratio: ``sampling_strategy`` passed to SMOTE.
        minority_num: Target sample count for ``minority_label`` in the
            under-sampling step.
        majority_num: Target sample count for ``majority_label`` in the
            under-sampling step.
        minority_label: Label of the minority class.
        majority_label: Label of the majority class.

    Returns:
        tuple: The resampled features and labels.
    """
    target_counts = {
        majority_label: majority_num,
        minority_label: minority_num,
    }
    stages = [
        ('u', RandomUnderSampler(sampling_strategy=target_counts)),
        ('o', SMOTE(sampling_strategy=oversample_ratio)),
    ]
    X_sm, y_sm = Pipeline(steps=stages).fit_resample(self.X, self.y)

    print('Proportion in data after resample: ', Counter(y_sm))
    return X_sm, y_sm
def syntetic_sampling(X, y, over_sampling, under_sampling):
    """
    Rebalance an unbalanced data set: Synthetic Minority Oversampling
    Technique (SMOTE) followed by random under-sampling.

    :param X: Training features
    :param y: Training labels
    :param over_sampling: ``sampling_strategy`` for SMOTE
    :param under_sampling: ``sampling_strategy`` for RandomUnderSampler
    :return: resampled features and labels
    :rtype: tuple
    """
    resampler = Pipeline(steps=[
        ('o', SMOTE(sampling_strategy=over_sampling)),
        ('u', RandomUnderSampler(sampling_strategy=under_sampling)),
    ])
    return resampler.fit_resample(X, y)
def test_fit_predict_on_pipeline():
    # fit_predict on a pipeline must give the same labels as running the
    # scaling and clustering steps by hand.
    iris = load_iris()

    # Reference result: scale, then cluster, as two separate steps.
    scaled = StandardScaler().fit_transform(iris.data)
    separate_pred = KMeans(random_state=0).fit_predict(scaled)

    # The pipeline gets its own fresh estimators because it does not clone
    # the ones it is given.
    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("Kmeans", KMeans(random_state=0)),
    ])
    pipeline_pred = pipe.fit_predict(iris.data)

    assert_array_almost_equal(pipeline_pred, separate_pred)
def training_imbalance(descr_series, classes_codes, TFIDF_, IMB_, FS_, req_percentage, CLF_, model_path):
    """
    Trains a model using the handled settings and saves it as a .sav object.

    The pipeline is: vectorizer -> imbalance handler -> percentile feature
    selection -> classifier.

    Parameters:
        descr_series(Series): description series;
        classes_codes(Series): series with classes' codes;
        TFIDF_: vectorizer;
        IMB_: SMOTE method;
        FS_: ranking terms method (score function for SelectPercentile);
        req_percentage(int): percentage to be taken from the ranked list;
        CLF_: classifier;
        model_path(str): the path to the model, without the '.sav' suffix.
    """
    transformer = feature_selection.SelectPercentile(FS_)
    clf_model = Pipeline([('tfidf', TFIDF_),
                          ('imba', IMB_),
                          ('fs', transformer),
                          ('clf', CLF_)])
    clf_model.set_params(fs__percentile=req_percentage).fit(
        descr_series, classes_codes)

    # The context manager guarantees the model file is flushed and closed.
    with open(model_path + '.sav', 'wb') as model_file:
        dump(clf_model, model_file)
def split_smote(drug_df, drug_name):
    """Split into train/test sets and rebalance only the training part.

    The column ``drug_name`` is the target; all other columns are features.
    A stratified 70/30 split is made, then the training set is resampled
    with SMOTE (strategy 0.1) followed by random under-sampling (strategy
    0.5). Class counts are printed before and after.

    Returns:
        tuple: Resampled training features (as a DataFrame with the original
        column names), resampled training labels, test features, test labels.
    """
    y = drug_df[drug_name]
    X = drug_df.drop([drug_name], axis=1)
    print('Originally, the distribution of classes is: {}'.format(Counter(y)))

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.30, random_state=42, stratify=y)

    # Resampling is fitted on the training split only; the test split is
    # returned untouched.
    resampler = Pipeline(steps=[
        ('o', SMOTE(sampling_strategy=0.1)),
        ('u', RandomUnderSampler(sampling_strategy=0.5)),
    ])
    Xsm_train, ysm_train = resampler.fit_resample(X_train, y_train)

    print(
        'After SMOTE sampling, the distribution of classes in Training set is: {}'
        .format(Counter(ysm_train)))

    XSM_train = pd.DataFrame(Xsm_train, columns=X_train.columns)
    return XSM_train, ysm_train, X_test, y_test
def cvsmote():
    """Cross-validate a SMOTE + XGBoost pipeline on the module-level ``df_small``.

    The target is the ``Mortality`` column; identifier and outcome columns
    are dropped from the features. The pipeline is evaluated with 10-fold
    stratified cross-validation for accuracy and ROC AUC, and the mean and
    standard deviation of each score are printed.
    """
    X = df_small.drop(['HospID', 'SiteID', 'surgid', 'Complics', 'Mortality'],
                      axis=1)
    y = df_small['Mortality']

    steps = [('over', SMOTE()),
             ('model', XGBClassifier(objective='binary:logistic',
                                     eval_metric='logloss'))]
    pipeline = Pipeline(steps=steps)

    # random_state is deliberately not passed: it only has an effect together
    # with shuffle=True, and scikit-learn rejects the combination
    # random_state + shuffle=False with a ValueError. The folds stay
    # unshuffled, so one splitter serves both metrics.
    cv = StratifiedKFold(n_splits=10)

    # evaluate pipeline
    for scoring in ["accuracy", "roc_auc"]:
        scores = cross_val_score(pipeline, X, y, scoring=scoring, cv=cv,
                                 n_jobs=-1)
        print("Model", scoring, " mean=", scores.mean(), "stddev=",
              scores.std())
def model_select():
    """Train and evaluate the module-level ``modelo`` with every balancer.

    For each ``(name, balancer)`` pair in ``balanceadores`` that has not been
    run yet, a PCA -> balancer -> classifier pipeline is fitted on the full
    development set and evaluated on the test set: confusion matrices are
    plotted (raw and normalized), a classification report is printed and the
    ROC AUC helper is called with the predicted probabilities.
    """
    for nome_balanceador, balanceador in balanceadores:
        # Skip combinations whose results already exist.
        if classificador_ja_executado(nome, nome_balanceador):
            continue

        print(balanceador)
        pipeline = Pipeline([
            ('dimension', PCA(n_components=250)),
            ('balance', balanceador),
            ('clf', modelo),
        ])

        print("# Rodando o algoritmo %s" % nome)
        print()
        np.set_printoptions(precision=4)
        pipeline.fit(dados_completo_x, dados_completo_y)

        print("Detailed classification report:")
        print()
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        print()

        y_pred = pipeline.predict(test_x)
        matriz_confusao = confusion_matrix(test_y, y_pred)
        nome_arquivo = nome + '_' + nome_balanceador + '_best_mucilage'

        # One plot with raw counts, one normalized.
        variantes = (
            (False, 'Confusion matrix' + nome + ' (best parameters)'),
            (True, 'Confusion matrix ' + nome + ', normalized'),
        )
        for normalizar, titulo in variantes:
            plot_confusion_matrix(matriz_confusao, nome_arquivo,
                                  [1, 2, 3, 4], normalizar, title=titulo)

        print('Matriz de Confusão')
        print(matriz_confusao)
        print(classification_report(y_true=test_y, y_pred=y_pred, digits=4))

        y_scores = pipeline.predict_proba(test_x)
        roc_auc_aux(test_y, y_scores, nome, nome_balanceador)
        print()
        sys.stdout.flush()
def train_validate(model, preprocess, param_grid, X_train, y_train, metric='roc_auc', n_iter=20):
    """Tune a resample -> transform -> classifier pipeline and cross-validate the winner.

    Args:
        model: Final classifier of the pipeline.
        preprocess: Mapping with the ``'upsampling'`` and ``'transform'``
            pipeline steps.
        param_grid: Search spaces handed to ``BayesSearchCV``.
        X_train: Training features.
        y_train: Training labels.
        metric: Scoring name used for both the search and the validation.
        n_iter: Number of search iterations.

    Returns:
        tuple: The best estimator, its per-fold validation scores, and their
        mean rounded to 4 decimals.
    """
    steps = [
        ('upsampling', preprocess['upsampling']),
        ('transform', preprocess['transform']),
        ('classifier', model),
    ]
    search = BayesSearchCV(
        estimator=Pipeline(steps),
        search_spaces=param_grid,
        scoring=metric,
        n_iter=n_iter,
        n_jobs=-1,
    )
    fitted_search = search.fit(X_train, y_train)
    print("Parameters search completed!")

    best_model = fitted_search.best_estimator_

    # Re-evaluate the winner with repeated stratified folds.
    validation = cross_validate(
        best_model,
        X_train,
        y_train,
        cv=RepeatedStratifiedKFold(n_repeats=5),
        scoring=metric,
        n_jobs=-1,
    )
    print("Cross validation on best model completed!")

    best_model_scores = validation["test_score"]
    mean_valid_score = np.round(np.mean(best_model_scores), 4)
    print("Mean validation score: ", mean_valid_score)
    done()
    return best_model, best_model_scores, mean_valid_score
def hyper_paramitize_scale_gridSearch():
    """Grid-search an under-sampling + XGBoost pipeline on the module-level X, y.

    Prints the class-ratio estimate, the best score/parameters and every
    evaluated configuration. Depth and learning rate are searched with
    repeated stratified 5-fold cross-validation scored by ROC AUC.
    """
    # estimate scale_pos_weight value from the class counts
    counter = Counter(y)
    estimate = counter[0] / counter[1]
    print('Estimate: %.3f' % estimate)
    print(counter[0])
    print(counter[1])

    model = XGBClassifier(objective='binary:logistic', eval_metric='logloss')
    random = RandomUnderSampler(sampling_strategy=0.33)

    # define grid; keys are prefixed with the classifier's step name
    param_grid = {
        'xgbclassifier__max_depth': [1, 2, 3, 5, 8, 10],
        'xgbclassifier__learning_rate': [0.1, 0.05, 0.01],
    }

    # define evaluation procedure and grid search
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=1)
    pipeline = Pipeline([('under', random), ('xgbclassifier', model)])
    grid = GridSearchCV(estimator=pipeline, param_grid=param_grid,
                        n_jobs=-1, cv=cv, scoring='roc_auc')

    # execute the grid search
    grid_result = grid.fit(X, y)

    # report the best configuration
    print("Best: %f using %s" % (grid_result.best_score_,
                                 grid_result.best_params_))

    # report all configurations
    results = grid_result.cv_results_
    rows = zip(results['mean_test_score'],
               results['std_test_score'],
               results['params'])
    for mean, stdev, param in rows:
        print("%f (%f) with: %r" % (mean, stdev, param))
def test_pipeline_methods_anova_rus():
    # Exercise the prediction and scoring methods of a pipeline made of
    # RandomUnderSampler + ANOVA feature selection + LogisticRegression.
    X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                               n_informative=3, n_redundant=1, flip_y=0,
                               n_features=20, n_clusters_per_class=1,
                               n_samples=5000, random_state=0)

    stages = [
        ('rus', RandomUnderSampler(random_state=0)),
        ('anova', SelectKBest(f_classif, k=2)),
        ('logistic', LogisticRegression()),
    ]
    pipe = Pipeline(stages)
    pipe.fit(X, y)

    # Each call only has to run without raising.
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
def under_over_sample(X, y, under_samp_rate=0.15, over_samp_rate=0.75, random_state=42):
    """Randomly under-sample, then randomly over-sample, a single-column data set.

    ``X`` is reshaped into one column before resampling.

    Args:
        X: Samples; converted with ``np.array(X).reshape(-1, 1)``.
        y: Labels.
        under_samp_rate: ``sampling_strategy`` for ``RandomUnderSampler``.
        over_samp_rate: ``sampling_strategy`` for ``RandomOverSampler``.
        random_state: Seed shared by both samplers.

    Returns:
        DataFrame with columns ``TEXT`` and ``OUTPUT_LABEL``; missing values
        are replaced by empty strings.
    """
    stages = [
        ('under', RandomUnderSampler(sampling_strategy=under_samp_rate,
                                     random_state=random_state)),
        ('over', RandomOverSampler(sampling_strategy=over_samp_rate,
                                   random_state=random_state)),
    ]

    # The samplers expect a 2-D feature array, hence the single column.
    column = np.array(X).reshape(-1, 1)
    X_res, y_res = Pipeline(steps=stages).fit_resample(column, y)

    frame = pd.DataFrame(data={
        "TEXT": X_res.squeeze(),
        "OUTPUT_LABEL": y_res,
    })
    return frame.fillna("")
def test_evaluate_pipeline(self):
    """The ``evaluate`` CLI command writes .png, .json and .csv outputs."""
    X, y = load_dataset()
    dummy_pipeline = Pipeline(
        [("dummy_classifier", DummyClassifier(strategy="constant", constant=0))]
    )
    runner = CliRunner()

    with tempfile.TemporaryDirectory() as destination:
        # Train first so there is a serialized pipeline to evaluate.
        train_pipeline(
            X=X,
            y=y,
            model="DUMMY",
            pipeline=dummy_pipeline,
            destination=destination,
            ignore_prints=True,
            ignore_html=True,
        )
        pipeline_path = glob.glob(destination + "/*.joblib")
        threshold = destination + "/DUMMY_threshold.json"

        cli_args = [
            "evaluate",
            "--pipeline", pipeline_path[0],
            "--threshold", threshold,
            "--prefix", "DUMMY",
            "--destination", destination,
        ]
        runner.invoke(main, cli_args)

        # At least one file of each expected kind must have been produced.
        files = glob.glob(destination + "/*")
        for extension in (".png", ".json", ".csv"):
            self.assertTrue(any(extension in file for file in files))