def test_pipeline_methods_rus_pca_svm():
    """Smoke-test the pipeline methods with under-sampling + PCA + SVC.

    No return values are checked; the test passes when every call
    completes without raising.
    """
    # Imbalanced two-class problem (weights 0.1 / 0.9).
    dataset_options = dict(
        n_classes=2,
        class_sep=2,
        weights=[0.1, 0.9],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=20,
        n_clusters_per_class=1,
        n_samples=5000,
        random_state=0,
    )
    X, y = make_classification(**dataset_options)

    steps = [
        ("rus", RandomUnderSampler(random_state=0)),
        ("pca", PCA()),
        ("svc", SVC(gamma="scale", probability=True, random_state=0)),
    ]
    pipe = Pipeline(steps)
    pipe.fit(X, y)
    for method_name in ("predict", "predict_proba", "predict_log_proba"):
        getattr(pipe, method_name)(X)
    pipe.score(X, y)
def test_pipeline_methods_pca_svm():
    """Exercise fit/predict/score on a PCA -> SVC pipeline using iris."""
    iris = load_iris()
    features, labels = iris.data, iris.target

    pipe = Pipeline([
        ('pca', PCA()),
        ('svc', SVC(probability=True, random_state=0)),
    ])
    pipe.fit(features, labels)

    # Only checks that each method runs; outputs are discarded.
    pipe.predict(features)
    pipe.predict_proba(features)
    pipe.predict_log_proba(features)
    pipe.score(features, labels)
def test_pipeline_methods_pca_svm():
    """Run every prediction method of a whitened-PCA + SVC pipeline on iris."""
    dataset = load_iris()
    X, y = dataset.data, dataset.target

    reducer = PCA(svd_solver='full', n_components='mle', whiten=True)
    classifier = SVC(gamma='scale', probability=True, random_state=0)
    pipe = Pipeline([('pca', reducer), ('svc', classifier)])

    pipe.fit(X, y)
    for method_name in ('predict', 'predict_proba', 'predict_log_proba'):
        getattr(pipe, method_name)(X)
    pipe.score(X, y)
def test_pipeline_methods_anova():
    """Smoke-test a pipeline of ANOVA feature selection + logistic regression."""
    iris = load_iris()
    X, y = iris.data, iris.target

    steps = [
        ("anova", SelectKBest(f_classif, k=2)),
        ("logistic", LogisticRegression(solver="lbfgs", multi_class="auto")),
    ]
    pipe = Pipeline(steps)
    pipe.fit(X, y)

    # The calls below only need to complete without raising.
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
def test_pipeline_methods_anova():
    """Check that SelectKBest(f_classif) + LogisticRegression runs end to end."""
    data = load_iris()
    samples, targets = data.data, data.target

    selector = SelectKBest(f_classif, k=2)
    pipe = Pipeline([('anova', selector), ('logistic', LogisticRegression())])

    pipe.fit(samples, targets)
    for method_name in ('predict', 'predict_proba', 'predict_log_proba'):
        getattr(pipe, method_name)(samples)
    pipe.score(samples, targets)
def test_pipeline_methods_pca_svm():
    """Smoke-test the pipeline methods for PCA (full SVD, MLE, whitened) + SVC."""
    iris = load_iris()
    X, y = iris.data, iris.target

    pipe = Pipeline(
        [
            ("pca", PCA(svd_solver="full", n_components="mle", whiten=True)),
            ("svc", SVC(gamma="scale", probability=True, random_state=0)),
        ]
    )
    pipe.fit(X, y)

    # Results are intentionally ignored; only successful execution matters.
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
def test_pipeline_methods_preprocessing_svm():
    """Check output shapes of a (preprocessing -> SVC) pipeline on iris.

    The same SVC is paired in turn with a StandardScaler and a
    two-component PCA; for each pairing the shapes returned by the
    prediction methods are verified.
    """
    iris = load_iris()
    X = iris.data
    y = iris.target
    n_samples = X.shape[0]
    n_classes = len(np.unique(y))

    scaler = StandardScaler()
    pca = PCA(n_components=2)
    clf = SVC(probability=True, random_state=0, decision_function_shape='ovr')

    for preprocessing in [scaler, pca]:
        pipe = Pipeline([('preprocess', preprocessing), ('svc', clf)])
        pipe.fit(X, y)

        # Plain asserts keep the test free of helper imports and let the
        # test runner report the compared values on failure.
        predict = pipe.predict(X)
        assert predict.shape == (n_samples,)

        proba = pipe.predict_proba(X)
        assert proba.shape == (n_samples, n_classes)

        log_proba = pipe.predict_log_proba(X)
        assert log_proba.shape == (n_samples, n_classes)

        decision_function = pipe.decision_function(X)
        assert decision_function.shape == (n_samples, n_classes)

        pipe.score(X, y)
def test_pipeline_methods_preprocessing_svm():
    """Verify prediction-method output shapes for scaler/PCA + SVC pipelines.

    Iris is used as the dataset. Each preprocessing step (StandardScaler,
    then PCA with two components) is combined with the same SVC instance.
    """
    iris = load_iris()
    X, y = iris.data, iris.target
    n_samples = X.shape[0]
    n_classes = len(np.unique(y))

    clf = SVC(probability=True, random_state=0, decision_function_shape='ovr')
    preprocessors = [StandardScaler(), PCA(n_components=2)]

    for preprocessing in preprocessors:
        pipe = Pipeline([('preprocess', preprocessing), ('svc', clf)])
        pipe.fit(X, y)

        # Plain asserts instead of an assert_equal helper: no extra import
        # is needed and failures show both operands.
        assert pipe.predict(X).shape == (n_samples,)
        assert pipe.predict_proba(X).shape == (n_samples, n_classes)
        assert pipe.predict_log_proba(X).shape == (n_samples, n_classes)
        assert pipe.decision_function(X).shape == (n_samples, n_classes)
        pipe.score(X, y)
def test_pipeline_methods_anova_rus():
    """Smoke-test under-sampling + ANOVA selection + logistic regression."""
    # Imbalanced two-class data (weights 0.1 / 0.9).
    X, y = make_classification(
        n_classes=2,
        class_sep=2,
        weights=[0.1, 0.9],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=20,
        n_clusters_per_class=1,
        n_samples=5000,
        random_state=0,
    )

    steps = [
        ('rus', RandomUnderSampler(random_state=0)),
        ('anova', SelectKBest(f_classif, k=2)),
        ('logistic', LogisticRegression()),
    ]
    pipe = Pipeline(steps)
    pipe.fit(X, y)
    for method_name in ('predict', 'predict_proba', 'predict_log_proba'):
        getattr(pipe, method_name)(X)
    pipe.score(X, y)
def test_pipeline_methods_preprocessing_svm():
    """Check prediction output shapes for (scaler | randomized PCA) + SVC on iris."""
    iris = load_iris()
    X, y = iris.data, iris.target
    n_samples = X.shape[0]
    n_classes = len(np.unique(y))

    clf = SVC(
        gamma="scale",
        probability=True,
        random_state=0,
        decision_function_shape="ovr",
    )
    preprocessors = [
        StandardScaler(),
        PCA(n_components=2, svd_solver="randomized", whiten=True),
    ]

    # Expected shape for every method that returns one column per class.
    per_class_shape = (n_samples, n_classes)

    for preprocessing in preprocessors:
        pipe = Pipeline([("preprocess", preprocessing), ("svc", clf)])
        pipe.fit(X, y)

        assert pipe.predict(X).shape == (n_samples,)
        assert pipe.predict_proba(X).shape == per_class_shape
        assert pipe.predict_log_proba(X).shape == per_class_shape
        assert pipe.decision_function(X).shape == per_class_shape
        pipe.score(X, y)
def Predict(data, mode):
    """Fit a TF-IDF + SVD/similarity + SVM pipeline and predict on the test set.

    Parameters
    ----------
    data : tuple
        ``(train, test)`` frames. The code reads the columns
        ``query_preprocessed`` and ``product_title_preprocessed`` from both,
        ``median_relevance`` from ``train`` and ``id`` from ``test``.
    mode : str
        ``'eda'`` builds FeatureUnion(svd, sim) -> scaler -> SVM.
        ``'sampling'`` additionally inserts an SVMSMOTE step before the SVM.

    Returns
    -------
    submission : pandas.DataFrame
        Columns ``id`` and ``prediction``.
    submission_probas : pandas.DataFrame
        Output of ``predict_proba`` indexed by test id.

    Raises
    ------
    ValueError
        If ``mode`` is neither ``'eda'`` nor ``'sampling'``.
    """
    # Fail before the expensive vectorisation rather than after it.
    if mode not in ('eda', 'sampling'):
        raise ValueError("mode must be 'eda' or 'sampling', got %r" % (mode,))

    train, test = data
    idx = test.id.values.astype(int)
    y = train.median_relevance.values

    train_query = [str(value) for value in train['query_preprocessed']]
    train_title = [str(value) for value in train['product_title_preprocessed']]
    test_query = [str(value) for value in test['query_preprocessed']]
    test_title = [str(value) for value in test['product_title_preprocessed']]

    # scikit-learn's English stop words combined with NLTK's English list.
    stop_words = text.ENGLISH_STOP_WORDS.union(set(stopwords.words('english')))

    tfv = text.TfidfVectorizer(
        min_df=7,
        max_features=None,
        strip_accents='unicode',
        analyzer='word',
        token_pattern=r'\w{1,}',
        ngram_range=(1, 3),
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=True,
        stop_words=stop_words,
    )
    # One vocabulary is learned over queries and titles together; the two
    # transformed matrices are then placed side by side.
    tfv.fit(train_query + train_title)
    X_train = hstack([tfv.transform(train_query), tfv.transform(train_title)])
    X_test = hstack([tfv.transform(test_query), tfv.transform(test_title)])

    sim = similarlity_stack()
    svd = TruncatedSVD(n_components=200)
    scl = StandardScaler(with_mean=False)
    svm = SVC(C=10, gamma="auto", kernel="rbf", class_weight=None, probability=True)

    steps = [
        ('FeatureUnion', FeatureUnion([('svd', svd), ('sim', sim)])),
        ('scl', scl),
    ]
    if mode == 'sampling':
        # NOTE(review): a sampler step presumably requires the imbalanced-learn
        # Pipeline rather than scikit-learn's; confirm which one is imported.
        steps.append(('sampling', SVMSMOTE(svm_estimator=svm, k_neighbors=4)))
    steps.append(('svm', svm))
    clf = Pipeline(steps)

    clf.fit(X_train, y)
    preds = clf.predict(X_test)
    pred_probas = clf.predict_proba(X_test)

    submission = pd.DataFrame({"id": idx, "prediction": preds})
    submission_probas = pd.DataFrame(pred_probas, index=idx)
    return submission, submission_probas
def find_expert(tag):
    """Print the 20 users the model ranks as the most likely potential experts for ``tag``.

    An ADASYN -> XGBClassifier pipeline is fitted on each training split of
    a 4-fold stratified split. The positive-class probabilities predicted
    for the held-out rows of every fold are pooled, sorted in descending
    order, and the top 20 are printed as percentages.

    Parameters
    ----------
    tag : object
        Topic tag, passed to ``best_solution`` and ``load_data``.
    """
    fold = StratifiedKFold(n_splits=4)
    params = best_solution(tag)
    data, target, ratio = load_data(tag)
    seed = int(params['seed'])
    fold.random_state = seed

    samp = ADASYN(n_neighbors=2,
                  sampling_strategy=float(params['sampling_strategy']) * ratio,
                  random_state=seed)
    clf = XGBClassifier(n_estimators=int(params['n_estimators']),
                        gamma=float(params['gamma']),
                        eta=float(params['eta']),
                        reg_lambda=int(params['reg_lambda']),
                        verbosity=0,
                        n_jobs=-1,
                        random_state=seed)
    pipeline = Pipeline([(type(samp).__name__, samp),
                         (type(clf).__name__, clf)])

    # Collect one frame per fold and concatenate once; DataFrame.append was
    # removed in pandas 2.0 and copied the accumulated frame on every call.
    fold_frames = []
    for train, test in tqdm(fold.split(data, target),
                            total=fold.get_n_splits()):
        pipeline.fit(data.iloc[train], target.iloc[train])
        pred_proba = pd.Series(pipeline.predict_proba(data.iloc[test])[:, 1],
                               index=target.iloc[test].index,
                               name='probability')
        # reset_index turns the target index into a regular column.
        # presumably that index is named 'id' — verify against load_data.
        fold_frames.append(pred_proba.to_frame().reset_index())

    experts = pd.concat(fold_frames)
    experts = experts.sort_values(by=['probability'], ascending=False).iloc[:20]
    experts = experts.assign(
        probability=experts['probability'].astype(float).map("{:.1%}".format))
    print(experts.to_string(index=False))
def test_pipeline_methods_rus_pca_svm():
    """Run all prediction methods of an under-sampler + PCA + SVC pipeline."""
    # Imbalanced two-class data (weights 0.1 / 0.9).
    X, y = make_classification(
        n_classes=2,
        class_sep=2,
        weights=[0.1, 0.9],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=20,
        n_clusters_per_class=1,
        n_samples=5000,
        random_state=0,
    )

    sampler = RandomUnderSampler(random_state=0)
    reducer = PCA()
    classifier = SVC(probability=True, random_state=0)
    pipe = Pipeline([('rus', sampler), ('pca', reducer), ('svc', classifier)])

    pipe.fit(X, y)
    # Outputs are discarded; the test only requires the calls to succeed.
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
def train_test_and_evaluate(seed, X_train, X_test, y_train, y_test, onehots,
                            numericals, cv):
    """Cross-validate, fit and evaluate a resampling + logistic-regression pipeline.

    The Gini coefficient (``2 * ROC AUC - 1``) is printed for the
    cross-validated training predictions, the test set and a validation set.

    Parameters
    ----------
    seed : int
        Random state used by both resamplers.
    X_train, X_test, y_train, y_test
        Training / test features and targets.
    onehots : list
        Columns sent through SelectKBest(chi2, k=15) followed by one-hot encoding.
    numericals : list
        Columns sent through standard scaling followed by PCA.
    cv
        Cross-validation specification forwarded to ``cross_val_predict``.

    Returns
    -------
    None
        Results are printed only.
    """
    # One-hot features: pre-select with chi-square, then encode.
    onehot_transformer = Pipeline(steps=[
        ('selector', SelectKBest(chi2, k=15)),
        ('ohc', OneHotEncoder(handle_unknown='ignore')),
    ])

    # Numerical features: scale, then rotate with PCA.
    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('pca', PCA()),
    ])

    # Resampling: over-sample the minority class first, then under-sample
    # the majority class. The `seed` argument drives both samplers so the
    # caller controls reproducibility.
    over = RandomOverSampler(random_state=seed, sampling_strategy=0.1)
    under = RandomUnderSampler(random_state=seed, sampling_strategy=0.5)

    preprocessor = ColumnTransformer(transformers=[
        ('numeric', numeric_transformer, numericals),
        ('onehot', onehot_transformer, onehots),
    ])
    pipe_model = Pipeline(steps=[
        ('over', over),
        ('under', under),
        ('prep', preprocessor),
        ('classifier', LogisticRegression(max_iter=1000)),
    ])

    # Cross-validated estimate on the training data.
    cv_proba = cross_val_predict(
        pipe_model, X_train, y_train, cv=cv, method='predict_proba')[:, 1]
    cv_results = 2 * roc_auc_score(y_train, cv_proba) - 1
    print("Mean training gini after CV: {res}".format(res=cv_results.mean()))

    # Fit on the full training data, then score the held-out sets.
    pipe_model.fit(X_train, y_train)

    gini = 2 * roc_auc_score(y_test, pipe_model.predict_proba(X_test)[:, 1]) - 1
    print("Gini score on test set: " + str(gini))

    # NOTE(review): X_val and y_val are not parameters; they are resolved
    # from the enclosing module scope. Confirm they are defined there.
    gini = 2 * roc_auc_score(y_val, pipe_model.predict_proba(X_val)[:, 1]) - 1
    print("Gini score on validation set: " + str(gini))
class ImblearnRecalibrator(BaseEstimator, ClassifierMixin):
    """Recalibrate the probability bias introduced by imblearn resampling.

    Wraps a resampler and an estimator in a pipeline so the recalibration
    code does not have to be rewritten each time: specify the estimator,
    the resampler and the sampling rate, then call ``fit`` and
    ``predict`` / ``predict_proba``.

    Note: resampling for imbalanced data targets classification
    performance; whether it helps other qualities (e.g. discrimination)
    is not known.

    :param estimator: estimator object following the scikit-learn API
    :param resampler: resampler object as used in imblearn
    :param post_minor_rate: ratio of minority examples to **all samples
        after resampling**. Default is None. Use either this or ``alpha``.
    :param alpha: ratio of minority examples after resampling **relative
        to before resampling**. Default is 'auto'. Use either this or
        ``post_minor_rate``.
    """

    def __init__(self, estimator, resampler, alpha='auto', post_minor_rate=None):
        # Store the constructor arguments unchanged so that
        # BaseEstimator.get_params() can read them back.
        self.estimator = estimator
        self.resampler = resampler
        self.alpha = alpha
        self.post_minor_rate = post_minor_rate

        resampler = clone(resampler)
        if post_minor_rate is None and alpha is None:
            warnings.warn(
                'neither of `post_minor_rate` nor `alpha` are specified. Instead resampling stragegy specified in `resampler` object is used.'
            )
            # The resampler keeps its own strategy; fit() reads it back.
            self.resampling_strategy = None
        elif post_minor_rate:
            # Warn only when alpha was explicitly moved off its default,
            # otherwise every use of post_minor_rate would warn.
            if alpha and alpha != 'auto':
                warnings.warn(
                    'both of `post_minor_rate` and `alpha` are specified. the former is applied.'
                )
            self.resampling_strategy = 'posterior_rate'
        elif alpha:
            self.resampling_strategy = 'alpha'
            resampler.set_params(sampling_strategy=alpha)
        else:
            raise ValueError(
                'invalid combination of `post_minor_rate` and `alpha`: '
                'got post_minor_rate=%r, alpha=%r' % (post_minor_rate, alpha))
        self.estimator_ = Pipeline([('resampler', resampler),
                                    ('estimator', clone(estimator))])

    def fit(self, X, y):
        """Configure the resampler, fit the pipeline and record the minority rate.

        :param X: training features, passed to the inner pipeline
        :param y: training labels; ``y.mean()`` is used as the positive rate
            (assumes binary 0/1 labels — TODO confirm)
        :return: self
        """
        if self.resampling_strategy == 'posterior_rate':
            alpha = get_oversampling_rate(self.post_minor_rate)
            self.alpha = alpha
            self.estimator_['resampler'].set_params(sampling_strategy=alpha)
        elif self.resampling_strategy is None:
            # Neither rate was given: use whatever the resampler was
            # configured with so predict_proba has an alpha to work from.
            self.alpha = self.estimator_['resampler'].get_params().get(
                'sampling_strategy')
        self.estimator_.fit(X, y)
        self.minor_rate_ = np.min([y.mean(), 1 - y.mean()])
        return self

    def predict(self, X):
        """Return the class predictions of the inner pipeline."""
        return self.estimator_.predict(X)

    def predict_proba(self, X):
        """Return the inner pipeline's probabilities passed through ``calibrate_imbalanceness``."""
        return calibrate_imbalanceness(self.estimator_.predict_proba(X),
                                       pos_rate=get_oversampling_power(
                                           self.alpha, self.minor_rate_))
def test_pipeline_memory_transformer():
    """Check that a pipeline with ``memory`` caches its fitted transformer.

    A cached and an uncached pipeline must give identical results, and the
    transformer's ``timestamp_`` must stay unchanged on later fits (also
    when the step is renamed).
    """
    iris = load_iris()
    X = iris.data
    y = iris.target
    cachedir = mkdtemp()
    try:
        # The cache directory is passed positionally: the keyword was renamed
        # from `cachedir` to `location` across joblib releases, and the first
        # positional argument works with both spellings.
        memory = Memory(cachedir, verbose=10)
        # Test with Transformer + SVC
        clf = SVC(probability=True, random_state=0)
        transf = DummyTransf()
        pipe = Pipeline([('transf', clone(transf)), ('svc', clf)])
        cached_pipe = Pipeline([('transf', transf), ('svc', clf)],
                               memory=memory)

        def check_same_results(other, other_step):
            """Assert ``other`` matches the uncached reference pipeline."""
            assert_array_equal(pipe.predict(X), other.predict(X))
            assert_array_equal(pipe.predict_proba(X), other.predict_proba(X))
            assert_array_equal(pipe.predict_log_proba(X),
                               other.predict_log_proba(X))
            assert_array_equal(pipe.score(X, y), other.score(X, y))
            assert_array_equal(pipe.named_steps['transf'].means_,
                               other.named_steps[other_step].means_)

        # Memoize the transformer at the first fit
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Get the time stamp of the transformer in the cached pipeline
        expected_ts = cached_pipe.named_steps['transf'].timestamp_
        check_same_results(cached_pipe, 'transf')
        # The transformer instance handed to the cached pipeline must
        # itself remain unfitted.
        assert not hasattr(transf, 'means_')

        # Check that we are reading the cache while fitting a second time
        cached_pipe.fit(X, y)
        check_same_results(cached_pipe, 'transf')
        assert cached_pipe.named_steps['transf'].timestamp_ == expected_ts

        # Create a new pipeline with fresh estimators.
        # Check that even changing the step name does not affect the cache hit
        clf_2 = SVC(probability=True, random_state=0)
        transf_2 = DummyTransf()
        cached_pipe_2 = Pipeline([('transf_2', transf_2), ('svc', clf_2)],
                                 memory=memory)
        cached_pipe_2.fit(X, y)
        check_same_results(cached_pipe_2, 'transf_2')
        assert cached_pipe_2.named_steps['transf_2'].timestamp_ == expected_ts
    finally:
        shutil.rmtree(cachedir)
def test_pipeline_memory_transformer():
    """Verify transformer caching in a pipeline constructed with ``memory``.

    Results of the cached pipeline must equal those of an uncached one, and
    the cached transformer's ``timestamp_`` must not change on refits.
    """
    iris = load_iris()
    X, y = iris.data, iris.target
    cachedir = mkdtemp()

    def assert_same_outputs(reference, candidate, candidate_step):
        """Compare predictions, scores and fitted means of two pipelines."""
        for method_name in ('predict', 'predict_proba', 'predict_log_proba'):
            assert_array_equal(getattr(reference, method_name)(X),
                               getattr(candidate, method_name)(X))
        assert_array_equal(reference.score(X, y), candidate.score(X, y))
        assert_array_equal(reference.named_steps['transf'].means_,
                           candidate.named_steps[candidate_step].means_)

    try:
        memory = Memory(cachedir, verbose=10)

        # Transformer + SVC, once without and once with caching.
        clf = SVC(gamma='scale', probability=True, random_state=0)
        transf = DummyTransf()
        pipe = Pipeline([('transf', clone(transf)), ('svc', clf)])
        cached_pipe = Pipeline([('transf', transf), ('svc', clf)],
                               memory=memory)

        # First fit memoizes the transformer.
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        expected_ts = cached_pipe.named_steps['transf'].timestamp_

        assert_same_outputs(pipe, cached_pipe, 'transf')
        assert not hasattr(transf, 'means_')

        # Second fit must be served from the cache: same timestamp.
        cached_pipe.fit(X, y)
        assert_same_outputs(pipe, cached_pipe, 'transf')
        assert cached_pipe.named_steps['transf'].timestamp_ == expected_ts

        # A new pipeline with fresh estimators and a renamed step must
        # still hit the same cache entry.
        clf_2 = SVC(gamma='scale', probability=True, random_state=0)
        transf_2 = DummyTransf()
        cached_pipe_2 = Pipeline([('transf_2', transf_2), ('svc', clf_2)],
                                 memory=memory)
        cached_pipe_2.fit(X, y)

        assert_same_outputs(pipe, cached_pipe_2, 'transf_2')
        assert cached_pipe_2.named_steps['transf_2'].timestamp_ == expected_ts
    finally:
        shutil.rmtree(cachedir)
def get_probabilities(text: str, classes: list, model: Pipeline):
    """
    Map each class label to the model's predicted probability for ``text``.

    Parameters:
    ----------
    text:
        Text for analysis.
    classes:
        Models' classes, in the order matching ``predict_proba`` columns.
    model:
        The trained model.

    Returns:
    ----------
    Dict of class label -> probability rounded to 3 decimals.
    """
    # predict_proba is called on a one-element batch; keep its single row.
    first_row = model.predict_proba([text])[0]
    rounded = np.array(np.around(first_row, 3), dtype=float).flatten()
    return {label: value for label, value in zip(classes, rounded)}
def model_select():
    """Fit and report one PCA -> balancer -> classifier pipeline per balancer.

    Iterates over the module-level ``balanceadores`` pairs, skipping any
    combination for which ``classificador_ja_executado`` returns true. For
    the rest it fits on the full development set, then prints and plots the
    confusion matrix, prints the classification report and calls
    ``roc_auc_aux`` with the predicted probabilities.
    """
    for nome_balanceador, balanceador in balanceadores:
        if classificador_ja_executado(nome, nome_balanceador):
            continue

        print(balanceador)
        etapas = [
            ('dimension', PCA(n_components=250)),
            ('balance', balanceador),
            ('clf', modelo),
        ]
        pipeline = Pipeline(etapas)
        print("# Rodando o algoritmo %s" % nome)
        print()
        np.set_printoptions(precision=4)
        pipeline.fit(dados_completo_x, dados_completo_y)

        print("Detailed classification report:")
        print()
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        print()

        rotulos_previstos = pipeline.predict(test_x)
        matriz_confusao = confusion_matrix(test_y, rotulos_previstos)
        nome_arquivo = nome + '_' + nome_balanceador + '_best_mucilage'

        # Both plots are written under the same file name: raw first,
        # then normalized.
        titulo_bruto = 'Confusion matrix' + nome + ' (best parameters)'
        titulo_normalizado = 'Confusion matrix ' + nome + ', normalized'
        plot_confusion_matrix(matriz_confusao, nome_arquivo, [1, 2, 3, 4],
                              False, title=titulo_bruto)
        plot_confusion_matrix(matriz_confusao, nome_arquivo, [1, 2, 3, 4],
                              True, title=titulo_normalizado)

        print('Matriz de Confusão')
        print(matriz_confusao)
        print(classification_report(y_true=test_y, y_pred=rotulos_previstos,
                                    digits=4))

        probabilidades = pipeline.predict_proba(test_x)
        roc_auc_aux(test_y, probabilidades, nome, nome_balanceador)
        print()
        sys.stdout.flush()
def performance():
    """
    Analyse model performance.

    For every tag an ADASYN -> XGBClassifier pipeline is evaluated with a
    repeated stratified 3-fold split (2 repeats, i.e. 6 splits). The true
    targets, predicted positive-class probabilities and feature importances
    are collected and handed to ``display``.
    """
    banner = "*" * 50
    tqdm.write(banner + "\n\tStackOverlflow Expert Prediction\n" + banner)

    val = RepeatedStratifiedKFold(n_splits=3, n_repeats=2)
    # One bucket per split index (3 splits x 2 repeats).
    y_real = [[] for _ in range(6)]
    y_proba = [[] for _ in range(6)]
    f_importance = []

    # Evaluate each topic tag separately.
    for tag in tqdm(get_tags()):
        params = best_solution(tag)  # best parameters for this tag
        data, target, ratio = load_data(tag)  # load the data
        seed = int(params['seed'])
        val.random_state = seed  # set the random seed

        # Pipeline of over-sampler followed by classifier.
        samp = ADASYN(
            n_neighbors=2,
            sampling_strategy=float(params['sampling_strategy']) * ratio,
            random_state=seed,
        )
        clf = XGBClassifier(
            n_estimators=int(params['n_estimators']),
            gamma=float(params['gamma']),
            eta=float(params['eta']),
            reg_lambda=int(params['reg_lambda']),
            verbosity=0,
            n_jobs=-1,
            random_state=seed,
        )
        clf_step = type(clf).__name__
        pipeline = Pipeline([(type(samp).__name__, samp), (clf_step, clf)])

        # Evaluate every cross-validation split.
        splits = enumerate(val.split(data, target))
        for ind, (train, test) in tqdm(splits, leave=False, total=6):
            pipeline.fit(data.iloc[train], target.iloc[train])
            y_real[ind].append(target.iloc[test])  # ground truth
            positive_proba = pipeline.predict_proba(data.iloc[test])[:, 1]
            y_proba[ind].append(positive_proba)  # predicted probability
            # feature importances of the fitted classifier
            f_importance.append(pipeline[clf_step].feature_importances_)

    display(y_real, y_proba, f_importance, data.columns)
def run(fold, model):
    """Train ``models[model]`` on all folds but ``fold`` and report validation AUC.

    Reads ``../input/train_fold.csv``, fills missing ``bmi`` with the column
    mean, builds a preprocess -> SMOTE -> under-sample -> classifier
    pipeline, prints the AUC on the held-out fold and dumps the fitted
    pipeline to ``../models``.
    """
    df = pd.read_csv('../input/train_fold.csv')
    df['bmi'] = df['bmi'].fillna(np.mean(df['bmi']))

    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    # Everything except the identifier, the target and the fold marker.
    excluded = ('id', 'stroke', 'kfold')
    features = [column for column in df.columns if column not in excluded]
    categorial_features = []
    numerical_features = []
    for column in features:
        if df[column].dtype == object:
            categorial_features.append(column)
        else:
            numerical_features.append(column)

    preprocess = make_column_transformer(
        (OneHotEncoder(), categorial_features),
        (StandardScaler(), numerical_features),
    )
    pipe = Pipeline(steps=[
        ('preprocess', preprocess),
        ('over', SMOTE(sampling_strategy=0.1)),
        ('under', RandomUnderSampler(sampling_strategy=0.5)),
        ('clf', models[model]),
    ])

    pipe.fit(df_train[features], df_train.stroke)
    valid_scores = pipe.predict_proba(df_valid[features])[:, 1]
    auc = roc_auc_score(df_valid.stroke, valid_scores)
    print("Fold : {} AUC Score: {:.3f}".format(fold, auc))

    joblib.dump(pipe, f'../models/dt_{model}_{fold}.bin')
def ROC_curve(classifiers):
    """Plot and save the ROC curves of several classifiers on one figure.

    Each classifier is wrapped in a SMOTE -> classifier pipeline, fitted on
    the module-level ``X_train`` / ``y_train`` and scored on ``X_test`` /
    ``y_test``. The figure is saved as ``ROC_Curve.png`` under
    ``folder_plots`` and then shown.

    Parameters
    ----------
    classifiers : iterable
        Estimator instances. Each one gets its ``probability`` attribute set
        to True (presumably to enable ``predict_proba`` on SVC-like
        estimators — verify for other types).
    """
    # Plain tuples are enough to carry the curves to the plotting loop; this
    # avoids DataFrame.append (removed in pandas 2.0) and label-based lookups
    # that break when two classifiers share a class name.
    curves = []
    for classifier in classifiers:
        model_name = type(classifier).__name__
        classifier.probability = True
        pipeline = Pipeline([('sampling', SMOTE(sampling_strategy='minority')),
                             ('model', classifier)])
        pipeline = pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict_proba(X_test)[:, 1]
        fpr, tpr, _ = roc_curve(y_test, y_pred)
        auc = roc_auc_score(y_test, y_pred)
        curves.append((model_name, fpr, tpr, auc))

    for model_name, fpr, tpr, auc in curves:
        plt.plot(fpr, tpr, label="{}, AUC={:.3f}".format(model_name, auc))

    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], color='black', linestyle='--')
    plt.xticks(np.arange(0.0, 1.1, step=0.1))
    plt.xlabel("False Positive Rate", fontsize=15)
    plt.yticks(np.arange(0.0, 1.1, step=0.1))
    plt.ylabel("True Positive Rate", fontsize=15)
    plt.title('ROC Curves', fontweight='bold', fontsize=10)
    plt.legend(prop={'size': 9}, loc='lower right')
    plt.savefig(folder_plots + 'ROC_Curve.png', dpi=400)
    plt.show()
######################### FITTING THE MODEL AND PREDICTING ############################## # we fit the model clf.fit(X_train, y_train) y_pred = clf.predict(X_test) ######################## METRICS ###################################################### # we see how well our model is at making predictions # confusion matrix # we can see how our model classified the items. cnf_matrix = metrics.confusion_matrix(y_test, y_pred) cnf_matrix # ROC CURVE # we draw a roc curve and look at auc score. Straight line indicates that model is not # classifying well. The closer to 1 the auc score is, the better the model. import matplotlib.pyplot as plt y_pred_proba = clf.predict_proba(X_test)[::, 1] fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba) auc = metrics.roc_auc_score(y_test, y_pred_proba) plt.plot(fpr, tpr, label="data 1, auc=" + str(auc)) plt.legend(loc=4) plt.show() # We can also look at the report. We can see f1 score for the event haappening. We use it to # compare the models print(metrics.classification_report(y_test, y_pred))
def test_pipeline_memory_sampler():
    """Verify that a pipeline with ``memory`` caches a fitted sampler step.

    Cached and uncached pipelines must agree on every output, and the
    sampler's ``timestamp_`` must stay the same on later fits, including in
    a second pipeline whose step has a different name.
    """
    X, y = make_classification(
        n_classes=2,
        class_sep=2,
        weights=[0.1, 0.9],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=20,
        n_clusters_per_class=1,
        n_samples=5000,
        random_state=0,
    )
    cachedir = mkdtemp()

    def expect_identical(baseline, cached, cached_step):
        """Assert that ``cached`` reproduces every output of ``baseline``."""
        assert_array_equal(baseline.predict(X), cached.predict(X))
        assert_array_equal(baseline.predict_proba(X), cached.predict_proba(X))
        assert_array_equal(baseline.predict_log_proba(X),
                           cached.predict_log_proba(X))
        assert_array_equal(baseline.score(X, y), cached.score(X, y))
        assert_array_equal(
            baseline.named_steps["transf"].means_,
            cached.named_steps[cached_step].means_,
        )

    try:
        memory = Memory(cachedir, verbose=10)

        # Sampler + SVC, without and with caching.
        clf = SVC(gamma="scale", probability=True, random_state=0)
        transf = DummySampler()
        pipe = Pipeline([("transf", clone(transf)), ("svc", clf)])
        cached_pipe = Pipeline([("transf", transf), ("svc", clf)],
                               memory=memory)

        # The first fit memoizes the sampler.
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        expected_ts = cached_pipe.named_steps["transf"].timestamp_

        expect_identical(pipe, cached_pipe, "transf")
        assert not hasattr(transf, "means_")

        # Refit: the cache must be read, so the timestamp is unchanged.
        cached_pipe.fit(X, y)
        expect_identical(pipe, cached_pipe, "transf")
        assert cached_pipe.named_steps["transf"].timestamp_ == expected_ts

        # New pipeline, fresh estimators, renamed step: still a cache hit.
        clf_2 = SVC(gamma="scale", probability=True, random_state=0)
        transf_2 = DummySampler()
        cached_pipe_2 = Pipeline([("transf_2", transf_2), ("svc", clf_2)],
                                 memory=memory)
        cached_pipe_2.fit(X, y)

        expect_identical(pipe, cached_pipe_2, "transf_2")
        assert cached_pipe_2.named_steps["transf_2"].timestamp_ == expected_ts
    finally:
        shutil.rmtree(cachedir)
def score(self, pipeline: Pipeline, dataset: Dataset, class_names: List[str] = None):
    """
    Computes scores for the metrics configured on this scorer.

    Probabilities are obtained from ``pipeline.predict_proba``. For binary
    and multiclass tasks the prediction is the arg-max class; otherwise
    (multilabel) probabilities are thresholded with ``self.threshold``.

    Parameters
    ----------
    pipeline: Pipeline
        Complete pipeline including features pipeline and classifier.
    dataset: Dataset
        Dataset containing x and y pd.DataFrames
        For Multiclass the shape of y_true should be 1-D (NOT one-hot encoded).
        For Multilabel the shape should be n-dimensional (where n is number of classes).
    class_names: List of strings, optional
        If given, the scores for separate classes will be displayed with
        appropriate names; otherwise ``class_<i>`` names are generated.

    Returns
    -------
    metrics: Dict
        Whatever ``self._get_metrics()`` returns (presumably a dictionary
        with metric names as keys and scores as values — confirm).
    """
    x, y_true = dataset.x, dataset.y.to_numpy()
    if len(y_true.shape) == 1:
        y_true = y_true.reshape(-1, 1)

    # run inference for probabilities
    probabilities = pipeline.predict_proba(x)

    # predict_proba may return a list of arrays (sklearn models often do so
    # for multilabel tasks); convert it to a single array.
    if isinstance(probabilities, list):
        probabilities = convert_list_probas_to_array(probabilities)

    # check task type based on the true and predicted arrays.
    self.task = check_task(y_true, probabilities)

    # turn probabilities into predictions
    if self.task in ['binary', 'multiclass']:
        predictions = np.argmax(probabilities, axis=-1)
    else:
        predictions = np.where(probabilities >= self.threshold, 1, 0)

    # number of classes is the size of the last probability axis
    self.n_classes = probabilities.shape[-1]
    self.class_names = class_names if class_names else [f'class_{i}' for i in range(self.n_classes)]

    # The classification report covers precision, recall, f1 and accuracy at
    # once. It does not report accuracy for multilabel problems, so that is
    # added separately.
    report_metrics = ('precision', 'recall', 'f1', 'accuracy')
    if any(metric in self.metrics for metric in report_metrics):
        self.scores_dict.update(classification_report(y_true, predictions,
                                                      target_names=self.class_names,
                                                      output_dict=True))
        if self.task == 'multilabel':
            self.scores_dict['accuracy'] = accuracy_score(y_true, predictions)

    if 'auc' in self.metrics:
        self.fpr, self.tpr, self.roc_auc_dict = calculate_roc_auc(y_true,
                                                                  probabilities,
                                                                  self.class_names,
                                                                  self.task)
        for key, value in self.roc_auc_dict.items():
            # The per-class entry may be absent when no report metric was
            # requested; create it instead of raising KeyError.
            self.scores_dict.setdefault(key, {})['auc'] = value

    if self.report:
        print(pd.DataFrame(self.scores_dict).transpose())

    return self._get_metrics()
def mrmr_feature_selection(self, input, output, dict_of_models,
                           list_number_of_features_to_select):
    """
    Evaluates models with mRMR feature selection under repeated
    stratified cross-validation (``self.N_inner`` splits repeated
    ``self.N_outer`` times) for several numbers of selected features.

    Inside each fold an ADASYN -> MRMR -> classifier pipeline is fitted
    on the training part and scored on the test part. No hyperparameter
    search is run here, so the 'Best parameters' column holds one empty
    list per fold.
    ----------
    :param input : array-like, shape (n_samples, n_features)
        The training input samples. Indexed with integer index arrays,
        so presumably a numpy array — confirm.
    :param output : array-like, shape (n_samples, 1)
        The target values.
    :param dict_of_models: iterable of dicts
        Each item is read through its 'name' and 'classifier' keys.
    :param list_number_of_features_to_select - list
        Number of features to select.
    :return df_aucs : DataFrame object
        One column per (model, number of features) pair holding the AUC of
        every cross-validation fold.
    :return df_res : DataFrame object
        Per model and number of selected features: 'Best parameters' and
        the mean Accuracy, Sensitivity, Specificity, Precision, F1-score
        and ROC_AUC.
    :return df_stds : DataFrame object
        Per model and number of selected features: standard deviations of
        the same metrics.
    """
    res_columns = [
        'Classifier', 'Selected features', 'Best parameters', 'Accuracy',
        'Sensitivity', 'Specificity', 'Precision', 'F1-score', 'ROC_AUC'
    ]
    std_columns = [
        'Classifier', 'Selected features', 'Acc_std', 'Sens_std', 'Spec_std',
        'Prec_std', 'F1_std', 'ROC_AUC_std'
    ]
    # Rows are gathered in lists and turned into frames once at the end;
    # DataFrame.append was removed in pandas 2.0.
    res_rows = []
    std_rows = []
    df_aucs = pd.DataFrame()

    X, y = input, output
    for m in dict_of_models:
        for k in list_number_of_features_to_select:
            accuracy = []
            aucs = []
            sensitivity = []
            specificity = []
            precision = []
            f1score = []
            params = []

            skf = RepeatedStratifiedKFold(n_splits=self.N_inner,
                                          n_repeats=self.N_outer,
                                          random_state=88)
            clf = m['classifier']

            for train_index, test_index in skf.split(X, y):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]

                # Over-sample first, then select k features, then classify.
                mrmr_smote_clf = Pipeline([
                    ('oversampling', ADASYN(random_state=88)),
                    ('feature_selection', MRMR(n_features=k)),
                    ('classifier', clf),
                ])
                mrmr_smote_clf.fit(X_train, y_train)

                y_predict = mrmr_smote_clf.predict(X_test)
                probas_ = mrmr_smote_clf.predict_proba(X_test)

                accuracy.append(accuracy_score(y_predict, y_test))
                # sensitivity = recall
                sensitivity.append(recall_score(y_test, y_predict))
                specificity.append(self.get_specificity(y_test, y_predict))
                precision.append(precision_score(y_test, y_predict))
                f1score.append(f1_score(y_test, y_predict))

                # Area under the ROC curve from the class-1 probabilities.
                fpr, tpr, _ = roc_curve(y_test, probas_[:, 1])
                aucs.append(auc(fpr, tpr))

                # No parameter search in this procedure: empty entry per fold.
                params.append([])

            df_aucs[m['name'] + str(k)] = aucs
            std_rows.append({
                'Classifier': m['name'],
                'Selected features': k,
                'Acc_std': np.std(accuracy),
                'Sens_std': np.std(sensitivity),
                'Spec_std': np.std(specificity),
                'Prec_std': np.std(precision),
                'F1_std': np.std(f1score),
                'ROC_AUC_std': np.std(aucs)
            })
            res_rows.append({
                'Classifier': m['name'],
                'Selected features': k,
                'Best parameters': params,
                'Accuracy': np.mean(accuracy),
                'Sensitivity': np.mean(sensitivity),
                'Specificity': np.mean(specificity),
                'Precision': np.mean(precision),
                'F1-score': np.mean(f1score),
                'ROC_AUC': np.mean(aucs)
            })

    df_res = pd.DataFrame(res_rows, columns=res_columns)
    df_stds = pd.DataFrame(std_rows, columns=std_columns)
    return df_aucs, df_res, df_stds
def compute_pr_auc_score(model: Pipeline, features: np.ndarray, labels: np.ndarray) -> float:
    """Return the area under the precision-recall curve of *model*.

    The second column of ``model.predict_proba(features)`` is used as the
    score; the curve is built with ``precision_recall_curve`` and integrated
    with ``auc`` using recall as the x axis and precision as the y axis.
    """
    positive_scores = model.predict_proba(features)[:, 1]
    curve = precision_recall_curve(labels, positive_scores)
    precision_values, recall_values = curve[0], curve[1]
    return auc(recall_values, precision_values)
# Resampling pipeline: oversample with SMOTE, then randomly undersample,
# then fit the model (`model`, `X`, `y` and `k` come from earlier code).
over = SMOTE(sampling_strategy=0.1, k_neighbors=5)
under = RandomUnderSampler(sampling_strategy=0.5)
steps = [('over', over), ('under', under), ('model', model)]
pipeline = Pipeline(steps=steps)

# Cross-validated recall of the whole pipeline.
cv = RepeatedStratifiedKFold(n_splits=2, n_repeats=1, random_state=1)
scores_over = cross_val_score(pipeline, X, y, scoring='recall', cv=cv, n_jobs=-1)
print(f"k={k}\n")
print(f"mean recall: {np.mean(scores_over)}\n")
print(scores_over)

# Hold-out evaluation of the same pipeline.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
pipeline.fit(X_train, y_train)
yhat_test = pipeline.predict(X_test)
yhat_test_proba = pipeline.predict_proba(X_test)[:, 1]

# NOTE(review): assumes CM is sklearn.metrics.confusion_matrix, whose
# `labels` argument is keyword-only in current releases - confirm the alias.
confusion_matrix = CM(y_test, yhat_test, labels=np.unique(y_train))

# Precision and recall as a function of the decision threshold.
precision_ls, recall_ls, threshold_ls = precision_recall_curve(y_test, yhat_test_proba)
plt.figure(figsize=(10, 10))
# The curve arrays have one more entry than the thresholds; pad with 1 so
# all three arrays have the same length for plotting.
threshold_ls = np.append(threshold_ls, 1)
plt.plot(threshold_ls, precision_ls)
plt.plot(threshold_ls, recall_ls)
plt.legend(["precision", "recall"])

# Shallow, class-balanced decision tree fitted on the training split.
tree1 = DecisionTreeClassifier(
    max_depth=3,
    min_samples_leaf=30,
    class_weight="balanced")
tree1.fit(X_train, y_train)
fig = plt.figure(figsize=(25, 20))
('learner', learner)] pipeline = Pipeline(steps=steps) pipeline.fit(exec_training_features, exec_training_target) # prediction predicted = pipeline.predict(exec_test_features) # evaluation acc = accuracy_score(exec_test_target, predicted) precision, recall, f1, _ = precision_recall_fscore_support( exec_test_target, predicted, average='binary', zero_division=0) if hasattr(pipeline, "predict_proba"): false_positive_rate, true_positive_rate, _ = roc_curve( exec_test_target, pipeline.predict_proba(exec_test_features)[:, 1]) else: false_positive_rate, true_positive_rate, _ = roc_curve( exec_test_target, pipeline['learner']._predict_proba_lr( exec_test_features)[:, 1]) auroc = auc(false_positive_rate, true_positive_rate) # precision-recall AUC if precision == 0.0 and recall == 0.0 and f1 == 0.0: f1 = 'ND' auprc = 'ND' else: precision_, recall_, _ = precision_recall_curve( exec_test_target, predicted) f1 = '{:.3f}'.format(f1)
# SMOTE oversampling -> standard scaling -> XGBoost classifier.
pipeline_xgboost = Pipeline([
    ('xgboost_over', SMOTE(random_state=45)),
    ('xgboost_scalar', StandardScaler()),
    ('classifier_xgboost', XGBClassifier(
        base_score=0.5, booster='gbtree', colsample_bylevel=1,
        colsample_bynode=1, colsample_bytree=0.3, gamma=0.0,
        learning_rate=0.1, max_delta_step=0, max_depth=6,
        min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
        nthread=None, objective='binary:logistic', random_state=0,
        reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
        silent=None, subsample=1, verbosity=1)),
])
pipeline_xgboost.fit(x_train, y_train)

# Keep only the second predict_proba column, expressed as an integer
# percentage (truncated, as before).
prediction = pipeline_xgboost.predict_proba(x_test)
prediction = pd.DataFrame(prediction)
prediction.columns = ['class_0', 'prediction']
prediction = prediction.drop('class_0', axis=1)
prediction = prediction * 100
prediction['prediction'] = prediction['prediction'].astype(int)

# Label rows at or above 60% as '1' and the rest as '0'.  A single
# vectorized comparison replaces the former per-row `replace` loop, which
# rescanned the whole column for every row; the resulting labels are the
# same '1'/'0' strings.
prediction['prediction'] = (
    (prediction['prediction'] >= 60).astype(int).astype(str)
)
prediction.value_counts()
module__hidden_dim=hidden_dim, module__dropout_rate=dropout_rate, batch_size=batch_size, max_epochs=max_epochs, train_split=None, optimizer=torch.optim.Adam, iterator_train__shuffle=True, device='cuda') # 定义Pipeline pipe = Pipeline([('model', LSTM)]) # 使用LSTM进行训练 pipe.fit(X_train, y_train.astype(np.float)) H_list_lstm.clear() # 得到预测结果 X_test_predictions_1 = pipe.predict_proba(X_test)[:, 1] model_evaluate(y_test, np.ones(len(y_test))) print(X_test_predictions_1) print(y_test) # 对LSTM预测结果进行评估 model_evaluate(y_test, X_test_predictions_1) fpr_1, tpr_1, thresholds_1 = roc_curve(y_test, X_test_predictions_1) pyplot.plot([0, 1], [0, 1], linestyle='--') pyplot.plot(fpr_1, tpr_1) pyplot.show() # 可视化 def visualize(X, y, points, n_features): # points随机排列
def test_pipeline_memory_sampler():
    """Check that a pipeline with ``memory`` caches its sampler step.

    A cached and an uncached pipeline are fitted on the same data and must
    produce identical predictions and fitted sampler statistics.  Refitting
    the cached pipeline, or fitting a fresh pipeline that shares the same
    cache, must reuse the memoized sampler (same ``timestamp_``).
    """
    X, y = make_classification(
        n_classes=2,
        class_sep=2,
        weights=[0.1, 0.9],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=20,
        n_clusters_per_class=1,
        n_samples=5000,
        random_state=0)

    def assert_same_results(reference, candidate, step_name):
        # Compare every prediction method and the fitted sampler statistics
        # of `candidate` against the uncached `reference` pipeline.
        assert_array_equal(reference.predict(X), candidate.predict(X))
        assert_array_equal(reference.predict_proba(X),
                           candidate.predict_proba(X))
        assert_array_equal(reference.predict_log_proba(X),
                           candidate.predict_log_proba(X))
        assert_array_equal(reference.score(X, y), candidate.score(X, y))
        assert_array_equal(reference.named_steps['transf'].means_,
                           candidate.named_steps[step_name].means_)

    cachedir = mkdtemp()
    try:
        # The cache directory is passed positionally: the keyword was renamed
        # from `cachedir` to `location` in joblib, but it has always been the
        # first positional parameter.
        memory = Memory(cachedir, verbose=10)
        # Test with Transformer + SVC
        clf = SVC(probability=True, random_state=0)
        transf = DummySampler()
        pipe = Pipeline([('transf', clone(transf)), ('svc', clf)])
        cached_pipe = Pipeline([('transf', transf), ('svc', clf)],
                               memory=memory)

        # Memoize the transformer at the first fit
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Get the time stamp of the transformer in the cached pipeline
        expected_ts = cached_pipe.named_steps['transf'].timestamp_
        # Check that cached_pipe and pipe yield identical results
        assert_same_results(pipe, cached_pipe, 'transf')
        # The original sampler instance must not have been fitted
        assert not hasattr(transf, 'means_')

        # Check that we are reading the cache while fitting a second time
        cached_pipe.fit(X, y)
        assert_same_results(pipe, cached_pipe, 'transf')
        assert cached_pipe.named_steps['transf'].timestamp_ == expected_ts

        # Create a new pipeline with cloned estimators
        # Check that even changing the name step does not affect the cache hit
        clf_2 = SVC(probability=True, random_state=0)
        transf_2 = DummySampler()
        cached_pipe_2 = Pipeline([('transf_2', transf_2), ('svc', clf_2)],
                                 memory=memory)
        cached_pipe_2.fit(X, y)

        assert_same_results(pipe, cached_pipe_2, 'transf_2')
        assert cached_pipe_2.named_steps['transf_2'].timestamp_ == expected_ts
    finally:
        shutil.rmtree(cachedir)
class PennyModel:
    """
    The Model for the penny auction.  Takes a sklearn classifier and fits the
    model after transformation.

    Attributes:
        model (SklearnClassifier): The classifier used as the last pipeline step
        use_scaler (bool): Whether or not to scale the data first (stored only;
            the scaler step in internal_fit is currently disabled)
        sampling_ratio (float): The ratio of the minority class to the majority class
        numeric_features (list(str)): The numerical features of the model
        categorical_features (list(str)): The categorical features of the model
        pipeline: The fitted preprocessing/sampling/classifier pipeline
            (set by internal_fit)
        train_pop, target_pop, sampled_train_pop, sampled_target_pop:
            Population counts recorded by internal_fit and used by
            calibrate_probabilties
    """

    def __init__(self, model, use_scaler=False, sampling_ratio=1):
        """
        Stores the classifier and settings and defines the feature lists.

        Parameters:
            model: The classifier to put at the end of the pipeline
            use_scaler (bool): Stored on the instance
            sampling_ratio (float): Passed to the undersampler as its
                sampling_strategy
        Returns:
            None
        """
        self.model = model
        self.sampling_ratio = sampling_ratio
        self.use_scaler = use_scaler
        self.categorical_features = [
            'cardtype', 'limited_allowed', 'is_locked', 'is_bidomatic',
            'is_bidomatic0', 'is_bidomatic1', 'is_bidomatic2', 'is_bidomatic3'
        ]
        self.numeric_features = [
            'bid', 'cashvalue', 'bidvalue', 'prevusers', 'bids_so_far0',
            'perc_to_bin0', 'bom_bids_so_far0', 'bom_streak0',
            'prev_is_new_user0', 'prev_auction_count0', 'prev_overbid0',
            'prev_giveup_one0', 'prev_give_before_six0', 'prev_wins0',
            'prev_bids0', 'prev_bom_bids0', 'distance1', 'bids_so_far1',
            'perc_to_bin1', 'bom_bids_so_far1', 'bom_streak1',
            'prev_is_new_user1', 'prev_auction_count1', 'prev_overbid1',
            'prev_giveup_one1', 'prev_give_before_six1', 'prev_wins1',
            'prev_bids1', 'prev_bom_bids1', 'distance2', 'bids_so_far2',
            'perc_to_bin2', 'bom_bids_so_far2', 'bom_streak2',
            'prev_is_new_user2', 'prev_auction_count2', 'prev_overbid2',
            'prev_giveup_one2', 'prev_give_before_six2', 'prev_wins2',
            'prev_bids2', 'prev_bom_bids2', 'distance3', 'bids_so_far3',
            'perc_to_bin3', 'bom_bids_so_far3', 'bom_streak3',
            'prev_is_new_user3', 'prev_auction_count3', 'prev_overbid3',
            'prev_giveup_one3', 'prev_give_before_six3', 'prev_wins3',
            'prev_bids3', 'prev_bom_bids3', 'is_weekend', 'time_of_day'
        ]

    def get_features_as_string(self):
        """
        Returns all the features of the model.

        Parameters:
            None
        Returns:
            str: Categorical then numeric feature names, comma separated
        """
        return ",".join(self.categorical_features + self.numeric_features)

    def get_column_names_from_ColumnTransformer(self, column_transformer):
        """
        Collects the output column names of a fitted ColumnTransformer.

        Parameters:
            column_transformer: A fitted ColumnTransformer (its transformers_
                attribute is read)
        Returns:
            list: Column names, in the order of the transformers
        """
        col_name = []
        for name, step, raw_col_name in column_transformer.transformers_:
            # The 'remainder' entry is only present when some columns are
            # left over, so it is skipped by name rather than by position.
            if name == 'remainder':
                continue
            # For a pipeline, the last step determines the output columns.
            if isinstance(step, Pipeline):
                transformer = step.steps[-1][1]
            else:
                transformer = step
            try:
                names = transformer.get_feature_names(
                    self.categorical_features)
            except AttributeError:
                # No 'get_feature_names': try the newer
                # 'get_feature_names_out', else use the raw column names.
                get_names_out = getattr(transformer, 'get_feature_names_out',
                                        None)
                if get_names_out is not None:
                    names = get_names_out(raw_col_name)
                else:
                    names = raw_col_name
            if isinstance(names, np.ndarray):
                col_name += names.tolist()
            elif isinstance(names, list):
                col_name += names
            elif isinstance(names, str):
                col_name.append(names)
        return col_name

    def transform(self, X):
        """
        Transforms a copy of X, leaving X itself untouched.

        Parameters:
            X (DataFrame): The raw data
        Returns:
            DataFrame: The transformed copy
        """
        rX = X.copy()
        return self.transform_no_copy(rX)

    def transform_no_copy(self, X):
        """
        Transforms X in place: casts the is_bidomatic0-3 columns to str and
        adds the 'fee', 'time_of_day' and 'is_weekend' columns.

        Parameters:
            X (DataFrame): Must contain is_bidomatic0-3, 'cardvalue' and
                'auctiontime'
        Returns:
            DataFrame: The same object X, modified
        """
        for column in ('is_bidomatic0', 'is_bidomatic1', 'is_bidomatic2',
                       'is_bidomatic3'):
            X[column] = X[column].astype(str)
        X["fee"] = [
            0 if x == 0 else (1 if x < 50 else 1.99) for x in X["cardvalue"]
        ]
        X["time_of_day"] = [x.hour for x in X["auctiontime"]]
        # weekday() is 0 for Monday ... 5 for Saturday, 6 for Sunday.  The
        # previous '>= 6' test flagged Sundays only; models fitted with that
        # definition need to be refitted.
        X["is_weekend"] = [x.weekday() >= 5 for x in X["auctiontime"]]
        return X

    def internal_fit(self, X, y):
        """
        Fits self.model inside a preprocessing + undersampling pipeline and
        records the population counts used for calibration.

        Parameters:
            X (DataFrame): Already transformed data
            y: Binary target (summed to count the positive class)
        Returns:
            None
        """
        self.train_pop = X.shape[0]
        self.target_pop = sum(y)
        self.sampled_train_pop = self.target_pop / self.sampling_ratio + self.target_pop
        self.sampled_target_pop = self.target_pop
        # Numeric columns: fill missing values with -1 (no scaler step is
        # applied here).
        numeric_transformer = Pipeline_imb(
            steps=[('imputer', SimpleImputer(strategy='constant',
                                             fill_value=-1))])
        # Categorical columns: fill missing values, then one-hot encode.
        categorical_transformer = Pipeline_imb(
            steps=[('imputer',
                    SimpleImputer(strategy='constant', fill_value='unknown')),
                   ('onehot',
                    OneHotEncoder(handle_unknown='error', drop='first'))])
        preprocessor = ColumnTransformer(
            transformers=[('num', numeric_transformer, self.numeric_features),
                          ('cat', categorical_transformer,
                           self.categorical_features)])
        steps = [
            ('preprocessor', preprocessor),
            ('sampler',
             RandomUnderSampler(sampling_strategy=self.sampling_ratio)),
            ('classifier', self.model),
        ]
        self.pipeline = Pipeline_imb(steps=steps)
        print("4. Fitting model")
        self.pipeline.fit(X, y)

    def fit_already_transformed(self, X, y):
        """
        fits X if it's already been transformed.

        Parameters:
            X (DataFrame): Transformed data
            y: Target
        Returns:
            None
        """
        self.internal_fit(X, y)

    def fit_transform(self, X, y):
        """
        fits and transforms X.  Note that X is transformed in place.

        Parameters:
            X (DataFrame): Raw data (modified by this call)
            y: Target
        Returns:
            None
        """
        self.transform_no_copy(X)
        self.internal_fit(X, y)

    def pickle(self, filename):
        """
        Writes this class as a pickle file to filename

        Parameters:
            filename (str): Path of the file to write
        Returns:
            None
        """
        print("5. Pickling model as penny_auction.pickle")
        # The context manager closes the file even if dumping fails.
        with open(filename, "wb") as pickle_file:
            pickle.dump(self, pickle_file)

    def predict_proba(self, X):
        """
        Returns the predicted probabilities that the auction will end, in the
        UNDERSAMPLED data set.

        Parameters:
            X (DataFrame): Raw data (a transformed copy is scored)
        Returns:
            The output of the pipeline's predict_proba
        """
        return self.pipeline.predict_proba(self.transform(X))

    def predict_proba_calibrated(self, X):
        """
        Returns the probabilities from the model AFTER accounting for the
        undersampling.

        Parameters:
            X (DataFrame): Raw data
        Returns:
            The calibrated probabilities
        """
        return self.calibrate_probabilties(self.predict_proba(X))

    def predict(self, X):
        """
        Calls predict on the model to get binary whether or not the auction
        will end.

        Parameters:
            X (DataFrame): Raw data (a transformed copy is scored)
        Returns:
            The output of the pipeline's predict
        """
        return self.pipeline.predict(self.transform(X))

    def get_feature_scores(self):
        """
        Returns the feature importances from the model

        Parameters:
            None
        Returns:
            Series: feature_importances_ of the classifier, indexed by the
                preprocessor's output column names
        """
        return pd.Series(
            self.pipeline.named_steps['classifier'].feature_importances_,
            index=self.get_column_names_from_ColumnTransformer(
                self.pipeline.named_steps['preprocessor']))

    def calibrate_probabilties(self, data):
        """
        Recalibrates the probabilities to account for the undersampling.  So
        if the model says 20%, it will comeout as something like 1.2%

        Parameters:
            data: Probabilities from the undersampled model (scalar or array)
        Returns:
            The recalibrated probabilities, same shape as data
        """
        # Positive-class rate in the full and in the undersampled population.
        true_rate = self.target_pop / self.train_pop
        sampled_rate = self.sampled_target_pop / self.sampled_train_pop
        positive_term = data * true_rate / sampled_rate
        negative_term = (1 - data) * (1 - true_rate) / (1 - sampled_rate)
        calibrated_data = positive_term / (negative_term + positive_term)
        return calibrated_data

    def get_actual_and_potential_profits(self, X, y):
        """
        returns the actual and potential profits over X

        Parameters:
            X (DataFrame): Needs the cashvalue, fee and bid columns
            y: Binary outcome
        Returns:
            tuple: (potential_profits, actual_profits)
        """
        potential_profits = (X.cashvalue - X.fee - X.bid / 100) - .4
        actual_profits = y * (X.cashvalue - X.fee - X.bid / 100) - .4
        return potential_profits, actual_profits

    def get_score(self, X, y):
        """
        Returns the expected profit over the set X

        Parameters:
            X (DataFrame): Data that already has the 'fee' column
            y: Binary outcome
        Returns:
            Sum of the actual profits of the rows whose expected value is
            positive
        """
        cprobs = self.predict_proba_calibrated(X)[:, 1]
        pp, ap = self.get_actual_and_potential_profits(X, y)
        expected_value = np.multiply(cprobs, pp) - (1 - cprobs) * .4
        return sum(ap[expected_value > 0])
def train_model(transactions_details):
    """Fit and evaluate a fraud classifier, then return the fitted pipeline.

    The frame is split 80/20 into train and test sets.  The pipeline imputes
    and scales the numerical columns, one-hot encodes the categorical ones,
    oversamples with SMOTE and fits a random forest.  The confusion matrix,
    classification report, accuracy and ROC AUC on the test set are printed,
    and the ROC and precision-recall curves are plotted.

    Parameters:
        transactions_details: DataFrame containing the 'fraudster' target
            column plus the categorical and numerical columns listed below.

    Returns:
        The fitted pipeline.
    """
    X = transactions_details.drop(columns='fraudster')
    y = transactions_details['fraudster'].copy()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    categorical_cols = [
        'currency', 'transaction_state', 'type', 'source', 'entry_method',
        'is_crypto', 'merchant_country', 'phone_country', 'user_country',
        'kyc'
    ]
    numerical_cols = [
        'failed_sign_in_attempts', 'age', 'diff_in_days', 'amount_usd'
    ]

    # Pre-processing: impute + scale numerical columns, one-hot encode
    # categorical columns.
    preprocess = make_column_transformer(
        (make_pipeline(SimpleImputer(), StandardScaler()), numerical_cols),
        (OneHotEncoder(handle_unknown='ignore'), categorical_cols))

    # Create a pipeline
    model = Pipeline([('preprocess', preprocess),
                      ('sampling', SMOTE(random_state=42)),
                      ('classification', RandomForestClassifier())])

    # fit model
    model.fit(X_train, y_train)

    # Predict target vector
    y_pred = model.predict(X_test)
    print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))
    print('Classification report:\n', classification_report(y_test, y_pred))
    print(accuracy_score(y_test, y_pred))

    # Score of the positive class (second predict_proba column).  The curves
    # below are built from these scores rather than from the hard
    # predictions, so they are consistent with the reported ROC AUC instead
    # of collapsing to a single operating point.
    probs = model.predict_proba(X_test)
    fraud_scores = probs[:, 1]

    # Create true and false positive rates
    false_positive_rate, true_positive_rate, _ = roc_curve(
        y_test, fraud_scores)

    # Calculate Area Under the Receiver Operating Characteristic Curve
    roc_auc = roc_auc_score(y_test, fraud_scores)
    print('ROC AUC Score:', roc_auc)

    # Obtain precision and recall
    precision, recall, _ = precision_recall_curve(y_test, fraud_scores)

    # Calculate average precision
    average_precision = average_precision_score(y_test, fraud_scores)

    # Plot the roc curve
    plot_roc_curve(false_positive_rate, true_positive_rate, roc_auc)

    # Plot recall precision curve
    plot_pr_curve(recall, precision, average_precision)

    return model
from sklearn.metrics import plot_roc_curve

# Set the axes
ax = plt.gca()

# Plot the ROC curves of each GridSearch object on one graph for comparison.
logis_disp.plot(alpha=0.8, ax=ax)
gs_disp = plot_roc_curve(gs, X_test, y_test, alpha=0.8, ax=ax)
gs2_disp = plot_roc_curve(gs2, X_test, y_test, alpha=0.8, ax=ax)
# -

# Plot the confusion matrices of the fitted models.

from sklearn.metrics import plot_confusion_matrix

plot_confusion_matrix(search, X_test, y_test)
plot_confusion_matrix(gs, X_test, y_test)
plot_confusion_matrix(gs2, X_test, y_test)

# As we can see from the above results, it seems like the Bagged Classifier displays the best performance in terms of the greatest AUC and also in terms of the largest number of properly classified observations, hence we shall be using this as our final inferential model.

# # Final Model Pipeline

# +
# Build the final model using the tuned parameters from before
bestgbc = GradientBoostingClassifier(
    learning_rate=0.05,
    n_estimators=770,
    max_depth=3,
    min_samples_split=400,
    max_features=10,
    subsample=0.8,
    random_state=42,
)

# Put together the final pipeline with scaled inputs for the model, and make predictions.
finalpipe = Pipeline(
    steps=[
        ('scale', StandardScaler()),
        ('classifier', bestgbc),
    ]
)
finalpipe.fit(X_train_res, y_train_res.values.ravel())
y_predfinal = finalpipe.predict(X_test_res)
y_predprobs = finalpipe.predict_proba(X_test_res)