# N = int(len(X) * 9 / 10)
# X_train, y_train = X[:N], y[:N]
# X_test, y_test = X[N:], np.array(y[N:])

N_train = int(len(X) * 6 / 10)
N_valid = int(len(X) * 8 / 10)
X_train, y_train = X[:N_train], y[:N_train]
X_valid, y_valid = X[N_train:N_valid], y[N_train:N_valid]
X_test, y_test = X[N_valid:], np.array(y[N_valid:])

Cs = np.logspace(-2, 5, 10)
valid_predict = []

for C in Cs:
    estimator = LogisticRegression(class_weight='balanced', C=C)  # 'auto' was renamed to 'balanced'
    estimator.fit(X_train, y_train)
    y_predict_val = estimator.predict(X_valid)
    valid_predict.append(np.mean(y_predict_val == y_valid))

valid_predict = np.array(valid_predict)
C = Cs[np.argmax(valid_predict)]

print("C:", C, "Accurary(valid):", np.max(valid_predict))
# estimator = RandomForestClassifier(n_estimators=200)

estimator = LogisticRegression(class_weight='balanced', C=C)
estimator.fit(X_train, y_train)
y_predict = estimator.predict(X_test)
print(y_predict)
print(y_test)
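
# A hedged alternative to the manual C search above: LogisticRegressionCV
# runs the same kind of grid search internally with cross-validation
# (a sketch, not part of the original snippet).
from sklearn.linear_model import LogisticRegressionCV

estimator_cv = LogisticRegressionCV(Cs=np.logspace(-2, 5, 10),
                                    class_weight='balanced',
                                    cv=5)
estimator_cv.fit(X_train, y_train)
print("best C:", estimator_cv.C_[0])
print("Accuracy (test):", estimator_cv.score(X_test, y_test))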
Example #2
                               "punctuation", "length"
                           ], ["afinn", "dal", "liwc", "hl"],
                           ["language", "BoURL", "targetrelation"]],
    "es indipendencia test": [["BoW", "BoP", "BoL", "BoC"],
                              [
                                  "hashtagplus", "hashtag", "mention",
                                  "nummention", "numhashtag", "uppercase",
                                  "punctuation", "length"
                              ], ["afinn", "dal", "liwc", "hl"],
                              ["language", "BoURL", "targetrelation"]],
}

clfs = {
    "NB": GaussianNB(),
    "SVM": SVC(kernel="linear"),
    "LR": LogisticRegression()
}

for i in range(0, len(training)):

    for key, clf in clfs.items():
        print(key, label[i])

        tweets_training = training[i]
        tweets_test = test[i]
        stance_training = numpy.array(
            feature_manager.get_stance(tweets_training))
        stance_test = numpy.array(feature_manager.get_stance(tweets_test))

        prec, recall, f, support = precision_recall_fscore_support(
            stance_test,
# -*- coding: utf-8 -*-
"""
Created on Mon Dec  3 15:53:48 2018

@author: Administrator
"""
from sklearn.linear_model import LogisticRegression
import logic_regression
from sklearn.model_selection import train_test_split
X, y = logic_regression.loadData()
clss = LogisticRegression()
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)
clss.fit(X_train, y_train)
print(clss.coef_)
print(clss.intercept_)

print(clss.score(X_test, y_test))
Example #4
def test_logistic_regression_sample_weights():
    X, y = make_classification(n_samples=20, n_features=5, n_informative=3,
                               n_classes=2, random_state=0)
    sample_weight = y + 1

    for LR in [LogisticRegression, LogisticRegressionCV]:

        # Test that passing sample_weight as ones is the same as
        # not passing them at all (default None)
        for solver in ['lbfgs', 'liblinear']:
            clf_sw_none = LR(solver=solver, fit_intercept=False,
                             random_state=42)
            clf_sw_none.fit(X, y)
            clf_sw_ones = LR(solver=solver, fit_intercept=False,
                             random_state=42)
            clf_sw_ones.fit(X, y, sample_weight=np.ones(y.shape[0]))
            assert_array_almost_equal(
                clf_sw_none.coef_, clf_sw_ones.coef_, decimal=4)

        # Test that sample weights work the same with the lbfgs,
        # newton-cg, and 'sag' solvers
        clf_sw_lbfgs = LR(solver='lbfgs', fit_intercept=False, random_state=42)
        clf_sw_lbfgs.fit(X, y, sample_weight=sample_weight)
        clf_sw_n = LR(solver='newton-cg', fit_intercept=False, random_state=42)
        clf_sw_n.fit(X, y, sample_weight=sample_weight)
        clf_sw_sag = LR(solver='sag', fit_intercept=False, tol=1e-10,
                        random_state=42)
        # ignore convergence warning due to small dataset
        with ignore_warnings():
            clf_sw_sag.fit(X, y, sample_weight=sample_weight)
        clf_sw_liblinear = LR(solver='liblinear', fit_intercept=False,
                              random_state=42)
        clf_sw_liblinear.fit(X, y, sample_weight=sample_weight)
        assert_array_almost_equal(
            clf_sw_lbfgs.coef_, clf_sw_n.coef_, decimal=4)
        assert_array_almost_equal(
            clf_sw_lbfgs.coef_, clf_sw_sag.coef_, decimal=4)
        assert_array_almost_equal(
            clf_sw_lbfgs.coef_, clf_sw_liblinear.coef_, decimal=4)

        # Test that passing class_weight as [1,2] is the same as
        # passing class weight = [1,1] but adjusting sample weights
        # to be 2 for all instances of class 1
        for solver in ['lbfgs', 'liblinear']:
            clf_cw_12 = LR(solver=solver, fit_intercept=False,
                           class_weight={0: 1, 1: 2}, random_state=42)
            clf_cw_12.fit(X, y)
            clf_sw_12 = LR(solver=solver, fit_intercept=False, random_state=42)
            clf_sw_12.fit(X, y, sample_weight=sample_weight)
            assert_array_almost_equal(
                clf_cw_12.coef_, clf_sw_12.coef_, decimal=4)

    # Test the above for l1 penalty and l2 penalty with dual=True,
    # since the patched liblinear code is different.
    clf_cw = LogisticRegression(
        solver="liblinear", fit_intercept=False, class_weight={0: 1, 1: 2},
        penalty="l1", tol=1e-5, random_state=42)
    clf_cw.fit(X, y)
    clf_sw = LogisticRegression(
        solver="liblinear", fit_intercept=False, penalty="l1", tol=1e-5,
        random_state=42)
    clf_sw.fit(X, y, sample_weight)
    assert_array_almost_equal(clf_cw.coef_, clf_sw.coef_, decimal=4)

    clf_cw = LogisticRegression(
        solver="liblinear", fit_intercept=False, class_weight={0: 1, 1: 2},
        penalty="l2", dual=True, random_state=42)
    clf_cw.fit(X, y)
    clf_sw = LogisticRegression(
        solver="liblinear", fit_intercept=False, penalty="l2", dual=True,
        random_state=42)
    clf_sw.fit(X, y, sample_weight)
    assert_array_almost_equal(clf_cw.coef_, clf_sw.coef_, decimal=4)
Example #5
def main():

    # COLUMN NAMES
    #   sepal-length
    #   sepal-width
    #   petal-length
    #   petal-width
    #   class: 0, 1, 2

    # Get default project directory path
    project_directory_path = os.path.dirname(sys.argv[0])
    input_file_path = os.path.join(project_directory_path,
                                   config.INPUT_FILE_PATH)
    output_report_path = os.path.join(project_directory_path,
                                      config.OUTPUT_REPORT_PATH)

    #   Open output report file for append
    global output_report_file
    output_report_file = open(output_report_path, 'a')

    #     1. LOAD DATA INTO PANDAS DATAFRAME
    df_data = pd.read_csv(filepath_or_buffer=input_file_path)
    print(df_data)
    print()
    #
    #     DATA PREPROCESSING
    #    df_data['preg_count'] = df_data['preg_count'].map( lambda x : df_data.preg_count.median() if x == 0 else x)
    #    df_data['glucose_concentration'] = df_data['glucose_concentration'].map( lambda x : df_data.glucose_concentration.median() if x == 0 else x)
    #    df_data['blood_pressure'] = df_data['blood_pressure'].map( lambda x : df_data.blood_pressure.median() if x == 0 else x)
    #    df_data['skin_thickness'] = df_data['skin_thickness'].map( lambda x : df_data.skin_thickness.median() if x == 0 else x)
    #    df_data['serum_insulin'] = df_data['serum_insulin'].map( lambda x : df_data.serum_insulin.median() if x == 0 else x)
    #    df_data['bmi'] = df_data['bmi'].map( lambda x : df_data.bmi.median() if x == 0 else x)

    #         this code needs to be tested!
    #     df_data = df_data.replace(0, np.nan)
    #     df_data.fillna(value=df_data.median(), inplace=True)
    #     df_data = df_data.fillna(df_data.median())
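
    #     A hedged, runnable version of the idea above (an assumption about
    #     the intended preprocessing; zeros should only be treated as
    #     missing in columns where 0 is invalid, never in the target):
    #     zero_as_missing = ["glucose_concentration", "blood_pressure",
    #                        "skin_thickness", "serum_insulin", "bmi"]
    #     df_data[zero_as_missing] = df_data[zero_as_missing].replace(0, np.nan)
    #     df_data = df_data.fillna(df_data.median())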

    #     SHOW df_data AFTER DATA PREPROCESSING
    #     print(df_data)
    #     print()

    #     PRINT DATAFRAME INFORMATION
    #     df_data.info()
    #     print()
    #     2. DEFINE THE FEATURES
    X = df_data.drop(labels="class", axis=1)
    feature_name = X.columns.values

    #     3. DEFINE THE TARGET
    y = df_data["class"]
    y_unique_class = list(y.unique())
    print(y)

    #     4. GET TRAIN AND TEST DATA
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        stratify=y,
                                                        random_state=1)

    #     5. SCALE THE DATA - STANDARDIZE FEATURES BY REMOVING THE MEAN AND SCALING TO UNIT VARIANCE
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    #     result = 76.62 %

    #     robust_scaler = RobustScaler(quantile_range=(25, 75))
    #     robust_scaler.fit_transform(X_train)
    #     X_train = robust_scaler.transform(X_train)
    #     X_test = robust_scaler.transform(X_test)
    #     result = 75.76 %

    #     normalizer_scaler = Normalizer()
    #     normalizer_scaler.fit_transform(X_train)
    #     X_train = normalizer_scaler.transform(X_train)
    #     X_test = normalizer_scaler.transform(X_test)
    #     result = 63.2 %

    #     min_max_scaler = MinMaxScaler()
    #     min_max_scaler.fit_transform(X_train)
    #     X_train = min_max_scaler.transform(X_train)
    #     X_test = min_max_scaler.transform(X_test)
    #     result = 76.62 %

    #     max_abs_scaler = MaxAbsScaler()
    #     max_abs_scaler.fit_transform(X_train)
    #     X_train = max_abs_scaler.transform(X_train)
    #     X_test = max_abs_scaler.transform(X_test)
    #     result = 75.32%

    #     ---------------------------------------------------------------------------------------------
    #     6.1 CREATE MULTI-LAYER PERCEPTRON CLASSIFIER MODEL
    if (config.RUN_MLP_CLASSIFIER):
        model_type = Model.MLP_CLASSIFIER
        print("MULTI-LAYER PERCEPTRON CLASSIFIER", file=output_report_file)
        classifier_model = MLPClassifier(activation="identity",
                                         hidden_layer_sizes=(100, 100, 100),
                                         max_iter=1000,
                                         random_state=1)
        cv_folds_list = [5, 10, 15, 20]
        run_model(classifier_model, model_type, feature_name, False, X_train,
                  y_train, X_test, y_test, y_unique_class, cv_folds_list)

#     6.2 BUILDING A LOGISTIC REGRESSION IN PYTHON, STEP BY STEP
#     https://towardsdatascience.com/building-a-logistic-regression-in-python-step-by-step-becd4d56c9c8
    if (config.RUN_LOGISTIC_REGRESSION):
        print("LOGISTIC REGRESSION CLASSIFIER", file=output_report_file)
        model_type = Model.LOGISTIC_REGRESSION
        classifier_model = LogisticRegression(random_state=1,
                                              solver="liblinear",
                                              max_iter=1000)
        cv_folds_list = [5, 10, 15, 20]
        run_model(classifier_model, model_type, feature_name, False, X_train,
                  y_train, X_test, y_test, y_unique_class, cv_folds_list)

#     6.3 RANDOM FOREST CLASSIFIER
    if (config.RUN_RANDOMFOREST_CLASSIFIER):
        print("RANDOM FOREST CLASSIFIER", file=output_report_file)
        best_rfc_params = best_params(X_train, y_train)
        print("Best params:")
        print(best_rfc_params)
        print()

        model_type = Model.RANDOMFOREST_CLASSIFIER
        #classifier_model = RandomForestClassifier(criterion="gini", max_depth=5, n_estimators=10, max_features=1, n_jobs=2, random_state=1)
        #classifier_model = RandomForestClassifier(criterion="gini", max_depth=5, n_estimators=10, max_features=2, n_jobs=2, random_state=1)
        #classifier_model = RandomForestClassifier(criterion="entropy", max_depth=None, n_estimators=10, max_features="auto", n_jobs=-1, random_state=1)
        classifier_model = RandomForestClassifier(criterion="gini",
                                                  max_depth=None,
                                                  n_estimators=10,
                                                  max_features="sqrt",  # "auto" (= "sqrt" here) was removed in newer sklearn
                                                  n_jobs=-1,
                                                  random_state=1)
        # 97.78% accuracy score
        cv_folds_list = [5, 10, 15, 20]
        run_model(classifier_model, model_type, feature_name, True, X_train,
                  y_train, X_test, y_test, y_unique_class, cv_folds_list)

#     6.4 ADA BOOST CLASSIFIER
    if (config.RUN_ADA_BOOST):
        print("ADA BOOST CLASSIFIER", file=output_report_file)
        model_type = Model.ADA_BOOST_CLASSIFIER
        classifier_model = AdaBoostClassifier(
            DecisionTreeClassifier(max_depth=5),
            algorithm="SAMME",
            n_estimators=200,
            random_state=1)
        cv_folds_list = [5, 10, 15, 20]
        run_model(classifier_model, model_type, feature_name, True, X_train,
                  y_train, X_test, y_test, y_unique_class, cv_folds_list)

#     6.5 GRADIENT BOOSTING CLASSIFIER
    if (config.RUN_GRADIENT_BOOST):
        print("GRADIENT BOOSTING CLASSIFIER", file=output_report_file)
        model_type = Model.GRADIENT_BOOST
        classifier_model = GradientBoostingClassifier(n_estimators=1000,
                                                      criterion="friedman_mse",
                                                      max_leaf_nodes=4,
                                                      random_state=1)
        cv_folds_list = [5, 10, 15, 20]
        run_model(classifier_model, model_type, feature_name, True, X_train,
                  y_train, X_test, y_test, y_unique_class, cv_folds_list)

#     6.6 DECISION TREE CLASSIFIER
    if (config.RUN_DECISIONTREE_CLASSIFIER):
        print("DECISION TREE CLASSIFIER", file=output_report_file)
        model_type = Model.DECISIONTREE_CLASSIFIER
        classifier_model = DecisionTreeClassifier(criterion="gini",
                                                  splitter="best",
                                                  max_depth=3,
                                                  min_samples_leaf=5,
                                                  random_state=1)
        cv_folds_list = [5, 10, 15, 20]
        run_model(classifier_model, model_type, feature_name, True, X_train,
                  y_train, X_test, y_test, y_unique_class, cv_folds_list)

#     6.7 SUPPORT VECTOR MACHINES
    if (config.RUN_SVM):
        print("SUPPORT VECTOR MACHINES", file=output_report_file)
        model_type = Model.SVM
        classifier_model = SVC(C=1, kernel="linear", gamma=1, random_state=1)
        cv_folds_list = [5, 10, 15, 20]
        run_model(classifier_model, model_type, feature_name, False, X_train,
                  y_train, X_test, y_test, y_unique_class, cv_folds_list)

#    6.8 GAUSSIAN PROCESS CLASSIFIER
    if (config.RUN_GAUSSIAN_CLASSIFIER):
        print("GAUSSIAN PROCESS CLASSIFIER", file=output_report_file)
        model_type = Model.GAUSSIAN_CLASSIFIER
        classifier_model = GaussianProcessClassifier(kernel=1.0 * RBF(1.0),
                                                     optimizer="fmin_l_bfgs_b",
                                                     random_state=1)
        cv_folds_list = [5, 10, 15, 20]
        run_model(classifier_model, model_type, feature_name, False, X_train,
                  y_train, X_test, y_test, y_unique_class, cv_folds_list)
#     result = 79.0 % (+/- 3.53 %) - very good!

#     6.9 GAUSSIAN NAIVE BAYES (GAUSSIANNB)
    if (config.RUN_GAUSSIANB_CLASSIFIER):
        print("GAUSSIAN NAIVE BAYES (GAUSSIANNB)", file=output_report_file)
        model_type = Model.GAUSSIANB_CLASSIFIER
        classifier_model = GaussianNB()
        cv_folds_list = [5, 10, 15, 20]
        run_model(classifier_model, model_type, feature_name, False, X_train,
                  y_train, X_test, y_test, y_unique_class, cv_folds_list)

#     6.10 K-NEAREST NEIGHBORS CLASSIFIER
    if (config.RUN_KNEAREST_NEIGHBORS):
        print("K-NEAREST NEIGHBORS CLASSIFIER", file=output_report_file)
        model_type = Model.KNEAREST_NEIGHBORS
        classifier_model = KNeighborsClassifier(n_neighbors=5,
                                                weights="uniform",
                                                algorithm="auto")
        cv_folds_list = [5, 10, 15, 20]
        run_model(classifier_model, model_type, feature_name, False, X_train,
                  y_train, X_test, y_test, y_unique_class, cv_folds_list)

#     6.11 XGBOOST CLASSIFIER
    if (config.RUN_XGBOOST_CLASSIFIER):
        print("XGBOOST CLASSIFIER", file=output_report_file)
        model_type = Model.XGBOOST_CLASSIFIER
        #       need to apply GridSearchCV() to determine best hyperparameters
        classifier_model = XGBClassifier()
        cv_folds_list = [5, 10, 15, 20]
        #run_cross_validation_score(classifier_model, X_train, y_train, cv_folds_list)
        #print(file=output_report_file)

        # Default configuration echoed from an older xgboost version, kept
        # as a commented reference (parameters such as `silent` no longer
        # exist in current releases):
        # XGBClassifier(base_score=0.5, colsample_bytree=1, gamma=0,
        #               learning_rate=0.1, max_delta_step=0, max_depth=3,
        #               min_child_weight=1, missing=None, n_estimators=100,
        #               nthread=-1, objective='multi:softprob', seed=0,
        #               silent=True, subsample=1)
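
        # A sketch of the grid search suggested in the comment above; the
        # parameter grid is an illustrative assumption, not a tuned result.
        from sklearn.model_selection import GridSearchCV
        xgb_param_grid = {"max_depth": [3, 5, 7],
                          "learning_rate": [0.01, 0.1, 0.3],
                          "n_estimators": [100, 200]}
        xgb_grid = GridSearchCV(XGBClassifier(), xgb_param_grid,
                                scoring="accuracy", cv=5, n_jobs=-1)
        xgb_grid.fit(X_train, y_train)
        classifier_model = xgb_grid.best_estimator_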

        run_model(classifier_model, model_type, feature_name, True, X_train,
                  y_train, X_test, y_test, y_unique_class, cv_folds_list)
classes = [  # list head truncated in the original snippet
    'Distras',
    'Trans',
]

print('Computing whole-image texture features...')
features = []
labels = []
for ci, cl in enumerate(classes):
    images = glob('{}/{}/*.jpg'.format(basedir, cl))
    features.extend(features_for(images))
    labels.extend([ci for _ in images])

features = np.array(features)
labels = np.array(labels)

from sklearn.model_selection import cross_val_score  # sklearn.cross_validation was removed

scores0 = cross_val_score(LogisticRegression(),
                          features,
                          labels,
                          cv=10)
print('Accuracy (10 fold x-val) with Logistic Regression [std features]: %s%%' %
      (0.1 * round(1000 * scores0.mean())))

tfeatures = features

from sklearn.cluster import KMeans
from mahotas.features import surf

images = []
labels = []

for ci, cl in enumerate(classes):
Example #7
def test_predict_3_classes():
    check_predictions(LogisticRegression(C=10), X, Y2)
    check_predictions(LogisticRegression(C=10), X_sp, Y2)
Example #8
data.shape

# Inspect the first 10 rows
data.head(10)

# Extract the feature columns
feature = data[data.columns[:-1]]
feature.head()

# Extract the target column, replacing -1 with 0
target = data[15].replace(-1, 0)
target.head()

# Split into training and test sets
from sklearn.model_selection import train_test_split
feature_train, feature_test, target_train, target_test = train_test_split(
    feature, target, test_size=0.3)

# Initialise the model
from sklearn.linear_model import LogisticRegression
logistic_model = LogisticRegression()

# Train the model
logistic_model.fit(feature_train, target_train)

# Predict
logistic_model.predict(feature_test)

# Check the model's accuracy
logistic_model.score(feature_test, target_test)
Example #9
def main():
    # 1. Load the data (training and test) and preprocess it
    colnames = [
        'ID', 'label', 'loan_amnt', 'loan_mon', 'int_rate', 'installment',
        'grade', 'home_ownership', 'annual_inc', 'verification_status',
        'issue_d', 'loan_status', 'dti', 'earliest_cr_line', 'inq_last_6mths',
        'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc'
    ]
    col_nas = [
        '', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA',
        'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA'
    ]
    col_na_values = creatDictKV(colnames, col_nas)
    dftrain = pd.read_csv("data/lendtrain1.csv",
                          names=colnames,
                          na_values=col_na_values,
                          skiprows=[0])
    #print(dftrain)
    train_id = [int(x) for x in dftrain.pop("ID")]
    y_train = np.asarray([int(x) for x in dftrain.pop("label")])
    x_train = dftrain.values  # .as_matrix() was removed from pandas

    dftest = pd.read_csv("data/lendtest.csv",
                         names=colnames,
                         na_values=col_na_values,
                         skiprows=[0])
    test_id = [int(x) for x in dftest.pop("ID")]
    y_test = np.asarray(dftest.pop("label"))
    x_test = dftest.values
    print(y_train)

    # 2. Use StratifiedShuffleSplit to split the training data into training_new and test_new (used to validate the model)
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.33333, random_state=0)
    for train_index, test_index in sss.split(x_train, y_train):
        print("TRAIN:", train_index, "TEST:", test_index)
        x_train_new, x_test_new = x_train[train_index], x_train[test_index]
        y_train_new, y_test_new = y_train[train_index], y_train[test_index]

    y_train = y_train_new
    x_train = x_train_new

    # 3. Replace NaN with the column mean (SimpleImputer; sklearn's old Imputer was removed)
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    imp.fit(x_train)
    x_train = imp.transform(x_train)
    x_test_new = imp.transform(x_test_new)
    x_test = imp.transform(x_test)

    # 4. Build an RF model on the training_new data:
    # a. set the rf parameter class_weight from "balanced" to "balanced_subsample":
    #    n_samples / (n_classes * np.bincount(y))
    rf = RandomForestClassifier(n_estimators=100,
                                oob_score=True,
                                min_samples_split=2,
                                min_samples_leaf=50,
                                n_jobs=-1,
                                class_weight='balanced_subsample',
                                bootstrap=True)
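
    # Quick check of the balanced-weight formula quoted above (a sketch,
    # not part of the original script; assumes the two-class labels 0/1):
    balanced_w = len(y_train) / (2.0 * np.bincount(y_train))
    print("per-class weights:", balanced_w)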

    # Model comparison
    print(y_train)
    print(type(y_train))
    lr = LogisticRegression()
    lr.fit(x_train, y_train)
    predicted_probs_train = lr.predict_proba(x_train)
    predicted_probs_train = [x[1] for x in predicted_probs_train]
    computeAUC(y_train, predicted_probs_train)

    predicted_probs_test_new = lr.predict_proba(x_test_new)
    predicted_probs_test_new = [x[1] for x in predicted_probs_test_new]
    computeAUC(y_test_new, predicted_probs_test_new)

    model = tree.DecisionTreeClassifier()
    model.fit(x_train, y_train)
    predicted_probs_train = model.predict_proba(x_train)
    predicted_probs_train = [x[1] for x in predicted_probs_train]
    computeAUC(y_train, predicted_probs_train)

    predicted_probs_test_new = model.predict_proba(x_test_new)  # evaluate the tree, not lr
    predicted_probs_test_new = [x[1] for x in predicted_probs_test_new]
    computeAUC(y_test_new, predicted_probs_test_new)

    # Print the feature-importance ranking
    rf.fit(x_train, y_train)
    print(
        sorted(zip(map(lambda x: round(x, 4), rf.feature_importances_),
                   dftrain.columns),
               reverse=True))
    #    importances = rf.feature_importances_
    #    indices = np.argsort(importances)[::-1]
    #    feat_labels = dftrain.columns
    #    for f in range(x_train.shape[1]):
    #        print("%2d) %-*s %f" % (f + 1, 30, feat_labels[f], importances[indices[f]]))

    # b. Tune parameters with cross-validated grid search
    param_grid = {"max_features": [2, 3, 4], "min_samples_leaf": [50]}
    grid_search = GridSearchCV(rf, cv=10, param_grid=param_grid)  # iid= was removed from GridSearchCV

    # c. Report the best model and predict on the test data
    # Build the model with the optimal parameters on the training_new data
    grid_search.fit(x_train, y_train)
    print("the best parameter:", grid_search.best_params_)
    print("the best score:", grid_search.best_score_)

    # Use the trained model to predict the train_new data
    predicted_probs_train = grid_search.predict_proba(x_train)
    predicted_probs_train = [x[1] for x in predicted_probs_train]
    computeAUC(y_train, predicted_probs_train)
    # Use the trained model to predict the test_new data (validation data)
    predicted_probs_test_new = grid_search.predict_proba(x_test_new)
    predicted_probs_test_new = [x[1] for x in predicted_probs_test_new]
    computeAUC(y_test_new, predicted_probs_test_new)
    # Use the model to predict the test data
    predicted_probs_test = grid_search.predict_proba(x_test)
    predicted_probs_test = ["%.9f" % x[1] for x in predicted_probs_test]
    submission = pd.DataFrame({
        'Id': test_id,
        'Probability': predicted_probs_test
    })
    submission.to_csv("rf_benchmark.csv", index=False)
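
# computeAUC() is called above but not defined in this snippet; a minimal
# sketch of what it might look like, assuming sklearn's roc_auc_score:
from sklearn.metrics import roc_auc_score

def computeAUC(y_true, y_scores):
    auc = roc_auc_score(y_true, y_scores)
    print("AUC: %.4f" % auc)
    return auc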
Example #10
def getPipeline():

    return Pipeline([('vect',
                      TfidfVectorizer(stop_words='english',
                                      sublinear_tf=True)),
                     ('clf', LogisticRegression())])
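
# A hypothetical usage sketch for getPipeline(); the toy documents and
# labels below are invented for illustration only.
pipeline = getPipeline()
docs = ["great product, works well", "terrible, broke after a day"]
doc_labels = [1, 0]
pipeline.fit(docs, doc_labels)
print(pipeline.predict(["works great"]))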
Example #11
clf_SVM = SVC(verbose=False,  # call head truncated in the original snippet
              max_iter=-1,
              decision_function_shape='ovr',
              random_state=None)
clf_SVM.fit(training_vector, training_target)
print("SVM Prediction:")
prediction_SVM = clf_SVM.predict(test_vector)
print(prediction_SVM)
print("\n SVM Accuracy:")
print(clf_SVM.score(test_vector, test_target))
print("\n SVM Confusion Matrix:")
print(confusion_matrix(test_target, prediction_SVM))  # (y_true, y_pred) order

#Analysing using logistic regression
print("\n ************************LOGITIC REGRESSION********************")
clf_logi = LogisticRegression(penalty='l2',
                              tol=0.0001,
                              C=2.0,
                              fit_intercept=True,
                              intercept_scaling=1,
                              class_weight=None,
                              random_state=None)
clf_logi.fit(training_vector, training_target)
print("logistic regression Prediction:")
prediction_logi = clf_logi.predict(test_vector)
print(prediction_logi)
print("\n logistic regression Accuracy:")
print(clf_logi.score(test_vector, test_target))
print("\n logistic regression Confusion Matrix:")
print(confusion_matrix(test_target, prediction_logi))
Example #12
QENC_DIFF = False
qenc_width = 33
n_classes = 2

n_users = 1000
max_runs = None
percTest = 0.10

predictors = [
    # DummyClassifier(strategy="stratified"),
    # DummyClassifier(strategy="uniform"),
    # BernoulliNB(),
    LinearSVC(max_iter=100, class_weight="balanced"),
    MLPClassifier(max_iter=100, nesterovs_momentum=True,
                  early_stopping=True),  #, activation="logistic"),
    LogisticRegression(class_weight='balanced'),
    # GaussianNB(),
]

predictor_params = [
    # None,
    # None,
    # {'n_iter':50, 'alpha': numpy.logspace(-3, 2) },
    {
        'n_iter': 50,
        'C': numpy.logspace(-3, 2)
    },
    {
        'n_iter': 250,
        'hidden_layer_sizes': [(100, ), (66, 10)],
        'learning_rate_init': [0.001, 0.01, 0.1],
#print(egitimcikti.shape)
"""
ros = over_sampling.RandomOverSampler()
rosegitimgirdi,rosegitimcikti = ros.fit_sample(egitimgirdi, egitimcikti)
print(rosegitimgirdi.shape)

smote = over_sampling.SMOTE()
smoteegitimgirdi,smoteegitimcikti = smote.fit_sample(egitimgirdi, egitimcikti)
print(smoteegitimgirdi.shape)

adasyn = over_sampling.ADASYN()
adasynegitimgirdi,adasynegitimcikti = adasyn.fit_sample(egitimgirdi, egitimcikti)
print(adasynegitimgirdi.shape)
"""
models = []
models.append(("LR", LogisticRegression()))
models.append(("LDA", LinearDiscriminantAnalysis()))
models.append(("KNN", KNeighborsClassifier()))
models.append(("DCT", DecisionTreeClassifier()))
models.append(("GNB", GaussianNB()))
models.append(("SVC", SVC()))
models.append(("MLP", MLPClassifier()))
models.append(("ADB", AdaBoostClassifier()))
models.append(('RAF', RandomForestClassifier()))
"""
for name,model in models:  # the imbalance problem: skewed class counts bias the result toward the majority class
#    egitilmismodel = model.fit(egitimgirdi,egitimcikti)
#    egitilmismodelros = model.fit(rosegitimgirdi,rosegitimcikti)
#    egitilmismodelsmote = model.fit(smoteegitimgirdi,smoteegitimcikti)
    egitilmismodeladasyn = model.fit(adasynegitimgirdi,adasynegitimcikti)
    
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, KFold

iris = datasets.load_iris()

score = cross_val_score(estimator=LogisticRegression(),
                        X=iris.data,
                        y=iris.target,
                        cv=KFold(n_splits=10))

print(score.mean())
print(score.std())
features = np.array(features)
labels = np.array(labels)
n = features.shape
nl = labels.shape
print('features=' + str(n))
print(str(features))
print('labels=' + str(nl))
print(str(labels))

features_sobels = np.hstack([np.atleast_2d(sobels).T, features])

np.savetxt("featuresDogsCatsE.zat", features, delimiter=",")
np.savetxt("featuresDogsCats_sobelsE.zat", features_sobels, delimiter=",")
np.savetxt("labelsDogsCatsE.zat", labelVect, delimiter=",")

from sklearn.model_selection import cross_val_score  # sklearn.cross_validation was removed

scores = cross_val_score(LogisticRegression(),
                         features,
                         labels,
                         cv=5)
print('Accuracy (5 fold x-val) with Logistic Regression [std features]: {}%'.
      format(0.1 * round(1000 * scores.mean())))

scores = cross_val_score(
    LogisticRegression(),
    np.hstack([np.atleast_2d(sobels).T, features]),
    labels,
    cv=5)
print(
    'Accuracy (5 fold x-val) with Logistic Regression [std features + sobel]: {}%'
    .format(0.1 * round(1000 * scores.mean())))
Example #16
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score

from soma_workflow.client import WorkflowController

from mempamal.configuration import JSONify_estimator, JSONify_cv, build_dataset
from mempamal.workflow import create_wf, save_wf
from mempamal.datasets import iris

# create a simple pipeline with a StandardScaler and a LogisticRegression
s1 = StandardScaler(with_mean=True, with_std=False)
s2 = LogisticRegression()
p = [("scaler", s1), ("logit", s2)]
est = Pipeline(p)

# get the iris dataset
X, y = iris.get_data()

# jsonify the method and a cross-validation scheme
method_conf = JSONify_estimator(est, out="./est.json")
cv_conf = JSONify_cv(StratifiedKFold,
                     cv_kwargs={"n_splits": 5},
                     score_func=f1_score,
                     stratified=True,
                     out="./cv.json")
# build the dataset file
dataset = build_dataset(X, y, method_conf, cv_conf, ".", compress=1)
Example #17
def create_model():
    from sklearn.linear_model import LogisticRegression
    clf = LogisticRegression()
    return clf
Example #18
def test_multinomial_validation():
    for solver in ['lbfgs', 'newton-cg', 'sag']:
        lr = LogisticRegression(C=-1, solver=solver, multi_class='multinomial')
        assert_raises(ValueError, lr.fit, [[0, 1], [1, 0]], [0, 1])
Example #19
sentence_root.sort_values(["select"], inplace=True)

train_root = sentence_root
test_root = sentence_root[-300:].copy()  # copy to avoid SettingWithCopy on the assignments below

# train on root of the sentence
print("Classifying using sentence root")
vectorizer = TfidfVectorizer(token_pattern=u'(?u)\\b\\w+\\b',
                             use_idf=False,
                             stop_words=[])
train_matrix = vectorizer.fit_transform(train_root['sentence'])
words = vectorizer.get_feature_names_out()  # get_feature_names() was removed from sklearn

clf_dic = {}
for topic in train_root.columns[1:-1]:
    clf = LogisticRegression(class_weight='balanced', C=10)
    clf.fit(train_matrix, train_root[topic])
    clf_dic[topic] = clf

vectorizer = TfidfVectorizer(token_pattern=u'(?u)\\b\\w+\\b',
                             use_idf=False,
                             stop_words=[],
                             vocabulary=words)
for topic, clf in clf_dic.items():
    output = []
    for i in list(test_root.sentence):
        features = vectorizer.fit_transform([i])
        y_pred = clf.predict(features)
        output.append(y_pred[0])
    test_root[topic + "_pred"] = pd.Series(output, index=test_root.index)
    f1_bin = f1_score(test_root[topic], test_root[topic + "_pred"])
Example #20
def test_nan():
    # Test proper NaN handling.
    # Regression test for Issue #252: fit used to go into an infinite loop.
    Xnan = np.array(X, dtype=np.float64)
    Xnan[0, 1] = np.nan
    LogisticRegression(random_state=0).fit(Xnan, Y1)
Example #21
def test_multinomial_validation(solver):
    lr = LogisticRegression(C=-1, solver=solver, multi_class='multinomial')
    assert_raises(ValueError, lr.fit, [[0, 1], [1, 0]], [0, 1])
print('Training-set AUC: {:.2%}'.format(
    roc_auc_score(train_all_train_Y, y_train_proba[:, 1])))
print('Test-set AUC: {:.2%}'.format(
    roc_auc_score(train_all_test_Y, y_test_proba[:, 1])))

### Stacking model ensemble

from sklearn import datasets
from sklearn.model_selection import StratifiedKFold
from sklearn.datasets import make_blobs  # samples_generator was removed
from sklearn.linear_model import LogisticRegression
'''Create the base models for the stacking ensemble'''
clfs = [
    AdaBoostClassifier(),
    RandomForestClassifier(n_estimators=50, n_jobs=-1, criterion='entropy'),
    LogisticRegression(C=0.01),
    ExtraTreesClassifier(n_estimators=50, n_jobs=-1, criterion='entropy'),
    GradientBoostingClassifier(learning_rate=0.05,
                               subsample=0.5,
                               max_depth=6,
                               n_estimators=50)
]
'''Split the dataset into training and test sets'''

X_train, X_test, y_train, y_test = train_test_split(woe_train_X,
                                                    train_Y,
                                                    test_size=0.3)

dataset_blend_train = np.zeros((X_train.shape[0], len(clfs)))
dataset_blend_test = np.zeros((X_test.shape[0], len(clfs)))
'''5-fold stacking'''
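
# The snippet truncates after the "5-fold stacking" marker; a standard
# out-of-fold stacking loop over the blend matrices above might look like
# this (a sketch, not the original author's code):
skf = StratifiedKFold(n_splits=5)
X_train_arr, y_train_arr = np.asarray(X_train), np.asarray(y_train)
X_test_arr = np.asarray(X_test)
for j, clf in enumerate(clfs):
    test_fold_preds = np.zeros((X_test_arr.shape[0], skf.get_n_splits()))
    for i, (tr_idx, val_idx) in enumerate(skf.split(X_train_arr, y_train_arr)):
        clf.fit(X_train_arr[tr_idx], y_train_arr[tr_idx])
        # out-of-fold predictions become the level-2 training features
        dataset_blend_train[val_idx, j] = clf.predict_proba(
            X_train_arr[val_idx])[:, 1]
        test_fold_preds[:, i] = clf.predict_proba(X_test_arr)[:, 1]
    # average per-fold test predictions into the level-2 test features
    dataset_blend_test[:, j] = test_fold_preds.mean(axis=1)

# level-2 model on the stacked features
stacker = LogisticRegression()
stacker.fit(dataset_blend_train, y_train_arr)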
Example #23
def test_logreg_intercept_scaling_zero():
    # Test that intercept_scaling is ignored when fit_intercept is False

    clf = LogisticRegression(fit_intercept=False)
    clf.fit(X, Y1)
    assert_equal(clf.intercept_, 0.)
def classify(x_train1, y_train1, i):
    y_train1 = y_train1.ravel()
    min_val, max_val = get_min_max()  # avoid shadowing the min/max builtins
    x_test, y_test = load_data(500, min_val, max_val)

    # Random Forest
    rfc1 = RandomForestClassifier(n_estimators=40,
                                  max_depth=None,
                                  min_samples_split=2,
                                  random_state=2)
    rfc1.fit(x_train1, y_train1)
    RF_pre = rfc1.predict(x_test)
    RF_AC = accuracy_score(y_test, RF_pre)
    RF_f1 = f1_score(y_test, RF_pre, average='macro')

    # SVM
    # print("Phase 2 SVM parameters selecting...")
    # parameters = {
    #     'C': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19],
    #     'gamma': [0.001, 0.01, 0.1, 1, 10, 100],
    #     'kernel': ['rbf'],
    #     'decision_function_shape': ['ovr']
    # }
    # svc = svm.SVC()
    # grid_search = GridSearchCV(svc, parameters, scoring='accuracy', cv=5)
    # grid_search.fit(x_train1, y_train1.ravel())
    # best_parameters = grid_search.best_estimator_.get_params()
    # for para, val in list(best_parameters.items()):
    #     print("hello:", para, val)
    # clf = svm.SVC(kernel='rbf', C=best_parameters['C'], gamma=best_parameters['gamma'], probability=True).fit(
    #     x_train1, y_train1.ravel())

    clf = SVC(kernel='rbf', C=9, gamma=0.1, probability=True)
    clf.fit(x_train1, y_train1)
    test_pre = clf.predict(x_test)
    SVM_AC = accuracy_score(y_test, test_pre)
    SVM_f1 = f1_score(y_test, test_pre, average='macro')

    # decision tree
    dtc = DecisionTreeClassifier()
    dtc.fit(x_train1, y_train1)
    dt_pre = dtc.predict(x_test)
    DT_AC = accuracy_score(y_test, dt_pre)
    DT_f1 = f1_score(y_test, dt_pre, average='macro')

    # Bayes
    mnb = MultinomialNB()
    mnb.fit(x_train1, y_train1)
    NB_predict = mnb.predict(x_test)
    NB_AC = accuracy_score(y_test, NB_predict)
    NB_f1 = f1_score(y_test, NB_predict, average='macro')

    # Multilayer perceptron
    MLP = MLPClassifier(solver='lbfgs',
                        alpha=1e-4,
                        hidden_layer_sizes=(100, 3),
                        random_state=1)
    MLP.fit(x_train1, y_train1)
    MLP_predict = MLP.predict(x_test)
    MLP_AC = accuracy_score(y_test, MLP_predict)
    MLP_f1 = f1_score(y_test, MLP_predict, average='macro')

    # KNN
    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(x_train1, y_train1)
    knn_predict = knn.predict(x_test)
    KNN_AC = accuracy_score(y_test, knn_predict)
    KNN_f1 = f1_score(y_test, knn_predict, average='macro')

    # LogisticRegression
    classifier = LogisticRegression()
    classifier.fit(x_train1, y_train1)
    lg_predict = classifier.predict(x_test)
    LG_AC = accuracy_score(y_test, lg_predict)
    LG_f1 = f1_score(y_test, lg_predict, average='macro')
    print("===== Diagnosis Ensemble evaluation %d/1=======" % (i + 1))
    print('Ensemble Accuracy:')
    print(RF_AC, SVM_AC, DT_AC, NB_AC, MLP_AC, KNN_AC, LG_AC)
    print('F1-score')
    print(RF_f1, SVM_f1, DT_f1, NB_f1, MLP_f1, KNN_f1, LG_f1)
    file_name1 = "./temp_result/Diagnosis_" + str(
        select_number) + "Level" + str(level_num) + "_Accuracy_result.txt"
    file_name2 = "./temp_result/Diagnosis_" + str(
        select_number) + "Level" + str(level_num) + "_f1_score_result.txt"
    with open(file_name1, "a") as f:
        f.writelines([
            str(RF_AC), ' ',
            str(SVM_AC), ' ',
            str(DT_AC), ' ',
            str(NB_AC), ' ',
            str(MLP_AC), ' ',
            str(KNN_AC), ' ',
            str(LG_AC), '\n'
        ])
    with open(file_name2, "a") as f:
        f.writelines([
            str(RF_f1), ' ',
            str(SVM_f1), ' ',
            str(DT_f1), ' ',
            str(NB_f1), ' ',
            str(MLP_f1), ' ',
            str(KNN_f1), ' ',
            str(LG_f1), '\n'
        ])
    return RF_AC, SVM_AC, DT_AC, NB_AC, MLP_AC, KNN_AC, LG_AC, RF_f1, SVM_f1, DT_f1, NB_f1, MLP_f1, KNN_f1, LG_f1
Example #25
attr = [  # list head truncated in the original snippet
    'Gender', 'JobRole', 'MaritalStatus', 'Over18', 'OverTime'
]
lbe_list = []
for feature in attr:
    lbe = LabelEncoder()
    train[feature] = lbe.fit_transform(train[feature])
    test[feature] = lbe.transform(test[feature])
    lbe_list.append(lbe)
#print(train)

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(train.drop('Attrition',
                                                                 axis=1),
                                                      train['Attrition'],
                                                      test_size=0.2,
                                                      random_state=42)

model = LogisticRegression(max_iter=100,
                           verbose=True,
                           random_state=33,
                           tol=1e-4)

model.fit(X_train, y_train)
predict = model.predict_proba(test)[:, 1]
test['Attrition'] = predict

# Convert the probabilities to a binary prediction
test['Attrition'] = test['Attrition'].map(lambda x: 1 if x >= 0.5 else 0)
test[['Attrition']].to_csv('submit_lr.csv')
Example #26
y = pd.read_csv('./data_preprocessing/training.csv')['IsBadBuy']

from sklearn.model_selection import train_test_split  # cross_validation was removed

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=33)

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

#df = pd.read_csv('training.csv')
# y, X = dmatrices('IsBadBuy ~ PurchDate+Auction+VehYear+VehicleAge+Make+Model+Trim+SubModel+Color+Transmission+WheelTypeID+WheelType+VehOdo+Nationality+Size+TopThreeAmericanName+MMRAcquisitionAuctionAveragePrice+MMRAcquisitionAuctionCleanPrice+MMRAcquisitionRetailAveragePrice+MMRAcquisitonRetailCleanPrice+MMRCurrentAuctionAveragePrice+MMRCurrentAuctionCleanPrice+MMRCurrentRetailAveragePrice+MMRCurrentRetailCleanPrice+PRIMEUNIT+AUCGUART+BYRNO+VNZIP1+VNST+VehBCost+IsOnlineSale+WarrantyCost', df, return_type = 'dataframe')
model = LogisticRegression(fit_intercept=False, C=1e9)

##cross_val_score
score = cross_val_score(model, X_train, y_train)
avg_score = score.mean()

##test score
mdl = model.fit(X_train, y_train)
mdl.score(X_test, y_test)

##lasso
from sklearn import linear_model

clf = linear_model.Lasso(alpha=0.1)
clf.fit(X_train, y_train)
#Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
def log_regression_fit(X, y):
    lr = LogisticRegression(tol=1e-8, penalty='l2')
    return lr.fit(X, y)
Example #28
# credit_model_cost = DecisionTreeClassifier(max_depth=6,class_weight = class_weights)
# credit_model_cost.fit(X_train, y_train)
# credit_pred_cost = credit_model_cost.predict(X_test)
# print(metrics.classification_report(y_test, credit_pred_cost))
# print(metrics.confusion_matrix(y_test, credit_pred_cost))
# print(metrics.accuracy_score(y_test, credit_pred_cost))

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)  # compute the sample means and standard deviations
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)
print(X_train_std.shape)
print(X_test_std.shape)
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=1000.0, random_state=0)
lr.fit(X_train_std, y_train)
# Model prediction
y_pred = lr.predict_proba(X_test_std)
print(y_pred)


from matplotlib.colors import ListedColormap
# Plot the decision boundary
def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
    # set up marker styles and colors
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    cmap = ListedColormap(colors[:len(np.unique(y))])

    # plot the decision surface
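    # The original snippet truncates here; a standard completion (a sketch,
    # assuming `import matplotlib.pyplot as plt` and `import numpy as np`):
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # plot the samples of each class
    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1],
                    alpha=0.8, c=colors[idx],
                    marker=markers[idx], label=cl)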
Example #29
y_data = data[:, -1]

prediction_list = []
for i in range(5):
    x_train, x_val, y_train, y_val = train_test_split(data[:, :-1],
                                                      data[:, -1],
                                                      test_size=0.2)
    x_train_new, x_test_new = filtering(x_train, y_train, x_test)
    _, x_val_new = filtering(x_train, y_train, x_val)  # reuse the train transform from the first call
    print(x_test_new.shape)
    clf = SVC(kernel='linear', verbose=1)
    clf.fit(x_train_new, y_train.astype('int'))
    y_predition_test = clf.predict(x_test_new)
    prediction_list.append(y_predition_test)

    classifier = LogisticRegression()
    classifier.fit(x_train_new, y_train.astype('int'))
    y_predict = classifier.predict(x_test_new)
    prediction_list.append(y_predict)

    y_train_nn = to_categorical(y_train)
    y_val_nn = to_categorical(y_val)
    model = Sequential()
    model.add(Dense(100, input_shape=(x_train_new.shape[1], )))
    model.add(Activation('relu'))
    model.add(Dense(12))
    model.add(Activation('softmax'))
    fBestModel = 'best_model.h5'
    early_stop = EarlyStopping(monitor='val_loss', patience=5, verbose=1)
    best_model = ModelCheckpoint(fBestModel, verbose=0, save_best_only=True)
    sgd = optimizers.SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
Example #30
y = pd.read_csv('./training.csv')['IsBadBuy']

from sklearn.model_selection import train_test_split  # cross_validation was removed

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.01,
                                                    random_state=33)

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

#df = pd.read_csv('training.csv')
# y, X = dmatrices('IsBadBuy ~ PurchDate+Auction+VehYear+VehicleAge+Make+Model+Trim+SubModel+Color+Transmission+WheelTypeID+WheelType+VehOdo+Nationality+Size+TopThreeAmericanName+MMRAcquisitionAuctionAveragePrice+MMRAcquisitionAuctionCleanPrice+MMRAcquisitionRetailAveragePrice+MMRAcquisitonRetailCleanPrice+MMRCurrentAuctionAveragePrice+MMRCurrentAuctionCleanPrice+MMRCurrentRetailAveragePrice+MMRCurrentRetailCleanPrice+PRIMEUNIT+AUCGUART+BYRNO+VNZIP1+VNST+VehBCost+IsOnlineSale+WarrantyCost', df, return_type = 'dataframe')
model = LogisticRegression(fit_intercept=False, C=1e9)

##cross_val_score
score = cross_val_score(model, X_train, y_train)
avg_score = score.mean()

##test score
mdl = model.fit(X_train, y_train)
mdl.score(X_test, y_test)

##lasso
from sklearn import linear_model

clf = linear_model.Lasso(alpha=0.1)
clf.fit(X_train, y_train)
#Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,