Example No. 1
rus = RandomUnderSampler(random_state=42)
# test_X,test_y = rus.fit_resample(test_X,test_y)
print("Training data size: ", train_X.shape)
print("Test data size: ", test_X.shape)

# Normalize using StandardScaler
scaler = StandardScaler()
train_X = scaler.fit_transform(train_X)
test_X = scaler.transform(test_X)

# NOTE: to try models of your own, fit them on train_X and evaluate them on test_X.

# Models to try
try_models = [
    LogisticRegression(),
    LogisticRegressionCV(),
    KNeighborsClassifier(n_neighbors=20),
    DecisionTreeClassifier(),
    RandomForestClassifier(n_estimators=500),
    MLPClassifier(hidden_layer_sizes=(300, ),
                  verbose=True,
                  max_iter=500,
                  alpha=0.00001),
    SVC()
]
# Gather metrics here
accuracy_by_model = {}

# Train then evaluate each model
i = 0
for model in try_models:
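    # (the original example is cut off here; a minimal sketch of the loop body,
    #  assuming train_y/test_y were defined alongside train_X/test_X)
    model.fit(train_X, train_y)
    accuracy_by_model[type(model).__name__] = model.score(test_X, test_y)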
Example No. 2
y_train = data_train['Class Label'].values
X_train = data_train.drop(columns=['Class Label']).values  # features only, without the label
y_train = y_train.reshape(len(y_train), 1)

y_test = data_test['Class Label'].values
X_test = data_test.drop(columns=['Class Label']).values  # features only, without the label
y_test = y_test.reshape(len(y_test), 1)

data_train.head()

# Fit a logistic regression classifier to the training set and report the accuracy of the classifier on the test set
clf = LogisticRegressionCV(
        Cs=list(np.power(10.0, np.arange(-10, 10)))
        ,penalty='l2'
        ,cv=10
        ,random_state=777
        ,fit_intercept=True
        ,solver='newton-cg'
        ,tol=10)
clf.fit(X_train, y_train.ravel())  # ravel() avoids a column-vector shape warning
print('\n')
print("The optimized L2 regularization paramater id:", clf.C_)

# The coefficients
print('Estimated beta1: \n', clf.coef_)
print('Estimated beta0: \n', clf.intercept_)

# Scoring
clf_y_pred_test = clf.predict(X_test)
clf_y_pred_test = clf_y_pred_test.reshape(len(clf_y_pred_test), 1)
test_df = pd.DataFrame(clf_y_pred_test)
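# The comment above asks for test-set accuracy, but the snippet stops before
# reporting it; a minimal sketch:
from sklearn.metrics import accuracy_score
print("Test accuracy:", accuracy_score(y_test.ravel(), clf_y_pred_test.ravel()))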
Example No. 3
df = pd.read_csv("data/march_madness_history.csv")
df['Winner'] = np.where(df['Winner'] == "TEAM_1", 1, 0)
df.head()
# %%
# Build models
meta_data = dict(
    title="NCAA March Madness 2021",
    description="Very simple Estimate",
    analyst = "Kevin Joy",
    tags=["NCAA", "Basketball", "March Madness"]
)

model = ct.Models(
    df=df.dropna(),  # df[~df.isnull()] does not drop incomplete rows; dropna() does
    formulas=['Winner ~ Round + Favorite',
              'Winner ~ Round + Favorite + Seed_diff',
              'Winner ~ Round + Favorite * Seed_diff + np.square(Seed_diff)'],
    models=[LogisticRegression(), LogisticRegressionCV(cv=5),
            RandomForestClassifier(), AdaBoostClassifier()],
    test_split=.10,
    **meta_data,
    Type="classifier"
    )

model_reg = ct.Models(
    df=df,
    formulas=['Spread ~ Round + Favorite',
              'Spread ~ Round + Favorite + Seed_diff',
              'Spread ~ Round + Favorite * Seed_diff + np.square(Seed_diff)'],
    models=[LinearRegression(), LassoCV(), RidgeCV(), RandomForestRegressor()],
    test_split=.10,
    **meta_data,
    Type='regression'
    )

Example No. 4
regsNames = ['LinearRegression', 'RidgeCV', 'LassoCV', 'ElasticNetCV']

Regs = [
    LinearRegression(normalize=True),
    RidgeCV(alphas=[0.1, 0.3, 0.5, 0.7, 1.0, 3.0, 5.0, 7.0, 10.0],
            cv=10,
            normalize=True),
    LassoCV(alphas=[0.1, 0.3, 0.5, 0.7, 1.0, 3.0, 5.0, 7.0, 10.0],
            cv=10,
            normalize=True),
    ElasticNetCV(alphas=[0.1, 0.3, 0.5, 0.7, 1.0, 3.0, 5.0, 7.0, 10.0],
                 cv=10,
                 normalize=True)
]
Classifiers = [
    LogisticRegressionCV(cv=10),
    DecisionTreeClassifier(max_depth=3),
    svm.SVC(kernel='rbf', probability=True),
    svm.SVC(kernel='linear', probability=True),
    neighbors.KNeighborsClassifier(n_neighbors=7)
]
#naive_bayes.GaussianNB(),neighbors.KNeighborsClassifier(n_neighbors=7)]
ClassifiersNames = [
    'LogisticRegressionCV', 'DecisionTreeClassifier', 'svm.SVC_rbf',
    'svm.SVC_linear', 'neighbors.KNeighborsClassifier'
]

RegsComp = [
    LinearRegression(normalize=True),
    DecisionTreeRegressor(max_depth=3),
    svm.SVR(kernel='rbf'),
Example No. 5
print("Accuracy: {0:.4f}".format(
    metrics.accuracy_score(y_test, lr_predict_test)))
print(metrics.confusion_matrix(y_test, lr_predict_test))
print("")
print("Classification Report")
print(metrics.classification_report(y_test, lr_predict_test))
print(metrics.recall_score(y_test, lr_predict_test))

#%% [markdown]
# ### LogisticRegressionCV

#%%
from sklearn.linear_model import LogisticRegressionCV
lr_cv_model = LogisticRegressionCV(
    n_jobs=-1,
    random_state=42,
    Cs=3,
    cv=10,
    refit=False,
    class_weight="balanced"
)  # set number of jobs to -1 which uses all cores to parallelize
lr_cv_model.fit(X_train, y_train.ravel())

#%% [markdown]
# ### Predict on Test data

#%%
lr_cv_predict_test = lr_cv_model.predict(X_test)

# training metrics
print("Accuracy: {0:.4f}".format(
    metrics.accuracy_score(y_test, lr_cv_predict_test)))
print(metrics.confusion_matrix(y_test, lr_cv_predict_test))
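# For parity with the plain LogisticRegression block above, the same report
# calls could be repeated for the CV model (a small sketch):
print(metrics.classification_report(y_test, lr_cv_predict_test))
print(metrics.recall_score(y_test, lr_cv_predict_test))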
Example No. 6
test_label = test_label.values

################################
X_train = train_data
X_test = test_data
Y_train = train_label
Y_test = test_label

# Standardize the training set
ss = StandardScaler()
X_train = ss.fit_transform(X_train)  # fit the scaler on the training data, then transform it
X_test = ss.transform(X_test)  # apply the same scaling to the test data used for prediction below

lr = LogisticRegressionCV(multi_class="ovr",
                          fit_intercept=True,
                          Cs=np.logspace(-2, 2, 20),
                          cv=2,
                          penalty="l2",
                          solver="lbfgs",
                          tol=0.01)
re = lr.fit(X_train, Y_train)

r = re.score(X_train, Y_train)
print('R(score):', r)
print('coefficient:', re.coef_)
print("intercept:", re.intercept_)
print("稀疏化特征比率:%.2f%%" % (np.mean(lr.coef_.ravel() == 0) * 100))
print("=========sigmoid函数转化的值,即:概率p=========")
print(re.predict_proba(X_test))  #sigmoid函数转化的值,即:概率p

# Save / persist the model
import joblib  # sklearn.externals.joblib was removed in newer scikit-learn
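# The snippet ends right after the import; a minimal persistence sketch
# (the filename is a made-up placeholder):
joblib.dump(lr, "logreg_cv_model.pkl")            # serialize the fitted model
lr_restored = joblib.load("logreg_cv_model.pkl")  # load it back later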
Example No. 7
def QuickML_Ensembling(X_train,
                       y_train,
                       X_test,
                       y_test='',
                       modeltype='Regression',
                       Boosting_Flag=False,
                       scoring='',
                       verbose=0):
    """
    Quickly builds and runs multiple models for a clean data set (numerics only).
    """
    start_time = time.time()
    seed = 99
    FOLDS = 5
    model_dict = {}
    model_tuples = []
    if len(X_train) <= 100000 and X_train.shape[1] < 50:
        NUMS = 100
    else:
        try:
            X_train = X_train.sample(frac=0.30, random_state=99)
            y_train = y_train[X_train.index]
        except:
            pass
        NUMS = 200
    if modeltype == 'Regression':
        if scoring == '':
            scoring = 'neg_mean_squared_error'
        #scv = ShuffleSplit(n_splits=FOLDS,random_state=seed)
        scv = KFold(n_splits=FOLDS, shuffle=True, random_state=seed)  # shuffle so that random_state has an effect
        if Boosting_Flag is None:
            ## Create an ensemble model ####
            model5 = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(
                random_state=seed, max_depth=1, min_samples_leaf=2),
                                       n_estimators=NUMS,
                                       random_state=seed)
            model_tuples.append(('Adaboost', model5))
        elif not Boosting_Flag:
            model5 = LassoLarsCV(cv=scv)
            model_tuples.append(('LassoLarsCV', model5))
        else:
            model5 = LassoLarsCV(cv=scv)
            model_tuples.append(('LassoLarsCV', model5))
        if Boosting_Flag is None:
            model6 = BaggingRegressor(DecisionTreeRegressor(random_state=seed),
                                      n_estimators=NUMS,
                                      random_state=seed)
            model_tuples.append(('Bagging_Regressor', model6))
        elif not Boosting_Flag:
            model6 = LinearSVR()
            model_tuples.append(('Linear_SVR', model6))
        else:
            model6 = DecisionTreeRegressor(max_depth=5, min_samples_leaf=2)
            model_tuples.append(('Decision_Tree', model6))
        model7 = KNeighborsRegressor(n_neighbors=8)
        model_tuples.append(('KNN_Regressor', model7))
        if Boosting_Flag is None:
            #### If the Boosting_Flag is True, it means Boosting model is present.
            ###   So choose a different kind of classifier here
            model8 = DecisionTreeRegressor(max_depth=5, min_samples_leaf=2)
            model_tuples.append(('Decision_Tree', model8))
        elif not Boosting_Flag:
            #### If the Boosting_Flag is True, it means Boosting model is present.
            ###   So choose a different kind of classifier here
            model8 = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(
                random_state=seed, max_depth=1, min_samples_leaf=2),
                                       n_estimators=NUMS,
                                       random_state=seed)
            model_tuples.append(('Adaboost', model8))
        else:
            model8 = RandomForestRegressor(bootstrap=False,
                                           max_depth=10,
                                           max_features='auto',
                                           min_samples_leaf=2,
                                           n_estimators=200,
                                           random_state=99)
            model_tuples.append(('RF_Regressor', model8))
    else:
        if scoring == '':
            scoring = 'accuracy'
        num_classes = len(np.unique(y_test))
        scv = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=seed)
        if Boosting_Flag is None:
            ## Create an ensemble model ####
            model5 = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(
                random_state=seed, max_depth=1, min_samples_leaf=2),
                                        n_estimators=NUMS,
                                        random_state=seed)
            model_tuples.append(('Adaboost', model5))
        elif not Boosting_Flag:
            model5 = LinearDiscriminantAnalysis()
            model_tuples.append(('Linear_Discriminant', model5))
        else:
            model5 = LogisticRegressionCV(Cs=[0.001, 0.01, 0.1, 1, 10, 100],
                                          solver='liblinear',
                                          random_state=seed)
            model_tuples.append(('Logistic_Regression_CV', model5))
        if Boosting_Flag is None:
            model6 = DecisionTreeClassifier(max_depth=5, min_samples_leaf=2)
            model_tuples.append(('Decision_Tree', model6))
        elif not Boosting_Flag:
            model6 = LinearSVC()
            model_tuples.append(('Linear_SVC', model6))
        else:
            model6 = DecisionTreeClassifier(max_depth=5, min_samples_leaf=2)
            model_tuples.append(('Decision_Tree', model6))
        if modeltype == 'Binary_Classification':
            model7 = GaussianNB()
        else:
            model7 = MultinomialNB()
        model_tuples.append(('Naive_Bayes', model7))
        if Boosting_Flag is None:
            #### If the Boosting_Flag is True, it means Boosting model is present.
            ###   So choose a different kind of classifier here
            model8 = RandomForestClassifier(bootstrap=False,
                                            max_depth=10,
                                            max_features='auto',
                                            min_samples_leaf=2,
                                            n_estimators=200,
                                            random_state=99)
            model_tuples.append(('Bagging_Classifier', model8))
        elif not Boosting_Flag:
            #### If the Boosting_Flag is True, it means Boosting model is present.
            ###   So choose a different kind of classifier here
            sgd_best_model = SGDClassifier(alpha=1e-06,
                                           loss='log',
                                           max_iter=1000,
                                           penalty='l2',
                                           learning_rate='constant',
                                           eta0=.1,
                                           random_state=3,
                                           tol=None)
            model8 = OneVsRestClassifier(sgd_best_model)
            model_tuples.append(('One_vs_Rest_Classifier', model8))
        else:
            model8 = RandomForestClassifier(bootstrap=False,
                                            max_depth=10,
                                            max_features='auto',
                                            min_samples_leaf=2,
                                            n_estimators=200,
                                            random_state=99)
            model_tuples.append(('Bagging_Classifier', model8))
    model_dict = dict(model_tuples)
    models, results = run_ensemble_models(model_dict, X_train, y_train, X_test,
                                          y_test, scoring, modeltype)
    return models, results
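# A hypothetical call (sketch only; X_train/y_train/X_test/y_test are assumed to be
# numeric-only data prepared elsewhere, and run_ensemble_models must already be in scope):
#   models, results = QuickML_Ensembling(X_train, y_train, X_test, y_test,
#                                        modeltype='Classification', Boosting_Flag=False,
#                                        scoring='accuracy', verbose=1)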
Example No. 8
impute_value = train.Age.median()
train.Age.fillna(impute_value, inplace=True)
test.Age.fillna(impute_value, inplace=True)
train['IsFemale'] = (train.Sex == 'female').astype(int)
test['IsFemale'] = (test.Sex == 'female').astype(int)
predictors = ['Pclass', 'IsFemale', 'Age']
X_train = train[predictors].values
X_test = test[predictors].values
y_train = train['Survived'].values
X_train[:5]
y_train[:5]
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)
y_predict = model.predict(X_test)
y_predict[:10]
(y_predict == test['Survived'].values).mean()
from sklearn.linear_model import LogisticRegressionCV

model_cv = LogisticRegressionCV(10)
model_cv.fit(X_train, y_train)
y_predict = model_cv.predict(X_test)
y_predict.shape
test.shape
from sklearn.model_selection import cross_val_score

model = LogisticRegression(C=10)
scores = cross_val_score(model, X_train, y_train, cv=4)
scores
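# cross_val_score returns one accuracy per fold; a quick summary (sketch):
print(scores.mean(), scores.std())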
Example No. 9
# 1) Set ridge regression penalty
# 2) Search 100 values of lambda
# 3) Set 10-fold cross validation
# 4) Use the liblinear solver
# 5) Set class weight to balanced
# 6) Use accuracy as the scoring measure
# 7) Start with 1,000 iterations and increase as necessary

# In[19]:

# Build a logistic regression model as a baseline

logit_reg = LogisticRegressionCV(penalty="l2",
                                 Cs=100,
                                 solver='liblinear',
                                 cv=10,
                                 class_weight='balanced',
                                 scoring='accuracy',
                                 max_iter=1000)

logit_reg.fit(train_X, train_y)

# In[20]:

# display confusion matrices for train and test data

classificationSummary(train_y, logit_reg.predict(train_X))
classificationSummary(test_y, logit_reg.predict(test_X))
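# The regularization strength selected by the cross-validation can be inspected
# directly (C_ holds one value per class):
print(logit_reg.C_)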

# In[21]:
Example No. 10
    def train(self,
              train_path,
              as_text=False,
              standardization=False,
              cut=True,
              multitrain=False):

        sys.stderr.write("o Reading training data...\n")

        if multitrain:
            df_train, todrop_train = self.read_conll_sentbreak(
                train_path,
                neighborwindowsize=self.windowsize,
                as_text=as_text,
                cut=False,
                multitrain=multitrain)
        else:
            df_train, todrop_train = self.read_conll_sentbreak(
                train_path,
                neighborwindowsize=self.windowsize,
                as_text=as_text,
                cut=cut)
        cols2keep = [
            col for col in df_train.columns if col not in todrop_train
        ]
        X_train = df_train[cols2keep]
        Y_train = df_train['gold_seg']

        df_train = None
        predictors_train = list(X_train)

        # standardization of vectors
        if standardization:
            from sklearn import preprocessing
            std_scale = preprocessing.StandardScaler().fit(X_train)
            X_train = std_scale.transform(X_train)

        gc.collect()  # Free up memory for csr_matrix conversion

        X_train = X_train[sorted(X_train.columns)]
        #X_train = csr_matrix(X_train)

        logmodel = LogisticRegressionCV(cv=3,
                                        n_jobs=3,
                                        penalty='l1',
                                        solver="liblinear",
                                        random_state=42)
        if multitrain:
            if X_train.shape[0] <= 95000:
                multitrain_preds = get_multitrain_preds(
                    logmodel, X_train, Y_train, 5)
                multitrain_preds = "\n".join(
                    multitrain_preds.strip().split("\n"))
                with io.open(script_dir + os.sep + "multitrain" + os.sep +
                             self.name + '_' + self.corpus,
                             'w',
                             newline="\n") as f:
                    sys.stderr.write(
                        "o Serializing multitraining predictions\n")
                    f.write(multitrain_preds)
            else:
                sys.stderr.write('o Skipping multitrain\n')
        # Fit complete dataset
        logmodel.fit(X_train, Y_train)
        logmodel.sparsify()

        if multitrain and X_train.shape[0] > 95000:
            preds, probas = zip(*self.predict(train_path, as_text=False))
            with io.open(script_dir + os.sep + "multitrain" + os.sep +
                         self.name + '_' + self.corpus,
                         'w',
                         newline="\n") as f:
                sys.stderr.write(
                    "o Serializing predictions from partial model\n")
                outlines = [
                    str(preds[i]) + "\t" + str(probas[i])
                    for i in range(len(probas))
                ]
                outlines = "\n".join(outlines)
                f.write(outlines + "\n")

        pickle_objects = (logmodel, predictors_train)
        pickle.dump(pickle_objects, open(self.model_path, 'wb'))
Example No. 11
features['importance'] = clf.feature_importances_
features.sort_values(by=['importance'], ascending=True, inplace=True)
features.set_index('feature', inplace=True)

features.plot(kind='barh', figsize=(25, 25))

model = SelectFromModel(clf, prefit=True)
train_reduce = model.transform(train)
train_reduce.shape

test_reduce = model.transform(test)
test_reduce.shape

#MODEL BUILDING

logreg = LogisticRegression()
logreg_cv = LogisticRegressionCV()
rf = RandomForestClassifier()
gboost = GradientBoostingClassifier()

models = [logreg, logreg_cv, rf, gboost]

for model in models:
    print('Cross-validation of: {0}'.format(model.__class__.__name__))
    score = compute_score(clf=model,
                          X=train_reduce,
                          y=targets,
                          scoring='accuracy')
    print('CV score = {0}'.format(score))
    print('****')
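# compute_score is not shown in this snippet; a plausible minimal implementation
# (an assumption), wrapping scikit-learn's cross_val_score:
from sklearn.model_selection import cross_val_score

def compute_score(clf, X, y, scoring='accuracy', cv=5):
    # return the mean cross-validated score for the given estimator
    return cross_val_score(clf, X, y, scoring=scoring, cv=cv).mean()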
Example No. 12
    result = pd.DataFrame(
        result, index=['HorseWin', 'HorseRankTop3', 'HorseRankTop50Percent'])
    return result


df_train = pd.read_csv('data/training.csv')
train_X = df_train[[
    'actual_weight', 'declared_horse_weight', 'draw', 'win_odds',
    'recent_ave_rank', 'jockey_ave_rank', 'trainer_ave_rank', 'race_distance'
]].values
train_Y = np.ravel(df_train[['finishing_position']].values)

# 3.1.1
print("Start LogisticRegression CV")
start = time.time()
lr_model = LogisticRegressionCV(cv=10, random_state=3320)
lr_model.fit(train_X, train_Y)
print("End LogisticRegression CV, Time: %s s" % (time.time() - start))

# 3.1.2
print("Start GaussianNB CV")
start = time.time()
skf_list = list(
    StratifiedKFold(n_splits=10, random_state=3320,
                    shuffle=True).split(train_X, train_Y))
nb_model = cvTrain(GaussianNB(), train_X, train_Y)
print("End GaussianNB CV, Time: %s s" % (time.time() - start))

print("Start self NaiveBayes")
start = time.time()
clf = NaiveBayes()
Example No. 13
def QuickML_Ensembling(X_train, y_train, X_test, y_test='', modeltype='Regression', Boosting_Flag=False,
                            scoring='', verbose=0):
    """
    Quickly builds and runs multiple models for a clean data set (numerics only).
    """
    start_time = time.time()
    seed = 99
    if len(X_train) <= 100000 or X_train.shape[1] < 50:
        NUMS = 100
        FOLDS = 5
    else:
        NUMS = 200
        FOLDS = 10
    ## create Voting models
    estimators = []
    if modeltype == 'Regression':
        if scoring == '':
            scoring = 'neg_mean_squared_error'
        scv = ShuffleSplit(n_splits=FOLDS,random_state=seed)
        if Boosting_Flag is None:
            model5 = BaggingRegressor(DecisionTreeRegressor(random_state=seed),
                                        n_estimators=NUMS,random_state=seed)
            results1 = model5.fit(X_train,y_train).predict(X_test)
            if not isinstance(y_test, str):
                metrics1 = rmse(results1, y_test).mean()
            else:
                metrics1 = 0
            estimators.append(('Bagging1',model5, metrics1))
        else:
            model5 = LassoLarsCV(cv=scv)
            results1 = model5.fit(X_train,y_train).predict(X_test)
            if not isinstance(y_test, str):
                metrics1 = rmse(results1, y_test).mean()
            else:
                metrics1 = 0
            estimators.append(('LassoLarsCV Regression',model5, metrics1))
        model6 = LassoCV(alphas=np.logspace(-10,-1,50), cv=scv,random_state=seed)
        results2 = model6.fit(X_train,y_train).predict(X_test)
        if not isinstance(y_test, str):
            metrics2 = rmse(results2, y_test).mean()
        else:
            metrics2 = 0
        estimators.append(('LassoCV Regularization',model6, metrics2))
        model7 = RidgeCV(alphas=np.logspace(-10,-1,50), cv=scv)
        results3 = model7.fit(X_train,y_train).predict(X_test)
        if not isinstance(y_test, str):
            metrics3 = rmse(results3, y_test).mean()
        else:
            metrics3 = 0
        estimators.append(('RidgeCV Regression',model7, metrics3))
        ## Create an ensemble model ####
        if Boosting_Flag:
            model8 = BaggingRegressor(DecisionTreeRegressor(random_state=seed),
                                        n_estimators=NUMS,random_state=seed)
            results4 = model8.fit(X_train,y_train).predict(X_test)
            if not isinstance(y_test, str):
                metrics4 = rmse(results4, y_test).mean()
            else:
                metrics4 = 0
            estimators.append(('Bagging2',model8, metrics4))
        else:
            model8 = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(
                        min_samples_leaf=2, max_depth=1, random_state=seed),
                        n_estimators=NUMS, random_state=seed)
            results4 = model8.fit(X_train,y_train).predict(X_test)
            if not isinstance(y_test, str):
                metrics4 = rmse(results4, y_test).mean()
            else:
                metrics4 = 0
            estimators.append(('Boosting',model8, metrics4))
        estimators_list = [(tuples[0],tuples[1]) for tuples in estimators]
        estimator_names = [tuples[0] for tuples in estimators]
        if verbose >= 2:
            print('QuickML_Ensembling Model results:')
            print('    %s = %0.4f \n    %s = %0.4f\n    %s = %0.4f \n    %s = %0.4f' %(estimator_names[0], metrics1,
                    estimator_names[1], metrics2, estimator_names[2], metrics3, estimator_names[3], metrics4))
    else:
        if scoring == '':
            scoring = 'accuracy'
        scv = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=seed)  # shuffle so that random_state has an effect
        if Boosting_Flag is None:
            model5 = ExtraTreesClassifier(n_estimators=NUMS,min_samples_leaf=2,random_state=seed)
            results1 = model5.fit(X_train,y_train).predict(X_test)
            if not isinstance(y_test, str):
                metrics1 = accu(results1, y_test).mean()
            else:
                metrics1 = 0
            estimators.append(('Bagging',model5, metrics1))
        else:
            model5 = LogisticRegressionCV(Cs=np.linspace(0.01,100,20),cv=scv,scoring=scoring,
                                          random_state=seed)
            results1 = model5.fit(X_train,y_train).predict(X_test)
            if not isinstance(y_test, str):
                metrics1 = accu(results1, y_test).mean() 
            else:
                metrics1 = 0
            estimators.append(('Logistic Regression',model5, metrics1))
        model6 = LinearDiscriminantAnalysis()
        results2 = model6.fit(X_train,y_train).predict(X_test)
        if not isinstance(y_test, str):
            metrics2 = accu(results2, y_test).mean()
        else:
            metrics2 = 0
        estimators.append(('Linear Discriminant',model6, metrics2))
        if modeltype == 'Binary_Classification':
            float_cols = X_train.columns[(X_train.dtypes==float).values].tolist()
            int_cols = X_train.columns[(X_train.dtypes==int).values].tolist()
            if (X_train[float_cols+int_cols]<0).astype(int).sum().sum() > 0:
                model7 = DecisionTreeClassifier(max_depth=5)
            else:
                model7 = GaussianNB()
        else:
            float_cols = X_train.columns[(X_train.dtypes==float).values].tolist()
            int_cols = X_train.columns[(X_train.dtypes==int).values].tolist()
            if (X_train[float_cols+int_cols]<0).astype(int).sum().sum() > 0:
                model7 = DecisionTreeClassifier(max_depth=5)
            else:
                model7 = MultinomialNB()
        results3 = model7.fit(X_train,y_train).predict(X_test)
        if not isinstance(y_test, str):
            metrics3 = accu(results3, y_test).mean()
        else:
            metrics3 = 0
        estimators.append(('Naive Bayes',model7, metrics3))
        if Boosting_Flag:
            #### If the Boosting_Flag is True, it means Boosting model is present. So choose a Bagging here.
            model8 = ExtraTreesClassifier(n_estimators=NUMS,min_samples_leaf=2,random_state=seed)
            results4 = model8.fit(X_train,y_train).predict(X_test)
            if not isinstance(y_test, str):
                metrics4 = accu(results4, y_test).mean()
            else:
                metrics4 = 0
            estimators.append(('Bagging',model8, metrics4))
        else:
            ## Create an ensemble model ####
            model8 = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(
                                    random_state=seed, max_depth=1, min_samples_leaf=2
                                    ), n_estimators=NUMS, random_state=seed)
            results4 = model8.fit(X_train,y_train).predict(X_test)
            if not isinstance(y_test, str):
                metrics4 = accu(results4, y_test).mean()
            else:
                metrics4 = 0
            estimators.append(('Boosting',model8, metrics4))
        estimators_list = [(tuples[0],tuples[1]) for tuples in estimators]
        estimator_names = [tuples[0] for tuples in estimators]
        if not isinstance(y_test, str):
            if verbose >= 2:
                print('QuickML_Ensembling Model results:')
                print('    %s = %0.4f \n    %s = %0.4f\n    %s = %0.4f \n    %s = %0.4f' %(estimator_names[0], metrics1,
                        estimator_names[1], metrics2, estimator_names[2], metrics3, estimator_names[3], metrics4))
        else:
            if verbose >= 1:
                print('QuickML_Ensembling completed.')
    stacks = np.c_[results1,results2,results3,results4]
    if verbose == 1:
        print('    Time taken for Ensembling: %0.1f seconds' %(time.time()-start_time))
    return estimator_names, stacks
#########################################################
Example No. 14
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


if __name__ == "__main__":
    path = u'..\\9.Regression\\iris.data'  # path to the data file
    # data = np.loadtxt(path, dtype=float, delimiter=',', converters={4: iris_type})
    data = pd.read_csv(path, header=None)
    x, y = data[range(4)], data[4]
    y = pd.Categorical(y).codes
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, test_size=50)

    data_train = xgb.DMatrix(x_train, label=y_train)
    data_test = xgb.DMatrix(x_test, label=y_test)
    watch_list = [(data_test, 'eval'), (data_train, 'train')]
    param = {'max_depth': 4, 'eta': 0.3, 'silent': 1, 'objective': 'multi:softmax', 'num_class': 3}

    bst = xgb.train(param, data_train, num_boost_round=6, evals=watch_list)
    y_hat = bst.predict(data_test)
    result = y_test == y_hat
    print('Accuracy:\t', float(np.sum(result)) / len(y_hat))
    print('END.....\n')

    models = [('LogisticRegression', LogisticRegressionCV(Cs=10, cv=3)),
              ('RandomForest', RandomForestClassifier(n_estimators=30, criterion='gini'))]
    for name, model in models:
        model.fit(x_train, y_train)
        print(name, 'training accuracy:', accuracy_score(y_train, model.predict(x_train)))
        print(name, 'test accuracy:', accuracy_score(y_test, model.predict(x_test)))
Example No. 15
        tmp_text = " ".join(
            [row.complaint_type, row.descriptor, row.location_type])
    except Exception:
        print(row)
        break
    raw_text.append(tmp_text)
print("{} elapsed".format(time() - tick))

print("Split data & fit vectorizer/classifier")
tick = time()
# train/test split
X_train, X_test, y_train, y_test = train_test_split(raw_text,
                                                    targets,
                                                    test_size=0.1,
                                                    random_state=19)
# BoNG (size=1,2)
vec = CountVectorizer(ngram_range=(1, 2),
                      lowercase=True,
                      binary=False,
                      stop_words="english")
# LogisticRegression with automatic regularization tuning
lr = LogisticRegressionCV(class_weight="balanced")
# fit on train data
lr.fit(vec.fit_transform(X_train), y_train)
print("{} elapsed".format(time() - tick))

print("\nEVAL on held-out data\n")
# eval on test
print(
    classification_report(y_test, lr.predict(vec.transform(X_test)), digits=3))
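# Scoring a brand-new complaint with the fitted vectorizer and classifier
# (sketch; the text below is a made-up example):
print(lr.predict(vec.transform(["street light broken on the corner"])))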
def train_and_score(clf, X_train, y_train, X_test, y_test):
    clf = clf.fit(X_train, y_train)
    preds = clf.predict(X_test)
    cf = confusion_matrix(y_test, preds)

    print(plot_confusion_matrix(cf, class_names=positions))

    print(" Accuracy: ", accuracy_score(y_test, preds))
    print(" F1 score: ", metrics.f1_score(y_test, preds, average='weighted'))


#Logistic Regression

LR = LogisticRegressionCV(cv=5,
                          random_state=20,
                          solver='lbfgs',
                          multi_class='multinomial')
train_and_score(LR, X_train_dev, y_train_dev, X_test, y_test)

plot_learning_curve(LR, "Logistic Regression Curve", X_train_dev, y_train_dev)

#create new a knn model
knn_model = KNeighborsClassifier()
#create a dictionary of all values we want to test for n_neighbors
param_grid = {'n_neighbors': np.arange(1, 25)}
#use gridsearch to test all values for n_neighbors
KNN = GridSearchCV(knn_model, param_grid, cv=5)

train_and_score(KNN, X_train_dev, y_train_dev, X_test, y_test)

# knn model
Example No. 17
new_feat_test['total_sec'] = time_delt_sec_scaled[idx_split:]

# print(get_auc_lr_valid(csr_matrix(hstack([X_train_sparse, new_feat_train[['start_month', 'start_hour']].values.reshape(-1, 2)])), y_train))
# print(get_auc_lr_valid(csr_matrix(hstack([X_train_sparse, new_feat_train[['start_month', 'morning']].values.reshape(-1, 2)])), y_train))
# print(get_auc_lr_valid(csr_matrix(hstack([X_train_sparse, new_feat_train[['start_month', 'start_hour', 'morning']].values.reshape(-1, 3)])), y_train))

mm_train = csr_matrix(
    hstack([
        X_train_sparse, new_feat_train[[
            'start_month', 'start_hour', 'morning', 'day', 'evening', 'night',
            'total_sites', 'total_sec'
        ]].values.reshape(-1, 8)
    ]))
mm_test = csr_matrix(
    hstack([
        X_test_sparse, new_feat_test[[
            'start_month', 'start_hour', 'morning', 'day', 'evening', 'night',
            'total_sites', 'total_sec'
        ]].values.reshape(-1, 8)
    ]))

# logit.fit(mm_train, y_train)
# test_preds = logit.predict_proba(mm_test)[:, 1]
"""Подбор коефициента регуляризации"""
C = np.logspace(-3, 1, 10)
time_split = TimeSeriesSplit(n_splits=10)
logitCV = LogisticRegressionCV(Cs=C, cv=time_split, scoring='roc_auc')
logitCV.fit(mm_train, y_train)

# print(get_auc_lr_valid(mm_train, y_train, C=logitCV.C_[0]))
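# In the spirit of the commented-out lines above, the tuned model can report the
# selected C and produce test-set probabilities (sketch):
print(logitCV.C_)  # regularization strength chosen by the time-series CV
test_preds = logitCV.predict_proba(mm_test)[:, 1]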
Example No. 18
# PCA: fit only on the training data, then used to transform both the training and validation data
pca = PCA()
pca.fit(X_train)
pcaTrain = pca.transform(X_train)
pcaValid = pca.transform(X_valid)

print(np.shape(pcaTrain))

# Dataset is not balanced. More samples of class 0 than 1. But the representation in the validation set is similar
# to the representation in the training data
print(np.bincount(y_train))
print(np.bincount(y_valid))
print()

# Models, nr of folds = 5
logCV = LogisticRegressionCV(penalty='l1', solver='liblinear',
                             cv=5)  # Lasso-regualated Logistic regression
ridgeCV = RidgeClassifierCV(alphas=np.array(
    [0.01, 0.1, 1, 100, 500, 1000, 5000, 10000]),
                            cv=5)
forest = ExtraTreesClassifier(n_estimators=nFeat)

logResOrg = []
ridgeResOrg = []
logResPCA = []
ridgeResPCA = []

for ix in range(10):

    logCV.fit(X_train, y_train)
    ridgeCV.fit(X_train, y_train)
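    # (the example is cut off here; a speculative sketch of what the loop body
    #  presumably does next, given the four result lists initialized above)
    logResOrg.append(logCV.score(X_valid, y_valid))
    ridgeResOrg.append(ridgeCV.score(X_valid, y_valid))
    logResPCA.append(logCV.fit(pcaTrain, y_train).score(pcaValid, y_valid))
    ridgeResPCA.append(ridgeCV.fit(pcaTrain, y_train).score(pcaValid, y_valid))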
Example No. 19
        y = Gy[1]
        true_beta = Gy[0]

        # es = EarlyStopping(monitor='val_loss', patience=30, verbose=2)
        autoencoder = Sequential()
        autoencoder.add(Dense(r_hat, activation=aut_met, use_bias=False, input_shape=(p,)))
        autoencoder.add(Dense(p, activation=aut_met, use_bias=False))
        autoencoder.compile(loss=aut_loss, optimizer=keras.optimizers.Adam())
        autoencoder.fit(X, X, epochs=aut_epoch, batch_size=8, verbose=aut_verb)
        C = autoencoder.predict(X)
        E = X - C
        sigma = np.sqrt(np.sum(E ** 2) / (n * p))
        X_ko = C + sigma * np.random.randn(n, p)
        Xnew = np.hstack((X, X_ko))

        log = LogisticRegressionCV(penalty='l1', solver='liblinear', n_jobs=-1, cv=10).fit(Xnew, y.reshape((n, )))
        beta = log.coef_[0]
        W = (beta[:p]) ** 2 - (beta[p:]) ** 2

        t = np.sort(np.concatenate(([0], abs(W))))

        ratio = [sum(W <= -tt) / max(1, sum(W >= tt)) for tt in t[:p]]
        ind = np.where(np.array(ratio) <= q)[0]
        if len(ind) == 0:
            T = float('inf')
        else:
            T = t[ind[0]]
        selected = np.where(W >= T)[0]

        ratio_plus = [(1 + sum(W <= -tt)) / max(1, sum(W >= tt)) for tt in t[:p]]
        ind_plus = np.where(np.array(ratio_plus) <= q)[0]
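        # (the snippet is cut off here; the knockoff+ threshold presumably mirrors
        #  the computation of T above -- T_plus/selected_plus are placeholder names)
        if len(ind_plus) == 0:
            T_plus = float('inf')
        else:
            T_plus = t[ind_plus[0]]
        selected_plus = np.where(W >= T_plus)[0]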
Example No. 20
    x_train_cat, x_test_cat, y_train_cat, y_test_cat = data_splits

    x_train_cat, x_val_cat, y_train_cat, y_val_cat = train_test_split(x_train_cat, y_train_cat, test_size=0.25, random_state=random_seed) # 0.25 x 0.7 = 0.175
    eval_set_cat = [(x_val_cat, y_val_cat)]

    feature_names_cat = list(x_train_cat.columns.values)

    # ## 2. Modelling Helper Functions

    # # 3.  Model Training 

    # ## 3.1  Logistic Regression
    logit_cv = LogisticRegressionCV(cv=5,
                                    n_jobs=-1,
                                    random_state=random_seed,
                                    refit=True,
                                    scoring=custom_cost_scorer,
                                )

    logit_cv.fit(x_train, y_train)
    logit_report = report(logit_cv, x_train, y_train,
                                    x_test, y_test,
                                    importance_plot=True,
                                    feature_labels=feature_names,
                                    confusion_labels=confusion_lbs,
                                    verbose = False)

    # ## 3.2  Random Forests

    # ### Training
Example No. 21
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, SGDClassifier
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, \
        RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import BernoulliNB  # used in the model list below

models = [
    DecisionTreeClassifier(random_state=rs, max_depth=15),
    SVC(gamma='auto'), NuSVC(gamma='auto'), LinearSVC(),
    SGDClassifier(max_iter=100, tol=1e-3), KNeighborsClassifier(),
    LogisticRegression(solver='lbfgs'), LogisticRegressionCV(cv=3),
    BernoulliNB(),
    BaggingClassifier(), ExtraTreesClassifier(n_estimators=200),
    RandomForestClassifier(n_estimators=200), AdaBoostClassifier(),
    GradientBoostingClassifier(random_state=rs),
    MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(2, 1), max_iter=2000)]

def score_model(X, y, estimator, **kwargs):
    """
    Test various estimators.
    """
    y = LabelEncoder().fit_transform(y)
    model = Pipeline([
                    ('scaler', StandardScaler()),
                    ('one_hot_encoder', OneHotEncoder()),
                    ('estimator', estimator),
Example No. 22
 def Classification(self):
     return {
         'RF': {
             'estimator':
             RandomForestClassifier(oob_score=True,
                                    n_estimators=100,
                                    n_jobs=10),
             'parameters': {
                 'GSCV': {
                     'max_features':
                     [0.6, 0.7, 0.8, 0.9, 'auto', 'log2', None],
                     'max_depth': [3, 4, 5, None],
                     'n_estimators': [100],
                     'class_weight':
                     ['balanced', 'balanced_subsample', None],
                     'min_samples_split': [2, 3, 4, 5],
                     'min_samples_leaf': [2, 3, 4, 5],
                     'criterion': ['gini', 'entropy']
                 },
                 'RSCV': {
                     'max_features':
                     [0.6, 0.7, 0.8, 0.9, 0.99, 'auto', 'log2', None],
                     'max_depth': [3, 4, 5, 6, None],
                     'n_estimators': [100],
                     'criterion': ['gini', 'entropy'],
                     'class_weight':
                     ['balanced', 'balanced_subsample', None],
                     'min_samples_split': [2, 3, 4, 5, 6, 8, 11, 15],
                     'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 9, 11, 15],
                 }
             }
         },
         'GBDT': {
             'estimator':
             GradientBoostingClassifier(n_estimators=200,
                                        learning_rate=0.1,
                                        subsample=1.0,
                                        max_depth=5),
             'parameters': {
                 'GSCV': {
                     'max_features': (0.5, 0.75, 0.8, 0.9, 'auto'),
                     'loss': ['deviance', 'exponential'],
                     'max_depth': [3, 4, 5],
                     'n_estimators': [100],
                     'learning_rate':
                     [0.005, 0.01, 0.1, 0.3, 0.5, 0.7, 0.9],
                     'min_samples_split': [2, 3, 4, 5],
                     'subsample': [0.75, 0.85, 0.95]
                 },
                 'RSCV': {
                     'max_features':
                     (0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 'auto', 'log2', None),
                     'loss': ['deviance', 'exponential'],
                     'max_depth': [3, 4, 5, 6, 7, 8, 9, 11, 15],
                     'n_estimators': [100],
                     'learning_rate':
                     [0.005, 0.01, 0.1, 0.2, 0.3, 0.5, 0.7, 0.9],
                     'min_samples_split': [2, 3, 4, 5, 6, 8],
                     'subsample': [0.7, 0.8, 0.9, 1]
                 },
             }
         },
         'XGB': {
             'estimator':
             XGBClassifier(
                 n_estimators=100,
                 objective='binary:logistic',  # multi:softprob
                 booster='gbtree',
                 silent=True,
                 max_depth=4,
                 missing=None,
                 reg_alpha=0,
                 reg_lambda=1,
                 learning_rate=0.1,
                 n_jobs=10),
             'parameters': {
                 'GSCV': {
                     'colsample_bytree': [0.75, 0.85, 0.95],
                     'subsample': [0.75, 0.85, 0.95],
                     'reg_alpha': [0, 0.1, 0.5, 1, 2],
                     'reg_lambda': [0.1, 0.5, 1, 2, 2.5],
                     'max_depth': [3, 4, 5, 6],
                     'n_estimators': [100],
                     'learning_rate': [0.01, 0.05, 0.1, 0.2],
                     #'min_child_weight' : [1, 2],
                 },
                 'RSCV': {
                     'subsample': [0.7, 0.8, 0.85],
                     'colsample_bytree': [0.7, 0.8, 0.9],
                     'colsample_bylevel': [0.7, 0.8, 0.9],
                     'max_delta_step': [0, 1],
                     'colsample_bynode': [1],
                     'scale_pos_weight': [0.8, 1, 1.2],
                     'base_score': [0.5],
                     'n_estimators': [100],
                     'gamma': [0, 0.005, 0.01, 0.1, 0.3, 0.5, 0.7, 0.9],
                     'min_child_weight': [1, 2, 3, 4, 5],
                     'min_samples_split': [2, 3, 4, 5, 6, 7],
                     'max_depth': [3, 4, 5, 6, 7, 8],
                     'reg_lambda': [0.2, 0.3, 0.5, 0.7, 0.8, 0.9, 0.96, 1],
                     'reg_alpha': [0, 0.001, 0.01, 0.1, 0.2, 0.3, 0.5, 0.7],
                     'learning_rate':
                     [0.005, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5],
                 },
             }
         },
         'LGBM': {
             'estimator':
             LGBMClassifier(
                 boosting_type='gbdt',
                 num_leaves=33,
                 max_depth=5,
                 learning_rate=0.1,
                 n_estimators=100,
                 class_weight=None,
                 min_split_gain=0.0,
                 min_child_weight=1e-3,
                 min_child_samples=10,
                 subsample=0.8,
                 subsample_freq=0,
                 colsample_bytree=0.8,
                 reg_alpha=0.0,
                 reg_lambda=0.0,
                 random_state=None,
                 n_jobs=-1,
                 silent=False,
                 importance_type='split',
                 verbose=-1,
             ),
             'parameters': {
                 'GSCV': {
                     'num_leaves': [9, 17, 33, 65],
                     'max_depth': [-1, 3, 4, 5, 6],
                     'learning_rate': [0.01, 0.05, 0.1, 0.2],
                     'n_estimators': [100],
                     'reg_alpha': [0.01, 0.1, 0.5, 1, 1.5, 2],
                     'reg_lambda': [0.01, 0.1, 0.5, 1, 1.5, 2],
                     'class_weight': [
                         'balanced',
                         None,
                     ],
                     'subsample': [0.7, 0.78, 0.85],
                     'colsample_bytree': [0.7, 0.78, 0.85],
                 },
                 'RSCV': {
                     'num_leaves': [9, 17, 33, 65, 129],
                     'max_depth': [-1, 3, 4, 5, 6, 7],
                     'learning_rate': [0.01, 0.05, 0.1, 0.2],
                     'n_estimators': [100],
                     'reg_alpha':
                     [0, 0.01, 0.1, 0.5, 0.75, 1, 1.5, 2, 2.5, 3],
                     'reg_lambda':
                     [0, 0.01, 0.1, 0.5, 0.75, 1, 1.5, 2, 2.5, 3],
                     'class_weight': [
                         'balanced',
                         None,
                     ],
                     'subsample': [0.7, 0.78, 0.85],
                     'colsample_bytree': [0.7, 0.78, 0.85],
                 }
             }
         },
         'AdaB_DT': {
             'estimator':
             AdaBoostClassifier(DecisionTreeClassifier(
                 criterion='gini',
                 splitter='best',
                 class_weight='balanced',
                 min_samples_leaf=1,
                 max_features=None),
                                algorithm='SAMME.R',
                                learning_rate=1,
                                n_estimators=500),
             'parameters': {
                 'GSCV': {
                     'n_estimators': range(400, 800, 1000),
                     'algorithm': ['SAMME', 'SAMME.R'],
                     "learning_rate": [0.3, 0.5, 0.7, 0.9, 0.95, 1, 2]
                 },
                 'RSCV': {}
             }
         },
         'MLP': {
             'estimator':
             MLPClassifier(max_iter=3000,
                           solver='lbfgs',
                           alpha=1e-05,
                           tol=1e-4,
                           hidden_layer_sizes=(
                               50,
                               50,
                           ),
                           random_state=None),
             'parameters': {
                 'GSCV': {
                     'alpha': [10**i for i in [-6, -5, -4, -3, -1, 1]],
                     'max_iter': [15000],
                     'solver': ['adam', 'lbfgs', 'sgd'],
                     'activation': ["logistic", 'identity', "relu", "tanh"],
                     'hidden_layer_sizes': [(
                         50,
                         30,
                         20,
                         10,
                         10,
                     ), (
                         15,
                         25,
                         15,
                         10,
                     ), (
                         8,
                         15,
                         10,
                         10,
                     ), (
                         30,
                         10,
                         40,
                         10,
                     ), (
                         10,
                         20,
                         10,
                         10,
                     )],
                     'learning_rate':
                     ["constant", "invscaling", "adaptive"],
                     'tol': [1e-4]
                 },
                 'RSCV': {
                     'hidden_layer_sizes': [(
                         randint.rvs(8, 30, 1),
                         randint.rvs(5, 20, 1),
                         randint.rvs(5, 30, 1),
                     ), (
                         randint.rvs(8, 40, 1),
                         randint.rvs(5, 40, 1),
                     )],
                     'activation': ["logistic", 'identity', "relu", "tanh"],
                     'solver': ['adam', 'lbfgs', 'sgd'],
                     'alpha': [10**i for i in [-6, -5, -4, -3, -1, 1]
                               ],  #uniform(1e-06, 0.9),
                     'max_iter': [15000],
                     'learning_rate':
                     ["constant", "invscaling", "adaptive"]
                 }
             }
         },
         'LinearSVM': {
             'estimator':
             LinearSVC(penalty='l2',
                       dual=True,
                       tol=0.0001,
                       C=1,
                       max_iter=2e6,
                       random_state=None),
             'parameters': {
                 'GSCV': {
                     'penalty': ['l2'],
                     'dual': [True],
                     'loss': ['hinge', 'squared_hinge'],
                     'tol': [
                         5e-7, 1e-7, 5e-6, 1e-6, 1e-5, 5e-5, 1e-4, 5e-4,
                         5e-3, 1e-3, 1e-2, 5e-2
                     ],
                     #'C': [ 0.1, 0.3, 0.5, 0.8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 17, 19, 20, 50],
                     'C':
                     np.power(10, np.linspace(-2, 2, 20)),
                     'max_iter': [4e6, 6e6, 8e6, 1.4e7]
                 },
                 'RSCV': {}
             }
         },
         'LinearSVMl1': {
             'estimator':
             LinearSVC(penalty='l1',
                       dual=False,
                       tol=0.0001,
                       C=1,
                       max_iter=3e6,
                       random_state=None),
             'parameters': {
                 'GSCV': {
                     'penalty': ['l1', 'l2'],
                     'dual': [False, True],
                     'loss': ['hinge', 'squared_hinge'],
                     'tol': [
                         5e-7, 1e-7, 5e-6, 1e-6, 1e-5, 5e-5, 1e-4, 5e-4,
                         5e-3, 1e-3, 1e-2, 5e-2
                     ],
                     'C':
                     np.power(10, np.linspace(-2, 2, 20)),
                     'max_iter': [4e6, 6e6, 8e6, 1.4e7]
                 },
                 'RSCV': {}
             }
         },
         'SVMlinear': {
             'estimator':
             SVC(kernel="linear",
                 probability=True,
                 C=1.0,
                 decision_function_shape='ovr',
                 random_state=None),
             'parameters': {
                 'GSCV': [{
                     'kernel': ['linear'],
                     'tol': [
                         5e-7, 5e-6, 1e-6, 1e-5, 5e-5, 1e-4, 5e-4, 5e-3,
                         1e-3, 1e-2, 5e-2, 5e-1
                     ],
                     'C':
                     np.power(10, np.linspace(-2, 2, 20)),
                 }],
                 'RSCV': {}
             }
         },
         'SVMrbf': {
             'estimator':
             SVC(kernel='rbf',
                 gamma='scale',
                 probability=True,
                 C=1,
                 decision_function_shape='ovr'),
             'parameters': {
                 'GSCV': {
                     'kernel': ['rbf'],
                     'gamma': [
                         1e1, 1e0, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 5e-3, 5e-4,
                         'auto'
                     ],
                     'tol': [
                         5e-7, 5e-6, 1e-6, 1e-5, 5e-5, 1e-4, 5e-4, 5e-3,
                         1e-3, 1e-2, 5e-2, 5e-1
                     ],
                     'C':
                     np.power(10, np.linspace(-2, 2, 20)),
                 },
                 'RSCV': {}
             }
         },
         'SVM': {
             'estimator':
             SVC(kernel='rbf',
                 gamma='scale',
                 probability=True,
                 C=1,
                 decision_function_shape='ovr'),
             'parameters': {
                 'GSCV': [
                     {
                         'kernel': ['rbf', 'sigmoid'],
                         'gamma': [
                             1e1, 1e0, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 5e-3,
                             5e-4, 'auto'
                         ],
                         'tol': [
                             5e-7, 5e-6, 1e-6, 1e-5, 5e-5, 1e-4, 5e-4, 5e-3,
                             1e-3, 1e-2, 5e-2, 5e-1
                         ],
                         'C':
                         np.power(10, np.linspace(-2, 2, 20)),
                     },
                     {
                         'kernel': ['poly'],
                         'degree': [2, 3, 4, 5],
                         'gamma': [
                             1e1, 1e0, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 5e-3,
                             5e-4, 'auto'
                         ],
                         'tol': [
                             5e-7, 5e-6, 1e-6, 1e-5, 5e-5, 1e-4, 5e-4, 5e-3,
                             1e-3, 1e-2, 5e-2, 5e-1
                         ],
                         'C':
                         np.power(10, np.linspace(-2, 2, 20)),
                     },
                 ],
                 'RSCV': {}
             }
         },
         'nuSVMrbf': {
             'estimator':
             NuSVC(kernel='rbf',
                   gamma='scale',
                   probability=True,
                   nu=0.5,
                   decision_function_shape='ovr'),
             'parameters': {
                 'GSCV': {
                     'kernel': ['rbf'],
                     'nu': [0.8, 0.9, 1],
                     'gamma':
                     [1e-2, 1e-3, 1e-4, 1e-5, 5e-2, 5e-3, 5e-4, 'auto'],
                     'tol':
                     [1e-6, 1e-5, 5e-5, 1e-4, 5e-3, 3e-3, 1e-3, 1e-2, 5e-2]
                 },
                 'RSCV': {}
             }
         },
         'SGD': {
             'estimator':
             SGDClassifier(penalty='l2',
                           loss='hinge',
                           alpha=0.0001,
                           max_iter=5000,
                           tol=1e-3,
                           n_jobs=2),
             'parameters': {
                 'GSCV': [
                     {
                         'penalty': ['l2', 'elasticnet'],
                         'loss': [
                             'hinge', 'log', 'modified_huber',
                             'squared_hinge', 'perceptron'
                         ],
                         'alpha':
                         [5e-5, 1e-4, 5e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1],
                         'l1_ratio':
                         [0.01, 0.05, 0.1, 0.2, 0.3, 0.5, 0.7, 0.9],
                         'max_iter': [5e5],
                         #'eta0' : [0, 0.0001, 0.001, 0.01, 0.1],
                         #'learning_rate' : ['optimal'],
                         'tol': [1e-3, 5e-3, 1e-4, 1e-5, 1e-2]
                     },
                 ],
                 'RSCV': {}
             }
         },
         'KNN': {
             'estimator':
             KNeighborsClassifier(n_neighbors=5,
                                  weights='uniform',
                                  algorithm='auto',
                                  leaf_size=30,
                                  p=2),
             'parameters': {
                 'GSCV': {
                     'n_neighbors': [3, 4, 5, 6, 7, 8, 10, 12, 15],
                     'weights': ['uniform', 'distance'],
                     'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                     'leaf_size': [10, 20, 30, 40, 50, 60, 70, 100],
                     'p': [1, 2]
                 },
                 'RSCV': {}
             }
         },
         'RNN': {
             'estimator':
             RadiusNeighborsClassifier(radius=1,
                                       weights='uniform',
                                       algorithm='auto',
                                       leaf_size=30,
                                       p=2,
                                       metric='minkowski',
                                       outlier_label=None),
             'parameters': {
                 'GSCV': {
                     'radius':
                     [1, 2, 3, 4, 5, 10, 15, 20, 23, 26, 30, 35, 40],
                     'weights': ['uniform', 'distance'],
                     'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                     'leaf_size': [10, 20, 30, 50, 70, 100, 200],
                     'p': [1, 2]
                 },
                 'RSCV': {}
             }
         },
         'GNB': {
             'estimator': GaussianNB(priors=None, var_smoothing=1e-09),
             'parameters': {
                 'GSCV': {
                     'var_smoothing':
                     np.dot(
                         np.array([[1e-11, 1e-10, 1e-09, 1e-08, 1e-07]]).T,
                         np.array([[1, 3, 5, 7]])).flatten()
                 },
                 'RSCV': {}
             }
         },
         'BNB': {
             'estimator':
             BernoulliNB(alpha=1.0,
                         binarize=0.5,
                         fit_prior=True,
                         class_prior=None),
             'parameters': {
                 'GSCV': {
                     'alpha': [i / 10 for i in range(1, 21, 1)],
                     'binarize': [i / 10 for i in range(1, 10, 1)]
                 },
                 'RSCV': {}
             }
         },
         'MNB': {
             'estimator':
             MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None),
             'parameters': {
                 'GSCV': {
                     'alpha': [i / 10 for i in range(1, 21, 1)]
                 },
                 'RSCV': {}
             }
         },
         'CNB': {
             'estimator':
             ComplementNB(alpha=1.0,
                          fit_prior=True,
                          class_prior=None,
                          norm=False),
             'parameters': {
                 'GSCV': {
                     'alpha': [i / 10 for i in range(1, 21, 1)]
                 },
                 'RSCV': {}
             }
         },
         'DT': {
             'estimator':
             DecisionTreeClassifier(criterion='gini',
                                    splitter='best',
                                    class_weight='balanced',
                                    min_samples_leaf=1,
                                    max_features=None),
             'parameters': {
                 'GSCV': {
                     'max_features':
                     (0.4, 0.5, 0.6, 0.7, 0.8, 'sqrt', 'log2'),
                     'min_samples_leaf': (1, 2, 3),
                     'max_depth': (5, 8, 10, 15, 25, 30, None),
                     'criterion': ['gini', 'entropy']
                 },
                 'RSCV': {}
             }
         },
         'LR': {
             'estimator':
             LogisticRegression(random_state=None,
                                solver='liblinear',
                                penalty='l1',
                                fit_intercept=True,
                                max_iter=10000,
                                l1_ratio=None,
                                multi_class='auto'),
             'parameters': {
                 'GSCV': [
                     {
                         'penalty': ['l1'],
                         'tol': [
                             1e-3,
                             1e-4,
                             1e-5,
                         ],
                         'l1_ratio': [None],
                         'solver': ['liblinear', 'saga']
                     },
                     {
                         'penalty': ['l2'],
                         'tol': [
                             1e-3,
                             1e-4,
                             1e-5,
                         ],
                         'l1_ratio': [None],
                         'solver': ['lbfgs', 'sag']
                     },
                     {
                         'penalty': ['elasticnet'],
                         'tol': [
                             1e-3,
                             1e-4,
                             1e-5,
                         ],
                         'l1_ratio': [0.01, 0.1, 0.3, 0.5, 0.7, 0.9, 0.95],
                         'solver': ['saga']
                     },
                 ],
                 'RSCV': {}
             }
         },
         'LRCV': {
             'estimator':
             LogisticRegressionCV(
                 random_state=None,
                 solver='saga',
                 penalty='elasticnet',
                 Cs=10,
                 cv=8,
                 fit_intercept=True,
                  max_iter=6000,
                 n_jobs=30,
                 l1_ratios=[0.005, 0.05, 0.1, 0.3, 0.5, 0.7, 0.9],
                 multi_class='auto'),
             'parameters': {
                 'GSCV': [
                     {
                         'penalty': ['elasticnet'],
                         'Cs': [
                             10,
                             np.power(10, np.arange(-4, 4, 0.4)),
                             np.power(10, np.arange(-3, 3, 0.3))
                         ],
                         'l1_ratios': [
                             [0.005, 0.05, 0.1, 0.3, 0.5, 0.7, 0.9],
                         ],
                          'max_iter': [40000],
                         'cv': [10],
                         'solver': ['saga']
                     },
                 ],
                 'RSCV': [
                     {
                         'penalty': ['l1'],
                         'Cs': [np.power(10, np.arange(-2, 4, 0.3))],
                         'l1_ratios': [None],
                          'max_iter': [60000],
                         'cv': [10],
                         'solver': ['liblinear', 'saga']
                     },
                     {
                         'penalty': ['l2'],
                         'l1_ratios': [None],
                          'max_iter': [60000],
                         'cv': [10],
                         'Cs': [np.power(10, np.arange(-2, 4, 0.3))],
                         'solver': ['lbfgs', 'sag']
                     },
                     {
                         'penalty': ['elasticnet'],
                         'Cs': [np.power(10, np.arange(-2, 4, 0.3))],
                          'max_iter': [60000],
                         'cv': [10],
                         'solver': ['saga']
                     },
                 ],
             }
         },
         'LassoCV': {
             'estimator':
             LassoCV(cv=10,
                     alphas=[0.001, 0.01, 0.1, 0.3, 0.5, 0.7, 0.9, 1, 10],
                     max_iter=20000,
                     n_jobs=-1),
             'parameters': {
                 'GSCV': {
                     'n_alphas': [200, 500],
                     'normalize': [False, True]
                 },
                 'RSCV': {}
             }
         },
         'Lasso': {
             'estimator': Lasso(alpha=1, max_iter=15000),
             'parameters': {
                 'GSCV': {
                     'alpha': [0.001, 0.01, 0.1, 0.3, 0.5, 0.7, 0.9, 1, 10],
                     'max_iter': [15000],
                     'normalize': [False, True],
                     'precompute': [False, True]
                 },
                 'RSCV': {}
             }
         },
         'LLIC': {
             'estimator':
             LassoLarsIC(criterion='aic',
                         fit_intercept=True,
                         verbose=False,
                         normalize=True,
                         precompute='auto',
                         max_iter=10000,
                         eps=2.220446049250313e-16,
                         copy_X=True,
                         positive=False),
             'parameters': {
                 'GSCV': {
                     'criterion': ['aic', 'bic'],
                 },
                 'RSCV': {}
             }
         },
         'ENet': {
             'estimator':
             ElasticNetCV(
                 cv=10,
                 alphas=[0.001, 0.01, 0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 1, 10],
                 max_iter=20000,
                 random_state=None,
                 n_jobs=20,
                 l1_ratio=[.01, .05, .1, .3, .5, .7, .9, .98]),
             'parameters': {
                 'GSCV': {
                     'n_alphas': [200, 500],
                     'max_iter': [50000],
                     'tol': [
                         1e-3,
                         1e-4,
                         1e-5,
                     ],
                     'normalize': [False, True]
                 },
                 'RSCV': {}
             }
         },
     }
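The dictionary above only declares estimators and their search grids; a minimal sketch of a driver loop that could consume it is shown below. The names `estimators`, `X_train` and `y_train` are assumed placeholders, not taken from the original code.

from sklearn.model_selection import GridSearchCV

def run_grid_searches(estimators, X_train, y_train, scoring='accuracy', cv=5):
    # Run GridSearchCV for every entry that defines a non-empty 'GSCV' grid.
    results = {}
    for name, spec in estimators.items():
        grid = spec['parameters'].get('GSCV')
        if not grid:
            continue  # skip estimators with an empty grid
        search = GridSearchCV(spec['estimator'], grid,
                              scoring=scoring, cv=cv, n_jobs=-1)
        search.fit(X_train, y_train)
        results[name] = (search.best_score_, search.best_params_)
    return results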
Ejemplo n.º 23
0
plt.scatter(X[:, 0], X[:, 1], s=60, c=y, cmap=plt.cm.coolwarm)
plt.xlabel('Feature $x_1$', fontsize=15)
plt.ylabel('Feature $x_2$', fontsize=15)
plt.title("2D Multi-Classification Dataset", fontsize=15)

# plt.show()

clf = LogisticRegressionCV(Cs=10,
                           class_weight=None,
                           cv=None,
                           dual=False,
                           fit_intercept=True,
                           intercept_scaling=1.0,
                           max_iter=200,
                           multi_class='multinomial',
                           n_jobs=1,
                           penalty='l2',
                           random_state=0,
                           refit=True,
                           scoring=None,
                           solver='sag',
                           tol=0.0001,
                           verbose=0)
clf.fit(X, y)


def plot_decision_boundary(pred_func):
    # Set min and max values and give it some padding
    x_min, x_max = X_val[:, 0].min() - 0.5, X_val[:, 0].max() + 0.5
    y_min, y_max = X_val[:, 1].min() - 0.5, X_val[:, 1].max() + 0.5
    h = 0.01
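The helper above is cut off right after computing the grid step; a hedged completion of the usual meshgrid/contour pattern follows. This is not necessarily the author's original body, and it reuses the snippet's own X_val name plus an assumed y_val for the scatter overlay.

import numpy as np
import matplotlib.pyplot as plt

def plot_decision_boundary_sketch(pred_func, X_val, y_val, h=0.01):
    # Bounding box with a little padding, as in the original helper
    x_min, x_max = X_val[:, 0].min() - 0.5, X_val[:, 0].max() + 0.5
    y_min, y_max = X_val[:, 1].min() - 0.5, X_val[:, 1].max() + 0.5
    # Evaluate the classifier on a dense grid and draw filled contours
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    Z = pred_func(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    plt.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.4)
    plt.scatter(X_val[:, 0], X_val[:, 1], c=y_val, s=40, cmap=plt.cm.coolwarm)

# e.g. plot_decision_boundary_sketch(clf.predict, X, y)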
Ejemplo n.º 24
0
# print(words)
# new_vector=vec.transform(words)
# print(type(new_vector))

# vec = TfidfTransformer()
# vec.fit(cv.transform(words))
# vec.fit(cv.transform(z_words))


lr = LogisticRegression(penalty='l2')
loss = cross_val_score(lr, vec.transform(words), y_train, cv=5, scoring='neg_log_loss')
print('logloss of each fold is: ', -loss)
print('cv logloss is:', -loss.mean())

Cs = [1, 10, 100, 1000]
lrcv = LogisticRegressionCV(Cs=Cs, cv=5, scoring='neg_log_loss', penalty='l1',
                            solver='liblinear', multi_class='ovr')
lrcv.fit(vec.transform(words),y_train)

print(lrcv.scores_)


# Call the logistic regression algorithm
lr = LogisticRegression(penalty='l2', solver='lbfgs', max_iter=100, class_weight={0: 0.28, 1: 0.72})
param = {'C': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.5, 2.0, 2.5, 3.0,
               1.6, 1.7, 1.8, 1.9, 2.1, 2.2, 2.3, 2.4]}
gc_lr = GridSearchCV(lr, param_grid=param, cv=3)
gc_lr.fit(vec.transform(words), y_train)
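A short assumed follow-up for inspecting the two searches fitted above; these print statements are not part of the original snippet.

print('LogisticRegressionCV best C per class:', lrcv.C_)
print('GridSearchCV best parameters:', gc_lr.best_params_)
print('GridSearchCV best CV score: %.4f' % gc_lr.best_score_)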
Ejemplo n.º 25
0
# In[28]:


X[0]


# In[29]:


X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=0)


# In[30]:


logreg = LogisticRegressionCV(cv=10, random_state=0, solver='lbfgs', multi_class='multinomial', max_iter=10000)


# In[31]:


logreg.fit(X_train,y_train)


# In[32]:


print(logreg.coef_)
logreg.fit(X / np.std(X, 0), Y)
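Refitting on X / np.std(X, 0) puts every feature on a comparable scale, so the coefficient magnitudes can be ranked directly; a hedged sketch of that ranking (not part of the original cell) follows.

import numpy as np

# Aggregate |coefficient| across the one-vs-rest / multinomial classes
coef_magnitude = np.abs(logreg.coef_).sum(axis=0)
ranking = np.argsort(coef_magnitude)[::-1]
print('Feature indices, most to least influential:', ranking)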

Ejemplo n.º 26
0
def compute_single_model(j, data_i, data_j, task, model_type, n_trees, n_folds,
                         max_depth, symmetric):
    print("Examining response variable %d out of %d" %
          (j, np.shape(data_j)[1]))
    if symmetric:
        X = data_i[:, list(range(0, j)) + list(range(j + 1, data_i.shape[1]))]
    else:
        X = data_i
    y = data_j[:, j]
    score = []
    importance = []
    for k in range(0, n_folds):
        if task == "regression":
            rfm = RandomForestRegressor(n_estimators=n_trees,
                                        max_depth=max_depth,
                                        max_features='sqrt',
                                        n_jobs=-1)
            X_train, X_test, y_train, y_test = train_test_split(X,
                                                                y,
                                                                train_size=0.8)
            rfm.fit(X_train, y_train)
            score.append(rfm.score(X_test, y_test))
            importance.append(rfm.feature_importances_)
        else:
            if "rf" in model_type:
                rfm = RandomForestClassifier(n_estimators=n_trees,
                                             max_depth=max_depth,
                                             max_features='sqrt',
                                             n_jobs=-1)
                try:
                    X_train, X_test, y_train, y_test = train_test_split(
                        X, y, train_size=0.8, stratify=y)
                    rfm.fit(X_train, y_train)
                    y_test_matrix = np.eye(len(set(
                        y_test.tolist())))[y_test.tolist()]
                    score.append(
                        roc_auc_score(y_test_matrix,
                                      rfm.predict_proba(X_test)))
                    importance.append(rfm.feature_importances_)
                except Exception:
                    score.append(0.)
                    importance.append(np.zeros(np.shape(X)[1]))
            else:
                rfm = LogisticRegressionCV()
                try:
                    X_train, X_test, y_train, y_test = train_test_split(
                        X, y, train_size=0.8, stratify=y)
                    rfm.fit(X_train, y_train)
                    y_test_matrix = np.eye(len(set(
                        y_test.tolist())))[y_test.tolist()]
                    score.append(
                        roc_auc_score(y_test_matrix,
                                      rfm.predict_proba(X_test)))
                    importance.append(rfm.coef_)
                except Exception:
                    score.append(0.)
                    importance.append(np.zeros(np.shape(X)[1]))

    score = np.mean(score)
    importance = np.mean(np.vstack(importance), axis=0)
    if symmetric:
        arr = np.zeros(data_i.shape[1])
        arr[:j] = importance[:j]
        arr[(j + 1):] = importance[j:]
        importance = arr

    return (score, importance)
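A hedged usage sketch for compute_single_model on synthetic data; the shapes and argument values below are assumptions made only for illustration.

import numpy as np

rng = np.random.default_rng(0)
data_i = rng.normal(size=(200, 10))          # predictor block
data_j = rng.integers(0, 2, size=(200, 5))   # binary response block
score, importance = compute_single_model(
    j=0, data_i=data_i, data_j=data_j, task='classification',
    model_type='rf', n_trees=50, n_folds=3, max_depth=5, symmetric=False)
print('mean AUC over folds: %.3f' % score)
print('importance vector shape:', importance.shape)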
Ejemplo n.º 27
0
def run(submission_name):
    print('Prepare datasets...')
    train, X_test = prepare()
    y_train = train['open_account_flg']
    X_train = train.drop('open_account_flg', axis=1)

    #  Part 1. Boosting. (Part 2 must be commented)

    l1_features = {
        'xgb': [
            feature for feature in X_train.columns if feature not in [
                'monthly_payment', 'monthly_payment_to_income',
                'credit_sum_to_income'
            ]
        ],
        'nn': [
            feature for feature in X_train.columns
            if feature not in ['credit_sum_to_income']
            and feature[:13] != 'living_region'
        ],
        'rf': [
            feature for feature in X_train.columns
            if feature[:13] != 'living_region'
        ],
        'gbm': [
            feature for feature in X_train.columns
            if feature not in ['monthly_income', 'monthly_payment_to_income']
        ]
    }

    l1_models_pool = {
        'xgb': calibrated(xgb_bag()),
        # 'nn': calibrated(nn_bag()),
        # 'rf': calibrated(rf()),
        'gbm': calibrated(gbm_bag())
    }
    l2_model = LogisticRegressionCV(cv=5,
                                    scoring='roc_auc',
                                    max_iter=10000,
                                    solver='sag',
                                    class_weight='balanced',
                                    n_jobs=N_JOBS,
                                    random_state=SEED)

    l1_df, cv_score = fit_stacking(l1_models_pool,
                                   l2_model,
                                   X_train,
                                   y_train,
                                   X_test,
                                   l1_features=l1_features)

    print()
    print('Predict...')
    pred = l2_model.predict_proba(l1_df)[:, 1]

    # End part 1

    # Part 2. Over previous submissions (Part 1 must be commented)

    df = pd.DataFrame({
        'stacked':
        pd.read_csv(
            'submissions/stacking_xgb_gbm_calibrated_l2_lrcv_calibrated_0.79973117848.csv',
            index_col='_ID_')['_VAL_'].to_numpy(),
        'xgb':
        load_l1_predictions('xgb')[0],
        'gbm':
        load_l1_predictions('gbm')[0]
    })

    pred = df.mean(axis=1)

    cv_score = '___'

    # End part 2

    print()
    print('Build submission...')
    df = pd.read_csv('data/credit_test.csv', sep=';')
    submission = pd.DataFrame({'_ID_': df['client_id'], '_VAL_': pred})
    print('Write submission to file (%s.csv)...' % submission_name)
    submission.to_csv('submissions/%s_%s.csv' % (submission_name, cv_score),
                      index=False)
    print('Done!')
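A hypothetical invocation; the submission name below is an arbitrary example, not taken from the original project.

if __name__ == '__main__':
    run('stacking_xgb_gbm_l2_lrcv')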
Ejemplo n.º 28
0
train = pd.concat([train, train_nan], axis=0)
del train_nan
#test
test = pd.concat([test, test_nan], axis=0)
del test_nan

y = train['renewal']
x = train.drop('renewal', axis=1)

ros = over_sampling.ADASYN()
rus = under_sampling.NearMiss()
rcs = combine.SMOTEENN()
rcs2 = combine.SMOTETomek()

log = BaggingClassifier(LogisticRegressionCV(Cs=6))
rf = BaggingClassifier(RandomForestClassifier())
gbc = BaggingClassifier(
    GradientBoostingClassifier(n_estimators=250, learning_rate=0.01))
sv = SVC(C=0.8, probability=True)
for sample, sample_name in zip([rcs2, ros, rus, rcs],
                               ['rcs2', 'ros', 'rus', 'rcs']):
    print(sample_name)
    x_rs, y_rs = sample.fit_resample(x, y)
    for model, model_name in zip([log, rf, gbc], ['log', 'rf', 'gbc']):
        model.fit(x_rs, y_rs)
        filename = 'C:/Users/cheekati/Desktop/ml/AV Mck/' + str(
            model_name) + str(sample_name) + '.pkl'
        with open(filename, 'wb') as f:
            pickle.dump(model, f)
        print('model complete')
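A hedged counterpart to the pickling loop above: reload one of the saved models and reuse it. The file name and the X_holdout data are assumptions for illustration.

import pickle

# e.g. the BaggingClassifier(LogisticRegressionCV) fitted on the SMOTETomek sample
with open('C:/Users/cheekati/Desktop/ml/AV Mck/logrcs2.pkl', 'rb') as f:
    loaded_model = pickle.load(f)
# proba = loaded_model.predict_proba(X_holdout)[:, 1]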
Ejemplo n.º 29
0
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV

# loading iris dataset into memory
iris = sns.load_dataset("iris")
sns.pairplot(iris, hue='species')

# Separating dependent and independent variables
X = iris.values[:, :4]
y = iris.values[:, 4]
# Dividing the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    train_size=0.5,
                                                    random_state=1)

# Creating a LogisticRegressionCV object
model = LogisticRegressionCV()

# Fitting the model on the training data
model.fit(X_train, y_train)

print("Accuracy={:.2f}".format(model.score(X_test, y_test)))
Ejemplo n.º 30
0
     'methods': ['predict', 'predict_proba', 'predict_log_proba', 'score'],
     'dataset': 'classifier',
 },
 {
     'model':
     LogisticRegression(max_iter=100, multi_class='multinomial'),
     'methods': [
         'decision_function', 'predict', 'predict_proba',
         'predict_log_proba', 'score'
     ],
     'dataset':
     'classifier',
 },
 {
     'model':
     LogisticRegressionCV(max_iter=100),
     'methods': [
         'decision_function', 'predict', 'predict_proba',
         'predict_log_proba', 'score'
     ],
     'dataset':
     'classifier',
 },
 {
     'model': RandomForestRegressor(n_estimators=10),
     'methods': ['predict', 'score'],
     'dataset': 'regression',
 },
 {
     'model': LinearRegression(),
     'methods': ['predict', 'score'],