コード例 #1
0
def CV(clf, X, y, seeds=range(3)):
    """Print 10-fold cross-validation error statistics for ``clf``.

    Each seed drives one shuffled 10-fold split. Per-fold scores come from
    ``cvs`` using the module-level ``scorer``; they are negated and
    square-rooted, so ``scorer`` is presumably a negated squared-error
    scorer (TODO confirm against its definition). Every printed value is
    scaled by 1e4 and rounded to 3 decimals.

    Parameters
    ----------
    clf : estimator handed to ``cvs``.
    X, y : data and targets handed to ``cvs`` and ``KFold.split``.
    seeds : int or iterable of int
        A single integer runs one shuffled split and prints the mean and the
        maximum fold error. An iterable runs one split per seed and prints
        the average of the per-seed means and of the per-seed maxima (the
        labels read ``Mean3``/``Max3`` whatever the number of seeds).

    Returns
    -------
    None
    """

    def _fold_errors(seed):
        # KFold's shuffle/random_state are keyword-only in current
        # scikit-learn, so they must not be passed positionally.
        folds = KFold(n_splits=10, shuffle=True, random_state=seed)
        return np.sqrt(-cvs(clf, X, y, scoring=scorer, cv=folds.split(X, y)))

    if isinstance(seeds, (int, np.integer)):  # shuffle only once
        cv = _fold_errors(seeds)
        print('Mean:   ', round(np.mean(cv) * 1e4, 3),
              '\tMax:   ', round(np.max(cv) * 1e4, 3))
    else:
        worst = []
        meen = []
        for seed in seeds:
            cv = _fold_errors(seed)
            worst.append(np.max(cv))
            meen.append(np.mean(cv))
        print('Mean3:   ', round(np.mean(meen) * 1e4, 3),
              '\tMax3:   ', round(np.mean(worst) * 1e4, 3))
コード例 #2
0
def runBestRegsCompKFold(dataSets=None, regModels=None, names=None):
    """Pick, per dataset, the regressor with the lowest cross-validated RMSE.

    For every dataset key the data is loaded through ``dataEncoding``, every
    non-object column of the training frame is min-max scaled, and each model
    is scored with shuffled 10-fold cross-validation on negative MSE.

    Parameters
    ----------
    dataSets : iterable, optional
        Keys understood by ``dataEncoding`` and ``filesReg``.
    regModels : iterable, optional
        Regressors to compare. Each one is also fitted on the full data as a
        side effect, exactly as before.
    names : sequence, optional
        ``names[i]`` is the ``RegsCompNames`` key of ``regModels[i]``.

    Returns
    -------
    dict
        ``{filesReg[ds]: {1: best model name, 2: its fold scores,
        3: its RMSE}}``. If no model produced a score below infinity (no
        models given, or only NaN scores) the entry is
        ``{1: None, 2: None, 3: inf}`` instead of silently reusing the
        previous dataset's winner or raising ``NameError``.
    """
    dataSets = [] if dataSets is None else dataSets
    regModels = [] if regModels is None else regModels
    names = [] if names is None else names

    myResults = {}
    for ds in dataSets:
        myData, myTrain, myVal = dataEncoding(ds, taskID='filesReg')
        #myTrain = skb(f_regression, k=3).fit_transform(myTrain,myVal)
        for name in myTrain.columns:
            if myTrain[name].dtype == 'O':
                continue  # object columns are left unscaled
            myTrain[name] = pre.minmax_scale(myTrain[name].astype('float'))
        splits = kf(n_splits=10, shuffle=True, random_state=42)

        # Reset the winner for every dataset so results never leak across
        # iterations.
        bestName, bestScores, bestRmse = None, None, float("inf")
        for count, reg in enumerate(regModels):
            reg.fit(myTrain, myVal)
            cvsScores = cvs(reg,
                            myTrain,
                            myVal,
                            cv=splits,
                            scoring='neg_mean_squared_error')
            meanSquareRootError = np.sqrt(-1 * cvsScores.mean())
            modelName = RegsCompNames[names[count]]
            print(modelName, meanSquareRootError)
            if meanSquareRootError < bestRmse:
                bestName, bestScores, bestRmse = (modelName, cvsScores,
                                                  meanSquareRootError)
        print(filesReg[ds], bestName, bestRmse)
        myResults[filesReg[ds]] = {1: bestName, 2: bestScores, 3: bestRmse}
        print('\n')
    return myResults
コード例 #3
0
def runBestClassificationKFold(dataSets=None, Classifiers=None, names=None):
    """Pick, per dataset, the classifier with the highest mean ROC AUC.

    For every dataset key the data is loaded through ``dataEncoding``, the
    target is label-encoded, and each classifier is scored with ``cvs`` on
    ``'roc_auc'`` using the ``sss`` splitter (10 splits, ~20 % test size).

    Parameters
    ----------
    dataSets : iterable, optional
        Keys understood by ``dataEncoding`` and ``filesBinClass``.
    Classifiers : iterable, optional
        Classifiers to compare. Each one is also fitted on the full data as a
        side effect, exactly as before.
    names : sequence, optional
        ``names[i]`` is the ``ClassifiersNames`` key of ``Classifiers[i]``.

    Returns
    -------
    dict
        ``{filesBinClass[ds]: {1: best model name, 2: its split scores,
        3: its mean AUC}}``. If no classifier produced a score above
        ``-inf`` (no classifiers given, or only NaN scores) the entry is
        ``{1: None, 2: None, 3: -inf}`` instead of silently reusing the
        previous dataset's winner or raising ``NameError``.
    """
    dataSets = [] if dataSets is None else dataSets
    Classifiers = [] if Classifiers is None else Classifiers
    names = [] if names is None else names

    myResults = {}
    le = pre.LabelEncoder()

    for ds in dataSets:
        myData, myTrain, myVal = dataEncoding(ds, taskID='filesBinClass')
        le.fit(myVal)
        myVal = le.transform(myVal)
        #myTrain = skb(f_regression, k=6).fit_transform(myTrain,myVal)
        #myTrain = skb(chi2, k=5).fit_transform(myTrain,myVal)
        # The test_size expression is kept verbatim; it evaluates to ~0.20.
        splits = sss(n_splits=10,
                     test_size=((len(myData) * .20) / len(myData)),
                     random_state=42)
        #splits =kf(n_splits=10, shuffle=True, random_state=42)

        # Reset the winner for every dataset so results never leak across
        # iterations.
        bestName, bestScores, bestAUC = None, None, -1.0 * float("inf")
        for count, clf in enumerate(Classifiers):
            clf.fit(myTrain, myVal)
            cvsScores = cvs(clf, myTrain, myVal, cv=splits, scoring='roc_auc')
            meanAUC = cvsScores.mean()
            modelName = ClassifiersNames[names[count]]
            print(modelName, meanAUC)
            if meanAUC > bestAUC:
                bestName, bestScores, bestAUC = modelName, cvsScores, meanAUC
        print(filesBinClass[ds], bestName, bestAUC)
        myResults[filesBinClass[ds]] = {1: bestName, 2: bestScores, 3: bestAUC}
        print('\n')
    return myResults
コード例 #4
0
 def cross_fold_val(self, model_list):
     """Cross-validate every model, print its score and plot a comparison.

     ``model_list`` is iterated as ``(name, estimator)`` pairs. Each
     estimator is scored with 5-fold ``cvs`` on ``self.A`` / ``self.C`` using
     ``'neg_mean_absolute_error'``; the absolute values of the fold scores
     are averaged. The results are stored on ``self.avg_scores``,
     ``self.std_dev`` and ``self.model_names`` and drawn as a bar chart.
     """
     self.model_list = model_list
     self.avg_scores, self.std_dev, self.model_names = [], [], []
     for label, estimator in tqdm(self.model_list):
         # cross_val_score reports MAE negated, hence the abs().
         fold_mae = abs(
             cvs(estimator,
                 self.A,
                 self.C,
                 cv=5,
                 scoring='neg_mean_absolute_error'))
         mean_mae = np.mean(fold_mae)
         spread = np.std(fold_mae)
         self.avg_scores.append(mean_mae)
         self.std_dev.append(spread)
         self.model_names.append(label)
         print("%s: %f (%f)" % (label, mean_mae, spread))

     _figure, ax = plt.subplots(figsize=(15, 7))
     plt.title(' Models with Cross Validation Scores comparison', size=20)
     plt.ylabel('Avg_scores', fontsize=15, fontweight='bold')
     plt.xlabel('model_list', fontsize=15, fontweight='bold')
     plt.xticks(fontsize=12, fontweight='bold')
     plt.yticks(fontsize=12, fontweight='bold')
     ax.bar(self.model_names, self.avg_scores)
     plt.show()
コード例 #5
0
def runBestRegressionModelKFoldwFS(dataSets=None, regModels=None, names=None):
    """Pick, per dataset, the best regressor after k=5 feature selection.

    For every dataset key the data is loaded through ``dataEncoding``, the
    features are reduced with ``skb(f_regression, k=5)``, and each model is
    scored with shuffled 10-fold cross-validation on negative MSE.

    Parameters
    ----------
    dataSets : iterable, optional
        Keys understood by ``dataEncoding`` and ``filesReg``.
    regModels : iterable, optional
        Regressors to compare. Each one is fitted on the full (selected)
        data because its ``intercept_`` and ``coef_`` are reported.
    names : sequence, optional
        ``names[i]`` is the ``regsNames`` key of ``regModels[i]``.

    Returns
    -------
    dict
        ``{filesReg[ds]: {1: best model name, 2: intercept, 3: coefficients,
        4: exp(coefficients), 5: fold scores, 6: RMSE}}``. If no model
        produced a score below infinity (no models given, or only NaN
        scores) entries 1-5 are ``None`` and entry 6 is ``inf`` instead of
        silently reusing the previous dataset's winner or raising
        ``NameError``.
    """
    dataSets = [] if dataSets is None else dataSets
    regModels = [] if regModels is None else regModels
    names = [] if names is None else names

    myResults = {}
    for ds in dataSets:
        myData, myTrain, myVal = dataEncoding(ds, taskID='filesReg')
        myTrain = skb(f_regression, k=5).fit_transform(myTrain, myVal)
        splits = kf(n_splits=10, shuffle=True, random_state=42)

        # Reset the winner for every dataset so results never leak across
        # iterations.
        best = {1: None, 2: None, 3: None, 4: None, 5: None,
                6: float("inf")}
        for count, reg in enumerate(regModels):
            reg.fit(myTrain, myVal)
            cvsScores = cvs(reg,
                            myTrain,
                            myVal,
                            cv=splits,
                            scoring='neg_mean_squared_error')
            meanSquareRootError = np.sqrt(-1 * cvsScores.mean())
            modelName = regsNames[names[count]]
            print(modelName, meanSquareRootError)
            if meanSquareRootError < best[6]:
                best = {
                    1: modelName,
                    2: reg.intercept_,
                    3: reg.coef_,
                    4: np.exp(reg.coef_),
                    5: cvsScores,
                    6: meanSquareRootError,
                }
        print(filesReg[ds], best[1], best[6])
        myResults[filesReg[ds]] = best
        print('\n')
    return myResults
コード例 #6
0
def classification(data, labels, trials=3):
    """Train an SVM-based one-vs-rest classifier and collect three scores.

    Parameters
    ----------
    data : List[List[float]]
    labels : List[int]
    trials : int
       The number of cross-validation folds.

    Returns
    -------
    tuple
       ``(training accuracy, mean cross-validation score, ROC AUC)`` as
       produced by ``getTrainingAccuracy``, ``cvs`` and ``getROCAUCScore``.
    """
    template = OneVsRestClassifier(
        svm.SVC(kernel='poly', C=1, degree=6, probability=True))
    # Every measurement works on its own unfitted copy of the estimator.
    cv_estimator, auc_estimator = clone(template), clone(template)

    trainingAccuracy = getTrainingAccuracy(template, data, labels)
    fold_scores = cvs(cv_estimator, data, labels, cv=trials)
    ROCAUC = getROCAUCScore(auc_estimator, data, labels, trials)

    return trainingAccuracy, np.mean(fold_scores), ROCAUC
コード例 #7
0
    def show_score(self):
        """Print the mean 5-fold cross-validation score of ``self.model``.

        The model is scored on ``self.X`` / ``self.targets``.
        ``cross_val_score`` is taken from ``sklearn.model_selection`` when
        that module exists and from the legacy ``sklearn.cross_validation``
        otherwise. The previous check on the exact version string sent every
        release other than 0.18.x to the legacy module, which no longer
        exists.
        """
        try:
            from sklearn.model_selection import cross_val_score as cvs
        except ImportError:
            # Very old scikit-learn releases only ship the legacy module.
            from sklearn.cross_validation import cross_val_score as cvs

        scores = cvs(self.model, self.X, self.targets, cv=5)
        print("mean of scores is: " + str(scores.mean()))
コード例 #8
0
def automatic_dt_pruning(dt_classifier, data, label):
    """Return the ``ccp_alpha`` with the best mean 5-fold CV score.

    Candidate values 0.00, 0.01, ..., 0.99 are tried in order by setting
    ``ccp_alpha`` on ``dt_classifier`` (which is therefore left configured
    with the last candidate) and scoring it with ``cvs``. Ties go to the
    smallest candidate. NumPy's global random seed is set to 42 first.
    """
    np.random.seed(42)
    candidates = [step / 100 for step in range(0, 100)]
    mean_scores = []
    for candidate in candidates:
        dt_classifier.set_params(ccp_alpha=candidate)
        fold_scores = cvs(dt_classifier, data, label, cv=5)
        mean_scores.append(fold_scores.mean())

    winner = mean_scores.index(max(mean_scores))
    return candidates[winner]
コード例 #9
0
def rfr_fillna(df_all):
    '''
    Fill the missing values of a table with random-forest predictions.

    Column layout read from ``df_all`` (by position): column 0 is kept as the
    id column, the last column is kept as ``Y``; of the columns in between,
    the first is kept unscaled (``sex``), the next 42 are min-max scaled and
    used as predictors, and all remaining ones are the columns to impute.

    For every column to impute, ``min_samples_split`` is chosen from 4..29 by
    3-fold cross-validation on the rows where the column is known, a forest
    is refitted with the best value and its predictions fill the missing
    rows. The imputed columns are then min-max scaled as well.

    params:
    df_all: the original table to fill (it is not modified)

    return: df_adda (the new table), model (dict: column -> fitted forest),
    MinMax_1st (scaler of the predictors), MinMax_2nd (scaler of the imputed
    columns)
    '''

    # Split the table into id, predictors and target.
    user_id = df_all.iloc[:, 0]
    X = df_all.iloc[:, 1:-1]
    Y = df_all.iloc[:, -1]

    X1 = X.copy()
    # Independent copies: writing into plain slices of X1 is a chained
    # assignment that pandas may apply to a temporary (and warns about).
    Y2 = X1.iloc[:, 43:].copy()
    sex = X1.iloc[:, 0]
    X2 = X1.iloc[:, 1:43].copy()

    # Bring the predictors onto a common scale.
    MinMax_1st = MinMaxScaler().fit(X2)
    X2.iloc[:, :] = MinMax_1st.transform(X2)

    X2 = pd.concat([sex, X2], axis=1)
    # Model selection per column to impute.
    model = {}
    krange = range(4, 30)
    for k in tqdm(list(Y2)):
        missing = Y2[k].isnull()
        known = Y2[k].notnull()
        X_train = X2[known]
        Y_train = Y2.loc[known, k]
        score = []
        for i in krange:
            rfr = RFR(min_samples_split=i, n_jobs=-1)
            score_each = cvs(rfr, X_train, Y_train, cv=3, n_jobs=-1).mean()
            score.append(score_each)
        best_choose = krange[int(np.argmax(score))]
        rfr = RFR(min_samples_split=best_choose, n_jobs=-1)
        rfr = rfr.fit(X_train, Y_train)
        model[k] = rfr
        # Predicting on zero rows raises, so skip columns with no gaps;
        # write through .loc so the values land in Y2 itself.
        if missing.any():
            Y2.loc[missing, k] = rfr.predict(X2[missing])

    # Scale the imputed columns as well.
    MinMax_2nd = MinMaxScaler().fit(Y2)
    Y2.iloc[:, :] = MinMax_2nd.transform(Y2)

    df_adda = pd.concat([X2, Y2], axis=1)

    df_adda = pd.concat([user_id, df_adda, Y], axis=1)
    return df_adda, model, MinMax_1st, MinMax_2nd
コード例 #10
0
    def build_classifier(self):
        """
        Build an LDA and an SVM classifier for offline training.

        Both are fitted on ``self.features`` / ``self.labels`` and stored, in
        that order, in ``self.clf``. Their 10-fold cross-validation scores
        are stored in ``self.cv_scores`` and the mean of each is printed.
        :return:
        """
        lda_clf = LDA(n_components=None, priors=None, shrinkage='auto',
                      solver='eigen', store_covariance=False, tol=0.0001)
        svm_clf = SVM(kernel='rbf', shrinking=True, probability=True,
                      gamma='scale')
        self.clf = [lda_clf, svm_clf]
        for estimator in self.clf:
            estimator.fit(self.features, self.labels)
        # predict(X), predict_log_proba(X) or predict_proba() can be used
        # on the fitted classifiers afterwards.

        self.cv_scores = []
        for estimator in self.clf:
            fold_scores = cvs(estimator=estimator, X=self.features,
                              y=self.labels, cv=10, n_jobs=-1)
            self.cv_scores.append(fold_scores)
        for position, fold_scores in enumerate(self.cv_scores):
            print('mean cv score of clf {:d} is'.format(position),
                  np.mean(fold_scores))
コード例 #11
0
ファイル: simpsons.py プロジェクト: be-ns/simpsons_analysis
def build_abr(training_x, training_y, holdout_x, holdout_y, rounds):
    '''
    Cross-validate a default AdaBoost regressor, then tune one by random
    search and score it on the holdout set.

    INPUT: training features, training target, holdout features, holdout
    target, number of random-search iterations
    OUTPUT: fitted random-search model, holdout RMSE, cross-validated
    training RMSE
    '''
    # > BUILD ADABOOST REGRESSOR
    # A default-parameter regressor gives the training error.
    base_regressor = a_br()
    # RMSE = square root of the absolute mean of the negative-MSE scores
    # from 4-fold cross-validation.
    fold_neg_mse = cvs(base_regressor,
                       training_x,
                       training_y,
                       cv=4,
                       n_jobs=-1,
                       verbose=False,
                       scoring='neg_mean_squared_error')
    abr_train = sqrt(abs(np.array(fold_neg_mse).mean()))
    print('Adaboost_cross_val_score = ', abr_train)

    # Now the test error: random search over these parameters.
    search_space = {
        "loss": ['linear', "square", "exponential"],
        "learning_rate": [.15, .25, .29, .33, .5, .6, .9],
        "n_estimators": sp_randint(250, 1500)
    }
    _abr = r_search(base_regressor,
                    param_distributions=search_space,
                    n_iter=rounds,
                    n_jobs=-1,
                    cv=4,
                    verbose=1)
    _abr.fit(training_x, training_y)

    # Holdout RMSE of the tuned model.
    holdout_predictions = _abr.predict(holdout_x)
    abr_test = sqrt(mse(holdout_y, holdout_predictions))
    print('holdout_score = ', abr_test)
    return _abr, abr_test, abr_train
コード例 #12
0
def lr_model(X_train, y_train, X_test, y_test):
    '''
    Fit an L1-regularised logistic regression and report its accuracy.

    Input: train and test matrices
    Output: nothing is returned; the mean 5-fold cross-validation accuracy
    on the training data and the accuracy on the test data are printed.
    '''
    # The solver is set explicitly: the L1 penalty is not supported by the
    # default 'lbfgs' solver of current scikit-learn releases, while
    # 'liblinear' (the former default) supports it.
    classifier = LogisticRegression(C=0.1, penalty='l1', solver='liblinear')

    classifier.fit(X_train, y_train)

    cv_score = np.mean(
        cvs(classifier, X_train, y_train, scoring='accuracy', cv=5,
            n_jobs=-1))
    y_hat = classifier.predict(X_test)
    score = metrics.accuracy_score(y_test, y_hat)

    print('LR CV Accuracy: {:.2f}'.format(cv_score))
    print('LR Test Accuracy: {:.2f}'.format(score))
コード例 #13
0
def linear_model(x_train='x_train.csv',
                 y_train='y_train.csv',
                 x_test='x_test.csv',
                 y_test='y_test.csv'):
    """Fit a linear regression, report its scores and predict the test set.

    The model is fitted on ``x_train`` / ``y_train``; the mean 10-fold
    ``cvs`` score is printed, followed by the root mean squared error between
    the logarithms of the predictions and of ``y_test``.

    NOTE(review): the defaults are CSV file names, but the arguments are
    passed straight to ``fit``/``predict`` without being loaded here, so
    callers apparently have to pass loaded data - confirm.

    Returns
    -------
    The predictions for ``x_test``.
    """
    Linear_model = LinearRegression()
    Linear_model.fit(x_train, y_train)
    scores = cvs(Linear_model, x_train, y_train, cv=10)
    print("accuracy of linearRegressor " + str(scores.mean()))

    # Predict first: the error below is computed from these predictions
    # (previously they were used before being assigned).
    y_predict = Linear_model.predict(x_test)

    rms = np.sqrt(
        np.square(np.asarray(np.log(y_predict) - np.log(y_test))).sum() /
        float(len(y_predict)))
    print('RMSE = {}'.format(rms))

    return y_predict
コード例 #14
0
def trainModel(clf, Algorithm, tr_x, tr_y, tr_te_x, tr_te_y):
    """Fit ``clf`` and report three scores, labelled with ``Algorithm``.

    Printed and returned, in order: ``clf.score`` on the training split,
    ``clf.score`` on the held-out split, and the mean 5-fold ``cvs`` score
    on the training split.
    """
    clf.fit(tr_x, tr_y)

    tr_score = clf.score(tr_x, tr_y)
    print('训练集中切分的训练数据_%s score is %.4f' % (Algorithm, tr_score), end=' ')

    te_score = clf.score(tr_te_x, tr_te_y)
    print('训练集中切分的测试数据_%s  score is %.4f' % (Algorithm, te_score), end=' ')

    fold_scores = cvs(clf, tr_x, tr_y, cv=5)
    tr_mean_score = np.array(fold_scores).mean()
    print('训练集中切分的训练数据交叉验证:_%s mena score is %.4f' % (Algorithm, tr_mean_score))

    # Cross-validation on the held-out split is intentionally not run.
    return tr_score, te_score, tr_mean_score
コード例 #15
0
ファイル: evaluation.py プロジェクト: cmry/amica
    def _cv_score(self, model: Pipeline, p_grid: dict, X: list, y: list,
                  metric: make_scorer, nest: bool = True, smote: bool = True
                  ) -> (Pipeline, float, dict):
        """Big evaluation function, handles oversampling, and cross-val."""
        neural = self.neural
        if smote:
            X, y = self._oversample(X, y, factor=3)
        if p_grid:
            # If nested add a layer of 3 splits, else just cross-validate with
            # 10. If neural apply a simple split only.
            n, _n = (10, 3) if nest else (10 if not neural else 2, 0)
            print(f"running {_n} outer, {n} inner...")
            cv = StratifiedKFold(n_splits=n, random_state=42)
            if nest:
                _cv = StratifiedKFold(n_splits=_n, random_state=42)

            # Non_nested parameter search and scoring
            grid = GridSearchCV(estimator=model, param_grid=p_grid, cv=cv,
                                scoring=metric,
                                n_jobs=1 if nest or neural else -1)
            # NOTE: n_jobs sometimes needs to be tweaked (depending on where
            # multi-threading happens). Above is the safest default config.
            grid.fit(X, y)
            print("\n> Inner CV F1:", grid.best_score_)  # Score of 10-fold

            clf = grid.best_estimator_
        else:
            try:
                assert not nest
            except AssertionError:
                raise(ValueError(
                    "Set nest to false if no p_grid is provided."))
            grid, clf = None, model

        print("\n\n> Final model:\n")
        for step in clf.steps:
            print(step)

        clf.fit(X, y)  # Refit best_estimator_ on the entire train set

        # Nested CV with parameter optimization                v only if nested
        return clf, cvs(clf, X, y, cv=_cv, scoring=metric) if nest else 0, grid
コード例 #16
0
def rf_model(X_train, y_train, X_test, y_test):
    '''
    Fit a random forest classifier and report its accuracy.

    Input: train and test matrices
    Output: nothing is returned; the mean 5-fold cross-validation accuracy
    on the training data and the accuracy on the test data are printed.
    '''
    forest = RandomForestClassifier(n_estimators=500,
                                    min_samples_leaf=4,
                                    min_samples_split=3,
                                    max_features='sqrt')
    forest.fit(X_train, y_train)

    fold_accuracy = cvs(forest, X_train, y_train, scoring='accuracy', cv=5,
                        n_jobs=-1)
    cv_score = np.mean(fold_accuracy)
    score = metrics.accuracy_score(y_test, forest.predict(X_test))

    print('RF CV Accuracy: {:.2f}'.format(cv_score))
    print('RF Test Accuracy: {:.2f}'.format(score))
コード例 #17
0
def classification():
    """Compare three classifiers on the module-level ``df``.

    The ``features`` columns of ``df`` are the inputs and ``"Gender"`` is the
    target. Each classifier is fitted on a 75 % training split and then
    scored with 5-fold ``cvs`` on the full data; the mean accuracy and twice
    its standard deviation are printed. The feature importances of the
    random forest (the first classifier) are stored in the global
    ``feat_val``.
    """
    global feat_val
    data_bal = df[features + ["Gender"]]
    x = np.array(data_bal.iloc[:, :-1])
    y = np.array(data_bal.iloc[:, -1])
    x_train, x_test, y_train, y_test = tts(x,
                                           y,
                                           test_size=0.25,
                                           random_state=23)
    labelled_models = [
        ("Random Forest Classifier", rfc(n_estimators=100, random_state=23)),
        ("Logistic Regression", lr(random_state=23)),
        ("Support Vector Machines", SVC(random_state=23)),
    ]
    print(
        "We are using the K-fold cross-validation for estimating accuracy of the models "
    )
    print(
        "The accuracy mean with its 95% confidence interval for different methods is as follows"
    )
    for position, (label, clf) in enumerate(labelled_models):
        print("Classifier : {}".format(label))
        clf.fit(x_train, y_train)
        if position == 0:
            feat_val = clf.feature_importances_
        # The test-set predictions are computed but not used further.
        y_pred = clf.predict(x_test)
        acc = cvs(clf, x, y, cv=5)
        print("Accuracy: %0.4f (+/- %0.2f)" % (acc.mean(), acc.std() * 2))
コード例 #18
0
def fit(name, dataset, target_col, feature_cols=None, exclude_cols=None):
    """Cross-validate the model registered under ``name`` and report as JSON.

    The dataset is loaded with ``load_dataset``, the model class and its
    parameters are looked up in ``MODELS[name]``, and the model is scored
    with 12-fold ``cvs`` on R^2. The returned JSON string holds the scores,
    the elapsed time and a description of the model and data. Any exception
    is caught and reported as a JSON string with an ``error`` field instead.
    """
    try:
        # Load the data and look up what to fit.
        X, y = load_dataset(dataset, target_col, feature_cols, exclude_cols)
        estimator_cls = MODELS[name]['Model']
        hyper = MODELS[name]['params']

        # Time model construction together with the cross-validation.
        started = time.time()
        model = estimator_cls(**hyper)
        r2_scores = cvs(model, X, y, scoring='r2', cv=12)
        elapsed = time.time() - started

        report = {}
        report['model'] = name
        report['hyperparameters'] = model.get_params()
        report['repr'] = str(model)
        report['r2_scores'] = list(r2_scores)
        report['elapsed'] = elapsed
        report['target_col'] = target_col
        report['feature_cols'] = feature_cols
        report['X_shape'] = X.shape
        report['y_shape'] = y.shape
        return json.dumps(report)
    except Exception as e:
        failure = {
            'model': name,
            'dataset': dataset,
            'target_col': target_col,
            'feature_cols': feature_cols,
            'error': str(e),
        }
        return json.dumps(failure)
コード例 #19
0
def gb_model(X_train, y_train, X_test, y_test):
    '''
    Fit a gradient boosting classifier and report its accuracy.

    Input: train and test matrices
    Output: nothing is returned; the mean 5-fold cross-validation accuracy
    on the training data and the accuracy on the test data are printed.
    '''
    settings = dict(learning_rate=0.1,
                    loss='exponential',
                    max_depth=2,
                    max_features=None,
                    min_samples_leaf=2,
                    min_samples_split=2,
                    n_estimators=100,
                    subsample=0.5)
    booster = GradientBoostingClassifier(**settings)
    booster.fit(X_train, y_train)

    fold_accuracy = cvs(booster, X_train, y_train, scoring='accuracy', cv=5,
                        n_jobs=-1)
    cv_score = np.mean(fold_accuracy)
    score = metrics.accuracy_score(y_test, booster.predict(X_test))

    print('GB CV Accuracy: {:.2f}'.format(cv_score))
    print('GB Test Accuracy: {:.2f}'.format(score))
コード例 #20
0
# Flatten the targets and transpose the feature matrix before splitting.
y = y.flatten()
x = x.T

# (Disabled) remapping of -1 labels to 0.

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.3)

# Fit the model on the training split.
model = LogisticRegression(C=1.0, solver='newton-cg')
model.fit(x_train, y_train)

# 5-fold cross-validated accuracy and ROC AUC on the training split.
score = cvs(model, x_train, y_train, cv=5, scoring='accuracy')
auc1 = cvs(model, x_train, y_train, cv=5, scoring='roc_auc')
print('score = {}'.format(score.mean()))
print('auc = {}'.format(auc1.mean()))

# Probability of the positive outcome (second column) for the test split.
probs = model.predict_proba(x_test)[:, 1]
# ROC AUC on the test split.
auc = roc_auc_score(y_test, probs)
print(auc)

fpr, tpr, thresholds = roc_curve(y_test, probs)
コード例 #21
0
# Run the search object `grid` on the full data and keep the best
# parameter set and its score.
gSearch = grid.fit(input_variables, output)
best_params = gSearch.best_params_
best_accuracy = gSearch.best_score_

# summarize results
print("Best score: %f using params %s" %
      (gSearch.best_score_, gSearch.best_params_))
# One line per evaluated parameter combination: mean (std) of the test score.
means = gSearch.cv_results_['mean_test_score']
stds = gSearch.cv_results_['std_test_score']
params = gSearch.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# evaluate using 10-fold cross validation
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=10)
results = cvs(classifier, input_variables, output, cv=kfold)
print(results.mean())
'''Check point of ANN model improvements while training by max mode'''
# The file name template is filled with the epoch number and the monitored
# 'val_acc' value; only checkpoints that improve on the best value so far
# are written (save_best_only with mode='max').
# NOTE(review): the metric key 'val_acc' must match the name the training
# framework reports - confirm for the installed version.
filepath = 'weights-improvement-{epoch:02d}-{val_acc:.2f}.hdf5'
checkpoint = ModelCheckpoint(filepath,
                             monitor='val_acc',
                             verbose=1,
                             save_best_only=True,
                             mode='max')
callbacks_list = [checkpoint]

# Fit the model
classifier.fit(xTrain,
               yTrain,
               validation_split=0.33,
               epochs=150,
コード例 #22
0
ファイル: simpsons.py プロジェクト: be-ns/simpsons_analysis
def build_gbr(training_x, training_y, holdout_x, holdout_y, _abr, abr_test,
              rounds):
    '''
    Cross-validate a gradient boosted regressor, then repeat a random search
    until the holdout score no longer exceeds the previous model's score.

    INPUT: training features, training targets, holdout features,
    holdout target, previous model, test score from previous model,
    number of random-search iterations

    OUTPUT: final model, final_score, final model training score

    NOTE: the search is repeated for as long as the holdout score stays above
    ``abr_test``, so the function does not return until a search meets that
    threshold. At least one search is always run; previously the function
    failed on an unbound ``final_model`` when ``abr_test`` was 100 or more.
    '''
    # > BUILD GRADIENT BOOSTED REGRESSOR
    # A fixed-parameter model gives the training error.
    _gbr = g_br(loss='lad',
                learning_rate=.1,
                n_estimators=500,
                warm_start=False,
                verbose=False)
    # RMSE = square root of the absolute mean of the negative-MSE scores
    # from 5-fold cross-validation.
    final_train = sqrt(
        abs(
            np.array(
                cvs(_gbr,
                    training_x,
                    training_y,
                    cv=5,
                    n_jobs=-1,
                    verbose=False,
                    scoring='neg_mean_squared_error')).mean()))
    print('Gradient_Boosted_score_cross_val_score = ', final_train)

    # Now on to the test error. The search space is the same on every
    # attempt, so it is built once.
    param_distribution = {
        "max_depth": [3, 4, 5],
        "learning_rate": [.2, .3, .4],
        "n_estimators": sp_randint(500, 3000)
    }
    n_iter_search = rounds

    # Iteratively train a random-searched gradient boosted model on the
    # training data until it beats the previous model's score.
    while True:
        # make simple Gradient Boosted Regressor
        _gbr = g_br(loss='lad', verbose=True)

        # implement Random Search
        final_model = r_search(_gbr,
                               param_distributions=param_distribution,
                               n_iter=n_iter_search,
                               n_jobs=-1,
                               cv=4,
                               verbose=1)
        # fit to training set
        final_model.fit(training_x, training_y)

        # get score for holdout set
        final_test = predict_on_holdout(_abr, final_model, holdout_x,
                                        holdout_y)
        print('final_score = ', final_test)
        # if threshold not met, try again
        if not final_test > abr_test:
            break
    # return model and scores
    return final_model, final_test, final_train
コード例 #23
0
# compute accuracy of the classifier
# (share of test samples whose prediction equals the true label, in percent)
accuracy = 100.0 * (y_test == y_test_pred).sum() / x_test.shape[0]
print("Accuracy of the classifier =", round(accuracy, 2), "%")

plot_classifier(classifier_gaussiannb_new, x_test, y_test)

###############################################
# Cross validation and scoring functions

# Number of cross-validation folds used for every metric below.
num_validations = 5

# https://scikit-learn.org/stable/modules/model_evaluation.html
# NOTE: `accuracy` is rebound here from the percentage above to the array of
# per-fold accuracy scores.
accuracy = cvs(classifier_gaussiannb,
               x,
               y,
               scoring='accuracy',
               cv=num_validations)
print("Accuracy: " + str(round(100 * accuracy.mean(), 2)) + "%")

# precision is calculated by total number of correct identifications
# divided by the total number of identifications.
precision = cvs(classifier_gaussiannb,
                x,
                y,
                scoring='precision_weighted',
                cv=num_validations)
print("Precision: " + str(round(100 * precision.mean(), 2)) + "%")

# recall is calculated by total number of correct identifications
# divided by the total number of interesting items in the dataset
コード例 #24
0
# @Author  : AlwaysDazz
# @Time    : 2021/5/9 10:58
# @IDE:    : PyCharm
# @Project : pythonProject
# @Comment : 回归树模型,数据集波士顿房价数据集

from sklearn.tree import DecisionTreeRegressor as regressor  #回归树模型
from sklearn.model_selection import cross_val_score as cvs  #交叉验证方法
from sklearn.datasets import load_boston  #波士顿房价
import pandas as pd  # 对数据进行观察

# Load the Boston housing data and take a first look at it.
boston = load_boston()
pd_data = pd.concat([pd.DataFrame(boston.data),
                     pd.DataFrame(boston.target)],
                    axis=1)
data = pd.DataFrame(boston.data)  # 506 rows x 13 columns
# Name the 13 feature columns. The original indexed `data.columns` with
# these names instead of assigning them, which raises an error.
data.columns = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
data_col = data.columns

print(data_col)
# Instantiate the regression tree; only the seed is fixed for now, other
# parameters can be tuned later.
reg = regressor(random_state=0)
# 10-fold cross-validation of the tree on the data and targets. The
# regressor's default score is R^2; negative mean squared error is
# requested instead.
res = cvs(reg,
          boston.data,
          boston.target,
          cv=10,
          scoring="neg_mean_squared_error"
          )
print(res)
# Restore the column names on the imputed training frame (wrapping the
# imputer output in a new DataFrame, as done for the test frame below,
# does not carry them over).
imputed_encoded_x_train_plus.columns = encoded_x_train_plus.columns

# NOTE(review): the imputer is re-fitted on the test data here
# (fit_transform rather than transform) - confirm this is intended.
imputed_encoded_x_test_plus = pd.DataFrame(
    imputer.fit_transform(encoded_x_test_plus))
imputed_encoded_x_test_plus.columns = encoded_x_test_plus.columns

#Align testing and training data sets
# (inner join on columns: only columns present in both frames are kept)
final_train, final_test = imputed_encoded_x_train_plus.align(
    imputed_encoded_x_test_plus, join='inner', axis=1)

#Create model and fit
my_model = GradientBoostingRegressor()
my_model.fit(final_train, y)

#Use cross validation to evaluate model
# The scorer returns negated errors, hence the sign flip when printing.
scores = cvs(my_model, final_train, y, scoring='neg_mean_absolute_error')
print('Mean Absolute Error using Cross Validation is: ', (-1 * scores.mean()))

#Plot some partial dependences
# (for the features at column positions 2 and 5)
my_graphs = plot_partial_dependence(my_model,
                                    X=final_train,
                                    features=[2, 5],
                                    feature_names=final_train.columns,
                                    grid_resolution=10)

#Predict saleprice of test data
predictions = my_model.predict(final_test)

#Create a submission dataframe and export to a csv file
output = pd.DataFrame({'Id': test_data.Id, 'SalePrice': predictions})
output.to_csv('submission.csv', index=False)
コード例 #26
0
#predicting the test set results
y_pred=classifier.predict(x_test)

from sklearn.metrics import confusion_matrix, classification_report

# Confusion matrix of true vs. predicted labels, drawn as an annotated heatmap.
cm=confusion_matrix(y_test, y_pred)
plt.figure(figsize = (5,5))
sns.heatmap(cm, annot=True)
plt.xlabel('Predicted')
plt.ylabel('Truth')

print(classification_report(y_test, y_pred))

#applying k-fold cross validation
# (10 folds on the training split; mean and standard deviation are printed)
from sklearn.model_selection import cross_val_score as cvs
accuracies = cvs(estimator=classifier,X=x_train,y=y_train,cv=10)
print(accuracies.mean())
print(accuracies.std())

"""Logistic Regression"""

#fitting logistic regression to the training set
# NOTE: `classifier` is rebound here; the model evaluated above is replaced.
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0)
classifier.fit(x_train, y_train)

#predicting the test set results
y_pred=classifier.predict(x_test)

from sklearn.metrics import confusion_matrix, classification_report
コード例 #27
0
ファイル: analysis.py プロジェクト: cnzmeca/tufts
    return data


def make_matrix(l):
    """Build a ROW x COL zero-filled matrix and write the given records.

    Each record in ``l`` is indexed as ``(row, column, value)``; later
    records overwrite earlier ones that address the same cell.
    """
    grid = np.full((ROW, COL), 0)
    for record in l:
        row, col, value = record[0], record[1], record[2]
        grid[row][col] = value
    return grid


if __name__ == "__main__":
    # Load the three input tables.
    train = read_csv(TRAIN)
    gender = read_csv(GENDER)
    year = read_csv(YEAR)

    # Feature matrices: the record matrix and its transpose; the targets are
    # the first column of the gender and year tables.
    X1 = make_matrix(train)
    X2 = X1.T
    Y1 = np.asarray(gender).T[0]
    Y2 = np.asarray(year).T[0]

    # Task 1: 10-fold cross-validated score on the gender target; report the
    # error of the best fold.
    clf1 = logr()
    scores = cvs(clf1, X1, Y1, cv=10)
    best_fold_score = max(scores)
    print("Min CV error: {}".format(1 - best_fold_score))

    # Task 2: 10-fold cross-validated predictions of the year target,
    # compared against a constant prediction filled with the mean of Y2.
    clf2 = logr(solver="saga", multi_class="multinomial")
    pred = cvp(clf2, X2, Y2, cv=10)
    mse1 = mse(Y2, pred)
    baseline = np.full_like(Y2, np.mean(Y2))
    mse2 = mse(Y2, baseline)
    print("Regression MSE: {}".format(mse1))
    print("Naive MSE: {}".format(mse2))
コード例 #28
0
# print("RMSE: ", rmse) # prediction error of 68628.19819848923 USD

# For an underfitting model we can pick a more powerful model, provide
# better features, loosen the constraints, and so on.
# Here we switch to a more powerful model: DecisionTreeRegressor
from sklearn.tree import DecisionTreeRegressor as DTR
tree_reg = DTR()
tree_reg.fit(housing_prepared, housing_labels)
predicted = tree_reg.predict(housing_prepared)
# mse = mean_squared_error(housing_labels, predicted)
# rmse = np.sqrt(mse)
# print("RMSE: ", rmse) # RMSE:  0.0
# The result looks 100% accurate, but it may well be overfitting.

# So cross-validation is used to evaluate the model more reliably.
from sklearn.model_selection import cross_val_score as cvs
scores = cvs(tree_reg, housing_prepared, housing_labels, scoring = "neg_mean_squared_error", cv = 10)
rmse_scores = np.sqrt(-scores)
# Scikit-Learn cross-validation expects a utility function (greater is
# better) rather than a cost function (lower is better), so the score is
# actually the negative MSE.
# Helper to inspect the results:
def display_score(scores):
    """Print the scores followed by their mean and standard deviation."""
    print("Score: ", scores)
    average = scores.mean()
    print("Mean:", average)
    spread = scores.std()
    print("Stardard deviation: ", spread)

# display_score(rmse_scores)
# Mean: 71227.31692492112
# Stardard deviation:  2926.49161963209

# Compare with the cross-validation score of the linear regression model
# (`lr` is defined earlier in the script).
lr_scores = cvs(lr, housing_prepared, housing_labels, scoring = "neg_mean_squared_error", cv = 10)
lr_rmse = np.sqrt(-lr_scores)
コード例 #29
0
# Bundle every sample as [features, activation label, valence label, speaker].
final = [
    [featureList[i], labelListAct[i], labelListVal[i], speakerList[i]]
    for i in range(len(featureList))
]

# Order the samples with the getFourth key (presumably the speaker, i.e.
# the fourth entry - confirm against its definition).
final.sort(key = getFourth)

# Rebuild the feature and label lists in the sorted order.
featureList = [sample[0] for sample in final]  # extracted features of each training sample
labelListAct = [sample[1] for sample in final]  # labels (emotions) of each training sample
labelListVal = [sample[2] for sample in final]

clf = svm.SVC(gamma = 'auto')

# 24-fold cross-validation scores for both label sets.
predictionsAct = cvs(clf, featureList, labelListAct, cv = 24)
predictionsVal = cvs(clf, featureList, labelListVal, cv = 24)

for title, fold_scores in (('Binary Activation', predictionsAct),
                           ('Binary Valence', predictionsVal)):
    print(title)
    print(fold_scores)
    print(np.mean(fold_scores))
コード例 #30
0
seaborn.regplot(x='petal_length', y='petal_width', data=iris)
plt.show()

# Train a linear regression model
lm = linear_model.LinearRegression()
features = ['petal_length']
X = iris[features]
y = iris['petal_width']
model = lm.fit(X, y)
# Print the intercept and the coefficients
print(model.intercept_, model.coef_)
# Predict petal_width for petal_length = 4. predict() needs a 2-D
# (n_samples, n_features) input, so the single value is wrapped as [[4]]
# (a bare scalar is rejected).
predict = model.predict([[4]])
print("petal_width's value : ", predict)
# Evaluate predictive performance with 5-fold cross-validation
scores = -cvs(lm, X, y, cv=5, scoring='neg_mean_absolute_error')
# Mean of the mean absolute errors
ave_score = numpy.mean(scores)
print(ave_score)

# Switch to two features
features = ['petal_length', 'sepal_length']
X = iris[features]
y = iris['petal_width']
model = lm.fit(X, y)
print(model.intercept_, model.coef_)
predict = model.predict([[1, 2]])
print("petal_width's value : ", predict)
scores = -cvs(lm, X, y, cv=5, scoring='neg_mean_absolute_error')
ave_score = numpy.mean(scores)
print(ave_score)