import math

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_validate, train_test_split


def rfmodel(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    rf = RandomForestRegressor(n_estimators=500)
    rf.fit(X_train, y_train)
    y_train_pred = rf.predict(X_train)
    y_test_pred = rf.predict(X_test)
    scores = cross_validate(rf, X, y, cv=5, scoring=('r2', 'neg_mean_squared_error'))
    cv_mse = -scores['test_neg_mean_squared_error']
    cv_rmse = np.sqrt(cv_mse).mean()
    cv_r2 = scores['test_r2'].mean()
    print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_train_pred),r2_score(y_test, y_test_pred)))
    print('RMSE train: %.3f, test: %.3f' % (math.sqrt(mean_squared_error(y_train, y_train_pred)),math.sqrt(mean_squared_error(y_test, y_test_pred))))
    print('CV R^2: %.3f, RMSE: %.3f' % (cv_r2,cv_rmse))
    return y_train, y_train_pred, y_test, y_test_pred
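
# Minimal usage sketch (an illustration, not part of the original snippet):
# exercise rfmodel on synthetic regression data from sklearn.datasets.
if __name__ == "__main__":
    from sklearn.datasets import make_regression

    X_demo, y_demo = make_regression(n_samples=200, n_features=5, noise=10.0, random_state=0)
    rfmodel(X_demo, y_demo)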
Example #2
import numpy as np
from sklearn.model_selection import cross_validate


def cv_evaluate(clf, train_data, train_labels):
    """Evaluate a classifier with cross-validation (10-fold by default)
    and report precision, recall, and F1-measure.

    Args:
    --------
    clf: the classifier
    train_data: the training data
    train_labels: the training labels

    Returns:
    --------
    None; the scores are printed.
    """
    print("Using cross validation")
    scoring = ['precision', 'recall', 'f1']
    scores_cv = cross_validate(clf, train_data, train_labels.ravel(), cv=10, scoring=scoring)
    precision, recall, f1 = np.mean([scores_cv['test_precision'], scores_cv['test_recall'], scores_cv['test_f1']], axis=1)
    print('=' * 20, 'RESULT', '=' * 20)
    print("Precision:  %.6f, Recall: %.6f, F1_score: %.6f" % (precision, recall, f1))
Example #3
def train_and_test_dnn(args):
    
    for a in args:
        print(a)
    
    primitive = args[1]
    res = pickle.load(open(args[2], "rb"))
    notes_with_truth_labels_for_query_primitives = pd.read_csv(args[3])
   
    dl_results = pd.DataFrame(columns=['primitive', 'avg_fit_time', 'avg_score_time', 'avg_test_score'])
    
    X = get_doc_term_matrix(res)
    y = notes_with_truth_labels_for_query_primitives.loc[:, primitive]

    clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(128, 5, 2), random_state=1)

    try:

        sm = SMOTE(random_state=357)
        # imblearn's current API is fit_resample (fit_sample was removed)
        X_sm, y_sm = sm.fit_resample(X, y)

    except ValueError:
        print("value error, smote")
        X_sm = X
        y_sm = y

    cv_results = cross_validate(clf, X_sm, y_sm, cv=3, return_train_score=False)
    print(cv_results)

    # cross_validate fits clones, so fit clf itself before persisting it
    clf.fit(X_sm, y_sm)
    dump(clf, './models/{}_trained_dnn.joblib'.format(primitive))

    dl_results.loc[0, 'primitive'] = primitive
    dl_results.loc[0, 'avg_fit_time'] = np.mean(cv_results['fit_time'])
    dl_results.loc[0, 'avg_score_time'] = np.mean(cv_results['score_time'])
    dl_results.loc[0, 'avg_test_score'] = np.mean(cv_results['test_score'])

    with open(args[4], 'a') as f:
        f.write("{}, {}, {}, {}\n".format(dl_results.loc[0, 'primitive'], dl_results.loc[0, 'avg_fit_time'], dl_results.loc[0, 'avg_score_time'], dl_results.loc[0, 'avg_test_score']))
    
    print("DONE w/ {}".format(primitive))
Example #4
# Note: here, we need to increase the maximum number of iterations to obtain a
# fully converged LogisticRegression and silence a ConvergenceWarning. Contrary
# to the numerical features, the one-hot encoded categorical features are all
# on the same scale (values are 0 or 1), so they would not benefit from
# scaling. In this case, increasing max_iter is the right thing to do.

# Finally, we can check the model's statistical performance only using the
# categorical columns.
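
# `model` is defined earlier in the original notebook; a minimal sketch of a
# compatible pipeline (an assumption, not the notebook's exact code):

# In[ ]:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression

model = make_pipeline(OneHotEncoder(handle_unknown="ignore"),
                      LogisticRegression(max_iter=500))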

# In[ ]:

from sklearn.model_selection import cross_validate
cv_results = cross_validate(model, data_categorical, target)
cv_results

# In[ ]:

scores = cv_results["test_score"]
print(f"The accuracy is: {scores.mean():.3f} +/- {scores.std():.3f}")

# As you can see, this representation of the categorical variables is
# slightly more predictive of the revenue than the numerical variables
# that we used previously.

#
# In this notebook we have:
# * seen two common strategies for encoding categorical features: **ordinal
#   encoding** and **one-hot encoding**;
Example #5
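    # Note: specificity_score is assumed to come from imblearn.metrics
    # (from imblearn.metrics import specificity_score); it is not part of sklearn.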
    scorerMCC = metrics.make_scorer(metrics.matthews_corrcoef)
    scorerSP = metrics.make_scorer(specificity_score)
    scorerPR = metrics.make_scorer(metrics.precision_score)
    scorerSE = metrics.make_scorer(metrics.recall_score)

    scorer = {
        'ACC': 'accuracy',
        'recall': scorerSE,
        'roc_auc': 'roc_auc',
        'MCC': scorerMCC,
        'SP': scorerSP
    }
    five_fold = model_selection.cross_validate(clf,
                                               gram_train,
                                               y_train,
                                               cv=cv,
                                               scoring=scorer)

    mean_ACC = np.mean(five_fold['test_ACC'])
    mean_sensitivity = np.mean(five_fold['test_recall'])
    mean_AUC = np.mean(five_fold['test_roc_auc'])
    mean_MCC = np.mean(five_fold['test_MCC'])
    mean_SP = np.mean(five_fold['test_SP'])

    # five-fold means, printed in order: sensitivity, specificity, ACC, MCC, AUC
    print(mean_sensitivity)
    print(mean_SP)
    print(mean_ACC)
    print(mean_MCC)
    print(mean_AUC)
Example #6
import joblib
import pandas as pd
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
from sklearn.linear_model import LogisticRegression
#warnings.filterwarnings("ignore", category=DeprecationWarning)
from sklearn.preprocessing import StandardScaler
import random
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_validate
import sweetviz
from sklearn.metrics import accuracy_score
data = pd.read_csv(
    "/home/gulshan/Desktop/Diseaseprediction/Diseaseprediction/Datasets/diabetes.csv"
)
#print(data)
#my_report=sweetviz.analyze(data)
#my_report.show_html('report.html')
target = data['Outcome']
data = data.drop(['Outcome'], axis=1)
sc = StandardScaler()
data = sc.fit_transform(data)

lr = LogisticRegression()
lr.fit(data, target)
cv_results = cross_validate(lr, data, target, cv=10)
print("10-fold CV mean accuracy:", cv_results['test_score'].mean())
print(lr.predict(data))
#joblib.dump(lr,"Diabetes_Model")
#joblib.dump(sc,'dscaler')
y_pred = lr.predict(data)

#print(accuracy_score(target, y_pred))  # 78.38% accuracy
Example #7
train_err = [0] * len(ks)
test_err = [0] * len(ks)
train_err2 = [0] * len(ks)
test_err2 = [0] * len(ks)
cv_scores = [0] * len(ks)
cv_scores2 = [0] * len(ks)

for i, k in enumerate(ks):
    print('kNN: learning a kNN classifier with k = ' + str(k))
    clf = KNeighborsClassifier(n_neighbors = k)
    clf.fit(X_train, y_train)
    clf2 = KNeighborsClassifier(n_neighbors = k, weights='distance')
    clf2.fit(X_train, y_train)
    train_err[i] = accuracy_score(y_train, clf.predict(X_train))
    cv_results = cross_validate(clf, X_train, y_train, cv=5, scoring='accuracy', return_train_score=True, return_estimator=True)
    cv_scores[i] = cv_results['test_score'].mean()
    index = np.argmax(cv_results['test_score'])
    estimator = cv_results['estimator'][index]
    YpredTest = estimator.predict(X_test)
    test_err[i] = accuracy_score(y_test, YpredTest)
    train_err2[i] = accuracy_score(y_train, clf2.predict(X_train))
    cv_results = cross_validate(clf2, X_train, y_train, cv=5, scoring='accuracy', return_train_score=True, return_estimator=True)
    cv_scores2[i] = cv_results['test_score'].mean()
    index = np.argmax(cv_results['test_score'])
    estimator = cv_results['estimator'][index]
    YpredTest = estimator.predict(X_test)
    test_err2[i] = accuracy_score(y_test, YpredTest)
    print('---')

# Plot results
Example #8
        ########## Prediction and Reporting ################
        boston_Y_pred = boston_ridge_reg.predict(boston_X_test)
        boston_Y_train_pred = boston_ridge_reg.predict(boston_X_train)

        test_error_boston = mean_squared_error(boston_Y_test, boston_Y_pred)
        train_error_boston = mean_squared_error(boston_Y_train,
                                                boston_Y_train_pred)
        r2_score_boston = r2_score(boston_Y_train, boston_Y_train_pred)

        ########## Cross validation K = 5 ##################
        cross_val_boston = np.abs(
            np.mean(
                cross_validate(
                    boston_ridge_reg,
                    boston_X_train,
                    boston_Y_train,
                    cv=5,
                    scoring='neg_mean_squared_error')['test_score']))

        boston_test_error_ridge.append(test_error_boston)
        boston_train_error_ridge.append(train_error_boston)
        boston_r2_score_ridge.append(r2_score_boston)
        boston_cv.append(cross_val_boston)

    boston_test_error_ridge_global.append(boston_test_error_ridge)
    boston_train_error_ridge_global.append(boston_train_error_ridge)
    boston_r2_score_ridge_global.append(boston_r2_score_ridge)
    boston_cv_global.append(boston_cv)

########### Plotting the reports ####################
ymin_error = np.min(
Example #9
print(le_embarked.classes_)
titanic_train['Embarked'] = le_embarked.transform(titanic_train['Embarked'])

le_sex = preprocessing.LabelEncoder()
le_sex.fit(titanic_train['Sex'])
print(le_sex.classes_)
titanic_train['Sex'] = le_sex.transform(titanic_train['Sex'])

features = ['Pclass', 'Parch', 'SibSp', 'Age', 'Fare', 'Embarked', 'Sex']
X_train = titanic_train[features]
y_train = titanic_train['Survived']

knn_estimator = neighbors.KNeighborsClassifier()
knn_estimator.fit(X_train, y_train)

scores = model_selection.cross_validate(knn_estimator, X_train, y_train, cv=10, return_train_score=True)
test_scores = scores.get("test_score")
print(test_scores.mean())

train_scores = scores.get("train_score")
print(train_scores.mean())

#read test data
titanic_test = pd.read_csv(
    "C:\\Users\\Algorithmica\\Downloads\\titanic_test.csv")
print(titanic_test.info())

titanic_test[imputable_cont_features] = cont_imputer.transform(
    titanic_test[imputable_cont_features])
titanic_test['Embarked'] = cat_imputer.transform(titanic_test['Embarked'])
titanic_test['Embarked'] = le_embarked.transform(titanic_test['Embarked'])
Example #10
    ("imputer", MostFrequentImputer()),
    ("cat_encoder", OneHotEncoder(sparse=False)),
])

# Combine the numerical and categorical pipelines
preprocess_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

X_train = preprocess_pipeline.fit_transform(train_data)
y_train = train_data["Survived"]

from sklearn.svm import SVC
from sklearn import tree

decision_tree = tree.DecisionTreeClassifier()

cv_results = cross_validate(decision_tree,
                            X_train,
                            y_train,
                            cv=5,
                            return_train_score=True)

# decision_tree.fit(X_train, y_train)

# X_test = preprocess_pipeline.transform(test_data)
# y_pred = decision_tree.predict(X_test)

print(cv_results)
Example #11
    # define oversampling strategy
    oversample = RandomOverSampler(sampling_strategy=i, random_state=1)
    #print(Counter(Y))
    X_ov, Y_ov = oversample.fit_resample(X_train, dummy_y_train)
    #print(Counter(Y_ov))

    under = RandomUnderSampler(sampling_strategy=1, random_state=1)
    X_un, Y_un = under.fit_resample(X_ov, Y_ov)
    #print(Counter(Y_un))


    model = create_model()
    model.fit(X_un, Y_un, validation_split=0.2, epochs=num_epochs, batch_size=batch_size, verbose=1)
    y_pred = model.predict(X_test, batch_size=32)
    # NOTE: cross_validate expects an sklearn-compatible estimator; a raw Keras
    # model needs a wrapper such as KerasClassifier for this call to work.
    scores = cross_validate(estimator=model, X=X_train, y=dummy_y_train, cv=10, return_train_score=True)

#     class_names=['normal','dos','probe','u2r','r2l']

    report_dict = classification_report(dummy_y_Test.argmax(axis=1), y_pred.argmax(axis=1), target_names=class_names, output_dict=True)
    normal_f.append(report_dict['normal']['f1-score'])
    dos_f.append(report_dict['dos']['f1-score'])
    probe_f.append(report_dict['probe']['f1-score'])
    u2r_f.append(report_dict['u2r']['f1-score'])
    r2l_f.append(report_dict['r2l']['f1-score'])
    
    
ratios = [0, 0.2,0.25,0.3,0.35,0.4,0.45,0.5,0.55,0.6,0.65,0.7,0.75,0.8,0.85,0.9,0.95,1]
fig, ax = plt.subplots()
ax.set_xlabel("Oversampling Ratio")
ax.set_ylabel("F1-score")
Example #12

"Feature Importance"

input_features = [
    column for column in list(individuals_train) if column != 'individuals'
]
X = individuals_train[input_features].copy()  # copy to avoid SettingWithCopyWarning
X['random_noise'] = np.random.normal(size=X.shape[0])
y = individuals_train['individuals']

# RF K-Fold train
regressor = RandomForestRegressor(n_jobs=-1)
cv = cross_validate(estimator=regressor,
                    X=X,
                    y=y,
                    cv=5,
                    return_estimator=True)

feature_importance = {}
for k in range(0, len(cv['estimator'])):
    feature_importance['k_{}'.format(
        k + 1)] = cv['estimator'][k].feature_importances_
feature_importance = pd.DataFrame(feature_importance, index=X.columns)

feature_importance = feature_importance.mean(axis=1).to_frame('importance') \
        .sort_values('importance', ascending=False)
feature_selection = feature_importance.to_dict()

# Get importance concentration score
importance_concentration = (feature_importance.iloc[1] /
Example #13
Documentation
'''
if __name__ == '__main__':
    from Models_data_prep.NYTaxi_cross_ref_data_split_train_test import X_train, Y_train
    mse = make_scorer(mean_squared_error, greater_is_better=False)
    seed = 42
    n_split = 10

    dirname = 'linear_regression'
    # assumes `from datetime import datetime`
    reportname = 'RMSE_scores_{}.csv'.format(datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))

    os.makedirs(os.path.join(Path_Reports, dirname), exist_ok=True)

    cv = ShuffleSplit(n_splits=n_split, test_size=0.2, random_state=seed)
    model = LinearRegression()
    scores = cross_validate(model,
                            X_train.reshape(-1, 1),
                            Y_train,
                            scoring=mse,
                            cv=cv,
                            verbose=1,
                            return_train_score=True)

    df = pd.DataFrame.from_dict(scores)
    # df.test_mean_squared_error = np.sqrt(df.test_score)
    # df.train_mean_squared_error = np.sqrt(df.train_score)
    df.to_csv(os.path.join(Path_Reports, dirname, reportname))
Example #14
    SVC(),
    KNeighborsClassifier(),
    LogisticRegression(),
    RandomForestClassifier(),
    GaussianNB(),
    LinearSVC(),
    DecisionTreeClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier()
]

# Cross validation
cv_train_score = list()
cv_test_score = list()
for model in mod:
    cv_result = cross_validate(model, train_data, train_Survived, cv=cv_split, return_train_score=True)
    cv_train_score.append(cv_result['train_score'].mean())
    cv_test_score.append(cv_result['test_score'].mean())

cv_model = pd.DataFrame({
    'Model': [
        'Support Vector Machines', 'KNN', 'Logistic Regression',
        'Random Forest', 'Naive Bayes', 'Linear SVC', 'Decision Tree',
        'AdaBoost Classifier', 'Gradient Boosting Classifier'
    ],
    'CVTrainScore': cv_train_score,
    'CVTestScore': cv_test_score
})
Example #15
def cross_validation(model, x, y, cv=3):
    # NOTE: relies on a module-level `score_fn` scoring specification
    cv_res = cross_validate(model, x, y, return_train_score=True, scoring=score_fn, cv=cv)
    return cv_res
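
# Minimal usage sketch (illustrative; `score_fn` is assumed to be a
# multi-metric scoring spec, e.g. a list of scorer names):
if __name__ == "__main__":
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import cross_validate  # needed by cross_validation

    score_fn = ['accuracy', 'f1']
    X_demo, y_demo = make_classification(n_samples=300, random_state=0)
    res = cross_validation(LogisticRegression(max_iter=1000), X_demo, y_demo, cv=5)
    print(res['test_accuracy'].mean(), res['test_f1'].mean())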
Example #16
print(f"[INFO] Reading data from {arg['dataset']}")
X, y = data_to_model(pd.read_csv(arg["dataset"]))

## PLAIN MULTILAYER PERCEPTRON REGRESSOR

report.write("EXPERIMENT 1. PLAIN MULTILAYER PERCEPTRON REGRESSOR:\n")
report.write("\t\t Unscaled data\n\n")

scoring = {
    'r2': 'r2',
    "explained_variance_score": 'explained_variance',
    "max error": 'max_error'
}
regr = MLPRegressor()
scores = cross_validate(regr, X, y, cv=10, n_jobs=-1, verbose=1, scoring=scoring)

print(scores)

report.write(f"10 fold-cross validation: \n{scores}\n")

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.10,
                                                    random_state=42)

print(f"[INFO] Fitting model")
regr.fit(X_train, y_train)

y_pred = regr.predict(X_test)
Example #17
rf_RSCV_end_time = time.time()
duration = rf_RSCV_end_time - rf_RSCV_start_time

print(f'Randomized CV search done. {search_iters} iterations took '
      f'{int(duration // 3600):02d}:{int((duration % 3600) // 60):02d}:{int(duration % 60):02d}')

# print the best parameters chosen by CV
pprint.pprint(rf_RSCV.best_params_)

# get CV results with best parameters
rf_clf.set_params(**rf_RSCV.best_params_)
rf_cv = cross_validate(rf_clf,
                       X_train,
                       y_train,
                       cv=5,
                       n_jobs=32,
                       scoring={
                           'log_loss': log_loss_scorer,
                           'accuracy': accuracy_scorer
                       })

print('RF 5-fold Validation Performance')
# note test_log_loss is negated due to how scorers work
# in parameter searches in sklearn
print('Mean Log Loss\t{}'.format(np.mean(-rf_cv['test_log_loss'])))
print('Mean Accuracy\t{}'.format(np.mean(rf_cv['test_accuracy'])))

# get performance on test set
rf_clf.fit(X_train, y_train)
rf_y_test_pred = rf_clf.predict(X_test)

print('RF Test Set Performance')
Example #18
# :class:`~sklearn.linear_model.QuantileRegressor` than
# :class:`~sklearn.linear_model.LinearRegression`. In contrast to that, MSE is
# lower for :class:`~sklearn.linear_model.LinearRegression` than
# :class:`~sklearn.linear_model.QuantileRegressor`. These results confirm that
# MAE is the loss minimized by :class:`~sklearn.linear_model.QuantileRegressor`
# while MSE is the loss minimized by
# :class:`~sklearn.linear_model.LinearRegression`.
#
# We can make a similar evaluation looking at the test error obtained by
# cross-validation.
from sklearn.model_selection import cross_validate

cv_results_lr = cross_validate(
    linear_regression,
    X,
    y_pareto,
    cv=3,
    scoring=["neg_mean_absolute_error", "neg_mean_squared_error"],
)
cv_results_qr = cross_validate(
    quantile_regression,
    X,
    y_pareto,
    cv=3,
    scoring=["neg_mean_absolute_error", "neg_mean_squared_error"],
)
print(f"""Test error (cross-validated performance)
    {linear_regression.__class__.__name__}:
    MAE = {-cv_results_lr["test_neg_mean_absolute_error"].mean():.3f}
    MSE = {-cv_results_lr["test_neg_mean_squared_error"].mean():.3f}
    {quantile_regression.__class__.__name__}:
Example #19
# In[ ]:

Y = dftrain["Survived"]
Y_model = dftrain["Survived_Model"]
print("recall score on training set", recall_score(Y, Y_model))

# In[ ]:

print("precision score on training set", precision_score(Y, Y_model))

# In[ ]:

scores = cross_validate(clf,
                        dftrain,
                        dftrain["Survived"],
                        scoring=["f1", "accuracy"],
                        cv=10,
                        return_train_score=False)

# In[ ]:


def display_cross_validate(scores):
    print("cross val scores")
    print("f1 scores", scores["test_f1"])
    print("f1 mean", scores["test_f1"].mean())
    print("f1 std", scores["test_f1"].std())
    print("accuracy scores", scores["test_accuracy"])
    print("accuracy mean", scores["test_accuracy"].mean())
    print("accuracy std", scores["test_accuracy"].std())
Example #20

merged_data_set = merged_data_set.dropna()
print('merged:' + str(type(merged_data_set)))
print('df:' + str(type(df)))

columns = [
    'date_time', 'srch_ci', 'srch_co', 'user_id',
    'disc_orig_destination_distance', 'std_srch_children_cnt',
    'std_srch_adults_cnt'
]

merged_data_set = merged_data_set.drop(columns=columns, axis=1)
print('merged:' + str(type(merged_data_set)))
y = merged_data_set['hotel_cluster']
merged_data_set = merged_data_set.drop(['hotel_cluster'], axis=1)

X = merged_data_set
print('Going into the classifier')
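
# `scoring` is defined earlier in the original script; given the result keys
# used below, a compatible (assumed, illustrative) definition would be:
# scoring = {'accuracy': 'accuracy', 'precision': 'precision_macro',
#            'recall': 'recall_macro', 'f1_score': 'f1_macro'}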

resultMNB = cross_validate(LogisticRegression(multi_class='multinomial',
                                              solver='newton-cg'),
                           X,
                           y,
                           cv=KFold(n_splits=5, shuffle=True),
                           scoring=scoring)
print('Accuracy per fold =', resultMNB['test_accuracy'])
print('Mean Accuracy =', np.mean(resultMNB['test_accuracy']))
print('Mean Precision =', np.mean(resultMNB['test_precision']))
print('Mean Recall =', np.mean(resultMNB['test_recall']))
print('Mean F1 Score =', np.mean(resultMNB['test_f1_score']))
Example #21
    activity.replace([np.inf, -np.inf], np.nan, inplace=True)

data_concat = pd.concat(data)
y = data_concat['original']['power']
X = data_concat.drop('power', axis=1, level=1)
X.fillna(X.mean(), inplace=True)
groups = []
for group_idx, activity in enumerate(data):
    groups += [group_idx] * activity.shape[0]
groups = np.array(groups)

# note: GradientBoostingRegressor itself has no n_jobs parameter
scores = cross_validate(GradientBoostingRegressor(random_state=42),
                        X,
                        y,
                        groups=groups,
                        scoring=['r2', 'neg_median_absolute_error'],
                        cv=GroupKFold(n_splits=3),
                        n_jobs=1,
                        return_train_score=True,
                        verbose=0)

print('The obtained scores on training and testing in terms of '
      'R2 and median absolute error are: \n')
print(scores)

# Store the prediction for visualization
y_pred = cross_val_predict(GradientBoostingRegressor(random_state=42),
                           X,
                           y,
                           groups=groups,
Example #22
# histogram generation
#x.hist(bins=10, figsize=(9, 10))
#pl.savefig('histogramas')

# correlation matrix
#sns.heatmap(x.corr(), annot=True).figure.savefig('corr.png')

log_file = open('classification_scores-naive.txt', 'w+')

def report(scores, experimentName):
	print(experimentName)
	print('Mean accuracy on train: %0.2f' % (scores['train_score'].mean()))
	print('Standard deviation accuracy on train: %0.2f' % (scores['train_score'].std()))
	print('Mean accuracy on test: %0.2f' % (scores['test_score'].mean()))
	print('Standard deviation accuracy on test: %0.2f' % (scores['test_score'].std()))
	#writing test scores
	log_file.write('{} score per fold\n'.format(experimentName))
	for s in scores['test_score']:
		log_file.write('{}\n'.format(s))

naive = GaussianNB()
experimento = '*** NAIVE BAYES - No Scaler ***'
x_n = x
# training
cv_scores = cross_validate(naive, x, y, scoring='accuracy', cv=KFold(n_splits=10), return_train_score=True)
# results
report(cv_scores, experimento)
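
# The "No Scaler" label suggests companion runs with scaling; a minimal sketch
# of such a variant (an assumption, not part of the original script):
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

naive_scaled = make_pipeline(StandardScaler(), GaussianNB())
cv_scores_scaled = cross_validate(naive_scaled, x, y, scoring='accuracy', cv=KFold(n_splits=10), return_train_score=True)
report(cv_scores_scaled, '*** NAIVE BAYES - StandardScaler ***')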

log_file.close()
Example #23
"""
reg_H = linear_model.LinearRegression().fit(X, H)
reg_I = linear_model.LinearRegression().fit(X, I)
reg_J = linear_model.LinearRegression().fit(X, J)
"""

#print(r2(H, reg_H.predict(X)), r2(I, reg_I.predict(X)), r2(J, reg_J.predict(X)))

# Lasso is linear regression with L1 regularization (alpha controls its strength)
reg_H = linear_model.Lasso(alpha=0.1).fit(X, H)
reg_I = linear_model.Lasso(alpha=0.1).fit(X, I)
reg_J = linear_model.Lasso(alpha=0.1).fit(X, J)

H_results = cross_validate(reg_H,
                           X,
                           H,
                           cv=13,
                           scoring=('r2', 'neg_mean_squared_error'))
I_results = cross_validate(reg_I,
                           X,
                           I,
                           cv=13,
                           scoring=('r2', 'neg_mean_squared_error'))
J_results = cross_validate(reg_J,
                           X,
                           J,
                           cv=13,
                           scoring=('r2', 'neg_mean_squared_error'))

print("H_neg_MSE", np.mean(H_results['test_neg_mean_squared_error']))
print("I_neg_MSE", np.mean(I_results['test_neg_mean_squared_error']))
Example #24
    def _train(self):
        """Trains one iteration of the model called when ``tune.run`` is called.

        Different routines are run depending on if the ``early_stopping``
        attribute is True or not.

        If ``self.early_stopping`` is not None, each fold is fit with
        `partial_fit`, which stops training the model if the validation
        score is not improving for a particular fold.

        Otherwise, run the full cross-validation procedure.

        In both cases, the average test accuracy is returned over all folds,
        as well as the individual folds' accuracies as a dictionary.

        Returns:
            ret (:obj:`dict`): Dictionary of results as a basis for
                ``cv_results_`` for one of the cross-validation interfaces.

        """
        if self.early_stopping:
            for i, (train, test) in enumerate(self.cv.split(self.X, self.y)):
                X_train, y_train = _safe_split(self.estimator[i], self.X,
                                               self.y, train)
                X_test, y_test = _safe_split(
                    self.estimator[i],
                    self.X,
                    self.y,
                    test,
                    train_indices=train)
                self.estimator[i].partial_fit(X_train, y_train,
                                              np.unique(self.y))
                if self.return_train_score:
                    self.fold_train_scores[i] = self.scoring(
                        self.estimator[i], X_train, y_train)
                self.fold_scores[i] = self.scoring(self.estimator[i], X_test,
                                                   y_test)

            ret = {}
            total = 0
            for i, score in enumerate(self.fold_scores):
                total += score
                key_str = f"split{i}_test_score"
                ret[key_str] = score
            self.mean_score = total / len(self.fold_scores)
            ret["average_test_score"] = self.mean_score

            if self.return_train_score:
                total = 0
                for i, score in enumerate(self.fold_train_scores):
                    total += score
                    key_str = f"split{i}_train_score"
                    ret[key_str] = score
                self.mean_train_score = total / len(self.fold_train_scores)
                ret["average_train_score"] = self.mean_train_score

            return ret
        else:
            try:
                scores = cross_validate(
                    self.estimator,
                    self.X,
                    self.y,
                    cv=self.cv,
                    n_jobs=self.n_jobs,
                    fit_params=self.fit_params,
                    groups=self.groups,
                    scoring=self.scoring,
                    return_train_score=self.return_train_score,
                )
            except PicklingError:
                warnings.warn("An error occurred in parallelizing the cross "
                              "validation. Proceeding to cross validate with "
                              "one core.")
                scores = cross_validate(
                    self.estimator,
                    self.X,
                    self.y,
                    cv=self.cv,
                    fit_params=self.fit_params,
                    groups=self.groups,
                    scoring=self.scoring,
                    return_train_score=self.return_train_score,
                )

            ret = {}
            for i, score in enumerate(scores["test_score"]):
                key_str = f"split{i}_test_score"
                ret[key_str] = score
            self.test_accuracy = sum(scores["test_score"]) / len(
                scores["test_score"])
            ret["average_test_score"] = self.test_accuracy

            if self.return_train_score:
                for i, score in enumerate(scores["train_score"]):
                    key_str = f"split{i}_train_score"
                    ret[key_str] = score
                self.train_accuracy = sum(scores["train_score"]) / len(
                    scores["train_score"])
                ret["average_train_score"] = self.train_accuracy

            return ret
Example #25
    sgd = SGD(lr=0.02, momentum=0.01, decay=0, nesterov=False)
    model.compile(loss='categorical_crossentropy',
                  optimizer=sgd,
                  metrics=['accuracy'])

    return model


classifier = KerasClassifier(build_fn=create_model,
                             epochs=10,
                             batch_size=15,
                             verbose=1)

y = np_utils.to_categorical(y, 10)

scores = cross_validate(classifier, X, y, return_train_score=True)

print("Train Accuracy: %0.2f (+/- %0.2f)" %
      (scores['train_score'].mean(), scores['train_score'].std() * 2))
print("Test Accuracy: %0.2f (+/- %0.2f)" %
      (scores['test_score'].mean(), scores['test_score'].std() * 2))
print("Time: %0.6f (+/- %0.6f)" %
      (scores['score_time'].mean(), scores['score_time'].std() * 2))
exit()


def plot_learning_curve(estimator,
                        title,
                        X,
                        y,
                        ylim=None,
Example #26
print(os.listdir("."))

# # Solve it with a library

# In[ ]:

X = train.get(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'])
y = train['Survived']

clf = LogisticRegression()
skf = StratifiedKFold(shuffle=True)
scoring = {
    'acc': 'accuracy',
    'auc': 'roc_auc',
}
scores = cross_validate(clf, X, y, cv=skf, scoring=scoring)

print('Accuracy (mean):', scores['test_acc'].mean())
print('AUC (mean):', scores['test_auc'].mean())

# In[ ]:

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True)

clf = LogisticRegression()
clf.fit(X_train, y_train)

print(clf.intercept_)
print(clf.coef_)

# In[ ]:
Example #27
arrayOfYears = pickle.load(open("arrayOfYears.p", "rb"))
arrayOfGrades = pickle.load(open("arrayOfGrades.p", "rb"))
featuresToRemove = pickle.load(open("featuresToRemove.p", "rb"))
arrayOfGraduation = pickle.load(open("arrayOfGraduation.p", "rb"))
arrayOfJustGrades = []
for studYear in arrayOfYears:
    arrayOfJustGrades += [[studYear["GPA"]]]
vec = DictVectorizer()
vectorizedArrayOfYears = vec.fit_transform(arrayOfYears).toarray()

reg2 = linear_model.BayesianRidge()
reg2.fit(vectorizedArrayOfYears,arrayOfGrades)
a = zip(reg2.coef_,vec.get_feature_names())
gpaFeatures = list(a)
gpaFeatures.sort()


clf2 = svm.SVC(kernel="linear")
clf2.fit(vectorizedArrayOfYears, arrayOfGraduation)
b = zip(clf2.coef_[0],vec.get_feature_names())
gradFeatures = list(b)
gradFeatures.sort()

#regr = linear_model.BayesianRidge()
#clf = svm.SVC(kernel="linear")
clf2 = linear_model.LogisticRegression()
#clf3 = naive_bayes.GaussianNB()
scoring = ['precision', 'recall','accuracy','f1']
scores = cross_validate(clf2, vectorizedArrayOfYears, arrayOfGraduation, cv=10, scoring=scoring)
scores2 = cross_validate(clf2, arrayOfJustGrades, arrayOfGraduation, cv=10, scoring=scoring)
print("run time = "+str(timer()-start))
Example #28
    'max_depth': 8,
    'learning_rate': 0.25,
    'n_estimators': 200,
    'reg_alpha': 1.12,
    'lambda': 18.51,
    'subsample': 0.9,
}

model = XGBClassifier(**p_grid)

# clf = GridSearchCV( estimator = model,param_grid = p_grid, cv = 10 )

scoring = {'accuracy', 'precision', 'recall', 'f1', 'roc_auc'}

nested_scores = cross_validate(estimator=model,
                               X=inp,
                               y=y,
                               cv=10,
                               scoring=scoring)

op = pd.DataFrame(nested_scores)

op.to_csv(
    '/Users/shreyaspatel/Desktop/Machine_Learning/Aduri/Scores/Scores_2333.csv'
)

# clf.fit(X,y)
print(op)
# print( clf.best_params_ )
print("--- %s seconds ---" % (time.time() - start_time))
Example #29
    DecisionTreeClassifier(random_state=0),
    RandomForestClassifier(random_state=0),
    SVC(kernel="linear", random_state=0),
    RidgeClassifier(random_state=0),
    LogisticRegression(solver='lbfgs', tol=1e-3, max_iter=400, random_state=0),
    SGDClassifier(loss="log", random_state=0),
    MLPClassifier(early_stopping=True, random_state=0),
    AdaBoostClassifier(random_state=0),
    KNeighborsClassifier(3)
]

cv_scores = pd.DataFrame(columns=['Classifier', 'Precision', 'Recall', 'F1'])
for i, clf in enumerate(classifiers):
    s = cross_validate(clf,
                       X_train,
                       y_train,
                       scoring=['recall', 'precision', 'f1'],
                       cv=3,
                       return_train_score=False)
    cv_scores.loc[i] = [
        clf.__class__.__name__, s['test_precision'].mean(),
        s['test_recall'].mean(), s['test_f1'].mean()
    ]

clf = MLPClassifier(early_stopping=True, random_state=0)
clf.fit(X_train, y_train)

instances_ = []

for i in range(200, 300):
    text_file = os.path.join(TEST_DOCS_DIR, f'{i}.txt')
Example #30
start_time = time.time()
model = model.fit(images, labels)
print("Train LINEAR SVC --- %s seconds ---" % (time.time() - start_time))

start_time = time.time()
basic_score = model.score(images_validation, labels_validation)
print("Validation LINEAR SVC --- %s seconds ---" % (time.time() - start_time))

print("Linear SVC scikit learn basic score: %0.4f" % basic_score)

# Validating the model and evaluation
start_time = time.time()
scores = cross_validate(model,
                        images_validation,
                        labels_validation,
                        cv=5,
                        scoring=('f1', 'roc_auc_ovo'),
                        return_train_score=True)
print("Cross Validation LINEAR SVC --- %s seconds ---" %
      (time.time() - start_time))

# this re-scores the fitted model on the holdout set; the CV metrics are in `scores`
holdout_score = model.score(images_validation, labels_validation)

print("Linear SVC scikit learn holdout score: %0.4f" % holdout_score)
print(scores)

pickle.dump(model, open(model_file, 'wb'))

# calculate the fpr and tpr for all thresholds of the classification

probs = model.predict_proba(images_validation)
Example #31
subset = rich[rich['Taxon'] == taxa[j]]
x = subset[covar]
y = subset[yval]

# find the best model params
tuner = aei.model.tune(x, y, n_splits=3)
tuner.GradientBoostRegressor(scoring='neg_mean_squared_error')

# clean up a deprecated param
del tuner.best_params['min_impurity_split']

# set up the model
gbr = ensemble.GradientBoostingRegressor(**tuner.best_params)

# run cross validation metrics
cv_score = model_selection.cross_validate(gbr, x, y, scoring=['r2', 'neg_mean_squared_error'])

# fit the model on all the data
gbr.fit(x, y)

# calculate the metrics
y_eval = gbr.predict(x)
rsq = metrics.r2_score(y, y_eval)
mse = metrics.mean_squared_error(y, y_eval)

# set the linear fit
z = np.polyfit(y_eval, y, 1)
f = np.poly1d(z)
x_new = np.linspace(y.min(), y.max(), 50)
y_new = f(x_new)
Example #32
                                random_state=rand_st)
    rgr.fit(data_train, target_train)

    # Reconstructed from garbled placeholders; assumes data_test/target_test
    # from the earlier split and sklearn.metrics imports.
    preds = rgr.predict(data_test)
    scores_RMSE = math.sqrt(mean_squared_error(target_test, preds))
    print('Decision Tree RMSE:', scores_RMSE)
    scores_Expl_Var = explained_variance_score(target_test, preds)
    print('Decision Tree Expl Var:', scores_Expl_Var)

####Cross-Val Regressors####
if binning == 0 and cross_val == 1:
    # Setup cross-val regression scorers (reconstructed from a garbled line)
    scorers = ['neg_mean_squared_error', 'explained_variance']

    #SciKit Decision Tree Regressor - Cross Val
    start_ts = time.time()
    rgr = DecisionTreeRegressor(criterion='mse',
                                splitter='best',
                                max_depth=None,
                                min_samples_split=3,
                                min_samples_leaf=1,
                                max_features=None,
                                random_state=rand_st)
    scores = cross_validate(rgr, data_np, target_np, scoring=scorers, cv=5)
    scores_RMSE = np.sqrt(-scores['test_neg_mean_squared_error'])
    scores_Expl_Var = scores['test_explained_variance']
    print("Decision Tree RMSE:: %0.2f (+/- %0.2f)" % ((scores_RMSE.mean()),
                                                      (scores_RMSE.std() * 2)))
    print("Decision Tree Expl Var: %0.2f (+/- %0.2f)" %
          ((scores_Expl_Var.mean()), (scores_Expl_Var.std() * 2)))
    print("CV Runtime:", time.time() - start_ts)
Example #33
    Y = Y[index].astype('float64')
    print(X.shape, Y.shape)
    print("train_model!")
    clf = RandomForestClassifier(n_estimators=100,
                                 min_samples_leaf=2,
                                 class_weight="balanced",
                                 n_jobs=10)
    #clf=LogisticRegression(class_weight="balanced",max_iter=500)
    #clf = SVC(probability=True,class_weight="balanced")
    #clf=KNeighborsClassifier(n_neighbors=5)

    scoring = ['roc_auc', 'recall', 'f1', 'average_precision', 'accuracy']
    scores = cross_validate(clf,
                            X,
                            Y,
                            cv=10,
                            n_jobs=10,
                            scoring=scoring,
                            return_train_score=True)  #,scoring='roc_auc'
    auc_v = scores['test_roc_auc']
    train_auc = scores['train_roc_auc']
    recall = scores['test_recall']
    f1 = scores['test_f1']
    aupr = scores['test_average_precision']
    acc = scores['test_accuracy']
    print(str(auc_v))
    print(str(train_auc))
    print("test_AUC: %0.4f (+/- %0.2f)" % (auc_v.mean(), auc_v.std() * 2))
    #print("train_AUC: %0.4f (+/- %0.2f)" % (train_auc.mean(), train_auc.std() * 2))
    print("recall: %0.4f (+/- %0.2f)" % (recall.mean(), recall.std() * 2))
    print("f1: %0.4f (+/- %0.2f)" % (f1.mean(), f1.std() * 2))
                          edgecolor="none",
                          linewidth=0)
    ax.legend([extra], [scores], loc="upper left")
    title = title + "\n Evaluation in {:.2f} seconds".format(elapsed_time)
    ax.set_title(title)


fig, axs = plt.subplots(2, 2, figsize=(9, 7))
axs = np.ravel(axs)

for ax, (name, est) in zip(
        axs, estimators + [("Stacking Regressor", stacking_regressor)]):
    start_time = time.time()
    score = cross_validate(est,
                           X,
                           y,
                           scoring=["r2", "neg_mean_absolute_error"],
                           n_jobs=-1,
                           verbose=0)
    elapsed_time = time.time() - start_time

    y_pred = cross_val_predict(est, X, y, n_jobs=-1, verbose=0)

    plot_regression_results(
        ax,
        y,
        y_pred,
        name,
        (r"$R^2={:.2f} \pm {:.2f}$" + "\n" +
         r"$MAE={:.2f} \pm {:.2f}$").format(
             np.mean(score["test_r2"]),
             np.std(score["test_r2"]),