def cross_validation(df, mapper):
    """Cross-validate a linear regression over the mapped features.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; must contain a ``Sales`` column used as the target.
    mapper : transformer
        Feature-extraction step (e.g. a DataFrameMapper) applied first.

    Returns
    -------
    numpy.ndarray
        Per-fold cross-validation scores.
    """
    pipe = sklearn.pipeline.Pipeline([
        ('featurized', mapper),
        ('lm', sklearn.linear_model.LinearRegression()),
    ])
    result = cross_val_score(pipe, df.copy(), df.Sales)
    # BUG FIX: the 2 belongs inside np.round (the `decimals` argument), not as
    # a second argument to print -- the original printed integer-rounded
    # scores followed by a stray "2".
    print(np.round(result, 2))
    return result  # expose the scores to callers (previously discarded)
def test_with_car_dataframe(cars_dataframe):
    """Bag-of-words over the description column should predict the car model."""
    features = cars_dataframe.drop("model", axis=1)
    target = cars_dataframe["model"]
    estimator = Pipeline([
        ("preprocess", DataFrameMapper([("description", CountVectorizer())])),
        ("classify", SVC(kernel="linear")),
    ])
    fold_scores = cross_val_score(estimator, features, target)
    assert fold_scores.mean() > 0.30
def test_with_car_dataframe(cars_dataframe):
    """A linear SVC on vectorized descriptions beats 0.30 mean CV accuracy."""
    vectorize = DataFrameMapper([("description", CountVectorizer())])
    model = Pipeline([("preprocess", vectorize),
                      ("classify", SVC(kernel='linear'))])
    X = cars_dataframe.drop("model", axis=1)
    y = cars_dataframe["model"]
    assert cross_val_score(model, X, y).mean() > 0.30
def test_with_iris_dataframe(iris_dataframe):
    """Pass-through iris measurements + linear SVC: high, stable CV accuracy."""
    measurement_columns = [
        "petal length (cm)",
        "petal width (cm)",
        "sepal length (cm)",
        "sepal width (cm)",
    ]
    mapper = DataFrameMapper([(col, None) for col in measurement_columns])
    pipeline = Pipeline([("preprocess", mapper),
                         ("classify", SVC(kernel='linear'))])
    X = iris_dataframe.drop("species", axis=1)
    y = iris_dataframe["species"]
    scores = cross_val_score(pipeline, X, y)
    assert scores.mean() > 0.96
    assert (scores.std() * 2) < 0.04
def test_with_iris_dataframe(iris_dataframe):
    """A no-op feature mapper feeding a linear SVC classifies iris reliably."""
    feature_map = DataFrameMapper([
        ("petal length (cm)", None),
        ("petal width (cm)", None),
        ("sepal length (cm)", None),
        ("sepal width (cm)", None),
    ])
    clf = Pipeline([("preprocess", feature_map),
                    ("classify", SVC(kernel='linear'))])
    labels = iris_dataframe["species"]
    data = iris_dataframe.drop("species", axis=1)
    fold_scores = cross_val_score(clf, data, labels)
    assert fold_scores.mean() > 0.96
    assert fold_scores.std() * 2 < 0.04
def logistic_001():
    """Score each feature column individually with a logistic regression.

    Loads the training data, binarizes the target (strictly positive vs.
    not), drops object-dtype columns, imputes missing values, then
    cross-validates a LogisticRegression on every single column.

    Returns
    -------
    list[tuple[int, numpy.ndarray]]
        ``(column_index, per_fold_scores)`` for each feature column.
    """
    X, y = classes.get_train_data()
    y = y > 0  # binary target: positive outcome vs. not
    remove_object = classes.RemoveObjectColumns()
    X = remove_object.fit_transform(X)
    imputer = Imputer()
    X = imputer.fit_transform(X)
    scores = []
    for i in range(X.shape[1]):
        clf = LogisticRegression()
        # BUG FIX: X[:, i] is 1-D and scikit-learn estimators require a 2-D
        # feature matrix; X[:, [i]] keeps the single column as shape (n, 1).
        # BUG FIX: the valid scorer name is 'roc_auc' ('roc' is rejected by
        # modern scikit-learn).
        s = cross_val_score(clf, X[:, [i]], y, scoring='roc_auc')
        scores.append((i, s))
    return scores  # previously computed and discarded
def cross_validation(df, mapper):
    """Run cross-validation of mapper + LinearRegression against ``df.Sales``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the ``Sales`` target column.
    mapper : transformer
        Featurization step applied ahead of the linear model.

    Returns
    -------
    numpy.ndarray
        Per-fold scores.
    """
    pipe = sklearn.pipeline.Pipeline([
        ('featurized', mapper),
        ('lm', sklearn.linear_model.LinearRegression()),
    ])
    result = cross_val_score(pipe, df.copy(), df.Sales)
    # BUG FIX: `2` was passed to print() instead of np.round(); move it into
    # np.round as the number of decimals.
    print(np.round(result, 2))
    return result
# NOTE(review): this fragment begins mid-way through a Pipeline step list --
# the opening `Pipeline([` (and possibly earlier steps) lies outside this view.
    ("one_hot_encoding", one_hot_encoding(categorical_features)),
    ("imputer", Imputer(axis=0, strategy='median')),
    ("random_forest", OneVsOneClassifier(RandomForestClassifier()))
])

# 5-fold shuffled cross-validation. NOTE(review): the inline comment below
# says "10 parts", which does not match n_splits=5 -- confirm which is intended.
kfold = KFold(n_splits=5, shuffle=True)
model = pipe_1.fit(x_train, y_train)
# model_file_path = '/Users/Aniket/Appzen/myenv/Source/semanticzen/learned_models/random_forest_baseline.pkl'
# joblib.dump(model, model_file_path)
# print '\n model : {0}'.format(model)
# print '\n Model is dumped to : {0}'.format(model_file_path)
scores = cross_val_score(
    model,               # steps to convert raw messages into models
    x_train,             # training data
    y_train,             # training labels
    cv=kfold,            # split data randomly into 10 parts: 9 for training, 1 for scoring
    scoring='accuracy',  # which scoring metric?
    n_jobs=-1,           # -1 = use all cores = faster
)
# Python 2 print statements (no parentheses) -- this module targets Python 2.
print '\n Train result : cross_validation'
print '\n Mean : {0}, std : (+/-) {1}'.format(scores.mean(), scores.std())
# NOTE(review): steps[3] implies at least four pipeline steps; only three are
# visible above, so an earlier step presumably precedes this fragment -- confirm.
trained_model = model.steps[3][1]
print '\n trained_model : {0}'.format(trained_model)
# Held-out evaluation on the test split.
y_prediction = model.predict(x_test)
report = classification_report(y_test, y_prediction)
print '\n ---------- Classification Report ------------'
print report
def crossval():
    """Run 5-fold CV on the module-level pipeline and print summary stats."""
    scores = cross_val_score(pipe, X_train, y_train, cv=5)
    print("Cross Validation Scores are: ", scores.round(3))
    print("Mean CrossVal score is: ", round(scores.mean(), 3))
    print("Std Dev CrossVal score is: ", round(scores.std(), 3))
# NOTE(review): fragment starts mid-script; `table_y` is read here before the
# first visible assignment, so an earlier definition precedes this view.
for i, v in table_y.iteritems():  # .iteritems(): pandas < 0.21 / Python 2 era
    print("\t" + i + " : " + repr(v))

# Class distribution of the SOILCLASS target column.
table_y = table['SOILCLASS'].value_counts()
print("Dataset features a total of " + repr(len(table_y)) + " soil classes.")
for i, v in table_y.iteritems():
    print("\t" + i + " : " + repr(v))

print("Training and evaluating classifier through 10-fold cross-validation...")
# NOTE(review): the XGBClassifier instance is created and then immediately
# overwritten by the RandomForestClassifier -- the XGB model is never used
# here; confirm which classifier was intended.
classifier = XGBClassifier(n_estimators=100, n_jobs=5)
classifier = sklearn.ensemble.RandomForestClassifier(n_estimators=1000, n_jobs=5)
pipe = sklearn.pipeline.Pipeline([('featurize', mapper), ('classify', classifier)])
# 10-fold CV; the custom scorer presumably records/prints a classification
# report as a side effect in addition to returning accuracy -- TODO confirm.
aux = cross_val_score(
    pipe, X=table, y=table.SOILCLASS,
    scoring=make_scorer(classification_report_with_accuracy_score), cv=10)
print("Overall results...")
print("Accuracy : " + repr(aux.mean()))
classification_report_with_accuracy_score(test_results_y_true, test_results_y_pred)

print("Training classification model on complete dataset...")
train_data = mapper.fit_transform(table)
# Column 0 of the mapped matrix is treated as the label; the rest as features.
classifier.fit(train_data[0:train_data.shape[0], 1:train_data.shape[1]],
               train_data[0:train_data.shape[0], 0])
joblib.dump(classifier, 'classification-model.joblib')

print("Infering the feature ranking within the classification model...")
# NOTE(review): the body of this `if` is truncated at the end of the fragment.
if isinstance(classifier, XGBClassifier):
def classifiers_comparison():
    """Compare several classifiers' ROC curves via cross-validation and plot.

    Trains each candidate spam classifier with its tuned hyper-parameters,
    collects interpolated ROC curves through 5-fold stratified CV, and saves
    a two-panel figure (full ROC plus a zoom on the top-left corner).

    NOTE(review): Python-2-era code -- ``dict.values()[0]``, ``params.keys()``
    passed directly, and the old ``StratifiedKFold(labels, n_folds)`` API.
    """
    # (display name, estimator, tuned pipeline parameters). Names are Polish:
    # logistic regression, naive Bayes, linear/RBF SVM, random forest.
    classifiers = [
        ("Regresja logistyczna", LogisticRegression(), {'classifier__C': 5.0}),
        ("Naiwny klas. bayesowski", MultinomialNB(), {'classifier__alpha': 0.1}),
        ("SVM (liniowy)", SVC(kernel='linear', probability=True),
         {'classifier__C': 3.5, 'features__text_words': 500, 'features__subject_words': 50}),
        ("SVM (RBF)", SVC(kernel='rbf', probability=True),
         {'classifier__C': 0.5, 'classifier__gamma': 0.1, 'features__text_words': 500, 'features__subject_words': 50}),
        ("Las drzew losowych", RandomForestClassifier(), {'classifier__n_estimators': 100}),
    ]
    clf_count = len(classifiers)
    train_mails = parse_mails(COMPLETE_ALL['filename'])
    train_labels = COMPLETE_ALL['label']
    plt.figure(figsize=(8, 12))
    for (clf_name, clf, params), (ls, lc) in zip(classifiers, linestyles_gen()):
        model = AntispamModel(clf)
        model.spam_filter.set_params(**params)
        # Old-style StratifiedKFold signature: (labels, n_folds).
        cv = StratifiedKFold(train_labels, 5)
        # The scorer accumulates interpolated ROC points as a side effect of
        # cross_val_score; the returned scores themselves are discarded.
        scorer = ROCScorer(params.keys())
        cross_val_score(model.spam_filter, train_mails, train_labels,
                        cv=cv, scoring=scorer, verbose=2)
        score = scorer.interp_scores.values()[0]  # Python 2: values() is a list
        label = clf_name
        # Draw the same curve on both panels (full view and zoomed view).
        plt.subplot(2, 1, 1)
        score.plot(label=label, lc=lc, ls=ls, fill_alpha=0.5 / clf_count)
        plt.subplot(2, 1, 2)
        score.plot(label=label, lc=lc, ls=ls, fill_alpha=0.5 / clf_count)
    # Top panel: full ROC with a dashed box marking the zoom region below.
    plt.subplot(2, 1, 1)
    plt.grid(True)
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.legend(loc='lower right', fontsize='medium')
    plt.gca().add_patch(
        plt.Rectangle((0, 0.8), 0.2, 0.2, ls='dashed', fc='none')
    )
    plt.xlim(-0.05, 1)
    plt.ylim(0, 1.05)
    # Bottom panel: zoom on the high-TPR / low-FPR corner.
    plt.subplot(2, 1, 2)
    plt.grid(True)
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.xlim(0, 0.2)
    plt.ylim(0.8, 1)
    plt.savefig('doc/charts/ROC_ALL.png')
    plt.show()
def cv_score(self, train_data, labels):
    """Cross-validate the spam filter and return the per-fold F1 scores.

    Parameters
    ----------
    train_data : array-like
        Training samples fed to ``self.spam_filter``.
    labels : array-like
        Ground-truth labels.

    Returns
    -------
    numpy.ndarray
        One (binary) F1 score per CV fold.
    """
    # BUG FIX: ``score_func`` was removed from cross_val_score in
    # scikit-learn >= 0.18; the built-in 'f1' scorer is equivalent to
    # f1_score with its default (binary) arguments.
    return cross_val_score(self.spam_filter, train_data, labels,
                           scoring='f1')
# Baseline: linear regression on the log1p-transformed target.
model = LinearRegression(normalize=True)  # NOTE(review): `normalize=` removed in sklearn >= 1.2
model.fit(X_train, np.log1p(y_train))
# Undo the log1p transform (exp(x) - 1) to score on the original scale.
pred = np.exp(model.predict(X_test)) - 1
from sklearn.metrics import mean_absolute_error
print(mean_absolute_error(y_test, pred))

from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.linear_model import Ridge
# Ridge with alpha tuned by an inner grid search; wrapping the GridSearchCV
# in the outer cross_val_score gives a nested-CV RMSE estimate (log scale).
pipe_ridge = make_pipeline(preprocessing_features, Ridge())
param_grid = {'ridge__alpha': [0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100]}
pipe_ridge_gs = GridSearchCV(pipe_ridge, param_grid=param_grid,
                             scoring='neg_mean_squared_error', cv=3)
result = np.sqrt(-cross_val_score(pipe_ridge_gs, X_train, np.log1p(y_train),
                                  scoring='neg_mean_squared_error', cv=5))
np.mean(result)  # NOTE(review): mean RMSE computed but never printed or stored
pipe_ridge_gs.fit(X_train, np.log1p(y_train))
predicted = np.exp(pipe_ridge_gs.predict(X_test)) - 1
predicted = predicted.round()  # round to whole numbers (presumably a count target -- confirm)
print(mean_absolute_error(y_test, predicted))

# Score the held-out TEST file and write the predictions out.
df_TEST = pd.read_csv(path + file_test)
df_TEST.week_start_date = pd.to_datetime(df_TEST.week_start_date, yearfirst=True)
predicted_TEST = np.exp(pipe_ridge_gs.predict(df_TEST)) - 1
pd.DataFrame(predicted_TEST).to_csv(path + 'TEST.csv')
# NOTE(review): fragment opens mid-way through an estimator/pipeline
# constructor -- the estimator name and the opening brackets are outside
# this view. (Original comments were in Chinese; translated below.)
    alpha=0.0001, learning_rate='adaptive', learning_rate_init=0.001,
    max_iter=1000))])
# Fit the model
nn.fit(data, label)
# Predict
nn_predict = nn.predict(X_test)
# Evaluate
# basic score
nn_score = nn.score(X_test, y_test)
print(nn_score)
# Cross-validation. NOTE(review): nn_cross2 runs 10-fold CV on the *test*
# split, which re-fits the model on test data -- confirm this is intentional.
nn_cross1 = cross_val_score(nn, X_train, y_train, scoring='accuracy', cv=10, n_jobs=-1)
nn_cross2 = cross_val_score(nn, X_test, y_test, scoring='accuracy', cv=10, n_jobs=-1)
print(nn_cross1)
print(nn_cross2)
# # scores1.append(nn_cross1.mean())
# # scores2.append(nn_cross2.mean())
# # print(nn_cross1.mean())
# # print(nn_cross2.mean())
# # plt.plot(scores1, linestyle='-', color='r', label='train')
# NOTE(review): fragment opens inside a DataFrameMapper feature list -- each
# (column, None) pair passes a soil-property column through unchanged. The
# names look like <property><depth> codes -- confirm against the data
# dictionary.
    ('UHDICM30', None), ('UHDICM40', None),
    ('LHDICM00', None), ('LHDICM10', None), ('LHDICM20', None),
    ('LHDICM30', None), ('LHDICM40', None),
    ('CRFVOL00', None), ('CRFVOL10', None), ('CRFVOL20', None),
    ('CRFVOL30', None), ('CRFVOL40', None),
    ('SNDPPT00', None), ('SNDPPT10', None), ('SNDPPT20', None),
    ('SNDPPT30', None), ('SNDPPT40', None),
    ('SLTPPT00', None), ('SLTPPT10', None), ('SLTPPT20', None),
    ('SLTPPT30', None), ('SLTPPT40', None),
    ('CLYPPT00', None), ('CLYPPT10', None), ('CLYPPT20', None),
    ('CLYPPT30', None), ('CLYPPT40', None),
    ('BLD00', None), ('BLD10', None), ('BLD20', None),
    ('BLD30', None), ('BLD40', None),
    ('PHIHOX00', None), ('PHIHOX10', None), ('PHIHOX20', None),
    ('PHIHOX30', None), ('PHIHOX40', None),
    ('PHIKCL00', None), ('PHIKCL10', None), ('PHIKCL20', None),
    ('PHIKCL30', None), ('PHIKCL40', None),
    ('ORCDRC00', None), ('ORCDRC10', None), ('ORCDRC20', None),
    ('ORCDRC30', None), ('ORCDRC40', None),
    ('CECSUM00', None), ('CECSUM10', None), ('CECSUM20', None),
    ('CECSUM30', None), ('CECSUM40', None)
])
# NOTE(review): `table_y` is indexed here before any visible assignment --
# this probably should read `table['WRB_2006_NAMEf_2']`; confirm upstream.
table_y = table_y['WRB_2006_NAMEf_2'].value_counts()
print("Dataset features a total of " + repr(len(table_y)) + " soil classes.")
print("Training and evaluating classifier through 10-fold cross-validation...")
classifier = sklearn.ensemble.RandomForestClassifier(n_estimators=100)
#classifier = GCForest(get_gcforest_config())
pipe = sklearn.pipeline.Pipeline([('featurize', mapper), ('classify', classifier)])
# 10-fold CV; the custom scorer wraps accuracy and (presumably) emits a
# classification report as a side effect -- TODO confirm.
cross_val_score(pipe, X=table, y=table.WRB_2006_NAMEf_2,
                scoring=make_scorer(classification_report_with_accuracy_score), cv=10)