def neural_net(features, target, test_size_percent=0.2, cv_split=3, n_iter=100, learning_rate=0.01):
    """Train and cross-validate a small sknn neural-network regressor.

    features -> pandas DataFrame with attributes as columns
    target   -> pandas DataFrame with the target column for prediction
    test_size_percent -> fraction of the data points held out for testing
    cv_split -> number of CV folds
    n_iter   -> training iterations for the network
    learning_rate -> SGD learning rate
    Returns the fitted Regressor.
    """
    scale = preprocessing.MinMaxScaler()
    X_array = scale.fit_transform(features)
    y_array = scale.fit_transform(target)
    mlp = Regressor(layers=[Layer("Rectifier", units=5),   # Hidden Layer1
                            Layer("Rectifier", units=3),   # Hidden Layer2
                            Layer("Linear")],              # Output Layer
                    n_iter=n_iter,
                    # BUG FIX: 0.01 was hard-coded here, silently ignoring the
                    # learning_rate argument.
                    learning_rate=learning_rate)
    X_train, X_test, y_train, y_test = train_test_split(
        X_array, y_array.T.squeeze(), test_size=test_size_percent, random_state=4)
    mlp.fit(X_train, y_train)
    test_prediction = mlp.predict(X_test)
    # NOTE: cv receives tscv.n_splits (a plain int), i.e. ordinary K-fold —
    # cross_val_predict requires each sample in exactly one test fold, which
    # TimeSeriesSplit itself does not provide.
    tscv = TimeSeriesSplit(cv_split)
    training_score = cross_val_score(mlp, X_train, y_train, cv=tscv.n_splits)
    testing_score = cross_val_score(mlp, X_test, y_test, cv=tscv.n_splits)
    # FIX: Python 3 print function (originals were Python 2 print statements);
    # a single pre-formatted string prints identically under Python 2 too.
    print("Cross-val Training score: {}".format(training_score.mean()))
    # print("Cross-val Testing score: {}".format(testing_score.mean()))
    training_predictions = cross_val_predict(mlp, X_train, y_train, cv=tscv.n_splits)
    testing_predictions = cross_val_predict(mlp, X_test, y_test, cv=tscv.n_splits)
    training_accuracy = metrics.r2_score(y_train, training_predictions)
    test_accuracy = metrics.r2_score(y_test, testing_predictions)
    # print("Cross-val predicted accuracy: {}".format(training_accuracy))
    print("Test-predictions accuracy: {}".format(test_accuracy))
    plot_model(target, y_train, y_test, training_predictions, testing_predictions)
    return mlp
def main():
    """Benchmark ECOBELM vs ELM on iris: mean 5-fold CV accuracy over 10 runs.

    FIX: the 10-repetition averaging loop was duplicated verbatim for the two
    models; it is now a single helper.
    """
    from sklearn import preprocessing
    from sklearn.datasets import fetch_mldata
    from sklearn.model_selection import cross_val_score

    def average_cv_accuracy(estimator, data, target, runs=10):
        # Mean of `runs` repetitions of 5-fold CV accuracy.
        total = 0
        for _ in range(runs):
            scores = cross_val_score(estimator, data, target,
                                     cv=5, scoring='accuracy')
            total += scores.mean()
        return total / runs

    db_name = 'iris'
    hid_num = 1000
    # NOTE(review): fetch_mldata was removed in modern scikit-learn; this
    # snippet targets the older API — confirm before upgrading sklearn.
    data_set = fetch_mldata(db_name)
    data_set.data = preprocessing.scale(data_set.data)
    print(db_name)

    print('ECOBELM', hid_num)
    e = ECOBELM(hid_num, c=2**5)
    print("Accuracy: %0.2f " % average_cv_accuracy(e, data_set.data, data_set.target))

    print('ELM', hid_num)
    e = ELM(hid_num)
    print("Accuracy: %0.2f " % average_cv_accuracy(e, data_set.data, data_set.target))
def test_cross_val_score_fit_params():
    """cross_val_score must forward every fit_params entry — dense arrays,
    sparse matrices, and plain scalars/objects — to the estimator untouched."""
    clf = MockClassifier()
    n_samples = X.shape[0]
    n_classes = len(np.unique(y))

    DUMMY_INT = 42
    DUMMY_STR = '42'
    DUMMY_OBJ = object()

    W_sparse = coo_matrix((np.array([1]), (np.array([1]), np.array([0]))),
                          shape=(10, 1))
    P_sparse = coo_matrix(np.eye(5))

    def assert_fit_params(clf):
        # Invoked by the classifier during fit: verify the non-array values
        # arrived unchanged.
        assert_equal(clf.dummy_int, DUMMY_INT)
        assert_equal(clf.dummy_str, DUMMY_STR)
        assert_equal(clf.dummy_obj, DUMMY_OBJ)

    fit_params = {
        'sample_weight': np.ones(n_samples),
        'class_prior': np.ones(n_classes) / n_classes,
        'sparse_sample_weight': W_sparse,
        'sparse_param': P_sparse,
        'dummy_int': DUMMY_INT,
        'dummy_str': DUMMY_STR,
        'dummy_obj': DUMMY_OBJ,
        'callback': assert_fit_params,
    }
    cross_val_score(clf, X, y, fit_params=fit_params)
def test_score_memmap():
    """A scalar np.memmap score must be accepted; a non-scalar one must raise."""
    iris = load_iris()
    X, y = iris.data, iris.target
    clf = MockClassifier()
    tf = tempfile.NamedTemporaryFile(mode='wb', delete=False)
    tf.write(b'Hello world!!!!!')
    tf.close()
    scores = np.memmap(tf.name, dtype=np.float64)          # array-valued
    score = np.memmap(tf.name, shape=(), mode='r', dtype=np.float64)  # scalar
    try:
        cross_val_score(clf, X, y, scoring=lambda est, X, y: score)
        # non-scalar should still fail
        assert_raises(ValueError, cross_val_score, clf, X, y,
                      scoring=lambda est, X, y: scores)
    finally:
        # Best effort to release the mmap file handles before deleting the
        # backing file under Windows
        scores, score = None, None
        for _ in range(3):
            try:
                os.unlink(tf.name)
                break
            except OSError:
                # BUG FIX: the original caught WindowsError, which is undefined
                # on non-Windows Python 3 — a failing unlink would have raised
                # NameError there. OSError is its base class on Windows and
                # exists on every platform.
                sleep(1.)
def _cross_validation(self, sentences, labels, intent_features, spacy_nlp, max_ngrams):
    """Choose the best number of ngrams to include in the bag-of-words vectors.

    Given an intent classification problem and a set of ordered ngrams
    (ordered in terms of importance by pick_applicable_ngrams) we choose the
    best number of ngrams to include in our bow vecs by cross validation.
    Returns the winning ngram count, or max_ngrams unchanged when there are
    too few examples per intent to cross-validate.
    """
    from sklearn import preprocessing
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import cross_val_score
    import numpy as np

    clf2 = LogisticRegression(class_weight='balanced')
    # Encode string intent labels as integers for the classifier.
    intent_encoder = preprocessing.LabelEncoder()
    intent_encoder.fit(labels)
    y = intent_encoder.transform(labels)
    # Folds capped at 10 and at the size of the rarest intent class;
    # 0 when there are no labels at all.
    cv_splits = min(10, np.min(np.bincount(y))) if y.size > 0 else 0
    if cv_splits >= 3:
        logger.debug("Started ngram cross-validation to find best number of ngrams to use...")
        # Candidate counts: up to 8 distinct integers evenly spread over [1, max_ngrams].
        num_ngrams = np.unique(list(map(int, np.floor(np.linspace(1, max_ngrams, 8)))))
        # Baseline score with no ngram features at all.
        no_ngrams_X = self._create_bow_vecs(intent_features, sentences, spacy_nlp, max_ngrams=0)
        no_ngrams_score = np.mean(cross_val_score(clf2, no_ngrams_X, y, cv=cv_splits))
        scores = []
        for n in num_ngrams:
            X = self._create_bow_vecs(intent_features, sentences, spacy_nlp, max_ngrams=n)
            score = np.mean(cross_val_score(clf2, X, y, cv=cv_splits))
            scores.append(score)
            logger.debug("Evaluating usage of {} ngrams. Score: {}".format(n, score))
        # Best candidate by mean CV score (the no-ngram baseline is only logged).
        n_top = num_ngrams[np.argmax(scores)]
        logger.debug("Score without ngrams: {}".format(no_ngrams_score))
        logger.info("Best score with {} ngrams: {}".format(n_top, np.max(scores)))
        return n_top
    else:
        warnings.warn("Can't cross-validate ngram featurizer. There aren't enough examples per intent (at least 3)")
        return max_ngrams
def three_models_combined(self, intrusion_features, avoidance_features, hypertension_features):
    """Train three per-symptom classifiers and combine them by consensus.

    Each of the intrusion / avoidance / hypertension classifiers is
    cross-validated (precision) and refit on the full training split; the
    combined prediction on the held-out test split is positive only when all
    three sub-models predict positive.
    NOTE(review): 'hypertention' spelling follows the dataset's column names.
    """
    # Drop rows missing any of the three cutoff labels.
    self.df = self.df[~self.df['intrusion_cutoff'].isna()]
    self.df = self.df[~self.df['avoidance_cutoff'].isna()]
    self.df = self.df[~self.df['hypertention_cutoff'].isna()]
    print("self.df.shape", self.df.shape)
    X = self.df
    Y = self.df[self.target]# strict
    all_Y = [self.target, "intrusion_cutoff", "avoidance_cutoff", "hypertention_cutoff"]
    # Stratify on the strict target so both splits keep its class balance.
    X_train, X_test, y_train, y_test = train_test_split(X, self.df[all_Y], test_size=0.25, random_state = 8526566, stratify=Y)
    # intrusion
    X_intrusion = X_train[intrusion_features].values
    y_intrusion = y_train["intrusion_cutoff"].apply(lambda x: int(x))
    # NOTE(review): the step named 'rfe' is actually a BorderlineSMOTE
    # oversampler, not feature elimination — consider renaming.
    pipe_intrusion = Pipeline(steps=[
        ('rfe', BorderlineSMOTE()),
        ('classifier', XGBClassifier(n_estimators=100, reg_alpha=1))])
    # Mean precision over 5 stratified folds (sum/5 == mean for 5 folds).
    scores = cross_val_score(pipe_intrusion, X_intrusion, y_intrusion, scoring='precision', cv=StratifiedKFold(5))
    print(f"intrusion {sum(scores)/5}")
    pipe_intrusion.fit(X_intrusion, y_intrusion)
    # avoidance
    X_avoidance = X_train[avoidance_features].values
    y_avoidance = y_train["avoidance_cutoff"].apply(lambda x: int(x))
    pipe_avoidance = Pipeline(steps=[
        ('classifier', XGBClassifier(n_estimators=100, scale_pos_weight=3, reg_alpha=1))])
    scores = cross_val_score(pipe_avoidance, X_avoidance, y_avoidance, scoring='precision', cv=StratifiedKFold(5))
    print(f"avoidance {sum(scores)/5}")
    pipe_avoidance.fit(X_avoidance, y_avoidance)
    # hypertension
    X_hypertension = X_train[hypertension_features].values
    y_hypertention = y_train["hypertention_cutoff"].apply(lambda x: int(x))
    pipe_hypertension = Pipeline(steps=[
        ('classifier', BalancedBaggingClassifier(n_estimators=100))])
    scores = cross_val_score(pipe_hypertension, X_hypertension, y_hypertention, scoring='precision', cv=StratifiedKFold(5))
    print(f"hypertension {sum(scores)/5}")
    pipe_hypertension.fit(X_hypertension, y_hypertention)
    ## combine three classifiers
    X_test_hypertension = X_test[hypertension_features].values
    X_test_avoidance = X_test[avoidance_features].values
    X_test_intrusion = X_test[intrusion_features].values
    y_pred_hypertension = pipe_hypertension.predict(X_test_hypertension)
    y_pred_avoidance = pipe_avoidance.predict(X_test_avoidance)
    y_pred_intrusion = pipe_intrusion.predict(X_test_intrusion)
    # Consensus: the product of 0/1 predictions is 1 only when all agree.
    y_pred = (y_pred_hypertension * y_pred_avoidance * y_pred_intrusion)
    y_target = y_test["PCL_Strict3"].apply(lambda x: int(x))
    acc = accuracy_score(y_target, y_pred)
    f1 = f1_score(y_target, y_pred)
    recall = recall_score(y_target, y_pred)
    precision = precision_score(y_target, y_pred)
    print("test scores")
    print(f"acc-{acc}, f1- {f1}, recall-{recall}, precision - {precision}")
def Random_forest(features, target, test_size_percent=0.2, cv_split=3):
    """Fit a RandomForestRegressor and report cross-validated R^2 scores.

    features -> pandas DataFrame with attributes as columns
    target   -> pandas DataFrame with the target column for prediction
    test_size_percent -> fraction of the data points held out for testing
    cv_split -> number of CV folds
    Returns the fitted model.
    """
    # FIX: DataFrame.as_matrix() was removed in pandas 1.0; .values is the
    # long-standing equivalent.
    X_array = features.values
    y_array = target.values
    model_rdf = RandomForestRegressor()
    X_train, X_test, y_train, y_test = train_test_split(
        X_array, y_array.T.squeeze(), test_size=test_size_percent, random_state=4)
    model_rdf.fit(X_train, y_train)
    test_prediction = model_rdf.predict(X_test)
    # NOTE: cv receives tscv.n_splits (a plain int), i.e. ordinary K-fold —
    # cross_val_predict requires each sample in exactly one test fold, which
    # TimeSeriesSplit itself does not provide.
    tscv = TimeSeriesSplit(cv_split)
    training_score = cross_val_score(model_rdf, X_train, y_train, cv=tscv.n_splits)
    testing_score = cross_val_score(model_rdf, X_test, y_test, cv=tscv.n_splits)
    # FIX: Python 3 print function (originals were Python 2 print statements);
    # a single pre-formatted string prints identically under Python 2 too.
    print("Cross-val Training score: {}".format(training_score.mean()))
    # print("Cross-val Testing score: {}".format(testing_score.mean()))
    training_predictions = cross_val_predict(model_rdf, X_train, y_train, cv=tscv.n_splits)
    testing_predictions = cross_val_predict(model_rdf, X_test, y_test, cv=tscv.n_splits)
    training_accuracy = metrics.r2_score(y_train, training_predictions)
    test_accuracy = metrics.r2_score(y_test, testing_predictions)
    # print("Cross-val predicted accuracy: {}".format(training_accuracy))
    print("Test-predictions accuracy: {}".format(test_accuracy))
    plot_model(target, y_train, y_test, training_predictions, testing_predictions)
    return model_rdf
def svm_regressor(features, target, test_size_percent=0.2, cv_split=5):
    """Fit an RBF-kernel SVR on min-max-scaled data and report CV R^2 scores.

    features -> pandas DataFrame with attributes as columns
    target   -> pandas DataFrame with the target column for prediction
    test_size_percent -> fraction of the data points held out for testing
    cv_split -> number of CV folds
    Returns the fitted SVR.
    """
    scale = preprocessing.MinMaxScaler()
    X_array = scale.fit_transform(features)
    y_array = scale.fit_transform(target)
    X_train, X_test, y_train, y_test = train_test_split(
        X_array, y_array.T.squeeze(), test_size=test_size_percent, random_state=4)
    svr = SVR(kernel='rbf', C=10, gamma=1)
    svr.fit(X_train, y_train.ravel())
    test_prediction = svr.predict(X_test)
    # NOTE: cv receives tscv.n_splits (a plain int), i.e. ordinary K-fold —
    # cross_val_predict requires each sample in exactly one test fold, which
    # TimeSeriesSplit itself does not provide.
    tscv = TimeSeriesSplit(cv_split)
    training_score = cross_val_score(svr, X_train, y_train, cv=tscv.n_splits)
    testing_score = cross_val_score(svr, X_test, y_test, cv=tscv.n_splits)
    # FIX: Python 3 print function (originals were Python 2 print statements);
    # a single pre-formatted string prints identically under Python 2 too.
    print("Cross-val Training score: {}".format(training_score.mean()))
    # print("Cross-val Testing score: {}".format(testing_score.mean()))
    training_predictions = cross_val_predict(svr, X_train, y_train, cv=tscv.n_splits)
    testing_predictions = cross_val_predict(svr, X_test, y_test, cv=tscv.n_splits)
    training_accuracy = metrics.r2_score(y_train, training_predictions)
    test_accuracy = metrics.r2_score(y_test, testing_predictions)
    # print("Cross-val predicted accuracy: {}".format(training_accuracy))
    print("Test-predictions accuracy: {}".format(test_accuracy))
    return svr
def linear_regression(features, target, test_size_percent=0.2, cv_split=5):
    """Fit ordinary least squares and report cross-validated R^2 scores.

    features -> pandas DataFrame with attributes as columns
    target   -> pandas DataFrame with the target column for prediction
    test_size_percent -> fraction of the data points held out for testing
    cv_split -> number of CV folds
    Returns the fitted LinearRegression model.
    """
    # FIX: DataFrame.as_matrix() was removed in pandas 1.0; .values is the
    # long-standing equivalent.
    X_array = features.values
    y_array = target.values
    ols = linear_model.LinearRegression()
    X_train, X_test, y_train, y_test = train_test_split(
        X_array, y_array.T.squeeze(), test_size=test_size_percent, random_state=4)
    ols.fit(X_train, y_train)
    # NOTE: cv receives tscv.n_splits (a plain int), i.e. ordinary K-fold —
    # cross_val_predict requires each sample in exactly one test fold, which
    # TimeSeriesSplit itself does not provide.
    tscv = TimeSeriesSplit(cv_split)
    training_score = cross_val_score(ols, X_train, y_train, cv=tscv.n_splits)
    testing_score = cross_val_score(ols, X_test, y_test, cv=tscv.n_splits)
    # FIX: Python 3 print function (originals were Python 2 print statements);
    # a single pre-formatted string prints identically under Python 2 too.
    print("Cross-val Training score: {}".format(training_score.mean()))
    # print("Cross-val Testing score: {}".format(testing_score.mean()))
    training_predictions = cross_val_predict(ols, X_train, y_train, cv=tscv.n_splits)
    testing_predictions = cross_val_predict(ols, X_test, y_test, cv=tscv.n_splits)
    training_accuracy = metrics.r2_score(y_train, training_predictions)
    test_accuracy = metrics.r2_score(y_test, testing_predictions)
    # print("Cross-val predicted accuracy: {}".format(training_accuracy))
    print("Test-predictions accuracy: {}".format(test_accuracy))
    plot_model(target, y_train, y_test, training_predictions, testing_predictions)
    return ols
def fit(self, X_train, y_train):
    """Fit three cutoff classifiers (intrusion / avoidance / hypertension) and
    a Ridge regression on X_train, printing CV precision for each sub-model
    and combined training-set scores against y_train.

    A sample is predicted positive only when all three classifiers and the
    thresholded regression agree.
    """
    # intrusion
    X_intrusion = X_train[self.features].values
    y_intrusion = X_train["intrusion_cutoff"].apply(lambda x: int(x))
    self.pipe_intrusion = Pipeline(steps=[
        ('rfe', RFE(XGBClassifier(n_estimators=self.n_estimators, reg_alpha=1,
                                  scale_pos_weight=3), self.rfe)),
        ('classifier', XGBClassifier(n_estimators=self.n_estimators,
                                     reg_alpha=1, scale_pos_weight=3))])
    # FIX: each pipeline was fitted both before and after cross_val_score;
    # cross_val_score works on clones, so the first fit was redundant work.
    # Mean precision over 5 stratified folds (sum/5 == mean for 5 folds).
    scores = cross_val_score(self.pipe_intrusion, X_intrusion, y_intrusion,
                             scoring='precision', cv=StratifiedKFold(5))
    print(f"intrusion {sum(scores)/5}")
    self.pipe_intrusion.fit(X_intrusion, y_intrusion)

    # avoidance
    X_avoidance = X_train[self.features].values
    y_avoidance = X_train["avoidance_cutoff"].apply(lambda x: int(x))
    self.pipe_avoidance = Pipeline(steps=[
        ('rfe', RFE(XGBClassifier(n_estimators=self.n_estimators, reg_alpha=1,
                                  scale_pos_weight=6), self.rfe)),
        ('classifier', XGBClassifier(n_estimators=self.n_estimators,
                                     reg_alpha=1, scale_pos_weight=6))])
    scores = cross_val_score(self.pipe_avoidance, X_avoidance, y_avoidance,
                             scoring='precision', cv=StratifiedKFold(5))
    print(f"avoidance {sum(scores)/5}")
    self.pipe_avoidance.fit(X_avoidance, y_avoidance)

    # hypertension (column spelling follows the dataset)
    X_hypertension = X_train[self.features].values
    y_hypertention = X_train["hypertention_cutoff"].apply(lambda x: int(x))
    self.pipe_hypertension = Pipeline(steps=[
        ('rfe', RFE(XGBClassifier(n_estimators=self.n_estimators, reg_alpha=1,
                                  scale_pos_weight=4), self.rfe)),
        ('classifier', XGBClassifier(n_estimators=self.n_estimators,
                                     reg_alpha=1, scale_pos_weight=4))])
    scores = cross_val_score(self.pipe_hypertension, X_hypertension,
                             y_hypertention, scoring='precision',
                             cv=StratifiedKFold(5))
    print(f"hypertension {sum(scores)/5}")
    self.pipe_hypertension.fit(X_hypertension, y_hypertention)

    # regression on the raw PCL3 score
    X_regression = X_train[self.features].values
    y_regression = X_train["PCL3"]
    self.pipe_regression = Pipeline(steps=[('classifier', Ridge())])
    self.pipe_regression.fit(X_regression, y_regression)

    # target: consensus of the three classifiers and the thresholded regression
    y_pred_hypertension = self.pipe_hypertension.predict(X_hypertension)
    y_pred_avoidance = self.pipe_avoidance.predict(X_avoidance)
    y_pred_intrusion = self.pipe_intrusion.predict(X_intrusion)
    y_pred_regression = self.pipe_regression.predict(X_regression) >= self.cutoff
    # FIX: dropped the duplicated `& y_pred_regression` term (a no-op).
    y_pred = (y_pred_hypertension & y_pred_avoidance & y_pred_intrusion
              & y_pred_regression)
    y_target = y_train
    acc = accuracy_score(y_target, y_pred)
    f1 = f1_score(y_target, y_pred)
    recall = recall_score(y_target, y_pred)
    precision = precision_score(y_target, y_pred)
    print("test scores")
    print(f"acc-{acc}, f1- {f1}, recall-{recall}, precision - {precision}")
def test_cross_val_score_allow_nans():
    """cross_val_score should accept NaN-containing input when the pipeline
    starts with an imputer."""
    features = np.arange(200, dtype=np.float64).reshape(10, -1)
    features[2, :] = np.nan
    labels = np.repeat([0, 1], features.shape[0] / 2)
    pipeline = Pipeline([
        ('imputer', Imputer(strategy='mean', missing_values='NaN')),
        ('classifier', MockClassifier()),
    ])
    cross_val_score(pipeline, features, labels, cv=5)
def tune_spam(X_train, y_train, alpha_list):
    """Pick the SVC regularisation value C from alpha_list that maximises
    5-fold cross-validated accuracy.

    Returns the best C; ties resolve to the earliest candidate.
    """
    val_accuracy = []
    for alpha in alpha_list:
        model = SVC(C=alpha)
        # FIX: cross_val_score was previously run TWICE per alpha (once for
        # the list, once just to print) — compute it once and reuse.
        score = np.mean(cross_val_score(model, X_train, y_train,
                                        cv=5, scoring='accuracy'))
        val_accuracy.append(score)
        # FIX: Python 3 print function (originals were Python 2 statements).
        print([score])
    max_index = val_accuracy.index(max(val_accuracy))
    print("CV_val_error:", val_accuracy)
    print("Best C:", alpha_list[max_index])
    return alpha_list[max_index]
def compute_scores(X):
    """Return (pca_scores, fa_scores): mean CV score of PCA and FactorAnalysis
    on X for each candidate in the global n_components."""
    pca = PCA(svd_solver='full')
    fa = FactorAnalysis()

    def mean_cv(model):
        # Mean cross-validated score for the model at its current setting.
        return np.mean(cross_val_score(model, X))

    pca_scores = []
    fa_scores = []
    for n_comp in n_components:
        pca.n_components = n_comp
        fa.n_components = n_comp
        pca_scores.append(mean_cv(pca))
        fa_scores.append(mean_cv(fa))
    return pca_scores, fa_scores
def test_k_fold_cv():
    """Test OneHotEncoder with categorical_features='auto'."""
    boston = load_boston()
    encoder = OneHotEncoder(categorical_features='auto',
                            sparse=False,
                            minimum_fraction=0.05)
    pipeline = make_pipeline(encoder, LinearRegression())
    splitter = KFold(n_splits=10, shuffle=True)
    cross_val_score(pipeline, boston.data, boston.target, cv=splitter)
def test_precomputed_cross_validation():
    """CV scores must be identical whether an estimator sees raw features or a
    precomputed euclidean distance matrix (i.e. the splitter slices both the
    same way)."""
    rng = np.random.RandomState(0)
    features = rng.rand(20, 2)
    distances = pairwise_distances(features, metric='euclidean')
    target = rng.randint(3, size=20)
    estimator_classes = (neighbors.KNeighborsClassifier,
                         neighbors.RadiusNeighborsClassifier,
                         neighbors.KNeighborsRegressor,
                         neighbors.RadiusNeighborsRegressor)
    for Est in estimator_classes:
        metric_score = cross_val_score(Est(), features, target)
        precomp_score = cross_val_score(Est(metric='precomputed'),
                                        distances, target)
        assert_array_equal(metric_score, precomp_score)
def get_results(dataset):
    """Compare imputation strategies on `dataset` via cross-validated MSE.

    Scores a RandomForestRegressor on (1) the full data, (2) data where 75% of
    rows get one feature zeroed out, (3) mean imputation of those zeros, and
    (4) chained imputation. Returns four (mean, std) tuples of the
    neg_mean_squared_error CV scores, in that order. Uses the module-level
    `rng` for reproducible corruption.
    """
    X_full, y_full = dataset.data, dataset.target
    n_samples = X_full.shape[0]
    n_features = X_full.shape[1]

    # Estimate the score on the entire dataset, with no missing values
    estimator = RandomForestRegressor(random_state=0, n_estimators=100)
    full_scores = cross_val_score(estimator, X_full, y_full,
                                  scoring='neg_mean_squared_error')

    # Add missing values in 75% of the lines
    missing_rate = 0.75
    n_missing_samples = int(np.floor(n_samples * missing_rate))
    # FIX: np.bool was a deprecated alias for the builtin and was removed in
    # NumPy 1.24; use plain `bool` for the mask dtype.
    missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples,
                                          dtype=bool),
                                 np.ones(n_missing_samples, dtype=bool)))
    rng.shuffle(missing_samples)
    missing_features = rng.randint(0, n_features, n_missing_samples)

    # Estimate the score after replacing missing values by 0
    X_missing = X_full.copy()
    X_missing[np.where(missing_samples)[0], missing_features] = 0
    y_missing = y_full.copy()
    estimator = RandomForestRegressor(random_state=0, n_estimators=100)
    zero_impute_scores = cross_val_score(estimator, X_missing, y_missing,
                                         scoring='neg_mean_squared_error')

    # Estimate the score after imputation (mean strategy) of the missing values
    X_missing = X_full.copy()
    X_missing[np.where(missing_samples)[0], missing_features] = 0
    y_missing = y_full.copy()
    estimator = Pipeline([("imputer", SimpleImputer(missing_values=0,
                                                    strategy="mean")),
                          ("forest", RandomForestRegressor(random_state=0,
                                                           n_estimators=100))])
    mean_impute_scores = cross_val_score(estimator, X_missing, y_missing,
                                         scoring='neg_mean_squared_error')

    # Estimate the score after chained imputation of the missing values
    # (reuses the X_missing built just above).
    estimator = Pipeline([("imputer", ChainedImputer(missing_values=0,
                                                     random_state=0)),
                          ("forest", RandomForestRegressor(random_state=0,
                                                           n_estimators=100))])
    chained_impute_scores = cross_val_score(estimator, X_missing, y_missing,
                                            scoring='neg_mean_squared_error')

    return ((full_scores.mean(), full_scores.std()),
            (zero_impute_scores.mean(), zero_impute_scores.std()),
            (mean_impute_scores.mean(), mean_impute_scores.std()),
            (chained_impute_scores.mean(), chained_impute_scores.std()))
def test_pairwise_cross_val_score():
    """CV scores with a precomputed linear kernel must match those of the
    equivalent linear-kernel SVC on raw features, under both OvR and OvO."""
    clf_precomputed = svm.SVC(kernel='precomputed')
    clf_notprecomputed = svm.SVC(kernel='linear')
    X, y = iris.data, iris.target
    # The Gram matrix of a linear kernel is simply X @ X.T.
    linear_kernel = np.dot(X, X.T)
    for wrapper_cls in (OneVsRestClassifier, OneVsOneClassifier):
        score_precomputed = cross_val_score(wrapper_cls(clf_precomputed),
                                            linear_kernel, y)
        score_linear = cross_val_score(wrapper_cls(clf_notprecomputed), X, y)
        assert_array_equal(score_precomputed, score_linear)
def generate_binary_crime_label():
    """Binarize 2013 crime counts at the median and compare SVC vs decision
    tree with 10-fold CV on the Corina features.

    Returns (raw counts y, binary label list, feature matrix F[1]); the labels
    are also pickled to the file "crime-label".
    """
    y = retrieve_crime_count(2013)
    threshold = np.median(y)
    label = [1 if ele >= threshold else 0 for ele in y]
    F = generate_corina_features()
    from sklearn import svm, tree
    from sklearn.model_selection import cross_val_score
    clf1 = svm.SVC()
    scores1 = cross_val_score(clf1, F[1], label, cv=10)
    # FIX: Python 3 print function (originals were Python 2 statements).
    print(scores1.mean(), scores1)
    clf2 = tree.DecisionTreeClassifier()
    scores2 = cross_val_score(clf2, F[1], label, cv=10)
    print(scores2.mean(), scores2)
    # FIX: pickle requires a binary file handle ('wb'); text mode ('w') breaks
    # under Python 3. The with-block also closes the handle deterministically
    # (the original leaked it).
    with open("crime-label", 'wb') as fh:
        pickle.dump(label, fh)
    return y, label, F[1]
def test_nested_cv():
    """Nested cross-validation must work for every pairing of these splitters
    as the inner (grid-search) and outer (scoring) CV."""
    rng = np.random.RandomState(0)
    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    labels = rng.randint(0, 5, 15)
    splitters = [LeaveOneLabelOut(), LeaveOneOut(), LabelKFold(),
                 StratifiedKFold(),
                 StratifiedShuffleSplit(n_iter=10, random_state=0)]
    for inner_cv, outer_cv in combinations_with_replacement(splitters, 2):
        grid = GridSearchCV(LinearSVC(random_state=0),
                            param_grid={'C': [1, 10]},
                            cv=inner_cv)
        cross_val_score(grid, X=X, y=y, labels=labels, cv=outer_cv,
                        fit_params={'labels': labels})
def test_cross_val_score_pandas():
    # check cross_val_score doesn't destroy pandas dataframe
    types = [(MockDataFrame, MockDataFrame)]
    try:
        from pandas import Series, DataFrame
    except ImportError:
        pass
    else:
        types.append((Series, DataFrame))
    for TargetType, InputFeatureType in types:
        # Wrap X as a dataframe and y as a series; the checking classifier
        # asserts the wrappers survive the CV machinery intact.
        X_df = InputFeatureType(X)
        y_ser = TargetType(y)
        clf = CheckingClassifier(
            check_X=lambda df: isinstance(df, InputFeatureType),
            check_y=lambda ser: isinstance(ser, TargetType))
        cross_val_score(clf, X_df, y_ser)
def do_mlp(x, y):
    """Score a small two-hidden-layer MLP on (x, y) with 5-fold CV, printing
    micro-averaged F1 and accuracy (mean +/- 2 std)."""
    clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                        hidden_layer_sizes=(5, 3), random_state=1)
    for label, scoring in (("f1", 'f1_micro'), ("accuracy", 'accuracy')):
        scores = cross_val_score(clf, x, y, cv=5, scoring=scoring)
        print("%s: %0.2f (+/- %0.2f)" % (label, scores.mean(), scores.std() * 2))
def test_cross_val_score_multilabel():
    """Multilabel targets: per-fold precision under micro/macro/samples
    averaging must match the known values for a 1-NN classifier."""
    X = np.array([[-3, 4], [2, 4], [3, 3], [0, 2], [-3, 1],
                  [-2, 1], [0, 0], [-2, -1], [-1, -2], [1, -2]])
    y = np.array([[1, 1], [0, 1], [0, 1], [0, 1], [1, 1],
                  [0, 1], [1, 0], [1, 1], [1, 0], [0, 0]])
    clf = KNeighborsClassifier(n_neighbors=1)
    expected = (
        ('micro', [1, 1 / 2, 3 / 4, 1 / 2, 1 / 3]),
        ('macro', [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4]),
        ('samples', [1, 1 / 2, 3 / 4, 1 / 2, 1 / 4]),
    )
    for average, want in expected:
        scorer = make_scorer(precision_score, average=average)
        got = cross_val_score(clf, X, y, scoring=scorer, cv=5)
        assert_almost_equal(got, want)
def test_nested_cv():
    """Nested cross-validation must work for every pairing of these splitters
    as the inner (grid-search) and outer (scoring) CV, with group labels
    forwarded through fit_params."""
    rng = np.random.RandomState(0)
    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    groups = rng.randint(0, 5, 15)
    splitters = [LeaveOneGroupOut(), LeaveOneOut(), GroupKFold(),
                 StratifiedKFold(),
                 StratifiedShuffleSplit(n_splits=3, random_state=0)]
    for inner_cv, outer_cv in combinations_with_replacement(splitters, 2):
        inner_search = GridSearchCV(Ridge(),
                                    param_grid={'alpha': [1, .1]},
                                    cv=inner_cv)
        cross_val_score(inner_search, X=X, y=y, groups=groups, cv=outer_cv,
                        fit_params={'groups': groups})
def test_sample_weight_func():
    """Assert that the TPOTRegressor score function outputs a known score for a fixed pipeline with sample weights"""
    tpot_obj = TPOTRegressor(scoring='neg_mean_squared_error')
    # Reify pipeline with known score
    pipeline_string = ("ExtraTreesRegressor("
        "GradientBoostingRegressor(input_matrix, GradientBoostingRegressor__alpha=0.8,"
        "GradientBoostingRegressor__learning_rate=0.1,GradientBoostingRegressor__loss=huber,"
        "GradientBoostingRegressor__max_depth=5, GradientBoostingRegressor__max_features=0.5,"
        "GradientBoostingRegressor__min_samples_leaf=5, GradientBoostingRegressor__min_samples_split=5,"
        "GradientBoostingRegressor__n_estimators=100, GradientBoostingRegressor__subsample=0.25),"
        "ExtraTreesRegressor__bootstrap=True, ExtraTreesRegressor__max_features=0.5,"
        "ExtraTreesRegressor__min_samples_leaf=5, ExtraTreesRegressor__min_samples_split=5, "
        "ExtraTreesRegressor__n_estimators=100)")
    # Compile the pipeline from its string form and fit it once.
    tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
    tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
    tpot_obj._fitted_pipeline.fit(training_features_r, training_classes_r)
    # Recompile to get a fresh (unfitted) copy of the same pipeline.
    tpot_obj._optimized_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
    tpot_obj._fitted_pipeline = tpot_obj._toolbox.compile(expr=tpot_obj._optimized_pipeline)
    # make up a sample weight (1..n) and map it onto the pipeline's steps
    training_classes_r_weight = np.array(range(1, len(training_classes_r)+1))
    training_classes_r_weight_dict = set_sample_weight(tpot_obj._fitted_pipeline.steps, training_classes_r_weight)
    # Re-seed before every CV run so the three runs are comparable; the first
    # two (unweighted) must match exactly, the weighted one must differ.
    np.random.seed(42)
    cv_score1 = cross_val_score(tpot_obj._fitted_pipeline, training_features_r, training_classes_r, cv=3, scoring='neg_mean_squared_error')
    np.random.seed(42)
    cv_score2 = cross_val_score(tpot_obj._fitted_pipeline, training_features_r, training_classes_r, cv=3, scoring='neg_mean_squared_error')
    np.random.seed(42)
    cv_score_weight = cross_val_score(tpot_obj._fitted_pipeline, training_features_r, training_classes_r, cv=3, scoring='neg_mean_squared_error', fit_params=training_classes_r_weight_dict)
    np.random.seed(42)
    tpot_obj._fitted_pipeline.fit(training_features_r, training_classes_r, **training_classes_r_weight_dict)
    # Get score from TPOT
    known_score = 12.643383517  # Assumes use of mse
    score = tpot_obj.score(testing_features_r, testing_classes_r)
    # Scalar closeness helper; see http://stackoverflow.com/questions/5595425/
    def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
        return abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)
    assert np.allclose(cv_score1, cv_score2)
    assert not np.allclose(cv_score1, cv_score_weight)
    assert isclose(known_score, score)
def test_cross_val_score_sparse_fit_params():
    """A sparse matrix passed via fit_params must travel through
    cross_val_score without breaking the (mock) classifier."""
    iris = load_iris()
    clf = MockClassifier()
    params = {'sparse_sample_weight': coo_matrix(np.eye(iris.data.shape[0]))}
    scores = cross_val_score(clf, iris.data, iris.target, fit_params=params)
    assert_array_equal(scores, np.ones(3))
def ccv(self, bst, X, y, scorer):
    """Combined cross-validation: pool sklearn CV fold scores with threaded
    split-validation runs and return (overall mean, overall std)."""
    kfold_scores = model_selection.cross_val_score(
        bst, X, y, cv=self.n_fold_, n_jobs=-2, scoring=scorer)
    jobs = [delayed(split_validate_job)(base.clone(bst), X, y, seed)
            for seed in range(self.n_fold_)]
    split_scores = Parallel(n_jobs=-2, backend="threading")(jobs)
    all_scores = list(kfold_scores) + list(split_scores)
    # Mean over the pooled scores equals (sum1 + sum2) / (len1 + len2).
    score = np.sum(all_scores) / len(all_scores)
    std = np.std(all_scores)
    return score, std
def _evaluate_projection(self, x, y):
    """kNNEvaluate - evaluate class separation in the given projection using a
    k-NN method.

    Parameters
    ----------
    x - variables to evaluate
    y - class

    Returns
    -------
    mean 3-fold cross-validation score
    """
    if self.percent_data_used != 100:
        # Subsample without replacement to the configured percentage.
        rand = np.random.choice(len(x),
                                int(len(x) * self.percent_data_used / 100),
                                replace=False)
        x = x[rand]
        y = y[rand]
    # Classifier for discrete targets, regressor otherwise.
    neigh = KNeighborsClassifier(n_neighbors=3) if self.attr_color.is_discrete else \
        KNeighborsRegressor(n_neighbors=3)
    # BUG FIX: the original assertion tested x twice and never checked y.
    assert ~(np.isnan(x).any(axis=None) | np.isnan(y).any(axis=None))
    neigh.fit(x, y)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=UserWarning)
        scores = cross_val_score(neigh, x, y, cv=3)
    return scores.mean()
def test_build_ps_owa_factory():
    """A fuzzy pattern classifier with GA-optimised OWA aggregation should
    exceed 0.92 mean accuracy on iris under 10-fold CV."""
    import os
    csv_file = os.path.join(os.path.dirname(__file__), "iris.csv")
    data = np.genfromtxt(csv_file, dtype=float, delimiter=',', names=True)
    X = np.array([data["sepallength"], data["sepalwidth"],
                  data["petallength"], data["petalwidth"]]).T
    y = data["class"]

    from sklearn.preprocessing import MinMaxScaler
    X = MinMaxScaler().fit_transform(X)

    classifier = nfpc.FuzzyPatternClassifier(
        membership_factory=t_factory,
        aggregation_factory=nfpc.GAOWAFactory(optimizer=nfpc.ps_owa_optimizer())
    )

    from sklearn.model_selection import cross_val_score
    mean = np.mean(cross_val_score(classifier, X, y, cv=10))
    print("mean", mean)
    assert 0.92 < mean
def test_rfe_estimator_tags():
    """RFE must report the wrapped estimator's type so that cross-validation
    below stratifies on the iris classes."""
    selector = RFE(SVC(kernel='linear'))
    assert_equal(selector._estimator_type, "classifier")
    iris = load_iris()
    scores = cross_val_score(selector, iris.data, iris.target)
    assert_greater(scores.min(), .7)
from keras.utils import np_utils
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score

# Iris dataset: first four columns are the features, fifth is the class label.
base = pd.read_csv('iris.csv')
previsores = base.iloc[:, 0:4].values
classe = base.iloc[:, 4].values

from sklearn.preprocessing import LabelEncoder
# Encode string class names as integers; classe_dummy is the one-hot version.
labelencoder = LabelEncoder()
classe = labelencoder.fit_transform(classe)
# NOTE(review): classe_dummy is computed but never used below — the CV call
# scores against the integer-encoded `classe`; confirm which was intended.
classe_dummy = np_utils.to_categorical(classe)

def criar_rede():
    # Build the Keras model: two dropout-regularised ReLU hidden layers and a
    # 3-way softmax output (one unit per iris class).
    classificador = Sequential()
    classificador.add(Dense(units = 8, activation = 'relu', kernel_initializer = 'normal', input_dim = 4))
    classificador.add(Dropout(0.3))
    classificador.add(Dense(units = 8, activation = 'relu', kernel_initializer = 'normal'))
    classificador.add(Dropout(0.3))
    classificador.add(Dense(units = 3, activation = 'softmax'))
    classificador.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['categorical_accuracy'])
    return classificador

# 10-fold cross-validated accuracy of the sklearn-wrapped Keras model.
classificador = KerasClassifier(build_fn = criar_rede, epochs = 1500, batch_size = 30)
resultados = cross_val_score(estimator = classificador, X = previsores, y = classe, cv = 10, scoring = 'accuracy')
media = resultados.mean()
desvio = resultados.std()
def model(opt):
    """Hyperparameter-search objective: mean 5-fold CV accuracy of a k-NN
    classifier with opt["n_neighbors"] neighbours on the global X, y."""
    classifier = KNeighborsClassifier(n_neighbors=opt["n_neighbors"])
    fold_scores = cross_val_score(classifier, X, y, cv=5)
    return fold_scores.mean()
##Train the model
# Fit the (previously constructed) regressor and report its coefficients.
regressor.fit(X_train, y_train)
print("Coefficient : ", regressor.coef_)
print("Intercept: ", regressor.intercept_)
print("Coefficient of determination R^2 <-- on train set: {}".format(
    regressor.score(X_train, y_train)))
# BUG FIX: this line reports the TEST-set score but was labelled "train set".
print("Coefficient of determination R^2 <-- on test set: {}".format(
    regressor.score(X_test, y_test)))

# 5-fold cross-validated R^2 on the full data.
from sklearn.model_selection import cross_val_score
score = cross_val_score(regressor, X, y, cv=5)
print(score)
print("Score: ", score.mean())

##Model Evaluation
coeff_df = pd.DataFrame(regressor.coef_, index=X.columns, columns=['Coefficient'])
print(coeff_df.head(10))

###Interpreting the Coefficients
"""1. Holding all features fixed, a 1 unit increase in T with an decrease of 10.12 in AQI PM 2.5
2. Holding all features fixed, a 1 unit increase in TM with an increase of 3.92 in AQI PM 2.5
3. Holding all features fixed, a 1 unit increase in VV with an decrease of 47.20 in AQI PM 2.5"""
##Prediction on test data
# NOTE(review): this chunk appears to begin inside an out-of-view loop that
# builds `pipelines`; `name` and `ppl` are defined before this point — confirm
# against the full file.
pipelines['Scaled' + name] = ppl
print('done')
print('')

# ----------------------------------------
# pipeline fitting and scoring
# ----------------------------------------
print('Pipleine fitting and scoring progress: name - mean accuracy - std accuracy')
scoring = 'neg_mean_absolute_error'
pipelinenames = list()
scores = list()
for entry in pipelines.items():
    name = entry[0]
    print(' {0:<20}'.format(name), end = '')
    ppl = entry[1]
    # Negate the neg-MAE scores so the reported numbers are positive errors.
    score = -1 * sms.cross_val_score(ppl, Xtrain, ytrain, cv = cvsplitter, scoring = scoring)
    scores.append(score)
    pipelinenames.append(entry[0])
    print('{0:.4f} - {1:.4f}'.format(np.mean(score), np.std(score, ddof = 1)))
print('')

# boxplot of results
plt.close('all')
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.boxplot(scores)
ax.set_xticklabels(pipelinenames)
ax.set_xlabel('Algorithm')
ax.set_ylabel('Mean Absolute Error')
ax.set_title('Mean Absolute Error of Different Algorithms')
X = dfForTraining.iloc[:, :].values # %% Split data X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20) # %% classifier = RandomForestClassifier(n_estimators=200, criterion='entropy', min_samples_split=.0002575) classifier.fit(X_train, y_train) # %% confusion matrix preds y_pred = classifier.predict(X_test) cm = confusion_matrix(y_test, y_pred) print(cm) # %% Kfold validation accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10, n_jobs=-1) acc_mean = accuracies.mean() acc_std = accuracies.std() print(f'Accuracy of model: {acc_mean}') print(f'Accuracy of model: {acc_std}') # %% Grid search n_estimators = np.linspace(200, 250, 5) n_estimators = [int(estimator) for estimator in n_estimators] min_samples_split = np.linspace(.00001, .001, 5) # %% parameters = { 'n_estimators': n_estimators, 'min_samples_split': [.0002575], 'criterion': ['entropy']
from sklearn.datasets import load_digits
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score

# For reproducibility
np.random.seed(1000)

nb_classifications = 100

if __name__ == '__main__':
    # Load dataset
    digits = load_digits()

    def _mean_cv_accuracy(n_trees):
        # 10-fold CV accuracy of an AdaBoost ensemble with n_trees estimators
        return cross_val_score(AdaBoostClassifier(n_estimators=n_trees),
                               digits.data, digits.target,
                               scoring='accuracy', cv=10).mean()

    # Collect accuracies for ensemble sizes 1 .. nb_classifications-1
    ab_accuracy = [_mean_cv_accuracy(i) for i in range(1, nb_classifications)]

    # Show results
    plt.figure(figsize=(30, 25))
    plt.xlabel('Number of trees')
    plt.ylabel('Accuracy')
    plt.grid(True)
    plt.plot(ab_accuracy)
    plt.show()
# 1) Cross Validation Classification Accuracy import warnings warnings.filterwarnings(action="ignore") import pandas as pd from sklearn.model_selection import KFold from sklearn.model_selection import cross_val_score from sklearn.linear_model import LogisticRegression filename = 'indians-diabetes.data.csv' names = [ 'preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class' ] dataframe = pd.read_csv(filename, names=names) array = dataframe.values X = array[:, 0:8] Y = array[:, 8] kfold = KFold(n_splits=10) model = LogisticRegression() # This is the default method of accuracy scoringMethod = 'accuracy' results = cross_val_score(model, X, Y, cv=kfold, scoring=scoringMethod) print("Accuracy: %.3f (%.3f)" % (results.mean() * 100, results.std() * 100))
# compared to Europe and Central Asia. Therefore, if you are trying to predict life expectancy, # it would be preferable to retain the 'Region' feature. To do this, you need to binarize it by # creating dummy variables, which is what you will do in this exercise. # Create dummy variables: df_region df_region = pd.get_dummies(df) # Print the columns of df_region print(df_region.columns) # Create dummy variables with drop_first=True: df_region df_region = pd.get_dummies(df, drop_first=True) # Print the new columns of df_region print(df_region.columns) #Regression with categorical features # Having created the dummy variables from the 'Region' feature, you can build # regression models as you did before. Here, you'll use ridge regression to perform 5-fold cross-validation. # The feature array X and target variable array y have been pre-loaded. # Instantiate a ridge regressor: ridge ridge = Ridge(normalize=True, alpha=0.5) # Perform 5-fold cross-validation: ridge_cv ridge_cv = cross_val_score(ridge, df_region.values, y, cv=5) # Print the cross-validated scores print(ridge_cv)
# Combine sets and extract HOG features from itertools import chain X_train = np.array( [feature.hog(im) for im in chain(positive_patches, negative_patches)]) y_train = np.zeros(X_train.shape[0]) y_train[:positive_patches.shape[0]] = 1 #%% print(X_train.shape) #%% # training a support vecctor machine from sklearn.naive_bayes import GaussianNB from sklearn.model_selection import cross_val_score print(cross_val_score(GaussianNB(), X_train, y_train)) #%% ''' We see that on our training data, even a simple naive Bayes algorithm gets us upwards of 90% accuracy. Let's try the support vector machine, with a grid search over a few choices of the C parameter: ''' from sklearn.svm import LinearSVC from sklearn.model_selection import GridSearchCV grid = GridSearchCV(LinearSVC(), {'C': [1.0, 2.0, 4.0, 8.0]}) grid.fit(X_train, y_train) print(grid.best_score_) #%% print(grid.best_params_) #%%
def clustering():
    """Read expense CSVs, label-encode the symbolic columns, cross-validate six
    candidate classifiers on the validation set, then fit the best one and
    write its test-set predictions to prediction.csv."""
    import numpy as np
    import pandas as pd
    from sklearn.model_selection import KFold, cross_val_score
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.linear_model import LogisticRegression, LinearRegression
    from sklearn.naive_bayes import GaussianNB
    from sklearn.metrics import accuracy_score
    from sklearn.svm import SVC
    pd.options.mode.chained_assignment = None

    print('Reading from CSV files....')
    train = pd.read_csv("training_data_example.csv")
    valid = pd.read_csv("validation_data_example.csv")
    test = pd.read_csv("training_data_example_and_validation.csv")

    # Transform the symbolic values into numbers suitable for the classifiers
    print('Doing some data pre-processing/cleaning....')
    for frame in (train, valid, test):
        for col in ('category', 'expense description', 'tax name'):
            frame[col] = pd.factorize(frame[col])[0]
    train.fillna(train.mean(), inplace=True)
    test.fillna(test.mean(), inplace=True)

    # Format the data and expected values for SKLearn
    feature_cols = ['expense description', 'pre-tax amount', 'tax name', 'tax amount']
    trainData = pd.DataFrame(train[feature_cols])
    # targets as 1-d arrays, as the estimators expect
    trainTarget = np.ravel(np.array(pd.DataFrame(train[['category']])), 'C')
    testData = pd.DataFrame(test[feature_cols])
    testTarget = pd.DataFrame(test[['category']])
    valData = pd.DataFrame(valid[feature_cols])
    valTarget = np.ravel(pd.DataFrame(valid[['category']]), 'C')

    # Prepare cross-validation folds & candidate models
    k_fold = KFold(len(valData), shuffle=True, random_state=0)
    candidates = [
        ('GBC', 'Gradient Boost', GradientBoostingClassifier()),
        ('GNB', 'Gaussian Naive Bayes', GaussianNB()),
        ('SVM', 'Support Vector Machine', SVC()),
        ('LDA', 'Linear Discriminant Analysis', LinearDiscriminantAnalysis()),
        ('LinReg', 'Linear Regression', LinearRegression()),
        ('LogReg', 'Logistic Regression', LogisticRegression()),
    ]

    algoEval = 0
    winningAlgo = ""
    classifier = None
    print('Evaluating model algorithms for dataset....')
    for tag, fullName, model in candidates:
        model.fit(trainData, trainTarget)
        # BUG FIX: the original re-ran cross_val_score up to three times per
        # model, and its LogReg branch mistakenly scored the GNB model in the
        # comparison; the score is now computed once and reused everywhere.
        score = cross_val_score(model, valData, valTarget,
                                cv=k_fold, n_jobs=1).mean() * 100
        if algoEval < score:
            algoEval = score
            # BUG FIX: the original assigned e.g. `classifier = GNBclassifier()`,
            # calling an estimator *instance* (a TypeError at runtime); keep the
            # fitted estimator object itself instead.
            classifier = model
            winningAlgo = fullName
        print(tag + ': ', score)

    print('\n the best algorithm is ' + winningAlgo + ', proceeding to model test: \n')
    classifier.fit(trainData, trainTarget)
    predictedValues = classifier.predict(testData)
    testResults = test[['employee id']]
    testResults['category'] = predictedValues
    print('Model predicted with ', accuracy_score(testTarget, predictedValues),
          ' accuracy, check prediction.csv for details.')
    # As this is a cluster simulation, the file will be saved on
    # the default path for local engine at /Python27/Lib/site-packages/ipyparallel
    testResults.to_csv('prediction.csv', index=False)
def un_tuned_models(model, x_train, y_train, xtest):
    """Fit *model* on the training data and predict *xtest*.

    Parameters:
        model   -- an unfitted sklearn-style estimator
        x_train -- training feature matrix
        y_train -- training targets
        xtest   -- feature matrix to predict

    Returns:
        (y_pred, rmse) -- predictions for xtest and the array of 5-fold
        cross-validated root-mean-squared errors on the training data.
    """
    train_model = model.fit(x_train, y_train)
    y_pred = train_model.predict(xtest)
    # BUG FIX: 'neg_mean_squared_error' yields negated MSE; the original negated
    # it but never took the square root, so the value returned under the name
    # `rmse` was actually the MSE. Apply ** 0.5 to get a true RMSE.
    rmse = (-cross_val_score(model, x_train, y_train,
                             scoring='neg_mean_squared_error', cv=5)) ** 0.5
    return (y_pred, rmse)
def main():
    """Train separate random forests for extragalactic and intragalactic
    objects; in mode 0 cross-validate them, in mode 1 stream the test set in
    batches and write class-probability predictions to rf_feaug.csv."""
    mode = 1 #0-cv, 1-predict
    print "Reading train data"
    #training_set_raw = pd.read_csv('/modules/cs342/Assignment2/training_set.csv')
    #training_set_metadata_raw = pd.read_csv('/modules/cs342/Assignment2/training_set_metadata.csv')
    training_set_raw = pd.read_csv('./train_data_aug4.csv')
    training_set_metadata_raw = pd.read_csv('./train_meta_aug4.csv')
    training_set_targets = training_set_metadata_raw['target']
    training_set_data = training_set_metadata_raw.drop('target',axis=1)
    classes = sorted(training_set_targets.unique())
    # double the weight of classes 64 and 15
    # NOTE(review): class_weight is built here but never passed to any model
    class_weight = { c: 1 for c in classes }
    for c in [64, 15]:
        class_weight[c] = 2
    training_set_data = fill_in_hostgal_specz(training_set_data)
    full_train = format(training_set_data, training_set_raw)
    extragalactic_data, extragalactic_targets, extra_ids, intragalactic_data, intragalactic_targets, intra_ids = splitGalaxies(full_train, training_set_targets)
    initial_intra = training_set_raw.loc[training_set_raw['object_id'].isin(intra_ids)]
    print initial_intra.shape
    print training_set_raw.shape
    print len(training_set_raw['object_id'].unique())
    print training_set_metadata_raw.shape
    print len(training_set_metadata_raw['object_id'].unique())
    print "Computing periods"
    #intra_periods = do_periods(initial_intra)
    intra_periods = pd.read_csv('./periods_train_aug4.csv')
    # objects without a detected period get period 0 and worst score 1
    intra_periods.loc[intra_periods['period'].isnull(),'period_score'] = 1
    intra_periods.loc[intra_periods['period'].isnull(),'period'] = 0
    intra_periods.loc[intra_periods['period_score'].isnull(),'period_score'] = 0
    initial_extra = training_set_raw.loc[training_set_raw['object_id'].isin(extra_ids)]
    intragalactic_data = intragalactic_data.merge( right=intra_periods, how='outer', on='object_id' )
    #print intragalactic_data
    intragalactic_data = removeExtraCols(intragalactic_data)
    #intragalactic_data['period_score'] = intragalactic_data['score']
    #intragalactic_data = intragalactic_data.drop('score',axis=1)
    extragalactic_data = extragalactic_data.drop('object_id',axis=1)
    if mode==0:
        # ---- cross-validation mode ----
        print "Model for extra:"
        param_grid = { 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth' : [4,5,6,7,8], 'criterion' :['gini', 'entropy'] }
        clf = RandomForestClassifier(n_jobs=2, max_depth=20,n_estimators=100)
        #CV_rfc = GridSearchCV(estimator=clf, param_grid=param_grid, cv= 5)
        #CV_rfc.fit(extragalactic_data, extragalactic_targets)
        #print "params"
        #print CV_rfc.best_params_
        print extragalactic_data.columns
        print cross_val_score(clf, extragalactic_data, extragalactic_targets, cv=10, scoring="neg_log_loss").mean()
        print "Model for intra:"
        clf = RandomForestClassifier(n_jobs=2, max_depth=20,n_estimators=100)
        print intragalactic_data.columns
        #print intragalactic_data
        clf.fit(intragalactic_data, intragalactic_targets.values.ravel())
        print clf.feature_importances_
        print cross_val_score(clf, intragalactic_data, intragalactic_targets, cv=10, scoring="neg_log_loss").mean()
    else:
        # ---- prediction mode ----
        print "Training"
        extra_model = RandomForestClassifier(n_jobs=2, max_depth=20,n_estimators=100)
        print extragalactic_data.columns
        extra_model.fit(extragalactic_data, extragalactic_targets.values.ravel())
        intra_model = RandomForestClassifier(n_jobs=2, max_depth=20,n_estimators=100)
        intra_model.fit(intragalactic_data, intragalactic_targets.values.ravel())
        print "Finished training. Starting predictions"
        print "Reading test data"
        test_set_metadata_raw = pd.read_csv('/modules/cs342/Assignment2/test_set_metadata.csv')
        filepath = '/modules/cs342/Assignment2/test_set.csv'
        extra_classes = extra_model.classes_
        intra_classes = intra_model.classes_
        extra_ids = []
        intra_ids = []
        test_set_metadata_raw = fill_in_hostgal_specz(test_set_metadata_raw)
        extra_ids, intra_ids = splitTestGalaxies(test_set_metadata_raw)
        # output header: object id, one column per known class, plus class_99
        column_names = []
        column_names.append('object_id')
        for classi in extra_classes:
            className = "class_" + str(classi)
            column_names.append(className)
        for classi in intra_classes:
            className = "class_" + str(classi)
            column_names.append(className)
        column_names.append("class_99")
        #print column_names
        count = 0
        batch_no = 0
        batch_extra_dataFrame = pd.DataFrame()
        batch_intra_dataFrame = pd.DataFrame()
        myextrabatchlist = []
        myintrabatchlist = []
        print " >Starting new batch 0"
        my_extra_data_list = []
        my_intra_data_list = []
        # sets give O(1) membership tests over the (large) id lists
        extra_idss = set(extra_ids)
        intra_idss = set(intra_ids)
        cc=-1
        for obj_id, d in get_objects_by_id(filepath):
            cc=cc+1
            #combined = format(test_set_metadata_raw.loc[test_set_metadata_raw['object_id']==obj_id],d)
            if (obj_id in extra_idss):
                my_extra_data_list.append(d) # = np.append(my_extra_data_list, d)
            else:
                my_intra_data_list.append(d) #= np.append(my_intra_data_list, d)
            # flush a batch of 10000 objects to disk, then start a new one
            if(count == 10000):
                print " >>Formatting batch objects"
                arr = my_predict(column_names,my_extra_data_list, my_intra_data_list, test_set_metadata_raw, extra_model, intra_model)
                print " >>Write to csv"
                finish = pd.DataFrame(arr, columns=column_names)
                finish["class_99"] = (1-finish.drop("object_id", axis=1)).product(axis=1) #Adding values to class_99
                #Below is a very messy way of making all rows sum to 1 despite the above
                finish.loc[:,finish.columns!="object_id"] = finish.loc[:,finish.columns!="object_id"].div(finish.loc[:,finish.columns!="object_id"].sum(axis=1), axis=0)
                if(batch_no==0):
                    finish.to_csv("rf_feaug.csv", index = False, header = True)
                else:
                    with open("rf_feaug.csv", 'a') as f:
                        finish.to_csv(f, index = False, header=False)
                print " >Starting new batch " + str(batch_no + 1)
                batch_no = batch_no + 1
                lst = 0
                count = 0
                my_extra_data_list = []
                my_intra_data_list = []
            else:
                count = count + 1
        # flush the final partial batch
        print "!Remaining objects: " + str(count)
        print " >>Formatting batch objects"
        arr = my_predict(column_names,my_extra_data_list, my_intra_data_list, test_set_metadata_raw, extra_model, intra_model)
        print " >>Write to csv"
        finish = pd.DataFrame(arr, columns=column_names)
        finish["class_99"] = (1-finish.drop("object_id", axis=1)).product(axis=1) #Adding values to class_99
        #Below is a very messy way of making all rows sum to 1 despite the above
        finish.loc[:,finish.columns!="object_id"] = finish.loc[:,finish.columns!="object_id"].div(finish.loc[:,finish.columns!="object_id"].sum(axis=1), axis=0)
        with open("rf_feaug.csv", 'a') as f:
            finish.to_csv(f, index = False, header=False)
        print " >>Clean up."
        preds = pd.read_csv("rf_feaug.csv")
        preds['object_id']=preds['object_id'].apply(int)
        #preds['object_id']=preds['object_id'].apply(int)
        print preds.shape
        print cc
        preds.to_csv("rf_feaug2.csv", index=False)
        #preds.to_csv('predictions2.csv', index=False)
        print "DONE."
# R^2 of the age-imputation model on its held-out split
print('\nR^2:',metrics.r2_score(yy_test,clf3.predict(xx_test)))
# BUG FIX: DataFrame.ix was removed in pandas 1.0 — use .loc instead
data.loc[data['Age'].isnull(),'Age'] = clf3.predict(aa)
data.dropna(inplace=True)
# character encoding: label-encode every object-dtype column
for i in data.columns:
    if data[i].dtype == 'object':
        le = LabelEncoder()
        data[i] = le.fit_transform(data[i])
Sur = 'Survived'
# BUG FIX: the original `i not in Sur` tested substring membership in the
# string 'Survived' (so e.g. a column named 'S' would be dropped); the intent
# is to exclude only the target column itself.
lab = [i for i in data.columns if i != Sur]
y = data.loc[:,Sur]
x = data.loc[:,lab]
X_train, X_test, y_train, y_test = train_test_split(x,y,train_size=0.7,random_state=1)
forst = ensemble.RandomForestClassifier()
gbdt = ensemble.GradientBoostingClassifier()
# compare the two ensembles by 5-fold cross-validated F1
cv1 = cross_val_score(forst,X_train,y_train,cv=5,scoring='f1')
cv2 = cross_val_score(gbdt,X_train,y_train,cv=5,scoring='f1')
print(cv1.mean())
print(cv2.mean())
print('-------------------')
# # grid = GridSearchCV(estimator=gbdt,param_grid={'learning_rate':np.arange(0.1,1,0.1),
# #                                                'n_estimators':range(20,100,10),
# #                                                'subsample':(0.7,0.8,0.1),
# #                                                'max_depth':range(2,10,2)
# #                                                },scoring='f1')
# # grid.fit(X_train,y_train)
# # print(grid.best_params_)
# GBDT with the parameters found by the (commented-out) grid search above
gbdt_new = ensemble.GradientBoostingClassifier(learning_rate=0.1,n_estimators=90,subsample=0.1,max_depth=4)
gbdt_new.fit(X_train,y_train)
print('\n准确率:',metrics.accuracy_score(y_test,gbdt_new.predict(X_test)))
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn import svm

# produce a matrix of features and corresponding discrete targets
SimData = make_classification(n_samples = 100, n_features = 20, n_informative = 2, n_redundant = 2, n_repeated = 2, n_classes = 2, flip_y = 0.01)

# randomly sample a training set while holding out 50% of the data for testing (evaluating) our classifier
X_train, X_test, y_train, y_test = train_test_split(SimData[0], SimData[1], test_size = 0.5)

clf = svm.SVC(kernel = 'linear', C = 1).fit(X_train, y_train)
score = clf.score(X_test, y_test)
print('Accuracy with training/test split: {}'.format(round(score, 2)))

# estimate the accuracy of a linear kernel SVM by splitting the data, fitting a model and computing the score 5 consecutive times (with different splits each time)
clf = svm.SVC(kernel = 'linear', C = 1)
scores = cross_val_score(clf, SimData[0], SimData[1], cv = 5)
print('Accuracy with 5-fold cross-validation: {} (+/- {})'.format(round(scores.mean(),2), round(scores.std(),2)))

scoring = ['f1', 'roc_auc'] # specifying multiple metrics for evaluation,
# returns a dict containing training scores, fit-times and score-times in addition to the test score
scores = cross_validate(clf, SimData[0], SimData[1], scoring = scoring, cv = 5, return_train_score = False)
print('f1 with 5-fold cross-validation: {}'.format(round(scores['test_f1'].mean(),2)))
print('ROC AUC with 5-fold cross-validation: {}'.format(round(scores['test_roc_auc'].mean(),2)))

cv = StratifiedKFold(n_splits = 5) # The folds are made by preserving the percentage of samples for each class.
scores = cross_val_score(clf, SimData[0], SimData[1], cv = cv)
print('Accuracy with stratified 5-fold cross-validation: {})'.format(round(scores.mean(),2)))

# recursive feature elimination with CV to pick the optimal feature count
rfecv = RFECV(estimator = clf, step = 1, cv = cv, scoring = 'accuracy')
rfecv.fit(SimData[0], SimData[1])
print("Optimal number of features : %d" % rfecv.n_features_)
# label-encode the remaining categorical columns
previsores[:, 5] = labelencoder_previsores.fit_transform(previsores[:, 5])
previsores[:, 8] = labelencoder_previsores.fit_transform(previsores[:, 8])
previsores[:, 9] = labelencoder_previsores.fit_transform(previsores[:, 9])
previsores[:, 10] = labelencoder_previsores.fit_transform(previsores[:, 10])
onehotencoder = OneHotEncoder(categorical_features=[0, 1, 3, 5, 8, 9, 10])
previsores = onehotencoder.fit_transform(
    previsores).toarray()  # one-hot encoding turns each category into a dummy variable


def criarRede():
    """Build the regression net: 316 inputs -> 158 relu -> 158 relu -> 1 linear."""
    regressor = Sequential()
    regressor.add(Dense(units=158, activation='relu', input_dim=316))
    regressor.add(Dense(units=158, activation='relu'))
    regressor.add(Dense(units=1, activation='linear'))
    regressor.compile(loss='mean_absolute_error',
                      optimizer='adam',
                      metrics=['mean_absolute_error'])
    return regressor


regressor = KerasRegressor(build_fn=criarRede, epochs=100, batch_size=300)
# NOTE(review): recent sklearn expects 'neg_mean_absolute_error' here — confirm
# the pinned version before changing, since that would also flip the sign.
resultados = cross_val_score(estimator=regressor,
                             X=previsores,
                             y=preco_real,
                             cv=10,
                             scoring='mean_absolute_error')
media = resultados.mean()
# BUG FIX: the original `desvio = resultados.std` assigned the bound method
# object itself (missing call parentheses) instead of the standard deviation.
desvio = resultados.std()
def classifierMortalityUnderSampling():
    """Train an XGBoost mortality classifier, report test-set metrics and plots,
    then evaluate an undersampling pipeline with repeated stratified CV ROC-AUC."""
    X = df_train.drop(['HospID', 'SiteID', 'surgid', 'Complics', 'Mortality'], axis=1)
    # 'HospID_total_cardiac_surgery', 'HospID_Reop_CABG', 'HospID_total_CABG', 'surgyear',
    # 'surgid_total_cardiac_surgery','surgid_total_CABG', 'surgid_Reop_CABG'], axis=1)
    y = df_train['Mortality']  # Labels
    X_test = df_test.drop(
        ['HospID', 'SiteID', 'surgid', 'Complics', 'Mortality'], axis=1)
    y_test = df_test['Mortality']
    # define undersample strategy
    undersample = RandomUnderSampler(sampling_strategy='majority')
    # fit and apply the transform
    X_over, y_over = undersample.fit_resample(X, y)
    # summarize class distribution
    print(Counter(y_over))
    # X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, test_size=0.2)
    # NOTE(review): X_over/y_over are computed above but the model below is fit
    # on the ORIGINAL X, y — confirm whether the resampled data was intended.
    xgb_model = xgb.XGBClassifier(objective='binary:logistic',
                                  eval_metric='logloss',
                                  learning_rate=0.1)
    xgb_model.fit(X, y)
    y_pred = xgb_model.predict(X_test)
    preds = xgb_model.predict_proba(X_test)
    print(confusion_matrix(y_test, y_pred))
    print(
        f"The accuracy of the model is {round(accuracy_score(y_test, y_pred), 5) * 100} %"
    )
    cm = confusion_matrix(y_test, y_pred)
    labels = ['TN', 'FP', 'FN', 'TP']
    categories = ['Alive', 'Dead']
    plt = make_confusion_matrix(cm,
                                categories=categories,
                                cmap='RdPu',
                                title='Confusion Metrics Mortality:',
                                group_names=labels)
    plt.show()
    feature_importance(xgb_model, df_model_draft, X_test, y_test, 'pink', 'RdPu')
    make_roc_auc_curve(y_test, preds, 'ROC Curve for XGBoost with Experience')
    # example of evaluating a decision tree with random undersampling
    from numpy import mean
    from sklearn.model_selection import cross_val_score
    from sklearn.model_selection import RepeatedStratifiedKFold
    from imblearn.pipeline import Pipeline
    # define pipeline
    steps = [('under', RandomUnderSampler()), ('model', XGBClassifier())]
    pipeline = Pipeline(steps=steps)
    # evaluate pipeline
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
    print('Mean ROC AUC: %.5f' % mean(scores))
    print(scores)
# hold the submission keys aside, then drop them from the feature matrix
keys = df_test[KEY_COL]
x_test = df_test.drop(columns=[KEY_COL])
logger.info('x_test: {}'.format(x_test.shape))
del df_test  # free memory before training

# logger.info('--- reduce memory usage ---')
# x_train = utils.reduce_mem_usage(x_train, logger)
# x_test = utils.reduce_mem_usage(x_test, logger)
# gc.collect()

logger.info('--- cross validation ---')
epochs = 20
batch_size = 1000
clf = KerasClassifier(build_fn=gen_model, epochs=epochs, batch_size=batch_size)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
# 5-fold stratified CV scored by ROC AUC
cv_auc = cross_val_score(clf, x_train, y_train, cv=kfold, scoring='roc_auc', verbose=3)
logger.info('auc of each cv: {}'.format(cv_auc))
mean_auc = mean(cv_auc)
logger.info('mean auc: {}'.format(mean_auc))

logger.info('fitting to train data...')
clf.fit(x_train, y_train)
logger.info('predicting test data...')
# keep only the probability of the positive class
pred_test = clf.predict_proba(x_test, batch_size=1000, verbose=1)[:, 1]

logger.info('--- save submission file ---')
df_submission = pd.DataFrame({
    KEY_COL: keys,
    TGT_COL: pred_test
})
# coding:utf-8 import numpy as np import pandas as pd import matplotlib.pyplot as plt from sklearn.linear_model import Ridge from sklearn.model_selection import cross_val_score from sklearn.ensemble import RandomForestRegressor from sklearn.ensemble import BaggingRegressor from sklearn.ensemble import AdaBoostRegressor from xgboost import XGBRegressor train_df = pd.read_csv("/Users/jianjun.yue/PycharmGItHub/data/house_price/model/house_price_train.csv",index_col = 0) test_df = pd.read_csv("/Users/jianjun.yue/PycharmGItHub/data/house_price/model/house_price_test.csv",index_col = 0) numeric_cols = train_df.columns[train_df.dtypes != 'object'] y_train=train_df["SalePrice"] X_train=train_df.drop(['SalePrice'],axis=1) test_df=test_df.drop(['MSSubClass_90'],axis=1) # boosting 比bagging更高级,它是弄来一把分类器,把它们线性排列,下一个分类器把上一个分类器分类不好的地方加上更高的权重,这样,下一个分类器在这部分就能学习得更深刻 ridge = Ridge(alpha = 15) params = [10,15,20,25,30,35,40,45,50] test_scores = [] for param in params: clf = AdaBoostRegressor(base_estimator = ridge,n_estimators = param) test_score = np.sqrt(-cross_val_score(clf,X_train,y_train,cv = 10,scoring = 'neg_mean_squared_error')) test_scores.append(np.mean(test_score)) plt.plot(params,test_scores) plt.title('n_estimators vs CV Error') plt.show()
# reproducibility seed and the metric used for every candidate model
seed = 7
scoring = 'accuracy'

# candidate algorithms to spot-check
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))

results = []
names = []
for name, model in models:
    # NOTE(review): KFold(random_state=...) without shuffle=True raises in
    # recent sklearn versions — confirm the pinned version.
    kfold = model_selection.KFold(n_splits=10, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

# box plot comparing the CV score distributions of the algorithms
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()

# fit KNN on the training set and predict the held-out validation set
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)
predictions = knn.predict(X_validation)
# (continuation: tail of the first column-transformer definition begun above)
['Outlook','Wind']))
# setting two: discretize numeric columns into ordinal bins, one-hot the rest
ct_2 = make_column_transformer(
    (KBinsDiscretizer(3,encode='ordinal'), ['Temperature']),
    (KBinsDiscretizer(2,encode='ordinal'), ['Humidity']),
    (OneHotEncoder(), ['Outlook','Wind']))
one_X = ct_1.fit_transform(df)
two_X= ct_2.fit_transform(df)
#%%
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB, CategoricalNB
import numpy as np
# compare Gaussian vs Categorical NB under both encodings via 5-fold CV
ngb=GaussianNB()
sc_1=cross_val_score(ngb,X=one_X,y=y,scoring='accuracy',cv=5)
print('the avg score for GassianNB in setting one is',np.mean(sc_1))
# %%
cgb=CategoricalNB()
sc_2=cross_val_score(cgb,X=one_X,y=y,scoring='accuracy',cv=5)
print('the avg score for CategoryNB in setting one is',np.mean(sc_2))
# %%
ngb=GaussianNB()
sc_1=cross_val_score(ngb,X=two_X,y=y,scoring='accuracy',cv=5)
print('the avg score for GassianNB in setting two is',np.mean(sc_1))
# %%
cgb=CategoricalNB()
sc_2=cross_val_score(cgb,X=two_X,y=y,scoring='accuracy',cv=5)
print('the avg score for CategoryNB in setting two is',np.mean(sc_2))
# %%
# (continuation: tail of the features_0 name list begun above)
    'intrusion_pcl1', 'q6.5_PHYS_pcl2', 'q6.13_SLEEP_pcl2', 'q6.3_FLASH_pcl2'
]
path = "C:\PycharmProjects\PTSD\Data\PTSD.xlsx"
df = pd.read_excel(path)
# keep only subjects with a known PCL_Strict3 outcome
df = df[~df['PCL_Strict3'].isna()]
df = df[["ID", 'PCL_Strict3']]
df_pcl3 = pd.read_excel(
    "C:\PycharmProjects\PTSD\Data\questionnaire6PCL3.xlsx")
df_pcl2 = pd.read_excel(
    "C:\PycharmProjects\PTSD\Data\questionnaire6PCL2.xlsx")
df_pcl1 = pd.read_excel(
    "C:\PycharmProjects\PTSD\Data\questionnaire6PCL1.xlsx")
# merge the three questionnaire waves onto the outcome frame by subject ID
df = df.merge(df_pcl1, on="ID")
df = df.merge(df_pcl2, suffixes=('_pcl1', '_pcl2'), on="ID")
df = df.merge(df_pcl3.drop(['PCL3_Strict', 'pcl3', 'PCL3_Broad'], axis=1), on="ID")
df = df[features_0 + ['PCL_Strict3']].dropna()
X = df[features_0]
Y = df['PCL_Strict3']
# stratified split keeps the outcome ratio equal in train and test
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=271828, stratify=Y)
# mean 10-fold F1 of a bagging ensemble on the training portion
scores = cross_val_score(BaggingClassifier(), X_train, y_train, scoring='f1', cv=10)
print(sum(scores) / len(scores))
train_test_split(iris.data,iris.target,test_size=0.51,random_state=0) # 分析数据 # 创建分类器 #classifier = nb.KNeighborsClassifier(n_neighbors=3,weights='uniform',algorithm='auto') classifier = nb.RadiusNeighborsClassifier(n_neighbors=2, weights='uniform', algorithm='auto') # 训练分类器 classifier.fit(train_data, train_label) # 预测 test_label_predicted = classifier.predict(test_data) # 交叉验证 scores = cross_val_score(classifier, iris.data, iris.target, cv=10) # 比较结果 size = len(test_label_predicted) outer = np.zeros((size), dtype=int) for i in range(size): if test_label_expected[i] != test_label_predicted[i]: outer[i] = 1 result = np.vstack((test_label_expected, test_label_predicted, outer)) result = result.T # 计算正确率 #classifier.score(test_data,test_label_expected) okresult = float(np.sum(outer == 0)) / len(outer) print( "Classification report for classifier %s:\n%s\n" %
# 70/30 chronological split point
train_size = int(len(df) * 0.7)
print(train_size)
data = df.loc[:, df.columns != 'DELAYED']  # features: every column but the label
labels = df['DELAYED']
train_data = data[:train_size]
train_labels = labels[:train_size]
test_data = data[train_size:]
test_labels = labels[train_size:]
model = RandomForestClassifier(n_estimators=100, verbose=3)

# 5-fold cross validation of model
score_acc = (cross_val_score(model, train_data, train_labels, cv=5, scoring='accuracy'))
print("Accuracy: %0.2f (+/- %0.2f)" % (score_acc.mean(), score_acc.std() * 2))
score_f1 = (cross_val_score(model, train_data, train_labels, cv=5, scoring='f1'))
print("F1 Score: %0.2f (+/- %0.2f)" % (score_f1.mean(), score_f1.std() * 2))

# Fit the classifier to the training set
model.fit(train_data, train_labels)

# Predict probabilities of classes on the test set
os.chdir("E:/") titanic_train = pd.read_csv("train.csv") #EDA titanic_train.shape titanic_train.info() #data preparation titanic_train1 = pd.get_dummies(titanic_train, columns=['Pclass', 'Sex', 'Embarked']) titanic_train1.shape titanic_train1.info() titanic_train1.head(6) #feature engineering X_train = titanic_train1.drop(['PassengerId','Age','Cabin','Ticket', 'Name','Survived'], 1) y_train = titanic_train['Survived'] #build the decision tree model dt = tree.DecisionTreeClassifier() #use cross validation to estimate performance of model. #No model build during cross validation is not used as final model cv_scores = model_selection.cross_val_score(dt, X_train, y_train, cv=10, verbose=1) cv_scores.mean() #build final model on entire train data which is used for prediction dt.fit(X_train,y_train) # natively deploy decision tree model(pickle format) joblib.dump(dt, "tree1.pkl")
x = x.dropna()
y = df['Survived']

# 4) Split the data into training and validation sets (or use cross-validation).
# We build a decision tree, choose a model parameter that may affect the result,
# and compute precision/recall/F1 (and possibly other metrics) for each value.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
clf = DecisionTreeClassifier(min_samples_split=5)
clf.fit(np.array(x_train), np.array(y_train))
importances = pandas.Series(clf.feature_importances_, index=x_labels)
print(importances)
y_pred = clf.predict(x_test)
print(classification_report(y_test, y_pred))
# BUG FIX: the original referenced undefined capitalized names (X_train), but
# the split above bound lowercase x_train/x_test; the names now match.
print(np.mean(cross_val_score(clf, x_train, y_train, cv=5)))

# 5) Repeat the same steps with a Random Forest model and compare the results.
model = RandomForestClassifier(n_estimators = 100)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
# BUG FIX: the original line read `scores = []\n",` — a notebook-export
# artifact that is a syntax error; restored to a plain list initialisation.
scores = []
# sweep the forest size and track F1 on the held-out split
for t in range(1,100):
    rfc = RandomForestClassifier(n_estimators=t)
    rfc.fit(x_train, y_train)
    y_pred = rfc.predict(x_test)
    scores.append(f1_score(y_test, y_pred))
rfc.fit(x_train, y_train)
y_pred = rfc.predict(x_test)
import numpy as np
import pandas
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

# Abalone ring-count regression: find the smallest forest size whose
# 5-fold cross-validated R^2 exceeds 0.52.
frame = pandas.read_csv('samples/abalone.csv')

# Encode sex as a signed integer: F -> -1, I (infant) -> 0, otherwise 1.
frame['Sex'] = frame['Sex'].map(lambda s: -1 if s == 'F' else (0 if s == 'I' else 1))

features = frame.drop(['Rings'], axis=1)
target = frame.Rings

folds = KFold(n_splits=5, shuffle=True, random_state=1)

# Sweep the number of trees from 1 to 50, recording the mean CV R^2.
results = []
for n_trees in range(1, 51):
    model = RandomForestRegressor(n_estimators=n_trees, random_state=1)
    mean_r2 = np.mean(cross_val_score(estimator=model, X=features, y=target,
                                      cv=folds, scoring='r2'))
    results.append([n_trees, mean_r2])
    print(n_trees, mean_r2)

# Report the first estimator count that clears the 0.52 threshold.
passing = [pair for pair in results if pair[1] > 0.52]
print('ans =', passing[0][0])
# Hold out 20% of the data for final validation.
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X, y, test_size=0.20, random_state=1)

# Spot check algorithms.
models = []
models.append(('LR', LogisticRegression(solver='liblinear', multi_class='ovr')))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC(gamma='auto')))

# Evaluate each model with stratified 10-fold cross-validation.
results = []
names = []
for name, model in models:
    kfold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
    # Fixed: kfold was built but never passed to cross_val_score, so the
    # default 5-fold split was silently used instead of the stratified
    # 10-fold splitter constructed above.
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))

# Compare algorithms visually.
pyplot.boxplot(results, labels=names)
pyplot.title('algorithm comparison')  # fixed typo: was 'alogorithm'
pyplot.show()
# Report held-out performance of the already-fitted model.
print(classification_report(test_labels, predictions))
print(confusion_matrix(test_labels, predictions))

# In[ ]:

# Cross-validation: search for the best k for KNN.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

k_range = range(1, 50)
k_scores = []
for k in k_range:
    classifier = KNeighborsClassifier(n_neighbors=k)
    fold_scores = cross_val_score(classifier, data_now, label, cv=5, scoring='accuracy')
    print("k = " + str(k) + ", score = " + str(fold_scores) + ", mean = " + str(fold_scores.mean()))
    k_scores.append(fold_scores.mean())

# In[ ]:

# Plot mean CV accuracy against k.
plt.plot(k_range, k_scores)
plt.xlabel('K for KNN')
plt.ylabel('Cross Validation Accuracy')
plt.show()

# In[ ]:
# Prepare the candidate regression models.
semilla = 7  # random seed
modelos = []
modelos.append(("LR", LinearRegression()))
modelos.append(("DT", DecisionTreeRegressor()))
modelos.append(("RF", RandomForestRegressor()))
modelos.append(("SVR", SVR()))

# Evaluate each model with 3-fold cross-validation on R^2.
results = []
names = []
scoring = 'r2'
for name, model in modelos:
    # Fixed: KFold(random_state=...) without shuffle=True raises ValueError
    # in scikit-learn >= 0.24 — a seed is meaningless for unshuffled folds.
    kfold = ms.KFold(n_splits=3, shuffle=True, random_state=semilla)
    cv_results = ms.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

# Fit each model on the full data and plot its predictions over the points.
linear = LinearRegression().fit(X, Y)
decision_tree = DecisionTreeRegressor().fit(X, Y)
random_forest = RandomForestRegressor().fit(X, Y)
svr = SVR().fit(X, Y)

plt.scatter(X, Y)
# Hoisted: the sorted day series was recomputed for every plot call.
dias = datosGlobal.dia.sort_values()
plt.plot(dias, linear.predict(dias.values.reshape(-1, 1)), label="lineal")
plt.plot(dias, decision_tree.predict(dias.values.reshape(-1, 1)), label="arbol de decision")
plt.plot(dias, random_forest.predict(dias.values.reshape(-1, 1)), label="bosque aleatorio")
# Candidate classifiers, each scored below by cross-validated RMSE.

# XGBoost
XGB_model = XGBClassifier(min_child_weight=0.1, max_depth=7)
# K-nearest neighbours
KNN_model = KNeighborsClassifier()
# Support vector machine
SVM_model = SVC(kernel='linear', probability=True)
# Random forest
RFC_model = RandomForestClassifier(n_estimators=100, n_jobs=5)
# Logistic regression
LR_model = LogisticRegression()

# 5-fold CV. sklearn's scorer returns the *negated* RMSE (greater is
# better), so each mean is negated back to a plain RMSE below.
scores_x, scores_k, scores_s, scores_r, scores_l = (
    cross_val_score(clf, food_X, food_y, cv=5,
                    scoring='neg_root_mean_squared_error')
    for clf in (XGB_model, KNN_model, SVM_model, RFC_model, LR_model))

sm_x, sm_k, sm_s, sm_r, sm_l = (
    -scores_x.mean(), -scores_k.mean(), -scores_s.mean(),
    -scores_r.mean(), -scores_l.mean())

mape_list.append([name, sm_x, sm_k, sm_s, sm_r, sm_l])
def main():
    """Titanic pipeline: load data, select features, impute missing values,
    vectorize, compare RandomForest vs XGBoost by 5-fold CV, grid-search
    XGBoost, and write the submission file."""
    # 1) Inspect the training and test data.
    train_data = pandas.read_csv('data/train.csv')
    test_data = pandas.read_csv('data/test.csv')
    print(train_data.info())
    print(test_data.info())

    # 2) Hand-pick the features believed to be predictive.
    selected_features = ['Pclass', 'Sex', 'Age', 'Embarked', 'SibSp', 'Parch', 'Fare']
    # Fixed: .copy() so the inplace fillna calls below mutate an owned frame
    # rather than a view of train_data/test_data (SettingWithCopyWarning,
    # and the fill may silently not stick on a copy-view).
    x_train = train_data[selected_features].copy()
    x_test = test_data[selected_features].copy()
    y_train = train_data['Survived']

    # 3) Fill in missing values.
    # 'Embarked' has missing entries; for categorical features, filling with
    # the most frequent value is one way to minimise the error introduced.
    print(x_train['Embarked'].value_counts())
    print(x_test['Embarked'].value_counts())
    x_train['Embarked'].fillna('S', inplace=True)
    x_test['Embarked'].fillna('S', inplace=True)
    x_train['Age'].fillna(x_train['Age'].mean(), inplace=True)
    x_test['Age'].fillna(x_test['Age'].mean(), inplace=True)
    x_test['Fare'].fillna(x_test['Fare'].mean(), inplace=True)
    print(x_train.info())
    print(x_test.info())

    # 4) Vectorize the features with DictVectorizer.
    # Fixed: orient must be 'records' — the abbreviated 'record' was
    # deprecated in pandas 1.0 and is rejected by modern pandas.
    dict_vectorizer = DictVectorizer(sparse=False)
    x_train = dict_vectorizer.fit_transform(x_train.to_dict(orient='records'))
    print(dict_vectorizer.feature_names_)
    x_test = dict_vectorizer.transform(x_test.to_dict(orient='records'))

    # 5) Train models, comparing them with 5-fold cross-validation.
    forest_classifier = RandomForestClassifier()
    xgb_classifier = XGBClassifier()
    forest_mean_score = cross_val_score(forest_classifier, x_train, y_train, cv=5).mean()
    print(forest_mean_score)
    xgb_mean_score = cross_val_score(xgb_classifier, x_train, y_train, cv=5).mean()
    print(xgb_mean_score)

    # 6) Parallel grid search for a better XGBoost hyper-parameter combination.
    params = {
        'max_depth': range(2, 8),
        'n_estimators': range(100, 1200, 200),
        'learning_rate': [0.05, 0.1, 0.25, 0.5, 1.0]
    }
    xgbc_best = XGBClassifier()
    grid_search_cv = GridSearchCV(xgbc_best, params, n_jobs=-1, cv=5)
    grid_search_cv.fit(x_train, y_train)
    print(grid_search_cv.best_score_)
    print(grid_search_cv.best_params_)

    # 7) Predict on the test set and write the results to the submission file.
    predict_result = grid_search_cv.predict(x_test)
    submission_data = pandas.DataFrame({'PassengerId': test_data['PassengerId'],
                                        'Survived': predict_result})
    submission_data.to_csv('data/submission/titanic_submission.csv', index=False)