def test_xor():
    """Check that tree classifiers perfectly separate a XOR problem."""
    # Build a 10x10 XOR/checkerboard target over integer grid coordinates.
    labels = np.zeros((10, 10))
    labels[:5, :5] = 1
    labels[5:, 5:] = 1
    rows, cols = np.indices(labels.shape)
    X = np.vstack([rows.ravel(), cols.ravel()]).T
    y = labels.ravel()

    # Both tree variants, with and without feature subsampling, must achieve
    # a perfect training score on this problem.
    for factory in (tree.DecisionTreeClassifier, tree.ExtraTreeClassifier):
        for params in ({}, {"max_features": 1}):
            clf = factory(**params)
            clf.fit(X, y)
            assert_equal(clf.score(X, y), 1.0)
def trees(x_train, x_test, y_train, y_test):
    """Fit both single-tree classifiers, cache them in the global
    ``modelPack``, and return ``[(accuracy, model_name), ...]``."""
    results = []
    print("hello trees")

    # Plain decision tree.
    dt = tree.DecisionTreeClassifier()
    dt.fit(x_train, y_train)
    print("fiting")
    dt_acc = accuracy_score(y_test, dt.predict(x_test))
    modelPack['DecisionTreeClassifier'] = dt
    results.append((dt_acc, "DecisionTreeClassifier"))

    # Extremely randomized tree.
    et = tree.ExtraTreeClassifier()
    et.fit(x_train, y_train)
    et_acc = accuracy_score(y_test, et.predict(x_test))
    modelPack['ExtraTreeClassifier'] = et
    results.append((et_acc, "ExtraTreeClassifier"))

    print(results)
    return results
def get_algorithms():
    """Build the catalogue of candidate classifiers, keyed by short code.

    Returns:
        dict: abbreviation -> unfitted scikit-learn classifier instance.
    """
    algorithms = {}
    # Ensemble methods
    algorithms["ada"] = ensemble.AdaBoostClassifier()
    algorithms["bc"] = ensemble.BaggingClassifier()
    algorithms["etc"] = ensemble.ExtraTreesClassifier()
    algorithms["gbc"] = ensemble.GradientBoostingClassifier()
    algorithms["rfc"] = ensemble.RandomForestClassifier()
    # Gaussian processes
    algorithms["gpc"] = gaussian_process.GaussianProcessClassifier()
    # Linear models
    algorithms["lr"] = linear_model.LogisticRegressionCV()
    algorithms["pac"] = linear_model.PassiveAggressiveClassifier()
    algorithms["rcc"] = linear_model.RidgeClassifierCV()
    algorithms["sgd"] = linear_model.SGDClassifier()
    algorithms["per"] = linear_model.Perceptron()
    # Naive Bayes
    algorithms["bnb"] = naive_bayes.BernoulliNB()
    algorithms["gnb"] = naive_bayes.GaussianNB()
    # Nearest neighbours
    algorithms["knn"] = neighbors.KNeighborsClassifier()
    # SVM (probability=True enables predict_proba on the SVC variants)
    algorithms["svc"] = svm.SVC(probability=True)
    algorithms["nvc"] = svm.NuSVC(probability=True)
    algorithms["lvc"] = svm.LinearSVC()
    # Trees
    algorithms["dtc"] = tree.DecisionTreeClassifier()
    algorithms["ets"] = tree.ExtraTreeClassifier()
    # Discriminant analysis
    algorithms["lda"] = discriminant_analysis.LinearDiscriminantAnalysis()
    algorithms["qda"] = discriminant_analysis.QuadraticDiscriminantAnalysis()
    return algorithms
def test_grids_list_get(self):
    """The grids_list endpoint reports one entry per completed grid search.

    The list starts empty, then grows to 1 and 2 entries as two
    ATGridSearchCV runs (decision tree, extra tree) finish on iris.
    """
    iris = load_iris()
    client = DjangoClient()
    # No grid searches registered yet -> empty payload.
    response = client.get(reverse('grids_list'))
    self.assertEqual(200, response.status_code)
    self.assertEqual(0, len(response.data))
    gs1 = ATGridSearchCV(tree.DecisionTreeClassifier(), {
        'criterion': ['gini', 'entropy'],
        'max_depth': range(1, 6),
        'max_features': ['auto', 'log2']
    }, webserver_url=self.live_server_url)
    # NOTE(review): wait() presumably blocks until the asynchronous fit
    # completes — confirm against ATGridSearchCV's fit return type.
    wait(gs1.fit(iris.data, iris.target))
    response = client.get(reverse('grids_list'))
    self.assertEqual(200, response.status_code)
    self.assertEqual(1, len(response.data))
    # Second search with a different estimator should add a second entry.
    gs2 = ATGridSearchCV(tree.ExtraTreeClassifier(), {
        'criterion': ['gini', 'entropy'],
        'max_depth': range(1, 6),
        'max_features': ['auto', 'log2']
    }, webserver_url=self.live_server_url)
    wait(gs2.fit(iris.data, iris.target))
    response = client.get(reverse('grids_list'))
    self.assertEqual(200, response.status_code)
    self.assertEqual(2, len(response.data))
def use_dtree(X_train, X_test, y_train, y_test):
    """Fit an extremely randomized tree and print its classification report.

    NOTE(review): despite the name, this trains an ExtraTreeClassifier,
    not a DecisionTreeClassifier — confirm that is intended.

    Returns the fitted classifier.
    """
    model = tree.ExtraTreeClassifier().fit(X_train, y_train)
    report = metrics.classification_report(y_test, model.predict(X_test))
    print("Classification report for classifier %s:\n%s\n" % (model, report))
    return model
def test_sk_ExtraTreeClassifier():
    """Fit an ExtraTreeClassifier on iris and upload it with one feature vector."""
    print("Testing sklearn, ExtraTreeClassifier...")
    features, labels = iris_data
    model = tree.ExtraTreeClassifier()
    model.fit(features, labels)
    metadata = {'name': "ExtraTreeClassifier test"}
    # Upload the fitted model together with a sample feature vector.
    upload(model, features[0, :], metadata)
def init_decision_tree(self) -> None:
    """Register both single-tree classifiers under the "tree" model family."""
    self.models.extend(
        [tree.DecisionTreeClassifier(),
         tree.ExtraTreeClassifier()])
    # Map each short key to its family name for later lookup.
    for key in ("decision", "extra"):
        self.model_keys[key] = "tree"
def test_classification_toy():
    """Check classification on a toy dataset."""
    # Each tree variant is exercised twice: with default settings, and with
    # feature subsampling pinned by a fixed random state. All four runs are
    # expected to reproduce the known true labels exactly.
    for factory in (tree.DecisionTreeClassifier, tree.ExtraTreeClassifier):
        for params in ({}, {"max_features": 1, "random_state": 1}):
            clf = factory(**params)
            clf.fit(X, y)
            assert_array_equal(clf.predict(T), true_result)
def fit(self, XTrain, yTrain):
    """Fit ``n_estimators`` extremely randomized trees on resampled data.

    Each tree is trained on a sample of row indices drawn by
    ``self.sample`` and appended to the ensemble list ``self.h``.
    """
    for _ in range(self.n_estimators):
        estimator = tree.ExtraTreeClassifier(max_features="log2")
        sample_idx = self.sample(XTrain.shape[0])
        self.h.append(estimator.fit(XTrain[sample_idx], yTrain[sample_idx]))
def train_test(x_tr, y_tr, x_te, y_te, name):
    """Grid-search the classifier selected by ``name`` and score it.

    Args:
        x_tr, y_tr: training features/labels.
        x_te, y_te: test features/labels.
        name: key into the algorithm table below; the matching parameter
            grid is read from ``CVParameters.<name>``.

    Returns:
        dict: ``{name: {test/train accuracy, weighted F1 scores, fit time,
        best grid params}}``, or ``{}`` if the grid search failed.
    """
    algorithms = {
        'ada_boost': ensemble.AdaBoostClassifier(),
        'bagging': ensemble.BaggingClassifier(),
        'extra_trees': ensemble.ExtraTreesClassifier(),
        'random_forest': ensemble.RandomForestClassifier(),
        'logistic_regression': linear_model.LogisticRegression(),
        'passive_aggressive': linear_model.PassiveAggressiveClassifier(),
        'ridge': linear_model.RidgeClassifier(),
        'sgd': linear_model.SGDClassifier(),
        'bernoulli': naive_bayes.BernoulliNB(),
        'gaussian': naive_bayes.GaussianNB(),
        'k_neighbors': neighbors.KNeighborsClassifier(),
        'nearest_centroid': neighbors.NearestCentroid(),
        'mlp': neural_network.MLPClassifier(),
        'linear_svc': svm.LinearSVC(),
        'decision_tree': tree.DecisionTreeClassifier(),
        'extra_tree': tree.ExtraTreeClassifier(),
        'gradient_boosting': ensemble.GradientBoostingClassifier(),
        'hist_gradient_boosting': HistGradientBoostingClassifier()
    }
    res = {}
    try:
        clf = GridSearchCV(algorithms.get(name),
                           getattr(CVParameters, name),
                           cv=2,
                           n_jobs=-1)
        # BUG FIX: time.clock() was removed in Python 3.8; perf_counter()
        # is the recommended monotonic replacement for timing.
        start = time.perf_counter()
        clf.fit(x_tr, y_tr)
        tr_time = time.perf_counter() - start
        print(tr_time)
        print(clf.best_params_)
        print(clf.best_score_)
        tr_score = clf.score(x_tr, y_tr)
        score = clf.score(x_te, y_te)
        tr_fscore = f1_score(y_tr, clf.predict(x_tr), average='weighted')
        fscore = f1_score(y_te, clf.predict(x_te), average='weighted')
        print(tr_score, score, tr_fscore, fscore)
        res = {
            name: {
                'test': score,
                'train': tr_score,
                'f1_test': fscore,
                'f1_train': tr_fscore,
                'tr_time': tr_time
            }
        }
        res[name].update(clf.best_params_)
    except Exception as e:
        # Best effort: a failed grid search yields an empty result rather
        # than aborting a sweep over several algorithm names.
        print(e)
    return res
def ModelSelection(test_data, features, label):
    """Fit a battery of classifiers and rank them by hold-out accuracy.

    Returns (MLA_compare DataFrame sorted by score, x_train, x_test,
    y_train, y_test).

    NOTE(review): the ``test_data`` parameter is never used — the split
    below reads the module-level ``train_data`` instead; confirm that is
    intended.
    """
    MLA = [
        ensemble.AdaBoostClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.ExtraTreesClassifier(),
        ensemble.GradientBoostingClassifier(),
        ensemble.RandomForestClassifier(),
        gaussian_process.GaussianProcessClassifier(),
        linear_model.LogisticRegressionCV(),
        linear_model.PassiveAggressiveClassifier(),
        linear_model.RidgeClassifierCV(),
        linear_model.SGDClassifier(),
        linear_model.Perceptron(),
        naive_bayes.BernoulliNB(),
        naive_bayes.GaussianNB(),
        neighbors.KNeighborsClassifier(),
        svm.SVC(probability=True),
        svm.NuSVC(probability=True),
        svm.LinearSVC(),
        tree.DecisionTreeClassifier(),
        tree.ExtraTreeClassifier(),
        discriminant_analysis.LinearDiscriminantAnalysis(),
        discriminant_analysis.QuadraticDiscriminantAnalysis(),
    ]
    MLA_columns = ['MLA Name', 'MLA Parameters', 'MLA Score']
    MLA_compare = pd.DataFrame(columns=MLA_columns)
    x_train, x_test, y_train, y_test = train_test_split(train_data[features],
                                                        train_data[label],
                                                        test_size=0.2)
    row_index = 0
    # NOTE(review): this starts as the label Series and then gets one extra
    # entry per algorithm name below — verify the mixed contents are wanted.
    MLA_predict = train_data[label]
    for alg in MLA:
        MLA_name = alg.__class__.__name__
        MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
        MLA_compare.loc[row_index, 'MLA Parameters'] = str(alg.get_params())
        alg.fit(x_train, y_train)
        MLA_predict[MLA_name] = alg.predict(x_test)
        # Hold-out accuracy on the 20% split.
        MLA_compare.loc[row_index, 'MLA Score'] = alg.score(x_test, y_test)
        row_index += 1
    # Best-scoring algorithms first.
    MLA_compare.sort_values(by=['MLA Score'], ascending=False, inplace=True)
    return MLA_compare, x_train, x_test, y_train, y_test
def main():
    """Cross-validate several classifiers (recall metric) on the CSV in argv[1]."""
    frame = pd.read_csv(sys.argv[1], header=0)
    # First column is the label; remaining columns are features (rows 0-183).
    features = frame.iloc[0:184, 1:].to_numpy()
    labels = frame.iloc[0:184, 0].to_numpy()

    classifiers = [
        svm.SVC(kernel='linear', C=1),
        tree.DecisionTreeClassifier(),
        tree.ExtraTreeClassifier(),
        neural_network.MLPClassifier(),
        GaussianNB(),
        SGDClassifier(),
    ]
    splitter = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
    # Print per-split recall scores for each classifier.
    for clf in classifiers:
        print(cross_val_score(clf, features, labels, cv=splitter,
                              scoring='recall'))
def all_classifiers():
    """Return one unfitted instance of every candidate classifier."""
    models = []
    # Ensemble methods
    models += [
        ensemble.AdaBoostClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.ExtraTreesClassifier(),
        ensemble.GradientBoostingClassifier(),
        ensemble.RandomForestClassifier(),
    ]
    # Gaussian processes
    models += [gaussian_process.GaussianProcessClassifier()]
    # Generalized linear models
    models += [
        linear_model.LogisticRegressionCV(),
        linear_model.PassiveAggressiveClassifier(),
        linear_model.RidgeClassifierCV(),
        linear_model.SGDClassifier(),
        linear_model.Perceptron(),
    ]
    # Naive Bayes
    models += [naive_bayes.BernoulliNB(), naive_bayes.GaussianNB()]
    # Nearest neighbours
    models += [neighbors.KNeighborsClassifier()]
    # Support vector machines
    models += [
        svm.SVC(probability=True),
        svm.NuSVC(probability=True),
        svm.LinearSVC(),
    ]
    # Trees
    models += [tree.DecisionTreeClassifier(), tree.ExtraTreeClassifier()]
    # Discriminant analysis
    models += [
        discriminant_analysis.LinearDiscriminantAnalysis(),
        discriminant_analysis.QuadraticDiscriminantAnalysis(),
    ]
    # xgboost: http://xgboost.readthedocs.io/en/latest/model.html
    models += [XGBClassifier()]
    return models
# Train on seasons before 2013, evaluate on the 2013 season.
train = df[df['game_year'] < 2013]
test = df[df['game_year'] == 2013]
xTrain = scale(train[xColumns])
yTrain = train['homeWin']
xTest = scale((test[xColumns]))
yTest = test['homeWin']
logreg = LogisticRegression()
logreg.fit(xTrain, yTrain)
yHat = logreg.predict(xTest)
# Fraction of correct predictions (labels are 0/1, so 1 - |diff| is a hit).
print sum([1 - abs(x) for x in (yHat - yTest)]) / float(len(yHat))

# Compare all models for each year.
naive_bayes = GaussianNB()
treeClf = tree.DecisionTreeClassifier()
extraTtreeClf = tree.ExtraTreeClassifier()
randomTreeClf = RandomForestClassifier(n_estimators=100)
extraTreeClf = ExtraTreesClassifier(n_estimators=100)
gBoostTreeClf = GradientBoostingClassifier(n_estimators=100)
# NOTE(review): Lasso is a regressor — its predictions are continuous,
# unlike the classifiers in this list; confirm it belongs here.
lassoClf = sklearn.linear_model.Lasso()
logiClf = LogisticRegression()
rbf_svc = svm.SVC(kernel='rbf', gamma=0.7, C=1.0)
lin_svc = svm.LinearSVC(C=1.0)
for clf in [
        treeClf, extraTtreeClf, randomTreeClf, extraTreeClf, gBoostTreeClf,
        lassoClf, logiClf, rbf_svc, lin_svc, naive_bayes
]:
    print clf
    # Walk the seasons forward, training on all earlier years each time.
    for year in range(2009, 2014):
        train = df[df['game_year'] < year]
        test = df[df['game_year'] == year]
#Navies Bayes naive_bayes.BernoulliNB(), #naive_bayes.GaussianNB(), #Nearest Neighbor neighbors.KNeighborsClassifier(), #SVM svm.SVC(probability=True), svm.NuSVC(probability=True), svm.LinearSVC(), #Trees tree.DecisionTreeClassifier(), tree.ExtraTreeClassifier(), #Discriminant Analysis #discriminant_analysis.LinearDiscriminantAnalysis(), #discriminant_analysis.QuadraticDiscriminantAnalysis(), ] for methode in Methodes: clf = methode scores = model_selection.cross_val_score(clf, train_vectors, train["target"], cv=3, scoring="f1") print(str(methode) + str(scores))
def get_stacking(X_train, Y_train, X_test, Y_test, batch_size, epochs):
    """Assemble the base classifiers and a StackingCVClassifier meta-model.

    Returns:
        tuple: (clf_list, clf_stacking) where clf_list is a list of
        (name, estimator) pairs — the base classifiers plus the stacking
        ensemble — and clf_stacking is the bare stacking estimator.

    NOTE(review): X_test/Y_test are unused here; batch_size/epochs feed
    only the F22StackingCnn, which itself is built but never added to
    base_clfs — confirm that is intended.
    """
    from sklearn.ensemble import GradientBoostingClassifier
    # `kwargs` is reused for each estimator; each block below overwrites it
    # with the tuned hyper-parameters for that estimator.
    kwargs = dict(learning_rate=0.1)
    clf_gbc = (CLF_GBC, GradientBoostingClassifier(**kwargs))
    clf_gnb = ('GaussianNB', GaussianNB())
    kwargs = None
    kwargs = dict(alpha=0.631578947368421)
    clf_multinomial_nb = ('MultinomialNB', MultinomialNB(**kwargs))
    # kwargs = None
    kwargs = dict(alpha=1.894736842105263, norm=False)
    clf_complement_nb = ('ComplementNB', ComplementNB(**kwargs))
    # kwargs = None
    kwargs = {'alpha': 0.3157894736842105, 'binarize': 0.5263157894736842}
    clf_bernoulli_nb = ('BernoulliNB', BernoulliNB(**kwargs))
    # kwargs = {'max_depth': 5}
    kwargs = {
        'criterion': 'gini',
        'max_depth': 4.0,
        'max_features': None,
        'min_samples_leaf': 0.1,
        'min_samples_split': 0.1,
        'min_weight_fraction_leaf': 0.0,
        'random_state': 42,
        'splitter': 'best'
    }
    clf_tree_dec = ('DecisionTreeClassifier',
                    tree.DecisionTreeClassifier(**kwargs))
    clf_tree_extra = (CLF_EXTRA_TREE, tree.ExtraTreeClassifier())
    # kwargs = dict(kernel="linear", C=0.025, cache_size=200, probability=True)
    kwargs = dict(C=10,
                  gamma=0.001,
                  kernel='rbf',
                  random_state=RANDOM_SEED,
                  probability=True)
    clf_svm = (CLF_SVC, svm.SVC(**kwargs))
    # kwargs = {'C': 100.0}
    kwargs = {
        'C': 100.0,
        'dual': False,
        'fit_intercept': True,
        'multi_class': 'multinomial',
        'penalty': 'l2',
        'solver': 'saga'
    }
    clf_log_reg_100 = ('LogisticRegression_100', LogisticRegression(**kwargs))
    clf_log_reg_10k = ('LogisticRegression_10k', LogisticRegression(C=10000))
    # kwargs = {'n_neighbors': 3}
    kwargs = {
        'algorithm': 'auto',
        'leaf_size': 20,
        'metric': 'minkowski',
        'n_jobs': 6,
        'n_neighbors': 5,
        'p': 2,
        'weights': 'distance'
    }
    clf_kneighbors = (CLF_KNN, KNeighborsClassifier(**kwargs))
    clf_rand_forest_50 = (CLF_RANDOM_50,
                          RandomForestClassifier(n_estimators=50, n_jobs=12))
    clf_rand_forest_10 = (CLF_RANDOM_5,
                          RandomForestClassifier(max_depth=5,
                                                 n_estimators=10,
                                                 max_features=1,
                                                 n_jobs=12))
    # RBM feature extraction piped into a logistic regression.
    bernoulli_rbm = (
        CLF_BERNOULLI_RBM,
        Pipeline(steps=[(
            'rbm',
            BernoulliRBM(
                n_components=200, n_iter=1, learning_rate=0.01, verbose=False)
        ), ('logistic', LogisticRegression(C=10000))]))
    mlp = (CLF_MLP,
           MLPClassifier(hidden_layer_sizes=(75, ),
                         max_iter=250,
                         alpha=1e-4,
                         solver='sgd',
                         verbose=0,
                         tol=1e-4,
                         random_state=RANDOM_SEED,
                         learning_rate_init=.1,
                         early_stopping=True))
    adaboost = ('AdaBoostClassifier', AdaBoostClassifier())
    # base_clfs = [clf_gnb, clf_multinomial_nb, clf_complement_nb, clf_bernoulli_nb, clf_tree_dec, clf_tree_extra,
    #              clf_svm, clf_log_reg_100, clf_log_reg_10k, clf_kneighbors, clf_rand_forest_50, clf_rand_forest_10,
    #              bernoulli_rbm, mlp, adaboost]
    clf_xgb = ('xgb',
               xgb.XGBClassifier(objective="multi:softprob",
                                 random_state=RANDOM_SEED))
    f22StackingCnn = ('F22StackingCnn', F22StackingCnn(batch_size, epochs))
    # base_clfs = [clf_gnb, clf_gbc, clf_xgb, clf_multinomial_nb, clf_complement_nb, clf_bernoulli_nb, clf_tree_dec, clf_tree_extra,
    #              clf_svm, clf_log_reg_100, clf_log_reg_10k, clf_kneighbors, clf_rand_forest_50, clf_rand_forest_10,
    #              bernoulli_rbm, mlp, adaboost]
    logging.info('Shape: {}'.format(X_train.shape))
    base_clfs = [
        clf_gnb, clf_gbc, clf_xgb, clf_multinomial_nb, clf_complement_nb,
        clf_bernoulli_nb, clf_tree_dec, clf_tree_extra, clf_svm,
        clf_log_reg_100, clf_log_reg_10k, clf_kneighbors, clf_rand_forest_50,
        clf_rand_forest_10, bernoulli_rbm, mlp, adaboost
    ]
    # Meta-learner for the stacking ensemble.
    kwargs = {
        'C': 100.0,
        'dual': False,
        'fit_intercept': True,
        'multi_class': 'multinomial',
        'penalty': 'l2',
        'solver': 'saga'
    }
    lr = LogisticRegression(**kwargs)
    classifiers = [x[1] for x in base_clfs]
    stacking = ('StackingCVClassifier',
                StackingCVClassifier(classifiers=classifiers,
                                     use_probas=True,
                                     use_features_in_secondary=True,
                                     meta_classifier=lr))
    clf_stacking = stacking[1]
    clf_list = base_clfs.copy()
    clf_list.append(stacking)
    return clf_list, clf_stacking
print(metrics.accuracy_score(y_test, predict_log)) # print(log_regression.predict_proba(testingdf)) logclassifier = pd.DataFrame(log_regression.predict_proba(testingdf)) # Decision Tree Classifer treereg = tree.DecisionTreeClassifier() treereg.fit(x_train, y_train) predicttree = treereg.predict(x_test) print("Decision Tree") # print(predicttree) print(metrics.accuracy_score(y_test, predicttree)) # print(treereg.predict_proba(testingdf)) treeclassifier = pd.DataFrame(treereg.predict_proba(testingdf)) # Extra Tree Classifier extrareg = tree.ExtraTreeClassifier() extrareg.fit(x_train, y_train) predictextree = extrareg.predict(x_test) print("Extra Tree Classifier") # print(predictextree) print(metrics.accuracy_score(y_test, predictextree)) # print(extrareg.predict_proba(testingdf)) extraclassifier = pd.DataFrame(extrareg.predict_proba(testingdf)) # Random Forest Classifier forestreg = ensemble.RandomForestClassifier() forestreg.fit(x_train, y_train) predictforest = forestreg.predict(x_test) print("Random Forest Classifier") # print(predictforest) print(metrics.accuracy_score(y_test, predictforest))
#ÁRBOLES DE DECISIÓN #Segundo algoritmo Árboles de decisión #Árbol de decisión normal arbNor = tree.DecisionTreeClassifier() arbNor = arbNor.fit(data_train, target_train) predADnor = arbNor.predict(data_test) scoresADnor = cross_val_score(arbNor, atributos, target, cv=5, scoring='accuracy') #Árbol de decisión extra arbEx = tree.ExtraTreeClassifier() arbEx = arbEx.fit(data_train, target_train) predADex = arbEx.predict(data_test) scoresADex = cross_val_score(arbEx, atributos, target, cv=5, scoring='accuracy') #Porcentajes de acierto print("Usando AD normal se tiene una tasa de acierto del ", np.mean(scoresADnor) * 100) print("Usando AD extra se tiene una tasa de acierto del ", np.mean(scoresADex) * 100) #Matrices de validación
linear_model.SGDClassifier(), linear_model.Perceptron(), #Navies Bayes naive_bayes.GaussianNB(), #Nearest Neighbor neighbors.KNeighborsClassifier(n_neighbors=3), #SVM svm.SVC(probability=True), svm.LinearSVC(), #Trees tree.DecisionTreeClassifier(), tree.ExtraTreeClassifier(), ] #split dataset in cross-validation with this splitter class: http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ShuffleSplit.html#sklearn.model_selection.ShuffleSplit #note: this is an alternative to train_test_split cv_split = model_selection.ShuffleSplit( n_splits=10, test_size=.3, train_size=.6, random_state=0 ) # run model 10x with 60/30 split intentionally leaving out 10% #create table to compare MLA MLA_columns = [ 'MLA Name', 'MLA Parameters', 'MLA Train Accuracy Mean', 'MLA Test Accuracy Mean', 'MLA Test Accuracy Min', 'MLA Time' ] MLA_compare = pd.DataFrame(columns=MLA_columns)
def main():
    """End-to-end Titanic pipeline: preprocess, explore, model, submit.

    Loads train/test CSVs, applies the shared preprocessing helpers, plots
    exploratory graphs, compares a battery of classifiers on the binned
    features, and writes the submission for the best estimator.
    """
    train_df = pd.read_csv("train.csv")
    test_df = pd.read_csv("test.csv")
    combine = [train_df, test_df]
    # Apply the same cleaning/feature-engineering steps to both frames.
    for df in combine:
        df.info()
        standardize_data(df)
        create_columns(df)
        create_bins(df)
        encode_data(df)
    # Define target (Y variable)
    target = ["Survived"]
    # Define features (X variables)
    train_df_x = [
        "Pclass",
        "Sex",
        "Age",
        "SibSp",
        "Parch",
        "Fare",
        "Embarked",
        "FamilySize",
        "IsAlone",
        "Title",
    ]
    # Define numerical features (binned and encoded)
    train_df_x_bin = [
        "Pclass",
        "Sex_Code",
        "AgeBin_Code",
        "FareBin_Code",
        "Embarked_Code",
        "FamilySize",
        "IsAlone",
        "Title_Code",
    ]
    # Analyze feature correlation with target (categorical features only).
    for x in train_df_x:
        if train_df[x].dtype != "float64":
            print(train_df[[x, target[0]]].groupby(x).mean())
    # Graph individual features by survival
    fig, axis = plt.subplots(1, 3, figsize=(9, 6))
    sns.histplot(x="Fare", data=train_df, hue="Survived", multiple="stack",
                 ax=axis[0])
    sns.histplot(x="Age", data=train_df, hue="Survived", multiple="stack",
                 ax=axis[1])
    sns.histplot(x="FamilySize", data=train_df, hue="Survived",
                 multiple="stack", ax=axis[2])
    fig, axis = plt.subplots(2, 3, figsize=(16, 12))
    sns.barplot(x="Pclass", y="Survived", data=train_df, ax=axis[0, 0])
    sns.barplot(x="Sex", y="Survived", data=train_df, ax=axis[0, 1])
    sns.barplot(x="Embarked", y="Survived", data=train_df, ax=axis[0, 2])
    sns.barplot(x="IsAlone", y="Survived", data=train_df, ax=axis[1, 0])
    sns.barplot(x="Title", y="Survived", data=train_df, ax=axis[1, 1])
    # Compare class with a 2nd feature
    fig, axis = plt.subplots(1, 3, figsize=(9, 6))
    sns.barplot(x="Pclass", y="Survived", data=train_df, hue="Sex",
                ax=axis[0])
    sns.barplot(x="Pclass", y="Survived", data=train_df, hue="IsAlone",
                ax=axis[1])
    sns.barplot(x="Pclass", y="Survived", data=train_df, hue="Embarked",
                ax=axis[2])
    # Compare Sex with a 2nd feature
    fig, axis = plt.subplots(1, 3, figsize=(9, 6))
    sns.barplot(x="Sex", y="Survived", data=train_df, hue="Pclass",
                ax=axis[0])
    sns.barplot(x="Sex", y="Survived", data=train_df, hue="IsAlone",
                ax=axis[1])
    sns.barplot(x="Sex", y="Survived", data=train_df, hue="Embarked",
                ax=axis[2])
    # Correlation heatmap of dataset
    fig, ax = plt.subplots(figsize=(14, 12))
    fig = sns.heatmap(
        train_df.corr(),
        cmap=sns.diverging_palette(240, 10, as_cmap=True),
        annot=True,
        ax=ax,
    )
    # Machine Learning Algorithm (MLA) selection and initialization
    mla = [
        linear_model.LogisticRegressionCV(),
        linear_model.SGDClassifier(),
        linear_model.Perceptron(),
        linear_model.PassiveAggressiveClassifier(),
        linear_model.RidgeClassifierCV(),
        svm.SVC(probability=True),
        svm.NuSVC(probability=True),
        svm.LinearSVC(dual=False),
        neighbors.KNeighborsClassifier(),
        gaussian_process.GaussianProcessClassifier(),
        naive_bayes.GaussianNB(),
        naive_bayes.BernoulliNB(),
        tree.DecisionTreeClassifier(),
        tree.ExtraTreeClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.RandomForestClassifier(),
        ensemble.ExtraTreesClassifier(),
        ensemble.AdaBoostClassifier(),
        ensemble.GradientBoostingClassifier(),
    ]
    # Rank the candidates, tune the best one, and emit the submission file.
    mla_compare = test_models(mla, train_df, train_df_x_bin, target)
    best_estimator = optimize_params(mla, mla_compare, train_df,
                                     train_df_x_bin, target)
    generate_submission_csv(test_df, train_df_x_bin, best_estimator)
# height, weight, shoes size X = [[181, 80, 44], [107, 70, 43], [160, 60, 38], [154, 54, 37], [166, 65, 40], [190, 90, 47], [175, 64, 39], [177, 70, 40], [159, 55, 37], [171, 75, 42], [181, 85, 43]] # gender Y = [ 'male', 'female', 'female', 'female', 'male', 'male', 'male', 'female', 'male', 'female', 'male' ] # decision tree classifier clf = tree.DecisionTreeClassifier() clf = clf.fit(X, Y) # decission tree regresor reg = tree.DecisionTreeRegressor() #reg = reg.fit(X, Y) # deccion tree, extremely randomized rand = tree.ExtraTreeClassifier() rand = rand.fit(X, Y) value = [[140, 180, 42]] clf_prediction = clf.predict(value) #reg_prediction = reg.predict(value) rand_prediction = rand.predict(value) print(clf_prediction) #print(reg_prediction) print(rand_prediction)
classification(linear_model.SGDClassifier(random_state=RANDOM_SEED)), classification_binary( linear_model.LogisticRegression(random_state=RANDOM_SEED)), classification_binary( linear_model.LogisticRegressionCV(random_state=RANDOM_SEED)), classification_binary( linear_model.RidgeClassifier(random_state=RANDOM_SEED)), classification_binary(linear_model.RidgeClassifierCV()), classification_binary( linear_model.SGDClassifier(random_state=RANDOM_SEED)), # Decision trees regression(tree.DecisionTreeRegressor(**TREE_PARAMS)), regression(tree.ExtraTreeRegressor(**TREE_PARAMS)), classification(tree.DecisionTreeClassifier(**TREE_PARAMS)), classification(tree.ExtraTreeClassifier(**TREE_PARAMS)), classification_binary(tree.DecisionTreeClassifier(**TREE_PARAMS)), classification_binary(tree.ExtraTreeClassifier(**TREE_PARAMS)), # Random forest regression(ensemble.RandomForestRegressor(**FOREST_PARAMS)), regression(ensemble.ExtraTreesRegressor(**FOREST_PARAMS)), classification(ensemble.RandomForestClassifier(**FOREST_PARAMS)), classification(ensemble.ExtraTreesClassifier(**FOREST_PARAMS)), classification_binary( ensemble.RandomForestClassifier(**FOREST_PARAMS)), classification_binary(ensemble.ExtraTreesClassifier(**FOREST_PARAMS)), ], # Following is the list of extra tests for languages/models which are # not fully supported yet.
def compare_algorithm(data, target):
    """Fit a battery of classifiers and print a table of hold-out metrics.

    Each algorithm is fit on a random train split and evaluated on the
    complementary split: train/test accuracy, precision, recall and AUC.
    """
    x_train, x_cross, y_train, y_cross = train_test_split(data, target)
    MLA = [
        # Ensemble Methods
        ensemble.AdaBoostClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.ExtraTreesClassifier(),
        ensemble.GradientBoostingClassifier(),
        ensemble.RandomForestClassifier(),
        # Gaussian Processes
        gaussian_process.GaussianProcessClassifier(),
        # GLM
        linear_model.LogisticRegressionCV(),
        linear_model.PassiveAggressiveClassifier(max_iter=1000, tol=0.001),
        linear_model.RidgeClassifierCV(),
        linear_model.SGDClassifier(max_iter=1000, tol=0.001),
        linear_model.Perceptron(max_iter=1000, tol=0.001),
        # Naive Bayes
        naive_bayes.BernoulliNB(),
        naive_bayes.GaussianNB(),
        # Nearest Neighbor
        neighbors.KNeighborsClassifier(),
        # SVM
        svm.SVC(probability=True),
        svm.NuSVC(probability=True),
        svm.LinearSVC(),
        # Trees
        tree.DecisionTreeClassifier(),
        tree.ExtraTreeClassifier(),
        # Discriminant Analysis
        discriminant_analysis.LinearDiscriminantAnalysis(),
        discriminant_analysis.QuadraticDiscriminantAnalysis(),
        # xgboost: http://xgboost.readthedocs.io/en/latest/model.html
        xgb.XGBClassifier()
    ]
    MLA_columns = []
    MLA_compare = pd.DataFrame(columns=MLA_columns)
    row_index = 0
    for alg in MLA:
        predicted = alg.fit(x_train, y_train).predict(x_cross)
        # NOTE(review): roc_curve is fed hard class predictions rather than
        # probabilities, so the AUC below reflects a single threshold.
        fp, tp, th = roc_curve(y_cross, predicted)
        MLA_name = alg.__class__.__name__
        MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
        MLA_compare.loc[row_index, 'MLA Train Accuracy'] = round(
            alg.score(x_train, y_train), 4)
        MLA_compare.loc[row_index, 'MLA Test Accuracy'] = round(
            alg.score(x_cross, y_cross), 4)
        MLA_compare.loc[row_index, 'MLA Precission'] = precision_score(
            y_cross, predicted)
        MLA_compare.loc[row_index,
                        'MLA Recall'] = recall_score(y_cross, predicted)
        MLA_compare.loc[row_index, 'MLA AUC'] = auc(fp, tp)
        row_index = row_index + 1
    # Best test accuracy first.
    MLA_compare.sort_values(by=['MLA Test Accuracy'],
                            ascending=False,
                            inplace=True)
    print(MLA_compare)
class ScikitLearnModelConverterTest(tf.test.TestCase, parameterized.TestCase):
  """Tests conversion of fitted scikit-learn tree models to TF models."""

  @parameterized.parameters(
      (tree.DecisionTreeRegressor(random_state=42),),
      (tree.ExtraTreeRegressor(random_state=42),),
      (ensemble.RandomForestRegressor(random_state=42),),
      (ensemble.ExtraTreesRegressor(random_state=42),),
      (ensemble.GradientBoostingRegressor(random_state=42,),),
      (ensemble.GradientBoostingRegressor(random_state=42, init="zero"),),
      (ensemble.GradientBoostingRegressor(
          random_state=42,
          init=tree.DecisionTreeRegressor(random_state=42),
      ),),
  )
  def test_convert_reproduces_regression_model(
      self,
      sklearn_tree,
  ):
    # The converted TF model must reproduce sklearn predictions, both when
    # used directly and after a save/load round trip.
    features, labels = datasets.make_regression(
        n_samples=100,
        n_features=10,
        random_state=42,
    )
    sklearn_tree.fit(features, labels)
    tf_features = tf.constant(features, dtype=tf.float32)
    with self.subTest(msg="inference_is_reproduced_before_save"):
      tf_tree = scikit_learn_model_converter.convert(sklearn_tree)
      tf_labels = tf_tree(tf_features).numpy().ravel()
      sklearn_labels = sklearn_tree.predict(features).astype(np.float32)
      self.assertAllClose(sklearn_labels, tf_labels, rtol=1e-5)
    with self.subTest(msg="inference_is_reproduced_after_save"):
      path = pathlib.Path(self.get_temp_dir())
      tf_tree = scikit_learn_model_converter.convert(
          sklearn_tree,
          intermediate_write_path=path / "intermediate_path",
      )
      tf.saved_model.save(obj=tf_tree, export_dir=path)
      loaded_tf_tree = tf.saved_model.load(path)
      self.assertAllEqual(tf_tree(tf_features), loaded_tf_tree(tf_features))

  @parameterized.parameters((tree.DecisionTreeClassifier(random_state=42),),
                            (tree.ExtraTreeClassifier(random_state=42),),
                            (ensemble.RandomForestClassifier(random_state=42),),
                            (ensemble.ExtraTreesClassifier(random_state=42),))
  def test_convert_reproduces_classification_model(
      self,
      sklearn_tree,
  ):
    features, labels = datasets.make_classification(
        n_samples=100,
        n_features=10,
        n_classes=4,
        n_clusters_per_class=1,
        random_state=42,
    )
    sklearn_tree.fit(features, labels)
    tf_features = tf.constant(features, dtype=tf.float32)
    with self.subTest(msg="inference_is_reproduced_before_save"):
      tf_tree = scikit_learn_model_converter.convert(sklearn_tree)
      tf_labels = tf_tree(tf_features).numpy()
      # Classification models are compared on class probabilities.
      sklearn_labels = sklearn_tree.predict_proba(features).astype(np.float32)
      self.assertAllClose(sklearn_labels, tf_labels, rtol=1e-5)
    with self.subTest(msg="inference_is_reproduced_after_save"):
      path = pathlib.Path(self.get_temp_dir())
      tf_tree = scikit_learn_model_converter.convert(
          sklearn_tree,
          intermediate_write_path=path / "intermediate_path",
      )
      tf.saved_model.save(obj=tf_tree, export_dir=path)
      loaded_tf_tree = tf.saved_model.load(path)
      self.assertAllEqual(tf_tree(tf_features), loaded_tf_tree(tf_features))

  def test_convert_raises_when_unrecognised_model_provided(self):
    features, labels = datasets.make_regression(
        n_samples=100,
        n_features=10,
        random_state=42,
    )
    # Non-tree estimators are not convertible.
    sklearn_model = linear_model.LinearRegression().fit(features, labels)
    with self.assertRaises(NotImplementedError):
      scikit_learn_model_converter.convert(sklearn_model)

  def test_convert_raises_when_sklearn_model_is_not_fit(self):
    with self.assertRaises(
        ValueError,
        msg="Scikit-learn model must be fit to data before converting to TF.",
    ):
      _ = scikit_learn_model_converter.convert(tree.DecisionTreeRegressor())

  def test_convert_raises_when_regression_target_is_multivariate(self):
    features, labels = datasets.make_regression(
        n_samples=100,
        n_features=10,
        # This produces a two-dimensional target variable.
        n_targets=2,
        random_state=42,
    )
    sklearn_tree = tree.DecisionTreeRegressor().fit(features, labels)
    with self.assertRaisesRegex(
        ValueError,
        "Only scalar regression and single-label classification are supported.",
    ):
      _ = scikit_learn_model_converter.convert(sklearn_tree)

  def test_convert_raises_when_classification_target_is_multilabel(self):
    features, labels = datasets.make_multilabel_classification(
        n_samples=100,
        n_features=10,
        # This assigns two class labels per example.
        n_labels=2,
        random_state=42,
    )
    sklearn_tree = tree.DecisionTreeClassifier().fit(features, labels)
    with self.assertRaisesRegex(
        ValueError,
        "Only scalar regression and single-label classification are supported.",
    ):
      _ = scikit_learn_model_converter.convert(sklearn_tree)

  def test_convert_uses_intermediate_model_path_if_provided(self):
    features, labels = datasets.make_classification(
        n_samples=100,
        n_features=10,
        n_classes=4,
        n_clusters_per_class=1,
        random_state=42,
    )
    sklearn_tree = tree.DecisionTreeClassifier().fit(features, labels)
    write_path = self.create_tempdir()
    _ = scikit_learn_model_converter.convert(
        sklearn_tree,
        intermediate_write_path=write_path,
    )
    # We should be able to load the intermediate TFDF model from the given path.
    tfdf_tree = tf.keras.models.load_model(write_path)
    self.assertIsInstance(tfdf_tree, tf.keras.Model)

  def test_convert_sklearn_tree_to_tfdf_pytree_raises_if_weight_provided_for_classification_tree(
      self):
    features, labels = datasets.make_classification(random_state=42)
    sklearn_tree = tree.DecisionTreeClassifier(random_state=42).fit(
        features,
        labels,
    )
    with self.assertRaisesRegex(
        ValueError,
        "weight should not be passed for classification trees.",
    ):
      _ = scikit_learn_model_converter.convert_sklearn_tree_to_tfdf_pytree(
          sklearn_tree,
          weight=0.5,
      )

  def test_convert_raises_when_gbt_initial_estimator_is_not_tree_or_constant(
      self):
    features, labels = datasets.make_regression(
        n_samples=100,
        n_features=10,
        random_state=42,
    )
    init_estimator = linear_model.LinearRegression()
    sklearn_model = ensemble.GradientBoostingRegressor(init=init_estimator)
    sklearn_model.fit(features, labels)
    with self.assertRaises(ValueError):
      _ = scikit_learn_model_converter.convert(sklearn_model)
dataset_format='array', target=dataset.default_target_attribute) print("Categorical features: {}".format(categorical_indicator)) enc = preprocessing.OneHotEncoder(categorical_features=categorical_indicator) X = enc.fit_transform(X) clf.fit(X, y) ############################################################################ # Runs: Easily explore models # ^^^^^^^^^^^^^^^^^^^^^^^^^^^ # We can run (many) scikit-learn algorithms on (many) OpenML tasks. # Get a task task = openml.tasks.get_task(403) # Build any classifier or pipeline clf = tree.ExtraTreeClassifier() # Run the flow run = openml.runs.run_model_on_task(clf, task) # pprint(vars(run), depth=2) ############################################################################ # Share the run on the OpenML server # # So far the run is only available locally. By calling the publish function, # the run is sent to the OpenML server: myrun = run.publish() # For this tutorial, our configuration publishes to the test server # as to not pollute the main server.
# Collapse the one-hot test labels to class indices for scikit-learn.
y_scikit_test = np.where(y_test == 1)[1]
print('--------------------------------------------------------')
print('Modelling on {}'.format(column_name))
# print('Tick Length: {}'.format(value))
print('X Variables with lookback of {}'.format(lookbacklen))
print('Difference calculation period {}'.format(period))
print(
    'Artificial tagging on historical data to capture movement of +/- {} points'
    .format(movement_required))
print('Decisions: {}'.format(list(decisions)))
###### Tree
clf_DTC = tree.DecisionTreeClassifier()
clf_DTC = clf_DTC.fit(x_train, y_scikit_train)
clf_ETC = tree.ExtraTreeClassifier()
clf_ETC = clf_ETC.fit(x_train, y_scikit_train)
###### Neighbors
clf_KNC = neighbors.KNeighborsClassifier()
clf_KNC = clf_KNC.fit(x_train, y_scikit_train)
###### Linear model
clf_RCCV = linear_model.RidgeClassifierCV()
clf_RCCV = clf_RCCV.fit(x_train, y_scikit_train)
###### Ensemble
clf_RFC = ensemble.RandomForestClassifier()
clf_RFC = clf_RFC.fit(x_train, y_scikit_train)
clf_ETC_ens = ensemble.ExtraTreesClassifier()
predict_y = model.fit(train_X, train_failed_y).predict(test_X) #只看挂科这一类分类效果好 坏 predict_failed_y = [1 - x for x in predict_y] print("------Bagging---------") f1 = f1_score(test_failed_y, predict_failed_y) print("f1:%f" % f1) macro_auc = roc_auc_score(test_failed_y, predict_failed_y, average="macro") print("macro auc:%f" % macro_auc) accuracy = accuracy_score(test_failed_y, predict_failed_y) print("accuracy:%f" % accuracy) precision = precision_score(test_failed_y, predict_failed_y) print("precision:%f" % precision) recall = recall_score(test_failed_y, predict_failed_y) print("recall:%f" % recall) model = tree.ExtraTreeClassifier(random_state=random_state) predict_y = model.fit(train_X, train_failed_y).predict(test_X) #只看挂科这一类分类效果好 坏 predict_failed_y = [1 - x for x in predict_y] print("------ExtraTree---------") f1 = f1_score(test_failed_y, predict_failed_y) print("f1:%f" % f1) macro_auc = roc_auc_score(test_failed_y, predict_failed_y, average="macro") print("macro auc:%f" % macro_auc) accuracy = accuracy_score(test_failed_y, predict_failed_y) print("accuracy:%f" % accuracy) precision = precision_score(test_failed_y, predict_failed_y) print("precision:%f" % precision) recall = recall_score(test_failed_y, predict_failed_y) print("recall:%f" % recall)
# --- Fragment: tutorial exercise evaluating a tree classifier.  The first
# `clf` was fit before this chunk; trainObs/trainCls/testObs/testCls/labs
# are defined outside the visible range. ---
dt_pred = clf.predict(testObs)
print(dt_pred)
# Accuracy = fraction of test labels matching predictions.
print("DT Accuracy:")
print((sum(testCls == dt_pred)) / len(dt_pred))
# Confusion matrix via scikit-learn, with a fixed label ordering.
dt_tab = confusion_matrix(testCls, dt_pred, labels=labs)
print(dt_tab)
# Classification report: precision, recall, and F-measure per class.
print(metrics.classification_report(testCls, dt_pred))
# Exercise 2: Use the evaluation metric code from the KNN example to assess
# the quality of your decision tree classifier. Did you find any differences?
# You can also use a measure of entropy as the split criterion by passing
# criterion="entropy".  NOTE(review): this second model is an
# ExtraTreeClassifier, not a DecisionTreeClassifier, despite the "DT" label
# printed below.
clf = tree.ExtraTreeClassifier(criterion="entropy")
clf = clf.fit(trainObs, trainCls)
print(clf)
dt_pred = clf.predict(testObs)
print(dt_pred)
# Accuracy of the entropy-based tree.
print("DT Entropy Accuracy:")
print((sum(testCls == dt_pred)) / len(dt_pred))
# Confusion matrix for the entropy-based tree.
dt_tab = confusion_matrix(testCls, dt_pred, labels=labs)
print(dt_tab)
# Classification report for the entropy-based tree.
print(metrics.classification_report(testCls, dt_pred))
# CV scores clf = model.fit(X_model, y_model) auc_scores(clf, X_model, y_model) try: print('\n') print('Feature importance:\{}'.format(feature_importance(df.loc[M, model_features], y_model, model))) except: print('NA') continue # Scaled Tree Plots clf = tree.DecisionTreeClassifier(max_depth=5, max_features=20, random_state=0).fit( df.loc[M, model_features], df_tgt.loc[M, targets[:5]]) show_tree(clf, list(model_features), ['kidney', 'heart', 'lung', 'liver', 'pancreas']) clf = tree.ExtraTreeClassifier(max_depth=5, max_features=20, random_state=0).fit( df.loc[M, model_features], df_tgt.loc[M, targets[:5]]) show_tree(clf, list(model_features), ['kidney', 'heart', 'lung', 'liver', 'pancreas']) # Reverse scaling df.loc[M, floats] = scaler.inverse_transform(df.loc[M, floats]) # NOTE: each time scaler scales values! # Unscaled Tree plots clf = tree.DecisionTreeClassifier(max_depth=4, max_features=20, random_state=0).fit( df.loc[M, model_features], df_tgt.loc[M, targets[:5]]) show_tree(clf, list(model_features), ['kidney', 'heart', 'lung', 'liver', 'pancreas']) clf = tree.ExtraTreeClassifier(max_depth=5, max_features=20, random_state=0).fit( df.loc[M, model_features], df_tgt.loc[M, targets[:5]]) show_tree(clf, list(model_features), ['kidney', 'heart', 'lung', 'liver', 'pancreas']) # ---------------------------------------------> Fit model <-------------------------------------------- #
def model_dt(X_train, X_test, y_train, y_test):
    """Train and compare a battery of classifiers on one train/test split.

    For each model this prints a per-class classification report and the
    mean F1 over the first three classes (LOC/ORG/PER, excluding NON),
    persists the 20-tree random forest to ``final_randomforest.pkl`` and
    renders the gini decision tree via graphviz (``tree.dot``,
    ``tree.pdf`` and an inline PNG).

    Fixes vs. the original: the body now uses the ``y_train``/``y_test``
    parameters (it previously referenced the undefined names ``Y_train``/
    ``Y_test``), and the Python-2 ``print 'x', v`` statements — a syntax
    error under the Python 3 used elsewhere in this file — are now
    ``print()`` calls.

    Parameters
    ----------
    X_train, X_test : array-like; first two columns are identifiers and
        are dropped before fitting.
    y_train, y_test : array-like class labels for the two splits.

    Returns
    -------
    None (results are printed and written to disk).
    """
    # Drop the first two columns (id_sentence and id_word): identifiers,
    # not features.
    X_train = np.delete(np.array(X_train).astype(float), np.s_[0:2], axis=1)
    X_test = np.delete(np.array(X_test).astype(float), np.s_[0:2], axis=1)

    clf11 = GaussianNB()
    model11 = clf11.fit(
        np.array(X_train).astype(float), np.array(y_train).astype(float))
    predictions11 = model11.predict(np.array(X_test).astype(float))

    clf2 = ensemble.RandomForestClassifier(n_estimators=20)
    model2 = clf2.fit(X_train, y_train)
    predictions2 = model2.predict(X_test)
    # Persist the 20-tree forest for later reuse.
    _ = joblib.dump(model2, 'final_randomforest.pkl', compress=3)

    clf = tree.DecisionTreeClassifier(criterion='gini', splitter='best')
    model = clf.fit(X_train, y_train)
    predictions = model.predict(X_test)

    clf3 = tree.ExtraTreeClassifier()
    model3 = clf3.fit(X_train, y_train)
    predictions3 = model3.predict(X_test)

    clf4 = tree.DecisionTreeClassifier(criterion='entropy', splitter='best')
    model4 = clf4.fit(X_train, y_train)
    predictions4 = model4.predict(X_test)

    # NOTE(review): the labels do not match the estimator types ('lr' is a
    # decision tree, 'gnb' an extra tree); kept as-is, they are only names.
    clf5 = ensemble.VotingClassifier(
        estimators=[('lr', clf), ('rf', clf2), ('gnb', clf3)],
        voting='hard')
    model5 = clf5.fit(X_train, y_train)
    predictions5 = model5.predict(X_test)

    clf6 = tree.DecisionTreeClassifier(
        criterion='entropy', splitter='best', max_features='auto')
    model6 = clf6.fit(X_train, y_train)
    predictions6 = model6.predict(X_test)

    clf7 = tree.DecisionTreeClassifier(
        criterion='entropy', splitter='best', max_features='auto',
        max_depth=10)
    model7 = clf7.fit(X_train, y_train)
    predictions7 = model7.predict(X_test)

    clf8 = ensemble.RandomForestClassifier(n_estimators=5)
    model8 = clf8.fit(X_train, y_train)
    predictions8 = model8.predict(X_test)

    clf9 = ensemble.RandomForestClassifier(n_estimators=50)
    model9 = clf9.fit(X_train, y_train)
    predictions9 = model9.predict(X_test)

    target_names = ['LOC', 'ORG', 'PER', 'NON']
    # One report per model, in the original print order.
    #print(classification_report(y_test, predictions0, target_names=target_names, digits=3))
    for preds in (predictions, predictions2, predictions11, predictions3,
                  predictions4, predictions5, predictions6, predictions7,
                  predictions8, predictions9):
        print(
            classification_report(
                y_test, preds, target_names=target_names, digits=3))

    def _mean_f1(preds):
        # Mean F1 over the three entity classes (excludes NON).
        return sum(f1_score(y_test, preds, average=None)[0:3]) / 3.0

    print('media_mod1', _mean_f1(predictions))
    print('media_mod2', _mean_f1(predictions2))
    print('media_mod11', _mean_f1(predictions11))
    print('media_mod3', _mean_f1(predictions3))
    print('media_mod4', _mean_f1(predictions4))
    print('media_mod5', _mean_f1(predictions5))
    print('media_mod6', _mean_f1(predictions6))
    print('media_mod7', _mean_f1(predictions7))
    print('media_mod8', _mean_f1(predictions8))
    print('media_mod9', _mean_f1(predictions9))

    # Export the gini decision tree for inspection.
    tree.export_graphviz(clf, out_file='tree.dot')
    dot_data = tree.export_graphviz(clf, out_file=None)
    graph = pydotplus.graph_from_dot_data(dot_data)
    graph.write_pdf("tree.pdf")

    # NOTE(review): loading iris here only supplies feature/class names for
    # the rendered tree; they will not match this dataset's actual columns.
    iris = load_iris()
    dot_data = tree.export_graphviz(
        clf, out_file=None, feature_names=iris.feature_names,
        class_names=iris.target_names, filled=True, rounded=True,
        special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data)
    Image(graph.create_png())