# --- Gender target preparation + feature-selection/PCA/LDA pipeline ---
# NOTE(review): `a`, `s`, and `df` are defined earlier in the file (not visible here).
a.append(s[0])
df.drop('B', axis=1, inplace=True)
df['Gender'] = a

# Encode gender numerically: 'm' -> 1.0, anything else -> 0.0.
# Vectorized replacement for the original per-row `df.loc[i, 'Gender']` loop —
# same resulting values, far faster than row-wise mutation.
df['Gender'] = (df['Gender'] == 'm').astype(float)

target = list(df['Gender'])
df.drop('Gender', axis=1, inplace=True)

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

# Rank features with a random forest, then keep only the important ones
# (SelectFromModel default threshold = mean importance).
clf = RandomForestClassifier(n_estimators=50, max_features='sqrt')
clf.fit(df, target)
model = SelectFromModel(clf, prefit=True)
train_reduced = model.transform(df)

from sklearn.decomposition import PCA

# Compress the selected features to 50 principal components.
# NOTE(review): assumes SelectFromModel kept >= 50 features — PCA raises otherwise; confirm.
pca = PCA(n_components=50)
train_red = pca.fit_transform(train_reduced)

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

model = LinearDiscriminantAnalysis()

from sklearn.model_selection import train_test_split

# Hold out 20% for evaluation and report LDA accuracy on the held-out split.
X_train, X_test, y_train, y_test = train_test_split(train_red, target, test_size=0.2)
model.fit(X_train, y_train)
print(model.score(X_test, y_test))
# --- Final Random Forest model and Kaggle submission file ---
# Hyperparameters chosen by earlier tuning (see the notebook commentary below).
parameters = {
    'bootstrap': False,
    'min_samples_leaf': 4,
    'n_estimators': 50,
    'min_samples_split': 10,
    'max_features': 'sqrt',
    'max_depth': 5,
}
model = RandomForestClassifier(**parameters)
model.fit(X_train_reduced, Y_train)

# In[133]:
# Training accuracy as a percentage, rounded to 2 decimals.
model1 = round(model.score(X_train_reduced, Y_train) * 100, 2)
model1

# #### Applying Random Forest Classifier. One can play with parameters (hyperparameter tuning to increase score). I have achieved .803 with less feature engineering. However, as i increased the number of dummies for age, it came down to 78.9.

# In[131]:
# Predict on the reduced test set once (the original computed this twice)
# and write the submission CSV in the Kaggle-expected two-column layout.
output = model.predict(test1_reduced).astype(int)
submission = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Survived": output,
})
submission.to_csv("titanic51_submission.csv", index=False)

# I'll update as i improve. Your guidance is appreciated. Also thanks a lot for all the tutorials where i learned a lot.
# --- Feature-importance plot, threshold-based selection, and gradient boosting ---
# NOTE(review): `features`, `clf`, `train`, `test`, and `targets` come from earlier in the file.
features['importance'] = clf.feature_importances_
features.sort_values(by=['importance'], ascending=True, inplace=True)
features.set_index('feature', inplace=True)
features.plot(kind='barh', figsize=(20, 20))
#plt.show()

# Keep only features whose importance exceeds 0.005; apply the same mask to train and test.
model = SelectFromModel(clf, threshold=0.005, prefit=True)
train_reduce = model.transform(train)
test_reduce = model.transform(test)
print(train_reduce.shape)

############################ Cross-validation split ############################
# First 623 rows train the model; the remainder is a hold-out validation set.
train_x = train_reduce[:623]
train_cv = train_reduce[623:]
# Fixed: Series.as_matrix() was removed in pandas 1.0 — to_numpy() is the
# direct replacement and returns the same ndarray.
train_y = targets[:623].to_numpy()
train_cv_y = targets[623:].to_numpy()

from sklearn import linear_model
from sklearn import tree
from sklearn import ensemble
from sklearn import svm

model = ensemble.GradientBoostingClassifier(n_estimators=50)
model.fit(train_x, train_y)
print(model.score(train_cv, train_cv_y))

# Predict on the reduced test set and write the submission file.
output = model.predict(test_reduce).astype(int)
df_output = pd.DataFrame()
aux = pd.read_csv('test.csv')
df_output['PassengerId'] = aux['PassengerId']
df_output['Survived'] = output
df_output[['PassengerId', 'Survived']].to_csv('output.csv', index=False)
# --- Model evaluation: confusion-matrix metrics and score summary ---
print("Accuracy: {:.4f}".format(acc))

y_pred_ada = clfGB.predict(x_test)
cnf_matrix = confusion_matrix(y_test, y_pred_ada)
print(cnf_matrix)

# sklearn convention: cnf_matrix[i][j] = samples with true label i predicted as j,
# so TN = [0][0], FP = [0][1], FN = [1][0], TP = [1][1].
# Fixed: the original computed TN/(TN+FN) (negative predictive value) for
# "sensitivity" and TP/(TP+FP) (precision) for "specificity".
TPR = cnf_matrix[1][1] / (cnf_matrix[1][1] + cnf_matrix[1][0])  # TP / (TP + FN)
specificity = cnf_matrix[0][0] / (cnf_matrix[0][0] + cnf_matrix[0][1])  # TN / (TN + FP)
print("sensitivity (TPR)", TPR)
print("AccuracyGradientClassifier:", acc)
print("specificity (1-FPR)", specificity)

#linearRegression
lm = linear_model.LinearRegression()
model = lm.fit(x_train, y_train.ravel())
predictions = lm.predict(x_test)
scoreLR = model.score(x_test, y_test)  # R^2 for regression, not accuracy
print("linear regression:", scoreLR)
print(predictions[0:5])

# Summary of scores collected earlier in the file.
print("ADA boost: ", scoreAda)
print("RF: ", scoreRF)
#print("AdaBoost: ",scoreAB)
print("KNC: ", scoreKNC)
print("scoreRFExtreme: ", scoreRFExtreme)
print("MLP: ", scoreMLP)

#y_pred_ada=bdt.fit(x_train, y_train.ravel()).predict(x_test)
y_pred_ada = bdt.predict(x_test)
print(y_test[0:5], y_pred_ada[0:5])
cnf_matrix = confusion_matrix(y_test, y_pred_ada)
def main():
    """Train (or reload) a RandomForest classifier that predicts a 1-5 popularity
    rating bucket for selfies, then report accuracy and feature importances.

    Reads 'selfie_dataset.txt' (space-separated, no header) and optionally
    caches the fitted model in 'classifier.pkl' via joblib.
    """
    data = pd.read_csv(
        'selfie_dataset.txt',
        sep=" ",
        header=None,
        names=[
            "Nome", "Rate", "partial_faces", "is_female", "baby", "child",
            "teenager", "youth", "middle_age", "senior", "white", "black",
            "asian", "oval_face", "round_face", "heart_face", "smiling",
            "mouth_open", "frowning", "wearing_glasses", "wearing_sunglasses",
            "wearing_lipstick", "2tongue_out0", "duck_face", "black_hair",
            "blond_hair", "brown_hair", "red_hair", "curly_hair",
            "straight_hair", "braid_hair", "showing_cellphone",
            "using_earphone", "using_mirror", "wearing_hat", "braces",
            "harsh_lighting", "dim_lighting"
        ])

    # Bucket the continuous Rate score into 5 ordinal classes at equal
    # (mx + mn) / 5 steps.
    # Fixed two defects in the original binning:
    #   1. The first threshold used (mx + mn) / 3 while all others used /5.
    #   2. Scores matching no branch (e.g. the maximum score when mn == 0)
    #      got no label at all, so len(labels) != len(features) and
    #      train_test_split crashed. The final `else` guarantees one label
    #      per row.
    labels1 = np.array(data['Rate'])
    mx = max(labels1)
    mn = min(labels1)
    step = (mx + mn) / 5
    labels = []
    for score in labels1:
        if score < step:
            labels.append(1)
        elif score < 2 * step:
            labels.append(2)
        elif score < 3 * step:
            labels.append(3)
        elif score < 4 * step:
            labels.append(4)
        else:
            labels.append(5)

    # Feature matrix: everything except the target and the image name.
    features1 = data.drop("Rate", axis=1)
    features2 = features1.drop("Nome", axis=1)
    feature_list = list(features2.columns)
    features = np.array(features2)

    train_features, test_features, train_labels, test_labels = train_test_split(
        features, labels, test_size=0.1, random_state=0)
    print('The shape of our train_features is:', train_features.shape)
    print('The shape of our test_features is:', test_features.shape)

    # Workflow switches:
    #   isTrained - reuse the cached classifier.pkl instead of fitting fresh
    #   retrain   - when reusing, re-fit on only the important features
    isTrained = False
    min_importance = 0.04
    n_estimators = 200
    retrain = True
    if (isTrained):
        if (retrain):
            # Use the cached model only to rank features, then refit a new
            # forest on the reduced feature set.
            crf = joblib.load("classifier.pkl")
            rf = SelectFromModel(crf, threshold=min_importance)
            rf.fit(train_features, train_labels)
            train_features = rf.transform(train_features)
            test_features = rf.transform(test_features)
            print('The shape of our important_train_features is:',
                  train_features.shape)
            print('The shape of our important_test_features is:',
                  test_features.shape)
            rf_important = RandomForestClassifier(n_estimators=n_estimators,
                                                  random_state=1)
            rf_important.fit(train_features, train_labels)
            rf = rf_important
            print(rf_important)
            print("\n\n")
            predictions = rf_important.predict(test_features)
            importances = list(rf_important.feature_importances_)
        else:
            # Reuse the cached model as-is.
            rf = joblib.load("classifier.pkl")
            print(rf)
            print("\n\n")
            predictions = rf.predict(test_features)
            importances = list(rf.feature_importances_)
    else:
        # Fit from scratch and cache the fitted model for later runs.
        rf = RandomForestClassifier(n_estimators=n_estimators,
                                    criterion="entropy",
                                    random_state=2)
        rf.fit(train_features, train_labels)
        joblib.dump(rf, 'classifier.pkl')
        print(rf)
        print("\n\n")
        predictions = rf.predict(test_features)
        importances = list(rf.feature_importances_)

    print('Mean Absolute Error:', mean_absolute_error(test_labels, predictions))
    # NOTE(review): rf.score returns a fraction in [0, 1], not a percentage,
    # despite the '%' in the label.
    print('Train Accuracy:', rf.score(train_features, train_labels), '%')
    print('Test Accuracy:', rf.score(test_features, test_labels), '%')
    print("\n\n")
    print("Importances: \n")

    # Pair each feature name with its (rounded) importance, highest first.
    feature_importances = [
        (feature, round(importance, 4))
        for feature, importance in zip(feature_list, importances)
    ]
    feature_importances = sorted(feature_importances,
                                 key=lambda x: x[1],
                                 reverse=True)
    for pair in feature_importances:
        print('{} : {}'.format(*pair))
    print()