def initialize_models(X_train, y_train, X_test, y_test, accuracy, fscore):
    """Benchmark three learners on 1%, 10% and 100% of the training data.

    The learners are built from the ``dtc``, ``rfc`` and ``abc`` factories
    (each with ``random_state=13``).  Every learner is handed to
    ``train_predict`` once per sample size; the collected results are then
    passed to ``vs.evaluate`` together with ``accuracy`` and ``fscore``.

    Returns:
        The third learner (the one built from ``abc``).
    """
    clf_A = dtc(random_state=13)
    clf_B = rfc(random_state=13)
    clf_C = abc(random_state=13)

    # Sample counts for 1%, 10% and 100% of the training set, as integers.
    n_total = len(y_train)
    sample_sizes = (n_total // 100, n_total // 10, n_total)

    # results[<learner class name>][<size index>] -> output of train_predict
    results = {}
    for learner in (clf_A, clf_B, clf_C):
        results[learner.__class__.__name__] = {
            idx: train_predict(learner, size, X_train, y_train, X_test, y_test)
            for idx, size in enumerate(sample_sizes)
        }

    # Metrics visualization for the three learners
    vs.evaluate(results, accuracy, fscore)
    return clf_C
def evaluate_algorithms(clf_A, clf_B, clf_C):
    """Run the three given learners on 1%, 10% and 100% of the training data.

    ``X_train``, ``y_train``, ``X_test``, ``y_test``, ``accuracy`` and
    ``fscore`` are not parameters; they are read from the enclosing scope.
    Each learner is passed to ``train_predict`` once per sample size and the
    collected results are forwarded to ``vs.evaluate``.  Nothing is returned.
    """
    n_total = len(y_train)
    # Ordered smallest to largest: 1%, 10%, 100%
    sizes = [int(0.01 * n_total), int(0.1 * n_total), n_total]

    results = {}
    for learner in (clf_A, clf_B, clf_C):
        per_size = {}
        for position, n_samples in enumerate(sizes):
            per_size[position] = train_predict(
                learner, n_samples, X_train, y_train, X_test, y_test)
        results[learner.__class__.__name__] = per_size

    vs.evaluate(results, accuracy, fscore)
# Sample counts for 100%, 10% and 1% of the training data (kept as `int`,
# not `float`).
samples_100 = len(y_train)
samples_10 = int(0.1 * samples_100)
samples_1 = int(0.01 * samples_100)

# Collect results on the learners:
# results[<learner class name>][<size index>] -> output of vs.train_predict
results = {}
for clf in [clf_A, clf_B, clf_C]:
    clf_name = clf.__class__.__name__
    results[clf_name] = {}
    for i, samples in enumerate([samples_1, samples_10, samples_100]):
        results[clf_name][i] = vs.train_predict(
            clf, samples, X_train, y_train, X_test, y_test)

# Run metrics visualization for the three supervised learning models chosen
# and save the object returned by vs.evaluate to disk.
vs.evaluate(results, accuracy, fscore).savefig('performance.jpg')

### GridSearchCV
# Import 'GridSearchCV', 'make_scorer', and any other necessary libraries
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, fbeta_score, accuracy_score
from sklearn.neighbors import KNeighborsClassifier

# Initialize the classifier
clf = DecisionTreeClassifier()

# Create the parameters list you wish to tune, using a dictionary if needed.
# parameters = {'parameter_1': [value1, value2], 'parameter_2': [value1, value2]}
# NOTE(review): the visible source stops inside this dict literal; it is
# closed here so the fragment parses. Confirm whether further keys followed.
parameters = {
    'min_samples_split': [2, 4, 8, 16, 32],
    'min_samples_leaf': [2, 4, 8, 16, 32],
}
samples_10 = int(0.1 * X_train.shape[0])
samples_100 = X_train.shape[0]

# Collect results on the learners.  The running index j is appended to the
# class name, so every key is unique even if two learners share a class.
results = {}
j = 0
for clf in [clf_A, clf_B, clf_C, clf_D, clf_E]:
    clf_name = clf.__class__.__name__ + str(j)
    j += 1
    results[clf_name] = {}
    for i, samples in enumerate([samples_1, samples_10, samples_100]):
        results[clf_name][i] = train_predict(
            clf, samples, X_train, y_train, X_test, y_test)

# Run metrics visualization for the five learners evaluated above
vs.evaluate(results, accuracy, fscore)

# Extra run: an SVC with a sigmoid kernel on the full training set.  The
# size is named explicitly instead of relying on the value the loop variable
# `samples` happens to hold after the loop (which is samples_100).
sigmoidsvc = SVC(random_state=1990, kernel='sigmoid')
res = train_predict(sigmoidsvc, samples_100, X_train, y_train, X_test, y_test)

# Keep only the test-set scores of the full-data run (size index 2) per learner.
r = {}
i = 0
for clf in [clf_A, clf_B, clf_C, clf_D, clf_E]:
    clf_name = clf.__class__.__name__ + str(i)
    r[clf_name] = {
        'f_test': results[clf_name][2]['f_test'],
        'acc_test': results[clf_name][2]['acc_test'],
    }
    i += 1

# NOTE(review): the visible source resets `r` and `i` right here, which
# discards the summary built above; this looks like the start of a following
# section that is cut off in this view -- confirm before relying on `r`.
r = {}
i = 0
# Train every classifier and keep the output of train_predict under the
# classifier's class name.  (The former `results[clf_name] = {}` placeholder
# was overwritten on the very next statement and has been dropped.)
results = {}
for clf in [clf_A, clf_B, clf_C]:
    clf_name = clf.__class__.__name__
    results[clf_name] = train_predict(clf, X_train, y_train, X_test, y_test)

# Show processing time in h:m:s (time1 is set before this section)
m, s = divmod(time() - time1, 60)
h, m = divmod(m, 60)
print("\nTime elapsed to train classifier: %d:%02d:%02d" % (h, m, s))

### Plot training & prediction times and scores for all classifiers
# k is passed as the third argument of vs.evaluate for each plot.
k = 0  # Training & Prediction times
vs.evaluate(results, 0, k)
k = 1  # Precision score
vs.evaluate(results, precision, k)
k = 2  # F-Score
vs.evaluate(results, fscore, k)

### Model Tuning for Decision Tree or Logistic Regression classifiers
time2 = time()
clf = LogisticRegression()
#clf = DecisionTreeClassifier(random_state=27)

# Create the parameters list to tune
param_grid = {'C': [1, 10, 100, 1000]}
#parameters = {'criterion': ['gini', 'entropy'], 'splitter': ['best', 'random'],
#              'max_depth': [1, 3, 5], 'max_leaf_nodes': [2, 5, 10, 15, 30]}
def _prepare_income_data(dataset, key_field):
    """Pre-process ``dataset`` and split it into training and testing sets.

    The ``key_field`` column is used as the label; every other column is a
    feature.  Returns ``(income, X_train, X_test, y_train, y_test)`` where
    ``income`` is the label column encoded to numerical values.
    """
    income_raw = dataset[key_field]
    features = dataset.drop(key_field, axis=1)

    # Log-transform the skewed features (x + 1 keeps zero values finite)
    skewed = ['capital-gain', 'capital-loss']
    features[skewed] = features[skewed].apply(lambda col: np.log(col + 1))

    # Scale the numerical features with a MinMaxScaler
    scaler = MinMaxScaler()  # default=(0, 1)
    numerical = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
    features[numerical] = scaler.fit_transform(features[numerical])

    # Show an example of a record with scaling applied
    display(features.head(n=5))

    # One-hot encode the features and encode the label to numerical values
    features_final = pd.get_dummies(features)
    income = LabelEncoder().fit_transform(income_raw)

    # Print the number of features after one-hot encoding
    encoded = list(features_final.columns)
    print("{} total features after one-hot encoding.".format(len(encoded)))
    # Uncomment the following line to see the encoded feature names
    # print(encoded)

    X_train, X_test, y_train, y_test = train_test_split(
        features_final, income, test_size=0.2, random_state=0)

    # Show the results of the split
    print("Training set has {} samples.".format(X_train.shape[0]))
    print("Testing set has {} samples.".format(X_test.shape[0]))

    return income, X_train, X_test, y_train, y_test


def _naive_predictor_scores(income):
    """Return ``(accuracy, fscore)`` of a predictor that always predicts 1.

    With no predicted negatives, TN = FN = 0: every 1 in ``income`` is a
    true positive and every other record a false positive.  Recall is
    therefore 1 and precision equals accuracy.  The F-score uses beta = 0.5.
    """
    TP = np.sum(income)
    FP = len(income) - TP

    accuracy = np.true_divide(TP, TP + FP)
    recall = 1
    precision = accuracy

    beta = 0.5
    fscore = (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall)
    return accuracy, fscore


def _benchmark_learners(X_train, X_test, y_train, y_test, accuracy, fscore):
    """Train four learners on 1%, 10% and 100% of the data and plot the results."""
    clf_random_forest = RandomForestClassifier()
    clf_decision_tree = DecisionTreeClassifier(random_state=0)
    clf_C = SVC(kernel='rbf')
    clf_M = MLPClassifier(solver='sgd', activation='identity', max_iter=70,
                          alpha=1e-5, hidden_layer_sizes=(100, 50),
                          random_state=1, verbose=False)

    # Number of samples for 1%, 10%, and 100% of the training data
    samples_100 = len(y_train)
    samples_10 = int(len(y_train) * 0.1)
    samples_1 = int(len(y_train) * 0.01)

    # Collect results on the learners
    results = {}
    for clf in [clf_random_forest, clf_decision_tree, clf_C, clf_M]:
        clf_name = clf.__class__.__name__
        results[clf_name] = {}
        for i, samples in enumerate([samples_1, samples_10, samples_100]):
            results[clf_name][i] = train_predict(
                clf, samples, X_train, y_train, X_test, y_test)
        # NOTE(review): the original indentation was lost; the store is placed
        # after the sample-size loop so the tree is stored once, after all
        # three runs -- confirm it was not meant to run after each fit.
        # `is` is used because this is an identity check on one object.
        if clf is clf_decision_tree:
            storeTree(clf, "decision_tree")

    # Run metrics visualization for the learners evaluated above
    vs.evaluate(results, accuracy, fscore)


def evaluate(dataset, key_field):
    """Pre-process ``dataset``, score a naive predictor and benchmark learners.

    Args:
        dataset: frame holding the feature columns and the label column.
        key_field: name of the label column in ``dataset``.

    The work is done in three steps: data pre-processing and train/test
    split, naive-predictor baseline (printed), and the learner benchmark
    whose results are handed to ``vs.evaluate`` together with the baseline
    accuracy and F-score.  Nothing is returned.
    """
    # 1. pre-processing
    income, X_train, X_test, y_train, y_test = _prepare_income_data(dataset, key_field)

    # 2. naive predictor
    accuracy, fscore = _naive_predictor_scores(income)
    print("Naive Predictor: [Accuracy score: {:.4f}, F-score: {:.4f}]".format(accuracy, fscore))

    # 3. evaluation
    _benchmark_learners(X_train, X_test, y_train, y_test, accuracy, fscore)
# Calculate the number of samples for 1%, 10%, and 100% of the training data
samples_1 = int(len(X_train) * 0.01)
samples_10 = int(len(X_train) * 0.10)
samples_100 = len(X_train)

# Collect results on the learners:
# results[<learner class name>][<size index>] -> output of train_predict
results = {}
for clf in [clf_A, clf_B, clf_C]:
    clf_name = clf.__class__.__name__
    results[clf_name] = {}
    for i, samples in enumerate([samples_1, samples_10, samples_100]):
        # Labels are handed over as flattened arrays via .values.ravel()
        results[clf_name][i] = train_predict(
            clf, samples, X_train, y_train.values.ravel(),
            X_test, y_test.values.ravel())

# Run metrics visualization for the three supervised learning models chosen
vs.evaluate(results, accuracy, fscore)

# Import 'GridSearchCV', 'make_scorer', and any other necessary libraries
from sklearn.metrics import make_scorer
from sklearn.linear_model import LogisticRegression

# `sklearn.grid_search` and `sklearn.cross_validation` are gone from current
# scikit-learn releases; `sklearn.model_selection` replaces both.  The old
# modules are kept as a fallback so legacy installations still work.
try:
    from sklearn.model_selection import GridSearchCV, ShuffleSplit
except ImportError:
    from sklearn.grid_search import GridSearchCV
    from sklearn.cross_validation import ShuffleSplit
    # Legacy signature: number of samples first, `n_iter` splits.
    cv_sets = ShuffleSplit(X_train.shape[0], n_iter=10, test_size=0.20, random_state=0)
else:
    # Current signature: `n_splits`; the data is supplied later via split().
    # NOTE(review): this object can be passed to `cv=` directly, but code
    # that iterates `cv_sets` itself must call `cv_sets.split(X_train)` --
    # confirm the downstream usage.
    cv_sets = ShuffleSplit(n_splits=10, test_size=0.20, random_state=0)