def main():
    """Train a hard-voting ensemble on the cleaned training data and persist it.

    Reads the training frame via data_io, cleans it with FeatureConverter,
    cross-validates the ensemble, fits it on the full data, and saves it.
    Column 0 of the cleaned matrix is the label; the rest are features.
    """
    # Base estimators.  FIX: max_depth must be an integer — the original
    # passed 5.0, which scikit-learn rejects with a TypeError.
    classifier1 = RandomForestClassifier(n_estimators=100, max_features=0.5,
                                         max_depth=5)
    classifier2 = DecisionTreeClassifier(max_depth=10, criterion='entropy',
                                         random_state=0)
    classifier3 = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')
    classifier5 = LogisticRegression(penalty='l2', C=1.0, random_state=0)
    classifier6 = GaussianNB()
    # (An SVC was also constructed here originally but never used; removed.)

    print("Reading in the training data")
    train = data_io.get_train_df()

    print("Cleaning data. Check here for imputation, One hot encoding and factorization procedures..")
    train = FeatureConverter().clean_data(train)
    train.drop(['PassengerId'], axis=1, inplace=True)
    train = train.values  # numpy array: column 0 = label, columns 1.. = features

    eclf = EnsembleClassifier(
        clfs=[classifier1, classifier2, classifier3, classifier5, classifier6],
        voting='hard')

    # 10-fold cross-validated estimate before fitting on the full data.
    scores = cross_val_score(estimator=eclf, X=train[0:, 1:], y=train[0:, 0],
                             cv=10, scoring='roc_auc')
    print("Accuracy: %0.4f (+/- %0.3f)" % (scores.mean(), scores.std()))

    eclf.fit(train[0:, 1:], train[0:, 0])
    print("Saving the classifier")
    data_io.save_model(eclf)
def main():
    """Train a KNN classifier on the cleaned training data and persist it.

    The last column of the cleaned matrix is the target; all preceding
    columns are features.
    """
    # Only the KNN model is actually trained.  The original also built RF,
    # decision-tree, SVC, logistic-regression, NB and GBM estimators plus a
    # commented-out ensemble, none of which were used — removed as dead code.
    eclf = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')

    print("Reading in the training data")
    train = data_io.get_train_df()

    print("Cleaning data. Check here for imputation, One hot encoding and factorization procedures..")
    train = FeatureConverter().clean_data(train)
    train.drop(['Id'], axis=1, inplace=True)  # 'Id' is an identifier, not a feature
    train = train.values

    # Fit on features (all columns but the last) against the last column.
    eclf.fit(train[0:, 0:-1], train[0:, -1])

    print("Saving the classifier")
    data_io.save_model(eclf)
def main():
    """Score the persisted model on the test set and write a submission file.

    Loads the saved classifier, cleans the test frame, predicts integer
    class labels, and writes an ('Id', 'Cover_Type') submission.
    """
    print("Loading the test data")
    classifier = data_io.load_model()

    print ("Load test data. And Clean..")
    test = data_io.get_test_df()
    test = FeatureConverter().clean_data(test)

    # 'Id' is the row identifier, kept aside for the submission file.
    # Renamed from the misleading 'passengerIds' — this is the Cover_Type
    # task, not the Titanic one.
    ids = test['Id']
    test.drop(['Id'], axis=1, inplace=True)
    test = test.values

    print("Making predictions")
    predictions = classifier.predict(test).astype(int)

    print("Writing predictions to file")
    data_io.write_submission(predictions, ids, ['Id', 'Cover_Type'])
def main():
    """Load the saved classifier, predict on the cleaned test data, and
    write the ('Id', 'Cover_Type') submission file."""
    print("Loading the test data")
    model = data_io.load_model()

    print("Load test data. And Clean..")
    frame = FeatureConverter().clean_data(data_io.get_test_df())

    # Keep the row identifiers for the submission before dropping the column.
    row_ids = frame['Id']
    frame.drop(['Id'], axis=1, inplace=True)

    print("Making predictions")
    labels = model.predict(frame.values).astype(int)

    print("Writing predictions to file")
    data_io.write_submission(labels, row_ids, ['Id', 'Cover_Type'])
# Hyper-parameter grid for the SVC search below.
svc_params_grid = {
    'C': [0.001, 0.1, 1.0, 10.0, 100.0],
    'kernel': ['linear', 'rbf'],
    'gamma': [0.001, 0.1, 1.0, 10.0, 100.0],
}

if __name__ == "__main__":
    print("Reading in the training data")
    train = data_io.get_train_df()

    print ("Cleaning data. Check here for imputation, One hot encoding and factorization procedures..")
    train = FeatureConverter().clean_data(train)
    train.drop(['PassengerId'], axis=1, inplace=True)
    train = train.values  # column 0 = label, columns 1.. = features

    # NOTE(review): rf_params_grid and lr_params_grid are presumably defined
    # elsewhere in this file — confirm they are in scope.
    grid_search = GridSearchCV(RandomForestClassifier(n_estimators=100),
                               rf_params_grid, cv=5, verbose=1)
    grid_search.fit(train[0:, 1:], train[0:, 0])
    # FIX: the originals used Python-2 print statements
    # (`print grid_search.best_params_`), a SyntaxError in Python 3.
    print(grid_search.best_params_)

    grid_search = GridSearchCV(LogisticRegression(random_state=0),
                               lr_params_grid, cv=5, verbose=1)
    grid_search.fit(train[0:, 1:], train[0:, 0])
    print(grid_search.best_params_)

    grid_search = GridSearchCV(SVC(random_state=0), svc_params_grid,
                               cv=5, verbose=1)
    grid_search.fit(train[0:, 1:], train[0:, 0])
# NOTE(review): this dict is a bare expression — it is evaluated and
# discarded at runtime.  Presumably it was meant to be assigned (e.g.
# `svc_params_grid = {...}` as in the sibling tuning script); confirm
# before relying on it.  Left unassigned here to preserve behavior.
{
    'C': [0.001, 0.1, 1.0, 10.0, 100.0],
    'kernel': ['linear', 'rbf'],
    'gamma': [0.001, 0.1, 1.0, 10.0, 100.0],
}

if __name__ == "__main__":
    print("Reading in the training data")
    train = data_io.get_train_df()

    print(
        "Cleaning data. Check here for imputation, One hot encoding and factorization procedures.."
    )
    train = FeatureConverter().clean_data(train)
    train.drop(['PassengerId'], axis=1, inplace=True)
    train = train.values  # column 0 = label, columns 1.. = features

    # NOTE(review): rf_params_grid and lr_params_grid are presumably defined
    # elsewhere in this file — confirm they are in scope.
    grid_search = GridSearchCV(RandomForestClassifier(n_estimators=100),
                               rf_params_grid, cv=5, verbose=1)
    grid_search.fit(train[0:, 1:], train[0:, 0])
    # FIX: the original used a Python-2 print statement
    # (`print grid_search.best_params_`), a SyntaxError in Python 3.
    print(grid_search.best_params_)

    grid_search = GridSearchCV(LogisticRegression(random_state=0),
                               lr_params_grid, cv=5, verbose=1)