import csv as csv
import sys
import time
from operator import itemgetter

from sklearn import metrics
# NOTE(review): sklearn.cross_validation and sklearn.grid_search were removed in scikit-learn 0.20; their
# replacements live in sklearn.model_selection (some signatures differ) -- confirm the pinned version.
from sklearn.cross_validation import ShuffleSplit
from sklearn.grid_search import RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import SGDClassifier

# Script ###################################
if __name__ == '__main__':
    # Entry point: build the feature sets, set the passenger ids aside, run the reduce-and-cluster step,
    # then re-attach the ids to the submission frame.

    # Do all the feature engineering
    print("Generating initial training/test sets")
    input_df, submit_df = loaddata.getDataSets(raw=False, binary=True, bins=False, scaled=True)

    # Collect the test data's PassengerIds then drop it from the train and test sets.
    # `inplace` must be a real bool: current pandas validates the argument and rejects the integer 1.
    submit_ids = submit_df['PassengerId']
    input_df.drop(['PassengerId'], axis=1, inplace=True)
    submit_df.drop(['PassengerId'], axis=1, inplace=True)

    # Run dimensionality reduction and clustering on the remaining feature set. This will return an unlabeled
    # set of derived parameters along with the ClusterID so we can train multiple models for different groups
    print("Dimensionality Reduction and Clustering...")
    input_df, submit_df = loaddata.reduceAndCluster(input_df, submit_df, 2)

    # Add the passenger ID back into the test set so we can keep track of them as we train different models
    submit_df = pd.concat([submit_ids, submit_df], axis=1)
print("Mean validation score: {0:.3f} (std: {1:.3f})".format( score.mean_validation_score, np.std(score.cv_validation_scores))) print("Parameters: {0}".format(score.parameters)) print("") if params == None: params = score.parameters return params # Script ################################### input_df, submit_df = loaddata.getDataSets(bins=False, scaled=True, raw=False) # Collect the test data's PassengerIds ids = submit_df['PassengerId'].values # Remove variables that we couldn't transform into features: drop_list = ['PassengerId'] input_df.drop(drop_list, axis=1, inplace=1) submit_df.drop(drop_list, axis=1, inplace=1) submit_df.drop('Survived', axis=1, inplace=1) print 'Building Naive Bayes Classifier with ' + str(len(input_df.columns)) \ + ' columns: ' + str(list(input_df.columns.values)) train_data = input_df.values X = train_data[0::,1::]
Custom scoring function for hyperparameter optimization. In this case, we want to print out the oob score """ score = estimator.oob_score_ print "oob_score_:", score return score if __name__ == '__main__': """ Main script, this contains logic to execute the full pipeline to generate a RandomForest for the titanic data """ ############################################################################################################## # Prepare data for pipeline # print "\nGenerating initial training/test sets" input_df, submit_df = loaddata.getDataSets(bins=True, scaled=True, binary=True) # Collect the test data's PassengerIds then drop it from the train and test sets submit_ids = submit_df['PassengerId'] input_df.drop('PassengerId', axis=1, inplace=1) submit_df.drop('PassengerId', axis=1, inplace=1) features_list = input_df.columns.values[1::] # Save for feature importance graph X = input_df.values[:, 1::] y = input_df.values[:, 0] # Set the weights to adjust for uneven class distributions (fewer passengers survived than died) survived_weight = .75 y_weights = np.array([survived_weight if s == 1 else 1 for s in y])
print("") if params == None: params = score.parameters return params # Script ################################### if __name__ == '__main__': # Do all the feature engineering input_df, submit_df = loaddata.getDataSets(raw=False, binary=True, bins=False) submit_df.drop('Survived', axis=1, inplace=1) print 'All generated features: ' + str(list(input_df.columns.values)) # Collect the test data's PassengerIds ids = submit_df['PassengerId'].values # Remove variables that aren't appropriate for this model: drop_list = ['PassengerId'] input_df.drop(drop_list, axis=1, inplace=1) submit_df.drop(drop_list, axis=1, inplace=1) print 'Building SVC with ', len(input_df.columns), ' columns: ', list(input_df.columns.values) print "Number of training examples: ", input_df.shape[0]
import sys
import re
from operator import itemgetter

# NOTE(review): sklearn.cross_validation and sklearn.grid_search were removed in scikit-learn 0.20; their
# replacements live in sklearn.model_selection (some signatures differ) -- confirm the pinned version.
from sklearn import cross_validation
from sklearn.grid_search import GridSearchCV
from sklearn.grid_search import RandomizedSearchCV
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier

# Script ###################################
if __name__ == '__main__':
    # Entry point: build the feature sets, set the passenger ids aside, run the reduce-and-cluster step,
    # re-attach the ids to the submission frame and report the resulting feature columns.

    # Do all the feature engineering
    print("Generating initial training/test sets")
    input_df, submit_df = loaddata.getDataSets(raw=False, binary=True, bins=False, scaled=True, balanced=True)

    # Collect the test data's PassengerIds then drop it from the train and test sets.
    # `inplace` must be a real bool: current pandas validates the argument and rejects the integer 1.
    submit_ids = submit_df['PassengerId']
    input_df.drop(['PassengerId'], axis=1, inplace=True)
    submit_df.drop(['PassengerId'], axis=1, inplace=True)

    # Run dimensionality reduction and clustering on the remaining feature set. This will return an unlabeled
    # set of derived parameters along with the ClusterID so we can train multiple models for different groups
    print("Dimensionality Reduction and Clustering...")
    input_df, submit_df = loaddata.reduceAndCluster(input_df, submit_df, 2)

    # Add the passenger ID back into the test set so we can keep track of them as we train different models
    submit_df = pd.concat([submit_ids, submit_df], axis=1)

    # Same text as the original comma-separated print statement, in a form valid on Python 2 and 3.
    print('Generated {0} features: {1}'.format(input_df.columns.size, input_df.columns.values))
print 'ROC AUC: %0.2f' % roc_auc if plot: # Plot of a ROC curve for a specific class plt.figure() plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc) plt.plot([0, 1], [0, 1], 'k--') plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title('ROC Curve') plt.legend(loc="lower right") plt.show() return roc_auc if __name__ == "__main__": """ Test method """ print "Testing ROC Curve..." input_df, _ = loaddata.getDataSets(bins=True, scaled=True, binary=True) input_df.drop("PassengerId", axis=1, inplace=True) X = input_df.values[:, 1::] y = input_df.values[:, 0] forest = RandomForestClassifier(n_estimators=10000, n_jobs=-1) generate_roc_curve(forest, X, y)
print("Model with rank: {0}".format(i + 1)) print("Mean validation score: {0:.3f} (std: {1:.3f})".format( score.mean_validation_score, np.std(score.cv_validation_scores))) print("Parameters: {0}".format(score.parameters)) print("") if params == None: params = score.parameters return params # Script ################################### input_df, submit_df = loaddata.getDataSets(bins=False, scaled=False, raw=False) # Collect the test data's PassengerIds ids = submit_df['PassengerId'].values # Remove variables that we couldn't transform into features: drop_list = ['PassengerId'] input_df.drop(drop_list, axis=1, inplace=1) submit_df.drop(drop_list, axis=1, inplace=1) submit_df.drop('Survived', axis=1, inplace=1) print 'Building Naive Bayes Classifier with ' + str(len(input_df.columns)) \ + ' columns: ' + str(list(input_df.columns.values)) train_data = input_df.values X = train_data[0::, 1::]