import time
import csv
import sys

import numpy as np
import pandas as pd

from operator import itemgetter
# Note: cross_validation and grid_search are pre-0.18 sklearn modules; in
# modern releases their contents live in sklearn.model_selection.
from sklearn import metrics
from sklearn.cross_validation import ShuffleSplit
from sklearn.grid_search import RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import SGDClassifier

import loaddata

# Script
###################################
if __name__ == '__main__':
    # Do all the feature engineering
    print "Generating initial training/test sets"
    input_df, submit_df = loaddata.getDataSets(raw=False,
                                               binary=True,
                                               bins=False,
                                               scaled=True)

    # Collect the test data's PassengerIds then drop it from the train and test sets
    submit_ids = submit_df['PassengerId']
    input_df.drop(['PassengerId'], axis=1, inplace=True)
    submit_df.drop(['PassengerId'], axis=1, inplace=True)

    # Run dimensionality reduction and clustering on the remaining feature set. This will return an unlabeled
    # set of derived parameters along with the ClusterID so we can train multiple models for different groups
    print "Dimensionality Reduction and Clustering..."
    input_df, submit_df = loaddata.reduceAndCluster(input_df, submit_df, 2)

    # Add the passenger ID back into the test set so we can keep track of them as we train different models
    submit_df = pd.concat([submit_ids, submit_df], axis=1)
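
# loaddata.reduceAndCluster() is defined elsewhere; the sketch below only
# illustrates the idea, assuming PCA for the projection and KMeans for the
# ClusterID labels. The function name and parameters here are hypothetical.
def reduce_and_cluster_sketch(df, n_components, n_clusters=3):
    from sklearn.cluster import KMeans
    from sklearn.decomposition import PCA

    # Project the engineered features onto the top principal components
    reduced = PCA(n_components=n_components).fit_transform(df.values)
    out = pd.DataFrame(reduced)
    # Attach a group label so a separate model can be trained per cluster
    out['ClusterID'] = KMeans(n_clusters=n_clusters).fit_predict(reduced)
    return out
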
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              score.mean_validation_score,
              np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")
        
        if params == None:
            params = score.parameters
    
    return params
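
# A sketch of how report() is typically wired up, assuming the pre-0.18
# sklearn API where fitted searches expose grid_scores_; the estimator
# settings and parameter grid here are illustrative only.
def search_sketch(X, y):
    sgd = SGDClassifier(loss='log', penalty='elasticnet')
    search = RandomizedSearchCV(sgd, {'alpha': [1e-4, 1e-3, 1e-2]},
                                n_iter=3, cv=ShuffleSplit(len(y), n_iter=5))
    search.fit(X, y)
    return report(search.grid_scores_)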
    

# Script
###################################

input_df, submit_df = loaddata.getDataSets(bins=False, scaled=True, raw=False)

# Collect the test data's PassengerIds
ids = submit_df['PassengerId'].values

# Remove variables that we couldn't transform into features: 
drop_list = ['PassengerId']
input_df.drop(drop_list, axis=1, inplace=True)
submit_df.drop(drop_list, axis=1, inplace=True)
submit_df.drop('Survived', axis=1, inplace=True)

print('Building Naive Bayes Classifier with ' + str(len(input_df.columns))
      + ' columns: ' + str(list(input_df.columns.values)))

train_data = input_df.values
X = train_data[:, 1:]
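
# The snippet cuts off here; a minimal sketch of how the build might continue,
# assuming sklearn's GaussianNB (the estimator actually used isn't shown in
# this fragment):
from sklearn.naive_bayes import GaussianNB

y = train_data[:, 0]                      # 'Survived' labels, column 0
nb = GaussianNB()
nb.fit(X, y)
output = nb.predict(submit_df.values).astype(int)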


# Example 3
###################################
# The function head was cut off in this fragment; the (estimator, X, y)
# signature below follows sklearn's scorer convention, and the name is a
# hypothetical stand-in.
def oob_scorer(estimator, X, y):
    """
    Custom scoring function for hyperparameter optimization. In this case, we
    want to print out the oob score.
    """
    score = estimator.oob_score_
    print("oob_score_:", score)
    return score
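
# A sketch of how such a scorer plugs into a search, assuming that
# RandomForestClassifier and GridSearchCV are imported as in the other
# examples; the estimator settings and parameter grid are illustrative only.
def oob_search_sketch(X, y):
    forest = RandomForestClassifier(n_estimators=100, oob_score=True, n_jobs=-1)
    search = GridSearchCV(forest, {'max_features': [3, 5, 7]}, scoring=oob_scorer)
    search.fit(X, y)
    return search.best_params_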


if __name__ == '__main__':
    """
    Main script, this contains logic to execute the full pipeline to generate a RandomForest for the titanic data
    """
    ##############################################################################################################
    # Prepare data for pipeline
    #
    print "\nGenerating initial training/test sets"
    input_df, submit_df = loaddata.getDataSets(bins=True, scaled=True, binary=True)

    # Collect the test data's PassengerIds then drop it from the train and test sets
    submit_ids = submit_df['PassengerId']

    input_df.drop('PassengerId', axis=1, inplace=True)
    submit_df.drop('PassengerId', axis=1, inplace=True)

    features_list = input_df.columns.values[1:]  # Save for feature importance graph
    X = input_df.values[:, 1:]
    y = input_df.values[:, 0]

    # Set the weights to adjust for uneven class distributions (fewer passengers survived than died)
    survived_weight = .75
    y_weights = np.array([survived_weight if s == 1 else 1 for s in y])
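
    # Sketch of the typical next step, assuming RandomForestClassifier is
    # imported and that the per-sample weights are passed straight to fit();
    # the n_estimators value here is illustrative:
    forest = RandomForestClassifier(n_estimators=1000, oob_score=True, n_jobs=-1)
    forest.fit(X, y, sample_weight=y_weights)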


# Example 4
###################################
        print("")
        
        if params == None:
            params = score.parameters
    
    return params




# Script
###################################
if __name__ == '__main__':
        
    # Do all the feature engineering
    input_df, submit_df = loaddata.getDataSets(raw=False, binary=True, bins=False)
    submit_df.drop('Survived', axis=1, inplace=True)
    
    print('All generated features: ' + str(list(input_df.columns.values)))
    
    # Collect the test data's PassengerIds
    ids = submit_df['PassengerId'].values
    
    # Remove variables that aren't appropriate for this model:
    drop_list = ['PassengerId']
    input_df.drop(drop_list, axis=1, inplace=True)
    submit_df.drop(drop_list, axis=1, inplace=True)
    
    print('Building SVC with', len(input_df.columns), 'columns:', list(input_df.columns.values))
    print("Number of training examples:", input_df.shape[0])
    
import sys
import re

import pandas as pd
from operator import itemgetter
from sklearn import cross_validation
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier

import loaddata


# Script
###################################
if __name__ == '__main__':
    # Do all the feature engineering
    print "Generating initial training/test sets"
    input_df, submit_df = loaddata.getDataSets(raw=False, binary=True, bins=False, scaled=True, balanced=True)
    
    # Collect the test data's PassengerIds then drop it from the train and test sets
    submit_ids = submit_df['PassengerId']
    input_df.drop(['PassengerId'], axis=1, inplace=True)
    submit_df.drop(['PassengerId'], axis=1, inplace=True)
    
    # Run dimensionality reduction and clustering on the remaining feature set. This will return an unlabeled
    # set of derived parameters along with the ClusterID so we can train multiple models for different groups
    print "Dimensionality Reduction and Clustering..."
    input_df, submit_df = loaddata.reduceAndCluster(input_df, submit_df, 2)
    
    # Add the passenger ID back into the test set so we can keep track of them as we train different models
    submit_df = pd.concat([submit_ids, submit_df], axis=1)
    
    print('Generated', input_df.columns.size, 'features:', input_df.columns.values)
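
    # The comments above promise per-group models; a sketch of that loop,
    # assuming the 'Survived' label survives as column 0 and ClusterID is the
    # grouping column produced by reduceAndCluster() (estimator illustrative):
    for cluster_id in input_df['ClusterID'].unique():
        group = input_df[input_df['ClusterID'] == cluster_id]
        X = group.drop('ClusterID', axis=1).values[:, 1:]
        y = group.values[:, 0]
        model = RandomForestClassifier(n_estimators=500, n_jobs=-1)
        model.fit(X, y)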


# Example 6
###################################
import matplotlib.pyplot as plt
from sklearn import cross_validation, metrics
from sklearn.ensemble import RandomForestClassifier

import loaddata


# The original head of this function was cut off; it is rebuilt here from the
# call in the test block below. The holdout split and test_size are assumptions.
def generate_roc_curve(estimator, X, y, plot=True):
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.25)
    estimator.fit(X_train, y_train)
    fpr, tpr, _ = metrics.roc_curve(y_test, estimator.predict_proba(X_test)[:, 1])
    roc_auc = metrics.auc(fpr, tpr)

    print('ROC AUC: %0.2f' % roc_auc)

    if plot:
        # Plot of a ROC curve for a specific class
        plt.figure()
        plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curve')
        plt.legend(loc="lower right")
        plt.show()

    return roc_auc


if __name__ == "__main__":
    """
    Test method
    """
    print "Testing ROC Curve..."
    input_df, _ = loaddata.getDataSets(bins=True, scaled=True, binary=True)
    input_df.drop("PassengerId", axis=1, inplace=True)
    X = input_df.values[:, 1::]
    y = input_df.values[:, 0]
    forest = RandomForestClassifier(n_estimators=10000, n_jobs=-1)

    generate_roc_curve(forest, X, y)


# Example 7
###################################
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
            score.mean_validation_score, np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")

        if params == None:
            params = score.parameters

    return params


# Script
###################################

input_df, submit_df = loaddata.getDataSets(bins=False, scaled=False, raw=False)

# Collect the test data's PassengerIds
ids = submit_df['PassengerId'].values

# Remove variables that we couldn't transform into features:
drop_list = ['PassengerId']
input_df.drop(drop_list, axis=1, inplace=True)
submit_df.drop(drop_list, axis=1, inplace=True)
submit_df.drop('Survived', axis=1, inplace=True)

print('Building Naive Bayes Classifier with ' + str(len(input_df.columns))
      + ' columns: ' + str(list(input_df.columns.values)))

train_data = input_df.values
X = train_data[:, 1:]
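
# The example ends here; a sketch of the usual final steps, again assuming
# GaussianNB and a Kaggle-style submission file (the file name is illustrative):
import csv
from sklearn.naive_bayes import GaussianNB

y = train_data[:, 0]                      # 'Survived' labels, column 0
predictions = GaussianNB().fit(X, y).predict(submit_df.values).astype(int)

with open('submission.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['PassengerId', 'Survived'])
    writer.writerows(zip(ids, predictions))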