import sys

from sklearn.cross_validation import cross_val_score
from util import load_validation_data

from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

if __name__ == "__main__":
    # Get training data
    Xt, Yt, Xunused = load_validation_data()

    # Cross validation, 5-fold
    cvf = 5

    # Initialize classifiers
    classifiers = {
        "Naive Bayes"         : GaussianNB(),
        "Gradient Boost"      : GradientBoostingClassifier(),
        "Adaboost"            : AdaBoostClassifier(DecisionTreeClassifier(max_depth=1)),
        "Decision Tree"       : DecisionTreeClassifier(),
        "Extra Random Trees"  : ExtraTreesClassifier(n_estimators=300),
        "Logistic Regression" : LogisticRegression(),
        "K-Nearest-Neighbors" : KNeighborsClassifier(),
        "SGD"                 : SGDClassifier(),
        "SVM"                 : LinearSVC(),
                              verbose=True,
                              weightdecay=0.01)
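
    # The cross_val_score import and the cvf setting above imply this script ends by
    # scoring each model; a minimal version of that evaluation loop (a reconstruction,
    # not taken verbatim from the original):
    for name, clf in classifiers.items():
        scores = cross_val_score(clf, Xt, Yt, cv=cvf)
        print "%-20s accuracy: %.3f (+/- %.3f)" % (name, scores.mean(), scores.std())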

# -- Separate example: a PyBrain feed-forward neural network --
import numpy as np

from pybrain.datasets import ClassificationDataSet
from pybrain.tools.shortcuts import buildNetwork
from pybrain.supervised.trainers import BackpropTrainer
from pybrain.structure.modules import SoftmaxLayer

# load_validation_data and write_test_prediction are project helpers (the latter's
# location is assumed); NeuralNetworkClassifier, a thin predict wrapper, is defined
# elsewhere in the project and not shown in this excerpt.
from util import load_validation_data, write_test_prediction


def classify(Xtrain, Ytrain, n_hidden=5):
    """ Train a softmax feed-forward network and return a predict wrapper.
    (The dataset-preparation lines are reconstructed by analogy with the test-set
    block in __main__ below; the n_hidden default is an assumption.) """
    trndata = ClassificationDataSet(Xtrain.shape[1], 1, nb_classes=2)
    trndata.setField('input', Xtrain)
    trndata.setField('target', Ytrain.reshape(-1, 1))
    trndata._convertToOneOfMany()  # one output neuron per class

    # build neural net and train it
    net = buildNetwork(trndata.indim, n_hidden, trndata.outdim, outclass=SoftmaxLayer)
    trainer = BackpropTrainer(net, dataset=trndata, momentum=0.1,
                              verbose=True,
                              weightdecay=0.01)

    trainer.trainUntilConvergence()
    #trainer.trainEpochs(5)

    print "trained"

    # Return a functor that wraps calling predict
    return NeuralNetworkClassifier(trainer)


if __name__ == "__main__":
    # First obtain our training and testing data
    # Training has 50K samples, Testing 100K
    Xt, Yt, Xv = load_validation_data()

    # Run Neural Network over training data
    classifier = classify(Xt, Yt)

    # Prepare validation data and predict
    tstdata = ClassificationDataSet(Xv.shape[1], 1, nb_classes=2)
    tstdata.setField('input', Xv)
    tstdata._convertToOneOfMany()  # one output neuron per class

    predictions = classifier.predict(tstdata)

    # Write prediction to file
    write_test_prediction("out_nn.txt", np.array(predictions))

Example #4

import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.grid_search import GridSearchCV
# Project-local helpers used below (get_split_training_dataset, suite, fclassify,
# load_validation_data, write_test_prediction) are assumed importable from the
# repo's own modules; the def line and the default scoring metric are likewise
# reconstructed, since the original excerpt starts mid-function.

def train(Xtrain, Ytrain, metric='accuracy'):
    """ Fit a gradient boosting classifier, tuning max_depth and n_estimators
    with a cross-validated grid search scored by `metric`.
    """
    gbc = GradientBoostingClassifier(verbose=1)
    parameters = {'max_depth': range(3, 11), 'n_estimators': [400, 500]}
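    # Grid of 8 candidate depths (3..10) x 2 estimator counts = 16 parameter
    # settings, each refit once per cross-validation fold by the search below.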

    classifier = GridSearchCV(gbc, parameters, scoring=metric)
    classifier.fit(Xtrain, Ytrain)
    return classifier

if __name__ == "__main__":
    # Take the training data, hold out a validation split, and fit the
    # grid-searched gradient boosting classifier on the rest.
    Xt, Xv, Yt, Yv = get_split_training_dataset()
    Classifier = train(Xt, Yt)
    print "Gradient Boost Classifier"
    suite(Yv, Classifier.predict(Xv))
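
    # GridSearchCV keeps the winning grid point; printing it is a quick sanity
    # check on the parameter ranges defined in train():
    print "Best parameters:", Classifier.best_params_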

    # smaller feature set
    Xtimp, features = fclassify.get_important_data_features(Xt, Yt)
    Xvimp = fclassify.compress_data_to_important_features(Xv, features)
    ClassifierImp = train(Xtimp, Yt)
    print "Gradient Boost Classifier, 25 important features"
    suite(Yv, ClassifierImp.predict(Xvimp))

    # save predictions on test data

    X, Y, validation_data = load_validation_data()
    predictions = Classifier.predict(validation_data)
    filename = 'gradient_boost_predictions.txt'
    write_test_prediction(filename, np.array(predictions))
Example #5
    print "Neural Net Train Accuracy:",acc(Yt, predictions),"%"
    predictions = classifier.predict(Xv)
    print "Neural Net Test Accuracy:",acc(Yv, predictions),"%"

if __name__ == "__main__":
    if len(sys.argv) < 2:
        training = '../data/raw/phy_train.dat'
        print "Usage: $ python neural_network.py /path/to/data/file/"
        print "Using default data file:", training
    else:
        training = sys.argv[1]

    impute_data = False
    # load data from file, imputing data and/or removing some features if requested,
    # then shuffle and split into test and validation
    X, Y, test_data = load_validation_data()
    if impute_data:
        X = remove_features_missing_data(X)
        test_data = remove_features_missing_data(test_data)
    Xt, Xv, Yt, Yv = shuffle_split(X, Y)

    # get the top features, running in parallel
    children = []
    for n_features in [20]:
        '''for n_features in [23, 21, 19, 17]:
        children.append(os.fork())
        if children[-1]:
        continue'''
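        # The quoted-out block above is the parallel variant: it forks one child
        # process per candidate feature count so the selections run concurrently.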
        X, features = get_important_data_features(X, Y, max_features=n_features)
        print X.shape
        # Do it for test data too...