def get_train_data(limit=-1):
    """Load the training set, augment it, and scale the inputs.

    Progress messages are printed before each of the three stages.

    Args:
        limit: Forwarded unchanged to ``rd.read_train`` as ``limit``.
            NOTE(review): presumably caps the number of samples read, with
            -1 meaning "no cap" -- confirm against ``read_dataset``.

    Returns:
        A ``(X, y)`` pair: the result of ``scale`` applied to the augmented
        inputs, and the augmented targets as returned by
        ``rd.nudge_dataset``.
    """
    print('Loading train data')
    samples, targets = rd.read_train(limit=limit)

    print('Augmenting data set')
    samples, targets = rd.nudge_dataset(samples, targets)

    print('Scaling data')
    # Only the inputs are scaled; the targets pass through untouched.
    return scale(samples), targets
Example #2
0
def get_train_data(limit=-1):
    """Return the augmented training data with scaled inputs.

    The data is read with ``rd.read_train``, expanded with
    ``rd.nudge_dataset`` and the inputs are then passed through ``scale``.
    A progress line is printed ahead of every stage.

    Args:
        limit: Handed to ``rd.read_train`` as its ``limit`` keyword.
            NOTE(review): the meaning of the default -1 is defined by
            ``read_dataset`` -- presumably "read everything"; confirm.

    Returns:
        Tuple ``(X, y)`` of scaled inputs and their matching targets.
    """
    print('Loading train data')
    raw_inputs, raw_targets = rd.read_train(limit=limit)

    print('Augmenting data set')
    inputs, targets = rd.nudge_dataset(raw_inputs, raw_targets)

    print('Scaling data')
    scaled_inputs = scale(inputs)
    return scaled_inputs, targets
def classifyRF(train_file="train.csv", test_file ="test.csv", trees=70):
    """Train a random forest and write its test-set predictions to a CSV.

    The training data is read and augmented, a ``RandomForestClassifier`` is
    fitted on it, and the predictions for the test data are written to
    ``submit.csv`` in the current working directory.

    Args:
        train_file: Passed to ``rd.read_train`` as ``file_name``.
        test_file: Passed to ``rd.read_test`` as ``file_name``.
        trees: First positional argument of ``RandomForestClassifier``
            (the number of trees in the forest).

    Returns:
        None. The result is the ``submit.csv`` file, with a header row and
        two columns: ``ImageId`` (1-based position of the prediction) and
        ``Label`` (the predicted value).
    """
    # Gather everything we need up front: augmented training set, then test set.
    print("Reading train data")
    train_inputs, train_labels = rd.read_train(file_name=train_file)
    print("Augmenting dataset")
    train_inputs, train_labels = rd.nudge_dataset(train_inputs, train_labels)
    print("Reading test data")
    test_inputs = rd.read_test(file_name=test_file)

    # Quick-and-dirty forest: only the tree count is set, everything else
    # stays at the library defaults.
    forest = RandomForestClassifier(trees)
    print("Training classifier")
    forest.fit(train_inputs, train_labels)
    predictions = forest.predict(test_inputs)

    # Build the submission table; image ids are 1-based.
    image_ids = range(1, len(predictions) + 1)
    submission = pd.DataFrame({"ImageId": image_ids, "Label": predictions})
    submission.to_csv('submit.csv', index=False, header=True)
Example #4
0
def classifyRF(train_file="train.csv", test_file="test.csv", trees=70):
    """Fit a random forest on the training data and dump test predictions.

    Steps, in order: read the training data, augment it with
    ``rd.nudge_dataset``, read the test data, fit a
    ``RandomForestClassifier`` and write the predictions to ``submit.csv``.
    A progress message is printed before each read and before training.

    Args:
        train_file: Forwarded to ``rd.read_train`` as ``file_name``.
        test_file: Forwarded to ``rd.read_test`` as ``file_name``.
        trees: Given positionally to ``RandomForestClassifier`` (its tree
            count).

    Returns:
        None. ``submit.csv`` is written with a header and the columns
        ``ImageId`` (counting from 1) and ``Label`` (the prediction).
    """
    print("Reading train data")
    features, labels = rd.read_train(file_name=train_file)

    print("Augmenting dataset")
    features, labels = rd.nudge_dataset(features, labels)

    print("Reading test data")
    unseen = rd.read_test(file_name=test_file)

    # Default hyper-parameters apart from the number of trees.
    model = RandomForestClassifier(trees)

    print("Training classifier")
    model.fit(features, labels)
    predictions = model.predict(unseen)

    # One row per prediction, numbered from 1, written without the index.
    row_count = len(predictions)
    columns = {"ImageId": range(1, row_count + 1), "Label": predictions}
    pd.DataFrame(columns).to_csv('submit.csv', index=False, header=True)
#Let's test out what number of trees is best on a forest!
import numpy as np
import read_dataset as rd
import evaluation as e
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation

# Read the training set once and augment it; every forest below reuses it.
print('Loading training data')
X, y = rd.read_train()
X, y = rd.nudge_dataset(X, y)

# Accumulators for the cross-validation scoring that is currently disabled
# (see the commented-out lines inside and after the loop).
scores, scores_std = [], []

# Progress marker so it is obvious that loading finished.
print('Start learning...')
# Forest sizes to compare; the largest ones may be overkill.
forests = [10, 15, 20, 25, 30, 40, 50, 70, 100, 125, 150, 175, 200, 250]

for tree in forests:
    print("This forest has {} trees!".format(tree))
    # Each forest size gets its own plot file.
    name = "plots_extended/RandomForest_{}_trees.png".format(tree)
    classifier = RandomForestClassifier(tree)
    # Disabled cross-validation scoring:
    #score = cross_validation.cross_val_score(classifier, X, y)
    #scores.append(np.mean(score))
    #scores_std.append(np.std(score))
    e.evaluate_classifier(classifier, X, y, name=name)

# Disabled summary of the cross-validation results:
#print('Score: ', np.array(scores))
#print('Std  : ', np.array(scores_std))
Example #6
0
#Let's test out what number of trees is best on a forest!
import numpy as np
import read_dataset as rd
import evaluation as e
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation

# Load and augment the training data up front; it is shared by every run.
print('Loading training data')
X, y = rd.read_train()
X, y = rd.nudge_dataset(X, y)

# Only filled by the cross-validation code that is commented out below.
scores, scores_std = [], []

# Sanity-check message: loading is done, evaluation starts now.
print('Start learning...')
# Tree counts to try; the upper end is probably excessive.
forests = [10, 15, 20, 25, 30, 40, 50, 70, 100, 125, 150, 175, 200, 250]

for tree in forests:
    print("This forest has {} trees!".format(tree))
    # Output file for this forest size.
    name = "plots_extended/RandomForest_{}_trees.png".format(tree)
    classifier = RandomForestClassifier(tree)
    # Cross-validation scoring, switched off for now:
    #score = cross_validation.cross_val_score(classifier, X, y)
    #scores.append(np.mean(score))
    #scores_std.append(np.std(score))
    e.evaluate_classifier(classifier, X, y, name=name)

# Summary output for the switched-off cross-validation scores:
#print('Score: ', np.array(scores))
#print('Std  : ', np.array(scores_std))