def get_train_data(limit=-1):
    """Load, augment, and scale the training set.

    limit=-1 loads everything; any other value is forwarded unchanged to
    rd.read_train. Returns (features, labels) with the features scaled.
    """
    print('Loading train data')
    features, labels = rd.read_train(limit=limit)
    print('Augmenting data set')
    features, labels = rd.nudge_dataset(features, labels)
    print('Scaling data')
    features = scale(features)
    return features, labels
def get_train_data(limit=-1):
    """Read the training data, augment it via nudging, and return the
    scaled feature matrix together with the label vector.

    The limit argument is passed straight through to rd.read_train
    (-1 means no limit).
    """
    print('Loading train data')
    data, targets = rd.read_train(limit=limit)
    print('Augmenting data set')
    data, targets = rd.nudge_dataset(data, targets)
    print('Scaling data')
    return scale(data), targets
def classifyRF(train_file="train.csv", test_file="test.csv", trees=70):
    """Train a random forest on the training CSV, predict the test CSV,
    and write the predictions to submit.csv in Kaggle submission format.

    trees is the number of estimators in the forest; every other
    hyper-parameter is left at its scikit-learn default.
    """
    print("Reading train data")
    features, labels = rd.read_train(file_name=train_file)
    print("Augmenting dataset")
    features, labels = rd.nudge_dataset(features, labels)
    print("Reading test data")
    test_features = rd.read_test(file_name=test_file)
    # One-shot classifier: only the tree count is configurable here.
    forest = RandomForestClassifier(trees)
    print("Training classifier")
    forest.fit(features, labels)
    predicted = forest.predict(test_features)
    # The submission format is a two-column CSV: 1-based image id + label.
    submission = pd.DataFrame({
        "ImageId": range(1, len(predicted) + 1),
        "Label": predicted,
    })
    submission.to_csv('submit.csv', index=False, header=True)
def classifyRF(train_file="train.csv", test_file="test.csv", trees=70):
    """Fit a RandomForestClassifier with the given number of trees and
    dump its test-set predictions to submit.csv (Kaggle layout).

    Reads features/labels from train_file, augments them with
    rd.nudge_dataset, then predicts the rows of test_file.
    """
    print("Reading train data")
    X_train, y_train = rd.read_train(file_name=train_file)
    print("Augmenting dataset")
    X_train, y_train = rd.nudge_dataset(X_train, y_train)
    print("Reading test data")
    X_test = rd.read_test(file_name=test_file)
    rf = RandomForestClassifier(trees)
    print("Training classifier")
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    # Write "ImageId,Label" rows; ids are 1-based per the expected format.
    ids = range(1, len(y_pred) + 1)
    pd.DataFrame({"ImageId": ids, "Label": y_pred}).to_csv(
        'submit.csv', index=False, header=True)
#Let's test out what number of trees is best on a forest!
#
# Fix: the original did `from sklearn import cross_validation`, a module
# removed in scikit-learn 0.20, so the script crashed at import time on any
# modern install. It was only referenced from commented-out dead code, which
# is dropped here along with the equally dead numpy import and score lists.
import read_dataset as rd
import evaluation as e
from sklearn.ensemble import RandomForestClassifier

# Load and augment the training data once; every forest size reuses it.
print('Loading training data')
X, y = rd.read_train()
X, y = rd.nudge_dataset(X, y)

#just so we know it didn't blow up or something
print('Start learning...')
#The last few might be excessive.
forests = [10, 15, 20, 25, 30, 40, 50, 70, 100, 125, 150, 175, 200, 250]
for tree in forests:
    print("This forest has {} trees!".format(tree))
    classifier = RandomForestClassifier(tree)
    # evaluate_classifier performs the scoring and saves the plot to `name`.
    name = "plots_extended/RandomForest_{}_trees.png".format(tree)
    e.evaluate_classifier(classifier, X, y, name=name)
#Let's test out what number of trees is best on a forest!
#
# Fix: `from sklearn import cross_validation` imports a module that was
# removed in scikit-learn 0.20, making the whole script fail at import time
# on current versions. The only uses were in commented-out code, so that
# import, the unused numpy import, and the never-printed scores/scores_std
# lists are removed; the executed behavior is unchanged.
import read_dataset as rd
import evaluation as e
from sklearn.ensemble import RandomForestClassifier

# loading training data
print('Loading training data')
X, y = rd.read_train()
X, y = rd.nudge_dataset(X, y)

#just so we know it didn't blow up or something
print('Start learning...')
#The last few might be excessive.
forests = [10, 15, 20, 25, 30, 40, 50, 70, 100, 125, 150, 175, 200, 250]
for tree in forests:
    print("This forest has {} trees!".format(tree))
    classifier = RandomForestClassifier(tree)
    # evaluate_classifier does the scoring and writes the plot to `name`.
    name = "plots_extended/RandomForest_{}_trees.png".format(tree)
    e.evaluate_classifier(classifier, X, y, name=name)