def main(): #read in the training file train = csv_io.read_csv("input_data/train.csv") train = train.values #set the training responses target = [x[0] for x in train] #set the training features train = [x[1:] for x in train] #read in the test file realtest = csv_io.read_csv("input_data/test.csv") realtest = realtest.values # random forest code rf = RandomForestClassifier(n_estimators=150, min_samples_split=2, n_jobs=-1) # fit the training data print('fitting the model') rf.fit(train, target) print("Fitting complete!!!") # run model against test data predicted_probs = rf.predict_proba(realtest) predicted_probs = ["%d" % x[1] for x in predicted_probs] total_len = len(predicted_probs) result_df = pd.DataFrame({"ImageId" : range(1, total_len + 1), "Label" : predicted_probs[:]}) csv_io.write_csv("random_forest_solution.csv", result_df) print ('Random Forest Complete! You Rock! Submit random_forest_solution.csv to Kaggle')
import numpy as np
import matplotlib.pyplot as plt
from sklearn import cross_validation
from sklearn.ensemble import RandomForestRegressor
# from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, BaggingRegressor

import csv_io  # project-local CSV helper


def main():
    # read in the training file: first column is a per-transaction weight,
    # second is the target, and the remaining columns are the features
    header, rows = csv_io.read_csv("transactions.csv")
    header = np.array(header[2:])
    weight = np.array([x[0] for x in rows])  # currently unused (see the commented fit below)
    target = [x[1] for x in rows]
    days_ago = [x[2] for x in rows]
    # balance = [x[9] for x in rows]
    train = [x[2:] for x in rows]

    print('fitting the model')
    rf = RandomForestRegressor(oob_score=True, n_estimators=100, max_features="auto")
    # rf = RandomForestRegressor(n_estimators=100)

    # 3-fold cross-validation score for the chosen regressor
    scores = cross_validation.cross_val_score(rf, train, target, cv=3)
    print(scores)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    # alternative regressors tried (each was cross-validated the same way and
    # left commented out):
    # rf = AdaBoostRegressor()
    # rf = GradientBoostingRegressor()
    # rf = BaggingRegressor()
    # scores = cross_validation.cross_val_score(rf, train, target, cv=3)
    # print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    # rf.fit(train, target, weight)
    rf.fit(train, target)
    print("oob_score: %f" % rf.oob_score_)

    # report the most important features, in descending order
    order = np.argsort(rf.feature_importances_)[::-1]
    print(order[0:19])
    print(header[order[0:19]])
    print(rf.feature_importances_[order[0:19]])
    print(np.sum(rf.feature_importances_[order[0:19]]))
    print(np.sum(rf.feature_importances_))
    print(list(np.sort(rf.feature_importances_)))

    # plot the target against the out-of-bag predictions over time
    # plt.plot(days_ago, balance, 'g-')
    plt.plot(days_ago, target, 'g-', days_ago, list(rf.oob_prediction_), 'r-')
    plt.show()


if __name__ == '__main__':
    main()
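# Note on the commented-out weighted fit above: RandomForestRegressor.fit()
# takes the weights as its third argument (sample_weight), so the explicit
# form, if per-transaction weighting is wanted, is:
#
#   rf.fit(train, target, sample_weight=weight)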
import csv_io  # project-local CSV helper


def main():
    # read in the training and test data
    train = csv_io.read_csv("data/train.csv")

    # the first column of the training set is the target for the random forest
    # classifier; the remaining columns are the features
    target = [x[0] for x in train]
    train = [x[1:] for x in train]

    test = csv_io.read_csv("data/test.csv")

    # create and train the random forest (the script stops here; see the sketch below)
    print(train[0])
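# The script above stops before building the model. A minimal sketch of the
# missing steps follows; the n_estimators value and the output filename are
# assumptions for illustration, not part of the original file.
from sklearn.ensemble import RandomForestClassifier


def train_and_predict(train, target, test):
    # fit a random forest on the training features and labels
    rf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    rf.fit(train, target)
    # predict one label per test row and write them out with the same CSV helper
    predictions = rf.predict(test)
    csv_io.write_csv("data/random_forest_predictions.csv", [[p] for p in predictions])
    return predictions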
import csv_io   # project-local CSV helper
import json_io  # project-local JSON helper


def to_json(input_file):
    """Convert a CSV adjacency list into a nested 'who knows whom' dict."""
    csv_data = csv_io.read_csv(input_file)

    relation = {}
    for row in csv_data:
        # first column is the person; every other column is someone they know
        relation[row[0]] = {}
        for i in range(1, len(row)):
            relation[row[0]][row[i]] = {'know': 1}

    json_io.write_json('output/social_graph.json', relation)
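# For illustration only: assuming an input row of the form
# ["alice", "bob", "carol"] (a made-up example, not from the original data),
# the structure written to output/social_graph.json would be
#
#   {"alice": {"bob": {"know": 1}, "carol": {"know": 1}}}
#
# i.e. one top-level key per person, mapping each acquaintance to {'know': 1}.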
import sys
import traceback

import httplib2
from apiclient.discovery import build
from oauth2client.client import AccessTokenRefreshError
from oauth2client.file import Storage
from oauth2client.tools import run

import csv_io  # project-local CSV helper
# import time  # needed only if the throttling sleep below is enabled

# FLOW (the OAuth2 flow object) and parseInput() are defined elsewhere in this
# project; a sketch of parseInput() follows the script.


def main():
    # Auth: reuse stored credentials, or run the OAuth flow to get new ones
    storage = Storage('prediction.dat')
    credentials = storage.get()
    if credentials is None or credentials.invalid:
        credentials = run(FLOW, storage)

    http = httplib2.Http()
    http = credentials.authorize(http)
    service = build("prediction", "v1.4", http=http)

    testset = csv_io.read_csv("test.csv", False)

    submit = []
    count = 1
    for i in range(len(testset)):
        try:
            body = {"input": {"csvInstance": parseInput(testset[i])}}
            prediction = service.trainedmodels().predict(id="br2", body=body).execute()
            # the API may return the value as a string, so coerce it to a float
            predval = float(prediction['outputValue'])
            # Google Prediction fits a regression rather than returning a
            # probability, so the output can fall outside the range 0-1; clip it
            if predval >= 1.0:
                predval = 0.99999
            if predval <= 0.0:
                predval = 0.00001
            submit.append(predval)
            print(str(count) + ": " + str(predval))
        except AccessTokenRefreshError:
            print("The credentials have been revoked or have expired; please re-run the application to re-authorize")
        except Exception:
            etype, value, tb = sys.exc_info()
            msg = ''.join(traceback.format_exception(etype, value, tb))
            csv_io.write_csv("g_submit_err.csv", [["%f" % x] for x in submit])
            print("error on: " + str(i) + " exception: " + msg)
        # checkpoint the predictions to disk every 50 rows
        if count % 50 == 0:
            csv_io.write_csv("g_submit.csv", [["%f" % x] for x in submit])
            print("wrote to disk")
        count = count + 1
        # uncomment below if hitting throttling limits from Google
        # time.sleep(0.5)

    csv_io.write_csv("g_submit.csv", [["%f" % x] for x in submit])
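# parseInput() is not shown in the original script. A minimal sketch of what it
# might look like, assuming each row from csv_io.read_csv is a list of strings
# and the trained model expects numeric features where possible (both are
# assumptions):
def parse_input_sketch(row):
    # convert each field to a float where possible, otherwise pass it through
    # unchanged, since csvInstance accepts mixed string/numeric values
    parsed = []
    for field in row:
        try:
            parsed.append(float(field))
        except (TypeError, ValueError):
            parsed.append(field)
    return parsed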
import numpy as np
from sklearn import svm

from csv_io import read_csv

if __name__ == '__main__':
    # load the training data: columns 2-4 are the features and the last
    # column is the label
    data = read_csv(r'data/training.csv')
    data_points, n_features = data.shape
    n_features = n_features - 1

    X = data[:, 2:5]
    y = data[:, n_features]

    # fit a support vector classifier
    clf = svm.SVC()
    clf.fit(X, y)

    # predict on the same feature columns of the test set
    data = read_csv(r'data/test.csv')
    X = data[:, 2:5]
    y = clf.predict(X)
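# The csv_io helper module used throughout these scripts is not shown. Below is
# a minimal sketch of read_csv/write_csv matching how this last script indexes
# the result (a 2-D NumPy array). Other scripts here appear to expect different
# return types (a DataFrame, or a (header, rows) tuple), so this is only one
# plausible variant, not the project's actual implementation.
import csv

import numpy as np


def read_csv_sketch(path, skip_header=True):
    # read every row, optionally dropping a header line, and return a float array
    with open(path, newline='') as f:
        rows = list(csv.reader(f))
    if skip_header:
        rows = rows[1:]
    return np.array(rows, dtype=float)


def write_csv_sketch(path, rows):
    # write an iterable of rows back out as CSV
    with open(path, 'w', newline='') as f:
        csv.writer(f).writerows(rows)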