Ejemplo n.º 1
0
def main():
    """Train a random forest on the training CSV and write test predictions.

    Reads ``input_data/train.csv`` (first column is the response, the
    remaining columns are the features) and ``input_data/test.csv``, fits a
    150-tree ``RandomForestClassifier`` and writes an ``ImageId``/``Label``
    table to ``random_forest_solution.csv``.
    """
    # read in the training file
    train = csv_io.read_csv("input_data/train.csv")
    train = train.values

    # set the training responses (first column of every row)
    target = [row[0] for row in train]

    # set the training features (everything after the first column)
    train = [row[1:] for row in train]

    # read in the test file
    realtest = csv_io.read_csv("input_data/test.csv")
    realtest = realtest.values

    # random forest code
    rf = RandomForestClassifier(n_estimators=150, min_samples_split=2, n_jobs=-1)

    # fit the training data
    print('fitting the model')
    rf.fit(train, target)
    print("Fitting complete!!!")

    # Run the model against the test data and keep the predicted class label.
    # Formatting predict_proba(...)[i][1] with "%d" would truncate the
    # probability of the second class to 0 (or 1) instead of reporting the
    # class the forest actually predicts for the row.
    predicted_labels = ["%d" % label for label in rf.predict(realtest)]

    total_len = len(predicted_labels)

    # ImageId is a 1-based row counter over the test rows.
    result_df = pd.DataFrame({
        "ImageId": range(1, total_len + 1),
        "Label": predicted_labels,
    })

    csv_io.write_csv("random_forest_solution.csv", result_df)

    print ('Random Forest Complete! You Rock! Submit random_forest_solution.csv to Kaggle')
Ejemplo n.º 2
0
def main(show_plot=False):
    """Cross-validate and fit a random forest regressor on ``transactions.csv``.

    Column 1 of every row is used as the regression target and columns 2
    onward as the features (column 0 looks like a sample weight and is
    currently not used -- TODO confirm). The function prints the 3-fold
    cross-validation scores, the out-of-bag score and a summary of the 19
    most important features.

    Args:
        show_plot: When true, additionally plot the target and the
            out-of-bag predictions against the first feature column
            (``days_ago``). Defaults to ``False``, in which case the
            function returns right after printing the importances.
    """
    # read in the training file
    header, rows = csv_io.read_csv("transactions.csv")
    # Drop the two leading header names so header[i] lines up with feature i.
    header = np.array(header[2:])
    target = [row[1] for row in rows]
    days_ago = [row[2] for row in rows]
    train = [row[2:] for row in rows]

    print('fitting the model')

    # NOTE: AdaBoostRegressor, GradientBoostingRegressor and BaggingRegressor
    # can be swapped in below to compare cross-validation scores.
    rf = RandomForestRegressor(oob_score=True, n_estimators=100, max_features="auto")
    scores = cross_validation.cross_val_score(rf, train, target, cv=3)
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print(scores)
    # NOTE(review): no scoring argument is passed, so for a regressor this is
    # presumably the estimator's default score rather than an accuracy.
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    rf.fit(train, target)
    print("oob_score: %f" % rf.oob_score_)

    # Feature indices sorted from most to least important.
    order = np.argsort(rf.feature_importances_)[::-1]
    top = order[0:19]
    print(top)
    print(header[top])
    print(rf.feature_importances_[top])
    # Share of the total importance carried by the top features, then total.
    print(np.sum(rf.feature_importances_[top]))
    print(np.sum(rf.feature_importances_))

    if not show_plot:
        return

    # Target in green, out-of-bag prediction in red.
    plt.plot(days_ago, target, 'g-', days_ago, list(rf.oob_prediction_), 'r-')
    plt.show()
Ejemplo n.º 3
0
def main():
    """Load the training and test CSV files and split off the target column.

    The first column of ``data/train.csv`` is taken as the target and the
    remaining columns as the features; the feature rows are then printed.
    """
    # read in the training and test data
    train = csv_io.read_csv("data/train.csv")
    # the first column of the training set will be the target for the random forest classifier
    target = [row[0] for row in train]
    train = [row[1:] for row in train]
    # Read the test set through csv_io as well, matching the training read.
    test = csv_io.read_csv("data/test.csv")

    # train is a plain list after the slicing above and has no ``.data``
    # attribute, so print the feature rows themselves.
    print(train)
Ejemplo n.º 4
0
def to_json(input_file):
    """Turn a CSV listing into a nested mapping and write it out as JSON.

    For every row, the first cell becomes a top-level key whose value maps
    each remaining cell of that row to ``{'know': 1}``. The result is written
    to ``output/social_graph.json``.
    """
    rows = csv_io.read_csv(input_file)

    # A later row with the same first cell replaces the earlier entry.
    graph = {
        row[0]: {acquaintance: {'know': 1} for acquaintance in row[1:]}
        for row in rows
    }

    json_io.write_json('output/social_graph.json', graph)
Ejemplo n.º 5
0
def main():
    """Score every row of ``test.csv`` with the hosted model and save the results.

    Authorizes against the prediction service using the credentials stored in
    ``prediction.dat`` (running the auth flow when they are missing or
    invalid), requests a prediction from trained model ``br2`` for each test
    row, and writes the collected values to ``g_submit.csv`` -- every 50 rows
    as a checkpoint and once more at the end. When a request fails for any
    other reason than an expired token, the values collected so far are also
    dumped to ``g_submit_err.csv``.
    """
    # Auth
    storage = Storage('prediction.dat')
    credentials = storage.get()
    if credentials is None or credentials.invalid:
        credentials = run(FLOW, storage)
    http = credentials.authorize(httplib2.Http())

    service = build("prediction", "v1.4", http=http)

    testset = csv_io.read_csv("test.csv", False)
    submit = []
    for i, row in enumerate(testset):
        count = i + 1  # 1-based row number used for progress and checkpoints
        try:
            body = {"input": {"csvInstance": parseInput(row)}}
            prediction = service.trainedmodels().predict(id="br2", body=body).execute()
            predval = prediction['outputValue']
            # Google Predict isn't smart enough to figure out that we want a probability
            # Instead it fits a regression, which can go outside the range 0-1
            if predval >= 1.0:
                predval = 0.99999
            if predval <= 0.0:
                predval = 0.00001

            submit.append(predval)
            print(str(count) + ": " + str(predval))
        except AccessTokenRefreshError:
            print ("The credentials have been revoked or expired, please re-run the application to re-authorize")
        except Exception:
            # Catch Exception rather than everything so Ctrl-C and
            # SystemExit still stop the run; keep going after ordinary
            # per-row failures, saving what has been collected so far.
            msg = traceback.format_exc()
            csv_io.write_csv("g_submit_err.csv", [["%f" % x] for x in submit])
            print("error on: " + str(i) + "  exception: " + msg)
        if count % 50 == 0:
            csv_io.write_csv("g_submit.csv", [["%f" % x] for x in submit])
            print("wrote to disk")
        # uncomment below if hitting throttling limits from Google
        # time.sleep(0.5)
    csv_io.write_csv("g_submit.csv", [["%f" % x] for x in submit])
Ejemplo n.º 6
0
import numpy as np
from sklearn import svm
from csv_io import read_csv


if __name__ == '__main__':
    # Fit a support vector classifier on columns 2-4 of the training file,
    # using the file's last column as the label.
    data = read_csv(r'data/training.csv')

    data_points, column_count = data.shape
    n_features = column_count - 1  # index of the last column

    y = data[:, n_features]
    X = data[:, 2:5]

    clf = svm.SVC()
    clf.fit(X, y)

    # Predict labels for the same three columns of the test file.
    data = read_csv(r'data/test.csv')

    X = data[:, 2:5]
    y = clf.predict(X)