Example #1
0
def beatthebenchmark():
    """Fit one gradient-boosting regressor per target and write the predictions.

    Reads the training and test CSV files named by ``trainloc`` and
    ``testloc`` (both defined outside this function), trains a separate
    ``GradientBoostingRegressor`` for every entry of ``targets`` and writes
    one prediction column per target, alongside ``PIDN``, to ``predloc``.
    Returns nothing.
    """
    #Features to train model on
    featuresList = ['BSAN','BSAS','BSAV','CTI','ELEV','EVI','LSTD','LSTN','REF1','REF2','REF3','REF7','RELI','TMAP','TMFI']
    #Dependent/Target variables
    targets = ['Ca','P','pH','SOC','Sand']

    #Columns to be picked from training file: features, 'Depth', then the targets
    pickTrain = featuresList + ['Depth'] + targets
    data = np.genfromtxt(trainloc, names=True, delimiter=',', usecols=pickTrain)
    #Columns to be picked from test file: the id column followed by the features
    pickTest = ['PIDN'] + featuresList
    test = np.genfromtxt(testloc, names=True, delimiter=',', usecols=pickTest)
    #The first column is read a second time, as strings, to label the output rows
    ids = np.genfromtxt(testloc, dtype=str, skip_header=1, delimiter=',', usecols=0)

    #Keep a copy of train file so every target is prepared from the same rows
    data1 = np.copy(data)

    #Prepare empty result; each target column holds a placeholder until it is
    #overwritten with predictions inside the loop
    df = pd.DataFrame({"PIDN": ids, "Ca": test['PIDN'], "P": test['PIDN'], "pH": test['PIDN'], "SOC": test['PIDN'], "Sand": test['PIDN']})

    for target in targets:
        #Prepare data for training
        data, testa, features, fillVal = util.prepDataTrain(data1, target, featuresList, False, 10, False, True, 'mean', False, 'set')
        print('Data preped')

        #Use/tune your predictor. Fitting must happen inside the loop: one
        #model per target, otherwise only the last target is ever predicted.
        clf = ensemble.GradientBoostingRegressor(n_estimators=20)
        clf.fit(data[features].tolist(), data[target])

        #Prepare test data
        #NOTE(review): `test` is re-prepared on every pass; this assumes
        #util.prepDataTest is safe to apply to its own output - confirm.
        test = util.prepDataTest(test, featuresList, True, fillVal, False, 'set')

        #Get predictions
        pred = clf.predict(test[features].tolist())

        #Store results
        df[target] = pred

    #Write all targets once; `columns` is the current keyword (formerly `cols`)
    df.to_csv(predloc, index=False, columns=["PIDN","Ca","P","pH","SOC","Sand"])
Example #2
0
def classification(data, featuresList):
    data["target"][:, (data["target"] > 0)] = 1
    data["target"][:, (data["target"] == 0)] = 0

    data, testa, features, fillVal = util.prepDataTrain(
        data, "target", featuresList, True, 50, True, True, "median", False, "set"
    )
    print "Data preped"

    clf = bayes.GaussianNB()
    # clf = tree.DecisionTreeClassifier()
    clf.fit(data[features].tolist(), data["target"])
    pred = clf.predict_proba(testa[features].tolist())[:, 1]
    pred[pred > 0.005] = 1
    pred[pred <= 0.005] = 0
    res = testa["target"] - pred
    print res, pred, testa["target"], len(np.where(res[res < -0.5])[0]), len(np.where(res[res > 0.5])[0]), len(
        np.where(testa["target"][testa["target"] > 0.5])[0]
    ), testa.shape, data.shape
Example #3
0
def classification(data, featuresList):
    data['target'][:, (data['target'] > 0)] = 1
    data['target'][:, (data['target'] == 0)] = 0

    data, testa, features, fillVal = util.prepDataTrain(
        data, 'target', featuresList, True, 50, True, True, 'median', False,
        'set')
    print 'Data preped'

    clf = bayes.GaussianNB()
    #clf = tree.DecisionTreeClassifier()
    clf.fit(data[features].tolist(), data['target'])
    pred = clf.predict_proba(testa[features].tolist())[:, 1]
    pred[pred > .005] = 1
    pred[pred <= .005] = 0
    res = testa['target'] - pred
    print res, pred, testa['target'], len(np.where(res[res < -.5])[0]), len(
        np.where(res[res > .5])[0]), len(
            np.where(testa['target'][testa['target'] > .5])
            [0]), testa.shape, data.shape
Example #4
0
    clf = 0
    print target

    '''
    for i in range(len(data)):
        if data[target][i] > (data[target].mean() + 2*data[target].std()) or data[target][i] < (data[target].mean() - 2*data[target].std()):
            delList = np.append(delList, i)
            print (data[target].mean() - 1*data[target].std()), data[target].std()
    '''

    #clf = linear.BayesianRidge(verbose=True, alpha_1=2, alpha_2=2, lambda_1=.01, lambda_2=.01, fit_intercept=True, compute_score=True)
    #clf = linear.BayesianRidge(verbose=True)
    #clf = tree.DecisionTreeRegressor(max_depth=2)
    clf = svm.SVR(C=10000.0, kernel='rbf', degree=1)
    data = np.delete(data, delList, 0)
    data, testa, features, fillVal = util.prepDataTrain(data, target, featuresList, False, 20, False, True, 'mean', False, 'set')
    data = recfunctions.rec_drop_fields(data, delFeatures)
    #features = ['CTI','Depth', 'RELI', 'LSTN']
    #an.plotData(np.sqrt(1+data['P']), data['ELEV']*(-1*data['TMAP']))
    #data, clust, enc, newCol = clusterData(data, clusterFields, True)
    #testa, clust, enc, newCol = clusterData(testa, pickTest, True, enc, clust, False)
    #features = np.concatenate((features, newCol))
    
    #Use/tune your predictor
    #clf.fit(data[features].tolist(), data[target])
    #import pydot
    #dot_data = StringIO.StringIO() 
    #tree.export_graphviz(clf, out_file=dot_data) 
    #graph = pydot.graph_from_dot_data(dot_data.getvalue()) 
    
    #graph.write_pdf("./ds.pdf")
Example #5
0
import numpy as np
import utilities as util
import sklearn.linear_model as linear
import sklearn.ensemble as ensemble
from sklearn import cross_validation
import pandas as pd

# Load both CSV files as structured arrays keyed by the header names.
data = np.genfromtxt('../training.csv', delimiter=',', names=True)
test = np.genfromtxt('../test.csv', delimiter=',', names=True)

# Columns used as model inputs.
featuresList = [
    'DER_mass_MMC',
    'DER_mass_transverse_met_lep',
    'DER_mass_vis',
    'DER_pt_h',
    'DER_deltaeta_jet_jet',
    'DER_mass_jet_jet',
    'DER_prodeta_jet_jet',
    'DER_deltar_tau_lep',
    'DER_pt_tot',
    'DER_sum_pt',
    'DER_pt_ratio_lep_tau',
    'DER_met_phi_centrality',
    'DER_lep_eta_centrality',
    'PRI_tau_pt',
    'PRI_tau_eta',
    'PRI_tau_phi',
    'PRI_lep_pt',
    'PRI_lep_eta',
    'PRI_lep_phi',
    'PRI_met',
    'PRI_met_phi',
    'PRI_met_sumet',
    'PRI_jet_num',
    'PRI_jet_leading_pt',
    'PRI_jet_leading_eta',
    'PRI_jet_leading_phi',
    'PRI_jet_subleading_pt',
    'PRI_jet_subleading_eta',
    'PRI_jet_subleading_phi',
    'PRI_jet_all_pt',
]

# Prepare the training rows; fillVal is reused below for the test rows.
data, testa, features, fillVal = util.prepDataTrain(
    data, 'Label', featuresList, False, 10, False, True, 'mean', False, 'set')
print('Data preped')

# Alternative that was tried: ensemble.BaggingClassifier()
clf = ensemble.GradientBoostingClassifier(n_estimators=50)
clf.fit(data[features].tolist(), data['Label'])
# Disabled checks kept for reference:
#   scores = cross_validation.cross_val_score(clf, data[features].tolist(), data['Label'], cv=5, scoring='f1')
#   print scores
#   print clf.score(test[features].tolist(), test['Label'])
print('fitted')

pcut = 0.5
# Event ids are taken from the raw test array before it is prepared.
ids = test['EventId'].astype(int)
X_test = util.prepDataTest(test, featuresList, True, fillVal, False, 'set')
# Disabled pandas-based loading kept for reference:
#   data = pd.read_csv("../test.csv")
#   X_test = data.values[:, 1:]

Example #6
0
# Untouched copy of the loaded rows; each pass of the loop starts from it.
data1 = np.copy(data)

featuresList = [
    'weatherVar185', 'weatherVar21', 'weatherVar189',
    'weatherVar161', 'weatherVar103', 'weatherVar95',
    'weatherVar194', 'weatherVar216', 'weatherVar186',
    'weatherVar110', 'weatherVar137', 'weatherVar23',
    'weatherVar49', 'weatherVar232', 'weatherVar68',
    'weatherVar22', 'weatherVar151', 'weatherVar16',
    'geodemVar14', 'geodemVar29',
    'var8', 'var4', 'var10', 'var11',
    'var12', 'var13', 'var15', 'var17',
]

# Cross validation test scores.
# NOTE(review): i is forwarded as the last argument of util.prepDataTrain -
# presumably it selects the split; confirm in utilities.
for i in (0, 1):
    data, testa, features, fillVal = util.prepDataTrain(
        np.copy(data1), 'target', featuresList, True, 50, False, True,
        'median', False, 'set', i)

    # Train on log(e + target), with the transformed value capped at 3.
    data['target'] = np.log(math.e + data['target'])
    data['target'][data['target'] > 3] = 3
    print('Data preped')

    clf = ensemble.GradientBoostingRegressor(
        loss='ls',
        n_estimators=45,
        max_depth=5,
        min_samples_split=30,
        min_samples_leaf=20,
        verbose=True)
    clf.fit(data[features].tolist(), data['target'])
    print('fitted')
Example #7
0
# Input columns shared by every per-target model.
featuresList = [
    'BSAN', 'BSAS', 'BSAV', 'CTI', 'ELEV',
    'EVI', 'LSTD', 'LSTN', 'REF1', 'REF2',
    'REF3', 'REF7', 'RELI', 'TMAP', 'TMFI',
]

# Snapshot of the training rows; each target is prepared from this copy.
data1 = np.copy(data)

# One model is trained for each of these columns.
targets = ['Ca', 'P', 'pH', 'SOC', 'Sand']

# Result frame: the target columns start as placeholders and are replaced
# by predictions in the loop below.
df = pd.DataFrame({
    "PIDN": ids,
    "Ca": test['PIDN'],
    "P": test['PIDN'],
    "pH": test['PIDN'],
    "SOC": test['PIDN'],
    "Sand": test['PIDN'],
})

for target in targets:
    # Training rows for this target, plus the fill values for the test rows.
    data, testa, features, fillVal = util.prepDataTrain(
        data1, target, featuresList, False, 10, False, True, 'mean', False,
        'set')
    print('Data preped')

    # Fit this target's regressor.
    clf = ensemble.GradientBoostingRegressor(n_estimators=20)
    clf.fit(data[features].tolist(), data[target])

    # Prepare the test rows, predict, and store the column.
    test = util.prepDataTest(test, featuresList, True, fillVal, False, 'set')
    pred = clf.predict(test[features].tolist())
    df[target] = pred
Example #8
0
    "geodemVar29",
    "var8",
    "var4",
    "var10",
    "var11",
    "var12",
    "var13",
    "var15",
    "var17",
]

# Cross validation test scores: prepare the rows, fit on a log-compressed
# target and print the normalized weighted Gini for the held-back rows.
# NOTE(review): i is forwarded as the last argument of util.prepDataTrain -
# presumably it selects the split; confirm in utilities.
for i in [0, 1]:
    data = np.copy(data1)
    data, testa, features, fillVal = util.prepDataTrain(
        data, "target", featuresList, True, 50, False, True, "median", False, "set", i
    )
    # Train on log(e + target), with the transformed value capped at 3.
    data["target"] = np.log(math.e + data["target"])
    data["target"][data["target"] > 3] = 3  # np.log(data['target'][data['target'] > 10])

    print("Data preped")

    clf = ensemble.GradientBoostingRegressor(
        n_estimators=45, max_depth=5, min_samples_leaf=20, min_samples_split=30, verbose=True, loss="ls"
    )
    clf.fit(data[features].tolist(), data["target"])
    print("fitted")

    # Undo the training transform before scoring against the untransformed
    # testa["target"]: the inverse of log(e + y) is exp(p) - e. Raising the
    # prediction to the power e (p ** e) is not that inverse.
    pred = np.exp(clf.predict(testa[features].tolist())) - math.e
    print(normalized_weighted_gini(testa["target"], pred, testa["var11"]))
    # for i in range(len(clf.feature_importances_)):
Example #9
0
def beatthebenchmark():
    """Train a gradient-boosting regressor for each target and save the predictions.

    The training and test CSV paths come from ``trainloc`` and ``testloc``
    and the output path from ``predloc``; all three are defined outside
    this function. For every entry of ``targets`` a separate
    ``GradientBoostingRegressor`` is fitted and its predictions for the test
    rows are stored in the matching column of the result frame, which is
    written to ``predloc`` once all targets are done. Returns nothing.
    """
    #Columns to be picked from training file
    pickTrain = [
        'BSAN', 'BSAS', 'BSAV', 'CTI', 'ELEV', 'EVI', 'LSTD', 'LSTN', 'REF1',
        'REF2', 'REF3', 'REF7', 'RELI', 'TMAP', 'TMFI', 'Depth', 'Ca', 'P',
        'pH', 'SOC', 'Sand'
    ]
    data = np.genfromtxt(trainloc,
                         names=True,
                         delimiter=',',
                         usecols=(pickTrain))
    #Column to be picked from test file
    pickTest = [
        'PIDN', 'BSAN', 'BSAS', 'BSAV', 'CTI', 'ELEV', 'EVI', 'LSTD', 'LSTN',
        'REF1', 'REF2', 'REF3', 'REF7', 'RELI', 'TMAP', 'TMFI'
    ]
    test = np.genfromtxt(testloc,
                         names=True,
                         delimiter=',',
                         usecols=(pickTest))
    #First column read again as strings; used to label the output rows
    ids = np.genfromtxt(testloc,
                        dtype=str,
                        skip_header=1,
                        delimiter=',',
                        usecols=0)
    #Features to train model on
    featuresList = [
        'BSAN', 'BSAS', 'BSAV', 'CTI', 'ELEV', 'EVI', 'LSTD', 'LSTN', 'REF1',
        'REF2', 'REF3', 'REF7', 'RELI', 'TMAP', 'TMFI'
    ]
    #Keep a copy of train file so each target is prepared from the same rows
    data1 = np.copy(data)
    #Dependent/Target variables
    targets = ['Ca', 'P', 'pH', 'SOC', 'Sand']
    #Prepare empty result; target columns are placeholders until overwritten
    df = pd.DataFrame({
        "PIDN": ids,
        "Ca": test['PIDN'],
        "P": test['PIDN'],
        "pH": test['PIDN'],
        "SOC": test['PIDN'],
        "Sand": test['PIDN']
    })
    for target in targets:
        #Prepare data for training
        data, testa, features, fillVal = util.prepDataTrain(
            data1, target, featuresList, False, 10, False, True, 'mean', False,
            'set')
        print('Data preped')
        #Use/tune your predictor. This has to run once per target; outside
        #the loop only the final target would ever be fitted and stored.
        clf = ensemble.GradientBoostingRegressor(n_estimators=20)
        clf.fit(data[features].tolist(), data[target])
        #Prepare test data
        #NOTE(review): `test` is re-prepared on every pass; this assumes
        #util.prepDataTest is safe to apply to its own output - confirm.
        test = util.prepDataTest(test, featuresList, True, fillVal, False,
                                 'set')
        #Get predictions
        pred = clf.predict(test[features].tolist())
        #Store results
        df[target] = pred
    #Write every target in one go; `columns` replaces the legacy `cols` keyword
    df.to_csv(predloc,
              index=False,
              columns=["PIDN", "Ca", "P", "pH", "SOC", "Sand"])