Esempio n. 1
0
def lgd(numfolds=5, threshold=0.5, featsfile='featsGBM.txt'):
    """Fit the loss regressor on flagged rows and write a test submission.

    Reads the first 150 names from the ``feature`` column of `featsfile`,
    loads those columns for train and test, and fits a
    GradientBoostingRegressor on ``log10(loss + 1)`` using only the
    training rows whose ``loss`` value in '../data/subs/train/gbm.csv'
    exceeds `threshold`.  Test predictions are mapped back from log space,
    clipped to [0, 100], zeroed wherever the ``loss`` value in
    '../data/subs/test/gbm.csv' is not above `threshold`, and written to
    '../data/subs/testSubmission.csv'.

    Args:
        numfolds: Not used by this function; the cross-validation step
            below always runs with ``nFolds=2``.  Kept so that existing
            callers keep working.
        threshold: Cut-off applied to the ``loss`` column of the two
            gbm.csv files (presumably first-stage classifier scores --
            confirm against whatever produces those files).
        featsfile: CSV file with a ``feature`` column listing the column
            names to load.

    Returns:
        None.  Progress is printed and the submission is written to disk.
    """
    # open up file containing the columns we wish to use
    f = fio.fileio(usePCA=False)
    st = time.time()

    # Only the first 150 listed features are used.
    numcols = pd.read_csv(featsfile)['feature'].values[:150].tolist()

    X = f.loadNumericTrain(usecols=numcols)
    y = f.loadLabels().loss.values
    print("Training data took %f seconds to load" % (time.time() - st))

    rgr = GradientBoostingRegressor(
        **ml.INIT_PARAMS['GradientBoostingRegressor'])

    # Load the test data
    Xtest = f.loadNumericTest(usecols=numcols)

    # Scores used to decide which rows the regressor sees / is applied to.
    zp = pd.read_csv('../data/subs/train/gbm.csv')['loss'].values
    yp = pd.read_csv('../data/subs/test/gbm.csv')['loss'].values

    # Compute each selection mask once instead of on every use.
    train_mask = zp > threshold
    test_mask = yp > threshold

    Xsel = X[train_mask, :]
    ysel = y[train_mask]

    # Cross-validate on the selected rows only; rows that were not selected
    # keep a prediction of 0 when the error is measured against all labels.
    y_ = np.zeros(zp.size)
    y_[train_mask] = ml.stratKFold(Xsel, ysel, rgr, nFolds=2, classify=False)
    print("CV Error: %f" % mean_absolute_error(y, y_))

    # Train on all selected rows, with the target moved to log space
    rgr.fit(Xsel, np.log10(ysel + 1.))

    # Predict, undo the log transform and keep the result inside [0, 100]
    yr = np.clip(10. ** rgr.predict(Xtest) - 1., 0., 100.)

    print("Training took %f seconds" % (time.time() - st))

    sub_ = pd.read_csv('../data/sampleSubmission.csv')
    # Item assignment always writes the column.  The values are already
    # non-negative (clipped above, then multiplied by a boolean mask), so
    # no further masking of negatives is needed.
    sub_['loss'] = yr * test_mask

    # Write to file
    sub_.to_csv('../data/subs/testSubmission.csv', index=False)
Esempio n. 2
0
def lgd(numfolds=5,threshold=0.5,featsfile='featsGBM.txt'):
	"""Regress the loss for flagged rows and write the test submission.

	The first 150 entries of the ``feature`` column of `featsfile` name the
	columns that are loaded for train and test.  A
	GradientBoostingRegressor is fitted on ``log10(loss + 1)`` for the
	training rows whose ``loss`` value in '../data/subs/train/gbm.csv' is
	above `threshold`.  Test predictions are converted back from log
	space, limited to [0, 100], set to zero where the ``loss`` value in
	'../data/subs/test/gbm.csv' is not above `threshold`, and saved to
	'../data/subs/testSubmission.csv'.

	Args:
		numfolds: Not used by this function; cross-validation below is
			always run with ``nFolds=2``.  Retained for compatibility
			with existing callers.
		threshold: Cut-off applied to the ``loss`` column of both gbm.csv
			files (presumably first-stage classifier scores -- confirm
			against the producer of those files).
		featsfile: CSV file whose ``feature`` column lists column names.

	Returns:
		None.  Timing/CV information is printed and a CSV is written.
	"""
	# open up file containing the columns we wish to use
	f = fio.fileio(usePCA=False)
	st = time.time()

	# Only the first 150 listed features are used.
	numcols = pd.read_csv(featsfile)['feature'].values[:150].tolist()

	X = f.loadNumericTrain(usecols=numcols)
	y = f.loadLabels().loss.values
	print("Training data took %f seconds to load" % (time.time() - st))

	rgr = GradientBoostingRegressor(**ml.INIT_PARAMS['GradientBoostingRegressor'])

	# Load the test data
	Xtest = f.loadNumericTest(usecols=numcols)

	# Open up the train/test files holding the selection scores
	bTrain = pd.read_csv('../data/subs/train/gbm.csv')
	bTest = pd.read_csv('../data/subs/test/gbm.csv')

	# Build each boolean selection once and reuse it everywhere below.
	keep_train = bTrain['loss'].values > threshold
	keep_test = bTest['loss'].values > threshold

	Z = X[keep_train, :]
	p = y[keep_train]

	# Cross-validate on the selected rows; unselected rows stay at 0 when
	# the error is computed against the full label vector.
	y_ = np.zeros(keep_train.size)
	y_[keep_train] = ml.stratKFold(Z, p, rgr, nFolds=2, classify=False)
	print("CV Error: %f" % mean_absolute_error(y, y_))

	# Train on all selected rows, target transformed to log space
	rgr.fit(Z, np.log10(p + 1.))

	# Predict, invert the log transform, and bound the result to [0, 100]
	yr = 10. ** rgr.predict(Xtest) - 1.
	yr = np.clip(yr, 0., 100.)

	print("Training took %f seconds" % (time.time() - st))

	sub_ = pd.read_csv('../data/sampleSubmission.csv')
	# Write through item assignment so the column is always set.  The
	# product is already >= 0 (clipped predictions times a boolean mask),
	# so masking negatives afterwards is unnecessary.
	sub_['loss'] = yr * keep_test

	# Write to file
	sub_.to_csv('../data/subs/testSubmission.csv', index=False)
Esempio n. 3
0
def classifier(filename='',
               calctest=True,
               traindir='',
               testdir='',
               numfolds=5):
    """Cross-validate a gradient boosting classifier and save its scores.

    Loads six fixed feature columns, obtains out-of-fold predictions from
    ``ml.stratKFold``, prints the AUC, F1 score and timing, and writes the
    predictions (in the ``loss`` column of the label frame) to
    ``traindir + filename + '.csv'``.  When `calctest` is true the
    classifier is refitted on all training rows with target ``loss > 0``
    and the second column of ``predict_proba`` on the test set is written
    to ``testdir + filename + '.csv'``.

    Args:
        filename: Base name (without extension) of the output CSV files.
        calctest: Whether to also fit on all training data and write test
            predictions.
        traindir: Prefix prepended to the train output file name.  It is
            concatenated as-is, so include a trailing path separator.
        testdir: Prefix prepended to the test output file name; same
            concatenation rule as `traindir`.
        numfolds: Number of folds passed to ``ml.stratKFold``.

    Returns:
        None.  Results are printed and written to disk.
    """
    outfile = filename + '.csv'

    # open up file containing the columns we wish to use
    st = time.time()
    f = fio.fileio(usePCA=True)
    cols = ['f274', 'f727', 'f2', 'f271', 'f527', 'f528']
    X = f.loadNumericTrain(usecols=cols)

    yf = f.loadLabels()
    y = yf.loss.values
    # Binary target, computed once up front.  `y` comes from yf's own
    # column and may share memory with it, so it must not be re-read after
    # yf['loss'] is overwritten with predictions further down.
    positive = y > 0
    print("Training data took %f seconds to load" % (time.time() - st))

    # Train the gradient boosting classifier
    clf = GradientBoostingClassifier(
        **ml.INIT_PARAMS['GradientBoostingClassifier'])

    st = time.time()
    y_ = ml.stratKFold(X, y, clf, nFolds=numfolds)
    fpr, tpr, _ = roc_curve(positive, y_)

    # Print the scores
    print("AUC: %f" % auc(fpr, tpr))
    print("F1 Score: %f" % ml.maxF1(positive, y_))
    print("%d-Fold CV took %f seconds" % (numfolds, time.time() - st))

    # Reuse the label frame as the container for the CV predictions.
    yf['loss'] = y_
    yf.to_csv(traindir + outfile, index=False)

    if not calctest:
        return

    st = time.time()
    # Load the test data
    Xtest = f.loadNumericTest(usecols=cols)

    # Fit on every training row
    clf.fit(X, positive)

    sub_ = pd.read_csv('../data/sampleSubmission.csv')
    # Item assignment (as for yf above) always writes the column, whereas
    # attribute assignment only works when the column already exists.
    sub_['loss'] = clf.predict_proba(Xtest)[:, 1]

    # Write to file
    sub_.to_csv(testdir + outfile, index=False)
    print("Test submission took %s seconds" % (time.time() - st))
Esempio n. 4
0
def classifier(filename='',
				calctest=True,
				traindir='',
				testdir='',
				numfolds=5):
	"""Run CV for the gradient boosting classifier and store its output.

	Six fixed feature columns are loaded, out-of-fold predictions are
	obtained from ``ml.stratKFold``, and AUC, F1 score and timing are
	printed.  The predictions replace the ``loss`` column of the label
	frame, which is written to ``traindir + filename + '.csv'``.  If
	`calctest` is true, the classifier is refitted on all training rows
	against ``loss > 0`` and the second ``predict_proba`` column for the
	test set is written to ``testdir + filename + '.csv'``.

	Args:
		filename: Base name (no extension) for the output CSV files.
		calctest: Whether to produce the test-set prediction file too.
		traindir: Prefix for the train output path; joined by plain string
			concatenation, so it needs its own trailing separator.
		testdir: Prefix for the test output path; same rule as `traindir`.
		numfolds: Number of folds handed to ``ml.stratKFold``.

	Returns:
		None.  Output goes to stdout and to the CSV files.
	"""
	outfile = filename+'.csv'

	# open up file containing the columns we wish to use
	st = time.time()
	f = fio.fileio(usePCA=True)
	cols = ['f274','f727', 'f2', 'f271', 'f527', 'f528']
	X = f.loadNumericTrain(usecols=cols)

	yf = f.loadLabels()
	y = yf.loss.values
	# Derive the binary target now: `y` is taken from yf's column and may
	# alias it, and that column is replaced by predictions below.
	is_pos = y > 0
	print("Training data took %f seconds to load" % (time.time() - st))

	# Train the gradient boosting classifier
	clf = GradientBoostingClassifier(**ml.INIT_PARAMS['GradientBoostingClassifier'])

	st = time.time()
	y_ = ml.stratKFold(X, y, clf, nFolds=numfolds)
	fpr, tpr, _ = roc_curve(is_pos, y_)

	# Print the scores
	print("AUC: %f" % auc(fpr, tpr))
	print("F1 Score: %f" % ml.maxF1(is_pos, y_))
	print("%d-Fold CV took %f seconds" % (numfolds, time.time() - st))

	# The label frame doubles as the container for the CV predictions.
	yf['loss'] = y_
	yf.to_csv(traindir+outfile, index=False)

	if calctest:
		st = time.time()
		# Load the test data
		Xtest = f.loadNumericTest(usecols=cols)

		# Fit the data
		clf.fit(X, is_pos)

		sub_ = pd.read_csv('../data/sampleSubmission.csv')
		# Same item-style assignment as for yf, so the column is always
		# written even if the sample file lacks it.
		sub_['loss'] = clf.predict_proba(Xtest)[:, 1]

		# Write to file
		sub_.to_csv(testdir+outfile, index=False)
		print("Test submission took %s seconds" % (time.time() - st))