Example no. 1
    def __init__(self, config_file=''):
        # Parse config file
        self.parser = SafeConfigParser()
        self.parser.read(config_file)

        # machine learning specific variables
        self.classify = constants.DO_CLASSIFICATION  # Regress or classify?
        self.vars_features = constants.fixed_vars
        self.vars_target = constants.ML_TARGETS

        if self.classify:
            self.var_target = constants.ML_TARGETS
            self.task = 'classification'
            self.model = RandomForestClassifier(n_estimators=2500, n_jobs=constants.ncpu, random_state=0)
        else:
            self.var_target = constants.ML_TARGETS
            self.task = 'regression'
            self.model = RandomForestRegressor(n_estimators=2500, n_jobs=constants.ncpu, random_state=0)  # SVR()

        # Get path to input
        self.path_inp = constants.base_dir + os.sep + constants.name_inp_fl

        # Output directory is <dir>_<classification>_<2014>
        self.path_out_dir = constants.out_dir
        utils.make_dir_if_missing(self.path_out_dir)

        # Model pickle
        self.path_pickle_model = self.path_out_dir + os.sep + constants.model_pickle
        self.path_pickle_features = self.path_out_dir + os.sep + 'pickled_features'
Example no. 2
def RandomForest(x_train,y_train,x_test,degree):
     params = {'n_estimators': 1000, 'max_depth': degree, 'min_samples_split': 2, 'warm_start': True}
     clf = RandomForestRegressor(**params)
     clf.fit(x_train, y_train)          
     y_predict = clf.predict(x_test)
     #plt.plot(x_test,y_predict,color='red')
     return y_predict
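A minimal usage sketch for the helper above, on synthetic data (the import, the toy data, and the degree value are illustrative assumptions, not part of the original snippet):

import numpy as np
from sklearn.ensemble import RandomForestRegressor

rng = np.random.RandomState(0)
x_train = rng.uniform(0, 10, size=(200, 1))
y_train = np.sin(x_train).ravel()          # toy target
x_test = rng.uniform(0, 10, size=(50, 1))
y_pred = RandomForest(x_train, y_train, x_test, degree=4)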
Example no. 3
def RF_ST(trainFileName,testFilename):
    trainData = ld.LoadData_DATA_ST(trainFileName)
    testData = ld.LoadData_DATA_ST(testFilename)
    
    store = ['1','2','3','4','5']
    res = []
    
    for i in store:
        train_X = [];train_y = []
        context = trainData[i]
        for array in context:
            array = [float(x) for x in array[2:]]
            train_X.append((array[2:-1]))
            train_y.append(array[-1])
            
        test_X = [];items = []
        context = testData[i]
        for array in context:
            items.append((array[0],array[1]))
            array = [float(x) for x in array[2:] ]
            test_X.append((array[2:]))
            
         
        clf = RandomForestRegressor(n_estimators=100,criterion='mse', max_depth=None,max_features='auto').\
                    fit(train_X,train_y)
        pred_y = clf.predict(test_X)
         
        for j in range(len(pred_y)):
            res.append([items[j][0], items[j][1], '%.4f' % max(pred_y[j], 0)])
    return res
Example no. 4
def RF_ALL(trainFileName,testFileName):
    train_X, train_y, _ = ld.LoadData_DATA_LABEL_ITEM(trainFileName)
    Eval_X, items = ld.LoadData_DATA_ITEM(testFileName)
    clf = RandomForestRegressor(n_estimators=100,criterion='mse', max_depth=None,max_features='auto',bootstrap=True).\
            fit(train_X, train_y)
    pred_y = clf.predict(Eval_X)
    res = []
    for i in range(len(Eval_X)):
        res.append([items[i],'all','%.4f'%max(pred_y[i],0)])
    return res
Example no. 5
 def __init__(self, sig_weight=1., pow_sig=1., pow_bg=1., gap=1., n_estimators=10,
              criterion="mse", max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features="auto",
              bootstrap=True, oob_score=False, n_jobs=1, random_state=None, verbose=0,
              min_density=None, compute_importances=None):
     RandomForestRegressor.__init__(self)
     # Everything should be set via set_params
     self.sig_weight = sig_weight
     self.pow_bg = pow_bg
     self.pow_sig = pow_sig
     self.gap = gap
Example no. 6
 def run(self):
     print "Reading device separations..."
     indexes = np.load("indexesTrain.npy")
     self.train = self.train.values
     print "Getting attributes..."
     trainFeatures = [self.getMainFeatures(self.train, indexes, i) for i in range(len(indexes))]
     for i in range(len(indexes)):
         (trainVect, targetVect) = self.getAttributes(trainFeatures, indexes, i)
         classifier = RandomForestRegressor(n_estimators=500, verbose=2, n_jobs=4, random_state=1)
         classifier.fit(trainVect, targetVect)
         pickle.dump(classifier, open("models/models" + str(i) + ".mod", "w"))
Example no. 7
def RandomForest(x_train,y_train,x_test,y_test):
     degree = [1,2,3,4,7]
     result = {}
     rmse_list = []
     for d in degree:
          params = {'n_estimators': 1000, 'max_depth': d, 'min_samples_split': 2, 'warm_start': True}
          clf = RandomForestRegressor(**params)
          clf.fit(x_train[:, np.newaxis], y_train)
          y_predict = clf.predict(x_test[:, np.newaxis])
          rmsevalue = rmse(y_test,y_predict)
          result[rmsevalue] = [y_predict,d]
          rmse_list.append(rmsevalue)
     rmseMin = min(rmse_list)
     return rmseMin, result[rmseMin]
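Both RandomForest helpers above call an rmse function that is not shown; a minimal version consistent with how it is used might be:

import numpy as np

def rmse(y_true, y_pred):
    # root-mean-squared error between two 1-D arrays
    return np.sqrt(np.mean((np.asarray(y_true) - np.asarray(y_pred)) ** 2))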
Example no. 8
def perform_random_forest_regressor(train_set, train_target, test_set, predictors, estimators=10, depth=None, splits=2):
    alg = RandomForestRegressor(random_state=1)
    alg.fit(train_set[predictors], train_target)
    
    #importances = alg.feature_importances_
    #print("Original ",numpy.argsort(importances))
    #indices = numpy.argsort(importances)[::-1]
    #print (" importances ",importances)
    #print (" indices ",indices)
    
    #for f in range(train_set.shape[1]-2):
    #    print("%2d) %-*s %f" % (f+1,30,predictors[indices[f]],
    #                                    importances[indices[f]]))

    predictions = alg.predict(test_set[predictors])
    return predictions
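A self-contained usage sketch for perform_random_forest_regressor (the toy frame and column names are assumptions for illustration):

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

rng = np.random.RandomState(1)
train_set = pd.DataFrame(rng.rand(100, 3), columns=['f1', 'f2', 'f3'])
train_target = 2 * train_set['f1'] + train_set['f2']  # toy target
test_set = pd.DataFrame(rng.rand(20, 3), columns=['f1', 'f2', 'f3'])
preds = perform_random_forest_regressor(train_set, train_target, test_set,
                                        predictors=['f1', 'f2', 'f3'])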
Example no. 9
 def __init__(self):
     super(ItemSetModel, self).__init__()
     #self.clf = DecisionTreeRegressor()
     #self.clf = Lasso(0.1)
     #self.clf = SVR(kernel='rbf')
     #self.clf = ElasticNetCV()
     self.clf = RandomForestRegressor(max_depth=7, n_estimators=10)
Example no. 10
def main(train_file='train.csv', test_file='test.csv', output_file='predict.csv'):
    print "Loading data..."
    
    train_data = pd.read_csv(train_file)
    test_data = pd.read_csv(test_file)
    y = np.array(train_data[["ACTION"]])
    #X = np.array(train_data.ix[:,1:-1])     # Ignores ACTION, ROLE_CODE
    X = np.array(train_data[["RESOURCE", "MGR_ID", "ROLE_ROLLUP_1", "ROLE_ROLLUP_2", "ROLE_DEPTNAME", "ROLE_FAMILY_DESC", "ROLE_FAMILY", "ROLE_TITLE", "ROLE_CODE"]])
    X_test = np.array(test_data[["RESOURCE", "MGR_ID", "ROLE_ROLLUP_1", "ROLE_ROLLUP_2", "ROLE_DEPTNAME", "ROLE_FAMILY_DESC", "ROLE_FAMILY", "ROLE_TITLE", "ROLE_CODE"]]) # Ignores ID, ROLE_CODE
 
    SEED = 4
    #clf = DecisionTreeClassifier(criterion="entropy").fit(X,y)
    
    
    
    clf = RandomForestRegressor(n_estimators=300, min_samples_split=15, min_density=0.1,compute_importances=True).fit(X,y)

    print clf.feature_importances_
    #Try feature selection
    
    mean_auc = 0.0
    n = 10
    for i in range(n):
        X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(X, y, test_size=.10, random_state=i*SEED)

        # if you want to perform feature selection / hyperparameter
        # optimization, this is where you want to do it
        
        # train model and make predictions
        clf.fit(X_train, y_train) 
        preds = clf.predict(X_cv)

        # compute AUC metric for this CV fold
        fpr, tpr, thresholds = metrics.roc_curve(y_cv, preds, pos_label=1)
        roc_auc = metrics.auc(fpr, tpr)
        print "AUC (fold %d/%d): %f" % (i + 1, n, roc_auc)
        mean_auc += roc_auc
    
    print "Mean AUC: %f" % (mean_auc/n)
    predictions = clf.predict(X_test)
    #print predictions
    
    #print 'Writing predictions to %s...' % (output_file)
    create_test_submission(output_file, predictions)

    return 0
Example no. 11
    def run(self):
        # extract data from the batch
        df_train = pd.read_csv(self.input().path, header=[0, 1])

        X, y = preprocess2(df_train, snr=10.)
        # train regressor
        reg = RandomForestRegressor(10, min_samples_leaf=10, max_depth=9,
                                    n_jobs=-1)
        # reg = KNeighborsRegressor(algorithm="auto")
        # reg = LinearRegression()
        # reg = sklearn.svm.SVR(kernel="rbf", degree=3, C=100., gamma=10.)
        # reg = LinearSaO2Unmixing()
        reg.fit(X, y.values)
        # reg = LinearSaO2Unmixing()
        # save regressor
        regressor_file = self.output().open('w')
        pickle.dump(reg, regressor_file)
        regressor_file.close()
Example no. 12
    def make_models(self, missing_columns):

        available_table = self.full_table.copy()
        #clear out the table
        for column in missing_columns:
            del available_table[column]
        available_features = available_table.as_matrix()

        clfs = {}
        #build a model for each missing column
        for column in missing_columns:
            labels = self.full_table.as_matrix(columns = [column])
            labels = np.reshape(labels, (len(labels))) #unnest the arrays
            clf = RandomForestRegressor(n_estimators = 100)
            clf.fit(available_features, labels, available_table['WGTP'])
            clfs[column] = clf

        return clfs
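The third positional argument passed to fit above is sample_weight, so each row is weighted by its 'WGTP' value. A hedged mini-demo of the same pattern on a toy table:

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

df = pd.DataFrame(np.random.rand(200, 2), columns=['a', 'b'])
df['WGTP'] = np.random.randint(1, 10, size=200)  # survey-style row weights
target = 2 * df['a'] + df['b']
clf = RandomForestRegressor(n_estimators=10)
clf.fit(df[['a', 'b']].values, target.values, sample_weight=df['WGTP'].values)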
Example no. 13
def predict_per_cpu_full():
    data, target = load_data()
    data, target, labels = normalize_data(data, target)

    data = data[['C0', 'cpuFull']]
    data['target'] = target
    split_by_types = dict()

    cpu_groups = data.groupby('cpuFull')
    for name, group in cpu_groups:
        X_train, X_test, y_train, y_test = train_test_split(group['C0'].values.reshape(-1, 1), group['target'])
        split_by_types[str(name)] = {
            'train': {
                'data': X_train,
                'target': y_train
            },
            'test': {
                'data': X_test,
                'target': y_test
            }
        }

    # print split_by_types
    summ = 0.0
    for cpu, data_set in split_by_types.iteritems():
        plt.figure()
        # reg = SGDRegressor(loss='huber', n_iter=100, alpha=0.0)
        reg = RandomForestRegressor(n_estimators=5)
        reg.fit(data_set['train']['data'], data_set['train']['target'])
        test_data = data_set['test']['data']
        y_pred = reg.predict(test_data)
        print mape(data_set['test']['target'], y_pred), cpu
        plt.scatter(test_data, data_set['test']['target'], s=3, color='g', label='actual')
        plt.scatter(test_data, y_pred, s=3, color='r', label='predicted')
        plt.legend(loc='upper left')
        plt.ylabel('mul time')
        plt.title('Category: {}'.format(cpu))
        plt.savefig('imgs/{}.png'.format(cpu))
Example no. 14
def train(data,val_ind,indices):
    
    max_numb = val_ind.shape[1]
    
    regs = []
    for i in range(max_numb):
        regs.append(0)
        
    for i in indices:
#        print i
#        reg = sklearn.linear_model.Lasso(max_iter=3000)
        reg = RandomForestRegressor()
#        reg=skl.tree.DecisionTreeRegressor()
#        reg = skl.linear_model.LinearRegression()
#        reg = AdaBoostRegressor()
#        print val_ind.shape
#        print val_ind[:,i]
#        print data.shape
#        print data[0]
#        print len(val_ind[:,i])
        reg.fit(data,val_ind[:,i])
        regs[i]=reg
        
    return regs
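A usage sketch for train (synthetic arrays; note that untrained indices keep the 0 placeholder set in the loop above):

import numpy as np
from sklearn.ensemble import RandomForestRegressor  # assumed imported by the snippet

data = np.random.rand(60, 4)
val_ind = np.random.rand(60, 3)  # one target column per model
regs = train(data, val_ind, indices=[0, 2])
preds = regs[0].predict(data)    # regs[1] is still the 0 placeholder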
Example no. 15
y = household.as_matrix(columns = ['KWH'])
y = np.reshape(y, (len(y)))
del household['KWH']
#del household['ST']
#del household['DIVISION']
#del household['ELEP']

#if 'CDD' in household.columns:
#    del household['CDD']
#    del household['HDD']
X = household.as_matrix()
X = np.nan_to_num(X)

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size = 0.25)

clf = RandomForestRegressor(n_estimators = 10, n_jobs = 8)
clf.fit(X_train, y_train)

print(metrics.mean_squared_error(y_test, clf.predict(X_test)))
print(metrics.r2_score(y_test, clf.predict(X_test)))

predictions = clf.predict(X_test)[:50]
'''
features = sorted(zip(household.columns, clf.feature_importances_), key = lambda x : x[1], reverse = True)
print("Features", features)
'''
pums = pd.read_csv("../joined_weather.csv")
pums = pums.sample(1000)
pums_puma_vector = pums.as_matrix(columns = ['PUMA'])
left_matrix = pums[['PUMA', 'WGTP', 'SERIALNO']]
del pums['PUMA']
Example no. 16
import pandas as pd 
from sklearn.ensemble import RandomForestRegressor
import time

dset = pd.read_csv("./data/concrete_data.csv")
X = dset.iloc[:, 0:7]
y = dset.iloc[:, 8]


estimator = RandomForestRegressor(max_features = 3, n_estimators = 50, n_jobs = 1, oob_score = True)

t0 = time.time()
estimator.fit(X, y)
print(time.time() - t0)
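Because the forest was built with oob_score=True, the fitted estimator also exposes an out-of-bag R^2 estimate, which can be read back afterwards:

# out-of-bag R^2 computed on the training data
print(estimator.oob_score_)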

Example no. 17
 def predict(self, X):
     return RandomForestRegressor.predict(self, X)[:, numpy.newaxis]
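The [:, numpy.newaxis] above reshapes the 1-D array returned by predict into a column vector; a tiny illustration of the idiom:

import numpy
a = numpy.array([1.0, 2.0, 3.0])
print(a.shape)                    # (3,)
print(a[:, numpy.newaxis].shape)  # (3, 1)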
Example no. 18
            s = s + '' + str(valoresDiff[i + j]) + ','
            d.append(valoresDiff[i + j])
        for j in range(0, 15):
            s = s + '' + str(valoresVol[i + j]) + ','
            d.append(valoresVol[i + j])
        #maxValores = max(valoresCopiar[longValores:i+j]) # Esta cambia porque no debemos tener los valores
        #maxValores = max(valoresCopiar[i+j:i+j+5])
        maxValores = max(valoresCopiar[i + 14 + longValoresTest:i +
                                       longValoresTest + 14 + 5])
        #s = s + str(150)
        #d.append(150)
        X_test.append(d)
        y_test.append(maxValores)

    # Entrenamos
    regr = RandomForestRegressor()
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)

    regr2 = SVR(kernel='rbf', C=1e3, gamma=0.1)
    regr2.fit(X_train, y_train)
    y_pred2 = regr2.predict(X_test)

    regr3 = linear_model.LinearRegression()
    regr3.fit(X_train, y_train)
    y_pred3 = regr3.predict(X_test)

    # Votacion
    # Si todos OK => Se invierte
    votacion = 0
    max_pred_array = [max(y_pred), max(y_pred2), max(y_pred3)]
Example no. 19
def doEval(dayNight, landuse, topo, traffic_static, traffic_dynamic, weather,
           time, output):

    if not (landuse or topo or traffic_dynamic or traffic_static or weather or time):
        return

    groupName = "lu"
    if landuse == True:
        groupName = groupName + "1"
    else:
        groupName = groupName + "0"

    groupName = groupName + "to"
    if topo == True:
        groupName = groupName + "1"
    else:
        groupName = groupName + "0"

    groupName = groupName + "ts"
    if traffic_static == True:
        groupName = groupName + "1"
    else:
        groupName = groupName + "0"

    groupName = groupName + "td"
    if traffic_dynamic == True:
        groupName = groupName + "1"
    else:
        groupName = groupName + "0"

    groupName = groupName + "we"
    if weather == True:
        groupName = groupName + "1"
    else:
        groupName = groupName + "0"

    groupName = groupName + "ti"
    if time == True:
        groupName = groupName + "1"
    else:
        groupName = groupName + "0"

    print("Group: " + groupName)

    columnsToUse = []

    if landuse:
        columnsToUse.extend(['leisure_area', 'landuse_area'])
    if topo:
        columnsToUse.extend(['buildings_number', 'buildings_area'])
    if traffic_static:
        columnsToUse.extend(['lane_length', 'length'])
    if traffic_dynamic:
        columnsToUse.extend(['traffic_length_car', 'traffic_length_lgv', 'traffic_length_hgv'])
    if weather:
        columnsToUse.extend(['winddirection', 'windspeed', 'temperature', 'rain', 'pressure'])
    if time:
        columnsToUse.extend(['hour', 'day_of_week', 'month', 'bank_holiday', 'race_day'])

    data = {}
    columns = []
    loadData(dataFile, ['timestamp'], data, columns)

    locationValues = findOutKForValidation("location", data)

    for location in locationValues:

        trainX, testX, trainY, testY = splitDataForXValidation(
            location, "location", data, columnsToUse, "target", dayNight)

        print("\t" + str(len(trainX)) + "," + str(len(testX)))

        model = RandomForestRegressor(min_samples_leaf=9,
                                      n_estimators=59,
                                      n_jobs=1,
                                      random_state=42)

        model.fit(trainX, trainY)

        prediction = model.predict(testX)

        rmse = rmseEval(testY, prediction)

        print("\t" + str(rmse))

        output.write(str(dayNight) + ",")
        output.write(groupName + ",")
        output.write(str(rmse[1]) + "\n")
        output.flush()
Example no. 20
btc_ld = np.log(btc_df) - np.log(btc_df.shift(1))
btc_ld = btc_ld.dropna()

#  split the dataframe into train and test
train = btc_ld.loc[datetime.date(year=2014,month=1,day=1):datetime.date(year=2017,month=12,day=31)]
test = btc_ld.loc[datetime.date(year=2018,month=1,day=1):datetime.date(year=2018,month=1,day=31)]

# split into input and output?
trainX = np.asarray(train.drop(columns='BTC'))
trainY = np.asarray(train.BTC)

testX = np.asarray(test.drop(columns='BTC'))
testY = np.asarray(test.BTC)

# Define the RF model
RF_Model = RandomForestRegressor(n_estimators=100,
                                 max_features=1, oob_score=True)
# Fit the model
rf_fitted = RF_Model.fit(trainX,trainY)

# predict the trained data
trainY_predict = rf_fitted.predict(trainX)
trainY_predict = trainY_predict.reshape(-1,1) # reshape to a column vector
# Plot the predicted training data
train_plot_df = pd.DataFrame(trainY_predict, columns=['Predicted BTC'])
train_plot_df = train_plot_df.set_index(train.index)
train_plot_df['BTC'] = train.BTC
# test
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from math import sqrt
r2_train = r2_score(train.BTC,trainY_predict)
Example no. 21
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import stockPlot as sp

# Initiate the monthly trade object
monthData = trade_model.monthlyModel(1, 2009, 6, 2013, 6, 2012, 6, 2013)
# Download data from Yahoo finance
monthData.monthlyDataDownload()
# Pre-processing of training an testing data
monthData.trainFeaturePre()
# Read pre-processed data from hard drive
# monthData.trainFeaturePreHd()
# Number of training months
trainSpan = len(monthData.xTrain[:,0,0]) - monthData.testSpan
# Initiate a random forest regressor
clf = RandomForestRegressor(n_estimators=10)
#
totalReturn = 1
predictedReturn = np.zeros(monthData.stockNum)
monthlyReturn = np.zeros(monthData.testSpan)
aggReturn = np.zeros(monthData.testSpan+1)
aggReturn[0] = 1
# rolling training and testing
for j in range(0, monthData.testSpan):
    for i in range(0, monthData.stockNum):
        clf.fit(monthData.xTrain[j:trainSpan+j, :, i], monthData.yTrain[j:trainSpan+j, 0, i])
        predictedReturn[i] = clf.predict(monthData.xTest[j, :, i])
    monthlyReturn[j] = monthData.por10Returns(j, predictedReturn)
    yearReturn = totalReturn * (monthlyReturn[j]+1)
    aggReturn[j+1] = aggReturn[j]*(1+monthlyReturn[j])
Example no. 22
x_test = x_test.drop(['segment_id'], axis=1)

# prepare models
models = []
# models.append(('LR', LogisticRegression()))
# models.append(('LDA', LinearDiscriminantAnalysis()))
# models.append(('KNN', KNeighborsClassifier()))
# models.append(('CART', DecisionTreeClassifier()))
# models.append(('NB', GaussianNB()))
svReg = SVR(C=20.299419990722537, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma=0.06841395086207253, kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=True)

randForReg = RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=100,
           max_features='sqrt', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=5,
           min_weight_fraction_leaf=0.0, n_estimators=800, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

models.append(('LassoReg', Lasso(alpha=0.1)))
models.append(('SVM', svReg))
models.append(('LinearReg', LinearRegression()))
models.append(('randForest', randForReg))

mas = make_scorer(mean_absolute_error, greater_is_better=False)
# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'
for name, model in models:
    kfold = KFold(n_splits=10, random_state=7)
Example no. 23
#         for j in range(np.shape(res)[1]):
#             if res[i][j] == 100:
#                 res[i][j] = 0
#             else:
#                 res[i][j] = -0.01 * res[i][j]
#     return res

# def normalizeY(arr):
#     arr=arr/100
#     return arr

if __name__ == '__main__':

    train_x, test_x, train_y, test_y, x_data, y_data = load(train_data_path)

    rf_model = RandomForestRegressor() 
    rf_model.fit(x_data, y_data)
    with open(filename, 'wb') as file:
        pickle.dump(rf_model, file)
    rf_train_score = rf_model.score(x_data, y_data)
    rf_test_score = rf_model.score(test_x, test_y)
    print("RF train score:",rf_train_score)
    print("RF test score:",rf_test_score)

    dt_model =  DecisionTreeRegressor() 
    dt_model.fit(x_data, y_data)
    with open(filename2, 'wb') as file:
        pickle.dump(dt_model, file)
    dt_train_score = dt_model.score(x_data, y_data)
    dt_test_score = dt_model.score(test_x, test_y)
    print("DT train score:",dt_train_score)
Example no. 24
def create_model():
    return RandomForestRegressor(min_samples_leaf=2,
                                 n_estimators=400,
                                 n_jobs=-1,
                                 random_state=42)
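A usage sketch for the factory above (toy data assumed):

import numpy as np
from sklearn.ensemble import RandomForestRegressor  # assumed imported by the snippet

X = np.random.rand(100, 4)
y = X[:, 0] + 0.1 * np.random.rand(100)
model = create_model()
model.fit(X, y)
print(model.predict(X[:5]))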
Example no. 25
build_auto(DummyRegressor(strategy="median"), "DummyAuto")
build_auto(ElasticNetCV(random_state=13), "ElasticNetAuto")
build_auto(ExtraTreesRegressor(random_state=13, min_samples_leaf=5),
           "ExtraTreesAuto")
build_auto(GradientBoostingRegressor(random_state=13, init=None),
           "GradientBoostingAuto")
build_auto(LassoCV(random_state=13), "LassoAuto")
build_auto(
    OptimalLGBMRegressor(objective="regression",
                         n_estimators=17,
                         num_iteration=11), "LGBMAuto")
build_auto(LinearRegression(), "LinearRegressionAuto")
build_auto(
    BaggingRegressor(LinearRegression(), random_state=13, max_features=0.75),
    "LinearRegressionEnsembleAuto")
build_auto(RandomForestRegressor(random_state=13, min_samples_leaf=3),
           "RandomForestAuto",
           flat=True)
build_auto(RidgeCV(), "RidgeAuto")
build_auto(OptimalXGBRegressor(objective="reg:linear", ntree_limit=31),
           "XGBAuto")

auto_na_X, auto_na_y = load_auto("AutoNA.csv")

auto_na_X["cylinders"] = auto_na_X["cylinders"].fillna(-1).astype(int)
auto_na_X["model_year"] = auto_na_X["model_year"].fillna(-1).astype(int)
auto_na_X["origin"] = auto_na_X["origin"].fillna(-1).astype(int)


def build_auto_na(regressor, name):
    mapper = DataFrameMapper(
        [([column], [ContinuousDomain(missing_values = None), Imputer()]) for column in ["acceleration", "displacement", "horsepower", "weight"]] +
        [([column], [CategoricalDomain(missing_values = -1), CategoricalImputer(missing_values = -1), PMMLLabelBinarizer()]) for column in ["cylinders", "model_year", "origin"]]
    )
Example no. 26
            'name':
            'Linear Model',
            'instance':
            SGDRegressor(penalty='elasticnet',
                         alpha=0.01,
                         l1_ratio=0.25,
                         fit_intercept=True,
                         tol=1e-4),
            'complexity_label':
            'non-zero coefficients',
            'complexity_computer':
            lambda clf: np.count_nonzero(clf.coef_)
        },
        {
            'name': 'RandomForest',
            'instance': RandomForestRegressor(n_estimators=100),
            'complexity_label': 'estimators',
            'complexity_computer': lambda clf: clf.n_estimators
        },
        {
            'name': 'SVR',
            'instance': SVR(kernel='rbf'),
            'complexity_label': 'support vectors',
            'complexity_computer': lambda clf: len(clf.support_vectors_)
        },
    ]
}
benchmark(configuration)

# benchmark n_features influence on prediction speed
percentile = 90
Example no. 27
def train(training, k):
    model = RandomForestRegressor(n_estimators=k, n_jobs=-1)
    model.fit(training[:,:-1], training[:,-1])
    return model
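A usage sketch for train, with a toy matrix whose last column is the target, as the slicing above implies:

import numpy as np
from sklearn.ensemble import RandomForestRegressor  # assumed imported by the snippet

training = np.random.rand(120, 5)  # columns 0-3 are features, column 4 is the target
model = train(training, k=50)
preds = model.predict(training[:, :-1])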
Example no. 28
x_max = np.max(np.array(X), axis=0)
outputs = ot.NumericalSample.ImportFromTextFile(base_dir + 'outputs.txt', '\t')
y = np.array(outputs).reshape((1, len(outputs)))[0]

x_min = np.min(X, axis=0)
x_max = np.max(X, axis=0)

n_train = 5000

X_train = np.array(X)[:n_train, :d]
y_train = y[:n_train]

X_test = np.array(X)[n_train:, :d]
y_true = y[n_train:]

reg = AdaBoostRegressor(RandomForestRegressor(),
                        n_estimators=50)  #, random_state=rng)
fit_train = reg.fit(X_train, y_train)

#plt.plot(y_true,fit_train.predict(X_test),'.')
#plt.plot(y_true,y_true,color="red",lw=2)

reg = AdaBoostRegressor(RandomForestRegressor(),
                        n_estimators=20)  #, random_state=rng)
fit_all = reg.fit(X, y)
s = 33

#plt.plot(y,fit_all.predict(X),'.')
#plt.plot(y,y,color="red",lw=2)

Example no. 29
        return False
    return True
    #return column in ['BDSP', 'RMSP', 'HFL', 'BLD', 'AGEP', 'NP', 'YBL', 'HINCP', 'HDD', 'CDD']

household = household[[column for column in household.columns if select_column(column)]]
X = household.as_matrix()
print(household.columns)
#X = household.as_matrix()

with open("kwh_model_features.json", "w") as f:
    json.dump(list(household.columns), f, indent = True)

print(y)

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size = 0.25)

clf = RandomForestRegressor(n_estimators = 50, n_jobs = 8)
clf.fit(X_train, y_train)

print(y_test[:100])
print(np.sqrt(metrics.mean_squared_error(y_test, clf.predict(X_test))))
print(metrics.r2_score(y_test, clf.predict(X_test)))


features = sorted(zip(household.columns, clf.feature_importances_), key = lambda x : x[1], reverse = True)
print("Features", features)


with open("kwh_model.pkl", 'wb') as f:
    pickle.dump(clf, f)
Example no. 30
            'name':
            'Linear Model',
            'instance':
            SGDRegressor(penalty='elasticnet',
                         alpha=0.01,
                         l1_ratio=0.25,
                         fit_intercept=True,
                         tol=1e-4),
            'complexity_label':
            'non-zero coefficients',
            'complexity_computer':
            lambda clf: np.count_nonzero(clf.coef_)
        },
        {
            'name': 'RandomForest',
            'instance': RandomForestRegressor(),
            'complexity_label': 'estimators',
            'complexity_computer': lambda clf: clf.n_estimators
        },
        {
            'name': 'SVR',
            'instance': SVR(kernel='rbf'),
            'complexity_label': 'support vectors',
            'complexity_computer': lambda clf: len(clf.support_vectors_)
        },
    ]
}
benchmark(configuration)

# benchmark n_features influence on prediction speed
percentile = 90
Example no. 31
import numpy as np
import json

household = pd.read_csv("../household_complete_one_hot.csv")
if 'KWH' in household.columns:
    del household['KWH']

X_columns = [column for column in household.columns if column != "ELEP"]
X = household.as_matrix(columns = X_columns)
y = [label[0] for label in household.as_matrix(columns = ["ELEP"])]

#print(y)

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size = 0.25)

clf = RandomForestRegressor(n_estimators = 100, n_jobs = 8)
clf.fit(X_train, y_train)


print(y_test[:100])
print(metrics.mean_squared_error(clf.predict(X_test), y_test))
print(metrics.r2_score(y_test, clf.predict(X_test)))

features = sorted(zip(X_columns, clf.feature_importances_), key = lambda x : x[1], reverse = True)
print("Features", features)

#fill spaces in ELEP
normalized_pums = pd.read_csv("../joined_weather.csv", delimiter = ',')
print('pums shape', normalized_pums.shape)

with open("../vectorized_puma_regions/puma_list.json") as f:
                   # "Huber",
                   "Linear",
                   "Passive Aggressive",
                   "SGD",
                   "Theil-Sen",
                   "RANSAC",
                   "K-Neighbors",
                   "Radius Neighbors",
                   "MLP",
                   "Decision Tree",
                   "Extra Tree",
                   "SVR"
                   ]

classifiers = [
    RandomForestRegressor(n_estimators=200, n_jobs=5, random_state=randomstate),
    ExtraTreesRegressor(n_estimators=200, n_jobs=5, random_state=randomstate),
    # GradientBoostingRegressor(random_state=randomstate),    # learning_rate is a hyper-parameter in the range (0.0, 1.0]
    # HistGradientBoostingClassifier(random_state=randomstate),    # learning_rate is a hyper-parameter in the range (0.0, 1.0]
    AdaBoostRegressor(n_estimators=200, random_state=randomstate),
    GaussianProcessRegressor(normalize_y=True),
    ARDRegression(),
    # HuberRegressor(),   # epsilon:  greater than 1.0, default 1.35
    LinearRegression(n_jobs=5),
    PassiveAggressiveRegressor(random_state=randomstate), # C: 0.25, 0.5, 1, 5, 10
    SGDRegressor(random_state=randomstate),
    TheilSenRegressor(n_jobs=5, random_state=randomstate),
    RANSACRegressor(random_state=randomstate),
    KNeighborsRegressor(weights='distance'),  # n_neighbors: 3, 6, 9, 12, 15, 20
    RadiusNeighborsRegressor(weights='distance'),   # radius: 1, 2, 5, 10, 15
    MLPRegressor(max_iter=10000000, random_state=randomstate),
Example no. 33
class Estimators:
    """
	Estimators class. This class
	(i) fits charging duration and energy consumption model 
	with designated regressor type (Random Forest, Extra-Random Forest, or 
	Decision Tree Regressor). 
	(ii) predicts charging duration and energy consumption with trained models

	"""
    def __init__(self,
                 filePath,
                 durationModelType="RF",
                 energyEstimatorType="RF"):

        # Load and process data
        data = self.loadData(filePath)

        # Set regression type for charging duration and energy consumption
        self.durationModelType = durationModelType
        self.energyEstimatorType = energyEstimatorType

        # Parse attributes and target columns
        attrColumns = [
            'Start Time Seconds From Midnight', 'Vehicle Battery Capacity'
        ]
        self.X = data[attrColumns]  # Vehicle Battery Capacity
        # Station Start time
        # OPTIONAL: User Id?
        # OPTIONAL: Vehicle Model Year?

        self.durationData = pd.to_numeric(data["Charging Time Secs"],
                                          downcast='float') / 60.0  # mins
        self.energyData = pd.to_numeric(data["Total Charge"],
                                        downcast='float')  # kW?

        # Fit estimators
        self.fitDurationEstimator(durationModelType)
        self.fitEnergyEstimator(energyEstimatorType)

        ### Validate the models ### -- TO BE DEVELOPED...
        # K = 5 # k-folds cross-validation
        # y = self.energyData
        # R2 = cross_val_score(self.energyEstimator, self.X, y=np.ravel(y), cv=KFold(y.size, K), n_jobs=1, scoring="accuracy").mean()
        # self.R2 = R2
        # print "The %d-Folds estimate of the coefficient of determination is R2 = %s" % (K, R2)

        print "Done: fitting estimators"

    def loadData(self, filePath):
        """ Load and process raw data """

        ### Load data ###
        print "loading data..."
        timeStart = time.clock()
        if "xlsx" in filePath:
            data = pd.read_excel(filePath)
        elif "csv" in filePath:
            data = pd.read_csv(filePath)
        else:
            raise ValueError("Wrong file path: " + filePath)
        print "Done: loading data. Time: " + str(time.clock() - timeStart)

        ### Process data ###
        # - convert time in string to seconds from midnight
        # - filter out rows containing nan elements
        print "processing data..."
        timeStart = time.clock()

        # Get Start Time in seconds from midnight
        startTimeSec = [
            (datetime.strptime(startTime, "%m/%d/%Y %H:%M") -
             datetime.strptime(startTime, "%m/%d/%Y %H:%M").replace(
                 hour=0, minute=0, second=0, microsecond=0)).total_seconds()
            for startTime in data["Station Start Time"]
        ]
        print "Done: processing data. Time: " + str(time.clock() - timeStart)

        # Add new coloumn for start time in seconds from midnight
        data['Start Time Seconds From Midnight'] = startTimeSec

        # Remove indices with NaN elements from columns that you want to use in training
        nanIndices = data["Vehicle Battery Capacity"].isnull().values
        dataFiltered = data[np.invert(nanIndices)]
        # {COPY THE ABOVE TWO LINES AND CHANGE THE INDEX KEY IF YOU HAVE ANOTHER COLUMN TO FILTER OUT}

        return dataFiltered

    def fitDurationEstimator(self, modelType="RF"):
        """ Fit duration model with specified regressor type (Random forest by default) """

        print "fitting charging duration model..."

        if modelType == "RF":
            self.durationEstimator = RandomForestRegressor(random_state=0,
                                                           n_estimators=50,
                                                           max_depth=50)
            self.durationEstimator.fit(self.X, self.durationData)

        # {ADD OTHER IF STATEMENTS FOR OTHER REGRESSOR MODELS, E.G., EXTRA-TREE REGRESSOR}

    def fitEnergyEstimator(self, modelType="RF"):
        """ Fit energy consumption model with specified regressor type (Random forest by default) """

        print "fitting energy consumption model..."

        # Stack energy consumption data to attribute data before fitting the energy model.
        # i.e., we use a charging duration to predict an energy consumption
        X = self.X
        X["chargingDuration"] = self.durationData

        # Fit energy consumption regressor
        if modelType == "RF":
            self.energyEstimator = RandomForestRegressor(random_state=0,
                                                         n_estimators=50,
                                                         max_depth=50)
            self.energyEstimator.fit(X, self.energyData)

        # {ADD OTHER IF STATEMENTS FOR OTHER REGRESSOR MODELS, E.G., EXTRA-TREE REGRESSOR}

    def estimateChargingDuration(self, Xq):
        return self.durationEstimator.predict(Xq)

    def estimateEnergyConsumptions(self, Xq):
        return self.energyEstimator.predict(Xq)

    def predict(self, df):
        """
		Returns predicted charging duration and energy consumption based on the trained estimators
		
		Params
		df: dataframe with 'Start Time Seconds From Midnight', 'Vehicle Battery Capacity' columns NOTE: add more if needed
		
		"""

        # Get start time in seconds from midnight
        startTime = df["Station Start Time"][0]
        startTimeSec = (
            datetime.strptime(startTime, "%m/%d/%Y %H:%M") -
            datetime.strptime(startTime, "%m/%d/%Y %H:%M").replace(
                hour=0, minute=0, second=0, microsecond=0)).total_seconds()

        # Build input Dataframe
        Xq = pd.DataFrame()
        Xq['Start Time Seconds From Midnight'] = [startTimeSec]
        Xq['Vehicle Battery Capacity'] = df["Vehicle Battery Capacity"]

        # Estimate charging duration
        estDuration = self.estimateChargingDuration(Xq)

        # Estimate energy consumption
        Xq["ChargingDuration"] = [estDuration]
        estEnergy = self.estimateEnergyConsumptions(Xq)

        return estDuration[0], estEnergy[0]
Example no. 34
def ts_rf(n, fea, step, ntrees, njobs):
    #Random Forest Model for time series prediction
    #from sklearn import svm
    import math
    from sklearn import metrics
    import matplotlib.pyplot as plt
    from scipy.linalg import hankel
    import numpy as np
    from sklearn.ensemble import RandomForestRegressor
    #input data from csv file
    #use n datapoints
    #n=1100
    #    # of features of training set
    ##        fre=50
    #    # how many steps to predict
    #step=29
    #fea=50
    path = '/Users/royyang/Desktop/time_series_forecasting/csv_files/coffee_ls.txt'
    path1 = '/Users/royyang/Desktop/time_series_forecasting/csv_files/coffee_ls_nor.txt'
    result_tem = []
    date = []
    with open(path) as f:
        next(f)
        for line in f:
            item = line.replace('\n', '').split(' ')
            result_tem.append(float(item[1]))
            date.append(item[2])
    mean = np.mean(result_tem)
    sd = np.std(result_tem)
    result = (result_tem - mean) / sd
    #form hankel matrix
    X = hankel(result[0:-fea - step + 1], result[-1 - fea:-1])
    y = result[fea + step - 1:]
    #split data into training and testing
    Xtrain = X[:n]
    ytrain = y[:n]
    Xtest = X[n:]
    ytest = y[n:]
    # random forest
    rf = RandomForestRegressor(n_estimators=ntrees, n_jobs=njobs)
    rf_pred = rf.fit(Xtrain, ytrain).predict(Xtest)
    #a = rf.transform(Xtrain,'median')

    #plot results
    LABELS = [
        x[-6:]
        for x in date[n + fea + step - 1:n + fea + step - 1 + len(ytest)]
    ]
    t = range(n, n + len(ytest))
    #    plt.show()
    #    plt.plot(t,y_lin1,'r--',t,ytest,'b^-')
    #    plt.plot(t,y_lin2,'g--',t,ytest,'b^-')
    ypred = rf_pred * sd + mean
    ytest = ytest * sd + mean
    line1, = plt.plot(t, ypred, 'r*-')
    plt.xticks(t, LABELS)
    line2, = plt.plot(t, ytest, 'b*-')
    #            plt.xlim([500,510])
    plt.legend([line1, line2], ["Predicted", "Actual"], loc=2)

    #plt.show()
    #plt.plot(xrange(n),result[0:n],'r--',t,y_lin3,'b--',t,ytest,'r--')

    y_true = ytest
    y_pred = ypred
    metrics_result = {
        'rf_MAE': metrics.mean_absolute_error(y_true, y_pred),
        'rf_MSE': metrics.mean_squared_error(y_true, y_pred),
        'rf_MAPE': np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    }
    print metrics_result
Example no. 35
def train_random_forest(X, Y):
    rf = RandomForestRegressor(n_estimators=20)
    rf.fit(X, Y)
    return rf
Example no. 36
    x_test = X[-7:]
    y_test = Y[-7:]
    ###

    RigeLinearCV = linear_model.RidgeCV(cv=10)
    rcv = RigeLinearCV.fit(x_train, y_train)
    y_pre_rcv = rcv.predict(x_oob)
    ###
    params_rf = {
        'n_estimators': 500,
        'max_depth': 10,
        'min_samples_split': 2,
        'n_jobs': 4
    }

    rf = RandomForestRegressor(**params_rf)
    rf.fit(x_train, y_train)
    y_pre_rf = rf.predict(x_oob)
    ###
    y_pre_diff = mean_normal_weekend_diff(Y[-21:-14], xday[-35:-14],
                                          xweekend[-35:-14], -14, -7)
    ###

    Y_test.append(y_test)
    #y_pre_diff = mean_normal_weekend_diff(Y,xday,xweekend,-21,-14)

    ###
    loss_rcv = Evaluation([y_pre_rcv], [y_oob])
    loss_rf = Evaluation([y_pre_rf], [y_oob])
    loss_diffmean = Evaluation([y_pre_diff], [y_oob])
Example no. 37
def _2011x2011_ (data_path):

    ##### LOADING #####
    sys.stdout.write("Loading data... ")

    # Load data from .csv file
    with open(data_path+'_X.csv') as data_file:
        reader = csv.reader(data_file)

        # Initialize lists for data and class labels
        data =[]
        # skip header
        next(reader, None)
        # For each row of the csv file
        for row in reader:
            data.append([float(x) for x in row])

    with open(data_path+'_y.csv') as labels_file:
        reader = csv.reader(labels_file)

        # Initialize lists for data and class labels
        val_ind =[]
        # skip header
        next(reader, None)
        # For each row of the csv file
        for row in reader:
            val_ind.append(row)

    sys.stdout.write("done\n")


    ##### TRAINING #####
    # splitting
    data_train, data_test, val_ind_train, val_ind_test \
        = skl.cross_validation.train_test_split(data, val_ind, test_size=0.4, random_state=42)

    # Cutting date/ ASS/ number value from labels
    date_train = [x[0] for x in val_ind_train]
#    ASS_train = [x[1] for x in val_ind_train]
    val_train = [float(x[1]) for x in  val_ind_train]
    date_test = [x[0] for x in val_ind_test]
#    ASS_test = [x[1] for x in val_ind_test]
    val_test = [float(x[1]) for x in val_ind_test]

    sys.stdout.write("Training regressor... ")
    reg = RandomForestRegressor()
#    reg = skl.tree.DecisionTreeRegressor()
#    reg = skl.linear_model.LinearRegression()
    reg.fit(data_train, val_train)
    sys.stdout.write("done\n")


    ##### PREDICTION #####
    sys.stdout.write("Predicting... ")
    val_predicted = reg.predict(data_test)
    sys.stdout.write("done\n")

    ##### ERROR #####
    df = pd.DataFrame()
    df['date'] = pd.to_datetime(date_test)
#    df['ASS'] = ASS_test
    df['original'] = val_test
    df['predicted'] = val_predicted.tolist()
    df = df.set_index('date')

#    df = df.loc[df['ASS'] == 'CAT'] # one example
    
    df.info()
    
    df.plot()
    plt.show()
    
    print "MSE : " + str(mean_squared_error(val_test,val_predicted.tolist()))
Example no. 38
                            matplotlib=True)
        else:  # model.skl_model should be RandomForestClassifier
            features = [
                feature.name for feature in self.dataset.domain.attributes
            ]
            explainer = shap.TreeExplainer(model.skl_model)
            shap_values = explainer.shap_values(X)
            for c in range(len(shap_values)):
                shap.force_plot(explainer.expected_value[c],
                                shap_values[c][idx, :],
                                X[idx, :],
                                feature_names=features,
                                matplotlib=True)


class Task:
    """
    Task class to perform computations in parallels
    """
    def __init__(self):
        self.future = None
        self.watcher = None


if __name__ == "__main__":  # pragma: no cover
    data = Table('housing')
    rf = SKL_RF(n_estimators=10)
    rf.fit(data.X, data.Y)
    model_rf = RandomForestRegressor(rf)
    WidgetPreview(OWShapSingle).run(set_data=data, set_model=model_rf)
Example no. 39
auto = auto_mapper.fit_transform(auto_df)

store_pkl(auto_mapper, "Auto.pkl")

auto_X = auto[:, 0:7]
auto_y = auto[:, 7]

print(auto_X.dtype, auto_y.dtype)

def predict_auto(regressor):
    mpg = DataFrame(regressor.predict(auto_X), columns = ["mpg"])
    return mpg

auto_tree = DecisionTreeRegressor(random_state = 13, min_samples_leaf = 5)
auto_tree.fit(auto_X, auto_y)

store_pkl(auto_tree, "DecisionTreeAuto.pkl")
store_csv(predict_auto(auto_tree), "DecisionTreeAuto.csv")

auto_forest = RandomForestRegressor(random_state = 13, min_samples_leaf = 5)
auto_forest.fit(auto_X, auto_y)

store_pkl(auto_forest, "RandomForestAuto.pkl")
store_csv(predict_auto(auto_forest), "RandomForestAuto.csv")

auto_regression = LinearRegression()
auto_regression.fit(auto_X, auto_y)

store_pkl(auto_regression, "RegressionAuto.pkl")
store_csv(predict_auto(auto_regression), "RegressionAuto.csv")
Example no. 40
def RandomForest(df, queryFile):
    model = RandomForestRegressor(random_state=0, n_estimators=200, n_jobs=-1)
    MLRegression(model, df, queryFile)
    return
Example no. 41
def evalColumns(columns):

    overallY = []
    overallPred = []

    for location in locations:
        location2s = [l for l in locations if l != location]

        print("Location: " + str(location) + ", location2: " + str(location2s))

        trainPreds = defaultdict(list)
        testPreds = defaultdict(list)

        for datagroup in topDatagroups:
            tag, features = getTagAndFeatures(datagroup)
            print("\ttag: " + str(tag) + ", features: " + str(features))
            for location2 in location2s:
                trainX1, trainX2, trainY1, trainY2, testX, testY = splitDataForXValidationSampled2(
                    location, location2, "location", data, features, "target")
                model = RandomForestRegressor(min_samples_leaf=9,
                                              n_estimators=59,
                                              n_jobs=-1,
                                              random_state=42)
                model.fit(trainX1, trainY1)
                train1Prediction = model.predict(trainX1)
                train2Prediction = model.predict(trainX2)
                testPrediction = model.predict(testX)
                train1Rmse = str(rmseEval(trainY1, train1Prediction)[1])
                train2Rmse = str(rmseEval(trainY2, train2Prediction)[1])
                testRmse = str(rmseEval(testY, testPrediction)[1])
                print("\t\ttrain1 rmse: " + train1Rmse)
                print("\t\ttrain2 rmse: " + train2Rmse)
                print("\t\ttest rmse: " + testRmse)
                for x in train2Prediction:
                    trainPreds[tag].append(x)
                for x in testPrediction:
                    testPreds[tag].append(x)

        t2Y = []
        for location2 in location2s:
            trainX1, trainX2, trainY1, trainY2, testX, testY = splitDataForXValidationSampled2(
                location, location2, "location", data, all_features, "target")
            t2Y = t2Y + trainY2

        labelt2Y = []

        for i in range(0, len(t2Y)):
            bestModel = 0
            bestAbs = abs(t2Y[i] - trainPreds[topTags[0]][i])
            for j in range(0, len(topTags)):
                tag = topTags[j]
                modelAbs = abs(t2Y[i] - trainPreds[tag][i])
                if modelAbs < bestAbs:
                    bestAbs = modelAbs
                    bestModel = j
            labelt2Y.append(bestModel)

        print("#labelt2Y:" + str(len(labelt2Y)))
        tX2 = []
        testX = []
        for location2 in location2s:
            trainX1, trainX2, trainY1, trainY2, tX, testY = splitDataForXValidationSampled2(
                location, location2, "location", data, all_features, "target")
            for row in trainX2:
                tX2.append(row)
            for row in tX:
                testX.append(row)

        for tag in topTags:
            for i in range(0, len(trainPreds[tag])):
                tX2[i].append(trainPreds[tag][i])

        reducedTrainX2 = []
        for d in tX2:
            reducedD = []
            for i in range(0, len(all_columns)):
                if columns[i]:
                    reducedD.append(d[i])
            reducedTrainX2.append(reducedD)

        model = RandomForestClassifier(random_state=42,
                                       n_estimators=100,
                                       max_depth=15)
        model.fit(reducedTrainX2, labelt2Y)

        for tag in topTags:
            for i in range(0, len(testPreds[tag])):
                testX[i].append(testPreds[tag][i])

        reducedTestX = []
        for d in testX:
            reducedD = []
            for i in range(0, len(all_columns)):
                if columns[i]:
                    reducedD.append(d[i])
            reducedTestX.append(reducedD)

        pred = model.predict(reducedTestX)

        finalPrediction = []
        for i in range(0, len(testY)):
            p = testPreds[topTags[pred[i]]][i]
            finalPrediction.append(p)
        rmse = str(rmseEval(testY, finalPrediction)[1])
        print("\tRMSE: " + str(rmse))

        for x in testY:
            overallY.append(x)
        for x in finalPrediction:
            overallPred.append(x)

    rmse = rmseEval(overallPred, overallY)[1]
    return rmse
Example no. 42
 def predict_proba(self, X):
     pred = RandomForestRegressor.predict(self, X)
     result = numpy.zeros([len(X), 2])
     result[:, 1] = special.expit(pred / 1000.)
     result[:, 0] = 1. - result[:, 1]
     return result
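scipy.special.expit above is the logistic sigmoid 1/(1 + exp(-x)); dividing the regression output by 1000 simply rescales it before squashing into [0, 1]. A tiny check:

import numpy
from scipy import special

pred = numpy.array([-2000.0, 0.0, 2000.0])
print(special.expit(pred / 1000.0))  # [0.1192... 0.5 0.8808...]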
Example no. 43
def train_random_forest(X, Y):
    rf = RandomForestRegressor(n_estimators=20)
    rf.fit(X, Y)
    return rf
Example no. 44
    output.close()
    
all_tags, all_features = getTagAndFeatures(['T','W', 'A', 'R', 'L', 'B'])

print(str(all_features))
    
for location in locations:
    print("location: " + str(location))
    # save down trainX, trainY, testX, testY
    trainX, testX, trainY, testY, _, _ = splitDataForXValidation(location, "location", data, all_features, "target", timestampData)
    print("\t#train: " + str(len(trainY)) + ", #test:" + str(len(testY)))
    
    writeOutData(OUTPUT_DIRECTORY + "z_" + str(int(location)) + "_trainX.csv", all_features, trainX)
    writeOutData(OUTPUT_DIRECTORY + "z_" + str(int(location)) + "_testX.csv", all_features, testX)
    writeOutData(OUTPUT_DIRECTORY + "z_" + str(int(location)) + "_trainY.csv", ["target"], trainY)
    writeOutData(OUTPUT_DIRECTORY + "z_" + str(int(location)) + "_testY.csv", ["target"], testY)
    
    for dataGroup in generateAllDataGroups():
        tag, features = getTagAndFeatures(dataGroup)
        trainX, testX, trainY, testY, _, _ = splitDataForXValidation(location, "location", data, features, "target", timestampData)
        model = RandomForestRegressor(min_samples_leaf = 9, n_estimators = 59, n_jobs = -1, random_state=42)                    
        model.fit(trainX, trainY)
        trainPrediction = model.predict(trainX)
        testPrediction = model.predict(testX)
        trainRmse = str(rmseEval(trainY, trainPrediction)[1])
        testRmse = str(rmseEval(testY, testPrediction)[1])
        print("\t" + tag + ": #train: " + str(len(trainY)) + ", #test:" + str(len(testY)) + ", trainRMSE: " + trainRmse + ", testRMSE: " + testRmse)
        writeOutData(OUTPUT_DIRECTORY + "z_" + str(int(location)) + "_trainPred_" + tag + ".csv", ["trainPred_" + tag], trainPrediction)
        writeOutData(OUTPUT_DIRECTORY + "z_" + str(int(location)) + "_testPred_" + tag + ".csv", ["testPred_" + tag], testPrediction)
        
            
store = store.drop("Assortment", 1).join(
    pd.get_dummies(store["Assortment"]).rename(columns=lambda x: "Assortment" + "_" + str(x))
)

train["StateHoliday"] = [mychange(x) for x in train.StateHoliday]
test["StateHoliday"] = [mychange(x) for x in test.StateHoliday]

train = train.drop("StateHoliday", 1).join(
    pd.get_dummies(train["StateHoliday"]).rename(columns=lambda x: "StateHoliday" + "_" + str(x))
)
test = test.drop("StateHoliday", 1).join(
    pd.get_dummies(test["StateHoliday"]).rename(columns=lambda x: "StateHoliday" + "_" + str(x))
)

train = pd.merge(train, store, on="Store")
test = pd.merge(test, store, on="Store")

repeat = 1
print("Splitting data...")
for i in range(repeat):
    features = [col for col in test.columns if col not in ["Customers", "Sales", "Date", "LogSale", "datetimes", "Id"]]
    rf = RandomForestRegressor(n_estimators=100)
    print("Starting training...")
    rf.fit(train[features].fillna(-1), train.LogSale)

    test["mypred"] = rf.predict(test[features].fillna(-1))
    test["mypred"] = np.exp(test["mypred"]) - 1

test["Sales"] = test.mypred
test[["Id", "Sales"]].to_csv("rand_for_kag_v4-9.csv", index=False)
Example no. 46
# In[10]:

Train = Train.fillna(0)
Test = Test.fillna(0)

# print Train.head()
# print Test.head()


# In[11]:

print 'Train Random Forests!'

from sklearn.ensemble import RandomForestRegressor
RF = RandomForestRegressor(n_estimators = 500, random_state = 0)


# In[12]:

Rows = np.random.choice(Train.index.values, 400000)
Sampled_Train = Train.ix[Rows]
Sample_Train_Target = Train_Target.ix[Rows]

# RF.fit(Sampled_Train, Sample_Train_Target)
RF.fit(Train, Train_Target)


# In[ ]:

print 'Predict!'
Example no. 47
	store_pkl(pipeline, name + ".pkl")
	mpg = DataFrame(pipeline.predict(auto_X), columns = ["mpg"])
	store_csv(mpg, name + ".csv")

build_auto(AdaBoostRegressor(DecisionTreeRegressor(random_state = 13, min_samples_leaf = 5), random_state = 13, n_estimators = 17), "AdaBoostAuto")
build_auto(DecisionTreeRegressor(random_state = 13, min_samples_leaf = 2), "DecisionTreeAuto", compact = True)
build_auto(BaggingRegressor(DecisionTreeRegressor(random_state = 13, min_samples_leaf = 5), random_state = 13, n_estimators = 3, max_features = 0.5), "DecisionTreeEnsembleAuto")
build_auto(DummyRegressor(strategy = "median"), "DummyAuto")
build_auto(ElasticNetCV(random_state = 13), "ElasticNetAuto")
build_auto(ExtraTreesRegressor(random_state = 13, min_samples_leaf = 5), "ExtraTreesAuto")
build_auto(GradientBoostingRegressor(random_state = 13, init = None), "GradientBoostingAuto", compact = True)
build_auto(LassoCV(random_state = 13), "LassoAuto")
build_auto(OptimalLGBMRegressor(objective = "regression", n_estimators = 17, num_iteration = 11), "LGBMAuto", compact = True)
build_auto(LinearRegression(), "LinearRegressionAuto")
build_auto(BaggingRegressor(LinearRegression(), random_state = 13, max_features = 0.75), "LinearRegressionEnsembleAuto")
build_auto(RandomForestRegressor(random_state = 13, min_samples_leaf = 3), "RandomForestAuto", compact = True)
build_auto(RidgeCV(), "RidgeAuto")
build_auto(OptimalXGBRegressor(objective = "reg:linear", ntree_limit = 31), "XGBAuto", compact = True)

auto_na_X, auto_na_y = load_auto("AutoNA.csv")

auto_na_X["cylinders"] = auto_na_X["cylinders"].fillna(-1).astype(int)
auto_na_X["model_year"] = auto_na_X["model_year"].fillna(-1).astype(int)
auto_na_X["origin"] = auto_na_X["origin"].fillna(-1).astype(int)

def build_auto_na(regressor, name):
	mapper = DataFrameMapper(
		[([column], [ContinuousDomain(missing_values = None), Imputer()]) for column in ["acceleration", "displacement", "horsepower", "weight"]] +
		[([column], [CategoricalDomain(missing_values = -1), CategoricalImputer(missing_values = -1), PMMLLabelBinarizer()]) for column in ["cylinders", "model_year", "origin"]]
	)
	pipeline = PMMLPipeline([
Example no. 48
for i in range(repeat):
    newtrain, newtest = train_test_split(train, test_size = 0.2)
    newtrain = pd.DataFrame(newtrain, columns = cols)
    newtest = pd.DataFrame(newtest, columns = cols)
    
    #test = test.join(pd.DataFrame(test.Date.apply(splitTime).tolist(), columns = ['year','mon','day']))
    #newtest = test.drop('StateHoliday',1).join(pd.get_dummies(test['StateHoliday']).rename(columns=lambda x: 'StateHoliday' +"_"+str(x)))  
    #newtest = pd.merge(newtest,store, on="Store")
    #newtest.drop(['Date'],axis = 1,inplace=True) 
    
    #assert(np.sum(newtrain.var()==0)==0)
    #
    #toDrop = list(set(newtrain.columns.values)-set(newtest.columns.values) )
    features = [col for col in newtrain.columns if col not in ['Customers', 'Sales', 'Date','LogSale','datetimes']]
    #
    rf = RandomForestRegressor(n_estimators=100)
    print('Starting training...')
    rf.fit(newtrain[features].fillna(-1),newtrain.LogSale)
    print('Predicting train values...')
    newtrain['mypred'] = rf.predict(newtrain[features].fillna(-1))
    newtrain['mypred'] = np.exp(newtrain['mypred'])-1
    train_error = rmspe(newtrain[newtrain.Sales>0].Sales,newtrain[newtrain.Sales>0].mypred)
    print('train set error',train_error)
    newtest['mypred'] = rf.predict(newtest[features].fillna(-1))
    newtest['mypred'] = np.exp(newtest['mypred'])-1
    test_error = rmspe(newtest[newtest.Sales>0].Sales,newtest[newtest.Sales>0].mypred)
    print('test set error',test_error)
    train_results.append(train_error)
    test_results.append(test_error)

print('mean train error', np.mean(train_results))
Example no. 49
# We aren't classifying or separating samples according to their metadata, so we can just use all of the samples
x = data.abun_df[otu].values
y = [float(data.meta_df.loc[smpl, 'age']) for smpl in data.abun_df.index]  # note: want to make sure our y vector is in the same order as the x vector (i.e. that each x and y are for the same sample)
r, p_val = scipy.stats.stats.spearmanr(x,y)

# Look at scatter plot of OTU abundance vs. age to visualize the correlation 
fig, ax= plt.subplots()
ax.scatter(x, y)
ax.set_xlabel('OTU #' + otu)
ax.set_ylabel('Age')
ax.text(0.01,0.95, r'$\rho$ = {:.2f}'.format(r), transform=ax.transAxes)

#%% 3. Build a Random Forest Regressor

## 3.1 Build the regressor
rfreg = RandomForestRegressor(n_estimators=1000, oob_score=True)

# We aren't classifying samples here, so we can just use the whole OTU table to build our regression
X = data.abun_df.values
Y = [float(data.meta_df.loc[smpl, 'BMI']) for smpl in data.abun_df.index]
rfreg = rfreg.fit(X,Y)

## 3.1.1 Look at true vs. predicted values from out of bag estimations
fig, ax = plt.subplots()
ax.scatter(Y, rfreg.oob_prediction_)
ax.set_xlabel('True')
ax.set_ylabel('Predicted')
ax.set_title('RF regression on BMI')

## 3.2 Look at the important features in the regression by inspecting their coefficient weights
feats = pd.DataFrame(index=data.abun_df.columns, columns=['importance'], data=rfreg.feature_importances_)
Example no. 50
output = open(OUTPUT_FILE, 'w')
output.write("step,rmse_tw,rmse_twa,rmse_combined,accuracy\n")

output_log = open(OUTPUT_LOG_FILE, 'w')

log(output_log, "Generating Rmse RF+TW and Rmse RF+TWA")

allObs = []  # initialised here; observed test values are appended inside the loop below
allPredictionTW = []
allPredictionTWA = []

for location in locations:
    trainX, testX, trainY, testY, trainTimestamp, testTimestamp = splitDataForXValidation(
        location, "location", data, tw_features, "target", timestampData)
    model = RandomForestRegressor(min_samples_leaf=9,
                                  n_estimators=59,
                                  n_jobs=-1,
                                  random_state=42)
    model.fit(trainX, trainY)
    testPredictionTW = model.predict(testX)
    rmse = str(rmseEval(testY, testPredictionTW)[1])
    log(output_log, "\tTW rmse: " + rmse)
    for x in testY:
        allObs.append(x)
    for x in testPredictionTW:
        allPredictionTW.append(x)
    trainX, testX, trainY, testY, trainTimestamp, testTimestamp = splitDataForXValidation(
        location, "location", data, twa_features, "target", timestampData)
    model = RandomForestRegressor(min_samples_leaf=9,
                                  n_estimators=59,
                                  n_jobs=-1,
                                  random_state=42)
    # Completion inferred from the parallel TW block above: fit, predict and
    # log the TWA features the same way.
    model.fit(trainX, trainY)
    testPredictionTWA = model.predict(testX)
    rmse = str(rmseEval(testY, testPredictionTWA)[1])
    log(output_log, "\tTWA rmse: " + rmse)
    for x in testPredictionTWA:
        allPredictionTWA.append(x)
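splitDataForXValidation, rmseEval and log are project helpers this fragment does not define; a minimal rmseEval sketch, consistent with the rmseEval(...)[1] indexing used above:

import math

def rmseEval(observed, predicted):
    # Hypothetical helper returning a ("rmse", value) pair, so that
    # rmseEval(...)[1] yields the numeric score as in the loop above.
    se = sum((o - p) ** 2 for o, p in zip(observed, predicted))
    return ("rmse", math.sqrt(se / len(observed)))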
Example n. 51
features = [
    'Store', 'SchoolHoliday', 'Promo', 'cmp msr', 'IsPromotionMonth', 'Year',
    'Month', 'Day', 'DayOfTheWeek', 'WeekOfTheYear', 'StoreType',
    'CompetitionOpenSinceMonth', 'CompetitionDistance', 'PromoOpen'
]  # Features used for prediction

feature_engineering(rossman)
feature_engineering(rossman_test)

X = rossman[features]
y = rossman.Sales  # The value we are going to predict

train_features, test_features, train_predict, test_predict = train_test_split(
    X, y)

randomForest = RandomForestRegressor(n_estimators=35, verbose=True)
# Fit on the training split only, so the held-out split used below stays unseen
randomForest.fit(train_features, train_predict)

errorValue = cross_validation.cross_val_score(randomForest,
                                              rossman[features],
                                              y,
                                              scoring='mean_squared_error',
                                              cv=3)
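In the scikit-learn versions that still expose cross_validation, loss scorers such as 'mean_squared_error' come back negated (greater-is-better convention); a short follow-up under that assumption to report per-fold RMSE:

cv_rmse = np.sqrt(-errorValue)  # undo the sign flip, then take the root
print('CV RMSE per fold:', cv_rmse)
print('mean CV RMSE:', cv_rmse.mean())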

predicted_value = randomForest.predict(test_features)
predicted_value = np.array(predicted_value)

test_predict = np.array(test_predict)
finalResult = randomForest.predict(rossman_test[features])
Example n. 52
class MLCms:
    """

    """
    def __init__(self, config_file=''):
        # Parse config file
        self.parser = SafeConfigParser()
        self.parser.read(config_file)

        # machine learning specific variables
        self.classify = constants.DO_CLASSIFICATION  # Regress or classify?
        self.vars_features = constants.fixed_vars
        self.vars_target = constants.ML_TARGETS

        if self.classify:
            self.var_target = constants.ML_TARGETS
            self.task = 'classification'
            self.model = RandomForestClassifier(n_estimators=2500, n_jobs=constants.ncpu, random_state=0)
        else:
            self.var_target = constants.ML_TARGETS
            self.task = 'regression'
            self.model = RandomForestRegressor(n_estimators=2500, n_jobs=constants.ncpu, random_state=0)  # SVR()

        # Get path to input
        self.path_inp = constants.base_dir + os.sep + constants.name_inp_fl

        # Output directory is <dir>_<classification>_<2014>
        self.path_out_dir = constants.out_dir
        utils.make_dir_if_missing(self.path_out_dir)

        # Model pickle
        self.path_pickle_model = self.path_out_dir + os.sep + constants.model_pickle
        self.path_pickle_features = self.path_out_dir + os.sep + 'pickled_features'

    def output_model_importance(self, gs, name_gs, num_cols):
        """

        :param gs:
        :param name_gs:
        :param num_cols:
        :return:
        """
        rows_list = []
        name_vars = []

        feature_importance = gs.best_estimator_.named_steps[name_gs].feature_importances_
        importances = 100.0 * (feature_importance / feature_importance.max())

        std = np.std([tree.feature_importances_ for tree in self.model.estimators_], axis=0)
        indices = np.argsort(importances)[::-1]

        # Store feature ranking in a dataframe
        for f in range(num_cols):
            dict_results = {'Variable': self.vars_features[indices[f]], 'Importance': importances[indices[f]]}
            name_vars.append(self.vars_features[indices[f]])
            rows_list.append(dict_results)

        df_results = pd.DataFrame(rows_list)
        num_cols = min(10, len(indices))  # Plot up to a maximum of 10 features
        plot.plot_model_importance(num_bars=num_cols, xvals=importances[indices][:num_cols],
                                   std=std[indices][:num_cols], fname=self.task + '_importance_' + self.crop,
                                   title='Importance of variable (' + self.country + ' ' + self.crop_lname + ')',
                                   xlabel=name_vars[:num_cols], out_path=self.path_out_dir)

        df_results.to_csv(self.path_out_dir + os.sep + self.task + '_importance_' + self.crop + '.csv')

    def get_data(self):
        """

        :return:
        """
        df = pd.read_csv(self.path_inp)
        cols = [col for col in df.columns if col not in self.vars_features]
        # cols.extend(['DI', 'PI'])

        # Add information on PI and DI of soils
        # iterate over each row, get lat and lon
        # Find corresponding DI and PI

        lat_lons = zip(df['Long_round'], df['Lat_round'])
        vals_di = []
        vals_pi = []
        # for idx, (lon, lat) in enumerate(lat_lons):
        #     print idx, len(lat_lons)
        #     vals_pi.append(rgeo.get_value_at_point('C:\\Users\\ritvik\\Documents\\PhD\\Projects\\CMS\\Input\\Soils\\PI.tif',
        #                                            lon, lat, replace_ras=False))
        #     vals_di.append(rgeo.get_value_at_point('C:\\Users\\ritvik\\Documents\\PhD\\Projects\\CMS\\Input\\Soils\\DI.tif',
        #                                      lon, lat, replace_ras=False))
        #
        # df['DI'] = vals_di
        # df['PI'] = vals_pi
        df = df[cols]

        data = df.as_matrix(columns=cols[1:])
        target = df.as_matrix(columns=[self.var_target]).ravel()
        # Get training and testing splits
        splits = train_test_split(data, target, test_size=0.2)

        return cols, splits

    def train_ml_model(self):
        """

        :return:
        """
        logger.info('#########################################################################')
        logger.info('train_ml_model')
        logger.info('#########################################################################')

        ######################################################
        # Load dataset
        ######################################################
        cols, splits = self.get_data()
        data_train, data_test, target_train, target_test = splits

        # clf =  ExtraTreesRegressor(500, n_jobs=constants.ncpu)
        # #clf = SVR(kernel='rbf', C=1e3, gamma=0.1)
        # #clf = skflow.TensorFlowDNNClassifier(hidden_units=[10, 20, 10], n_classes=3)
        # data = df_train.as_matrix(columns=cols[1:])  # convert dataframe column to matrix
        # #data = preprocessing.scale(data)
        # target = df_train.as_matrix(columns=[self.var_target]).ravel()  # convert dataframe column to matrix
        # clf.fit(data, target)
        #
        # predict_val = clf.predict(after.as_matrix(columns=cols[1:]))
        # results = compute_stats.ols(predict_val.tolist(), after_target.tolist())
        # print results.rsquared
        # import matplotlib.pyplot as plt
        # plt.scatter(after_target, predict_val)
        # plt.show()
        # pdb.set_trace()
        if not os.path.isfile(self.path_pickle_model):
            # For details in scikit workflow: See http://stackoverflow.com/questions/
            # 35256876/ensuring-right-order-of-operations-in-random-forest-classification-in-scikit-lea
            # TODO Separate out a dataset so that even the grid search cv can be tested
            ############################
            # Select features from model
            ############################
            logger.info('Selecting important features from model')
            if self.classify:
                # the classification branch previously duplicated the regressor
                rf_feature_imp = ExtraTreesClassifier(150, n_jobs=constants.ncpu)
            else:
                rf_feature_imp = ExtraTreesRegressor(150, n_jobs=constants.ncpu)
            feat_selection = SelectFromModel(rf_feature_imp)

            pipeline = Pipeline([
                      ('fs', feat_selection),
                      ('clf', self.model),
                    ])

            #################################
            # Grid search for best parameters
            #################################
            # C_range / gamma_range are only needed if the commented SVR
            # entries in param_grid below are re-enabled
            # C_range = np.logspace(-2, 10, 13)
            # gamma_range = np.logspace(-9, 3, 13)
            logger.info('Tuning hyperparameters')
            param_grid = {
                'fs__threshold': ['mean', 'median'],
                'fs__estimator__max_features': ['auto', 'log2'],
                'clf__max_features': ['auto', 'log2'],
                'clf__n_estimators': [1000, 2000]
                #'clf__gamma': np.logspace(-9, 3, 13),
                #'clf__C': np.logspace(-2, 10, 13)
            }

            gs = GridSearchCV(pipeline, param_grid=param_grid, verbose=2, n_jobs=constants.ncpu, error_score=np.nan)
            # Fit the data before getting the best parameter combination. Different data sets will have
            # different optimized parameter combinations, i.e. without data, there is no optimal parameter combination.
            gs.fit(data_train, target_train)
            logger.info(gs.best_params_)

            data_test = pd.DataFrame(data_test, columns=cols[1:])

            # Update features that should be used in model
            selected_features = gs.best_estimator_.named_steps['fs'].transform([cols[1:]])
            cols = selected_features[0]
            data_test = data_test[cols]

            # Update model with the best parameters learnt in the previous step
            self.model = gs.best_estimator_.named_steps['clf']

            predict_val = self.model.predict(data_test)
            results = compute_stats.ols(predict_val.tolist(), target_test.tolist())
            print results.rsquared
            print cols
            plt.scatter(target_test, predict_val)
            plt.show()
            pdb.set_trace()
            ###################################################################
            # Output and plot importance of model features, and learning curves
            ###################################################################
            self.output_model_importance(gs, 'clf', num_cols=len(cols[1:]))

            if constants.plot_model_importance:
                train_sizes, train_scores, test_scores = learning_curve(self.model, data_train, target_train, cv=k_fold,
                                                                        n_jobs=constants.ncpu)
                plot.plot_learning_curve(train_scores, test_scores, train_sizes=train_sizes, fname='learning_curve',
                                         ylim=(0.0, 1.01), title='Learning curves', out_path=self.path_out_dir)

            # Save the model to disk
            logger.info('Saving model and features as pickle on disk')
            with open(self.path_pickle_model, 'wb') as f:
                cPickle.dump(self.model, f)
            with open(self.path_pickle_features, 'wb') as f:
                cPickle.dump(self.vars_features, f)
        else:
            # Read model from pickle on disk
            with open(self.path_pickle_model, 'rb') as f:
                logger.info('Reading model from pickle on disk')
                self.model = cPickle.load(f)

            logger.info('Reading features from pickle on disk')
            self.vars_features = pd.read_pickle(self.path_pickle_features)

        return df_cc

    def do_forecasting(self, df_forecast, mon_names, available_target=False, name_target='yield'):
        """
        1. Does classification/regression based on already built model.
        2. Plots confusion matrix for classification tasks, scatter plot for regression
        3. Plots accuracy statistics for classification/regression
        :param df_forecast:
        :param mon_names:
        :param available_target: Is target array available?
        :param name_target: Name of target array (defaults to yield)
        :return:
        """
        data = df_forecast.as_matrix(columns=self.vars_features)  # convert dataframe column to matrix
        predicted = self.model.predict(data)

        if available_target:
            expected = df_forecast.as_matrix(columns=[name_target]).ravel()
            if not self.classify:  # REGRESSION
                # Compute stats
                results = compute_stats.ols(predicted.tolist(), expected.tolist())
                bias = compute_stats.bias(predicted, expected)
                rmse = compute_stats.rmse(predicted, expected)
                mae = compute_stats.mae(predicted, expected)

                # Plot!
                plot.plot_regression_scatter(expected, np.asarray(predicted),
                                             annotate=r'$r^{2}$ ' + '{:0.2f}'.format(results.rsquared) + '\n' +
                                             'peak NDVI date: ' + self.time_peak_ndvi.strftime('%b %d'),
                                             xlabel='Expected yield',
                                             ylabel='Predicted yield',
                                             title=mon_names + ' ' + str(int(df_forecast[self.season].unique()[0])),
                                             fname=self.task + '_' + '_'.join([mon_names]) + '_' + self.crop,
                                             out_path=self.path_out_dir)

                # global expected vs predicted
                if self.debug:
                    # any non-existing index will add row
                    self.df_global.loc[len(self.df_global)] = [np.nanmean(expected), np.nanmean(predicted), mon_names,
                                                               self.forecast_yr]

                return predicted, {'RMSE': rmse, 'MAE': mae, r'$r^{2}$': results.rsquared, 'Bias': bias}
            else:  # CLASSIFICATION
                # Convert from crop condition class (e.g. 4) to string (e.g. exceptional)
                expected, predicted = compute_stats.remove_nans(expected, predicted)
                cm = confusion_matrix(expected, predicted, labels=self.dict_cc.keys()).T

                # Compute and plot class probabilities
                proba_cc = self.model.predict_proba(data)
                df_proba = pd.DataFrame(proba_cc, columns=self.dict_cc.values())
                plot.plot_class_probabilities(df_proba, fname='proba_' + '_'.join([mon_names]) + '_' + self.crop,
                                              out_path=self.path_out_dir)

                # Plot confusion matrix
                plot.plot_confusion_matrix(cm, normalized=False, fname='cm_' + '_'.join([mon_names]) + '_' + self.crop,
                                           xlabel='True class', ylabel='Predicted class', ticks=self.dict_cc.values(),
                                           out_path=self.path_out_dir)

                # Normalize and plot confusion matrix
                cm_normalized = normalize(cm.astype(float), axis=1, norm='l1')
                plot.plot_confusion_matrix(cm_normalized, fname='norm_cm_' + '_'.join([mon_names]) + '_' + self.crop,
                                           xlabel='True class', ylabel='Predicted class', normalized=True,
                                           ticks=self.dict_cc.values(), out_path=self.path_out_dir)

                score_accuracy = accuracy_score(expected, predicted) * 100.0
                score_precision = precision_score(expected, predicted, average='weighted') * 100.0
                return predicted, {'Accuracy': score_accuracy, 'Precision': score_precision}
        else:
            return predicted, {'RMSE': np.nan, 'MAE': np.nan, r'$r^{2}$': np.nan, 'Bias': np.nan,
                               'Nash-Sutcliff': np.nan}
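
A minimal usage sketch for the class above; the config path, df_forecast and month label are hypothetical, and the constants module must be set up as the fragment assumes:

ml = MLCms(config_file='cms_config.txt')  # hypothetical config path
ml.train_ml_model()
# forecast on an already-prepared dataframe (assumption for illustration)
predicted, stats = ml.do_forecasting(df_forecast, 'Jul', available_target=True)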
Example n. 53
        'random_state': 3
    }
    adb = AdaBoostRegressor(**params_adb)
    adb.fit(x_train, y_train)
    y_pre_adb = adb.predict(x_test)
    adb_pre.append(y_pre_adb)
    params_rf = {
        'n_estimators': 500,
        'max_depth': 10,
        'min_samples_split': 2,
        'warm_start': True,
        'n_jobs': 4,
        'oob_score': True,
        'max_features': 'log2'
    }
    rf = RandomForestRegressor(**params_rf)
    rf.fit(x_train, y_train)
    y_pre_rf = rf.predict(x_test)
    rf_pre.append(y_pre_rf)

    ###

    RidgeLinearCV = linear_model.RidgeCV(cv=8,
                                         normalize=True,
                                         gcv_mode='auto',
                                         scoring='neg_mean_absolute_error')
    rcv = RidgeLinearCV.fit(x_train, y_train)
    y_pre_rcv = rcv.predict(x_test)
    rcv_pre.append(y_pre_rcv)

    br = BayesianRidge(n_iter=300)
    # Completion inferred from the AdaBoost/RF/Ridge pattern above;
    # br_pre is assumed to be initialised like adb_pre / rf_pre / rcv_pre.
    br.fit(x_train, y_train)
    y_pre_br = br.predict(x_test)
    br_pre.append(y_pre_br)
Example n. 54
    'Soft drinks (inc. fizzy and ready to drink fruit drinks)',
    'Alcoholic drink, tobacco and narcotics', 'Alcoholic drinks',
    'Spirits and liqueurs (brought home)',
    'Wines, fortified wines (brought home)',
    'Beer, lager, ciders and perry (brought home)', 'Alcopops (brought home)',
    'Tobacco and narcotics1', 'Cigarettes',
    'Cigars, other tobacco products and narcotics'
]

predictors = feats_of_interest + food_feats1 + food_feats2
target = "admitted"

#print len(predictors)

# rf below can be swapped for linear_model.LinearRegression(); RandomForestRegressor() is the variant used here
rf = RandomForestRegressor()
X = merge_df[predictors]
Y = merge_df[target]
Y = np.array(Y)
## split data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25)
#print rf.fit(X_train, Y_train)
#print np.mean((rf.predict(X_test) - Y_test) ** 2)
# regressor for all persons as target
#print regr.fit(X_train, Y_train)
#print regr.coef_
#print np.mean((regr.predict(X_test) - Y_test) ** 2)
# regressor for male as target
#print merge_df.columns

#perform random forest with top k features, print the predictor error
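The closing comment describes a step this fragment never implements; a minimal sketch under the fragment's own names (k is a hypothetical cutoff):

k = 10  # hypothetical number of top features to keep
rf.fit(X_train, Y_train)
ranked = sorted(zip(rf.feature_importances_, predictors), reverse=True)
top_k = [name for _, name in ranked[:k]]

# refit on the reduced feature set and report the squared-error estimate
X_top = merge_df[top_k]
Xtr, Xte, Ytr, Yte = train_test_split(X_top, Y, test_size=0.25)
rf_top = RandomForestRegressor()
rf_top.fit(Xtr, Ytr)
print np.mean((rf_top.predict(Xte) - Yte) ** 2)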