def fillAgeNa(data):
    # Fill missing Age values by predicting them with a random forest
    if 'Survived' in data.columns:
        features = ['Survived', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Embarked']
    else:
        features = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Embarked']
    tr = data[data['Age'].notna()]
    te = data[data['Age'].isna()]
    tr_x = tr[features]
    tr_y = tr['Age'].values.ravel()  # flatten the target into a 1-D array
    te_x = te[features]
    rfModel = rfr(n_estimators=100)  # random forest with otherwise default parameters
    rfModel.fit(tr_x, tr_y)
    Age_predict = rfModel.predict(te_x)
    data.loc[data['Age'].isna(), 'Age'] = Age_predict
    data['Age'] = data['Age'].astype(int)  # truncate ages to whole years
    return data
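A minimal usage sketch, assuming Titanic-style data; the file path and encoding choices are illustrative, and Sex/Embarked must already be numeric before the forest can fit:

import pandas as pd
from sklearn.ensemble import RandomForestRegressor as rfr

train = pd.read_csv('train.csv')  # hypothetical path
# the forest needs numeric inputs, so encode the categorical columns first
train['Sex'] = train['Sex'].map({'male': 0, 'female': 1})
train['Embarked'] = train['Embarked'].fillna('S').map({'S': 0, 'C': 1, 'Q': 2})
train['Fare'] = train['Fare'].fillna(train['Fare'].median())
train = fillAgeNa(train)
print(train['Age'].isna().sum())  # 0 once every gap is filled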
def best_model(xt, xv, yt, yv):
	models = []

	name_dt = "DecisionTreeRegressor"
	model_dt = dtr(random_state=1) # decision tree
	model_dt.fit(xt, yt)
	models.append({'name': name_dt, 'model': model_dt, 'mae': get_mae(model_dt, xv, yv)})

	name_rf = "RandomForestRegressor"
	model_rf = rfr(random_state=1) # random forest
	model_rf.fit(xt, yt)
	models.append({'name': name_rf, 'model': model_rf, 'mae': get_mae(model_rf, xv, yv)})

	name_xgb = "XGBRegressor"
	model_xgb = xgb(random_state=1, n_estimators=10000, learning_rate=0.01) # xgboost
	model_xgb.fit(xt, yt, early_stopping_rounds=10, eval_set=[(xv, yv)], verbose=False)
	models.append({'name': name_xgb, 'model': model_xgb, 'mae': get_mae(model_xgb, xv, yv)})
	
	print("\n")
	for m in models:
		print("Model {} has MAE {}".format(m.get('name'), m.get('mae')))

	min_mae = min(i['mae'] for i in models)
	best_model = [m for m in models if m.get('mae') == min_mae]
	print("\nBest model pick: ", best_model[0].get('name'))
	print("\n")

	return best_model[0].get('model')
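best_model relies on a get_mae helper that is not shown; a minimal sketch of what it presumably does, with the signature inferred from the call sites:

from sklearn.metrics import mean_absolute_error

def get_mae(model, xv, yv):
    # score an already-fitted model on the validation split
    return mean_absolute_error(yv, model.predict(xv))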
def fill_missing_rf(X, y, to_fill):
    """
    Fill the missing values of one feature using a random forest.

    Parameters:
    X: feature matrix containing the column to fill
    y: complete label column with no missing values
    to_fill: string, name of the column to fill
    """

    # Build the new feature matrix and the new label
    df = X.copy()
    fill = df.loc[:, to_fill]
    df = pd.concat([df.loc[:, df.columns != to_fill], pd.DataFrame(y)], axis=1)

    # Split into a training set (rows where the column is present)
    # and a test set (rows where it is missing)
    Ytrain = fill[fill.notnull()]
    Ytest = fill[fill.isnull()]
    Xtrain = df.iloc[Ytrain.index, :]  # assumes a clean 0..n-1 RangeIndex
    Xtest = df.iloc[Ytest.index, :]

    # Fill the missing values with a random forest regression
    from sklearn.ensemble import RandomForestRegressor as rfr
    rfr = rfr(n_estimators=100)  # e.g. random_state=0, n_estimators=200, max_depth=3, n_jobs=-1
    rfr = rfr.fit(Xtrain, Ytrain)
    Ypredict = rfr.predict(Xtest)

    return Ypredict
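A hedged usage sketch on Titanic-style data (names illustrative; Sex must already be numeric). The reset_index matters because the function looks rows up with iloc:

X = train[['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Age']].reset_index(drop=True)
y = train['Survived'].reset_index(drop=True)
age_pred = fill_missing_rf(X, y, 'Age')
train.loc[train['Age'].isna(), 'Age'] = age_pred  # positions match the missing rows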
Example #4
    def create_model(self, filepath):
        train_data = pd.read_csv(filepath)
        self.rf_model = rfr(random_state=1)
        X = train_data[self.features]
        y = train_data.SalePrice

        self.rf_model.fit(X, y)
Example #5
def forward_multiRF_solver(X, y): 
    xtr, xte, ytr, yte = train_test_split(X, y.ravel(), shuffle=True, random_state=0)
    rf = rfr(n_estimators=20)
    rf.fit(xtr, ytr)
    
    plt.figure()
    p = np.random.choice(pi_line)
    
    u = cb.init_u(p) + cb.init_u(np.random.choice(pi_line))
    
    _, abs_work = LW_solver(u, "u_test", write=True)
    
    fetch_real_u = lambda it : np.load(osp.join(abs_work, "u_test_it%d.npy"%(it)))
    
    u_nNext = []
    
    for it in range(1, cb.itmax) :
        if it > 1 :
            u = u_nNext
            u_nNext = []
            
        xs = np.array(u).reshape(1,-1)
        u_nNext.append(rf.predict(xs))
        
        u_nNext.insert(0, u[-2])
        u_nNext.insert(len(u), u[1])
        
        u_nNext = np.array(u_nNext).ravel()
        
        plt.clf()        
        plt.plot(cb.line_x[1:cb.Nx-1], fetch_real_u(it+1)[1:cb.Nx-1], label="True it = %d" %(it+1), c='k')
        plt.plot(cb.line_x[1:cb.Nx-1], u_nNext[1:cb.Nx-1], label="RF Predicted at it = %d" %(it), marker='o', fillstyle = 'none', linestyle= 'none', c="green")
        plt.legend()
        plt.pause(2)
Example #6
def get_regression_model(algo, poly_Order=2, **kwargs):
    # for key in kwargs: print(key) sys.exit()
    print_mod_info = False
    ### Regression models
    ### https://stackoverflow.com/questions/12860841/python-import-in-if

    if algo == 'XGR':
        mod = xgr(**kwargs)
    elif algo == 'RFR':
        mod = rfr(**kwargs)
    elif algo == 'ABR':
        mod = abr(**kwargs)
    elif algo == 'P1R':
        mod = LinearRegression(**kwargs)
    elif algo == 'P2R':
        mod = make_pipeline(PolynomialFeatures(poly_Order), Ridge(**kwargs))
    elif algo == 'ANN':
        mod = MLPRegressor(**kwargs)
    elif algo == 'ELN':
        mod = ElasticNet(**kwargs)  # add parameters later
    elif algo == 'E2R':
        mod = make_pipeline(PolynomialFeatures(poly_Order),
                            ElasticNet(**kwargs))
    elif algo == 'PLS':
        mod = PLSRegression(**kwargs)
    else:
        print('Algorithm has not yet been added to the menu.')
        sys.exit()

    return mod
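A usage sketch, assuming the module-level aliases this function dispatches to (rfr, xgr, abr, ...) are imported and that the train/validation splits exist:

mod = get_regression_model('RFR', n_estimators=200, random_state=0)
mod.fit(X_train, y_train)
print(mod.score(X_valid, y_valid))  # R^2 on the validation split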
def getModel(X_train, y_train, sorted_scores, numFeatures, estimators):
    np.random.seed(42)
    # ordered list of the numFeatures most important features
    included_features = np.array(sorted_scores)[:, 0][:numFeatures]
    X = X_train[included_features]

    mean_rfrs = []
    std_rfrs_upper = []
    std_rfrs_lower = []
    # yt = [i for i in Y["SalePrice"]]

    # for each number of estimators, fit the model and score it with 10-fold cross-validation
    for i in estimators:
        model = rfr(n_estimators=i, max_depth=None)
        scores_rfr = cross_val_score(model,
                                     X,
                                     y_train,
                                     cv=10,
                                     scoring="explained_variance")
        print("estimators:", i)
        #     print('explained variance scores for k=10 fold validation:',scores_rfr)
        print("Est. explained variance: %0.2f (+/- %0.2f)" %
              (scores_rfr.mean(), scores_rfr.std() * 2))
        print("")
        mean_rfrs.append(scores_rfr.mean())
        std_rfrs_upper.append(scores_rfr.mean() +
                              scores_rfr.std() * 2)  # for error plotting
        std_rfrs_lower.append(scores_rfr.mean() -
                              scores_rfr.std() * 2)  # for error plotting
    return mean_rfrs, std_rfrs_upper, std_rfrs_lower
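The three returned lists are intended for an error-band plot; a minimal matplotlib sketch (the estimator counts and feature budget are illustrative):

import matplotlib.pyplot as plt

estimators = [10, 50, 100, 200]
means, upper, lower = getModel(X_train, y_train, sorted_scores, 20, estimators)
plt.plot(estimators, means, marker='o')
plt.fill_between(estimators, lower, upper, alpha=0.3)  # +/- 2 std band
plt.xlabel('n_estimators')
plt.ylabel('explained variance (10-fold CV)')
plt.show()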
Example #8
def rfrModel(train_X, test_X, train_y, test_y):
    """Using a random forest regressor."""
    from sklearn.ensemble import RandomForestRegressor as rfr
    model = rfr(random_state=1)  # defining the model
    model.fit(train_X, train_y)  # fitting the model on the training set
    y_pred = model.predict(test_X)  # predicting on the test values
    # evaluate the model using mean absolute error
    return mae(test_y, y_pred)
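mae here is presumably sklearn's mean absolute error imported under an alias:

from sklearn.metrics import mean_absolute_error as mae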
Example #9
def randomRegression(x, y, testX, testY):

    model = rfr()

    model.fit(x, y)

    print("Fitting Complete. Displaying Results...")

    print("R^2 Score:", model.score(testX, testY))
Example #10
def createPeakTestModel(train, test):
    forest = rfr()
    forest.fit(train[:, :-1], train[:, -1])
    scores = cross_val_score(forest, test[:, :-1], test[:, -1])
    print(test.shape)
    print(train.shape)
    print(scores.mean())
    for x in range(test.shape[0]):
        print("%i vs %i given %s" % (test[x, -1],
                                     forest.predict(test[x, :-1].reshape(1, -1)),
                                     test[x, :-1]))
Example #11
def model_randomforest_regressor(X_train, X_test, y_train, y_test):
    model_name = f'model_{count}_randomforest_regressor'  # count is a module-level counter

    model = rfr()
    model.fit(X_train, y_train)
    model.independentcols = independentcols  # record the feature names on the fitted model

    score = model.score(X_test, y_test)  # R^2, not classification accuracy

    print(f'{model_name} R^2 score: {score}')
    joblib.dump(model, f'model/{model_name}.joblib')
Example #12
    def regression(self, metric="neg_root_mean_squared_error", folds=10, alphas=[], graph=False):
        size = 1.3 * self.report_width // 10

        models = {}
        models["Linear regressor"]                  = lr()
        models["Lasso regressor"]                   = lassor()
        models["Lasso CV regressor"]                = lassocvr()
        models["Ridge regressor"]                   = rr(alpha=0, normalize=True)
        models["Ridge CV regressor"]                = rcvr(alphas = alphas)
        models["K nearest neighbors regressor K2u"] = knnr(n_neighbors=2, weights='uniform')
        models["K nearest neighbors regressor K2d"] = knnr(n_neighbors=2, weights='distance')
        models["K nearest neighbors regressor K5"]  = knnr(n_neighbors=5)
        models["K nearest neighbors regressor K10"] = knnr(n_neighbors=10)
        models["SGD regressor"]                     = sgdr(max_iter=10000, warm_start=True)
        models["Decision tree regressor"]           = dtr()
        models["Decision tree regressor D3"]        = dtr(max_depth=3)
        models["Random forest regressor"]           = rfr()
        models["Ada boost regressor"]               = abr()
        models["Gradient boost regressor"]          = gbr()
        models["Support vector regressor"]          = svr()
        self.models = models

        print('\n')
        print(self.report_width * '*', '\n*')
        print('* REGRESSION RESULTS - BEFORE PARAMETERS BOOSTING \n*')
        #kf = StratifiedKFold(n_splits=folds, shuffle=True)
        kf = KFold(n_splits=folds)
        results = []
        names = []
        for model_name in models:
            cv_scores = -1 * cross_val_score(models[model_name], self.Xt_train, self.yt_train.values.ravel(), cv=kf, scoring=metric)  
            results.append(cv_scores)
            names.append(model_name)
        print(self.report_width * '*', '')
        report = pd.DataFrame({'Regressor': names, 'Score': results})
        report['Score (avg)'] = report.Score.apply(lambda x: x.mean())
        report['Score (std)'] = report.Score.apply(lambda x: x.std())
        report['Score (VC)'] = 100 * report['Score (std)'] / report['Score (avg)']
        report.sort_values(by='Score (avg)', inplace=True)
        report.drop('Score', axis=1, inplace=True)
        display(report)
        print('\n')
        if graph:
            fig, ax = plt.subplots(figsize=(size, 0.5 * size))
            plt.title('Regressor Comparison')
            #ax = fig.add_subplot(111)
            plt.boxplot(results)
            ax.set_xticklabels(names)
            plt.xticks(rotation=45)
            plt.subplots_adjust(hspace=0.0)
            plt.show()             
        return None
def linear_regression_model(train, validation, alpha, depth=None, random_state=0):

    X_train = train['X'].values[:, 1:]
    y_train = np.ravel(train['y'].values)
    X_validation = validation['X'].values[:, 1:]
    y_validation = np.ravel(validation['y'].values)
    # fit_intercept/normalize/max_iter/tol were undefined names in the original;
    # defaults are used here, and 'mse' is spelled 'squared_error' in sklearn >= 1.0
    models = {
        'type': ['ridge', 'decision tree', 'random forest'],
        'model': [
            Ridge(alpha=alpha,
                  fit_intercept=True,
                  random_state=random_state),
            dt(criterion='squared_error',
               splitter='best',
               max_depth=depth,
               min_samples_split=2,
               min_samples_leaf=1,
               random_state=random_state),
            rfr(n_estimators=100,
                criterion='squared_error',
                max_depth=depth,
                min_samples_split=2,
                min_samples_leaf=1,
                random_state=random_state)
        ],
        'score_train': [],
        'score_valid': [],
        'mse_train': [],
        'mse_valid': []
    }
    y_train_predict = []
    y_valid_predict = []
    for i in np.arange(0, len(models['type']), 1):
        m = models['model'][i]
        m.alpha = alpha  # only meaningful for Ridge; a harmless extra attribute on the trees
        m.fit(X_train, y_train)
        models['score_train'].append(m.score(X_train, y_train))
        models['score_valid'].append(m.score(X_validation, y_validation))
        y_train_predict.append(m.predict(X_train))
        y_valid_predict.append(m.predict(X_validation))
        models['mse_train'].append(mse(y_train, y_train_predict[i]))
        models['mse_valid'].append(mse(y_validation, y_valid_predict[i]))
    print('models: ', models['type'])
    print('R2 training:', models['score_train'])
    print('R2 validation:', models['score_valid'])
    print('MSE training:', models['mse_train'])
    print('MSE validation:', models['mse_valid'])
    return models
Example #14
def regressor(file, X, Y, x, y):
    param = []
    acc = []
    criterion = ['squared_error', 'absolute_error']  # 'mse'/'mae' were renamed in sklearn 1.0
    # the remaining hyperparameter lists are module-level globals; min_impurity_split
    # was removed from sklearn, so it is dropped from the grid here
    keys = ['n_estimators', 'criterion', 'max_depth', 'min_samples_split',
            'min_samples_leaf', 'min_weight_fraction_leaf', 'max_features',
            'max_leaf_nodes', 'min_impurity_decrease', 'bootstrap', 'oob_score',
            'n_jobs', 'random_state', 'verbose', 'warm_start']
    for i in it.product(n_estimators, criterion, max_depth, min_samples_split,
                        min_samples_leaf, min_weight_fraction_leaf,
                        max_features, max_leaf_nodes, min_impurity_decrease,
                        bootstrap, oob_score, n_jobs,
                        random_state, verbose, warm_start):
        # print(*i)
        forest = rfr(**dict(zip(keys, i)))  # these parameters are keyword-only in modern sklearn
        forest.fit(X, Y)
        # print('Accuracy: ' + str(forest.score(x, y)) + '\n')
        acc.append(forest.score(x, y))
        param.append([*i])
    _results(file, acc, param)
Example #15
def set_missing_ages(df):
    # take the numeric features that are already available
    age_df = df[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]

    # split passengers into known-age and unknown-age groups
    # (.values replaces the as_matrix() call removed from pandas)
    known_age = age_df[age_df.Age.notnull()].values
    unknown_age = age_df[age_df.Age.isnull()].values  # computed but unused; the caller must rebuild it to predict

    # training set
    y = known_age[:, 0]
    X = known_age[:, 1:]

    regressor = rfr(random_state=0, n_estimators=2000, n_jobs=-1)
    regressor.fit(X, y)

    return regressor
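A usage sketch pairing the returned regressor with the unknown-age rows; the caller has to redo the split, since only the fitted model comes back:

regressor = set_missing_ages(df)
age_df = df[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
unknown = age_df[age_df.Age.isnull()].values
df.loc[df.Age.isnull(), 'Age'] = regressor.predict(unknown[:, 1:])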
def fill_missing_rf(X, y, to_fill):
    df = X.copy()
    fill = df.loc[:, to_fill]
    df = pd.concat([df.loc[:, df.columns != to_fill], pd.DataFrame(y)], axis=1)
    # split into the training set and the test set
    Ytrain = fill[fill.notnull()]
    Ytest = fill[fill.isnull()]
    Xtrain = df.iloc[Ytrain.index, :]
    Xtest = df.iloc[Ytest.index, :]
    # fill the missing values with a random forest regression
    from sklearn.ensemble import RandomForestRegressor as rfr

    rfr = rfr(n_estimators=100)
    rfr = rfr.fit(Xtrain, Ytrain)
    Ypredict = rfr.predict(Xtest)
    return Ypredict
Example #17
    def process(self):
        data = self.parameters.get('in')
        target_col = self.parameters.get('target_col', None)
        features_col = self.parameters.get('features_col', None)
        kwargs = self.parameters.get('parameters', None) or {}  # guard: rfr(**None) would raise

        if features_col is None:
            features_col = list(set(data.columns) - set([target_col]))

        targets = np.array(data[target_col])
        features = np.array(data[features_col])

        rf_model = rfr(**kwargs)
        rf_model.fit(features, targets)

        return rf_model
def runRF(train_1_x, test_x, hold_out, sub_test, fobj):
    ntrees = 100
    exp = 30

    # keep only the rows whose target (the last column) is <= exp
    c_train_1_x = train_1_x[(train_1_x[:, -1] <= exp)]
    c_train_y = c_train_1_x[:, -1]
    c_train_x = c_train_1_x[:, :-1]

    rf_model = rfr(n_estimators=ntrees, n_jobs=-1)
    est = rf_model.fit(c_train_x, c_train_y)
    train_y_pred = est.predict(c_train_x)
    error = mt.mean_absolute_error(c_train_y, train_y_pred)
    print('RF Train-1 Error: %r\n' % (error))
    fobj.write('RF Train-1 Error: %r\n' % (error))
    valid_y_pred = est.predict(test_x)
    hold_y = est.predict(hold_out)
    sub_y = est.predict(sub_test)
    return est, valid_y_pred, hold_y, sub_y
def get_regression_model(algo, settings, print_mod_info=False):
    ### Regression models
    ### https://stackoverflow.com/questions/12860841/python-import-in-if
    if algo == 'XGR':
        mod = xgr(n_estimators=settings[0], max_depth=settings[1])
        if print_mod_info: print('XGBoost:', mod)
    elif algo == 'RFR':
        mod = rfr(n_estimators=settings[0])
        if print_mod_info: print('Random Forest:', mod)
    elif algo == 'ABR':
        mod = abr(n_estimators=settings[0])
        if print_mod_info: print('AdaBoost:', mod)
    elif algo == 'P1R':
        mod = LinearRegression()
        if print_mod_info: print('Linear:', mod)
    elif algo == 'P2R':
        mod = make_pipeline(PolynomialFeatures(settings[0]), Ridge())
        if print_mod_info: print('Poly 2:', mod)
    elif algo == 'ANN':
        mod = MLPRegressor(
            solver='lbfgs',
            hidden_layer_sizes=(settings[0], settings[1]),  # (137,73), 
            tol=settings[2])
        if print_mod_info: print('Neural Net Regression:', mod)
    elif algo == 'ELN':
        mod = ElasticNet(alpha=settings[0],
                         l1_ratio=settings[1])  # add parameters later
        if print_mod_info: print('Elastic Net Regression:', mod)
    elif algo == 'E2R':
        mod = make_pipeline(
            PolynomialFeatures(settings[0]),
            ElasticNet(alpha=settings[1], l1_ratio=settings[2]))
        if print_mod_info: print('Poly 2:', mod)
    elif algo == 'PLS':
        mod = PLSRegression(n_components=settings[0])
        if print_mod_info: print('Partial Least Squares Regression:', mod)
    else:
        print('Algorithm not set up yet.')
        sys.exit()

    return mod
Example #21
    def fit(self, losses, configs=None):

        if configs is None:
            configs = [[]] * len(losses)  # the original referenced an undefined name 'times'

        # convert learning curves into X and y data

        X = []
        y = []

        for l, c in zip(losses, configs):
            l = self.apply_differencing(l)

            for i in range(self.order, len(l)):
                X.append(np.hstack([l[i - self.order:i], c]))
                y.append(l[i])

        self.X = np.array(X)
        self.y = np.array(y)

        self.rfr = rfr().fit(self.X, self.y)
Example #22
def main():
  # read train data
  filenames = ["ct_rac_S000_JT00_2013.csv",
               "ny_rac_S000_JT00_2013.csv",
               "ca_rac_S000_JT00_2013.csv",
               "mt_rac_S000_JT00_2013.csv"]

  print("loading ct...")
  X, y, ct_d = load(filenames[0])

  return  # early exit left in place by the author; everything below is unreachable
  # print("loading ny...")
  # ny_X, ny_y, ny_d = load(filenames[1])
  # print("loading ca...")
  # ca_X, ca_y, ca_d = load(filenames[2])
  # print("loading mt...")
  # mt_X, mt_y, mt_d = load(filenames[3])

  # lr = LinearRegression()
  # lr.fit(X, y)
  # predict_y = lr.predict(X)

  # print("ct: ", getmse(y, predict_y))

  # predict_y = lr.predict(ny_X)
  # print("ny: ", getmse(ny_y, predict_y))

  # predict_y = lr.predict(ca_X)
  # print("ca: ", getmse(ca_y, predict_y))

  # predict_y = lr.predict(mt_X)
  # print("mt: ", getmse(mt_y, predict_y))

  print("training data...")
  rtree = rfr(n_estimators=79, max_features=10, oob_score=True)

  rtree.fit(X, y)
  print(rtree.feature_importances_)
def regression(X, y):

    print(X.shape, y.shape)
    score = 0
    folds = 4
    forest = rfr(n_estimators=5)

    # K-fold split (note: plain KFold does not actually stratify the labels)
    skf = KFold(n_splits=folds, shuffle=True)
    for train_index, test_index in skf.split(X):
        Xtrain, Xtest = X[train_index], X[test_index]
        ytrain, ytest = y[train_index], y[test_index]

        Xtrain = np.array(Xtrain, dtype='float64')
        Xtest = np.array(Xtest, dtype='float64')
        #Xtrain[np.isinf(Xtrain)] = 0
        forest.fit(Xtrain, ytrain)

        error = 0
        errorList = []
        predictions = []
        for i in range(0, Xtest.shape[0]):
            a = np.transpose(Xtest[i, :].reshape(Xtest[i, :].shape[0], 1))

            pr = forest.predict(a)

            temp_err = np.absolute(pr - ytest[i]) * 60
            errorList.append(temp_err)
            predictions.append(pr)
            error += temp_err

        visualize(ytest, errorList, predictions)

        print('Average error in minutes: {0}'.format(error / Xtest.shape[0]))
        print('Max/min/median error: {0} , {1} , {2}'.format(
            max(errorList), min(errorList), np.median(errorList)))
        del errorList[:]
        del predictions[:]
Example #25
def select_model(model):

    pca = PCA(n_components=5)
    svr = SVR()
    ss = StandardScaler()
    rf = rfr()

    if model == 'svr':
        pip_svm = Pipeline([
            ('pca', pca),
            ('ss', ss),
            #('rf',rf)
            ('svm', svr),
        ])
        params_svr = {
            "pca__n_components": [i for i in range(1, 6)],
            "svm__kernel": ['rbf'],
            "svm__gamma": [10**i for i in range(-4, 0)],
            "svm__C": [10**i for i in range(1, 4)]
        }
        return pip_svm, params_svr

    else:
        pip_rf = Pipeline([('pca', pca), ('ss', ss), ('rf', rf)
                           #('svm',svr),
                           ])
        params_rf = {
            #"pca__n_components":[i for i in range(1, len(x.columns))],
            'rf__n_estimators': [50, 100, 300],
            'rf__max_features': [i for i in range(1, 6)],
            'rf__random_state': [0],
            'rf__n_jobs': [-1],
            #'rf__min_samples_split' : [3, 5, 10, 15, 20, 25, 30, 40, 50, 100],
            'rf__max_depth': [30, 40, 50, 100]
        }

        return pip_rf, params_rf
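A usage sketch wiring the returned pipeline and grid into GridSearchCV (data names illustrative):

from sklearn.model_selection import GridSearchCV

pipe, params = select_model('rf')  # anything other than 'svr' yields the forest pipeline
search = GridSearchCV(pipe, params, cv=5)
search.fit(X_train, y_train)
print(search.best_params_, search.best_score_)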
Example #26
def fillMissingValue(df, fy, WaytoFillNan=Env_var.get('WaytoFillNan')):
    train_data_temp = df[df.iloc[:, fy].notnull()]
    test_data_temp = df[df.iloc[:, fy].isnull()]
    train_y = train_data_temp.iloc[:, fy]
    train_X = train_data_temp.drop(train_data_temp.columns[fy], axis=1)
    test_X = test_data_temp.drop(test_data_temp.columns[fy], axis=1)
    # impute the other columns jointly so train and test share the same fill values
    # (SimpleImputer replaces sklearn's removed Imputer; pd.concat replaces DataFrame.append)
    mixed_X = SimpleImputer().fit_transform(pd.concat([train_X, test_X], ignore_index=True))
    length_train = len(train_X)
    train_X = mixed_X[:length_train, :]
    test_X = mixed_X[length_train:, :]

    if WaytoFillNan == 'rfr':
        print("Try to fill-up value with rfr")
        rfr_regressor = rfr(n_estimators=100, verbose=5, n_jobs=-1)
        rfr_regressor.fit(train_X, train_y)
        y_pred = rfr_regressor.predict(test_X)
        print(y_pred)
        # assign by position of the missing rows; the original pd.Series-based
        # fillna would not have aligned with the NaN row labels
        df.loc[df.iloc[:, fy].isnull(), df.columns[fy]] = y_pred

    elif (WaytoFillNan == 'mean'):
        print ("Try to fill-up value with mean")
        df.iloc[:,fy] = df.iloc[:,fy].fillna(value = np.mean(df.iloc[:,fy]))
        
    elif WaytoFillNan == 'ffill':
        df.iloc[:, fy] = df.iloc[:, fy].ffill()  # fillna(method=...) is deprecated

    elif WaytoFillNan == 'bfill':
        df.iloc[:, fy] = df.iloc[:, fy].bfill()

    elif (WaytoFillNan == 'knn'):
        impute = KnnIm()
        df.iloc[:,fy] = (impute.knn(X=df, column=fy, k=10))[:,fy]
    
    return df
def fill_missing_rf(X, y, to_fill):
    """
    Fill the missing values of one feature using a random forest.
    Parameters:
    X: feature matrix containing the column to fill
    y: complete label column with no missing values
    to_fill: string, name of the column to fill
    """

    # Build the new feature matrix and the new label
    df = X
    # the column whose missing values are to be filled
    fill = df.loc[:, to_fill]
    # concatenate every other feature with the label y into a temporary feature set
    df = pd.concat([df.loc[:, df.columns != to_fill], pd.DataFrame(y)], axis=1)

    # Split into training and test sets
    # rows where the target column is present become the training labels
    Ytrain = fill[fill.notnull()]
    # rows where it is missing become the test labels
    Ytest = fill[fill.isnull()]

    # Note: the dataset is assumed to have a reset 0..n-1 index, so DataFrame.iloc[index]
    # works here; with a scrambled index you would need DataFrame.loc[index]
    Xtrain = df.iloc[Ytrain.index, :]
    Xtest = df.iloc[Ytest.index, :]

    # Fill the missing values with a random forest regression
    from sklearn.ensemble import RandomForestRegressor as rfr
    rfr = rfr(n_estimators=100)  # e.g. random_state=0, n_estimators=200, max_depth=3, n_jobs=-1
    rfr = rfr.fit(Xtrain, Ytrain)
    Ypredict = rfr.predict(Xtest)  # an ndarray, automatically re-indexed from 0

    return Ypredict
X_train = data[0:1460, :]
'''from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()
X_train = sc_X.fit_transform(X_train)
y = [y_train]
y = np.transpose(y)
y_train = sc_y.fit_transform(y)

from sklearn.svm import SVR
svr = SVR(kernel = 'rbf', C = 40)
svr.fit(X_train, y_train)'''

from sklearn.ensemble import RandomForestRegressor as rfr
regressor = rfr(n_estimators=285, random_state=42)  #0.14751
regressor.fit(X_train, y_train)
print(regressor.score(X_train, y_train))  # R^2 on the training set

X_test = data[1460:, :]
# for other scoring options see sklearn's list of scoring strings
# (e.g. 'r2', 'neg_mean_squared_error', 'neg_mean_squared_log_error')
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator=regressor,
                             X=X_train,
                             y=y_train,
                             scoring='neg_mean_squared_log_error',
                             cv=5)
print(accuracies.mean())
print(accuracies.std())

from sklearn.model_selection import GridSearchCV
#training_weights = training_data['weight']
training_data, training_location_data, training_response, training_weights = preprocess_dataframe( training_data)

print "Shape: ", training_data.shape

### CREATE MODEL ###
bl = Blender( verbose = True, training_fraction = 0.95)



bl.add_model( lclR.LocalRegression(k = 500 , regressor = SmartSVR, params = {'gamma':0.0001}), "SmrtSVR")
bl.add_model( lclR.LocalRegression(k = 500 , regressor = SmartSVR, params = {'gamma':0.001, "C":50}), "LooseSmrtSVR")
bl.add_model( lclR.LocalRegression(k = 500, regressor = sklm.ElasticNet, params = {'alpha':0.0001, "normalize":True}), "ElNet500" )
bl.add_model( lclR.LocalRegression(k = 500, regressor = rfr, params={"n_jobs":5, "n_estimators": 10 } ), "rfr50est")
bl.add_model( lclR.LocalRegression( k=500, regressor = sklm.Lasso, params={"alpha":0.01, "normalize":True}), "lasso" )
bl.add_model( rfr( n_jobs=150, n_estimators=200 ), "Globalrfr50est")
bl.add_model( lclR.LocalRegression( k=750, regressor = sklm.Ridge, feature_selection = True, params={ 'alpha':0.001, 'normalize':True} ), "RidgeWithFeatureSelection" )
bl.add_model( lclR.LocalRegression( k=750, regressor = sklm.Ridge, feature_selection = False, params={"alpha":0.1, "normalize":True}), "Ridge")
bl.add_model( lclR.LocalRegression( k = 250, regressor = sklm.Ridge, feature_selection = False, params={"alpha":1.5, "normalize":True} ), "LocalRidge")
lc = [ training_location_data.values ]

bl.fit( training_data.values, training_response.values, {"SmrtSVR": lc, "ElNet500":lc, "RidgeWithFeatureSelection":lc, "rfr50est":lc, "lasso":lc, "Ridge":lc, "LooseSmrtSVR":lc, "LocalRidge":lc } )
 

lc = [test_location_data.values]

print()
print("Coefficients:")
print(bl.coef_)
print("begin prediction")
prediction = bl.predict( test_data.values, { "lasso":lc, "rfr50est":lc, "SmrtSVR": lc, "ElNet500":lc, "RidgeWithFeatureSelection":lc, "Ridge":lc, "LooseSmrtSVR":lc, "LocalRidge":lc } )
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('Position_Salaries.csv')
x = dataset.iloc[:, 1:2].values
y = dataset.iloc[:, 2].values

# Splitting the dataset into the Training set and Test set
"""from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)"""

# Feature Scaling
"""from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
sc_y = StandardScaler()
y_train = sc_y.fit_transform(y_train)"""

# Fitting the Random Forest Regression to the dataset
from sklearn.ensemble import RandomForestRegressor as rfr
regressor = rfr(n_estimators=300, random_state=0)
regressor.fit(x, y)

# Predicting a new result
y_pred = regressor.predict([[6.5]])

# Visualising the Regression results (for higher resolution and smoother curve)
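The snippet stops at the visualization comment; a sketch of the high-resolution grid plot it announces, following the usual pattern for this dataset:

x_grid = np.arange(x.min(), x.max(), 0.01).reshape(-1, 1)
plt.scatter(x, y, color='red')
plt.plot(x_grid, regressor.predict(x_grid), color='blue')
plt.title('Random Forest Regression')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()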
Example #31
# (the start of loadData was truncated in the source; only its tail survived)
	except IndexError:
	    print("Need file name as argument")
	return x, y




x, y = loadData()
x_val = x[:1000]
y_val = y[:1000]

x = x[1000:]
y = y[1000:]


model = rfr(n_estimators=10, criterion='squared_error', n_jobs=7, verbose=0)  # 'mse' was renamed in sklearn 1.0

kf = KFold(n_splits=5)
d = {}
for train, test in kf.split(x):
	x_train, x_test = x[train], x[test]
	y_train, y_test = y[train], y[test]
	w = model.fit(x_train, y_train)
	pred_test = w.predict(x_test)
	m_test = msle(y_test, pred_test)
	print("msle for testing set is: ", m_test)
	pred = w.predict(x_val)
	#print(pred[:100], y_val[:100])
	m_val = msle(y_val, pred)
	print("msle for validation set is ", m_val )
	d[m_test*0.3 + m_val*0.7] = w
Example #32
# a scratchpad of alternative regressors; each assignment simply overwrites `model`
model = RadiusNeighborsRegressor(radius=0.5, p=2)
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=10, max_depth=8,
                              min_samples_split=2)
from sklearn.ensemble import AdaBoostRegressor
model = AdaBoostRegressor(n_estimators=400)
from sklearn.ensemble import GradientBoostingRegressor
model = GradientBoostingRegressor(n_estimators=100,
                                  learning_rate=0.1, max_depth=10)
from sklearn.ensemble import BaggingRegressor
mb = model
# sklearn >= 1.4 renames base_estimator to estimator
model = BaggingRegressor(base_estimator=mb, n_estimators=20, bootstrap=True,
                         bootstrap_features=True, max_samples=0.3, max_features=0.3)
model = LR(C=0.004)
model = LR(C=0.01, penalty='l1')
model = rfr(n_estimators=2000, max_depth=1, min_samples_leaf=20,
            min_samples_split=100)
model = BR(alpha_1=1e2, alpha_2=3e2, lambda_1=1e-9,
           lambda_2=1e-9, compute_score=False)
model_dtr = dtr(max_depth=3, min_samples_leaf=5, max_leaf_nodes=200,
                min_weight_fraction_leaf=0.05)
model = dtr(max_depth=4, min_samples_leaf=5, max_leaf_nodes=200,
            min_weight_fraction_leaf=0.05)

model = dtr(max_depth=3,
            min_samples_leaf=2,
            max_leaf_nodes=20,
            splitter='random')

model = AdaBoostRegressor(n_estimators=50, learning_rate=0.01,
                          loss='square', base_estimator=mb)
Example #33
X, Y = [], []  # (initialisation not shown in the snippet as scraped)
with open(dataset, 'r') as fh:
    for line in fh.readlines():
        (time,luma,brightness) = line.split(' ')
        X.append([float(time),float(luma)])
        Y.append(float(brightness.rstrip()))
X = np.array(X)
Y = np.array(Y)
stderr.write(' done\n')
stderr.flush()

# create predictive model
trees = 2000
stderr.write('model fitting averaging votes of %s randomized'
             ' trees..' % trees)
stderr.flush()
mymodel = rfr(n_estimators=trees, random_state=0, n_jobs=-1)
mymodel.fit(X,Y)
stderr.write(' done\n')
stderr.flush()

# generate testing dataset
stderr.write('define test dataset..')
stderr.flush()
T = []
Tentries = 0
for time in range(int(np.min(X[:,0])+0.5), int(np.max(X[:,0])+1.5), 500):
    for luma in range(int(np.min(X[:,1]+0.5)), int(np.max(X[:,1])+1.5), 100):
        T.append([time, luma])
        Tentries += 1
        if (Tentries % 5000) == 0:
            stderr.write('.')
# (the decision-tree GridSearchCV setup was truncated in the source; only its
#  closing argument, scoring='neg_mean_absolute_error', survived)

grid_result = gridsearch.fit(X_train, y_train)
grid_pred = gridsearch.predict(X_test)
grid_rmse = mean_squared_error(y_test, grid_pred)
print('Decision Tree MAE: ' + str(sum(abs(grid_pred - y_test))/(len(y_test))))
print('Decision Tree RMSE: ' + str(np.sqrt(grid_rmse)))
print(grid_result.best_params_)
print(abs(grid_result.best_score_))


#--------------------------RANDOM FOREST GRIDSEARCH----------------------------------------
""" the best values for each parameter came out as: 'max_depth'=14,
'min_samples_leaf'=1, 'min_samples_split'=2, and 'n_estimators'=12 using Dataset 3 """

gridsearch = GridSearchCV(estimator=rfr(random_state=4), cv=5,
                          param_grid={
                              'n_estimators': [10, 20, 40, 70, 100, 500, 1000],
                              # we selected n_estimators at approximately the point
                              # of diminishing performance returns, which is why the
                              # optimal value ended up being so low
                              'max_depth': [10, 20, 30, 40, 50],
                              'min_samples_split': [2, 3, 4, 5],
                              'min_samples_leaf': [1, 2, 3, 4, 5]
                              },
                          scoring='neg_mean_absolute_error')

grid_result = gridsearch.fit(X_train, y_train)
grid_pred = gridsearch.predict(X_test)
grid_rmse = mean_squared_error(y_test, grid_pred)
print('Random Forest MAE: ' + str(sum(abs(grid_pred - y_test))/(len(y_test))))
print('Random Forest RMSE: ' + str(np.sqrt(grid_rmse)))
import pandas as pd, numpy as np
from sklearn.ensemble import RandomForestRegressor as rfr

train = pd.read_csv('day.csv')
train = train.sample(frac=1, random_state=0, axis=0)

ytrain_ = train['cnt']
Xtrain_ = train.drop(['cnt', 'registered', 'casual', 'instant', 'dteday'], axis=1)
m = Xtrain_.shape[0]
split_point = 600

Xtrain = Xtrain_[:split_point]
Xtest = Xtrain_[split_point:]
ytrain = ytrain_[:split_point]
ytest = ytrain_[split_point:]

mtrain = Xtrain.shape[0]
mtest = Xtest.shape[0]
n = Xtrain.shape[1]

r = rfr(random_state=0, n_estimators=9, max_depth=7, min_samples_split=4)
r.fit(Xtrain, ytrain)
important_features = pd.Series(data=r.feature_importances_,index=Xtrain.columns)
important_features.sort_values(ascending=False,inplace=True)
print(important_features)

train_r2 = r.score(Xtrain, ytrain) * 100  # R^2, not classification accuracy
test_r2 = r.score(Xtest, ytest) * 100

print("Train R^2 : %.2f" % train_r2)
print("Test R^2 : %.2f" % test_r2)