Example #1
0
def main():
    fi = open('25-75_microcap_list.txt', 'r')
    symbols = []
    for i in fi:
        symbols.append(i.strip())
    #symbols = symbols[0:6]

    train, test = get_data(symbols, n = 30, flag = 1, blag = 12)

    train = train.replace([np.inf, -np.inf], np.nan)
    test = test.replace([np.inf, -np.inf], np.nan)

    train = train.dropna(axis=0)
    test = test.dropna(axis=0)

    print('Fitting\n')
    m = RandomForestRegressor(n_estimators=250, n_jobs=1)
    m.fit(train.iloc[:, 6:], train.iloc[:, 5])
    print('Predicting\n')
    preds = m.predict(test.iloc[:, 5:])

    result = test.iloc[:, :4]
    result['Prediction'] = preds
    result = result.sort_values('Prediction', ascending=False)
    print(result.head())
    result.to_csv('trade_result.csv', sep=',', index=False)
Example #2
0
def set_missing_ages(df):
    
    # Pull the existing numeric features into a frame for the Random Forest Regressor
    age_df = df[['Age','Fare', 'Parch', 'SibSp', 'Pclass']]

    # Split passengers into known-age and unknown-age groups
    known_age = age_df[age_df.Age.notnull()].values
    unknown_age = age_df[age_df.Age.isnull()].values

    # y is the target age
    y = known_age[:, 0]

    # X holds the feature values
    X = known_age[:, 1:]

    # Fit a RandomForestRegressor
    rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
    rfr.fit(X, y)

    # Predict the unknown ages with the fitted model
    predictedAges = rfr.predict(unknown_age[:, 1:])

    # Fill the original missing values with the predictions
    df.loc[ (df.Age.isnull()), 'Age' ] = predictedAges
    
    return df, rfr
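A minimal usage sketch for the imputation helper above, assuming the function (and its RandomForestRegressor import) is in scope and that the DataFrame has the same Titanic-style column names; the toy rows below are illustrative only, not the project's data:

import numpy as np
import pandas as pd

# Toy frame with one missing Age value
toy = pd.DataFrame({
    'Age':    [22.0, 38.0, np.nan, 35.0],
    'Fare':   [7.25, 71.28, 8.05, 53.10],
    'Parch':  [0, 0, 0, 0],
    'SibSp':  [1, 1, 0, 1],
    'Pclass': [3, 1, 3, 1],
})

toy_filled, fitted_rfr = set_missing_ages(toy)
print(toy_filled['Age'].isnull().sum())  # expected: 0, every Age is now filled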
Example #3
0
def get_preds(features, trees=3000, depth=19):  # features is the number of latent features that I want the NMF to run on
    # Create dataframes
    df = get_nmf(k=features)
    df_full = add_yahoo_to_df(df)
    df_train = add_dummies(df_full)   # Why aren't you using df_full?

    df_test = get_nmf('data_wednesday', k=features) # put in folder name where the json data is
    df_test_full = add_yahoo_to_df(df_test)
    df_test_full = add_dummies(df_test_full)

    # Create models
    X_model_class, y_model_class = get_classifier_data(df_full)
    rf_class = RandomForestClassifier(n_estimators=trees, max_depth=depth)
    rf_class.fit(X_model_class, y_model_class)
    #
    X_model_regress, y_model_regress = get_regressor_data(df_full)
    rf_regress = RandomForestRegressor(n_estimators=trees, max_depth=depth)
    rf_regress.fit(X_model_regress, y_model_regress)

    # Get X and y values
    X_classify, y_classify = get_classifier_data(pd.DataFrame(df_test_full.loc['2016-04-11']))
    X_regress, y_regress = get_regressor_data(pd.DataFrame(df_test_full.loc['2016-04-11']))

    # Run models

    classifier_preds = rf_class.predict(X_classify)
    classifier_accuracy = accuracy_score(classifier_preds, y_classify)

    regressor_preds = rf_regress.predict(X_regress)
    regressor_mse = mean_squared_error(regressor_preds, y_regress)

    # I want to return the number of features, k, along with the accuracy of the classifier
    # and the MSE of the regressor.  This will give me an idea of how well things are doing
    # based on the number of features.
    return [features, classifier_accuracy, regressor_mse]
Example #4
0
def train_year(train_fea, trees):
    values = train_fea['SaleYear'].values
    years = sorted(list(set(values)))
    rfs =[]
    for i in range(0, len(years)):
        print('train model %d' % (years[i]))
        rf = RandomForestRegressor(n_estimators=trees, n_jobs=1)  # compute_importances was removed; importances are always available
        y = train_fea[train_fea['SaleYear']==years[i]]
        y_fea = y.copy()
        del y_fea['SalePrice']
        rf.fit(y_fea, y["SalePrice"])
        rfs.append(rf)
    errors = None
    for i in range(1, len(years)):
        pairs = get_pairs(years, i)
        for p in pairs:
            print('compare %d, %d' % (p[0], p[1]))
            y1 = train_fea[train_fea['SaleYear']==p[0]]
            y2 = train_fea[train_fea['SaleYear']==p[1]]
            y1_fea, y2_fea = y1.copy(), y2.copy()
            del y1_fea['SalePrice']
            del y2_fea['SalePrice']
            rf = rfs[years.index(p[0])]
            y2_p = rf.predict(y2_fea)
            y2_r = np.array([v for v in y2['SalePrice']])
            error_rates = np.array([math.fabs(a - b) / b for a, b in zip(y2_p, y2_r)])
            row = pd.DataFrame({'dist': i, 'mean': error_rates.mean(), 'var': error_rates.var(), 'std': error_rates.std()}, index=[i])
            errors = row if errors is None else pd.concat([errors, row])
    errors_list = []
    for i in range(1, len(years)):
        errors_list.append(errors.loc[i]['mean'].mean())
    return rfs, errors_list
def cross_validate(features_target):
    features = features_target[0]
    target = features_target[1]
    rf = RandomForestRegressor(
        n_estimators=100, verbose=2, n_jobs=1, min_samples_split=10, random_state=1
    )

    cv = KFold(n_splits=10).split(features)  # sklearn.model_selection.KFold replaces the removed cross_validation module

    # iterate through the training and test cross validation segments and
    # run the classifier on each one, aggregating the results into a list
    results = []
    i = 1
    for traincv, testcv in cv:
        print("Running fold " + str(i))
        fit = rf.fit(features[traincv], target[traincv])
        predictions = fit.predict(features[testcv])
        predictions = predictions.flatten()

        for j in range(len(predictions)):
            results.append((target[testcv][j], predictions[j]))

        importance(rf)
        i = i + 1

    combined_auc(results)
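For reference, a minimal sketch of the same cross-validation loop with the current scikit-learn API (model_selection.KFold and cross_val_score); the data is synthetic and the scoring is plain R^2 rather than the project's importance/combined_auc helpers:

from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score

X_demo, y_demo = make_regression(n_samples=200, n_features=10, noise=0.5, random_state=1)
rf_demo = RandomForestRegressor(n_estimators=100, min_samples_split=10, random_state=1)

# 10-fold CV; cross_val_score runs the fit/predict loop that cross_validate() above writes by hand
cv = KFold(n_splits=10, shuffle=True, random_state=1)
scores = cross_val_score(rf_demo, X_demo, y_demo, cv=cv)
print(scores.mean(), scores.std())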
def pred_pH(train, val, test, all_vars, loop):
    data = (val, test, train)
    # variable selection
    pH_lassoed_vars = lass_varselect(train, all_vars, 'pH', .00000001)
    univ_selector = SelectKBest(score_func = f_regression, k = 1200)
    univ_selector.fit(train[all_vars], train['pH'])
    pvals = univ_selector.get_support()
    chosen =  []
    for x in range(0, len(all_vars)):
        if pH_lassoed_vars[x] | pvals[x]:
            chosen.append(all_vars[x])
    lass_only =  []
    for x in range(0, len(all_vars)):
        if pH_lassoed_vars[x]:
            lass_only.append(all_vars[x])
    # random forest (the variable name is left over from an earlier nearest-neighbour version)
    neigh = RandomForestRegressor(n_estimators=100)
    neigh.fit(train.loc[:, chosen], train['pH'])
    for dset in data:
        dset['pH_for_prds'] = neigh.predict(dset.loc[:, chosen])
    # lasso
    lass = Lasso(alpha=.000000275, positive=True)
    lass.fit(train[all_vars], train['pH'])
    for dset in data:
        dset['pH_las_prds'] = lass.predict(dset[all_vars])
    # ridge
    pH_ridge = RidgeCV(np.array([.6]), normalize=True)
    pH_ridge.fit(train[all_vars], train['pH'])
    for dset in data:
        dset['pH_rdg_prds'] = pH_ridge.predict(dset[all_vars])
    # combination
    models= [ 'pH_rdg_prds', 'pH_las_prds', 
              'pH_for_prds', 'pH_for_prds' ] 
    name = 'pH_prds' + str(object=loop)
    write_preds(models, name, train, val, test, 'pH')
def pipeline():
        val = data[data.watch==1]
        val_a_b = val[['item_id','store_code','a','b']]
        val_y = val.label
        val_x = val.drop(['label','watch','item_id','store_code','a','b'],axis=1)

        train = data[(data.watch!=1)&(data.watch!=0)]
        train_y = train.label

        
        a = list(train.a)
        b = list(train.b)
        train_weight = []
        for i in range(len(a)):
            train_weight.append(min(a[i],b[i]))
        train_weight = np.array(train_weight)

        train_x = train.drop(['label','watch','item_id','store_code','a','b'],axis=1)

        train_x.fillna(train_x.median(),inplace=True)
        val_x.fillna(val_x.median(),inplace=True)
        

        model = RandomForestRegressor(n_estimators=500,max_depth=5,max_features=0.6,n_jobs=-1,random_state=1024)

        #train
        model.fit(train_x, train_y, sample_weight=train_weight)


        #predict val set
        val_a_b['pred'] = model.predict(val_x)
        val_a_b['y'] = val_y
        cost = cal_cost(val_y.values, val_a_b.pred.values, val_a_b.a.values, val_a_b.b.values)
        val_a_b.to_csv('val_{0}.csv'.format(cost[1]),index=None)
def backward_best_features_per_cluster(X, Y, all_feature_metadata):
    best_features_per_cluster = {}
    for c in sorted(X['cluster'].unique()):
        seg_X, seg_Y = X[X['cluster'] == c], Y[Y['cluster'] == c].ALSFRS_slope
        print("cluster:", c, "with size:", seg_X.shape, "with mean target:", seg_Y.mean(), "std:", seg_Y.std())
        seg_Y = seg_Y.fillna(seg_Y.mean())
        
        model = RandomForestRegressor(min_samples_leaf=60, random_state=0, n_estimators=1000).fit(seg_X, seg_Y)
        print("best we can do with all features:", np.sqrt(np.mean((model.predict(seg_X) - seg_Y) ** 2)))

        selected_fams = set(all_feature_metadata.keys())
        selected_derived = set([])
        for fam in selected_fams:
            selected_derived.update([der for der in all_feature_metadata[fam]['derived_features']])
        while len(selected_fams) > 6:
            score_per_family = {}
            t1 = time.time()
            for family, fm in all_feature_metadata.items():
                if family in selected_fams:
                    X_feature_fam = seg_X[list(selected_derived - set(fm["derived_features"]))]
                    model = RandomForestRegressor(min_samples_leaf=60, random_state=0, n_estimators=1000).fit(
                        X_feature_fam, seg_Y)
                    score_per_family[family] = np.sqrt(np.mean((model.predict(X_feature_fam) - seg_Y) ** 2))
            t_lasso_cv = time.time() - t1
            worst_fam = sorted(score_per_family.items(), key=operator.itemgetter(1), reverse=True)[0]
            print("removing worst family:", worst_fam, "time:", t_lasso_cv)
            selected_fams.remove(worst_fam[0])
            selected_derived = set([])
            for fam in selected_fams:
                selected_derived.update([der for der in all_feature_metadata[fam]['derived_features']])
        best_features_per_cluster[c] = list(selected_fams)                          
    return best_features_per_cluster
def do_regression(df, j, i, k): # input is a pandas dataframe with columns as needed below
			# output is a regression object trained to the data in the input dataframe

	
	# convert dataframe info into a vector
				
	y   = df.loc[ (df['workingday'] == j) & (df['Hour'] == i) & (df['Year'] == 2011 + k), 'count' ].astype(int).values
	x_1 = df.loc[ (df['workingday'] == j) & (df['Hour'] == i) & (df['Year'] == 2011 + k), 'humidity' ].astype(int).values
	x_2 = df.loc[ (df['workingday'] == j) & (df['Hour'] == i) & (df['Year'] == 2011 + k), 'temp' ].astype(int).values
	x = list(zip(x_1, x_2))  # materialize the pairs so scikit-learn receives a 2-D array-like
				
	## Create linear regression object
	#regr = linear_model.LinearRegression()
	
	# create random forest object, should include all parameters
	regr = RandomForestRegressor(n_estimators= 100)
	#forest = DecisionTreeRegressor(max_depth = 4)
	
	## Train the model using the training sets
	
	regr.fit(x, y)



	return regr
Example #10
0
def fill_missing_age(df):
    # Pull the existing numeric features into a frame for the Random Forest Regressor
    age_df = df[['Age','Fare','Parch','SibSp','Pclass']]
    #print age_df

    # Split passengers into known-age and unknown-age groups
    known_age = age_df[age_df.Age.notnull()].values
    unknown_age = age_df[age_df.Age.isnull()].values
    # print "known_age......."
    # print known_age
    # print "unknown age ........"
    # print unknown_age

    # target age
    y=known_age[:,0]

    # feature values
    x=known_age[:,1:]

    # fit a RandomForestRegressor
    RFR=RandomForestRegressor(random_state=0,n_estimators=2000,n_jobs=-1)
    RFR.fit(x,y)

    # predict the unknown ages with the fitted model
    predictedAge= RFR.predict(unknown_age[:,1:])

    # fill the original missing values with the predictions
    df.loc[(df.Age.isnull()),'Age']=predictedAge
    return df,RFR
Example #11
0
def RFscore_one(x,y,id):
    folds=3
    print("RFscore " + id)
    r = np.random.permutation(len(x))  # shuffled index array; a range object cannot be shuffled in Python 3
    x = x[r]
    y = y[r]
    x = (x - np.mean(x)) / np.std(x)
    y = (y - np.mean(y)) / np.std(y)
    
    x = np.array(x, ndmin=2)
    y = np.array(y, ndmin=2)
    
    x = x.T
    y = y.T
    
    rf = RandomForestRegressor(n_estimators=50, verbose=0, n_jobs=1, min_samples_split=10, random_state=1)  # compute_importances was removed from scikit-learn
    fit = rf.fit(x,y)

    s = fit.score(x,y)
    
    cv = KFold(n_splits=folds).split(x)  # sklearn.model_selection.KFold replaces the removed cross_validation module
    score = 0
    median = dist(y)
    for traincv, testcv in cv:
        fit = rf.fit(x[traincv], y[traincv])
        score += fit.score(x[testcv], y[testcv])

    score /= folds
    score /= median
    return score
Example #12
0
def cross_val(seq, ft):
    n_folds = 10
    X, y = load_train_data(seq, ft)

    print('%d-fold cross validation. Dataset: %d samples, %d features' % (n_folds, X.shape[0], X.shape[1]))

    kf = KFold(n_splits=n_folds)  # model_selection.KFold; split() is called inside the loop below
    n_est = range(30, 110, 20)

    results = []
    for n_estimators in n_est:
        scores = []
        for i, (train, test) in enumerate(kf.split(X)):
            rf = RandomForestRegressor(n_estimators=n_estimators, n_jobs=mp.cpu_count())
            # the (default) score for each regression tree in the ensemble is regression
            # r2 determination coefficient (e.g., how much variance in y is explained
            # by the model)
            # https://www.khanacademy.org/math/probability/regression/regression-correlation/v/r-squared-or-coefficient-of-determination
            rf.fit(X[train], y[train])

            if False:
                y_pred = rf.predict(X[test])
                score = mean_squared_error(y_pred, y[test])
            else:
                score = rf.score(X[test], y[test])
            scores.append(score)
        scores = np.array(scores)
        print("n_estimators=%d; accuracy (R^2 score): %0.2f (+/- %0.2f)" % (n_estimators, scores.mean(), scores.std() * 2))
        results.append([seq, ft, X.shape[0], n_estimators, scores.mean(), scores.std()*2])
    return results
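As the comment inside cross_val() notes, RandomForestRegressor.score returns the R^2 coefficient of determination; a small sketch on synthetic data confirming it matches sklearn.metrics.r2_score:

import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

X_demo, y_demo = make_regression(n_samples=300, n_features=8, noise=1.0, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, random_state=0)

rf = RandomForestRegressor(n_estimators=50, random_state=0).fit(X_tr, y_tr)
# score() and r2_score() agree: R^2 = 1 - SS_res / SS_tot
assert np.isclose(rf.score(X_te, y_te), r2_score(y_te, rf.predict(X_te)))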
Example #13
0
 def fit(self, X, y, **kwargs):
     for key, value in kwargs.items():
         if key in self.INITPARAMS.keys():
             self.INITPARAMS[key] = value
     model = RandomForestRegressor(**self.INITPARAMS)
     model.fit(X, y)
     self.model = model
Example #14
0
def main():
	train = pd.read_csv('../train.csv', parse_dates=['datetime'])
	train['hour'] = pd.DatetimeIndex(train['datetime']).hour
	train['weekday'] = pd.DatetimeIndex(train['datetime']).weekday
	train['isweekend'] = 0
	train.loc[(train['weekday']==5) | (train['weekday']==6), 'isweekend'] = 1
        
	test = pd.read_csv('../test.csv', parse_dates=['datetime'])
	test['hour'] = pd.DatetimeIndex(test['datetime']).hour
	test['weekday'] = pd.DatetimeIndex(test['datetime']).weekday
	test['isweekend'] = 0
	test.loc[(test['weekday']==5) | (test['weekday']==6), 'isweekend'] = 1


	results = pd.DataFrame(columns=['datetime', 'count'])	
	for hour, test_subset in test.groupby(test['hour']):
	    train_subset = train[train['hour'] == hour]
	    model = RandomForestRegressor(n_estimators=100)
	    model.fit(np.array(get_features(train_subset)), np.array(train_subset['count']))
	    predictions = model.predict(np.array(get_features(test_subset)))
	    dt = test_subset['datetime']
	    predictions = pd.Series(predictions, index=dt.index)
	    res = pd.concat([dt, predictions], axis=1)
	    res.columns=['datetime', 'count']
	    results = pd.concat([results, res])

	results['count'] = results['count'].astype('int64')
	results = results.sort_values('datetime')
	results.to_csv('../submissions/seventhSubmission.csv', index=False)
def regression(X_train, y_train, X_test, y_test):
    """
Train the regressor from Scikit-Learn.
"""
    # Random forest regressor w/ param optimization
    params = {'n_estimators':1000, 'criterion':'squared_error', 'max_depth':20, 'min_samples_split':2, #'estimators':400, depth:20; 'mse' and min_samples_split=1 are no longer accepted
              'min_samples_leaf':1, 'max_features':2, 'bootstrap':True, 'oob_score':False, #'max_features':'log2'
              'n_jobs':32, 'random_state':0, 'verbose':0, 'max_leaf_nodes':None}  # 'min_density' was removed from scikit-learn
    if config.DEBUG: params['verbose'] = 1

    regr = RandomForestRegressor(**params)

    # Train the model using the training sets
    regr.fit(X_train, y_train)
    return regr  # NOTE: this early return leaves the plotting/debug block below unreachable

    # Plot the results
    save_semeval_data.plot_results(regr, params, X_test, y_test, feature_names)

    if config.DEBUG:
        # Show the mean squared error
        print("Residual sum of squares: %.2f" % np.mean((regr.predict(X_test) - y_test) ** 2))
        # Explained variance score: 1 is perfect prediction
        print('Variance score: %.2f' % regr.score(X_test, y_test))
    
    return regr
Example #16
0
def randomforest(data, targets, num, fnum):
    """
    7:1205
    """
    model = RandomForestRegressor(n_estimators=num, verbose=0, oob_score=True, n_jobs=10, criterion="squared_error", max_features=fnum)  # compute_importances was removed; "mse" is now "squared_error"
    model.fit(data, targets)
    return model
Example #17
0
File: model.py Project: kymo/kaggle
class RandomForestModel(Model):
	""" random forest model """
	def __init__(self, *argv, **args):
		super(RandomForestModel, self).__init__(*argv)
        
		self.rf = RandomForestRegressor(**args)
	
	def pretreat_feature(self):
		# pre-handle about the feature data
		pass

	def train(self):
		# train the samples
		self.rf.fit(self.x, self.y)
	
	def assess(self):
		# assess the regression model
		error = 0.0
		for j in range(len(self.test_x)):
			pre_val = self.predict(self.test_x[j])
			error += (pre_val - self.test_y[j]) ** 2
		print('Training Error: ', error)
		
    
	def predict(self, x):
		# predict the output for x
		return self.rf.predict(x)

	def validate(self):
		# use cross-validation to choose the best meta-parameter
		pass
Example #18
0
 def buildForest(self, X_train, y_train):
     NUM_TREES = 100
     NUM_JOBS = 1
     FEATURES_IN_EACH_TREE = "sqrt"
     rf = RandomForestRegressor(n_estimators=NUM_TREES, verbose=1, n_jobs=NUM_JOBS, max_features=FEATURES_IN_EACH_TREE, oob_score=True, max_depth=25)
     rf.fit(X_train, y_train)  # RandomForestRegressor no longer provides fit_transform
     return rf
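Because the forest above is built with oob_score=True, the out-of-bag R^2 estimate is available after fitting; a short sketch on synthetic data (not the project's):

from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor

X_demo, y_demo = make_regression(n_samples=500, n_features=12, noise=2.0, random_state=7)
rf = RandomForestRegressor(n_estimators=100, oob_score=True, random_state=7)
rf.fit(X_demo, y_demo)

# R^2 estimated from the samples each tree did not see in its bootstrap
print("OOB R^2:", rf.oob_score_)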
Example #19
0
File: trade.py Project: iswdp/trade
def main():
    fi = open('45-165caps.txt', 'r')
    symbols = []
    for i in fi:
        symbols.append(i.strip())
    #symbols = symbols[0:6]

    train, test = build_data(symbols, n = 200, flag = 1, blag = 20)

    train = train.replace([np.inf, -np.inf], np.nan)
    test = test.replace([np.inf, -np.inf], np.nan)

    train = train.dropna(axis=0)
    test = test.dropna(axis=0)

    #print train.head().T
    #print test.head().T

    print('Fitting\n')
    m = RandomForestRegressor(n_estimators=500, n_jobs=10)
    m.fit(train.iloc[:, 5:], train.iloc[:, 4])
    print('Predicting\n')
    preds = m.predict(test.iloc[:, 4:])

    result = test.iloc[:, :4]
    result['Prediction'] = preds
    result = result.sort_values('Prediction', ascending=False)
    print(result.head())
    result.to_csv('trade_result.csv', sep=',', index=False)
    def _create_random_forest(self, current_param={}):
        combined_param = dict(self.params, **current_param)
        clf = RandomForestRegressor()
        clf.set_params(**combined_param)
        clf = clf.fit(self.Xtr, self.Ytr)

        return clf
Example #21
0
	def rf_regressor(self):
		X = self.X.toarray()  # assumes the sparse feature matrix and target are stored on the instance (not shown in this snippet)
		y = self.y
		X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

		model = RandomForestRegressor(n_estimators=100, oob_score=True, random_state=42)
		model.fit(X_train, y_train)
		return round(model.score(X_test, y_test), 2)
Example #22
0
def do_rf(filename):
    df, Y = create_merged_dataset(filename)
    rf = RandomForestRegressor(n_estimators=100)
    X = df.drop(['driver', 'trip'], 1)
    rf.fit(X, Y)
    probs = rf.predict(X[:200])
    return pd.DataFrame({'driver': df['driver'][:200], 'trip': df['trip'][:200], 'probs': probs})
Example #23
0
def random_learning(labels, train, test):
    label_log=np.log1p(labels)
    clf=RandomForestRegressor(n_estimators=50, n_jobs=3)
    model=clf.fit(train, label_log)
    preds1=model.predict(test)
    preds=np.expm1(preds1)
    return  preds
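The log1p/expm1 pair above is a manual target transform; a hedged alternative is scikit-learn's TransformedTargetRegressor, which applies the same forward/inverse functions automatically (synthetic data below, not the project's):

import numpy as np
from sklearn.compose import TransformedTargetRegressor
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor

X_demo, y_demo = make_regression(n_samples=200, n_features=5, random_state=3)
y_demo = np.abs(y_demo)  # keep the target non-negative so log1p is well defined

model = TransformedTargetRegressor(
    regressor=RandomForestRegressor(n_estimators=50, random_state=3),
    func=np.log1p,          # applied to y before fitting, like label_log above
    inverse_func=np.expm1,  # applied to predictions, like the expm1 call above
)
model.fit(X_demo, y_demo)
preds = model.predict(X_demo[:5])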
Example #24
0
def main():
    # read in  data, parse into training and target sets
    cols, train = read_data("../TrainingSet/ACT12_competition_training.csv", 1)
    target = np.array([x[0] for x in train])

    train = filter_cols(train, cols, "../selected/selected_12.txt")
    # print("Train: ", len(train), " cols:", len(train[0]))
    train = np.array(train)

    # In this case we'll use a random forest, but this could be any regressor
    cfr = RandomForestRegressor(n_estimators=500, max_features=(len(train[0]) // 3), n_jobs=8)

    # Simple K-Fold cross validation. 5 folds.
    cv = KFold(n_splits=5).split(train)  # sklearn.model_selection.KFold replaces the removed cross_validation module

    # iterate through the training and test cross validation segments and
    # run the classifier on each one, aggregating the results into a list
    results = []
    for traincv, testcv in cv:
        ft = cfr.fit(train[traincv], target[traincv])
        pred = ft.predict(train[traincv])
        print(pred[:10])
        score = ft.score(train[traincv], target[traincv])
        results.append(score)
        print("\tFold %d: %f" % (len(results), score))

    # print out the mean of the cross-validated results
    print("Results: " + str(np.array(results).mean()))
Example #25
0
    def train_with_features(self, features):
        X = self.data_folder.truncate(self.A, features)

        rfc = RandomForestRegressor()
        rfc.fit(X, self.target)

        return rfc
    def test_rrf_vs_sklearn_reg(self):
        """Test R vs. sklearn on boston housing dataset. """
        from sklearn.datasets import load_boston
        from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed
        from sklearn.metrics import mean_squared_error
        from sklearn.ensemble import RandomForestRegressor

        boston = load_boston()
        X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target,
                                                            test_size=0.2, random_state=13)

        n_samples, n_features = X_train.shape
        mtry = int(np.floor(0.3 * n_features))
        # do 100 trees
        r_rf = RRFEstimatorR(**{'ntree': 100, 'nodesize': 1, 'replace': 0,
                                'mtry': mtry, 'corr.bias': False,
                                'sampsize': n_samples, 'random_state': 1234})
        r_rf.fit(X_train, y_train)
        y_pred = r_rf.predict(X_test)
        r_mse = mean_squared_error(y_test, y_pred)

        p_rf = RandomForestRegressor(n_estimators=100, min_samples_leaf=1, bootstrap=False,
                                     max_features=mtry, random_state=1)
        p_rf.fit(X_train, y_train)
        y_pred = p_rf.predict(X_test)
        p_mse = mean_squared_error(y_test, y_pred)
        print('%.4f vs %.4f' % (r_mse, p_mse))
        # should be roughly the same (7.6 vs. 7.2)
        np.testing.assert_almost_equal(r_mse, p_mse, decimal=0)
Example #27
0
    def refit_from_scratch(self):
        """ Create a new model directly from the database, rather
         than rely on the one saved from last time."""
        # In the background fit a much larger random forest.
        self.threaded_fit = ThreadedFit()
        self.threaded_fit.signal_finished.connect(self.__init__)
        self.threaded_fit.start()

        temp_model = RandomForest(max_features="sqrt", n_jobs=-1)
        temp_enc   = CountVectorizer()
        X = []   # binary matrix the presence of tags
        Z = []   # additional numerical data
        Y = []   # target (to predict) values
        db_size = self.db.size()
        for data in self.db.yield_some(250):
            feedback = data["feedback"]
            tags     = data[  "tags"  ]
            if feedback and tags:
                Y.append(   feedback   )
                X.append(" ".join(tags))
                Z.append(self.fmt_numerical(data))

        X = temp_enc.fit_transform(X)
        X = hstack((X, coo_matrix(Z)))
        self.allX = X
        pca = PCA(min(X.shape[0], 200))
        reduced_X = pca.fit_transform(X.todense())
        temp_model.fit(reduced_X, Y)

        self.pca   = pca
        self.model = temp_model
        self.enc   = temp_enc
def get_kernel(train_data, test_data, label):

    #Define forest (n_estimators = number of trees)
    forest = RandomForestRegressor(n_estimators=1000, warm_start = True)
    forest = forest.fit(train_data, label)

    dataset = np.concatenate((train_data, test_data), axis=0)

    SAMPLE_SIZE = len(dataset)
    M = 100

    #Loop that generates samples of the PDF
    kernel_list = np.empty([M, SAMPLE_SIZE, SAMPLE_SIZE])
    for m in range(M):
        print("Building partial kernel: {}".format(m))
        kernel_list[m,:,:] = get_partial_kernel(forest, dataset)

    #Average the samples to compute the kernel
    kernel = np.mean(kernel_list, axis=0)

    # B = np.zeros((SAMPLE_SIZE, SAMPLE_SIZE))
    # I = np.identity(SAMPLE_SIZE)
    # alpha = 0.1

    # for m in range(M):
    #     B += np.linalg.inv(kernel_list[m,:,:] + alpha * I)

    # B *= M
    # return B

    return kernel
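get_partial_kernel is not defined in this snippet; one plausible, purely hypothetical reading is a proximity kernel built from a random subset of trees (the fraction of sampled trees in which two rows share a leaf). The original presumably varies across its M calls; this sketch does so by resampling trees:

import numpy as np

def get_partial_kernel(forest, dataset, n_trees=10, rng=None):
    """Hypothetical helper: proximity kernel from a random subset of trees.
    This is an assumption about the original project's code, not its actual implementation."""
    rng = np.random.default_rng(rng)
    leaves = forest.apply(dataset)  # (n_samples, n_total_trees) leaf indices
    picked = rng.choice(leaves.shape[1], size=min(n_trees, leaves.shape[1]), replace=False)
    n = leaves.shape[0]
    kernel = np.zeros((n, n))
    for t in picked:
        kernel += (leaves[:, t][:, None] == leaves[:, t][None, :])
    return kernel / len(picked)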
def round2(X, y):
    # Set parameters
    min_score = {}
    for tree in [50, 100, 200, 500]:
        for feature in [1.0, 'log2']:  # 'auto' was removed from scikit-learn; 1.0 (all features) matches the old regressor default
            model = RandomForestRegressor(n_estimators=tree, max_features=feature)
            n = len(y)

            # Perform 5-fold cross validation
            scores = []
            kf = KFold(n_splits=5, shuffle=True)  # model_selection.KFold API

            # Calculate root mean squared error for train/test for each fold
            for train_idx, test_idx in kf.split(X):
                X_train, X_test = X[train_idx], X[test_idx]
                y_train, y_test = y[train_idx], y[test_idx]
                model.fit(X_train, y_train)
                prediction = model.predict(X_test)
                rmse = np.sqrt(mean_squared_error(y_test, prediction))
                scores.append(rmse)
            if len(min_score) == 0:
                min_score['estimator'] = tree
                min_score['max_feature'] = feature
                min_score['scores'] = scores
            else:
                if np.mean(scores) < np.mean(min_score['scores']):
                    min_score['estimator'] = tree
                    min_score['max_feature'] = feature
                    min_score['scores'] = scores

            print("Estimator:", tree)
            print("Max Features:", feature)
            print(scores)
            print(np.mean(scores))
    return min_score
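round2() above is a hand-rolled grid search; a hedged sketch of the equivalent search with GridSearchCV and RMSE scoring on synthetic data:

from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, KFold

X_demo, y_demo = make_regression(n_samples=300, n_features=10, noise=1.0, random_state=5)

param_grid = {'n_estimators': [50, 100, 200, 500],
              'max_features': [1.0, 'log2']}
search = GridSearchCV(
    RandomForestRegressor(random_state=5),
    param_grid,
    scoring='neg_root_mean_squared_error',  # GridSearchCV maximizes, hence the negated RMSE
    cv=KFold(n_splits=5, shuffle=True, random_state=5),
)
search.fit(X_demo, y_demo)
print(search.best_params_, -search.best_score_)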
def build_random_forest_regressor(X_test, X_train_full, y_train_full):

    print("Building random forest regressor...")

    rf = RandomForestRegressor(n_estimators=800)
    probas_rf = rf.fit(X_train_full, y_train_full).predict(X_test)
    return probas_rf
Example #31
0
"""

from sklearn.preprocessing import StandardScaler

sc_x = StandardScaler()
X = sc_x.fit_transform(X)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=.2,
                                                    random_state=42,
                                                    shuffle=True)

random_forest = RandomForestRegressor(n_estimators=100)
random_forest.fit(X_train, y_train)
rand_pred = random_forest.predict(X_test)
print('train score for random_forest:', random_forest.score(X_train, y_train))
print('test score for random_forest:', random_forest.score(X_test, y_test))

y_pred = random_forest.predict(X_test)

from sklearn.model_selection import cross_val_score

clf = RandomForestRegressor()
scores = cross_val_score(clf, X_test, y_test, cv=5)
scores.mean()

#mean absolute error in $
mae = mean_absolute_error(y_test, y_pred)
imp_features_model.feature_importances_

# lets plot it
#plot graph of feature importances for better visualization
feat_importances = pd.Series(imp_features_model.feature_importances_,
                             index=X.columns)
feat_importances.nlargest(5).plot(kind='barh')

# train test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

from sklearn.ensemble import RandomForestRegressor

RFRModel = RandomForestRegressor()

#hyperparameter tuning
import numpy as np

n_estimators = [int(x) for x in np.linspace(start=100, stop=1200, num=12)]

from sklearn.model_selection import RandomizedSearchCV

#Randomized Search CV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=100, stop=1200, num=12)]
# Number of features to consider at every split
max_features = [1.0, 'sqrt']  # 'auto' was removed from recent scikit-learn; 1.0 means all features
# Maximum number of levels in tree
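The snippet stops while building the parameter grid; a hedged sketch of how such a grid is typically passed to RandomizedSearchCV (the max_depth entry below is an illustrative addition, not the original notebook's):

from sklearn.model_selection import RandomizedSearchCV

param_distributions = {
    'n_estimators': n_estimators,   # grid built above
    'max_features': max_features,   # grid built above
    'max_depth': [int(x) for x in np.linspace(5, 30, num=6)] + [None],  # illustrative addition
}
search = RandomizedSearchCV(
    RFRModel,                       # the RandomForestRegressor created above
    param_distributions=param_distributions,
    n_iter=10,                      # sample 10 random combinations
    cv=3,
    random_state=42,
)
# search.fit(X_train, y_train)      # using the train/test split created above
# print(search.best_params_)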
Example #33
0
from sklearn.datasets import make_regression
from sklearn.ensemble import (GradientBoostingRegressor, RandomForestRegressor,
                              VotingRegressor)
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from mlprodict.onnxrt import OnnxInference
from onnxruntime import InferenceSession
from skl2onnx import to_onnx
from skl2onnx.tutorial import measure_time

N = 11000
X, y = make_regression(N, n_features=10)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.01)
print("Train shape", X_train.shape)
print("Test shape", X_test.shape)

reg1 = GradientBoostingRegressor(random_state=1)
reg2 = RandomForestRegressor(random_state=1)
reg3 = LinearRegression()
ereg = VotingRegressor([('gb', reg1), ('rf', reg2), ('lr', reg3)])
ereg.fit(X_train, y_train)

#################################
# Measure the processing time
# +++++++++++++++++++++++++++
#
# We use function :func:`skl2onnx.tutorial.measure_time`.
# The page about `assume_finite <https://scikit-learn.org/
# stable/modules/generated/sklearn.config_context.html>`_
# may be useful if you need to optimize the prediction.
# We measure the processing time per observation whether
# or not an observation belongs to a batch or is a single one.
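The comment block above is about timing single-observation versus batch predictions; a generic sketch with time.perf_counter (deliberately not using skl2onnx's measure_time helper, whose exact signature is not shown here):

import time

def per_observation_latency(model, X, repeat=10):
    # one observation at a time
    t0 = time.perf_counter()
    for _ in range(repeat):
        for row in X:
            model.predict(row.reshape(1, -1))
    single = (time.perf_counter() - t0) / (repeat * len(X))

    # whole batch at once
    t0 = time.perf_counter()
    for _ in range(repeat):
        model.predict(X)
    batch = (time.perf_counter() - t0) / (repeat * len(X))
    return single, batch

# e.g. per_observation_latency(ereg, X_test[:100]) for the voting ensemble fitted above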
orig_train_X = orig_train[:,:-1]
orig_train_y = orig_train[:,-1]
test_X = test[:,:-1]
test_y = test[:,-1]
print('--------')
train, val = train_test_split(orig_train, test_size = 0.2)
print('train shape : ',train.shape,
      'val shape : ',val.shape)
train_X = train[:, :-1]   # use the split created above rather than the full original set
train_y = train[:, -1]
val_X = val[:, :-1]       # validate on the held-out split, not on the test set
val_y = val[:, -1]
# build model
model_lr = LinearRegression()
model_svr = SVR()
model_rfr = RandomForestRegressor()

# train model
model_lr = model_lr.fit(train_X, train_y)
model_svr = model_svr.fit(train_X, train_y)
model_rfr = model_rfr.fit(train_X, train_y)
# exit(1)
# validate model
predict_lr = model_lr.predict(val_X)
predict_svr = model_svr.predict(val_X)
predict_rfr = model_rfr.predict(val_X)

# model selection based on validation MSE
print('LR val MSE score : ', mean_squared_error(val_y, predict_lr),
      'SVR val MSE score : ', mean_squared_error(val_y, predict_svr),
      'RFR val MSE score : ', mean_squared_error(val_y, predict_rfr))
Example #35
0
y = crop_pred_dataset.iloc[:, -1].values


#Converting categorical values
ct = ColumnTransformer([('encoder', OneHotEncoder(), [3])], remainder = 'passthrough')
x = ct.fit_transform(x).toarray()

#Splitting training and testing dataset
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

#regression model
# regressor = LinearRegression()
# regressor.fit(x_train, y_train)
# print(regressor.score(x_test,y_test ))

randomregressor = RandomForestRegressor(n_estimators=10, random_state=0)
randomregressor.fit(x_train, y_train)
print(randomregressor.score(x_test,y_test)*100)


dic={'Bajra':0.0,'Banana':0.0,'Barley':0.0,'Bean':0.0,'Black pepper':0.0,'Blackgram':0.0,'Bottle Gourd':0.0,'Brinjal':0.0,
              'Cabbage':0.0,'Cardamom':0.0,'Carrot':0.0,'Castor seed':0.0,'Cauliflower':0.0,'Chillies':0.0,'Colocosia':0.0,'Coriander':0.0,
              'Cotton':0.0,'Cowpea':0.0,'Drum Stick':0.0,'Garlic':0.0,'Ginger':0.0,'Gram':0.0,'Grapes':0.0,'Groundnut':0.0,'Gaur seed':0.0,'Horse-gram':0.0,
              'Jowar':0.0,'Jute':0.0,'Khesari':0.0,'Lady Finger':0.0,'Lentil':0.0,'Linseed':0.0,'Maize':0.0,'Mesta':0.0,'Moong':0.0,'Moth':0.0,'Onion':0.0,
              'Orange':0.0,'Papaya':0.0,'Peas':0.0,'Pineapple':0.0,'Potato':0.0,'Raddish':0.0,'Ragi':0.0,'Rice':0.0,'Safflower':0.0,'Sannhamp':0.0,'Sesamum':0.0,
              'Soyabean':0.0,'Sugarcane':0.0,'Sunflower':0.0,'Sweet potato':0.0,'Tapioca':0.0,'Tomato':0.0,'Turmeric':0.0,'Urad':0.0,'Varagu':0.0,'Wheat':0.0
              }


lis=list(dic)
print('Enter City Name:')
#splitting data into two sets : Training and Testing
X_train, X_test, Y_train, Y_test = train_test_split(Final_PUBG,
                                                    target,
                                                    test_size=0.33,
                                                    random_state=0)
STD = StandardScaler()
X_train = STD.fit_transform(X_train)
X_test = STD.transform(X_test)

#Training Model
bp = {
    'criterion': 'squared_error',  # formerly 'mse'
    'max_depth': 10,
    'min_samples_leaf': 3,
    'min_samples_split': 3,
    'n_estimators': 60
}
forest = RandomForestRegressor(criterion=bp['criterion'],
                               min_samples_leaf=bp['min_samples_leaf'],
                               min_samples_split=bp['min_samples_split'],
                               max_depth=bp['max_depth'],
                               n_estimators=bp['n_estimators'],
                               verbose=3,
                               n_jobs=2)
forest.fit(X_train, Y_train)
Y_pred = forest.predict(X_test)

# Explained variance score: 1 is perfect prediction
print('Score: %.2f' % forest.score(X_test, Y_test))
print(mean_absolute_error(Y_test, Y_pred))
#SPLIT THE COMBINED_DF INTO TRAIN AND TEST SETS, THEN RESET THEIR INDICES
train, test = train_test_split(combined_df, test_size=0.2)
train.reset_index(inplace = True, drop = True)
test.reset_index(inplace = True, drop = True)

#CREATE THE X_TRAIN, X_TEST, Y_TRAIN, Y_TEST ARRAYS
y_train = np.asarray(train['Next Year Stock Return'])
X_train = np.asarray(train.drop(columns = ['Next Year Stock Return']))

y_test = np.asarray(test['Next Year Stock Return'])
X_test = np.asarray(test.drop(columns = ['Next Year Stock Return']))

"""RANDOM FOREST REGRESSION MODEL"""

model = RandomForestRegressor(random_state = 0)
model.fit(X_train, y_train)

y_train_predict = model.predict(X_train)

plt.scatter(y_train, y_train_predict)
plt.show()

#MSE OF THE TRAINING SET
mse_train = np.mean(np.square(np.subtract(y_train, y_train_predict)))
print("Training set Mean Squared Error: {}".format(mse_train))

y_test_predict = model.predict(X_test)

plt.scatter(y_test, y_test_predict)
plt.show()
Example #38
0
r2_store, mse_store, mcc_store, f1_store = [], [], [], [] # Empty lists for storing results
mse_bins_store  = []

# Monte Carlo cross validation (MCCV) loop
for rrr in range(50):
    # Resample validation set (uniform distribution)
    train_indices, test_indices = resreg.uniform_test_split(X, y, bins=bins, 
                                            bin_test_size=70, verbose=False, 
                                            random_state=rrr)
    X_train, y_train = X[train_indices,:], y[train_indices]
    X_test, y_test = X[test_indices,:], y[test_indices]
    
    
    # Unpack hyperparameters, resample training data, and fit regressors
    reg = DecisionTreeRegressor(random_state=rrr) if 'REBAGG' in strategy else \
              RandomForestRegressor(n_estimators=10, n_jobs=-1, random_state=rrr)
              
    if strategy=='RO':
        cl, ch, sample_method = param
        relevance = resreg.sigmoid_relevance(y_train, cl=cl, ch=ch)
        X_train, y_train = resreg.random_oversample(X_train, y_train, relevance,
                                    relevance_threshold=0.5, over=sample_method,
                                    random_state=rrr)
        reg.fit(X_train, y_train)
    
    elif strategy=='SMOTER':
        cl, ch, sample_method, k = param
        relevance = resreg.sigmoid_relevance(y_train, cl=cl, ch=ch)
        X_train, y_train = resreg.smoter(X_train, y_train, relevance, 
                                 relevance_threshold=0.5, k=k, over=sample_method,
                                 random_state=rrr)
Example #39
0
    'C:/Users/wybek/Documents/school/Master/Information Retrieval/project2/data/STR_features.csv',
    index_col=0)

#Features from our data
#features_file = pd.read_csv('./data/Our_LTR.csv', index_col=0)
#features_file = pd.read_csv('./data/Our_STR.csv', index_col=0)

features_file = features_file.set_index('table_id')

#load the qrels dictionary
with open(
        'C:/Users/wybek/Documents/school/Master/Information Retrieval/project2/data/qrels_dict.pickle',
        'rb') as handle:
    qrels = pickle.load(handle)

regression_model = RandomForestRegressor(n_estimators=1000, max_leaf_nodes=4)

#get a list from 1 to 60 which will be split up for k fold cross validation
queries = features_file['query_id'].drop_duplicates()
queries = queries.tolist()

# First count the number of relevant tables in corpus for each query.
num_of_relevant_tables = {}
for query_index, query_string in enumerate(queries):
    qrels_one_query = qrels[query_index + 1]
    num_of_relevant_tables[query_string] = 0
    for t in qrels_one_query:
        if (qrels_one_query[t] > 0):
            num_of_relevant_tables[query_string] += 1

kf = KFold(n_splits=5, random_state=2, shuffle=True)
Example #40
0
def main():
    df = pd.read_csv(FILE)

    # Training and testing split
    y, X = df.iloc[:, 0].values, df.iloc[:, 1:-2].values
    Dcon = X.shape[1]

    # Onehot data
    soil = pd.get_dummies(df.soil)
    landuse = pd.get_dummies(df.landuse, drop_first=True)  # Binary
    Dcat = soil.shape[1] + landuse.shape[1]

    # Stack all X
    X = np.hstack((X, soil, landuse))

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_PROPORTION, random_state=SEED)
    N, D = X_train.shape

    # NOTE: Subsetting
    if SUBSET:
        rand = np.random.RandomState(SEED)
        inds = rand.choice(N, int(N * SUBSET_PROPORTION), replace=False)
        X_train, y_train = X_train[inds], y_train[inds]

    # Initialise estimator
    ss = StandardScaler()

    # Kernel
    l = LENSCALE if SCALAR else LENSCALE * np.ones(Dcon)
    kern = RBF(Dcon, lengthscales=l, active_dims=np.arange(Dcon), ARD=not SCALAR)\
        + Linear(Dcat, active_dims=np.arange(Dcon, Dcon + Dcat), ARD=not SCALAR)\
        + White(D)  # `not SCALAR` gives the intended boolean; `~SCALAR` is bitwise and always truthy

    # GP
    if SPARSE_GP:
        gp = SparseGP(kern, fix_inducing=FIX_INDUCING, n_inducing=N_INDUCING)
    else:
        gp = GP(kern)

    # Random Forest
    rf = RandomForestRegressor(n_estimators=10, random_state=SEED)

    # SVM
    # http://scikit-learn.org/stable/modules/svm.html
    clf = SVR(C=1.0, epsilon=0.2, kernel='rbf')

    # Linear model
    br = BayesianRidge()

    models = {'GP': gp, 'RandomForest': rf, 'SVM': clf, 'BayesianRidge': br}
    for name, mod in models.items():
        print("Fitting {}...".format(name))

        if name == 'RandomForest':
            model = mod
        else:
            model = make_pipeline(ss, mod)

        # Train
        model.fit(X_train, y_train)

        # Predict
        if name == 'GP':
            Ey, Sy = model.predict(X_test)
        else:
            Ey = model.predict(X_test)

        # Validate
        r2 = r2_score(y_test, Ey)
        mse = mean_squared_error(y_test, Ey)
        rmse = np.sqrt(mse)
        evs = explained_variance_score(y_test, Ey)

        if name == 'GP':
            nlp = negative_log_proba(y_test, Ey, Sy)
        else:
            nlp = np.inf

        print("{} Results:".format(name))
        print("R2 = {}\nMSE = {}\nRMSE = {}\nEVS = {}\nNLP = {}".format(
            r2, mse, rmse, evs, nlp))

        if name == 'GP':
            print("Kernel parameters:")
            print(gp.kernel)
Example #41
0
data_train_scaled = scaler.fit_transform(data_train[valid_feature])
data_test_scaled = scaler.transform(data_test[valid_feature])  # reuse the scaler fitted on the training data instead of refitting on the test set

# Dimensionality reduction
pca = PCA(n_components=15)
# data_train_pca = pca.fit_transform(data_train_scaled)    # reduced training data
# data_test_pca = pca.fit_transform(data_test_scaled)

# Model evaluation
print("========= Modeling =========")
# Cross-validate the candidate models
models = [
    LinearRegression(),
    Ridge(),
    Lasso(alpha=0.01, max_iter=10000),
    RandomForestRegressor(n_estimators=400),
    GradientBoostingRegressor(),
    SVR(),
    LinearSVR(),
    ElasticNet(alpha=0.001, max_iter=10000),
    SGDRegressor(max_iter=1000, tol=1e-3),
    BayesianRidge(),
    KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5),
    ExtraTreesRegressor()
]

# for model in models:
#     evalUtil.model_rmse_log(model,data_train_scaled,target_train)

# Model tuning
# model = RandomForestRegressor()
wandb.init(project="Airbnb Tuning",
           name='RF 8',
           notes='dataset_1, robust scaler')

data = pd.read_csv(
    'C:/Users/delim/Desktop/AI in A&F Indiv Assignment/Practical Assessment/Prediction/dataset_8.csv'
)
# Train test split
X = data.iloc[:, data.columns != 'price']
Y = data['price']
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.3,
                                                    random_state=7)
# Standardize dataset
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

model = RandomForestRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

wandb.sklearn.plot_regressor(model, X_train, X_test, y_train, y_test)
wandb.sklearn.plot_outlier_candidates(model, X_train, y_train)
wandb.sklearn.plot_residuals(model, X_train, y_train)

RMSE = sqrt(mean_squared_error(y_test, y_pred))
MSE = mean_squared_error(y_test, y_pred)
MAE = mean_absolute_error(y_test, y_pred)
wandb.log({"RMSE": RMSE, "MSE": MSE, "MAE": MAE})
Example #43
0
def main():
    boston = loadData()
    X = boston.data
    Y = boston.target
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.25, random_state=33)
    print('The max target value is', np.max(boston.target))
    print('The min target value is ', np.min(boston.target))
    print('The average target value is ', np.mean(boston.target))

    # Standardize the data
    ss_X = StandardScaler()
    ss_Y = StandardScaler()
    X_train = ss_X.fit_transform(X_train)
    X_test = ss_X.transform(X_test)
    Y_train = ss_Y.fit_transform(Y_train.reshape(-1, 1))
    Y_test = ss_Y.transform(Y_test.reshape(-1, 1))

    # Import and train a LinearRegression model
    lr = LinearRegression()
    lr.fit(X_train, Y_train)
    lr_Y_predict = lr.predict(X_test)

    # Import and train an SGDRegressor model
    sgdr = SGDRegressor()
    sgdr.fit(X_train, Y_train.ravel())
    sgdr_Y_predict = sgdr.predict(X_test)

    # Evaluate and compare model performance; the estimators' built-in score is equivalent to r2_score
    print('-----------------------------------------------------------------------')
    print('The value of default measurement of LinearRegression is ',
          lr.score(X_test, Y_test))
    print('The value of R-squared of LinearRegression is ',
          r2_score(Y_test, lr_Y_predict))
    print('The mean squared error of LinearRegression is ', mean_squared_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(lr_Y_predict)))
    print('The mean absolute error of LinearRegression is ', mean_absolute_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(lr_Y_predict)))
    print('-----------------------------------------------------------------------')
    print('The value of default measurement of SGDRegressor is ',
          sgdr.score(X_test, Y_test))
    print('The value of R-squared of SGDRegressor is ',
          r2_score(Y_test, sgdr_Y_predict))
    print('The mean squared error of SGDRegressor is ', mean_squared_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(sgdr_Y_predict)))
    print('The mean absolute error of SGDRegressor is ', mean_absolute_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(sgdr_Y_predict)))

    # SVM regression
    # Linear-kernel SVR
    liner_svr = SVR(kernel='linear')
    liner_svr.fit(X_train, Y_train.ravel())
    liner_svr_y_predict = liner_svr.predict(X_test)

    # Polynomial-kernel SVR
    poly_svr = SVR(kernel='poly')
    poly_svr.fit(X_train, Y_train.ravel())
    poly_svr_y_predict = poly_svr.predict(X_test)

    # RBF-kernel SVR
    rbf_svr = SVR(kernel="rbf")
    rbf_svr.fit(X_train, Y_train.ravel())
    rbf_svr_y_predict = rbf_svr.predict(X_test)

    # Evaluate the SVRs with the three kernel functions
    print('-----------------------------------------------------------------------')
    print('R-square value of linear SVR is:', liner_svr.score(X_test, Y_test))
    print('The MSE of linear SVR is:', mean_squared_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(liner_svr_y_predict)))
    print('The MAE of linear SVR is:', mean_absolute_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(liner_svr_y_predict)))
    print('-----------------------------------------------------------------------')
    print('R-square value of poly SVR is:', poly_svr.score(X_test, Y_test))
    print('The MSE of poly SVR is:', mean_squared_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(poly_svr_y_predict)))
    print('The MAE of poly SVR is:', mean_absolute_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(poly_svr_y_predict)))
    print('-----------------------------------------------------------------------')
    print('R-square value of rbf SVR is:', rbf_svr.score(X_test, Y_test))
    print('The MSE of rbf SVR is:', mean_squared_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(rbf_svr_y_predict)))
    print('The MAE of rbf SVR is:', mean_absolute_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(rbf_svr_y_predict)))

    # Two k-nearest-neighbour regression models
    # Prediction: uniform averaging over neighbours
    uni_knr = KNeighborsRegressor(weights='uniform')
    uni_knr.fit(X_train, Y_train.ravel())
    uni_knr_y_predicrt = uni_knr.predict(X_test)

    # Prediction: distance-weighted averaging
    dis_knr = KNeighborsRegressor(weights='distance')
    dis_knr.fit(X_train, Y_train.ravel())
    dis_knr_y_predict = dis_knr.predict(X_test)

    # Evaluate the two k-NN models
    print('-----------------------------------------------------------------------')
    print('R-square value of uniform-weighted KNR is:',
          uni_knr.score(X_test, Y_test))
    print('The MSE of uniform-weighted KNR is:', mean_squared_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(uni_knr_y_predicrt)))
    print('The MAE of uniform-weighted KNR is:', mean_absolute_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(uni_knr_y_predicrt)))
    print('-----------------------------------------------------------------------')
    print('R-square value of distance-weighted KNR is:',
          dis_knr.score(X_test, Y_test))
    print('The MSE of distance-weighted KNR is:', mean_squared_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(dis_knr_y_predict)))
    print('The MAE of distance-weighted KNR is:', mean_absolute_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(dis_knr_y_predict)))

    # Regression tree model
    dtr = DecisionTreeRegressor()
    dtr.fit(X_train, Y_train.ravel())
    dtr_y_predict = dtr.predict(X_test)

    # Evaluate the regression tree
    print('-----------------------------------------------------------------------')
    print('R-square value of DecisionTreeRegressor is:', dtr.score(X_test, Y_test))
    print('The MSE of DecisionTreeRegressor is:', mean_squared_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(dtr_y_predict)))
    print('The MAE of DecisionTreeRegressor is:', mean_absolute_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(dtr_y_predict)))

    # Train three ensemble models
    rfr = RandomForestRegressor()
    rfr.fit(X_train, Y_train.ravel())
    rfr_y_predict = rfr.predict(X_test)

    etr = ExtraTreesRegressor()
    etr.fit(X_train, Y_train.ravel())
    etg_y_predict = etr.predict(X_test)

    gbr = GradientBoostingRegressor()
    gbr.fit(X_train, Y_train.ravel())
    gbr_y_predict = gbr.predict(X_test)

    # Evaluate the three ensemble models
    print('-----------------------------------------------------------------------')
    print('R-square value of RandomForestRegressor is:', rfr.score(X_test, Y_test))
    print('The MSE of RandomForestRegressor is:', mean_squared_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(rfr_y_predict)))
    print('The MAE of RandomForestRegressor is:', mean_absolute_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(rfr_y_predict)))

    print('-----------------------------------------------------------------------')
    print('R-square value of ExtraTreesRegressor is:', etr.score(X_test, Y_test))
    print('The MSE of ExtraTreesRegressor is:', mean_squared_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(etg_y_predict)))
    print('The MAE of ExtraTreesRegressor is:', mean_absolute_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(etg_y_predict)))
    print(np.sort(list(zip(etr.feature_importances_, boston.feature_names)), axis=0))
    print('-----------------------------------------------------------------------')
    print('R-square value of GradientBoostingRegressor is:',
          gbr.score(X_test, Y_test))
    print('The MSE of GradientBoostingRegressor is:', mean_squared_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(gbr_y_predict)))
    print('The MAE of GradientBoostingRegressor is:', mean_absolute_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(gbr_y_predict)))
Example #44
0
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score
import pandas as pd

X = pd.read_csv('data/titanic.csv')
y = X.pop('Survived')

X['Age'].fillna(X.Age.mean(),
                inplace=True)  # fill every missing Age with the column mean

numeric_variables = list(X.dtypes[
    X.dtypes != "object"].index)  # keep only the columns with numeric (non-object) values

model = RandomForestRegressor(n_estimators=100,
                              oob_score=True,
                              random_state=42)
model.fit(X[numeric_variables], y)

model.oob_score_

y_oob = model.oob_prediction_
print("c-stat: ", roc_auc_score(y, y_oob))

X.drop(['Name', 'Ticket', 'PassengerId'], axis=1, inplace=True)


def clean_cabin(x):
    try:
        return x[0]
    except TypeError:
        return 'None'
accuracy_ANN = 1 - result
print("Accuracy : {}".format(accuracy_ANN))
epochs_hist.history.keys()
plt.plot(epochs_hist.history['loss'])
plt.title('Model Loss Progress During Training')
plt.xlabel('Epoch')
plt.ylabel('Training Loss')
plt.legend(['Training Loss'])

from sklearn.tree import DecisionTreeRegressor
DecisionTree_model = DecisionTreeRegressor()
DecisionTree_model.fit(X_train, y_train)
accuracy_DecisionTree = DecisionTree_model.score(X_test, y_test)
accuracy_DecisionTree
from sklearn.ensemble import RandomForestRegressor
RandomForest_model = RandomForestRegressor(n_estimators=100, max_depth=10)
RandomForest_model.fit(X_train, y_train)
accuracy_RandomForest = RandomForest_model.score(X_train, y_train)
accuracy_RandomForest

# Read about regression metrics: mean absolute error, mean squared error, RMS error

y_predict = LinearRegression_model.predict(X_test)
plt.plot(y_test, y_predict, '>', color='r')
y_predict_orig = scaler_y.inverse_transform(y_predict)
y_test_orig = scaler_y.inverse_transform(y_test)
plt.plot(y_test_orig, y_predict_orig, '>', color='r')
k = X_test.shape[1]
n = len(X_test)
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
def reconstructRF():
    """
    run KFOLD method for random forest regression 
    """
    #import packages
    import os
    import numpy as np
    import pandas as pd
    #from sklearn import metrics
    #from scipy import stats
    #import seaborn as sns
    #import matplotlib.pyplot as plt
    #from sklearn.model_selection import KFold
    from datetime import datetime
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import StandardScaler
    
    
   #defining directories    
    dir_in = "/lustre/fs0/home/mtadesse/merraAllLagged"
    dir_out = "/lustre/fs0/home/mtadesse/rfReconstruction"
    surge_path = "/lustre/fs0/home/mtadesse/05_dmax_surge_georef"

    # #load KFOLD result csv file
    # os.chdir('F:\\06_eraint_results\\sonstig')
    # kf_dat = pd.read_csv('eraint_randForest_kfold.csv')
    # #edit the tg names to be usable later on
    # editName = lambda x: x.split('.csv')[0]
    # kf_dat['tg'] = pd.DataFrame(list(map(editName, kf_dat['tg'])), columns= ['tg'])
    
    
    
    #cd to the lagged predictors directory
    os.chdir(dir_in)
    

    x = 453
    y = 454

    #looping through 
    for tg in range(x,y):
        
        os.chdir(dir_in)

        tg_name = os.listdir()[tg]
        print(tg, tg_name)
        
        #load predictor
        pred = pd.read_csv(tg_name)
        pred.drop('Unnamed: 0', axis = 1, inplace = True)
        
        #add squared and cubed wind terms (as in WPI model)
        pickTerms = lambda x: x.startswith('wnd')
        wndTerms = pred.columns[list(map(pickTerms, pred.columns))]
        wnd_sqr = pred[wndTerms]**2
        wnd_cbd = pred[wndTerms]**3
        pred = pd.concat([pred, wnd_sqr, wnd_cbd], axis = 1)

        #standardize predictor data
        dat = pred.iloc[:,1:]
        scaler = StandardScaler()
        print(scaler.fit(dat))
        dat_standardized = pd.DataFrame(scaler.transform(dat), \
                                        columns = dat.columns)
        pred_standardized = pd.concat([pred['date'], dat_standardized], axis = 1)
        
    
        #load surge data
        os.chdir(surge_path)
        surge = pd.read_csv(tg_name)
        surge.drop('Unnamed: 0', axis = 1, inplace = True)
        
        #remove duplicated surge rows
        surge.drop(surge[surge['ymd'].duplicated()].index, axis = 0, inplace = True)
        surge.reset_index(inplace = True)
        surge.drop('index', axis = 1, inplace = True)
        
        
        #adjust surge time format to match that of pred
        time_str = lambda x: str(datetime.strptime(x, '%Y-%m-%d'))
        surge_time = pd.DataFrame(list(map(time_str, surge['ymd'])), columns = ['date'])
        time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
        surge_new = pd.concat([surge_time, surge[['surge', 'lon', 'lat']]], axis = 1)
    
        #merge predictors and surge to find common time frame
        pred_surge = pd.merge(pred_standardized, surge_new.iloc[:,:2], on='date', how='right')
        pred_surge.sort_values(by = 'date', inplace = True)
        
        #find rows that have nans and remove them
        row_nan = pred_surge[pred_surge.isna().any(axis =1)]
        pred_surge.drop(row_nan.index, axis = 0, inplace = True)
        pred_surge.reset_index(inplace = True)
        pred_surge.drop('index', axis = 1, inplace = True)
        
        
        #in case pred and surge don't overlap
        if pred_surge.shape[0] == 0:
            print('-'*80)
            print("Predictors and Surge don't overlap")
            print('-'*80)
            continue
        
     
        pred_surge['date'] = pd.DataFrame(list(map(time_stamp, \
                                                   pred_surge['date'])), \
                                          columns = ['date'])
        
        #prepare data for training/testing
        X = pred_surge.iloc[:,1:-1]
        y = pd.DataFrame(pred_surge['surge'])
        y = y.reset_index()
        y.drop(['index'], axis = 1, inplace = True)
        
        #apply PCA
        #get the number of PCs used during validation
        # pc_num = kf_dat.loc[kf_dat['tg'] == tg_name]['num_95pcs']
        pca = PCA(0.95)
        pca.fit(X)
        X_pca = pca.transform(X)
        
        
        # #apply 10 fold cross validation
        # kf = KFold(n_splits=10, random_state=29)
        
        # metric_corr = []; metric_rmse = []; #combo = pd.DataFrame(columns = ['pred', 'obs'])
        # for train_index, test_index in kf.split(X):
        #     X_train, X_test = X_pca[train_index], X_pca[test_index]
        #     y_train, y_test = y['surge'][train_index], y['surge'][test_index]
            
        #     #train regression model
        #     rf = RandomForestRegressor(n_estimator = 50, min_samples_leaf = 1)
        #     lm.fit(X_train, y_train)
            
        #     #predictions
        #     predictions = lm.predict(X_test)
        #     # pred_obs = pd.concat([pd.DataFrame(np.array(predictions)), \
        #     #                       pd.DataFrame(np.array(y_test))], \
        #     #                      axis = 1)
        #     # pred_obs.columns = ['pred', 'obs']
        #     # combo = pd.concat([combo, pred_obs], axis = 0)    
            
        #     #evaluation matrix - check p value
        #     if stats.pearsonr(y_test, predictions)[1] >= 0.05:
        #         print("insignificant correlation!")
        #         continue
        #     else:
        #         #print(stats.pearsonr(y_test, predictions))
        #         metric_corr.append(stats.pearsonr(y_test, predictions)[0])
        #         #print(np.sqrt(metrics.mean_squared_error(y_test, predictions)))
        #         metric_rmse.append(np.sqrt(metrics.mean_squared_error(y_test, predictions)))
            
        
        # #number of years used to train/test model
        # num_years = np.ceil((pred_surge['date'][pred_surge.shape[0]-1] -\
        #                       pred_surge['date'][0]).days/365)
            }
        
        longitude = surge['lon'][0]
        latitude = surge['lat'][0]
        num_pc = X_pca.shape[1] #number of principal components
        # corr = np.mean(metric_corr)
        # rmse = np.mean(metric_rmse)
        
        # print('num_year = ', num_years, ' num_pc = ', num_pc ,'avg_corr = ',\
        #       np.mean(metric_corr), ' -  avg_rmse (m) = ', \
        #       np.mean(metric_rmse), '\n')
        
        #%%
        #surge reconstruction
        pred_for_recon = pred[~pred.isna().any(axis = 1)]
        pred_for_recon = pred_for_recon.reset_index().drop('index', axis = 1)
        
        
        #standardize predictor data
        dat = pred_for_recon.iloc[:,1:]
        scaler = StandardScaler()
        print(scaler.fit(dat))
        dat_standardized = pd.DataFrame(scaler.transform(dat), \
                                        columns = dat.columns)
        pred_standardized = pd.concat([pred_for_recon['date'], dat_standardized], axis = 1)
        
        X_recon = pred_standardized.iloc[:, 1:]
        
        #apply PCA
        pca = PCA(num_pc) #use the same number of PCs used for training
        pca.fit(X_recon)
        X_pca_recon = pca.transform(X_recon)
    
        #%%
        #model preparation
        #defining the rf model with number of trees and minimum leaves
        rf = RandomForestRegressor(n_estimators=50, min_samples_leaf=1, \
                                   random_state = 29)
        rf.fit(X_pca, y['surge'])
        
        #get prediction interval
        def pred_ints(model, X_pca_recon, percentile = 95):
            """
            function to construct prediction interval
            taking into account the result of each 
            regression tree
            """
            err_down = []
            err_up = []
            preds = []

            for est in model.estimators_:
                preds.append(est.predict(X_pca_recon))
            preds = np.vstack(preds).T
            err_down = np.percentile(preds, (100 - percentile)/2., axis = 1, \
                                     keepdims = True)
            err_up = np.percentile(preds, 100 - (100 - percentile)/2., axis =1, \
                                   keepdims = True)
        
            return err_down.reshape(-1), err_up.reshape(-1)
        
        
        #compute 95% prediction intervals
        err_down, err_up = pred_ints(rf, X_pca_recon, percentile = 95)
        #reconstructed surge goes here
        truth = rf.predict(X_pca_recon)

        correct = 0.
        for i, val in enumerate(truth):
            if err_down[i] <= val <= err_up[i]:
                correct +=1
        print(correct*100/len(truth), '\n')
        
        
        #final dataframe
        final_dat = pd.concat([pred_standardized['date'], \
                               pd.DataFrame([truth, err_down, err_up]).T], axis = 1)
        final_dat['lon'] = longitude
        final_dat['lat'] = latitude
        final_dat.columns = ['date', 'surge_reconstructed', 'pred_int_lower',\
                             'pred_int_upper', 'lon', 'lat']
        
        #plot - optional
        # time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
        # final_dat['date'] = pd.DataFrame(list(map(time_stamp, final_dat['date'])), columns = ['date'])
        # surge['date'] = pd.DataFrame(list(map(time_stamp, surge['date'])), columns = ['date'])
        # sns.set_context('notebook', font_scale = 2)
        # plt.figure()
        # plt.plot(final_dat['date'], final_dat['mean'], color = 'green')
        # plt.scatter(surge['date'], surge['surge'], color = 'blue')
        #prediction intervals
        # plt.plot(final_dat['date'], final_dat['obs_ci_lower'], color = 'red',  linestyle = "--", lw = 0.8)
        # plt.plot(final_dat['date'], final_dat['obs_ci_upper'], color = 'red',  linestyle = "--", lw = 0.8)
        #confidence intervals
        # plt.plot(final_dat['date'], final_dat['mean_ci_upper'], color = 'black',  linestyle = "--", lw = 0.8)
        # plt.plot(final_dat['date'], final_dat['mean_ci_lower'], color = 'black',  linestyle = "--", lw = 0.8)

        #save df as csv - in case of interruption
        os.chdir(dir_out)
        final_dat.to_csv(tg_name)
        
        #cd to dir_in
        os.chdir(dir_in)
                                   df.iloc[:, :-1],
                                   df.iloc[:, -1:],
                                   cv=10,
                                   scoring='neg_mean_squared_error')).mean())
regr = Pipeline([('trans', preprocessing.StandardScaler()), ('regr', regr)])
print('r2 = %.2f' % cross_val_score(
    regr, df.iloc[:, :-1], df.iloc[:, -1:], cv=10, scoring='r2').mean())
print('rmse = %.2f' %
      np.sqrt(-1 * cross_val_score(regr,
                                   df.iloc[:, :-1],
                                   df.iloc[:, -1:],
                                   cv=10,
                                   scoring='neg_mean_squared_error')).mean())

print('=== Random Forest ===')
regr = RandomForestRegressor(max_depth=2, random_state=0)
print('r2 = %.2f' % cross_val_score(
    regr, df.iloc[:, :-1], df.iloc[:, -1:], cv=10, scoring='r2').mean())
print('rmse = %.2f' %
      np.sqrt(-1 * cross_val_score(regr,
                                   df.iloc[:, :-1],
                                   df.iloc[:, -1:],
                                   cv=10,
                                   scoring='neg_mean_squared_error')).mean())
regr = Pipeline([('trans', preprocessing.StandardScaler()), ('regr', regr)])
print('r2 = %.2f' % cross_val_score(
    regr, df.iloc[:, :-1], df.iloc[:, -1:], cv=10, scoring='r2').mean())
print('rmse = %.2f' %
      np.sqrt(-1 * cross_val_score(regr,
                                   df.iloc[:, :-1],
                                   df.iloc[:, -1:],
Example #48
0
# n_iter_search = 50
# regr1 = RandomizedSearchCV(regr1, param_distributions=param_dist,
#                             n_iter=n_iter_search, cv=3,n_jobs=-1)

# regr1.fit(x_train, y_train)

# print(regr1.best_params_)
# output - {'n_estimators': 200, 'min_samples_split': 3, 'max_depth': 40}
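
# A hedged sketch of what the param_dist referenced above might have looked like;
# it is not shown in the original, so these ranges are assumptions rather than
# the author's actual search space.
from sklearn.model_selection import RandomizedSearchCV

param_dist = {'n_estimators': [100, 200, 300],
              'min_samples_split': [2, 3, 5, 10],
              'max_depth': [20, 40, 60, None]}
# The (expensive) search itself stays commented out, mirroring the original:
# regr1 = RandomizedSearchCV(RandomForestRegressor(n_jobs=-1),
#                            param_distributions=param_dist,
#                            n_iter=50, cv=3, n_jobs=-1)
# regr1.fit(x_train, y_train)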


# In[5]:


from sklearn.ensemble import RandomForestRegressor

model_rf = RandomForestRegressor(n_jobs=-1, min_samples_split=3, n_estimators=200, max_depth=40)
model_rf.fit(x_train, y_train)

# import joblib
# joblib.dump(model_rf,"D:/python pycharm/Imarticus _Project/model_rf.pkl")


y_pred_train_rf = model_rf.predict(x_train)
y_pred_test_rf = model_rf.predict(x_test)

from sklearn.metrics import mean_squared_error

rmse_train_rf = np.sqrt(mean_squared_error(y_train,y_pred_train_rf))
rmse_test_rf = np.sqrt(mean_squared_error(y_test,y_pred_test_rf))

print("RMSLE value of Training Data is {a}".format(a=rmse_train_rf))
def correct_with_regression(df_load,
                            dikt_errors,
                            prefix=None,
                            prefix_plot=None,
                            bool_plot_corrections=None,
                            bool_plot_trash=None):
    """
    Learn a predictor for each bad site
    to predict the irrelevant values 
    from the values of the other sites 
    that do not have irrelevant values
    """
    print('correct_with_regression - ', end='')
    fname_load = os.path.join(prefix, 'df_corrected_load.csv')
    fname_trash = os.path.join(prefix, 'trash_sites.pkl')
    try:
        df_corrected_load = pd.read_csv(
            fname_load,
            index_col=0,
            #header    = [0],
        )
        df_corrected_load.index = pd.to_datetime(df_corrected_load.index)
        with open(fname_trash, 'rb') as f:
            trash_sites = pickle.load(f)
        print('Loaded df_corrected_load and trash_sites')
    except Exception as e:
        print('\n{0}'.format(colored(e, 'red')))
        print('df_corrected_load not loaded')
        bad_sites = sorted(
            set([site for k, v in dikt_errors.items() for site in v]))
        df_corrected_load = df_load.copy()
        trash_sites = []
        X = df_load[sorted(set(df_load.columns) - set(bad_sites))]
        assert not pd.isnull(X).sum().sum()
        for ii, site in enumerate(bad_sites):
            print('\r{0:6} / {1:6} - '.format(ii, len(bad_sites)), end='')
            y = df_load[site]
            flags = {
                dd: error_type
                for error_type in dikt_errors
                for ii, dd in dikt_errors[error_type].get(site, [])
            }
            samples_unknown = [
                (ii, dd) for error_type in dikt_errors
                for ii, dd in dikt_errors[error_type].get(site, [])
            ]
            ind_unknown, dates_unknown = list(zip(*samples_unknown))
            ind_unknown = sorted(ind_unknown)
            dates_unknown = sorted(dates_unknown)
            ind_known = [
                ii for ii in range(y.shape[0]) if ii not in ind_unknown
            ]  # Indices corresponding to sane observations
            assert not pd.isnull(y.iloc[ind_known]).sum()
            if len(ind_known) == 0:
                trash_sites.append((site, 'dates_known empty'))
                df_corrected_load = df_corrected_load.drop(site, axis=1)
                print('{0:6} ->  drop because dates known empty'.format(site))
                continue
            shuffled_ind_known = ind_known.copy()
            np.random.shuffle(shuffled_ind_known)
            cut = int(0.9 * len(shuffled_ind_known))
            # Divide the sane observations into a training and a test sets
            ind_train = sorted(shuffled_ind_known[:cut])
            ind_test = sorted(shuffled_ind_known[cut:])
            # Train
            y_train = y.iloc[ind_train]
            X_train = X.iloc[ind_train]
            # Validation
            y_test = y.iloc[ind_test]
            X_test = X.iloc[ind_test]
            # Pred
            X_pred = X.iloc[ind_unknown]
            # Normalization covariates
            X_mean = X_train.mean(axis=0)
            X_std = X_train.std(axis=0)
            X_train = (X_train - X_mean) / X_std
            X_test = (X_test - X_mean) / X_std
            X_pred = (X_pred - X_mean) / X_std
            # Normalization target
            y_mean = y_train.mean(axis=0)
            y_std = y_train.std(axis=0)
            y_train = (y_train - y_mean) / y_std
            assert np.allclose(X_train.sum(), 0)
            assert np.allclose(y_train.sum(), 0)
            regressor = 'rf'  # 'rf' # 'xgb' # 'spams'
            # Assess the quality of a predictor built from the other sane sites.
            # We do not have a criterion to decide which algorithm is the most
            # appropriate and have used spams or random forests interchangeably.
            if regressor == 'rf':
                model = RandomForestRegressor()
                model.fit(X_train, y_train)
                y_hat_train = model.predict(X_train)
                y_hat_test = model.predict(X_test)
                y_hat_pred = model.predict(X_pred)
            elif regressor == 'xgb':
                model = xgb.XGBRegressor()
                model.fit(X_train, y_train)
                y_hat_train = model.predict(X_train)
                y_hat_test = model.predict(X_test)
                y_hat_pred = model.predict(X_pred)
            elif regressor == 'spams':
                hprm = {
                    'loss': 'square',
                    'numThreads': -1,
                    'verbose': False,
                    'lambda1': 0.03 * X_train.shape[0],
                    'lambda2': 0.1,  # For elastic_net
                    'it0': 10,  # nb_iter between two dual gap computations
                    'max_it': int(
                        1e4
                    ),  # (optional, maximum number of iterations, 100 by default)
                    'L0':
                    0.1,  # (optional, initial parameter L in fista, 0.1 by default, should be small enough)
                    'regul': 'l2',
                    'tol': 1e-4,
                    'intercept':
                    False,  #(optional, do not regularize last row of W, false by default)
                    'compute_gram': True,
                    'return_optim_info': True
                }
                beta0 = np.zeros(
                    (X_train.shape[1], 1),
                    dtype=np.float64,
                    order="F",
                )
                beta_cen, optim_info = spams.fistaFlat(
                    np.asfortranarray(y_train, dtype=np.float64).reshape(
                        (-1, 1)),
                    np.asfortranarray(X_train, dtype=np.float64),
                    beta0,
                    **hprm,
                )
                beta = beta_cen[:, 0]
                y_hat_train = X_train @ beta
                y_hat_test = X_test @ beta
                y_hat_pred = X_pred @ beta
            y_train = y_train * y_std + y_mean
            y_hat_train = y_hat_train * y_std + y_mean
            y_hat_test = y_hat_test * y_std + y_mean
            y_hat_pred = y_hat_pred * y_std + y_mean
            rr_train = 1 - (
                (y_train - y_hat_train)**2).mean() / y_train.std()**2
            rr_test = 1 - ((y_test - y_hat_test)**2).mean() / y_test.std()**2
            if not (
                    rr_train > 0.9 and rr_test > 0.5
            ):  # If the performances are not good enough on the training and the test sets, drop the site
                trash_sites.append((
                    site,
                    'rr_train = {rr_train:.2} - rr_test = {rr_test:.2}'.format(
                        rr_train=rr_train,
                        rr_test=rr_test,
                    )))
                df_corrected_load = df_corrected_load.drop(site, axis=1)
                print(
                    '{0:6} ->  drop because prediction not good enough - rr_train = {rr_train:.2} - rr_test = {rr_test:.2}'
                    .format(
                        site,
                        rr_train=rr_train,
                        rr_test=rr_test,
                    ))
                continue
            if bool_plot_corrections:
                plot_tools.plot_corrections(
                    y,
                    dates_unknown,
                    y_hat_pred,
                    os.path.join(
                        prefix_plot,
                        'corrections',
                    ),
                    regressor,
                    rr_test,
                    flags,
                )
            print(
                '{0:6} -> {1:5} values corrected - rr_train = {rr_train:.2} - rr_test = {rr_test:.2}'
                .format(
                    site,
                    len(ind_unknown),
                    rr_train=rr_train,
                    rr_test=rr_test,
                ))
            df_corrected_load.loc[df_corrected_load.index[ind_unknown], site] = y_hat_pred
        df_corrected_load.to_csv(fname_load)
        with open(fname_trash, 'wb') as f:
            pickle.dump(trash_sites, f)
    if bool_plot_trash:
        plot_tools.plot_trash(
            trash_sites,
            df_load,
            os.path.join(
                prefix_plot,
                'trash_sites',
            ),
        )  # Plot the sites that are discarded
    print(
        'done - df_corrected_load.shape = {0} - len(trash_sites) = {1}\n{2}'.
        format(df_corrected_load.shape, len(trash_sites),
               '#' * tools.NB_SIGNS), )
    return df_corrected_load, trash_sites
Example #50
0
"""
Train an RF regressor
In the following exercises you'll predict
bike rental demand in the Capital Bikeshare program in Washington, 
D.C., using historical weather data from the Bike Sharing Demand dataset
available through Kaggle. For this purpose, you will be using the random forests 
algorithm. As a first step, you'll define a random forests regressor and fit it to the training set.

The dataset is processed for you and split into 80% train and 20% test. The features matrix X_train and the array y_train are available in your workspace.
"""

# Import RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor

# Instantiate rf
rf = RandomForestRegressor(n_estimators=25, random_state=2)

# Fit rf to the training set
rf.fit(X_train, y_train)
"""
Evaluate the RF regressor
You'll now evaluate the test set RMSE of the random forests regressor
rf that you trained in the previous exercise.

The dataset is processed for you and split into 80% train and 20% test. 
The features matrix X_test, as well as the array y_test, are available in your workspace.
In addition, we have also loaded the model rf that you trained in the previous exercise.
"""

# Import mean_squared_error as MSE
from sklearn.metrics import mean_squared_error as MSE
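
# A minimal sketch of the evaluation step the docstring above describes, assuming
# rf, X_test and y_test from the workspace set up in the exercise:
y_pred = rf.predict(X_test)
rmse_test = MSE(y_test, y_pred)**(1/2)
print('Test set RMSE of rf: {:.2f}'.format(rmse_test))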
Example #51
0
import numpy as np
from sklearn.ensemble import RandomForestRegressor

from scripts.learn import machineLearning
from scripts.learn.machineLearning import tuneNValue

# ______________________________________________________
# HOW LONG WILL MY CLASSIFIER TAKE TO RUN ON MY MACHINE?
# ______________________________________________________
#
# Create a basic estimator (a RandomForestRegressor in this case)
classifier = RandomForestRegressor(random_state=43)
classifierName = "randomForest"  # No spaces, this will be a file name

# Add the names of all data files you want to use to this list
jsonFileNames = [
    'chicago_weather_sentiment_clean_grouped.json',
    'denver_weather_sentiment_clean_grouped.json',
    'detroit_weather_sentiment_clean_grouped.json',
    'houston_weather_sentiment_clean_grouped.json',
    'manhattan_weather_sentiment_clean_grouped.json',
    'phoenix_weather_sentiment_clean_grouped.json',
    'sanFrancisco_weather_sentiment_clean_grouped.json',
    'seattle_weather_sentiment_clean_grouped.json',
]


# We want to see how long it will take to train our classifier
# This will make a file called <classifierName>_number_data_points.csv
def tuneRandomForestNValues():
    nValues = [
Example #52
0
    def _miss_forest(self, Ximp, mask):
        """The missForest algorithm"""

        # Count missing per column
        if isinstance(Ximp, pd.DataFrame):
            Ximp = Ximp.values
        col_missing_count = mask.sum(axis=0)

        # Get col and row indices for missing
        missing_rows, missing_cols = np.where(mask)
        rf_regressor = rf_classifier = n_catmissing = None
        if self.num_idx.size:
            # Only keep indices for numerical vars
            keep_idx_num = np.in1d(missing_cols, self.num_idx)
            missing_num_rows = missing_rows[keep_idx_num]
            missing_num_cols = missing_cols[keep_idx_num]

            # Make initial guess for missing values
            col_means = np.full(Ximp.shape[1], fill_value=np.nan)
            col_means[self.num_idx] = deepcopy(
                self.statistics_.get('col_means'))
            Ximp[missing_num_rows,
                 missing_num_cols] = np.take(col_means, missing_num_cols)

            # Reg criterion
            reg_criterion = self.criterion if type(self.criterion) == str \
                else self.criterion[0]

            # Instantiate regression model
            rf_regressor = RandomForestRegressor(
                n_estimators=self.n_estimators,
                criterion=reg_criterion,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                min_weight_fraction_leaf=self.min_weight_fraction_leaf,
                max_features=self.max_features,
                max_leaf_nodes=self.max_leaf_nodes,
                min_impurity_decrease=self.min_impurity_decrease,
                bootstrap=self.bootstrap,
                oob_score=self.oob_score,
                n_jobs=self.n_jobs,
                random_state=self.random_state,
                verbose=self.verbose,
                warm_start=self.warm_start)
        # If needed, repeat for categorical variables
        if self.cat_idx.size:
            # Calculate total number of missing categorical values (used later)
            n_catmissing = np.sum(mask[:, self.cat_idx])

            # Only keep indices for categorical vars
            keep_idx_cat = np.in1d(missing_cols, self.cat_idx)
            missing_cat_rows = missing_rows[keep_idx_cat]
            missing_cat_cols = missing_cols[keep_idx_cat]

            # Make initial guess for missing values
            col_modes = np.full(Ximp.shape[1], fill_value=np.nan)
            col_modes[self.cat_idx] = self.encoded_col_modes[self.cat_idx]
            Ximp[missing_cat_rows,
                 missing_cat_cols] = np.take(col_modes, missing_cat_cols)

            # Classification criterion
            clf_criterion = self.criterion if type(self.criterion) == str \
                else self.criterion[1]

            # Instantiate classification model
            rf_classifier = RandomForestClassifier(
                n_estimators=self.n_estimators,
                criterion=clf_criterion,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                min_weight_fraction_leaf=self.min_weight_fraction_leaf,
                max_features=self.max_features,
                max_leaf_nodes=self.max_leaf_nodes,
                min_impurity_decrease=self.min_impurity_decrease,
                bootstrap=self.bootstrap,
                oob_score=self.oob_score,
                n_jobs=self.n_jobs,
                random_state=self.random_state,
                verbose=self.verbose,
                warm_start=self.warm_start,
                class_weight=self.class_weight)

        # 2. misscount_idx: sorted indices of cols in X based on missing count
        misscount_idx = np.argsort(col_missing_count)
        # Reverse order if decreasing is set to True
        if self.decreasing is True:
            misscount_idx = misscount_idx[::-1]

        # 3. While new_gammas < old_gammas & self.iter_count_ < max_iter loop:
        self.iter_count_ = 0
        gamma_new = 0
        gamma_old = np.inf
        gamma_newcat = 0
        gamma_oldcat = np.inf
        col_index = np.arange(Ximp.shape[1])

        while (
                gamma_new < gamma_old or gamma_newcat < gamma_oldcat) and \
                self.iter_count_ < self.max_iter:

            # 4. store previously imputed matrix
            Ximp_old = deepcopy(Ximp)
            if self.iter_count_ != 0:
                gamma_old = gamma_new
                gamma_oldcat = gamma_newcat
            # 5. loop
            for s in misscount_idx:
                # Column indices other than the one being imputed
                s_prime = np.delete(col_index, s)

                # Get indices of rows where 's' is observed and missing
                obs_rows = np.where(~mask[:, s])[0]
                mis_rows = np.where(mask[:, s])[0]

                # If no missing, then skip
                if len(mis_rows) == 0:
                    continue

                # Get observed values of 's'
                yobs = Ximp[obs_rows, s]

                # Get 'X' for both observed and missing 's' column
                xobs = Ximp[np.ix_(obs_rows, s_prime)]
                xmis = Ximp[np.ix_(mis_rows, s_prime)]

                # 6. Fit a random forest over observed and predict the missing
                if self.cat_idx is not None and s in self.cat_idx:
                    yobs = yobs.astype('int32')
                    rf_classifier.fit(X=xobs, y=yobs)
                    # 7. predict ymis(s) using xmis(x)
                    ymis = rf_classifier.predict(xmis)
                    # 8. update imputed matrix using predicted matrix ymis(s)
                    Ximp[mis_rows, s] = ymis
                else:
                    yobs = yobs.astype('float32')
                    rf_regressor.fit(X=xobs, y=yobs)
                    # 7. predict ymis(s) using xmis(x)
                    ymis = rf_regressor.predict(xmis)
                    # 8. update imputed matrix using predicted matrix ymis(s)
                    Ximp[mis_rows, s] = ymis

            # 9. Update gamma (stopping criterion)
            if self.cat_idx is not None:
                gamma_newcat = np.sum(
                    (Ximp[:, self.cat_idx] !=
                     Ximp_old[:, self.cat_idx])) / n_catmissing
            if self.num_idx is not None:
                gamma_new = np.sum(
                    (Ximp[:, self.num_idx] - Ximp_old[:, self.num_idx])**
                    2) / np.sum((Ximp[:, self.num_idx])**2)

            logger.debug(f"MissForest Coverage Iteration: {self.iter_count_}")
            self.iter_count_ += 1

        return Ximp
Example #53
0
poly_reg.fit(X_poly, Y_train)
lin_reg_2 = LinearRegression()
lin_reg_2.fit(X_poly, Y_train)'''

from sklearn.model_selection import cross_val_score
from sklearn.svm import SVR
model2 = SVR(kernel='rbf')

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

from sklearn.tree import DecisionTreeRegressor
model3 = DecisionTreeRegressor(random_state=50)

from sklearn.ensemble import RandomForestRegressor
model4 = RandomForestRegressor(n_estimators=300, random_state=50)

from xgboost import XGBRegressor
model5 = XGBRegressor()

from sklearn.neighbors import KNeighborsRegressor
model6 = KNeighborsRegressor(n_neighbors=15)

from sklearn.linear_model import Ridge, Lasso

model7 = Ridge()

model8 = Lasso()

from vecstack import stacking
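
# A hedged sketch of how the imported stacking() could combine the models above;
# the model list, the X_train/y_train/X_test names and the metric are assumptions,
# since the rest of this snippet is truncated.
# base_models = [model2, model3, model4, model5, model6, model7, model8]
# S_train, S_test = stacking(base_models, X_train, y_train, X_test,
#                            regression=True, metric=mean_squared_error,
#                            n_folds=4, shuffle=True, random_state=50, verbose=0)
# meta_model = XGBRegressor().fit(S_train, y_train)
# stacked_pred = meta_model.predict(S_test)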
Example #54
0
                      columns=[KOI_DISPOSITION, KOI_PDISPOSITION],
                      drop_first=True)
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
# #Test and train data splitting
y = data[KOI_SCORE]

x = data.drop([KOI_SCORE], axis=1)
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=9)

# #Definition of the models to include in the analysis ad optimization process
models = {
    "RandomForest": RandomForestRegressor(n_estimators=200),
    "Gradient Boosting": GradientBoostingRegressor(),
    #"K Neighbors": KNeighborsRegressor(),
    #"Decision Tree": DecisionTreeRegressor(),
    "Neural Network": MLPRegressor((20, 20, 20), max_iter=1000, random_state=1)
}

train_accuracies = pd.DataFrame(index=models.keys(),
                                columns=[AVERAGE, PCT_STANDARD_DEVIATION])
test_accuracies = pd.DataFrame(index=models.keys(),
                               columns=[AVERAGE, PCT_STANDARD_DEVIATION])

# #------------------------------------------------------------------------------
for model_name, model in models.items():
    cv = cross_validate(model, x, y, cv=50, n_jobs=-1)
    avg_train_accuracy = (np.mean(
sns.heatmap(corr,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values)

#target column i.e. price range

# RF using Normalisation | accuracy = 91.45863
# n_estimators = 50, log2, acc = 91.46870
# n_estimators = 50, log2, acc = 91.48331 - considering time

from sklearn.ensemble import RandomForestRegressor
x = st.slider('Choose number of estimators for Random Forest Algorithm',
              min_value=10,
              max_value=150)
regressorRF_Norm = RandomForestRegressor(n_estimators=x,
                                         random_state=0,
                                         max_features="log2",
                                         oob_score=True)
if st.button('Train Random Forest model'):

    regressorRF_Norm.fit(train_normalized, y)

    y_pred_rf_Norm = regressorRF_Norm.predict(test_normalized)
    st.subheader('Predictions are:')
    st.write(y_pred_rf_Norm)

#
#out_norm = pd.DataFrame(y_pred_rf_Norm,columns=['air_pollution_index'])
#out_norm.to_csv('submission_norm_t.csv',sep=',')

#                                              ends here
Example #56
0
def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=10, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)
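
# A minimal, self-contained usage sketch of score_dataset() on synthetic data;
# the demo_* names are illustrative and not part of the original exercise.
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

rng = np.random.RandomState(0)
demo_X = pd.DataFrame(rng.rand(200, 3), columns=['f1', 'f2', 'f3'])
demo_y = 3 * demo_X['f1'] - demo_X['f2'] + rng.normal(scale=0.1, size=200)
X_tr, X_va, y_tr, y_va = train_test_split(demo_X, demo_y, random_state=0)
print('Demo MAE:', score_dataset(X_tr, X_va, y_tr, y_va))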
Example #57
0
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=1)

# AI models

# - Linear Regression
# - RandomForest (decision-tree ensemble)

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

lin_reg = LinearRegression()
rf_reg = RandomForestRegressor()

rf_reg.fit(x_train, y_train)
lin_reg.fit(x_train, y_train)

# R² ranges from 0% to 100% (metrics already imported above)

test_pred_lin = lin_reg.predict(x_test)
test_pred_rf = rf_reg.predict(x_test)

r2_lin = metrics.r2_score(y_test, test_pred_lin)
mse_lin = metrics.mean_squared_error(y_test, test_pred_lin)

print(f"R² da Regressão Linear: {r2_lin}")
print(f"MSE da Regressão Linear: {mse_lin}")
Example #58
0
X = train.iloc[:, [2, 4, 5, 6, 9, 10]]
Y = y2
#Y = y1
validation_size = 0.20
seed = 7

X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
    X, Y, test_size=validation_size, random_state=seed)
# Test options and evaluation metric
seed = 7
scoring = 'neg_mean_squared_error'

# Spot Check Algorithms
models = []
models.append(('LR', LinearRegression()))
models.append(('RF', RandomForestRegressor()))
models.append(('KNN', KNeighborsRegressor()))
models.append(('CART', DecisionTreeRegressor()))
models.append(('SVM', SVR()))

# evaluate each model in turn
results = []
names = []

for name, model in models:
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
    cv_results = model_selection.cross_val_score(model,
                                                 X,
                                                 Y,
                                                 cv=kfold,
                                                 scoring=scoring)
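    # The loop body is truncated in the original; a plausible continuation of this
    # standard spot-check pattern collects each model's CV scores and summarizes them:
    results.append(cv_results)
    names.append(name)
    print('%s: mean=%f (std=%f)' % (name, cv_results.mean(), cv_results.std()))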
Example #59
0
    def evaluateModel(self, technique, train, test):
        if technique == 'Linear Regression':
            # Create linear regression object
            regr = linear_model.LinearRegression()
        elif technique == 'Kernel Ridge':
            regr = KernelRidge(alpha=1.0)
        elif technique == 'Ridge Regression':
            regr = linear_model.Ridge(alpha=.7)
        elif technique == 'Decision Tree':
            regr = tree.DecisionTreeRegressor(max_depth=10, min_samples_leaf=1)
        elif technique == "Random Forest":
            regr = RandomForestRegressor()
        elif technique == 'Gaussian Process':
            kernel = RationalQuadratic(length_scale=1.0, alpha=100)

            regr = gaussian_process.GaussianProcessRegressor(kernel=kernel,
                                                             alpha=1)

        # min_max_scaler = preprocessing.MaxAbsScaler()
        #
        # train = min_max_scaler.fit_transform(train)
        # test = min_max_scaler.fit_transform(test)

        # train = preprocessing.scale(train)
        # test = preprocessing.scale(test)

        print('Total dataset size: ', len(self.data_array))
        print('Train points', len(train))
        print('Test points', len(test))

        train_data_X = train[:, :-1]
        train_data_X[:, 0] = train_data_X[:, 0] / np.max(train_data_X[:, 0])
        train_data_X[:, 1] = train_data_X[:, 1] / np.max(train_data_X[:, 1])
        train_data_Y = train[:, -1]

        test_data_X = test[:, :-1]
        test_data_X[:, 0] = test_data_X[:, 0] / np.max(test_data_X[:, 0])
        test_data_X[:, 1] = test_data_X[:, 1] / np.max(test_data_X[:, 1])

        test_data_Y = test[:, -1]

        # Train the model using the training sets
        regr.fit(train_data_X, train_data_Y)

        # Make predictions using the testing set
        test_data_Y_predictions = regr.predict(test_data_X)
        print(
            "________________--------------------_________________------------------------"
        )
        print(test_data_Y_predictions)

        print('--------Started-----------')
        for actual, pred in zip(test_data_Y, test_data_Y_predictions):
            print('Actual ' + str(actual) + ', predict ' + str(pred))
        print('--------Ended-----------')

        mean_sq_error = mean_squared_error(test_data_Y,
                                           test_data_Y_predictions)
        r2_score_value = r2_score(test_data_Y, test_data_Y_predictions)

        # # The mean squared error
        print("Mean squared error: %.2f" % mean_sq_error)
        # Explained variance score: 1 is perfect prediction
        print('Variance score: %.2f' % r2_score_value)

        test_concat = np.concatenate([test_data_Y, test_data_Y_predictions])

        normalized_rmse = sqrt(mean_sq_error) / (
            sum(test_concat) / len(test_concat))
        print(normalized_rmse)
        return [self.split * 100, r2_score_value]
def validateRF():
    """
    run KFOLD method for regression 
    """

    #defining directories
    dir_in = "/lustre/fs0/home/mtadesse/merraAllLagged"
    dir_out = "/lustre/fs0/home/mtadesse/merraRFValidation"
    surge_path = "/lustre/fs0/home/mtadesse/05_dmax_surge_georef"

    #cd to the lagged predictors directory
    os.chdir(dir_in)

    x = 113
    y = 114

    #empty dataframe for model validation
    df = pd.DataFrame(columns = ['tg', 'lon', 'lat', 'num_year', \
                                 'num_95pcs','corrn', 'rmse'])

    #looping through
    for tg in range(x, y):

        os.chdir(dir_in)

        #filter only .csv files
        tgNames = []
        for file in glob.glob("*.csv"):
            tgNames.append(file)

        tg_name = sorted(tgNames)[tg]
        print(tg_name)

        ##########################################
        #check if this tg is already taken care of
        ##########################################
        os.chdir(dir_out)
        if os.path.isfile(tg_name):
            print("this tide gauge is already taken care of")
            return "file already analyzed!"

        os.chdir(dir_in)

        #load predictor
        pred = pd.read_csv(tg_name)
        pred.drop('Unnamed: 0', axis=1, inplace=True)

        #add squared and cubed wind terms (as in WPI model)
        pickTerms = lambda x: x.startswith('wnd')
        wndTerms = pred.columns[list(map(pickTerms, pred.columns))]
        wnd_sqr = pred[wndTerms]**2
        wnd_cbd = pred[wndTerms]**3
        pred = pd.concat([pred, wnd_sqr, wnd_cbd], axis=1)

        #standardize predictor data
        dat = pred.iloc[:, 1:]
        scaler = StandardScaler()
        print(scaler.fit(dat))
        dat_standardized = pd.DataFrame(scaler.transform(dat), \
                                        columns = dat.columns)
        pred_standardized = pd.concat([pred['date'], dat_standardized], axis=1)

        #load surge data
        os.chdir(surge_path)
        surge = pd.read_csv(tg_name)
        surge.drop('Unnamed: 0', axis=1, inplace=True)

        #remove duplicated surge rows
        surge.drop(surge[surge['ymd'].duplicated()].index,
                   axis=0,
                   inplace=True)
        surge.reset_index(inplace=True)
        surge.drop('index', axis=1, inplace=True)

        #adjust surge time format to match that of pred
        time_str = lambda x: str(datetime.strptime(x, '%Y-%m-%d'))
        surge_time = pd.DataFrame(list(map(time_str, surge['ymd'])),
                                  columns=['date'])
        time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
        surge_new = pd.concat([surge_time, surge[['surge', 'lon', 'lat']]],
                              axis=1)

        #merge predictors and surge to find common time frame
        pred_surge = pd.merge(pred_standardized,
                              surge_new.iloc[:, :2],
                              on='date',
                              how='right')
        pred_surge.sort_values(by='date', inplace=True)

        #find rows that have nans and remove them
        row_nan = pred_surge[pred_surge.isna().any(axis=1)]
        pred_surge.drop(row_nan.index, axis=0, inplace=True)
        pred_surge.reset_index(inplace=True)
        pred_surge.drop('index', axis=1, inplace=True)

        #in case pred and surge don't overlap
        if pred_surge.shape[0] == 0:
            print('-' * 80)
            print("Predictors and Surge don't overlap")
            print('-' * 80)
            continue


        pred_surge['date'] = pd.DataFrame(list(map(time_stamp, \
                                                   pred_surge['date'])), \
                                          columns = ['date'])

        #prepare data for training/testing
        X = pred_surge.iloc[:, 1:-1]
        y = pd.DataFrame(pred_surge['surge'])
        y = y.reset_index()
        y.drop(['index'], axis=1, inplace=True)

        #apply PCA
        pca = PCA(.95)
        pca.fit(X)
        X_pca = pca.transform(X)

        #apply 10 fold cross validation
        kf = KFold(n_splits=10, shuffle=True, random_state=29)

        metric_corr = []
        metric_rmse = []
        #combo = pd.DataFrame(columns = ['pred', 'obs'])
        for train_index, test_index in kf.split(X):
            X_train, X_test = X_pca[train_index], X_pca[test_index]
            y_train, y_test = y['surge'][train_index], y['surge'][test_index]

            #train regression model
            rf= RandomForestRegressor(n_estimators = 50, random_state = 101, \
                                      min_samples_leaf = 1)
            rf.fit(X_train, y_train)

            #predictions
            predictions = rf.predict(X_test)
            # pred_obs = pd.concat([pd.DataFrame(np.array(predictions)), \
            #                       pd.DataFrame(np.array(y_test))], \
            #                      axis = 1)
            # pred_obs.columns = ['pred', 'obs']
            # combo = pd.concat([combo, pred_obs], axis = 0)

            #evaluation matrix - check p value
            if stats.pearsonr(y_test, predictions)[1] >= 0.05:
                print("insignificant correlation!")
                continue
            else:
                print(stats.pearsonr(y_test, predictions))
                metric_corr.append(stats.pearsonr(y_test, predictions)[0])
                print(np.sqrt(metrics.mean_squared_error(y_test, predictions)))
                print()
                metric_rmse.append(
                    np.sqrt(metrics.mean_squared_error(y_test, predictions)))

        #number of years used to train/test model
        num_years = (pred_surge['date'][pred_surge.shape[0]-1] -\
                             pred_surge['date'][0]).days/365
        longitude = surge['lon'][0]
        latitude = surge['lat'][0]
        num_pc = X_pca.shape[1]  #number of principal components
        corr = np.mean(metric_corr)
        rmse = np.mean(metric_rmse)

        print('num_year = ', num_years, ' num_pc = ', num_pc ,'avg_corr = ',np.mean(metric_corr), ' -  avg_rmse (m) = ', \
              np.mean(metric_rmse), '\n')

        #original size and pca size of matrix added
        new_df = pd.DataFrame(
            [tg_name, longitude, latitude, num_years, num_pc, corr, rmse]).T
        new_df.columns = ['tg', 'lon', 'lat', 'num_year', \
                                 'num_95pcs','corrn', 'rmse']
        df = pd.concat([df, new_df], axis=0)

        #save df as csv - in case of interruption
        os.chdir(dir_out)
        df.to_csv(tg_name)