Example #1
def mul_dtree(X, Y2):
    forest = ExtraTreesRegressor(n_estimators=5,
                                 compute_importances=True,
                                 random_state=0)
    forest.fit(X[:200], Y2[:200])
    forest.predict(X[200:])
    print Y2[200:]
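
The snippet above targets an old scikit-learn release: the compute_importances argument has since been removed, and the print statement is Python 2. A minimal, self-contained sketch of the same fit/predict flow on a current scikit-learn (the synthetic data is purely illustrative):

import numpy as np
from sklearn.ensemble import ExtraTreesRegressor

# Illustrative data: 300 rows, 4 features
rng = np.random.RandomState(0)
X = rng.rand(300, 4)
Y2 = 2.0 * X[:, 0] + 0.1 * rng.rand(300)

forest = ExtraTreesRegressor(n_estimators=5, random_state=0)
forest.fit(X[:200], Y2[:200])           # train on the first 200 rows
print(forest.predict(X[200:])[:5])      # predict the held-out rows
print(forest.feature_importances_)      # available after fit; replaces compute_importances=True
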
Example #2
    def train(self, verbose=False, training_data=None):
        n_estimators = 50
        n_samples = 5000

        trainingDataDict = self._getTrainingData(numSamples=n_samples)

        X = np.array(trainingDataDict['rot_line_test_deriv'], dtype=np.float32)
        y = np.array(trainingDataDict['solution_data'][0], dtype=np.float32)
        dtr0 = ExtraTreesRegressor(n_estimators=n_estimators)
        dtr0 = dtr0.fit(X, y)

        X = np.array(trainingDataDict['rot_line_test_deriv'], dtype=np.float32)
        y = np.array(trainingDataDict['solution_data'][1], dtype=np.float32)
        dtr1 = ExtraTreesRegressor(n_estimators=n_estimators)
        dtr1 = dtr1.fit(X, y)

        X = np.array(trainingDataDict['scaled_img'], dtype=np.float32)
        y = np.array(trainingDataDict['solution_data'][0], dtype=np.float32)
        str0 = ExtraTreesRegressor(n_estimators=n_estimators)
        str0 = str0.fit(X, y)

        X = np.array(trainingDataDict['scaled_img'], dtype=np.float32)
        y = np.array(trainingDataDict['solution_data'][1], dtype=np.float32)
        str1 = ExtraTreesRegressor(n_estimators=n_estimators)
        str1 = str1.fit(X, y)


        trainingDataDict = self._getTrainingData(startPos=n_samples+1, numSamples=n_samples)

        dtr0Pred = [dtr0.predict(trainingDataDict['rot_line_test_deriv'][i]) for i in range(len(trainingDataDict['rot_line_test_deriv']))]
        dtr1Pred = [dtr1.predict(trainingDataDict['rot_line_test_deriv'][i]) for i in range(len(trainingDataDict['rot_line_test_deriv']))]
        str0Pred = [str0.predict(trainingDataDict['scaled_img'][i]) for i in range(len(trainingDataDict['scaled_img']))]
        str1Pred = [str1.predict(trainingDataDict['scaled_img'][i]) for i in range(len(trainingDataDict['scaled_img']))]

        X = np.array([[dtr0Pred[i][0], str0Pred[i][0]] for i in xrange(len(dtr0Pred))], dtype=np.float32)
        y = np.array(trainingDataDict['solution_data'][0], dtype=np.float32)
        ftr0 = ExtraTreesRegressor(n_estimators=n_estimators)
        ftr0 = ftr0.fit(X, y)

        X = np.array([(dtr1Pred[i][0], str1Pred[i][0]) for i in xrange(len(dtr1Pred))], dtype=np.float32)
        y = np.array(trainingDataDict['solution_data'][1], dtype=np.float32)
        ftr1 = ExtraTreesRegressor(n_estimators=n_estimators)
        ftr1 = ftr1.fit(X, y)

        self.dtr0 = dtr0
        self.dtr1 = dtr1
        self.str0 = str0
        self.str1 = str1
        self.ftr0 = ftr0
        self.ftr1 = ftr1

        self.areModelsTrained = True
Example #3
def estimate():
    from loadData import loadSets
    from helper import splitDataset, separateTargetFromTrain
    from sklearn.ensemble import ExtraTreesRegressor
    import numpy as np
    import math

    best_rmsle = 2
    best_i = 0
    
    trainingSet, testingSet = loadSets()
    testingSet = None

    trainingData, testingData = splitDataset(trainingSet, 0.6)
    testingData, validationData = splitDataset(testingData, 0.5)
    trainingSet = None
    
    trainingTarget, trainingFeatures = separateTargetFromTrain(trainingData)
    testingTarget, testingFeatures = separateTargetFromTrain(testingData)
    validationTarget, validationFeatures = separateTargetFromTrain(validationData)

    testingTarget = testingTarget.values
    validationTarget = validationTarget.values
    
    trainingData = None
    testingData = None
    validationData = None    
    
    for i in range(2000, 3001, 1000):
        model = ExtraTreesRegressor(n_estimators = i, n_jobs = -1)
        model.fit(trainingFeatures, trainingTarget)
        
        predictions = model.predict(testingFeatures)
                
        cost = pow(np.log(predictions + 1) - np.log(testingTarget + 1), 2)
        rmsle = math.sqrt(np.mean(cost))
        print i, " estimators: ", rmsle
        
        if rmsle < best_rmsle:
            best_rmsle = rmsle
            best_i = i
            
    print "Best: ", best_i, " estimators with rmsle: ", best_rmsle
    
    model = ExtraTreesRegressor(n_estimators = best_i, n_jobs = -1)
    model.fit(trainingFeatures, trainingTarget)
    predictions = model.predict(validationFeatures)
            
    cost = pow(np.log(predictions + 1) - np.log(validationTarget + 1), 2)
    rmsle = math.sqrt(np.mean(cost))
    
    print "Final model cost: ", rmsle
Example #4
def do_etrees(filename):
    df, Y = create_merged_dataset(filename)
    etree = ExtraTreesRegressor(n_estimators=200, n_jobs=-1, min_samples_leaf=5, random_state=SEED)
    X = df.drop(['driver', 'trip'], 1)
    etree.fit(X, Y)
    probs = etree.predict(X[:200])
    return pd.DataFrame({'driver': df['driver'][:200], 'trip': df['trip'][:200], 'probs': probs})
Example #5
def predict_with_one(X, out_file_name):
    n_samples, n_features = X.shape
    iter_num = 3
    div = ShuffleSplit(n_samples, n_iter=iter_num, test_size=0.2, random_state=0)
    model = ExtraTreesRegressor(n_estimators=5)
    score_matrix = np.zeros((n_features, n_features))

    t = time()
    round_num = 0
    for train, test in div:
        round_num += 1
        train_samples = X[np.array(train)]
        test_samples = X[np.array(test)]
        for i in range(n_features):
            for j in range(n_features):
                X_train = train_samples[:, i:i+1]
                X_test = test_samples[:, i:i+1]
                y_train = train_samples[:, j]
                y_test = test_samples[:, j]
        # for i in range(len(fl)):
        #     for j in range(len(fl)):
        #         if fl[j][1]-fl[j][0] != 1:
        #             continue
        #         X_train = train_samples[:, fl[i][0]:fl[i][1]]
        #         X_test = test_samples[:, fl[i][0]:fl[i][1]]
        #         y_train = train_samples[:, fl[j][0]]
        #         y_test = test_samples[:, fl[j][0]]
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                mae = mean_absolute_error(y_test, y_pred)
                score_matrix[i, j] += mae
                print('Round', round_num, '|', i, j, mae, time()-t)
    np.savetxt(os.path.join(CODE_PATH, out_file_name),
               score_matrix/iter_num, fmt='%.3f', delimiter=',')
Example #6
def build_extra_tree_regressor(X_test, X_train_full, y_train_full):


    print "Building ExtraTrees regressor..."
    etr = ExtraTreesRegressor(n_estimators=500)
    etr.fit(X_train_full, y_train_full)
    etr_predict = etr.predict(X_test)

    return etr_predict
Example #7
def classify(self):
    """Perform classification"""
    clf = ETRegressor(n_estimators=500, min_samples_split=5, min_samples_leaf=2)
    #pca = PCA(n_components = 400)
    #self._ClassifyDriver__traindata = pca.fit_transform(self._ClassifyDriver__traindata)
    #self._ClassifyDriver__testdata = pca.transform(self._ClassifyDriver__testdata)
    #print self._ClassifyDriver__traindata.shape
    clf.fit(self._ClassifyDriver__traindata, self._ClassifyDriver__trainlabels)
    self._ClassifyDriver__y = clf.predict(self._ClassifyDriver__testdata)
Example #8
def extra_trees_regressor(x, y, n_estimators, max_depth):
    kf = KFold(len(x), n_folds=3)
    scores = []
    for train_index, test_index in kf:
        X_train, X_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf = ExtraTreesRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=0)
        clf.fit(X_train, y_train)
        scores.append(mean_squared_error(clf.predict(X_test), y_test) ** 0.5)
    return np.mean(scores)
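
KFold(len(x), n_folds=3) above is the legacy sklearn.cross_validation signature. For reference, a sketch of the same cross-validated RMSE loop against the current sklearn.model_selection API (assuming x and y are NumPy arrays):

import numpy as np
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

def extra_trees_regressor_cv(x, y, n_estimators, max_depth):
    scores = []
    for train_index, test_index in KFold(n_splits=3).split(x):
        clf = ExtraTreesRegressor(n_estimators=n_estimators,
                                  max_depth=max_depth, random_state=0)
        clf.fit(x[train_index], y[train_index])
        scores.append(mean_squared_error(y[test_index],
                                         clf.predict(x[test_index])) ** 0.5)
    return np.mean(scores)
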
Example #9
def reg_skl_etr(param, data):
    [X_tr, X_cv, y_class_tr, y_class_cv, y_reg_tr, y_reg_cv] = data
    etr = ExtraTreesRegressor(n_estimators=param['n_estimators'],
                              max_features=param['max_features'],
                              n_jobs=param['n_jobs'],
                              random_state=param['random_state'])
    etr.fit(X_tr, y_reg_tr)
    pred = etr.predict(X_cv)
    RMSEScore = getscoreRMSE(y_reg_cv, pred)
    return RMSEScore, pred
Example #10
class MyExtraTreeReg(MyRegressor):
    def __init__(self, params=dict()):
        self._params = params
        self._extree = ExtraTreesRegressor(**(self._params))

    def update_params(self, updates):
        self._params.update(updates)
        self._extree = ExtraTreesRegressor(**(self._params))

    def fit(self, Xtrain, ytrain):
        self._extree.fit(Xtrain, ytrain)

    def predict(self, Xtest, option = None):
        return self._extree.predict(Xtest)

    def plt_feature_importance(self, fname_list, f_range = list()):
        importances = self._extree.feature_importances_

        std = np.std([tree.feature_importances_ for tree in self._extree.estimators_], axis=0)
        indices = np.argsort(importances)[::-1]

        fname_array = np.array(fname_list)

        if not f_range:
            f_range = range(indices.shape[0])

        n_f = len(f_range)

        plt.figure()
        plt.title("Extra Tree Feature importances")
        plt.barh(range(n_f), importances[indices[f_range]],
               color="b", xerr=std[indices[f_range]], ecolor='k',align="center")
        plt.yticks(range(n_f), fname_array[indices[f_range]])
        plt.ylim([-1, n_f])
        plt.show()


    def list_feature_importance(self, fname_list, f_range = list(), return_list = False):
        importances = self._extree.feature_importances_
        indices = np.argsort(importances)[::-1]

        print 'Extra tree feature ranking:'

        if not f_range :
            f_range = range(indices.shape[0])

        n_f = len(f_range)

        for i in range(n_f):
            f = f_range[i]
            print '{0:d}. feature[{1:d}]  {2:s}  ({3:f})'.format(f + 1, indices[f], fname_list[indices[f]], importances[indices[f]])

        if return_list:
            return [indices[f_range[i]] for i in range(n_f)]
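
A hypothetical usage sketch for the wrapper above; the MyRegressor base class, the numpy/matplotlib imports, and the module's Python 2 environment are assumed, and the data and feature names below are placeholders:

import numpy as np

X = np.random.rand(200, 3)
y = np.random.rand(200)
feature_names = ['f0', 'f1', 'f2']

reg = MyExtraTreeReg(params={'n_estimators': 50, 'random_state': 0})
reg.fit(X, y)
preds = reg.predict(X)
ranked = reg.list_feature_importance(feature_names, return_list=True)  # prints the ranking
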
Example #11
def algorithm_ExtraTrees(X_train,Y_train,X_validation,Y_validation, seed=7):


    # Train the model
    scaler = StandardScaler().fit(X_train)
    rescaledX = scaler.transform(X_train)
    gbr = ExtraTreesRegressor(n_estimators=80)
    gbr.fit(X=rescaledX, y=Y_train)
    # Evaluate the model
    rescaledX_validation = scaler.transform(X_validation)
    predictions = gbr.predict(rescaledX_validation)
    print(mean_squared_error(Y_validation, predictions))
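
For reference, a sketch of the same scale-then-fit steps expressed as a single scikit-learn Pipeline, so the validation split is always transformed with the scaler fitted on the training split (synthetic data for illustration only):

import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_squared_error

X_train, Y_train = np.random.rand(200, 4), np.random.rand(200)
X_validation, Y_validation = np.random.rand(50, 4), np.random.rand(50)

model = make_pipeline(StandardScaler(), ExtraTreesRegressor(n_estimators=80))
model.fit(X_train, Y_train)
print(mean_squared_error(Y_validation, model.predict(X_validation)))
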
Example #12
class ModelERT:

    def __init__(self, model_set_name, i_fold):
        self.model_set_name = model_set_name
        self.i_fold = i_fold

    def set_params(self, prms):
        self.prms = prms

    def set_data(self, labels_tr, labels_te, data_tr, data_te):
        self.labels_tr = labels_tr
        self.labels_te = labels_te
        self.data_tr = data_tr
        self.data_te = data_te

    def train(self):
        print "start ert"
        self.model = ExtraTreesRegressor(n_jobs=self.prms["n_jobs"],
                                         verbose=1,
                                         random_state=self.prms["random_state"],
                                         n_estimators=int(self.prms["n_estimators"]),
                                         max_features=self.prms["max_features"])
        self.model.fit(self.data_tr.values, self.labels_tr)

    def predict(self):
        return self.model.predict(self.data_te.values)

    def predict_train(self):
        return self.model.predict(self.data_tr.values)

    def dump_model(self):
        pass

    def dump_pred(self, pred, name):
        folder = config.get_model_folder(self.model_set_name, self.i_fold)
        Files.mkdir(folder)
        path = config.get_model_path(self.model_set_name, name, self.i_fold)
        joblib.dump(pred, path)
Example #13
def dummie_columns_extra_trees(train, test):
    from sklearn.ensemble import ExtraTreesRegressor
    print "-- {} --".format("Extremely Randomized Trees Regression using all but remarks")
    predicting_columns = list(train._get_numeric_data().columns.values)
    predicting_columns.remove("LISTPRICE")
    predicting_columns.remove("SOLDPRICE")
    rf = ExtraTreesRegressor(
        n_estimators=300, n_jobs=-1)
    rf.fit(train[predicting_columns], train["SOLDPRICE"])
    score = rf.score(test[predicting_columns], test["SOLDPRICE"])
    predictions = rf.predict(test[predicting_columns])
    sample_predictions(test, predictions)
    print "Accuracy: {}\n".format(score)
    return score, predictions
Example #14
def baseline_extra(train_x, train_y,
                   test_x, test_y, n, d,
                   result_path="review_baseline_extra.txt"):
    predict = []
    clf = ExtraTreesRegressor(n_estimators=n,
                              max_depth=d,
                              random_state=0)
    clf = clf.fit(train_x, train_y)
    predict = clf.predict(test_x).tolist()
    result = pd.DataFrame([], columns=['review_count', 'predict'])
    result['review_count'] = test_y
    result['predict'] = predict
    result.to_csv(result_path, index=False)
    rmse = mean_squared_error(predict, test_y) ** 0.5
    return rmse
Example #15
def simple_extremely_random_trees(data_train_x, data_test_x, data_train_y, data_test_y):
    from sklearn.ensemble import ExtraTreesRegressor
    print "-- {} --".format("Extremely Randomized Trees Regression using all but remarks")
    rf = ExtraTreesRegressor(
        n_estimators=300,
        n_jobs=-1
    )
    rf.fit(data_train_x, data_train_y)
    sample_predictions(rf.predict(data_test_x), data_test_y)
    score = rf.score(data_test_x, data_test_y)
    cross_validated_scores = cross_val_score(
        rf, data_test_x, data_test_y, cv=5)
    print "MSE Accuracy: {}".format(score)
    print "MSE Across 5 Folds: {}".format(cross_validated_scores)
    print "95%% Confidence Interval: %0.3f (+/- %0.3f)\n" % (cross_validated_scores.mean(), cross_validated_scores.std() * 1.96)
Example #16
def predict_for(output, cycles, tests, raw_tests, inputs):
    x_train, x_test, y_train, y_test = train_test_split(cycles[inputs],
                                                        cycles[output],
                                                        test_size=0.25,
                                                        random_state=33)
    scaler_x  = StandardScaler().fit(x_train)
    scaler_t  = StandardScaler().fit(tests)
    x_train   = scaler_x.transform(x_train)
    x_test    = scaler_x.transform(x_test)
    tests     = scaler_t.transform(tests)

    clf_et = ExtraTreesRegressor(n_estimators=10,
                                 compute_importances=True, random_state=42)
    clf_et.fit(x_train, y_train)

    ps = clf_et.predict(tests)
    return {dt: int(round(p)) for dt, p in zip(raw_tests['datetime'], ps)}
Example #17
def baseline_extra_leave_one_out(train_raw_x, test_raw_x, test_ids, n=40, d=40, result_path="baseline_extra.txt"):
    predict = []
    for test_id in test_ids:
        train_x = train_raw_x[train_raw_x.business_id != test_id]
        train_y = train_raw_x[train_raw_x.business_id != test_id].stars.as_matrix()
        train_x = train_x.drop(["business_id", "stars"], 1).as_matrix()
        clf = ExtraTreesRegressor(n_estimators=n, max_depth=d, random_state=0)
        clf = clf.fit(train_x, train_y)
        test_x = test_raw_x[test_raw_x.business_id == test_id]
        test_x = test_x.drop(["business_id", "stars"], 1).as_matrix()
        predict.append(clf.predict(test_x)[0])
    result = pd.DataFrame([], columns=["stars", "predict"])
    result["stars"] = test_raw_x.stars
    result["predict"] = predict
    result = result.sort("stars", ascending=0)
    result.to_csv(result_path, index=False)
    rmse = mean_squared_error(predict, test_raw_x.stars.as_matrix()) ** 0.5
    return rmse
Example #18
def buildModelOheETR(train_data, eval_data, train_labels, seed):
    train_data = sparse.csr_matrix(train_data)
    eval_data = sparse.csr_matrix(eval_data)
    clf = ExtraTreesRegressor(n_estimators=500, max_depth=38, min_samples_leaf=2,min_samples_split=6,\
        max_features='auto', n_jobs=-1, random_state=seed, verbose=1)
    clf.fit(train_data, train_labels)
    preds = clf.predict(eval_data)
    preds = np.expm1(preds)

    # transform -ve preds to 0
    for i in range(preds.shape[0]):
        if preds[i] < 0:
            preds[i] = 0
            
    # convert back to log1p
    preds = np.log1p(preds)
            
    return (clf, preds)
Example #19
    def predict(class_id):
        print "predicting: ", class_id
        salaries_idx = np.where(salaries_enc == class_id)
        valid_idx = np.where(valid_salaries_enc == class_id)

        if len(salaries_idx[0]) == 0 or len(valid_idx[0]) == 0:
            return [], None

        classifier = ExtraTreesRegressor(n_estimators=n_trees,
                                        verbose=0,
                                        n_jobs=4, # 2 jobs on submission / 4 on valid test
                                        oob_score=False,
                                        min_samples_split=min_samples_split,
                                        random_state=3465343)

        print features[salaries_idx[0], :].shape
        print salaries[salaries_idx].shape
        classifier.fit(features[salaries_idx[0], :], salaries[salaries_idx])
        predictions_part = classifier.predict(validation_features[valid_idx[0]])
        return predictions_part, valid_idx
Example #20
def get_result():
    ngram_range = (1, 2)
    max_df = 0.75
    max_features = 2000
    v = CountVectorizer(
        ngram_range=ngram_range,
        max_df=max_df,
        max_features=max_features)
    x = v.fit_transform(rats_tr.comments.fillna('')).todense()
    y = rats_tr.quality
    n_estimators = 40
    max_depth = 20
    clf = ExtraTreesRegressor(n_estimators=n_estimators,
                              max_depth=max_depth,
                              random_state=0)
    clf.fit(x, y)

    t_x = v.transform(rats_te.comments.fillna('')).todense()
    t_y = clf.predict(t_x)
    submit = pd.DataFrame(data={'id': rats_te.id, 'quality': t_y})
    submit.to_csv('ridge_submit.csv', index=False)
Example #21
    def predict(class_id, param):
        print "predicting: ", class_id
        param += "\npredicting: %s\n" % (le_features[col_index].classes_[class_id],)
        salaries_idx = np.where(feature_category == class_id)
        valid_idx = np.where(validation_features_category == class_id)
        param += "Salaries len: %d, valid len: %d\n" % (len(salaries_idx[0]), len(valid_idx[0]))

        if len(salaries_idx[0]) == 0 or len(valid_idx[0]) == 0:
            return [], None, param

        classifier = ExtraTreesRegressor(n_estimators=n_trees,
                                        verbose=0,
                                        n_jobs=4, # 2 jobs on submission / 4 on valid test
                                        oob_score=False,
                                        min_samples_split=min_samples_split,
                                        random_state=3465343)

        print features[salaries_idx[0], :].shape
        print salaries[salaries_idx].shape
        print validation_features[0].shape
        classifier.fit(features[salaries_idx[0], :], salaries[salaries_idx])
        predictions_part = classifier.predict(validation_features[valid_idx[0]])
        return predictions_part, valid_idx, param
Example #22
def load_model():  # load the model once when the service starts; called only once
    f = open('bpinall.txt', 'r').readlines()
    num_rows = len(f)
    num_col = len(f[0].split(','))
    x = np.zeros((num_rows, num_col), dtype=float)
    y = np.zeros((num_rows,), dtype=float)
    for i, line in enumerate(f):
        line = line.strip('\r\n').strip()
        if line.count(',') > 0:
            x[i] = [float(p) for p in line.split(',')]
    f2 = open('bpoutall.txt', 'r').readlines()
    for i, line in enumerate(f2):
        line = line.strip('\r\n')
        y[i] = float(line)
    clf = ExtraTreesRegressor(verbose=0)
    print(x)
    clf.fit(x[:-1], y[:-1])
    pq = clf.predict(x[-1:])  # keep the last row 2-D for predict
    print(pq, y[-1])
    # global clfp
    pickle.dump(clf, open('modelb.pkl', 'wb'))
    return pq
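
Since the function above pickles the fitted model to modelb.pkl, a companion sketch for the prediction side of the service could look like this (assuming a scikit-learn recent enough that fitted estimators expose n_features_in_):

import pickle
import numpy as np

with open('modelb.pkl', 'rb') as fh:
    clf = pickle.load(fh)

row = np.zeros((1, clf.n_features_in_))  # placeholder feature row; must match the training width
print(clf.predict(row))
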
Example #23
gbr_tr_fit = GradientBoostingRegressor(n_estimators =10,max_depth=7)
gbr_tr_fit = gbr_tr_fit.fit(transformed_train_gbr,target_train)
mix_test_list += [pd.Series(gbr_tr_fit.predict(transformed_test_gbr),index=data_test_in.id.astype(int),name='gbr_tr')]
mix_train_list += [pd.Series(gbr_tr_fit.predict(transformed_train_gbr),index=data_train_in.id.astype(int),name='gbr_tr')]
end_gbr_tr = time.clock()
print >> log, "time_gbr_tr = ", end_gbr_tr-start_gbr_tr

start_xfr_tr = time.clock()
xfr= ExtraTreesRegressor(n_estimators =10,max_depth=7)
xfr_tr = xfr.fit(data_train,target_train)
transformed_train_xfr = xfr_tr.transform(data_train,threshold="0.35*mean")
print >> log, 'transformed_train_xfr',transformed_train_xfr.shape
transformed_test_xfr = xfr_tr.transform(data_test,threshold="0.35*mean")
xfr_tr_fit = ExtraTreesRegressor(n_estimators =10,max_depth=7)
xfr_tr_fit = xfr_tr_fit.fit(transformed_train_xfr,target_train)
mix_test_list += [pd.Series(xfr_tr_fit.predict(transformed_test_xfr),index=data_test_in.id.astype(int),name='xfr_tr')]
mix_train_list += [pd.Series(xfr_tr_fit.predict(transformed_train_xfr),index=data_train_in.id.astype(int),name='xfr_tr')]
end_xfr_tr = time.clock()
print >> log, "time_xfr_tr = ", end_xfr_tr-start_xfr_tr

start_gbr_cat = time.clock()
gbr_cat_fit = GradientBoostingRegressor(n_estimators =10,max_depth=7)
gbr_cat_fit = gbr_cat_fit.fit(data_train[catcol],target_train)
mix_test_list += [pd.Series(gbr_cat_fit.predict(data_test[catcol]),index=data_test_in.id.astype(int),name='gbr_cat')]
mix_train_list += [pd.Series(gbr_cat_fit.predict(data_train[catcol]),index=data_train_in.id.astype(int),name='gbr_cat')]
end_gbr_cat = time.clock()
print >> log, "time_gbr_cat = ", end_gbr_cat-start_gbr_cat

start_xfr_cat = time.clock()
xfr_cat_fit = ExtraTreesRegressor(n_estimators =10,max_depth=7)
xfr_cat_fit = xfr_cat_fit.fit(data_train[catcol],target_train)
Example #24
    if (method==11):
        print('Ridge')
        str_method = 'Ridge'
        r = Ridge()
        
    if (method==12):
        print('Huber')
        str_method = 'Huber'
        r = HuberRegressor(fit_intercept=True, alpha=0.065, max_iter=160, epsilon=1.2)
        
        
    r.fit(x1[col], y1)


    a1 = NWRMSLE(y2, r.predict(x2[col]), x2['perishable'])
    # part of the output file name
    N1 = str(a1)
    
    test['transactions'] = r.predict(test[col])
    test['transactions'] = test['transactions'].clip(lower=0.+1e-15)

    col = [c for c in x1 if c not in ['id', 'unit_sales','perishable']]
    y1 = x1['unit_sales'].values
    y2 = x2['unit_sales'].values


    # set a new seed to generate random numbers
    ra2 = round(method + 547*method + 182*method) 
    np.random.seed(ra2)
Example #25
    new_final = new_final.append(final[final.index == i])
testeco = pd.concat([test, new_final], axis=1)
testeco.to_csv('testeco_lstm.csv')
print("test data after combining :" + str(testeco.shape))

#Now train the model
test = pd.read_csv("testeco_lstm.csv")
train = pd.read_csv("traineco_lstm.csv")
gg = train.fillna(train.median())
y = gg['target']
X = gg.drop(['id', 'target'], axis=1)
print("X_shape:" + str(X.shape), " , y_shape :" + str(y.shape))
X_train, X_cv, y_train, y_cv = train_test_split(X,
                                                y,
                                                test_size=0.2,
                                                random_state=42)
from sklearn.ensemble import ExtraTreesRegressor
extra_tree = ExtraTreesRegressor(n_estimators=500, random_state=1234)
extra_tree.fit(X_train, y_train)
ypredictions = extra_tree.predict(X_cv)
print(" Root Mean Absolute Error : ",
      sqrt(mean_squared_error(ypredictions, y_cv)))
extra_tree.fit(X, y)
test2 = test.drop(['id'], axis=1)
test2 = test2.fillna(test2.median())
predictions = extra_tree.predict(test2)
pred = pd.DataFrame(predictions)
pred = pred.set_index([test['id']])
pred.to_csv("extra_tree_500.csv")

# Our best submission is extra_tree_500, which scored 0.98098 on the leaderboard using ExtraTreesRegressor(n_estimators=500, random_state=1234) with otherwise default settings.
Example #26
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'bootstrap': bootstrap
    }
    scores = ['neg_mean_absolute_error']
    for score in scores:
        forest = GridSearchCV(ExtraTreesRegressor(random_state=1),
                              tuned_parameters,
                              verbose=10,
                              cv=5,
                              n_jobs=-1,
                              scoring='%s' % score)

        forest.fit(X_train, y_train)
        model_train = forest.predict(X_train)
        model_test = forest.predict(X_test)
        r2_score_train = r2_score(y_train, model_train)
        mse_score_train = mean_squared_error(y_train, model_train)
        mae_score_train = mean_absolute_error(y_train, model_train)
        rmse_score_train = np.sqrt(mse_score_train)
        r2_score_test = r2_score(y_test, model_test)
        mse_score_test = mean_squared_error(y_test, model_test)
        mae_score_test = mean_absolute_error(y_test, model_test)
        rmse_score_test = np.sqrt(mse_score_test)

        dump(forest, 'rf_s_vs_e.pkl')

    if args.dielectric is True:
        if args.outlier_removal is True:
            f = open('hyperpameters_outlier_removal_dielectric.txt', mode='w')
Example #27
class ExtraTreesRegressor(AutoSklearnRegressionAlgorithm):
    def __init__(self,
                 n_estimators,
                 criterion,
                 min_samples_leaf,
                 min_samples_split,
                 max_features,
                 max_leaf_nodes_or_max_depth="max_depth",
                 bootstrap=False,
                 max_leaf_nodes=None,
                 max_depth="None",
                 oob_score=False,
                 n_jobs=1,
                 random_state=None,
                 verbose=0):
        super(ExtraTreesRegressor, self).__init__()
        self.n_estimators = int(n_estimators)
        self.estimator_increment = 10
        if criterion not in ("mse"):
            raise ValueError("'criterion' is not in ('mse'): "
                             "%s" % criterion)
        self.criterion = criterion

        if max_leaf_nodes_or_max_depth == "max_depth":
            self.max_leaf_nodes = None
            if max_depth == "None":
                self.max_depth = None
            else:
                self.max_depth = int(max_depth)
                #if use_max_depth == "True":
                #    self.max_depth = int(max_depth)
                #elif use_max_depth == "False":
                #    self.max_depth = None
        else:
            if max_leaf_nodes == "None":
                self.max_leaf_nodes = None
            else:
                self.max_leaf_nodes = int(max_leaf_nodes)
            self.max_depth = None

        self.min_samples_leaf = int(min_samples_leaf)
        self.min_samples_split = int(min_samples_split)

        self.max_features = float(max_features)

        if bootstrap == "True":
            self.bootstrap = True
        elif bootstrap == "False":
            self.bootstrap = False

        self.oob_score = oob_score
        self.n_jobs = int(n_jobs)
        self.random_state = random_state
        self.verbose = int(verbose)
        self.estimator = None

    def fit(self, X, y, refit=False):
        if self.estimator is None or refit:
            self.iterative_fit(X, y, n_iter=1, refit=refit)

        while not self.configuration_fully_fitted():
            self.iterative_fit(X, y, n_iter=1)
        return self

    def iterative_fit(self, X, y, n_iter=1, refit=False):
        from sklearn.ensemble import ExtraTreesRegressor as ETR

        if refit:
            self.estimator = None

        if self.estimator is None:
            num_features = X.shape[1]
            max_features = int(
                float(self.max_features) * (np.log(num_features) + 1))
            # Use at most half of the features
            max_features = max(1, min(int(X.shape[1] / 2), max_features))
            self.estimator = ETR(n_estimators=0,
                                 criterion=self.criterion,
                                 max_depth=self.max_depth,
                                 min_samples_split=self.min_samples_split,
                                 min_samples_leaf=self.min_samples_leaf,
                                 bootstrap=self.bootstrap,
                                 max_features=max_features,
                                 max_leaf_nodes=self.max_leaf_nodes,
                                 oob_score=self.oob_score,
                                 n_jobs=self.n_jobs,
                                 verbose=self.verbose,
                                 random_state=self.random_state,
                                 warm_start=True)
        tmp = self.estimator  # TODO copy ?
        tmp.n_estimators += n_iter
        tmp.fit(
            X,
            y,
        )
        self.estimator = tmp
        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        return not len(self.estimator.estimators_) < self.n_estimators

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict_proba(X)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {
            'shortname': 'ET',
            'name': 'Extra Trees Regressor',
            'handles_regression': True,
            'handles_classification': False,
            'handles_multiclass': False,
            'handles_multilabel': False,
            'is_deterministic': True,
            'input': (DENSE, SPARSE, UNSIGNED_DATA),
            'output': (PREDICTIONS, ),
        }

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        cs = ConfigurationSpace()

        n_estimators = cs.add_hyperparameter(Constant("n_estimators", 100))
        criterion = cs.add_hyperparameter(Constant("criterion", "mse"))
        max_features = cs.add_hyperparameter(
            UniformFloatHyperparameter("max_features", 0.5, 5, default=1))

        max_depth = cs.add_hyperparameter(
            UnParametrizedHyperparameter(name="max_depth", value="None"))

        min_samples_split = cs.add_hyperparameter(
            UniformIntegerHyperparameter("min_samples_split", 2, 20,
                                         default=2))
        min_samples_leaf = cs.add_hyperparameter(
            UniformIntegerHyperparameter("min_samples_leaf", 1, 20, default=1))

        # Unparametrized, we use min_samples as regularization
        # max_leaf_nodes_or_max_depth = UnParametrizedHyperparameter(
        # name="max_leaf_nodes_or_max_depth", value="max_depth")
        # CategoricalHyperparameter("max_leaf_nodes_or_max_depth",
        # choices=["max_leaf_nodes", "max_depth"], default="max_depth")
        # min_weight_fraction_leaf = UniformFloatHyperparameter(
        #    "min_weight_fraction_leaf", 0.0, 0.1)
        # max_leaf_nodes = UnParametrizedHyperparameter(name="max_leaf_nodes",
        #                                              value="None")

        bootstrap = cs.add_hyperparameter(
            CategoricalHyperparameter("bootstrap", ["True", "False"],
                                      default="False"))

        # Conditions
        # Not applicable because max_leaf_nodes is no legal value of the parent
        #cond_max_leaf_nodes_or_max_depth = \
        #    EqualsCondition(child=max_leaf_nodes,
        #                    parent=max_leaf_nodes_or_max_depth,
        #                    value="max_leaf_nodes")
        #cond2_max_leaf_nodes_or_max_depth = \
        #    EqualsCondition(child=use_max_depth,
        #                    parent=max_leaf_nodes_or_max_depth,
        #                    value="max_depth")

        #cond_max_depth = EqualsCondition(child=max_depth, parent=use_max_depth,
        #value="True")
        #cs.add_condition(cond_max_leaf_nodes_or_max_depth)
        #cs.add_condition(cond2_max_leaf_nodes_or_max_depth)
        #cs.add_condition(cond_max_depth)

        return cs
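
iterative_fit above relies on scikit-learn's warm_start mechanism: raising n_estimators on an already-fitted forest and calling fit again only adds new trees. A minimal standalone sketch of that pattern (synthetic data and tree counts chosen only for illustration):

import numpy as np
from sklearn.ensemble import ExtraTreesRegressor

X = np.random.rand(200, 5)
y = np.random.rand(200)

est = ExtraTreesRegressor(n_estimators=0, warm_start=True, random_state=0)
for _ in range(5):
    est.n_estimators += 10    # add 10 trees per iteration
    est.fit(X, y)             # only the new trees are fitted
print(len(est.estimators_))   # 50
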
Example #28
def hyperopt_obj(param, feat_folder, feat_name, trial_counter):

    log_loss_cv = np.zeros((config.n_runs, config.n_folds), dtype=float)
    year = datetime.datetime.now().year
    for run in range(1, config.n_runs + 1):  # range(start, end) includes start, excludes end
        for fold in range(1, config.n_folds + 1):
            rng = np.random.RandomState(year + 1000 * run + 10 * fold)

            #### all the path
            path = "%s/Run%d/Fold%d" % (feat_folder, run, fold)
            save_path = "%s/Run%d/Fold%d" % (output_path, run, fold)
            if not os.path.exists(save_path):
                os.makedirs(save_path)
            # feat: combine feat file
            feat_train_path = "%s/train.feat" % path
            feat_valid_path = "%s/valid.feat" % path
            # # weight
            weight_train_path = "%s/train.feat.weight" % path
            weight_valid_path = "%s/valid.feat.weight" % path
            # info
            info_train_path = "%s/train.info" % path
            info_valid_path = "%s/valid.info" % path
            # cdf
            cdf_valid_path = "%s/valid.cdf" % path
            # raw prediction path (rank)
            raw_pred_valid_path = "%s/valid.raw.pred.%s_[Id@%d].csv" % (
                save_path, feat_name, trial_counter)  #
            rank_pred_valid_path = "%s/valid.pred.%s_[Id@%d].csv" % (
                save_path, feat_name, trial_counter)  #

            ## load feat
            X_train, labels_train = load_svmlight_file(
                feat_train_path
            )  # load_svmlight_file: Load datasets in the svmlight / libsvm format into sparse CSR matrix
            X_valid, labels_valid = load_svmlight_file(feat_valid_path)
            # align feat dim
            if X_valid.shape[1] < X_train.shape[1]:
                X_valid = hstack([
                    X_valid,
                    np.zeros((X_valid.shape[0],
                              X_train.shape[1] - X_valid.shape[1]))
                ])
            elif X_valid.shape[1] > X_train.shape[1]:
                X_train = hstack([
                    X_train,
                    np.zeros((X_train.shape[0],
                              X_valid.shape[1] - X_train.shape[1]))
                ])
            X_train = X_train.tocsr(
            )  # tocsr: Convert this matrix to Compressed Sparse Row format
            X_valid = X_valid.tocsr()
            # ## load weight
            weight_train = np.loadtxt(weight_train_path, dtype=float)
            weight_valid = np.loadtxt(weight_valid_path, dtype=float)

            ## load valid info
            info_train = pd.read_csv(info_train_path)
            numTrain = info_train.shape[0]
            info_valid = pd.read_csv(info_valid_path)
            numValid = info_valid.shape[0]
            Y_valid = info_valid["is_duplicate"]
            ## load cdf
            cdf_valid = np.loadtxt(cdf_valid_path, dtype=float)

            # ## make evalerror funcs (evaluation functions)
            # evalerror_regrank_valid = lambda preds,dtrain: evalerror_regrank_cdf(preds, dtrain, cdf_valid)
            # evalerror_softmax_valid = lambda preds,dtrain: evalerror_softmax_cdf(preds, dtrain, cdf_valid)
            # evalerror_softkappa_valid = lambda preds,dtrain: evalerror_softkappa_cdf(preds, dtrain, cdf_valid)
            # evalerror_ebc_valid = lambda preds,dtrain: evalerror_ebc_cdf(preds, dtrain, cdf_valid, ebc_hard_threshold)
            # evalerror_cocr_valid = lambda preds,dtrain: evalerror_cocr_cdf(preds, dtrain, cdf_valid)

            ##############
            ## Training ##
            ##############
            ## you can use bagging to stabilize the predictions
            preds_bagging = np.zeros((numValid, bagging_size), dtype=float)
            for n in range(bagging_size):
                if bootstrap_replacement:
                    sampleSize = int(
                        numTrain *
                        bootstrap_ratio)  # bootstrap_ratio: fraction of training samples to use
                    index_base = rng.randint(numTrain, size=sampleSize)
                    index_meta = [
                        i for i in range(numTrain) if i not in index_base
                    ]
                else:
                    randnum = rng.uniform(size=numTrain)  # draw a random number in [0, 1) for each training row
                    index_base = [
                        i for i in range(numTrain)
                        if randnum[i] < bootstrap_ratio
                    ]
                    index_meta = [
                        i for i in range(numTrain)
                        if randnum[i] >= bootstrap_ratio
                    ]

                # if the model is xgboost, first convert the data into xgboost's DMatrix format
                if "booster" in param:
                    dvalid_base = xgb.DMatrix(
                        X_valid, label=labels_valid)  # , weight=weight_valid
                    dtrain_base = xgb.DMatrix(
                        X_train[index_base], label=labels_train[index_base]
                    )  # , weight=weight_train[index_base]

                    watchlist = []
                    if verbose_level >= 2:
                        watchlist = [(dtrain_base, 'train'),
                                     (dvalid_base, 'valid')]

                ## various models
                if param["task"] in ["regression", "ranking"]:
                    ## regression & pairwise ranking with xgboost
                    bst = xgb.train(
                        param, dtrain_base, param['num_round'],
                        watchlist)  # , feval=evalerror_regrank_valid
                    pred = bst.predict(dvalid_base)

                if param["task"] in ["classification"]:
                    ## regression & pairwise ranking with xgboost
                    bst = xgb.train(
                        param, dtrain_base, param['num_round'],
                        watchlist)  # , feval=evalerror_regrank_valid
                    pred = bst.predict(dvalid_base)

                elif param["task"] in ["softmax"]:
                    ## softmax regression with xgboost
                    bst = xgb.train(
                        param, dtrain_base, param['num_round'],
                        watchlist)  # , feval=evalerror_softmax_valid
                    pred = bst.predict(dvalid_base)
                    w = np.asarray(range(1, numValid))
                    # np.newaxis inserts an axis: pred is 2-D here while
                    # w[np.newaxis, :] is a 1-by-k row (w itself is a 1-D array)
                    pred = pred * w[np.newaxis, :]
                    pred = np.sum(pred, axis=1)

                elif param["task"] in ["softkappa"]:
                    ## softkappa with xgboost (custom objective function)
                    # obj = lambda preds, dtrain: softkappaObj(preds, dtrain, hess_scale=param['hess_scale'])
                    bst = xgb.train(
                        param, dtrain_base, param['num_round'], watchlist
                    )  # , obj=obj, feval=evalerror_softkappa_valid
                    pred = softmax(bst.predict(dvalid_base))
                    w = np.asarray(range(1, numValid))
                    pred = pred * w[np.newaxis, :]
                    pred = np.sum(pred, axis=1)

                elif param["task"] in ["ebc"]:
                    ## ebc with xgboost (custom objective function)
                    # obj = lambda preds, dtrain: ebcObj(preds, dtrain)
                    bst = xgb.train(
                        param, dtrain_base, param['num_round'],
                        watchlist)  # , obj=obj, feval=evalerror_ebc_valid
                    pred = sigmoid(bst.predict(dvalid_base))
                    pred = applyEBCRule(pred,
                                        hard_threshold=ebc_hard_threshold)

                elif param["task"] in ["cocr"]:
                    ## cocr with xgboost (custom objective function)
                    # obj = lambda preds, dtrain: cocrObj(preds, dtrain)
                    bst = xgb.train(
                        param, dtrain_base, param['num_round'],
                        watchlist)  # , obj=obj, feval=evalerror_cocr_valid
                    pred = bst.predict(dvalid_base)
                    pred = applyCOCRRule(pred)

                elif param['task'] == "reg_skl_rf":
                    ## regression with sklearn random forest regressor
                    rf = RandomForestRegressor(
                        n_estimators=param['n_estimators'],
                        max_features=param['max_features'],
                        n_jobs=param['n_jobs'],
                        random_state=param['random_state'])
                    rf.fit(X_train[index_base], labels_train[index_base]
                           )  # , sample_weight=weight_train[index_base]
                    pred = rf.predict(X_valid)

                elif param['task'] == "reg_skl_etr":
                    ## regression with sklearn extra trees regressor
                    etr = ExtraTreesRegressor(
                        n_estimators=param['n_estimators'],
                        max_features=param['max_features'],
                        n_jobs=param['n_jobs'],
                        random_state=param['random_state'])
                    etr.fit(X_train[index_base], labels_train[index_base]
                            )  # , sample_weight=weight_train[index_base]
                    pred = etr.predict(X_valid)

                elif param['task'] == "reg_skl_gbm":
                    ## regression with sklearn gradient boosting regressor
                    gbm = GradientBoostingRegressor(
                        n_estimators=param['n_estimators'],
                        max_features=param['max_features'],
                        learning_rate=param['learning_rate'],
                        max_depth=param['max_depth'],
                        subsample=param['subsample'],
                        random_state=param['random_state'])
                    gbm.fit(X_train.toarray()[index_base],
                            labels_train[index_base]
                            )  # , sample_weight=weight_train[index_base]
                    pred = gbm.predict(X_valid.toarray())

                elif param['task'] == "clf_skl_lr":
                    ## classification with sklearn logistic regression
                    lr = LogisticRegression(penalty="l2",
                                            dual=True,
                                            tol=1e-5,
                                            C=param['C'],
                                            fit_intercept=True,
                                            intercept_scaling=1.0,
                                            class_weight='auto',
                                            random_state=param['random_state'])
                    lr.fit(X_train[index_base], labels_train[index_base])
                    pred = lr.predict_proba(X_valid)
                    w = np.asarray(range(1, numValid))
                    pred = pred * w[np.newaxis, :]
                    pred = np.sum(pred, axis=1)

                elif param['task'] == "reg_skl_svr":
                    ## regression with sklearn support vector regression
                    X_train, X_valid = X_train.toarray(), X_valid.toarray()
                    scaler = StandardScaler()
                    X_train[index_base] = scaler.fit_transform(
                        X_train[index_base])
                    X_valid = scaler.transform(X_valid)
                    svr = SVR(C=param['C'],
                              gamma=param['gamma'],
                              epsilon=param['epsilon'],
                              degree=param['degree'],
                              kernel=param['kernel'])
                    svr.fit(X_train[index_base], labels_train[index_base]
                            )  # , sample_weight=weight_train[index_base]
                    pred = svr.predict(X_valid)

                elif param['task'] == "reg_skl_ridge":
                    ## regression with sklearn ridge regression
                    ridge = Ridge(alpha=param["alpha"], normalize=True)
                    ridge.fit(X_train[index_base], labels_train[index_base]
                              )  # , sample_weight=weight_train[index_base]
                    pred = ridge.predict(X_valid)

                elif param['task'] == "reg_skl_lasso":
                    ## regression with sklearn lasso
                    lasso = Lasso(alpha=param["alpha"], normalize=True)
                    lasso.fit(X_train[index_base], labels_train[index_base])
                    pred = lasso.predict(X_valid)

                elif param['task'] == 'reg_libfm':
                    ## regression with factorization machine (libfm)
                    ## to array
                    X_train = X_train.toarray()
                    X_valid = X_valid.toarray()

                    ## scale
                    scaler = StandardScaler()
                    X_train[index_base] = scaler.fit_transform(
                        X_train[index_base])
                    X_valid = scaler.transform(X_valid)

                    ## dump feat
                    dump_svmlight_file(X_train[index_base],
                                       labels_train[index_base],
                                       feat_train_path + ".tmp")
                    dump_svmlight_file(X_valid, labels_valid,
                                       feat_valid_path + ".tmp")

                    ## train fm
                    cmd = "%s -task r -train %s -test %s -out %s -dim '1,1,%d' -iter %d > libfm.log" % ( \
                                libfm_exe, feat_train_path+".tmp", feat_valid_path+".tmp", raw_pred_valid_path, \
                                param['dim'], param['iter'])
                    os.system(cmd)
                    os.remove(feat_train_path + ".tmp")
                    os.remove(feat_valid_path + ".tmp")

                    ## extract libfm prediction
                    pred = np.loadtxt(raw_pred_valid_path, dtype=float)
                    ## labels are in [0,1,2,3]
                    pred += 1

                elif param['task'] == "reg_keras_dnn":
                    ## regression with keras' deep neural networks
                    model = Sequential()
                    ## input layer
                    model.add(Dropout(param["input_dropout"]))
                    ## hidden layers
                    first = True
                    hidden_layers = param['hidden_layers']
                    while hidden_layers > 0:
                        if first:
                            dim = X_train.shape[1]
                            first = False
                        else:
                            dim = param["hidden_units"]
                        model.add(
                            Dense(dim,
                                  param["hidden_units"],
                                  init='glorot_uniform'))
                        if param["batch_norm"]:
                            model.add(
                                BatchNormalization((param["hidden_units"], )))
                        if param["hidden_activation"] == "prelu":
                            model.add(PReLU((param["hidden_units"], )))
                        else:
                            model.add(Activation(param['hidden_activation']))
                        model.add(Dropout(param["hidden_dropout"]))
                        hidden_layers -= 1

                    ## output layer
                    model.add(
                        Dense(param["hidden_units"], 1, init='glorot_uniform'))
                    model.add(Activation('linear'))

                    ## loss
                    model.compile(loss='mean_squared_error', optimizer="adam")

                    ## to array
                    X_train = X_train.toarray()
                    X_valid = X_valid.toarray()

                    ## scale
                    scaler = StandardScaler()
                    X_train[index_base] = scaler.fit_transform(
                        X_train[index_base])
                    X_valid = scaler.transform(X_valid)

                    ## train
                    model.fit(X_train[index_base],
                              labels_train[index_base],
                              nb_epoch=param['nb_epoch'],
                              batch_size=param['batch_size'],
                              validation_split=0,
                              verbose=0)

                    ##prediction
                    pred = model.predict(X_valid, verbose=0)
                    pred.shape = (X_valid.shape[0], )

                elif param['task'] == "reg_rgf":
                    ## regression with regularized greedy forest (rgf)
                    ## to array
                    X_train, X_valid = X_train.toarray(), X_valid.toarray()

                    train_x_fn = feat_train_path + ".x"
                    train_y_fn = feat_train_path + ".y"
                    valid_x_fn = feat_valid_path + ".x"
                    valid_pred_fn = feat_valid_path + ".pred"

                    model_fn_prefix = "rgf_model"

                    np.savetxt(train_x_fn,
                               X_train[index_base],
                               fmt="%.6f",
                               delimiter='\t')
                    np.savetxt(train_y_fn,
                               labels_train[index_base],
                               fmt="%d",
                               delimiter='\t')
                    np.savetxt(valid_x_fn, X_valid, fmt="%.6f", delimiter='\t')
                    # np.savetxt(valid_y_fn, labels_valid, fmt="%d", delimiter='\t')

                    pars = [
                        "train_x_fn=",
                        train_x_fn,
                        "\n",
                        "train_y_fn=",
                        train_y_fn,
                        "\n",
                        #"train_w_fn=",weight_train_path,"\n",
                        "model_fn_prefix=",
                        model_fn_prefix,
                        "\n",
                        "reg_L2=",
                        param['reg_L2'],
                        "\n",
                        #"reg_depth=", 1.01, "\n",
                        "algorithm=",
                        "RGF",
                        "\n",
                        "loss=",
                        "LS",
                        "\n",
                        #"opt_interval=", 100, "\n",
                        "valid_interval=",
                        param['max_leaf_forest'],
                        "\n",
                        "max_leaf_forest=",
                        param['max_leaf_forest'],
                        "\n",
                        "num_iteration_opt=",
                        param['num_iteration_opt'],
                        "\n",
                        "num_tree_search=",
                        param['num_tree_search'],
                        "\n",
                        "min_pop=",
                        param['min_pop'],
                        "\n",
                        "opt_interval=",
                        param['opt_interval'],
                        "\n",
                        "opt_stepsize=",
                        param['opt_stepsize'],
                        "\n",
                        "NormalizeTarget"
                    ]
                    pars = "".join([str(p) for p in pars])

                    rfg_setting_train = "./rfg_setting_train"
                    with open(rfg_setting_train + ".inp", "wb") as f:
                        f.write(pars)

                    ## train fm
                    cmd = "perl %s %s train %s >> rgf.log" % (
                        call_exe, rgf_exe, rfg_setting_train)
                    #print cmd
                    os.system(cmd)

                    model_fn = model_fn_prefix + "-01"
                    pars = [
                        "test_x_fn=", valid_x_fn, "\n", "model_fn=", model_fn,
                        "\n", "prediction_fn=", valid_pred_fn
                    ]

                    pars = "".join([str(p) for p in pars])

                    rfg_setting_valid = "./rfg_setting_valid"
                    with open(rfg_setting_valid + ".inp", "wb") as f:
                        f.write(pars)
                    cmd = "perl %s %s predict %s >> rgf.log" % (
                        call_exe, rgf_exe, rfg_setting_valid)
                    #print cmd
                    os.system(cmd)

                    pred = np.loadtxt(valid_pred_fn, dtype=float)

                ## weighted averaging over different models
                pred_valid = pred
                ## this bagging iteration
                preds_bagging[:, n] = pred_valid  # column n of preds_bagging holds this bag's predictions
                pred_raw = np.mean(preds_bagging[:, :(n + 1)], axis=1)  # row-wise mean over the bags so far
                # pred_rank = pred_raw.argsort().argsort()    # argsort gives sorting indices; applying it twice yields each element's rank
                # pred_score, cutoff = getScore(pred_rank, cdf_valid, valid=True) # generate scores from the cdf
                # kappa_valid = quadratic_weighted_kappa(pred_score, Y_valid) # compute the kappa score
                log_loss_valid = elementwise.log_loss(Y_valid, pred_raw)
                log_loss_valid = log_loss(Y_valid, pred_raw)
                print('Y_valid mean:', np.mean(Y_valid))
                print('pred_raw mean:', np.mean(pred_raw))
                if (n + 1) != bagging_size:
                    print(
                        "              {:>3}   {:>3}   {:>3}   {:>6}   {} x {}"
                        .format(run, fold, n + 1, np.round(log_loss_valid, 6),
                                X_train.shape[0], X_train.shape[1]))
                else:
                    print(
                        "                    {:>3}       {:>3}      {:>3}    {:>8}  {} x {}"
                        .format(run, fold, n + 1, np.round(log_loss_valid, 6),
                                X_train.shape[0], X_train.shape[1]))
            log_loss_cv[run - 1, fold - 1] = log_loss_valid
            ## save this prediction (the raw per-row predictions)
            dfPred = pd.DataFrame({"target": Y_valid, "prediction": pred_raw})
            dfPred.to_csv(raw_pred_valid_path,
                          index=False,
                          header=True,
                          columns=["target", "prediction"])
            # save this prediction (predictions ranked first, then re-scored from the cdf)
            # dfPred = pd.DataFrame({"target": Y_valid, "prediction": pred_rank})
            # dfPred.to_csv(rank_pred_valid_path, index=False, header=True, columns=["target", "prediction"])

    log_loss_cv_mean = np.mean(log_loss_cv)
    log_loss_cv_std = np.std(log_loss_cv)
    if verbose_level >= 1:
        print("              Mean: %.6f" % log_loss_cv_mean)
        print("              Std: %.6f" % log_loss_cv_std)

    ####################
    #### Retraining ####
    ####################
    #### all the path
    # path = "%s/All" % (feat_folder)
    # save_path = "%s/All" % output_path
    # subm_path = "%s/Subm" % output_path
    # if not os.path.exists(save_path):
    #     os.makedirs(save_path)
    # if not os.path.exists(subm_path):
    #     os.makedirs(subm_path)
    # # feat
    # feat_train_path = "%s/train.feat" % path
    # feat_test_path = "%s/test.feat" % path
    # # weight
    # # weight_train_path = "%s/train.feat.weight" % path
    # # info
    # info_train_path = "%s/train.info" % path
    # info_test_path = "%s/test.info" % path
    # # cdf
    # cdf_test_path = "%s/test.cdf" % path
    # # raw prediction path (rank)
    # raw_pred_test_path = "%s/test.raw.pred.%s_[Id@%d].csv" % (save_path, feat_name, trial_counter)
    # rank_pred_test_path = "%s/test.pred.%s_[Id@%d].csv" % (save_path, feat_name, trial_counter)
    # # submission path (is_duplicate as in [0, 1])
    # subm_path = "%s/test.pred.%s_[Id@%d]_[Mean%.6f]_[Std%.6f].csv" % (subm_path, feat_name, trial_counter, log_loss_cv_mean, log_loss_cv_std)
    #
    # #### load data
    # ## load feat
    # X_train, labels_train = load_svmlight_file(feat_train_path)
    # X_test, labels_test = load_svmlight_file(feat_test_path)
    # if X_test.shape[1] < X_train.shape[1]:
    #     X_test = hstack([X_test, np.zeros((X_test.shape[0], X_train.shape[1]-X_test.shape[1]))])
    # elif X_test.shape[1] > X_train.shape[1]:
    #     X_train = hstack([X_train, np.zeros((X_train.shape[0], X_test.shape[1]-X_train.shape[1]))])
    # X_train = X_train.tocsr()
    # X_test = X_test.tocsr()
    # ## load train weight
    # # weight_train = np.loadtxt(weight_train_path, dtype=float)
    # ## load test info
    # info_train = pd.read_csv(info_train_path)
    # numTrain = info_train.shape[0]
    # info_test = pd.read_csv(info_test_path)
    # numTest = info_test.shape[0]
    # id_test = info_test["id"]
    #
    # ## load cdf
    # cdf_test = np.loadtxt(cdf_test_path, dtype=float)
    # # ## evaluation functions
    # # evalerror_regrank_test = lambda preds,dtrain: evalerror_regrank_cdf(preds, dtrain, cdf_test)
    # # evalerror_softmax_test = lambda preds,dtrain: evalerror_softmax_cdf(preds, dtrain, cdf_test)
    # # evalerror_softkappa_test = lambda preds,dtrain: evalerror_softkappa_cdf(preds, dtrain, cdf_test)
    # # evalerror_ebc_test = lambda preds,dtrain: evalerror_ebc_cdf(preds, dtrain, cdf_test, ebc_hard_threshold)
    # # evalerror_cocr_test = lambda preds,dtrain: evalerror_cocr_cdf(preds, dtrain, cdf_test)
    #
    # ## bagging
    # preds_bagging = np.zeros((numTest, bagging_size), dtype=float)
    # for n in range(bagging_size):
    #     if bootstrap_replacement:
    #         sampleSize = int(numTrain*bootstrap_ratio)
    #         #index_meta = rng.randint(numTrain, size=sampleSize)
    #         #index_base = [i for i in range(numTrain) if i not in index_meta]
    #         index_base = rng.randint(numTrain, size=sampleSize)
    #         index_meta = [i for i in range(numTrain) if i not in index_base]
    #     else:
    #         randnum = rng.uniform(size=numTrain)
    #         index_base = [i for i in range(numTrain) if randnum[i] < bootstrap_ratio]
    #         index_meta = [i for i in range(numTrain) if randnum[i] >= bootstrap_ratio]
    #
    #     # if using xgboost, first convert the data into the DMatrix format it requires
    #     if "booster" in param:
    #         dtest = xgb.DMatrix(X_test, label=labels_test)
    #         dtrain = xgb.DMatrix(X_train[index_base], label=labels_train[index_base])   # , weight=weight_train[index_base]
    #
    #         watchlist = []
    #         if verbose_level >= 2:
    #             watchlist  = [(dtrain, 'train')]
    #
    #     ## train
    #     if param["task"] in ["regression", "ranking"]:
    #         bst = xgb.train(param, dtrain, param['num_round'], watchlist)   # , feval=evalerror_regrank_test
    #         pred = bst.predict(dtest)
    #
    #     elif param["task"] in ["softmax"]:
    #         bst = xgb.train(param, dtrain, param['num_round'], watchlist)   # , feval=evalerror_softmax_test
    #         pred = bst.predict(dtest)
    #         w = np.asarray(range(1,numValid))
    #         pred = pred * w[np.newaxis,:]
    #         pred = np.sum(pred, axis=1)
    #
    #     elif param["task"] in ["softkappa"]:
    #         # custom objective function
    #         # obj = lambda preds, dtrain: softkappaObj(preds, dtrain, hess_scale=param['hess_scale'])
    #         bst = xgb.train(param, dtrain, param['num_round'], watchlist)   # , obj=obj, feval=evalerror_softkappa_test
    #         pred = softmax(bst.predict(dtest))
    #         w = np.asarray(range(1,numValid))
    #         pred = pred * w[np.newaxis,:]
    #         pred = np.sum(pred, axis=1)
    #
    #     elif param["task"]  in ["ebc"]:
    #         # custom objective function
    #         # obj = lambda preds, dtrain: ebcObj(preds, dtrain)
    #         bst = xgb.train(param, dtrain, param['num_round'], watchlist)   # , obj=obj, feval=evalerror_ebc_test
    #         pred = sigmoid(bst.predict(dtest))
    #         pred = applyEBCRule(pred, hard_threshold=ebc_hard_threshold)
    #
    #     elif param["task"]  in ["cocr"]:
    #         # custom objective function
    #         obj = lambda preds, dtrain: cocrObj(preds, dtrain)
    #         bst = xgb.train(param, dtrain, param['num_round'], watchlist)   # , obj=obj, feval=evalerror_cocr_test
    #         pred = bst.predict(dtest)
    #         pred = applyCOCRRule(pred)
    #
    #     elif param['task'] == "reg_skl_rf":
    #         ## random forest regressor
    #         rf = RandomForestRegressor(n_estimators=param['n_estimators'],
    #                                    max_features=param['max_features'],
    #                                    n_jobs=param['n_jobs'],
    #                                    random_state=param['random_state'])
    #         rf.fit(X_train[index_base], labels_train[index_base]) # , sample_weight=weight_train[index_base]
    #         pred = rf.predict(X_test)
    #
    #     elif param['task'] == "reg_skl_etr":
    #         ## extra trees regressor
    #         etr = ExtraTreesRegressor(n_estimators=param['n_estimators'],
    #                                   max_features=param['max_features'],
    #                                   n_jobs=param['n_jobs'],
    #                                   random_state=param['random_state'])
    #         etr.fit(X_train[index_base], labels_train[index_base])    # , sample_weight=weight_train[index_base]
    #         pred = etr.predict(X_test)
    #
    #     elif param['task'] == "reg_skl_gbm":
    #         ## gradient boosting regressor
    #         gbm = GradientBoostingRegressor(n_estimators=param['n_estimators'],
    #                                         max_features=param['max_features'],
    #                                         learning_rate=param['learning_rate'],
    #                                         max_depth=param['max_depth'],
    #                                         subsample=param['subsample'],
    #                                         random_state=param['random_state'])
    #         gbm.fit(X_train.toarray()[index_base], labels_train[index_base])  #, sample_weight=weight_train[index_base]
    #         pred = gbm.predict(X_test.toarray())
    #
    #     elif param['task'] == "clf_skl_lr":
    #         lr = LogisticRegression(penalty="l2", dual=True, tol=1e-5,
    #                                 C=param['C'], fit_intercept=True, intercept_scaling=1.0,
    #                                 class_weight='auto', random_state=param['random_state'])
    #         lr.fit(X_train[index_base], labels_train[index_base])
    #         pred = lr.predict_proba(X_test)
    #         w = np.asarray(range(1,numValid))
    #         pred = pred * w[np.newaxis,:]
    #         pred = np.sum(pred, axis=1)
    #
    #     elif param['task'] == "reg_skl_svr":
    #         ## regression with sklearn support vector regression
    #         X_train, X_test = X_train.toarray(), X_test.toarray()
    #         scaler = StandardScaler()
    #         X_train[index_base] = scaler.fit_transform(X_train[index_base])
    #         X_test = scaler.transform(X_test)
    #         svr = SVR(C=param['C'], gamma=param['gamma'], epsilon=param['epsilon'],
    #                                 degree=param['degree'], kernel=param['kernel'])
    #         svr.fit(X_train[index_base], labels_train[index_base])    # , sample_weight=weight_train[index_base]
    #         pred = svr.predict(X_test)
    #
    #     elif param['task'] == "reg_skl_ridge":
    #         ridge = Ridge(alpha=param["alpha"], normalize=True)
    #         ridge.fit(X_train[index_base], labels_train[index_base])  # , sample_weight=weight_train[index_base]
    #         pred = ridge.predict(X_test)
    #
    #     elif param['task'] == "reg_skl_lasso":
    #         lasso = Lasso(alpha=param["alpha"], normalize=True)
    #         lasso.fit(X_train[index_base], labels_train[index_base])
    #         pred = lasso.predict(X_test)
    #
    #     elif param['task'] == 'reg_libfm':
    #         ## to array
    #         X_train, X_test = X_train.toarray(), X_test.toarray()
    #
    #         ## scale
    #         scaler = StandardScaler()
    #         X_train[index_base] = scaler.fit_transform(X_train[index_base])
    #         X_test = scaler.transform(X_test)
    #
    #         ## dump feat
    #         dump_svmlight_file(X_train[index_base], labels_train[index_base], feat_train_path+".tmp")
    #         dump_svmlight_file(X_test, labels_test, feat_test_path+".tmp")
    #
    #         ## train fm
    #         cmd = "%s -task r -train %s -test %s -out %s -dim '1,1,%d' -iter %d > libfm.log" % ( \
    #                     libfm_exe, feat_train_path+".tmp", feat_test_path+".tmp", raw_pred_test_path, \
    #                     param['dim'], param['iter'])
    #         os.system(cmd)
    #         os.remove(feat_train_path+".tmp")
    #         os.remove(feat_test_path+".tmp")
    #
    #         ## extract libfm prediction
    #         pred = np.loadtxt(raw_pred_test_path, dtype=float)
    #         ## labels are in [0,1,2,3]
    #         pred += 1
    #
    #     elif param['task'] == "reg_keras_dnn":
    #         ## regression with keras deep neural networks
    #         model = Sequential()
    #         ## input layer
    #         model.add(Dropout(param["input_dropout"]))
    #         ## hidden layers
    #         first = True
    #         hidden_layers = param['hidden_layers']
    #         while hidden_layers > 0:
    #             if first:
    #                 dim = X_train.shape[1]
    #                 first = False
    #             else:
    #                 dim = param["hidden_units"]
    #             model.add(Dense(dim, param["hidden_units"], init='glorot_uniform'))
    #             if param["batch_norm"]:
    #                 model.add(BatchNormalization((param["hidden_units"],)))
    #             if param["hidden_activation"] == "prelu":
    #                 model.add(PReLU((param["hidden_units"],)))
    #             else:
    #                 model.add(Activation(param['hidden_activation']))
    #             model.add(Dropout(param["hidden_dropout"]))
    #             hidden_layers -= 1
    #
    #         ## output layer
    #         model.add(Dense(param["hidden_units"], 1, init='glorot_uniform'))
    #         model.add(Activation('linear'))
    #
    #         ## loss
    #         model.compile(loss='mean_squared_error', optimizer="adam")
    #
    #         ## to array
    #         X_train = X_train.toarray()
    #         X_test = X_test.toarray()
    #
    #         ## scale
    #         scaler = StandardScaler()
    #         X_train[index_base] = scaler.fit_transform(X_train[index_base])
    #         X_test = scaler.transform(X_test)
    #
    #         ## train
    #         model.fit(X_train[index_base], labels_train[index_base],
    #                     nb_epoch=param['nb_epoch'], batch_size=param['batch_size'], verbose=0)
    #
    #         ##prediction
    #         pred = model.predict(X_test, verbose=0)
    #         pred.shape = (X_test.shape[0],)
    #
    #     elif param['task'] == "reg_rgf":
    #         ## to array
    #         X_train, X_test = X_train.toarray(), X_test.toarray()
    #
    #         train_x_fn = feat_train_path+".x"
    #         train_y_fn = feat_train_path+".y"
    #         test_x_fn = feat_test_path+".x"
    #         test_pred_fn = feat_test_path+".pred"
    #
    #         model_fn_prefix = "rgf_model"
    #
    #         np.savetxt(train_x_fn, X_train[index_base], fmt="%.6f", delimiter='\t')
    #         np.savetxt(train_y_fn, labels_train[index_base], fmt="%d", delimiter='\t')
    #         np.savetxt(test_x_fn, X_test, fmt="%.6f", delimiter='\t')
    #         # np.savetxt(valid_y_fn, labels_valid, fmt="%d", delimiter='\t')
    #
    #
    #         pars = [
    #             "train_x_fn=",train_x_fn,"\n",
    #             "train_y_fn=",train_y_fn,"\n",
    #             #"train_w_fn=",weight_train_path,"\n",
    #             "model_fn_prefix=",model_fn_prefix,"\n",
    #             "reg_L2=", param['reg_L2'], "\n",
    #             #"reg_depth=", 1.01, "\n",
    #             "algorithm=","RGF","\n",
    #             "loss=","LS","\n",
    #             "test_interval=", param['max_leaf_forest'],"\n",
    #             "max_leaf_forest=", param['max_leaf_forest'],"\n",
    #             "num_iteration_opt=", param['num_iteration_opt'], "\n",
    #             "num_tree_search=", param['num_tree_search'], "\n",
    #             "min_pop=", param['min_pop'], "\n",
    #             "opt_interval=", param['opt_interval'], "\n",
    #             "opt_stepsize=", param['opt_stepsize'], "\n",
    #             "NormalizeTarget"
    #         ]
    #         pars = "".join([str(p) for p in pars])
    #
    #         rfg_setting_train = "./rfg_setting_train"
    #         with open(rfg_setting_train+".inp", "wb") as f:
    #             f.write(pars)
    #
    #         ## train fm
    #         cmd = "perl %s %s train %s >> rgf.log" % (
    #                 call_exe, rgf_exe, rfg_setting_train)
    #         #print cmd
    #         os.system(cmd)
    #
    #
    #         model_fn = model_fn_prefix + "-01"
    #         pars = [
    #             "test_x_fn=",test_x_fn,"\n",
    #             "model_fn=", model_fn,"\n",
    #             "prediction_fn=", test_pred_fn
    #         ]
    #
    #         pars = "".join([str(p) for p in pars])
    #
    #         rfg_setting_test = "./rfg_setting_test"
    #         with open(rfg_setting_test+".inp", "wb") as f:
    #             f.write(pars)
    #         cmd = "perl %s %s predict %s >> rgf.log" % (
    #                 call_exe, rgf_exe, rfg_setting_test)
    #         #print cmd
    #         os.system(cmd)
    #
    #         pred = np.loadtxt(test_pred_fn, dtype=float)
    #
    #     ## weighted averaging over different models
    #     pred_test = pred
    #     preds_bagging[:,n] = pred_test
    # pred_raw = np.mean(preds_bagging, axis=1)
    # pred_rank = pred_raw.argsort().argsort()
    # #
    # ## write
    # output = pd.DataFrame({"id": id_test, "prediction": pred_raw})
    # output.to_csv(raw_pred_test_path, index=False)
    #
    # ## write
    # output = pd.DataFrame({"id": id_test, "prediction": pred_rank})
    # output.to_csv(rank_pred_test_path, index=False)
    #
    # ## write score
    # pred_score = getScore(pred, cdf_test)
    # output = pd.DataFrame({"id": id_test, "prediction": pred_score})
    # output.to_csv(subm_path, index=False)
    # #"""

    return log_loss_cv_mean, log_loss_cv_std
Ejemplo n.º 29
0
# # Random search of parameters, using 3 fold cross validation, 
# # search across 100 different combinations, and use all available cores
# rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 1, verbose=2, random_state=42, n_jobs = -1)
# # Fit the random search model
# rf_random.fit(local_train, y_local_train)
# print(rf_random.best_params_)


# In[ ]:


# Extra-trees regressor for train-validation performance:
clf = ExtraTreesRegressor(verbose=2, n_jobs=1, oob_score=True, min_samples_leaf=2,
                          bootstrap=True, criterion='mae', max_depth=30,
                          n_estimators=200, random_state=0)
clf.fit(local_train, y_local_train)
# ExtraTreesRegressor has no predict_proba; its regression output is used both as a
# probability-like score and (after int() truncation below) as the binary prediction.
y_validation_pred_prob = clf.predict(local_validation)
y_validation_pred_binary = y_validation_pred_prob
count_match = 0
count_error = 0
deviation = 0.0
assert(len(y_validation_pred_prob)==len(y_local_validation))
validation_gtruth=np.asarray(y_local_validation)
for i in range(len(y_local_validation)):
    deviation +=abs(y_validation_pred_prob[i]-validation_gtruth[i])
    if (int(y_validation_pred_binary[i])==int(validation_gtruth[i])):
        count_match+=1
    else:
        count_error+=1
validation_accuracy = count_match/(count_match+count_error)*100.0
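# Hedged equivalent of the hand-rolled metrics above using sklearn (illustration only,
# same arrays as above):
# from sklearn.metrics import accuracy_score, mean_absolute_error
# validation_accuracy = accuracy_score(validation_gtruth.astype(int),
#                                      y_validation_pred_binary.astype(int)) * 100.0
# deviation = mean_absolute_error(validation_gtruth, y_validation_pred_prob) * len(validation_gtruth)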
Ejemplo n.º 30
0
        'n_jobs': 4
    }
    #rf = RandomForestRegressor(**params_rf)
    #rf.fit(x_train,y_train)
    #y_pre_rf = rf.predict(x_test)
    params_ext = {
        'max_features': 'log2',
        'n_estimators': 600,
        'max_depth': 12,
        'oob_score': True,
        'n_jobs': 4,
        'bootstrap': True
    }
    ext = ExtraTreesRegressor(**params_ext)
    ext.fit(x_train, y_train)
    y_pre_ext = ext.predict(x_test)
    ###
    '''
    plt.scatter(xday[-(start+14):-14],y_train)
    plt.scatter(xday[-14:],y_pre_ext,color = 'green')
    plt.plot(xday[-14:],y_pre_ext,color = 'red')
    path = "e://tianchi_koubei/fig/rf_pre/"+str(i+1)+'.png'
    plt.savefig(path+".png")
    plt.clf()  # clear the figure, otherwise every series is drawn onto the same plot
    '''
    output(fw, i + 1, y_pre_ext)
    print(i)
    i += 1

fr1.close()
fr2.close()
Ejemplo n.º 31
0
class PredictiveGraphEmbedder(object):
    """Provide 2D embedding."""
    def __init__(self,
                 n_estimators=250,
                 medium_dim=100,
                 nn_n_estimators=30,
                 nn_negative_bias=1,
                 nn_k=7,
                 nn_p=2,
                 emb_iter=50,
                 emb_confidence=2,
                 emb_sample_fraction=0.5,
                 emb_feature_fraction=1,
                 emb_alpha=1,
                 emb_gamma=1,
                 emb_beta=30):
        """init."""
        self.set_params(n_estimators, medium_dim, nn_n_estimators,
                        nn_negative_bias, nn_k, nn_p, emb_iter, emb_confidence,
                        emb_sample_fraction, emb_feature_fraction, emb_alpha,
                        emb_gamma, emb_beta)

        self.params_range = dict(
            n_estimators=[250],
            medium_dim=[10, 25, 50, 100, 250, 500],
            nn_n_estimators=[30],
            nn_negative_bias=[0, 1],
            nn_k=[1, 3, 5, 7, 11],
            nn_p=[2],
            emb_iter=[50],
            emb_confidence=[1, 3, 5],
            emb_sample_fraction=[.5, .75, 1],
            emb_feature_fraction=[.01, .05, .1, .3, .5, .7, 1],
            emb_alpha=[0, 1, 3],
            emb_gamma=[1],
            emb_beta=[20, 30, 40])

    def get_params(self):
        """get_params."""
        return dict(n_estimators=self.n_estimators,
                    medium_dim=self.medium_dim,
                    nn_n_estimators=self.nn_n_estimators,
                    nn_negative_bias=self.nn_negative_bias,
                    nn_k=self.nn_k,
                    nn_p=self.nn_p,
                    emb_iter=self.emb_iter,
                    emb_confidence=self.emb_confidence,
                    emb_sample_fraction=self.emb_sample_fraction,
                    emb_feature_fraction=self.emb_feature_fraction,
                    emb_alpha=self.emb_alpha,
                    emb_gamma=self.emb_gamma,
                    emb_beta=self.emb_beta)

    def set_params(self,
                   n_estimators=250,
                   medium_dim=100,
                   nn_n_estimators=30,
                   nn_negative_bias=1,
                   nn_k=7,
                   nn_p=2,
                   emb_iter=10,
                   emb_confidence=2,
                   emb_sample_fraction=0.6,
                   emb_feature_fraction=1,
                   emb_alpha=1,
                   emb_gamma=1,
                   emb_beta=30):
        """set_params."""
        self.n_estimators = n_estimators
        self.medium_dim = medium_dim
        self.nn_n_estimators = nn_n_estimators
        self.nn_negative_bias = nn_negative_bias
        self.nn_k = nn_k
        self.nn_p = nn_p
        self.emb_iter = emb_iter
        self.emb_confidence = emb_confidence
        self.emb_sample_fraction = emb_sample_fraction
        self.emb_feature_fraction = emb_feature_fraction
        self.emb_alpha = emb_alpha
        self.emb_gamma = emb_gamma
        self.emb_beta = emb_beta
        # set objects
        self.est_medium_dim = MediumEmbedder(dim=self.medium_dim)
        self.regress2d = ExtraTreesRegressor(n_estimators=self.n_estimators)
        self.est2d = Biased2DAveragedClassifier(
            negative_bias=self.nn_negative_bias,
            n_estimators=self.nn_n_estimators,
            n_neighbors=self.nn_k,
            weights='distance',
            p=self.nn_p)

    def _repr_params(self, params):
        txt = ''
        for key in sorted(self.params_range):
            if len(self.params_range[key]) > 1:
                txt += '  %s:%s  ' % (key, params[key])
        return txt

    def _params_random_choice(self):
        params = dict([(key, random.choice(self.params_range[key]))
                       for key in self.params_range])
        return params

    def _avg_score(self, data, target, n_repetitions=3):
        scores = []
        for i in range(n_repetitions):
            tr_data, ts_data, tr_target, ts_target = train_test_split(
                data, target, test_size=0.33, random_state=421 + i)
            self.fit(tr_data, tr_target)
            score = self.score(ts_data, ts_target)
            scores.append(score)
        score = np.mean(scores)
        return score

    def _feature_importance(self, data, target):
        ec = ExtraTreesClassifier(n_estimators=self.n_estimators)
        feature_p = ec.fit(data, target).feature_importances_
        return feature_p

    def fit(self, data, target):
        """fit."""
        tr_data, ts_data, tr_target, ts_target = train_test_split(
            data, target, test_size=0.5, random_state=42)
        self.est_medium_dim.fit(tr_data, tr_target)
        tr_data_medium = self.est_medium_dim.transform(tr_data)
        ts_data_medium = self.est_medium_dim.transform(ts_data)
        feature_p = self._feature_importance(tr_data_medium, tr_target)
        tr_data2d, graph = embed(tr_data_medium,
                                 target=tr_target,
                                 confidence=self.emb_confidence,
                                 n_iter=self.emb_iter,
                                 sample_fraction=self.emb_sample_fraction,
                                 feature_fraction=self.emb_feature_fraction,
                                 feature_p=feature_p,
                                 alpha=self.emb_alpha,
                                 gamma=self.emb_gamma,
                                 beta=self.emb_beta)
        self.regress2d.fit(tr_data_medium, tr_data2d)
        ts_data2d = self.regress2d.predict(ts_data_medium)
        self.est2d.fit(ts_data2d, ts_target)
        return self

    def transform(self, data):
        """transform."""
        data_medium = self.est_medium_dim.transform(data)
        data_2_dim = self.regress2d.predict(data_medium)
        return data_2_dim

    def predict(self, data):
        """predict."""
        data_medium = self.est_medium_dim.transform(data)
        data_2_dim = self.regress2d.predict(data_medium)
        y_score = self.est2d.predict_proba(data_2_dim)
        return y_score

    def score(self, data, target):
        """score."""
        y_score = self.predict(data)
        auc = metrics.roc_auc_score(target, y_score)
        return auc

    def visualize(self, data, target, title='', region_only=False):
        """visualize."""
        auc = self.score(data, target)
        title += 'roc:%.2f' % (auc)
        title += '\nparams:%s' % serialize_dict(self.get_params())

        x2dim = self.transform(data)

        x_min, x_max = x2dim[:, 0].min(), x2dim[:, 0].max()
        y_min, y_max = x2dim[:, 1].min(), x2dim[:, 1].max()
        b = max((x_max - x_min) / 10, (y_max - y_min) / 10)  # border size
        x_min, x_max = x_min - b, x_max + b
        y_min, y_max = y_min - b, y_max + b
        h = b / 20  # step size in the mesh
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                             np.arange(y_min, y_max, h))

        grid2d = np.c_[xx.ravel(), yy.ravel()]
        z = self.est2d.predict_proba(grid2d)
        z = 1 - z.reshape(xx.shape)
        plt.contourf(xx,
                     yy,
                     z,
                     cmap=plt.get_cmap('BrBG'),
                     alpha=.3,
                     levels=[0.05, 0.25, 0.5, 0.75, 0.95],
                     extend='both')
        plt.contour(xx,
                    yy,
                    z,
                    levels=[-1, 0.5, 2],
                    colors='w',
                    linewidths=[.5, 4, .5],
                    linestyles=['solid', 'solid', 'solid'],
                    extend='both')
        plt.contour(xx,
                    yy,
                    z,
                    levels=[-1, 0.5, 2],
                    colors='k',
                    linewidths=[.5, 2, .5],
                    linestyles=['solid', 'solid', 'solid'],
                    extend='both')
        if region_only is False:
            plt.scatter(x2dim[:, 0],
                        x2dim[:, 1],
                        alpha=.8,
                        c=target,
                        s=30,
                        edgecolors='k',
                        cmap=plt.get_cmap('gray'))
        plt.title(title)
        plt.grid(False)
        plt.axis('off')
        return self

    def visualize_data(self,
                       data,
                       target=None,
                       x_min=None,
                       x_max=None,
                       y_min=None,
                       y_max=None):
        """visualize_test."""
        x2dim = self.transform(data)
        if x_min is None or x_max is None or y_min is None or y_max is None:
            x_min, x_max = x2dim[:, 0].min(), x2dim[:, 0].max()
            y_min, y_max = x2dim[:, 1].min(), x2dim[:, 1].max()
        self.visualize_region(x_min, x_max, y_min, y_max)
        if target is None:
            c = 'w'
        else:
            c = target
        plt.scatter(x2dim[:, 0],
                    x2dim[:, 1],
                    alpha=.8,
                    c=c,
                    s=30,
                    edgecolors='k',
                    cmap=plt.get_cmap('gray'))
        plt.xlim(x_min, x_max)
        plt.ylim(y_min, y_max)
        plt.grid()
        return self

    def visualize_region(self, x_min=None, x_max=None, y_min=None, y_max=None):
        """visualize_region."""
        b = max((x_max - x_min) / 10, (y_max - y_min) / 10)  # border size
        x_min, x_max = x_min - b, x_max + b
        y_min, y_max = y_min - b, y_max + b
        h = b / 20  # step size in the mesh
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                             np.arange(y_min, y_max, h))

        grid2d = np.c_[xx.ravel(), yy.ravel()]
        z = self.est2d.predict_proba(grid2d)
        z = 1 - z.reshape(xx.shape)
        plt.contourf(xx,
                     yy,
                     z,
                     cmap=plt.get_cmap('BrBG'),
                     alpha=.3,
                     levels=[0.05, 0.25, 0.5, 0.75, 0.95],
                     extend='both')
        plt.contour(xx,
                    yy,
                    z,
                    levels=[-1, 0.5, 2],
                    colors='w',
                    linewidths=[.5, 4, .5],
                    linestyles=['solid', 'solid', 'solid'],
                    extend='both')
        plt.contour(xx,
                    yy,
                    z,
                    levels=[-1, 0.5, 2],
                    colors='k',
                    linewidths=[.5, 2, .5],
                    linestyles=['solid', 'solid', 'solid'],
                    extend='both')
Ejemplo n.º 32
0
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=42)

# Average CV score on the training set was: -304012.8776428422
exported_pipeline = ExtraTreesRegressor(bootstrap=False,
                                        max_features=0.8,
                                        min_samples_leaf=1,
                                        min_samples_split=16,
                                        n_estimators=100)
# Fix random state in exported estimator
if hasattr(exported_pipeline, 'random_state'):
    setattr(exported_pipeline, 'random_state', 42)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
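# Hedged usage sketch: the CV score reported above appears to be TPOT's negated MSE, so a
# comparable hold-out number would be:
# from sklearn.metrics import mean_squared_error
# print(-mean_squared_error(testing_target, results))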
Ejemplo n.º 33
0
class ExtraTreesRegressor(IterativeComponentWithSampleWeight, BaseRegressionModel):

    def __init__(self, n_estimators, criterion, min_samples_leaf,
                 min_samples_split, max_features, bootstrap, random_state=None):

        if check_none(n_estimators):
            self.n_estimators = None
        else:
            self.n_estimators = int(n_estimators)
        self.criterion = criterion

        self.min_samples_leaf = min_samples_leaf
        self.min_samples_split = min_samples_split
        self.max_features = max_features
        self.bootstrap = bootstrap
        self.n_jobs = -1
        self.random_state = random_state

        self.estimator = None
        self.start_time = time.time()
        self.time_limit = None

    def fit(self, X, y, sample_weight=None):
        from sklearn.ensemble import ExtraTreesRegressor
        self.bootstrap = check_for_bool(self.bootstrap)
        self.estimator = ExtraTreesRegressor(n_estimators=self.n_estimators,
                                             max_leaf_nodes=None,
                                             criterion=self.criterion,
                                             max_features=self.max_features,
                                             min_samples_split=self.min_samples_split,
                                             min_samples_leaf=self.min_samples_leaf,
                                             max_depth=None,
                                             bootstrap=self.bootstrap,
                                             random_state=self.random_state,
                                             n_jobs=self.n_jobs)
        self.estimator.fit(X, y, sample_weight=sample_weight)
        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        return not len(self.estimator.estimators_) < self.n_estimators

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError
        return self.estimator.predict(X)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {'shortname': 'ET',
                'name': 'Extra Trees Regressor',
                'handles_regression': True,
                'handles_classification': False,
                'handles_multiclass': False,
                'handles_multilabel': False,
                'is_deterministic': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (PREDICTIONS,)}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None, optimizer='smac'):
        if optimizer == 'smac':
            cs = ConfigurationSpace()

            n_estimators = Constant("n_estimators", 100)
            criterion = CategoricalHyperparameter(
                "criterion", ["mse", "mae"], default_value="mse")

            # The maximum number of features used in the forest is calculated as m^max_features, where
            # m is the total number of features, and max_features is the hyperparameter specified below.
            # The default is 0.5, which yields sqrt(m) features as max_features in the estimator. This
            # corresponds with Geurts' heuristic.
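            # For example, with m = 100 input features and max_features = 0.5 the
            # formula gives 100 ** 0.5 = 10 candidate features per split, i.e. sqrt(m).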
            max_features = UniformFloatHyperparameter(
                "max_features", 0., 1., default_value=0.5)

            min_samples_split = UniformIntegerHyperparameter(
                "min_samples_split", 2, 20, default_value=2)
            min_samples_leaf = UniformIntegerHyperparameter(
                "min_samples_leaf", 1, 20, default_value=1)

            bootstrap = CategoricalHyperparameter(
                "bootstrap", ["True", "False"], default_value="False")
            cs.add_hyperparameters([n_estimators, criterion, max_features, min_samples_split, min_samples_leaf,
                                    bootstrap])

            return cs
        elif optimizer == 'tpe':
            space = {'n_estimators': hp.choice('et_n_estimators', [100]),
                     'criterion': hp.choice('et_criterion', ["mse", "mae"]),
                     'max_features': hp.uniform('et_max_features', 0, 1),
                     'min_samples_split': hp.randint('et_min_samples_split', 19) + 2,
                     'min_samples_leaf': hp.randint('et_min_samples_leaf', 20) + 1,
                     'bootstrap': hp.choice('et_bootstrap', ["True", "False"])}

            init_trial = {'n_estimators': 100, 'criterion': "mse", 'max_features': 0.5,
                          'min_samples_split': 2, 'min_samples_leaf': 1, 'bootstrap': "False"}
            return space
Ejemplo n.º 34
0
#ax1.set_title("Training dataset after PCA")
#ax2.set_title("Standardized training dataset after PCA")
#
#for ax in (ax1, ax2):
#    ax.set_xlabel("1st principal component")
#    ax.set_ylabel("2nd principal component")
#    ax.legend(loc="upper right")
#    ax.grid()
#
#plt.tight_layout()
#plt.show()

# Prediction
############
t0 = time.time()
y_regr = regr.predict(x_test)
regr_predict = time.time() - t0
print("Prediction for %d inputs in %.6f s" % (x_test.shape[0], regr_predict))

#with open('output.log', 'w') as f:
#    print("Training time: %.6f s" % regr_fit, file=f)
#    print("Prediction time: %.6f s" % regr_predict, file=f)
#    print(" ", file=f)
#    print("The model performance for training set", file=f)
#    print("--------------------------------------", file=f)
#    print('MAE is {}'.format(train_score_mae), file=f)
#    print('MSE is {}'.format(train_score_mse), file=f)
#    print('EVS is {}'.format(train_score_evs), file=f)
#    print('ME is {}'.format(train_score_me), file=f)
#    print('R2 score is {}'.format(train_score_r2), file=f)
#    print(" ", file=f)
Ejemplo n.º 35
0
import pandas as pd
from sklearn.model_selection import train_test_split


import warnings
warnings.filterwarnings('ignore')

df=pd.read_csv('zomato_df.csv')

df.drop('Unnamed: 0',axis=1,inplace=True)
print(df.head())
x=df.drop('rate',axis=1)
y=df['rate']
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=.3,random_state=10)


#Preparing Extra Tree Regression
from sklearn.ensemble import  ExtraTreesRegressor
ET_Model=ExtraTreesRegressor(n_estimators = 120)
ET_Model.fit(x_train,y_train)


y_predict=ET_Model.predict(x_test)


import pickle
# # Saving model to disk
pickle.dump(ET_Model, open('model.pkl','wb'))
model=pickle.load(open('model.pkl','rb'))
print(y_predict)
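# Hedged sanity check: the reloaded model should reproduce the original predictions,
# e.g. (model.predict(x_test) == y_predict).all() should evaluate to True.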

Ejemplo n.º 36
0
# Train a RandomForestRegressor, predict on the test data, and store the result in rfr_y_predict.
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)
rfr_y_predict = rfr.predict(X_test)

# Train an ExtraTreesRegressor, predict on the test data, and store the result in etr_y_predict.
'''
How extremely randomized trees differ from an ordinary random forest:
when building each split node of a tree, the features are not chosen arbitrarily;
a random subset of features is gathered first, and the best splitting feature is then
picked using criteria such as information entropy or Gini impurity.
'''
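# Note (hedged): in scikit-learn the practical differences are that extra trees draw the
# split threshold of each candidate feature at random (keeping the best of those random
# splits) and by default train each tree on the full sample (bootstrap=False) rather
# than on a bootstrap resample.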
etr = ExtraTreesRegressor()
etr.fit(X_train, y_train)
etr_y_predict = etr.predict(X_test)

# Train a GradientBoostingRegressor, predict on the test data, and store the result in gbr_y_predict.
gbr = GradientBoostingRegressor()
gbr.fit(X_train, y_train)
gbr_y_predict = gbr.predict(X_test)

from sklearn.metrics import mean_absolute_error,mean_squared_error
# Evaluate the default-configured random forest regressor on the test set with R-squared, MSE and MAE.
print('R-squared value of RandomForestRegressor:', rfr.score(X_test, y_test))
print( 'The mean squared error of RandomForestRegressor:', mean_squared_error(y_test, rfr_y_predict))
print('The mean absolute error of RandomForestRegressor:', mean_absolute_error(y_test, rfr_y_predict))


# Evaluate the default-configured extra-trees regressor on the test set with R-squared, MSE and MAE.
print('R-squared value of ExtraTreesRegressor:', etr.score(X_test, y_test))
Ejemplo n.º 37
0
class QVtree:
    def __init__(self,
                 D,
                 maxgrid,
                 radius,
                 para,
                 num_split=40,
                 num_leaf=20,
                 num_est=215):

        self.Q_f = Tree(n_estimators=num_est,
                        min_samples_split=num_split,
                        min_samples_leaf=num_leaf,
                        n_jobs=para.CPU_CORES)

        Twv = (1 / radius) / 1.8
        T = [Twv for t in range(D)]
        L = int(140 / Twv)

        points = maxgrid
        self.W_f = Tilecode(D,
                            T,
                            L,
                            mem_max=1,
                            lin_spline=True,
                            linT=7,
                            cores=para.CPU_CORES)
        self.V_f = Tilecode(D,
                            T,
                            L,
                            mem_max=1,
                            lin_spline=True,
                            linT=7,
                            cores=para.CPU_CORES)

        self.maxgrid = maxgrid
        self.radius = radius
        self.D = D

        self.first = True

        self.beta = para.beta
        self.CORES = para.CPU_CORES

    def iterate(self,
                XA,
                X1,
                u,
                A_low,
                A_high,
                ITER=50,
                Ascaled=False,
                plot=True,
                xargs=[],
                output=True,
                gridsamp=1):

        tic = time()

        self.v_e = 0  # Value function error
        self.p_e = 0  # Policy function error

        tic = time()
        N = int(gridsamp * X1.shape[0])
        grid, m = buildgrid(X1[0:N, :], self.maxgrid, self.radius, scale=True)
        points = grid.shape[0]
        toc = time()
        print 'State grid points: ' + str(points) + ', of maximum: ' + str(
            m) + ', Time taken: ' + str(toc - tic)

        if self.first:
            self.W_f.fit(grid, np.zeros(points))
            self.V_f.fit(grid, np.zeros(points))
            self.first = False

        Al = np.zeros(points)
        Ah = np.zeros(points)
        if Ascaled:
            for i in range(points):
                Ws = self.W_f_old.predict(grid[i, :])
                Al[i] = A_low(grid[i, :], Ws)
                Ah[i] = A_high(grid[i, :], Ws)
        else:
            for i in range(points):
                Al[i] = A_low(grid[i, :])
                Ah[i] = A_high(grid[i, :])

        # ------------------
        #   Q-learning
        # ------------------

        #First iteration
        j = 0

        # Q values
        Q = u + self.beta * self.V_f.predict(X1, store_XS=True)

        # Fit Q function
        self.Q_f.fit(XA, Q)

        # Optimise Q function
        ERROR = self.maximise(grid, Al, Ah, Ascaled, output=output)

        for j in range(ITER):

            # Q values
            Q = u + self.beta * self.V_f.fast_values()

            # Fit Q function
            tic = time()
            self.Q_f.fit(XA, Q)
            toc = time()
            print 'Fit time: ' + str(toc - tic)

            # Optimise Q function
            ERROR = self.maximise(grid, Al, Ah, Ascaled, output=output)

        toc = time()

        print 'Solve time: ' + str(toc - tic)

        if plot:
            self.W_f.plot(xargs, showdata=True)
            pylab.show()
            #self.V_f.plot(['x', 1], showdata=True)
            #pylab.show()

    def maximise(self, grid, Al, Ah, Ascaled, plot=False, output=True):

        tic = time()

        if Ascaled:
            Alow = np.zeros(grid.shape[0])
            Ahigh = np.ones(grid.shape[0])
        else:
            Alow = Al
            Ahigh = Ah

        N = grid.shape[0]
        W_opt = np.zeros(N)
        V = np.zeros(N)
        Wgrid = np.zeros(0)
        for i in range(N):
            Wgrid = np.append(Wgrid, np.linspace(Alow[i], Ahigh[i], 300))
        x = np.repeat(grid, 300, axis=0)
        X = np.hstack([Wgrid.reshape([N * 300, 1]), x])
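        # X now stacks the 300 candidate actions per grid point in front of the
        # (repeated) state variables, so a single Q_f.predict(X) call below scores
        # every (action, state) pair at once.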

        tic = time()
        Qhat = self.Q_f.predict(X)
        toc = time()
        print str(toc - tic)

        j = 0
        for i in range(N):
            idx = np.argmax(Qhat[j:j + 300])
            W_opt[i] = Wgrid[j + idx]
            V[i] = Qhat[j + idx]
            j = j + 300

        if Ascaled:
            W_opt = Al[idx] + (Ah[idx] - Al[idx]) * W_opt

        W_opt_old = self.W_f.predict(grid)
        V_old = self.V_f.predict(grid)

        self.V_f.fit(grid, V)
        self.W_f.fit(grid, W_opt, sgd=1, eta=0.4, n_iters=1, scale=0)

        self.p_e = np.mean(abs(W_opt_old - W_opt) / W_opt_old)
        self.v_e = np.mean(abs(V_old - V) / V_old)

        toc = time()

        if output:
            print 'Maximisation time: ' + str(toc - tic)
            print 'Value function change: ' + str(round(
                self.v_e, 4)) + ', Policy change: ' + str(round(self.p_e, 4))

        if plot:
            self.W_f.plot(['x', 1], showdata=True)
            pylab.show()

            self.V_f.plot(['x', 1], showdata=True)
            pylab.show()

        return self.v_e
Ejemplo n.º 38
0
    clf1 = ExtraTreesRegressor(n_estimators=1000,
                               max_depth=4,
                               min_samples_leaf=1)
    clf2 = RandomForestRegressor(n_estimators=1000,
                                 max_depth=7,
                                 min_samples_split=20,
                                 random_state=0)
    clf3 = GradientBoostingRegressor(learning_rate=0.003,
                                     max_depth=3,
                                     min_samples_split=35,
                                     min_samples_leaf=10,
                                     n_estimators=1500)
    clf4 = LassoLarsCV(cv=20)

    clf1.fit(dev_X, dev_y)
    preds = clf1.predict(val_X)
    if len(X_pred3) < 1:
        X_pred3 = preds
    else:
        X_pred3 = np.concatenate((X_pred3, preds), axis=0)

    scores3.append(r2_score(np.exp(val_y), np.exp(preds)))
    print("model 3 scores: ", scores3)

    clf2.fit(dev_X, dev_y)
    preds = clf2.predict(val_X)
    if len(X_pred4) < 1:
        X_pred4 = preds
    else:
        X_pred4 = np.concatenate((X_pred4, preds), axis=0)
Ejemplo n.º 39
0
def main():

    # random number initialization
    np.random.seed(123456000)

    # preprocess data by PCA and standardization
    Xtrain__full, ytrain__full, Xtest = load_data(argv[1], argv[2])
    # Xtrain__full, ytrain__full, Xtest = load_data("train_data.csv","test_data.csv")
    Xtrain__full, ytrain__full, Xtest = preprocess(Xtrain__full, ytrain__full,
                                                   Xtest)

    # train-set and validation-set split
    X_train, X_val, y_train, y_val = train_test_split(Xtrain__full,
                                                      ytrain__full,
                                                      test_size=0.20,
                                                      random_state=None)

    # ============================================================================================================
    print(" ")
    print(" ")
    print("Linear regressor classifier")
    start_time = time.time()
    LR = regressor(0.01)
    LR.fit(X_train, y_train)

    show_performance(LR, X_train, y_train, "Train")
    show_performance(LR, X_val, y_val, "Validation")

    show_time(time.time() - start_time)

    # ============================================================================================================
    print("Stochastic gradient descent regressor classifier")
    start_time = time.time()

    SGDR = SGDRegressor(loss='huber',
                        penalty='elasticnet',
                        max_iter=100,
                        eta0=0.01)
    SGDR.fit(X_train, y_train.flatten())

    show_performance(SGDR, X_train, y_train, "Train")
    show_performance(SGDR, X_val, y_val, "Validation")

    show_time(time.time() - start_time)

    # ============================================================================================================
    print("Neural network classifier")
    start_time = time.time()

    def baseline_model(D):

        # Defining the NN based regressor
        model = Sequential()
        model.add(
            Dense(D,
                  input_dim=D,
                  kernel_initializer='glorot_uniform',
                  activation='relu'))
        model.add(Dropout(0.25))
        model.add(
            Dense(D,
                  input_dim=D,
                  kernel_initializer='glorot_uniform',
                  activation='relu'))
        model.add(Dropout(0.25))
        model.add(Dense(1, kernel_initializer='glorot_uniform'))
        model.compile(loss='mae', optimizer='adam', metrics=['mae'])

        return model

    _, D = np.shape(X_train)
    # KR = KerasRegressor(build_fn=baseline_model(D), epochs=30, batch_size=16, verbose=False)
    KR = baseline_model(D)
    KR.fit(X_train, y_train, epochs=100, batch_size=16, verbose=False)

    show_performance(KR, X_train, y_train, "Train")
    show_performance(KR, X_val, y_val, "Validation")

    show_time(time.time() - start_time)

    # ============================================================================================================
    print("Extratrees regressor classifier")
    start_time = time.time()

    ET = ExtraTreesRegressor(n_estimators=200,
                             criterion='mae',
                             min_samples_split=2,
                             min_samples_leaf=1)
    ET.fit(X_train, y_train.flatten())

    show_performance(ET, X_train, y_train, "Train")
    show_performance(ET, X_val, y_val, "Validation")

    show_time(time.time() - start_time)

    # ============================================================================================================
    print("Extreme gradient boosted regressor classifier")
    start_time = time.time()

    n = Xtrain__full.shape[1]
    XGBR = xgbr(n_estimators=400, max_depth=int(np.sqrt(n)))
    XGBR.fit(X_train, y_train.flatten())

    show_performance(XGBR, X_train, y_train, "Train")
    show_performance(XGBR, X_val, y_val, "Validation")

    show_time(time.time() - start_time)

    # ============================================================================================================
    print("Soft voting over best performing ET and XGBR classifiers")
    temp1 = ET.predict(X_val)
    temp2 = XGBR.predict(X_val)
    temp = np.average([temp1, temp2], axis=0, weights=[7, 10])
    mae = mean_absolute_error(y_val, temp)
    print("Validation MAE: %f" % mae)

    # ============================================================================================================
    print(" ")
    print(" ")
    print("Writing out the results")
    temp1 = ET.predict(Xtest)
    temp2 = XGBR.predict(Xtest)
    temp = np.average([temp1, temp2], axis=0, weights=[7, 10])
    predictions = temp.astype(int)

    df = pd.read_csv(argv[2])
    # df = pd.read_csv("test_data.csv")
    df['predicted_ground_truth'] = predictions
    df.to_csv(argv[2], index=False)
    # df.to_csv('test_data.csv', index=False)
    print("Task completed")
Ejemplo n.º 40
0
def run(feature_files, training_dates, feature_set_folder):
    train_set = pd.concat([
        dfs(0, len(feature_files), feature_files + ['y'], 'dataset/' + date)
        for date in training_dates
    ])
    test_set = dfs(0, len(feature_files), feature_files + ['y'],
                   'dataset/2016-06-01')
    test1_set = dfs(0, len(feature_files), feature_files + ['y'],
                    'dataset/2016-05-25')
    # train_set.to_csv('train_set.csv', index=False)
    # test_set.to_csv('test_set.csv', index=False)
    '''
    unique_size = pd.read_csv('unique_size.csv')
    train_set = pd.merge(train_set, unique_size, how='left')
    train_set = train_set[train_set.unique_size > 1]
    train_set.drop(['unique_size'], axis=1, inplace=True)
    '''

    train_set = train_set.fillna(-1, downcast='infer')
    test_set = test_set.fillna(-1, downcast='infer')
    test1_set = test1_set.fillna(-1, downcast='infer')

    train_set['y_log'] = train_set['y'].apply(lambda x: np.log(1 + x))
    test_set['y_log'] = test_set['y'].apply(lambda x: np.log(1 + x))
    test1_set['y_log'] = test1_set['y'].apply(lambda x: np.log(1 + x))

    feature_set = filter(
        lambda x: x not in
        ['y', 'time', 'province', 'market', 'name', 'type', 'y_log'],
        train_set.columns)

    scaler = StandardScaler()
    scaler.fit(train_set[feature_set].as_matrix())

    # model1
    model1 = LinearRegression(normalize=True)
    model1.fit(scaler.transform(train_set[feature_set].as_matrix()),
               train_set['y'].as_matrix(),
               sample_weight=map(lambda x: 1.0 / x / x,
                                 train_set['y'].as_matrix()))
    print zip(feature_set, model1.coef_)
    test_set['predictY'] = model1.predict(
        scaler.transform(test_set[feature_set].as_matrix()))
    test_set.to_csv('result/' + feature_set_folder + '/model1_offline.csv')
    test1_set['predictY'] = model1.predict(
        scaler.transform(test1_set[feature_set].as_matrix()))
    test1_set.to_csv('result/' + feature_set_folder + '/model1_offline1.csv')

    # model2
    model2 = XGBRegressor(n_estimators=600,
                          learning_rate=0.01,
                          max_depth=6,
                          colsample_bytree=0.7,
                          subsample=0.7,
                          colsample_bylevel=0.7)
    model2.fit(train_set[feature_set].as_matrix(),
               train_set['y'].as_matrix(),
               sample_weight=map(lambda x: 1.0 / x / x,
                                 train_set['y'].as_matrix()))
    test_set['predictY'] = model2.predict(test_set[feature_set].as_matrix())
    test_set.to_csv('result/' + feature_set_folder + '/model2_offline.csv')
    test1_set['predictY'] = model2.predict(test1_set[feature_set].as_matrix())
    test1_set.to_csv('result/' + feature_set_folder + '/model2_offline1.csv')

    # model3
    model3 = LinearSVR(tol=1e-7)
    model3.fit(scaler.transform(train_set[feature_set].as_matrix()),
               train_set['y'].as_matrix(),
               sample_weight=map(lambda x: 1.0 / x / x,
                                 train_set['y'].as_matrix()))
    test_set['predictY'] = model3.predict(
        scaler.transform(test_set[feature_set].as_matrix()))
    test_set.to_csv('result/' + feature_set_folder + '/model3_offline.csv')
    test1_set['predictY'] = model3.predict(
        scaler.transform(test1_set[feature_set].as_matrix()))
    test1_set.to_csv('result/' + feature_set_folder + '/model3_offline1.csv')

    # model4
    model4 = RandomForestRegressor(n_estimators=1000,
                                   max_depth=7,
                                   max_features=0.2,
                                   max_leaf_nodes=100)
    model4.fit(train_set[feature_set].as_matrix(),
               train_set['y'].as_matrix(),
               sample_weight=np.array(
                   map(lambda x: 1.0 / x / x, train_set['y'].as_matrix())))
    test_set['predictY'] = model4.predict(test_set[feature_set].as_matrix())
    test_set.to_csv('result/' + feature_set_folder + '/model4_offline.csv')
    test1_set['predictY'] = model4.predict(test1_set[feature_set].as_matrix())
    test1_set.to_csv('result/' + feature_set_folder + '/model4_offline1.csv')

    # model15
    model15 = ExtraTreesRegressor(n_estimators=1000,
                                  max_depth=12,
                                  max_features=0.3,
                                  max_leaf_nodes=400)
    model15.fit(train_set[feature_set].as_matrix(),
                train_set['y'].as_matrix(),
                sample_weight=np.array(
                    map(lambda x: 1.0 / x / x, train_set['y'].as_matrix())))
    test_set['predictY'] = model15.predict(test_set[feature_set].as_matrix())
    test_set.to_csv('result/' + feature_set_folder + '/model15_offline.csv')
    test1_set['predictY'] = model15.predict(test1_set[feature_set].as_matrix())
    test1_set.to_csv('result/' + feature_set_folder + '/model15_offline1.csv')

    # model5
    model5 = XGBRegressor(n_estimators=600,
                          learning_rate=0.01,
                          max_depth=6,
                          colsample_bytree=0.7,
                          subsample=0.7,
                          colsample_bylevel=0.7,
                          seed=10000)
    model5.fit(train_set[feature_set].as_matrix(),
               train_set['y'].as_matrix(),
               sample_weight=map(lambda x: 1.0 / x / x,
                                 train_set['y'].as_matrix()))
    test_set['predictY'] = model5.predict(test_set[feature_set].as_matrix())
    test_set.to_csv('result/' + feature_set_folder + '/model5_offline.csv')
    test1_set['predictY'] = model5.predict(test1_set[feature_set].as_matrix())
    test1_set.to_csv('result/' + feature_set_folder + '/model5_offline1.csv')

    # model6
    model6 = XGBRegressor(n_estimators=600,
                          learning_rate=0.01,
                          max_depth=5,
                          colsample_bytree=0.7,
                          subsample=0.7,
                          colsample_bylevel=0.7)
    model6.fit(train_set[feature_set].as_matrix(),
               train_set['y'].as_matrix(),
               sample_weight=map(lambda x: 1.0 / x / x,
                                 train_set['y'].as_matrix()))
    test_set['predictY'] = model6.predict(test_set[feature_set].as_matrix())
    test_set.to_csv('result/' + feature_set_folder + '/model6_offline.csv')
    test1_set['predictY'] = model6.predict(test1_set[feature_set].as_matrix())
    test1_set.to_csv('result/' + feature_set_folder + '/model6_offline1.csv')

    pass
Ejemplo n.º 41
0
]]
y_ol = ol['ActualDays']
y_lo = lo['ActualDays']
y_eo = eo['ActualDays']

# X = data[train_target]
y = data[test_target]

X_train, X_test, y_train, y_test = train_test_split(X_all,
                                                    y,
                                                    test_size=0.4,
                                                    random_state=0)

clf = ExtraTreesRegressor(n_estimators=100, criterion='mae')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
score = mean_absolute_error(y_pred, y_test)
print(score)
y_pred_1 = [math.floor(x) if x > 0 else math.ceil(x) for x in y_pred]
score2 = mean_absolute_error(y_pred_1, y_test)

print(score2)

# On the line point region regression
clf_ol = ExtraTreesRegressor(n_estimators=100,
                             criterion='mse',
                             bootstrap=False)
clf_ol.fit(y_ol, y_ol)

# Later or on time data region regression
clf_lo = ExtraTreesRegressor(n_estimators=10, criterion='mse', bootstrap=False)
Ejemplo n.º 42
0
    test = o.features[col]

    n = test.isnull().sum(axis=1)

    for c in test.columns:

        test[c + '_nan_'] = pd.isnull(test[c])

    test = test.fillna(d_mean)

    test['znull'] = n

    pred = o.target

    pred['y'] = model_et.predict(test).clip(low_y_cut, high_y_cut)

    pred['y'] = pred.apply(get_weighted_y, axis=1)

    o, reward, done, info = env.step(pred[['id', 'y']])

    pred_y = list(pred.y.values)

    y_actual_list.extend(actual_y)

    y_pred_list.extend(pred_y)

    overall_reward = get_reward(np.array(y_actual_list), np.array(y_pred_list))

    et_overall_reward_list.append(overall_reward)
Ejemplo n.º 43
0
    rf.fit(x_train,y_train)
    y_pre_rf = rf.predict(x_test)
    rf_pre.append(y_pre_rf)
    '''
    ###
    params_ExtraTrees = {
        'max_features': 'log2',
        'n_estimators': 600,
        'max_depth': 10,
        'oob_score': True,
        'n_jobs': 4,
        'bootstrap': True
    }
    ext = ExtraTreesRegressor(**params_ExtraTrees)
    ext.fit(x_train, y_train)
    y_pre_ext = ext.predict(x_test)
    #ext_pre.append(y_pre_ext)
    #y_pre_ext1 = ext.predict(x_test[:-7])
    #y_pre_ext7d = np.append(y_pre_ext1,y_pre_ext1)
    #ext_pre7d.append(y_pre_ext7d)

    #print(y_pre_ext)
    #print(y_pre_ext1)
    ###
    '''
    params_gbrt = {'loss':'huber','n_estimators': 500,'max_depth':12,'learning_rate': 0.01, 'random_state': 3}
    gbrt = GradientBoostingRegressor(**params_gbrt)
    gbrt.fit(x_train,y_train)
    y_pre_gbrt = gbrt.predict(x_test)
    gbrt_pre.append(y_pre_gbrt)
    '''
Ejemplo n.º 44
0
    X = list(zip(*X1))
    Y = cols[13]

    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.2, random_state=rd.randrange(1000))
    X_train = np.array(X_train)
    y_train = np.array(y_train)
    X_test = np.array(X_test)
    y_test = np.array(y_test)

    #print(y_test)

    lin_reg_mod = ExtraTreesRegressor(n_estimators=500)

    lin_reg_mod.fit(X_train, y_train)
    pred = lin_reg_mod.predict(X_test)
    #print(pred)
    #print(y_test)
    test_set_r2 = r2_score(y_test, pred)
    print(test_set_r2)
    tr2 += test_set_r2

    #abs_er = mean_absolute_error(y_test, pred)
    #tabse+=abs_er

    temp = []
    for (i, j) in zip(y_test, pred):
        t = (abs(i - j)) / float(i)
        temp.append(t)
    #print(temp)
    print(np.median(temp))
Ejemplo n.º 45
0
class ExtraTreesRegressor(ParamSklearnRegressionAlgorithm):
    def __init__(self, n_estimators, criterion, min_samples_leaf,
                 min_samples_split, max_features,
                 max_leaf_nodes_or_max_depth="max_depth",
                 bootstrap=False, max_leaf_nodes=None, max_depth="None",
                 oob_score=False, n_jobs=1, random_state=None, verbose=0):

        self.n_estimators = int(n_estimators)
        self.estimator_increment = 10
        if criterion not in ("mse",):
            raise ValueError("'criterion' is not in ('mse',): "
                             "%s" % criterion)
        self.criterion = criterion

        if max_leaf_nodes_or_max_depth == "max_depth":
            self.max_leaf_nodes = None
            if max_depth == "None":
                self.max_depth = None
            else:
                self.max_depth = int(max_depth)
                #if use_max_depth == "True":
                #    self.max_depth = int(max_depth)
                #elif use_max_depth == "False":
                #    self.max_depth = None
        else:
            if max_leaf_nodes == "None":
                self.max_leaf_nodes = None
            else:
                self.max_leaf_nodes = int(max_leaf_nodes)
            self.max_depth = None

        self.min_samples_leaf = int(min_samples_leaf)
        self.min_samples_split = int(min_samples_split)

        self.max_features = float(max_features)

        if bootstrap == "True":
            self.bootstrap = True
        elif bootstrap == "False":
            self.bootstrap = False

        self.oob_score = oob_score
        self.n_jobs = int(n_jobs)
        self.random_state = random_state
        self.verbose = int(verbose)
        self.estimator = None

    def fit(self, X, y, refit=False):
        if self.estimator is None or refit:
            self.iterative_fit(X, y, n_iter=1, refit=refit)

        while not self.configuration_fully_fitted():
            self.iterative_fit(X, y, n_iter=1)
        return self

    def iterative_fit(self, X, y, n_iter=1, refit=False):
        if refit:
            self.estimator = None

        if self.estimator is None:
            num_features = X.shape[1]
            max_features = int(
                float(self.max_features) * (np.log(num_features) + 1))
            # Use at most half of the features
            max_features = max(1, min(int(X.shape[1] / 2), max_features))
            self.estimator = ETR(
                n_estimators=0, criterion=self.criterion,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                bootstrap=self.bootstrap,
                max_features=max_features, max_leaf_nodes=self.max_leaf_nodes,
                oob_score=self.oob_score, n_jobs=self.n_jobs,
                verbose=self.verbose,
                random_state=self.random_state,
                warm_start=True
            )
        tmp = self.estimator  # TODO copy ?
        tmp.n_estimators += n_iter
        tmp.fit(X, y,)
        self.estimator = tmp
        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        return not len(self.estimator.estimators_) < self.n_estimators

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict_proba(X)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {'shortname': 'ET',
                'name': 'Extra Trees Regressor',
                'handles_missing_values': False,
                'handles_nominal_values': False,
                'handles_numerical_features': True,
                'prefers_data_scaled': False,
                # TODO find out if this is good because of sparsity...
                'prefers_data_normalized': False,
                'handles_regression': True,
                'handles_classification': False,
                'handles_multiclass': False,
                'handles_multilabel': False,
                'is_deterministic': True,
                'handles_sparse': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (PREDICTIONS,),
                # TODO find out what is best used here:
                # Fortran- or C-contiguous?
                'preferred_dtype': np.float32}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        cs = ConfigurationSpace()

        n_estimators = cs.add_hyperparameter(Constant("n_estimators", 100))
        criterion = cs.add_hyperparameter(Constant("criterion", "mse"))
        max_features = cs.add_hyperparameter(UniformFloatHyperparameter(
            "max_features", 0.5, 5, default=1))

        max_depth = cs.add_hyperparameter(
            UnParametrizedHyperparameter(name="max_depth", value="None"))

        min_samples_split = cs.add_hyperparameter(UniformIntegerHyperparameter(
            "min_samples_split", 2, 20, default=2))
        min_samples_leaf = cs.add_hyperparameter(UniformIntegerHyperparameter(
            "min_samples_leaf", 1, 20, default=1))

        # Unparametrized, we use min_samples as regularization
        # max_leaf_nodes_or_max_depth = UnParametrizedHyperparameter(
        # name="max_leaf_nodes_or_max_depth", value="max_depth")
        # CategoricalHyperparameter("max_leaf_nodes_or_max_depth",
        # choices=["max_leaf_nodes", "max_depth"], default="max_depth")
        # min_weight_fraction_leaf = UniformFloatHyperparameter(
        #    "min_weight_fraction_leaf", 0.0, 0.1)
        # max_leaf_nodes = UnParametrizedHyperparameter(name="max_leaf_nodes",
        #                                              value="None")

        bootstrap = cs.add_hyperparameter(CategoricalHyperparameter(
            "bootstrap", ["True", "False"], default="False"))

        # Conditions
        # Not applicable because max_leaf_nodes is not a legal value of the parent
        #cond_max_leaf_nodes_or_max_depth = \
        #    EqualsCondition(child=max_leaf_nodes,
        #                    parent=max_leaf_nodes_or_max_depth,
        #                    value="max_leaf_nodes")
        #cond2_max_leaf_nodes_or_max_depth = \
        #    EqualsCondition(child=use_max_depth,
        #                    parent=max_leaf_nodes_or_max_depth,
        #                    value="max_depth")

        #cond_max_depth = EqualsCondition(child=max_depth, parent=use_max_depth,
        #value="True")
        #cs.add_condition(cond_max_leaf_nodes_or_max_depth)
        #cs.add_condition(cond2_max_leaf_nodes_or_max_depth)
        #cs.add_condition(cond_max_depth)

        return cs
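# --- illustrative aside, not from the original example ---
# A minimal, self-contained sketch of the warm-start pattern that
# iterative_fit() above relies on: with warm_start=True, raising n_estimators
# and calling fit() again only builds the additional trees instead of
# retraining the forest from scratch. X_demo / y_demo are synthetic
# placeholders, not data from the original snippet.
import numpy as np
from sklearn.ensemble import ExtraTreesRegressor as _ETR_demo

X_demo = np.random.RandomState(0).rand(100, 4)
y_demo = X_demo.sum(axis=1)
_est_demo = _ETR_demo(n_estimators=0, warm_start=True, random_state=0)
for _ in range(5):
    _est_demo.n_estimators += 10   # raise the target ensemble size
    _est_demo.fit(X_demo, y_demo)  # only the 10 new trees are built each call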
Ejemplo n.º 46
0
train = data[targets < 30]
test = data[targets >= 30]  # Test on independent people
n_pixels = data.shape[1]

X_train = train[:, :int(0.5 * n_pixels)]  # Upper half of the faces
Y_train = train[:, int(0.5 * n_pixels):]  # Lower half of the faces
X_test = test[:, :int(0.5 * n_pixels)]
Y_test = test[:, int(0.5 * n_pixels):]

# Build a multi-output forest
forest = ExtraTreesRegressor(n_estimators=10,
                             max_features=32,
                             random_state=0)

forest.fit(X_train, Y_train)
Y_test_predict = forest.predict(X_test)
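# ExtraTreesRegressor supports multi-output targets natively: Y_train has one
# column per lower-face pixel, and predict() returns an array of the same width.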

# Plot the completed faces
n_faces = 5
image_shape = (64, 64)

pl.figure(figsize=(2. * n_faces, 2.26 * 2))
pl.suptitle("Face completion with multi-output forests", size=16)

for i in range(1, 1 + n_faces):
    face_id = np.random.randint(X_test.shape[0])

    true_face = np.hstack((X_test[face_id], Y_test[face_id]))
    completed_face = np.hstack((X_test[face_id], Y_test_predict[face_id]))

    pl.subplot(2, n_faces, i)
Ejemplo n.º 47
0
def sens(sample=20):

    home = '/home/nealbob'
    folder = '/Dropbox/Model/results/chapter7/chapter7/'
    out = '/Dropbox/Thesis/IMG/chapter7/'
    img_ext = '.pdf'
    table_out = '/Dropbox/Thesis/STATS/chapter7/'
    
    rows = ['CS', 'SWA', 'OA', 'CS-HL']
    results = {run_no: {row : 0 for row in rows} for run_no in range(1,sample)} 
    samprange = []
    
    for run_no in range(1, sample): 
        try:
            for row in rows:
                with open(home + folder + str(run_no) + '_' + row + '_result.pkl', 'rb') as f:
                    results[run_no][row] = pickle.load(f)
                    f.close()
                m = len(results[run_no]['CS'][0]['S']['Annual']['Mean']) - 1
                SW = results[run_no][row][0]['SW']['Annual']['Mean'][m]
                if math.isnan(SW) or math.isinf(SW):
                    raise Exception("Found a nan")
            samprange.append(run_no)
        except:
            print row
            print 'Run no: ' + str(run_no) + ' failed.'

    n = len(samprange)
    print str(n) + ' good runs of ' + str(sample - 1) + ' total'
     
    ###### Summary tables #####
    
    series = ['SW', 'Profit', 'B', 'Budget', 'S']
    title = {'SW': 'Social welfare relative to CS',
             'Profit': 'Profit relative to CS',
             'B': 'Environmental benefits relative to CS',
             'S': 'Storage relative to CS',
             'Budget': 'Environmental trade relative to CS'}
    scale = {'SW': 1000000, 'Profit': 1000000, 'S': 1000, 'W': 1000, 'E': 1000,
             'B': 1000000, 'Z': 1000, 'Q_low': 1000, 'Q_high': 1000,
             'Q_env': 1000, 'A_low': 1000, 'A_high': 1000, 'A_env': 1000,
             'S_low': 1000, 'S_high': 1000, 'S_env': 1000, 'U_low': 1000000,
             'U_high': 1000000, 'Budget': 1000000}

    m = len(results[1]['CS'][0]['S']['Annual']['Mean']) - 1
    
    X = {}
    XI = {}

    for x in series:
        data0 = []
        data1 = []
        data2 = []
        
        for row in rows:
            temp = np.zeros(n)
            record = {}
            record1 = {}
            i = 0
            for run_no in samprange:
                temp[i] = results[run_no][row][0][x]['Annual']['Mean'][m] / scale[x]
                i += 1
                record[run_no] = results[run_no][row][0][x]['Annual']['Mean'][m] / scale[x]
            record1['Mean'] = np.mean(temp)
            record1['Min'] = np.min(temp)
            record1['Q1'] = np.percentile(temp, 25)
            record1['Q3'] = np.percentile(temp, 75)
            record1['Max'] = np.max(temp)
            
            X[row] = temp

            data0.append(record)
            data1.append(record1)

        data = pandas.DataFrame(data0)
        data.index = rows
        data1 = pandas.DataFrame(data1)
        data1.index = rows #['Mean', 'Min', 'Q1', 'Q3', 'Max']

        for row in rows:
            record2 = {}
            temp1 = np.zeros(n)
            for i in range(n):
                temp1[i] = X[row][i] / X['CS'][i]
            
            XI[row] = temp1
            
            record2['Mean'] = np.mean(temp1)
            record2['Min'] = np.min(temp1)
            record2['Q1'] = np.percentile(temp1, 25)
            record2['Q3'] = np.percentile(temp1, 75)
            record2['Max'] = np.max(temp1)
            data2.append(record2)
        
        data2 = pandas.DataFrame(data2)
        data2.index = rows #['Mean', 'Min', 'Q1', 'Q3', 'Max']

        with open(home + table_out + 'sens_full' + x + '.txt', 'w') as f:
            f.write(data.to_latex(float_format='{:,.2f}'.format, columns=samprange))
            f.close()
        
        with open(home + table_out + 'sens_sum' + x + '.txt', 'w') as f:
            f.write(data1.to_latex(float_format='{:,.2f}'.format, columns=['Mean', 'Min', 'Q1', 'Q3', 'Max']))
            f.close()
        
        with open(home + table_out + 'sens_table' + x + '.txt', 'w') as f:
            f.write(data2.to_latex(float_format='{:,.2f}'.format, columns=['Mean', 'Min', 'Q1', 'Q3', 'Max']))
            f.close()
        
        minx = np.percentile([min(XI[i]) for i in XI], 1)
        maxx = np.percentile([max(XI[i]) for i in XI],99)
        
        chart_ch7(XI, 0.985 * minx, 1.015 * maxx, title[x], out, str(x) + '_sens')

    ##################################################################################### Regression

    Y = np.zeros([n, 4])
    
    j = 0
    for row in rows:
        i = 0
        for run_no in samprange:
            Y[i, j] = results[run_no][row][0]['SW']['Annual']['Mean'][m] /  results[run_no]['CS'][0]['SW']['Annual']['Mean'][m]
            i += 1
        j += 1

    paras = []
    for run_no in range(1, sample): 
        with open(home + folder + str(run_no) + '_para.pkl', 'rb') as f:
            paras.append(pickle.load(f))
            f.close()
    
    pname1 = ['delta0', 'I_K', 'SD_I', 't_cost', 'N_high', 'rho_I', 'alpha', 'rho_eps', 'sig_eta', 'LL']
    numpara1 = len(pname1)    
    pname2 = ['omega_mu', 'omega_sig', 'omegadelta', 'delta_a', 'delta_Ea', 'delta_Eb', 'delta_R', 'b_1', 'b_value', 'e_sig']
    numpara2 = len(pname2)    
    para_labels = pname1 + pname2 + ['lambda', 'lambdaHL', 'lambdae']
    numpara = numpara1 + numpara2 + 3

    X = np.zeros([n, numpara])
    
    # raw strings so that \t, \a and \b inside the TeX labels are not read as
    # escape characters
    para_names = [r'$\delta0$', r'$E[I]/K$', r'$c_v$', r'$\tau$', r'$n_{high}$',
                  r'$\rho_I$', r'$\alpha$', r'$\rho_e$', r'$\sigma_{\eta}$',
                  r'${\aA_{low} \over E[I]/K}$', r'$\mu_\omega$',
                  r'$\sigma_\omega$', r'$\omega_\delta$', r'$\delta_a$',
                  r'$\delta_{Ea}$', r'$\delta_{Eb}$', r'$\delta_R$', r'$b_1$',
                  r'$b_{\$} \over \bar I$', r'$\sigma_{e0}$',
                  r'$\Lambda_{high} - \hat \Lambda_{high}$',
                  r'$\Lambda_{high}^{CS-HL} - \hat \Lambda_{high}^{CS-HL}$',
                  r'$\lambda_0 - \hat \lambda_0$']
    
    for j in range(numpara1):
        for i in range(n):
            if pname1[j] == 'LL':
                X[i, j] = paras[samprange[i]-1].para_list[pname1[j]] / paras[samprange[i]-1].para_list['I_K']
            else:
                X[i, j] = paras[samprange[i]-1].para_list[pname1[j]]
    
    for j in range(numpara1, numpara2+numpara1):
        for i in range(n):
            if pname2[j - numpara1] == 'b_value':
                X[i, j] = paras[samprange[i]-1].ch7[pname2[j - numpara1]] / (paras[samprange[i]-1].para_list['I_K']*1000000)
            else:
                X[i, j] = paras[samprange[i]-1].ch7[pname2[j - numpara1]]
        
    CS_c = -0.153007555
    CS_b = 0.00930613
    CSHL_c = -0.0891846
    CSHL_b = 0.0047009
    
    for i in range(n): 
        if i > 20:
            y = paras[samprange[i]-1].y
        else:
            y = CS_c + CS_b * paras[samprange[i]-1].para_list['N_high']
        X[i, numpara2 + numpara1] = paras[samprange[i]-1].Lambda_high - y 
    
    for i in range(n): 
        if i > 20:
            yhl = paras[samprange[i]-1].yhl
        else:
            yhl = CSHL_c + CSHL_b * paras[samprange[i]-1].para_list['N_high']
        X[i, numpara2 + numpara1 + 1] = paras[samprange[i]-1].Lambda_high_HL - yhl
    
    yelist = [0.4443, 0.1585, 0.1989, 0.2708, 0.3926, 0.0697, 0.1290, 0.1661, 0.2687, 0.0868, 0.1239, 0.3598, 0.3543, 0.2883, 0.2367, 0.2139, 0.2485, 0.2641, 0.5730, 0.1745] 
    
    lambdae = np.zeros(n)
    for i in range(n): 
        if i >= 20:
            ye = paras[samprange[i]-1].E_lambda_hat
        else:
            ye = yelist[samprange[i]-1]  
        X[i, numpara2 + numpara1 + 2] = paras[samprange[i]-1].ch7['inflow_share'] - ye
        lambdae[i] = paras[samprange[i]-1].ch7['inflow_share']
    
    index = lambdae < 0.5 
    pylab.hexbin(lambdae[index], X[index,1], C=Y[index, 2], gridsize=15)
    pylab.xlabel('Environmental share, $\lambda_0$')
    pylab.ylabel('Mean Inflow to Capacity, $E[I_t]/K$')
    cb = pylab.colorbar()
    cb.set_label('OA welfare relative to CS') 
    #pylab.ylim(0, 1000)
    pylab.savefig(home + out + 'OAversusCS.pdf', bbox_inches='tight')
    pylab.show()

    pylab.hexbin(X[:, numpara -1], X[:, 1], C=Y[:, 3], gridsize=15)
    pylab.xlabel('Environmental share, $\lambda_0 - \hat \lambda_0$')
    pylab.ylabel('Mean Inflow to Capacity, $E[I_t]/K$')
    cb = pylab.colorbar()
    cb.set_label('CS-HL welfare relative to CS') 
    #pylab.ylim(0, 1000)
    pylab.savefig(home + out + 'CSHLversusCS.pdf', bbox_inches='tight')
    pylab.show()

    
    tree = Tree(n_estimators=500, n_jobs=4)
    tree.fit(X, Y)
    rank = tree.feature_importances_ * 100
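    # feature_importances_ of a fitted forest is normalised to sum to 1, so
    # multiplying by 100 expresses each parameter's importance as a percentage.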
    
    data0 = []
    inn = 0
    for p in para_names:
        record = {}
        record['Importance'] = rank[inn]
        data0.append(record)
        inn = inn + 1

    tab = pandas.DataFrame(data0)
    tab.index = para_names
    tab = tab.sort_values(by='Importance', ascending=False)
    tab_text = tab.to_latex(float_format='{:,.2f}'.format, escape=False)
    print tab_text 
    with open(home + table_out + 'importance.txt', 'w') as f:
        f.write(tab_text)
        f.close()
     
    for i in range(numpara):
        Xtemp = np.zeros([200, numpara])
        for j in range(numpara):
            Xtemp[:, j] = np.ones(200) * np.mean(X[:, j])

        Xtemp[:, i] = np.linspace(np.min(X[:, i]), np.max(X[:, i]), 200)
        Ytemp = tree.predict(Xtemp)
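        # Every other parameter is held at its sample mean while parameter i is
        # swept over its observed range, so Ytemp traces a one-at-a-time
        # (partial-dependence style) response curve of the fitted forest.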
        
        data = [[Xtemp[:, i], Ytemp]]
        data0 = []
        for k in range(200):
            record = {}
            record['SWA'] = Ytemp[k, 1]
            record['OA'] = Ytemp[k, 2]
            record['CS-HL'] = Ytemp[k, 3]
            data0.append(record)

        data = pandas.DataFrame(data0)
        data.index = Xtemp[:, i]
        chart_data = {'OUTFILE': home + out + 'SW_' + para_labels[i] + img_ext,
         'XLABEL': '',
         'YLABEL': '',
         'YMIN': 0.85,
         'YMAX': 1.03}
        print para_labels[i]
        
        build_chart(chart_data, data, chart_type='date', ylim=True, save=True)
     
    ##################################################################################### Classifier

    srnum = {'CS' : 0, 'SWA' : 1, 'OA' : 2, 'CS-HL' : 3}
    Y = np.zeros(n)

    for i in range(n):
        SW = 0
        SWmax = -1
        for row in rows:
            SW = results[samprange[i]][row][0]['SW']['Annual']['Mean'][m]      
            if SW > SWmax:
                SWmax = SW
                Y[i] = srnum[row]
    
    for row in rows:
        idx = np.where(Y == srnum[row])
        print row + ': ' + str(len(Y[idx]))

     
    treec = Tree_classifier(n_estimators=500, n_jobs=4) #min_samples_split=3, min_samples_leaf=2)
    treec.fit(X, Y)
    rank = treec.feature_importances_ * 100

    data0 = []
    inn = 0
    for p in para_names:
        record = {}
        record['Importance'] = rank[inn]
        record['CS'] = np.mean(X[np.where(Y == 0), inn])
        record['SWA'] = np.mean(X[np.where(Y == 1), inn])
        record['OA'] = np.mean(X[np.where(Y == 2), inn])
        record['CS-HL'] = np.mean(X[np.where(Y == 3), inn])
        data0.append(record)
        inn = inn + 1

    tab = pandas.DataFrame(data0)
    tab.index = para_names
    tab = tab.sort_values(by='Importance', ascending=False)
    tab_text = tab.to_latex(float_format='{:,.2f}'.format, escape=False)
    
    with open(home + table_out + 'classifier_table.txt', 'w') as f:
        f.write(tab.to_latex(float_format='{:,.2f}'.format, escape=False, columns=['Importance', 'CS', 'SWA', 'OA', 'CS-HL']))
        f.close()
    
    pylab.ioff()
    fig_width_pt = 350
    inches_per_pt = 1.0 / 72.27
    golden_mean = 1.2360679774997898 / 2.0
    fig_width = fig_width_pt * inches_per_pt
    fig_height = fig_width * golden_mean
    fig_size = [fig_width, fig_height]
    params = {'backend': 'ps',
     'axes.labelsize': 10,
     'text.fontsize': 10,
     'legend.fontsize': 10,
     'xtick.labelsize': 8,
     'ytick.labelsize': 8,
     'text.usetex': True,
     'figure.figsize': fig_size}
    pylab.rcParams.update(params)
    plot_colors = 'rybg'
    cmap = pylab.cm.RdYlBu
    
    yi = numpara-1
    
    minyi = -0.1 
    maxyi = 0.1

    (xx, yy,) = np.meshgrid(np.arange(min(X[:, 1]), max(X[:, 1]), 0.02), np.arange(min(X[:, yi]), max(X[:, yi]), 0.01))

    nnn = xx.ravel().shape[0]
    
    Xlist = [np.mean(X[:,i])*np.ones(nnn) for i in range(numpara)]
    Xlist[1] = xx.ravel()
    Xlist[yi] = yy.ravel()
    XX = np.array(Xlist).T

    Z = treec.predict(XX).reshape(xx.shape)
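    # Z holds the classifier's predicted best policy at each grid point, with
    # all remaining parameters fixed at their means; contourf then shades the
    # resulting decision regions over the two chosen axes.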
    fig = pylab.contourf(xx, yy, Z, [0, 0.9999, 1.9999, 2.9999, 3.9999], colors=('red', 'yellow', 'blue', 'green'), alpha=0.5, antialiased=False, extend='both')
    for (i, c,) in zip(xrange(4), plot_colors):
        idx0 = np.where(Y == i)
        pylab.scatter(X[idx0, 1], X[idx0, yi], c=c, cmap=cmap, label=rows[i], s = 12, lw=0.5 )
        pylab.legend(bbox_to_anchor=(0.0, 1.02, 1.0, 0.102), loc=3, ncol=4, mode='expand', borderaxespad=0.0)

    pylab.xlabel('Mean inflow over capacity')
    pylab.ylabel('Environmental inflow share')
    pylab.ylim(minyi, maxyi)
    OUT = home + out + 'class_fig.pdf'
    pylab.savefig(OUT, bbox_inches='tight')
    pylab.show()
Ejemplo n.º 48
0
      
        #Best hyperparam config
        best = pruebas[k].best_trial    
        params = best['misc']['vals']
        
        estimators=[50,100,150,300]
        
        
        model = ExtraTreesRegressor(n_estimators=estimators[int(np.array(params['estimators']))],
                                          min_samples_leaf=int(np.array(params['leaf'])),
                                          min_samples_split=int(np.array(params['split'])),
                                          max_features=int(np.array(params['features'])))
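        # hyperopt records hp.choice parameters as indices into the option
        # list, which is presumably why params['estimators'] is used to index
        # the estimators list while the other values are cast directly to int.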
        
        model.fit(X_train, y_train)
        #Model metrics
        pred = model.predict(X_train)
        train_loss.append(r2_score(y_train, pred))
        pred = model.predict(X_test)
        test_loss.append(r2_score(y_test, pred))
       
        val_loss.append(best['result']['loss']*-1)
    
    #Store metrics
    metrics = {}
    metrics['train'] = train_loss
    metrics['val'] = val_loss
    metrics['test'] = test_loss
    with open('ETR_metrics_t+'+str(h+1)+'.pkl', 'wb') as f:
        pickle.dump(metrics, f)
    
Ejemplo n.º 49
0
train_x_mean = DataFrame(train_x_mean,columns=['train_mean_'+str(i) for i in range(len(train_x_mean[0]))])
train_x_max = DataFrame(train_x_max,columns=['train_max_'+str(i) for i in range(len(train_x_max[0]))])
train_x_min = DataFrame(train_x_min,columns=['train_min_'+str(i) for i in range(len(train_x_min[0]))])
train_x_median = DataFrame(train_x_median,columns=['train_median_'+str(i) for i in range(len(train_x_median[0]))])
test_x_mean = DataFrame(test_x_mean,columns=['test_mean_'+str(i) for i in range(len(test_x_mean[0]))])
test_x_max = DataFrame(test_x_max,columns=['test_max_'+str(i) for i in range(len(test_x_max[0]))])
test_x_min = DataFrame(test_x_min,columns=['test_min_'+str(i) for i in range(len(test_x_min[0]))])
test_x_median = DataFrame(test_x_median,columns=['test_median_'+str(i) for i in range(len(test_x_median[0]))])

train_x = train_x_mean.join(train_x_max,how='left')
train_x = train_x.join(train_x_min,how='left')
train_x = train_x.join(train_x_median,how='left')
test_x = test_x_mean.join(test_x_max,how='left')
test_x = test_x.join(test_x_min,how='left')
test_x = test_x.join(test_x_median,how='left')
test_x = test_x.fillna(1)

train_y = DataFrame(train_y)

#---------------------------------------------------------------------------------------- directly generate the aggressive result
ET = ExtraTreesRegressor(n_estimators=2600,random_state=1,n_jobs=-1,min_samples_split=2,min_samples_leaf=2,max_depth=12,max_features='sqrt')
ET.fit(train_x,train_y)
pre = ET.predict(test_x)
pre = sqrt(pre)
pre = pre*mean_11_2016

pre = DataFrame(pre.round())
pre.insert(0,'shop_id',[i for i in range(1,2001)])
pre.to_csv('../results/result'+day_time+'_pre.csv',index=False,header=False)
Ejemplo n.º 50
0
    "The MAE of DecisionTreeRegressor is ",
    mean_absolute_error(ss_y.inverse_transform(y_test),
                        ss_y.inverse_transform(dtr_y_pred)))

# Import RandomForestRegressor, ExtraTreesRegressor and GradientBoostingRegressor from sklearn.ensemble
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor

# Train a RandomForestRegressor and predict on the test set
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train.ravel())
rfr_y_pred = rfr.predict(X_test)

# Train an ExtraTreesRegressor and predict on the test set
etr = ExtraTreesRegressor()
etr.fit(X_train, y_train.ravel())
etr_y_pred = etr.predict(X_test)

# Train a GradientBoostingRegressor and predict on the test set
gbr = GradientBoostingRegressor()
gbr.fit(X_train, y_train.ravel())
gbr_y_pred = gbr.predict(X_test)

# Evaluate the random forest regressor with default parameters on the test set using R-squared, MSE and MAE
print("R-squared value of RandomForestRegressor is ",
      rfr.score(X_test, y_test))
print(
    "The MSE of RandomForestRegressor is ",
    mean_squared_error(ss_y.inverse_transform(y_test),
                       ss_y.inverse_transform(rfr_y_pred)))
print(
    "The MAE of RandomForestRegressor is ",
Ejemplo n.º 51
0
def decision_tree(X, y1, y2, y3):
  n, _ = X.shape
  nTrain = int(0.5*n)  #training on 50% of the data
  Xtrain = X[:nTrain,:]
  ytrain = y1[:nTrain]
  ytrain_registered = y2[:nTrain]
  ytest_registered = y2[nTrain:]
  ytrain_casual = y3[:nTrain]
  ytest_casual = y3[nTrain:]
  Xtest = X[nTrain:,:]
  ytest = y1[nTrain:]

  #regular

  clf_1 = DecisionTreeRegressor(max_depth=None)
  clf_2 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=None),
                          n_estimators=500)
  clf_4 = RandomForestRegressor(n_estimators=500, max_depth=None,
                          min_samples_split=1, random_state=0)
  clf_5 = ExtraTreesRegressor(n_estimators=500, max_depth=None,
                          min_samples_split=1, random_state=0)
  clf_3 = GradientBoostingRegressor(n_estimators=500,
                          max_depth=None, random_state=0)

  print "finished generating tree"

  clf_1.fit(Xtrain, ytrain_registered)
  clf_2.fit(Xtrain, ytrain_registered)
  clf_3.fit(Xtrain, ytrain_registered)
  clf_4.fit(Xtrain, ytrain_registered)
  clf_5.fit(Xtrain, ytrain_registered)


  print 'Finished fitting'


  dt_regular = clf_1.predict(Xtest)
  ada_regular = clf_2.predict(Xtest)
  grad_regular = clf_3.predict(Xtest)
  rf_regular = clf_4.predict(Xtest)
  et_regular = clf_5.predict(Xtest)

  #casual
  print "finished generating tree"

  clf_1.fit(Xtrain, ytrain_casual)
  clf_2.fit(Xtrain, ytrain_casual)
  clf_3.fit(Xtrain, ytrain_casual)
  clf_4.fit(Xtrain, ytrain_casual)
  clf_5.fit(Xtrain, ytrain_casual)


  print 'Finished fitting'


  dt_casual = clf_1.predict(Xtest)
  ada_casual = clf_2.predict(Xtest)
  grad_casual = clf_3.predict(Xtest)
  rf_casual = clf_4.predict(Xtest)
  et_casual = clf_5.predict(Xtest)
  feature_imps = clf_4.feature_importances_

  print "regular decision tree"
  print rmsle(ytest, dt_regular + dt_casual)
  print "boosted decision tree"
  print rmsle(ytest, ada_regular + ada_casual)
  print "gradient tree boosting"
  print rmsle(ytest, grad_regular + grad_casual)
  print "random forest classifier"
  print rmsle(ytest, rf_regular + rf_casual)
  print "extra trees classifier"
  print rmsle(ytest, et_casual + et_regular)

  print "feature importances"
  print feature_imps
Ejemplo n.º 52
0
    def ensemble(self):
        '''
        Create an ensemble of gradient boosting, random forest and extra trees regressors
        '''

        self.remove_columns([
            'institute_latitude', 'institute_longitude', 'institute_state',
            'institute_country', 'var10', 'var11', 'var12', 'var13', 'var14',
            'var15', 'instructor_past_performance',
            'instructor_association_industry_expert', 'secondary_area', 'var24'
        ])

        self.split_dataset()

        gbr_model = GradientBoostingRegressor(learning_rate=0.1,
                                              n_estimators=200,
                                              subsample=0.8)
        rf_model = RandomForestRegressor(n_estimators=50)
        etr_model = ExtraTreesRegressor(n_estimators=50)

        gbr_model.fit(self.Xt, self.yt)

        yt_pred_gbr = gbr_model.predict(self.Xt)
        gbr_training_score = self.eval_score(self.yt, yt_pred_gbr)

        print 'GBR training score ', gbr_training_score

        rf_model.fit(self.Xt, self.yt)

        yt_pred_rf = rf_model.predict(self.Xt)
        rf_training_score = self.eval_score(self.yt, yt_pred_rf)

        print 'RF training score ', rf_training_score

        etr_model.fit(self.Xt, self.yt)

        yt_pred_etr = etr_model.predict(self.Xt)
        etr_training_score = self.eval_score(self.yt, yt_pred_etr)

        print 'ETR training score ', etr_training_score

        self.training_score = self.eval_score(
            self.yt, (yt_pred_rf + yt_pred_gbr + yt_pred_etr) / 3.)

        yv_pred_gbr = gbr_model.predict(self.Xv)
        gbr_test_score = self.eval_score(self.yv, yv_pred_gbr)

        print 'GBR test score ', gbr_test_score

        yv_pred_rf = rf_model.predict(self.Xv)
        rf_test_score = self.eval_score(self.yv, yv_pred_rf)

        print 'Rf score ', rf_test_score

        yv_pred_etr = etr_model.predict(self.Xv)
        etr_test_score = self.eval_score(self.yv, yv_pred_etr)

        print 'ETR test score ', etr_test_score

        self.test_score = self.eval_score(
            self.yv, (yv_pred_rf + yv_pred_gbr + yv_pred_etr) / 3.)

        print 'Correlation between predictions of these three models ', pd.DataFrame(
            {
                'rf_test_score': yv_pred_rf,
                'gbr_test_score': yv_pred_gbr,
                'etr_test_score': yv_pred_etr
            }).corr()
Ejemplo n.º 53
0
        test_itr_first = test_itr_last


output = []
for tr_,te_ in getExtensiveSeasonalTrainTestData():
    #tr_,te_ = normalize(tr_.drop(['casual','registered','count'],axis=1),te_)
    print("TrainEnd:"+str(tr_['year'].iloc[len(tr_)-1])+":"+str(tr_['month'].iloc[len(tr_)-1])+"TrainStart:"+str(tr_['year'].iloc[0])+":"+str(tr_['month'].iloc[0]))
    print("TestEnd:"+str(te_['year'].iloc[len(te_)-1])+":"+str(te_['month'].iloc[len(te_)-1])+"TestStart:"+str(te_['year'].iloc[0])+":"+str(te_['month'].iloc[0]))
    
    tr_.drop('season',axis=1,inplace=True)
    te_.drop('season',axis=1,inplace=True)


    clf_casual = ExtraTreesRegressor(n_estimators = 100)
    clf_casual.fit(tr_.drop(['casual','registered','count'],axis=1),np.log(tr_.casual+1))  
    output_casual = np.exp(clf_casual.predict(te_))-1

    clf_registered =  ExtraTreesRegressor(n_estimators = 100)
    clf_registered.fit(tr_.drop(['casual','registered','count'],axis=1),np.log(tr_.registered+1))  
    output_registered = np.exp(clf_registered.predict(te_))-1

    clf_count  = ExtraTreesRegressor(n_estimators = 100)
    clf_count.fit(tr_.drop(['casual','registered','count'],axis=1),np.log(tr_['count']+1))  
    output_count = np.exp(clf_count.predict(te_))-1
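    # Each model is trained on log(y + 1) and its predictions are mapped back
    # with exp(pred) - 1; this compresses the heavy right tail of the count
    # targets and effectively optimises error on the RMSLE scale.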

    out = ((output_casual + output_registered)+output_count)/2
    #out = output_count
    #out = (output_casual + output_registered)
    #print(out.astype(int))
    output.extend(out.astype(int))
    #print(str(out.astype(int).shape[0]))
Ejemplo n.º 54
0
def Modeller(X_train, X_test, Y_train, Y_test, dt_, params, epochs):
    #
    #required by LBGM
    train_data = lgb.Dataset(X_train, Y_train)
    valid_data = lgb.Dataset(X_test, Y_test)

    if X_train.shape[0] < params['min_child_samples'] // 2 or X_train.shape[
            0] > params['min_child_samples'] // 3:
        params['min_child_samples'] //= 100
        params['n_estimators'] //= 1
    elif X_train.shape[0] < params['min_child_samples'] // 4 or X_train.shape[
            0] > params['min_child_samples'] // 5:
        params['min_child_samples'] //= 400
        params['n_estimators'] //= 4
    elif X_train.shape[0] < params['min_child_samples'] // 5 or X_train.shape[
            0] > params['min_child_samples'] // 6:
        params['min_child_samples'] //= 400
        params['n_estimators'] //= 5
    elif X_train.shape[0] < params['min_child_samples'] // 7 or X_train.shape[
            0] > params['min_child_samples'] // 8:
        params['min_child_samples'] //= 400
        params['n_estimators'] //= 5
    elif X_train.shape[0] < params['min_child_samples'] // 8 or X_train.shape[
            0] > params['min_child_samples'] // 9:
        params['min_child_samples'] //= 400
        params['n_estimators'] //= 6
    elif X_train.shape[0] < params['min_child_samples'] // 10 or X_train.shape[
            0] > params['min_child_samples'] // 11:
        params['min_child_samples'] //= 400
        params['n_estimators'] //= 6
    elif X_train.shape[0] < params['min_child_samples'] // 12 or X_train.shape[
            0] > params['min_child_samples'] // 13:
        params['min_child_samples'] //= 400
        params['n_estimators'] //= 6
    elif X_train.shape[0] < params['min_child_samples'] // 14 or X_train.shape[
            0] > params['min_child_samples'] // 14:
        params['min_child_samples'] //= 400
        params['n_estimators'] //= 6
    elif X_train.shape[0] < params['min_child_samples'] // 15 or X_train.shape[
            0] > params['min_child_samples'] // 16:
        params['min_child_samples'] //= 400
        params['n_estimators'] //= 6
    elif X_train.shape[0] < params['min_child_samples'] // 17:
        params['min_child_samples'] //= 400
        params['n_estimators'] //= 6
    else:
        # leave min_child_samples and n_estimators unchanged
        pass

    Regress1 = RandomForestRegressor(max_depth=params['max_depth'],
                                     random_state=params['random_state'],
                                     n_estimators=params['n_estimators'])

    Regress2 = GradientBoostingRegressor(learning_rate=params['learning_rate'],
                                         loss=params['loss'],
                                         n_estimators=params['n_estimators'])

    Regress3 = ExtraTreesRegressor(max_depth=params['max_depth'],
                                   random_state=params['random_state'],
                                   n_estimators=params['n_estimators'])

    Regress4 = XGBRegressor(max_depth=params['max_depth'],
                            n_estimators=params['n_estimators'],
                            min_child_weight=params['min_child_weight'],
                            colsample_bytree=params['colsample_bytree'],
                            subsample=params['subsample'],
                            eta=params['eta'],
                            seed=params['seed'])

    Regress1.fit(X_train, Y_train)
    Regress2.fit(X_train, Y_train)
    Regress3.fit(X_train, Y_train)
    Regress4.fit(X_train, Y_train, eval_metric="rmse")

    print('Parameter value: {}\nN_estimators:{}'.format(
        params['min_child_samples'], params['n_estimators']))

    Regress5 = lgb.train(params,
                         train_data,
                         valid_sets=[train_data, valid_data],
                         num_boost_round=2500)

    Predic_ = Regress1.predict(X_test)
    Predic_2 = Regress2.predict(X_test)
    Predic_3 = Regress3.predict(X_test)
    Predic_4 = Regress4.predict(X_test)
    Predic_5 = Regress5.predict(X_test)
    Predic_6 = [x[0] for x in RNN(forecast_window, epochs)]

    forcast_date = pd.DataFrame({
        'timestamp': dt_,
        'RandForest_{}_Projection'.format(price): Predic_,
        'GradBoost_{}_Projection'.format(price): Predic_2,
        'ExtraTrees_{}_Projection'.format(price): Predic_3,
        'XGB_{}_Projection'.format(price): Predic_4,
        'LGB_{}_Projection'.format(price): Predic_5,
        'RNN_{}_Projection'.format(price): Predic_6
    })

    forcast_date['Average_{}_Projection'.format(price)] = forcast_date.mean(
        axis=1)
    forcast_date.set_index('timestamp', inplace=True)
    return forcast_date
Ejemplo n.º 55
0
# Print the feature ranking
print "Extra Tree Feature ranking:"
for f in xrange(12):
    if indices[f] < 8:
        print "%d. %s (%f)" % (f + 1, feature_list[indices[f]],xt_importances[indices[f]])
    else:
        print "%d. feature %d (%f)" % (f + 1, indices[f], xt_importances[indices[f]])

with open('xt_all.pkl', 'wb') as f:
    cPickle.dump(clf_xt, f)

with open('xt_all.pkl', 'rb') as f:
    clf_xt = cPickle.load(f)
#joblib.dump(clf_xt, 'xt.pkl', compress=9)
#clf_xt = joblib.load('xt.pkl')
abs_err = np.abs(bp_test - clf_xt.predict(data_test))
t1 = time.time() - t0
print "xtr sbp mean: %.2f (sd: %.2f)" % (np.mean(abs_err[:, 0]), np.std(abs_err[:, 0])),
print "xtr dbp mean: %.2f (sd: %.2f)" % (np.mean(abs_err[:, 1]), np.std(abs_err[:, 1])), "took", round(t1, 2), "sec"
scores = cross_val_score(clf_xt, data_train, bp_train)
print "xv scores", scores.mean()
print "explained_variance_score (sbp)", explained_variance_score(bp_test[:, 0], clf_xt.predict(data_test)[:, 0])
print "explained_variance_score (dbp)", explained_variance_score(bp_test[:, 1], clf_xt.predict(data_test)[:, 1])
print "r2_score (sbp)", r2_score(bp_test[:, 0], clf_xt.predict(data_test)[:, 0])
print "r2_score (dbp)", r2_score(bp_test[:, 1], clf_xt.predict(data_test)[:, 1])
print "----"
'''
## svr, need some fine tuning
t0 = time.time()
clf_svr = SVR(C=1.0, epsilon=0.4)
clf_svr.fit(X=data_train, y=bp_train[:, 0])
Ejemplo n.º 56
0
def hyperopt_obj(param, feat_folder, feat_name, trial_counter):

    kappa_cv = np.zeros((config.n_runs, config.n_folds), dtype=float)
    for run in range(1, config.n_runs + 1):
        for fold in range(1, config.n_folds + 1):
            rng = np.random.RandomState(2015 + 1000 * run + 10 * fold)

            #### all the paths
            path = "%s/Run%d/Fold%d" % (feat_folder, run, fold)
            save_path = "%s/Run%d/Fold%d" % (output_path, run, fold)
            if not os.path.exists(save_path):
                os.makedirs(save_path)
            # feat
            feat_train_path = "%s/train.feat" % path
            feat_valid_path = "%s/valid.feat" % path
            # weight
            weight_train_path = "%s/train.feat.weight" % path
            weight_valid_path = "%s/valid.feat.weight" % path
            # info
            info_train_path = "%s/train.info" % path
            info_valid_path = "%s/valid.info" % path
            # cdf
            cdf_valid_path = "%s/valid.cdf" % path
            # raw prediction path (rank)
            raw_pred_valid_path = "%s/valid.raw.pred.%s_[Id@%d].csv" % (
                save_path, feat_name, trial_counter)
            rank_pred_valid_path = "%s/valid.pred.%s_[Id@%d].csv" % (
                save_path, feat_name, trial_counter)

            ## load feat
            X_train, labels_train = load_svmlight_file(feat_train_path)
            X_valid, labels_valid = load_svmlight_file(feat_valid_path)
            if X_valid.shape[1] < X_train.shape[1]:
                X_valid = hstack([
                    X_valid,
                    np.zeros((X_valid.shape[0],
                              X_train.shape[1] - X_valid.shape[1]))
                ])
            elif X_valid.shape[1] > X_train.shape[1]:
                X_train = hstack([
                    X_train,
                    np.zeros((X_train.shape[0],
                              X_valid.shape[1] - X_train.shape[1]))
                ])
            # ??? why augment the features here?
            X_train = X_train.tocsr()
            X_valid = X_valid.tocsr()
            ## load weight
            weight_train = np.loadtxt(weight_train_path, dtype=float)
            weight_valid = np.loadtxt(weight_valid_path, dtype=float)

            ## load valid info
            info_train = pd.read_csv(info_train_path)
            numTrain = info_train.shape[0]
            info_valid = pd.read_csv(info_valid_path)
            numValid = info_valid.shape[0]
            Y_valid = info_valid["median_relevance"]
            ## load cdf
            cdf_valid = np.loadtxt(cdf_valid_path, dtype=float)

            ## make evalerror func
            evalerror_regrank_valid = lambda preds, dtrain: evalerror_regrank_cdf(
                preds, dtrain, cdf_valid)
            evalerror_softmax_valid = lambda preds, dtrain: evalerror_softmax_cdf(
                preds, dtrain, cdf_valid)
            evalerror_softkappa_valid = lambda preds, dtrain: evalerror_softkappa_cdf(
                preds, dtrain, cdf_valid)
            evalerror_ebc_valid = lambda preds, dtrain: evalerror_ebc_cdf(
                preds, dtrain, cdf_valid, ebc_hard_threshold)
            evalerror_cocr_valid = lambda preds, dtrain: evalerror_cocr_cdf(
                preds, dtrain, cdf_valid)

            ##############
            ## Training ##
            ##############
            ## you can use bagging to stabilize the predictions
            preds_bagging = np.zeros((numValid, bagging_size), dtype=float)
            for n in range(bagging_size):
                if bootstrap_replacement:
                    sampleSize = int(numTrain * bootstrap_ratio)
                    index_base = rng.randint(numTrain, size=sampleSize)
                    index_meta = [
                        i for i in range(numTrain) if i not in index_base
                    ]
                else:
                    randnum = rng.uniform(size=numTrain)
                    index_base = [
                        i for i in range(numTrain)
                        if randnum[i] < bootstrap_ratio
                    ]
                    index_meta = [
                        i for i in range(numTrain)
                        if randnum[i] >= bootstrap_ratio
                    ]
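                # index_base holds the rows used to train this bagging round;
                # index_meta holds the left-out (out-of-bag) rows.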

                if "booster" in param:
                    dvalid_base = xgb.DMatrix(X_valid,
                                              label=labels_valid,
                                              weight=weight_valid)
                    dtrain_base = xgb.DMatrix(X_train[index_base],
                                              label=labels_train[index_base],
                                              weight=weight_train[index_base])

                    watchlist = []
                    if verbose_level >= 2:
                        watchlist = [(dtrain_base, 'train'),
                                     (dvalid_base, 'valid')]

                ## various models
                if param["task"] in ["regression", "ranking"]:
                    ## regression & pairwise ranking with xgboost
                    bst = xgb.train(param,
                                    dtrain_base,
                                    param['num_round'],
                                    watchlist,
                                    feval=evalerror_regrank_valid)
                    pred = bst.predict(dvalid_base)

                elif param["task"] in ["softmax"]:
                    ## softmax regression with xgboost
                    bst = xgb.train(param,
                                    dtrain_base,
                                    param['num_round'],
                                    watchlist,
                                    feval=evalerror_softmax_valid)
                    pred = bst.predict(dvalid_base)
                    w = np.asarray(range(1, numOfClass + 1))
                    pred = pred * w[np.newaxis, :]
                    pred = np.sum(pred, axis=1)

                elif param["task"] in ["softkappa"]:
                    ## softkappa with xgboost
                    obj = lambda preds, dtrain: softkappaObj(
                        preds, dtrain, hess_scale=param['hess_scale'])
                    bst = xgb.train(param,
                                    dtrain_base,
                                    param['num_round'],
                                    watchlist,
                                    obj=obj,
                                    feval=evalerror_softkappa_valid)
                    pred = softmax(bst.predict(dvalid_base))
                    w = np.asarray(range(1, numOfClass + 1))
                    pred = pred * w[np.newaxis, :]
                    pred = np.sum(pred, axis=1)

                elif param["task"] in ["ebc"]:
                    ## ebc with xgboost
                    obj = lambda preds, dtrain: ebcObj(preds, dtrain)
                    bst = xgb.train(param,
                                    dtrain_base,
                                    param['num_round'],
                                    watchlist,
                                    obj=obj,
                                    feval=evalerror_ebc_valid)
                    pred = sigmoid(bst.predict(dvalid_base))
                    pred = applyEBCRule(pred,
                                        hard_threshold=ebc_hard_threshold)

                elif param["task"] in ["cocr"]:
                    ## cocr with xgboost
                    obj = lambda preds, dtrain: cocrObj(preds, dtrain)
                    bst = xgb.train(param,
                                    dtrain_base,
                                    param['num_round'],
                                    watchlist,
                                    obj=obj,
                                    feval=evalerror_cocr_valid)
                    pred = bst.predict(dvalid_base)
                    pred = applyCOCRRule(pred)

                elif param['task'] == "reg_skl_rf":
                    ## regression with sklearn random forest regressor
                    rf = RandomForestRegressor(
                        n_estimators=param['n_estimators'],
                        max_features=param['max_features'],
                        n_jobs=param['n_jobs'],
                        random_state=param['random_state'])
                    rf.fit(X_train[index_base],
                           labels_train[index_base] + 1,
                           sample_weight=weight_train[index_base])
                    pred = rf.predict(X_valid)

                elif param['task'] == "reg_skl_etr":
                    ## regression with sklearn extra trees regressor
                    etr = ExtraTreesRegressor(
                        n_estimators=param['n_estimators'],
                        max_features=param['max_features'],
                        n_jobs=param['n_jobs'],
                        random_state=param['random_state'])
                    etr.fit(X_train[index_base],
                            labels_train[index_base] + 1,
                            sample_weight=weight_train[index_base])
                    pred = etr.predict(X_valid)
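                    # labels_train is zero-based (0-3); the +1 shifts the fit
                    # onto the 1-4 relevance scale (the libfm branch below
                    # instead adds 1 to its predictions afterwards).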

                elif param['task'] == "reg_skl_gbm":
                    ## regression with sklearn gradient boosting regressor
                    gbm = GradientBoostingRegressor(
                        n_estimators=param['n_estimators'],
                        max_features=param['max_features'],
                        learning_rate=param['learning_rate'],
                        max_depth=param['max_depth'],
                        subsample=param['subsample'],
                        random_state=param['random_state'])
                    gbm.fit(X_train.toarray()[index_base],
                            labels_train[index_base] + 1,
                            sample_weight=weight_train[index_base])
                    pred = gbm.predict(X_valid.toarray())

                elif param['task'] == "clf_skl_lr":
                    ## classification with sklearn logistic regression
                    lr = LogisticRegression(penalty="l2",
                                            dual=True,
                                            tol=1e-5,
                                            C=param['C'],
                                            fit_intercept=True,
                                            intercept_scaling=1.0,
                                            class_weight='auto',
                                            random_state=param['random_state'])
                    lr.fit(X_train[index_base], labels_train[index_base] + 1)
                    pred = lr.predict_proba(X_valid)
                    w = np.asarray(range(1, numOfClass + 1))
                    pred = pred * w[np.newaxis, :]
                    pred = np.sum(pred, axis=1)

                elif param['task'] == "reg_skl_svr":
                    ## regression with sklearn support vector regression
                    X_train, X_valid = X_train.toarray(), X_valid.toarray()
                    scaler = StandardScaler()
                    X_train[index_base] = scaler.fit_transform(
                        X_train[index_base])
                    X_valid = scaler.transform(X_valid)
                    svr = SVR(C=param['C'],
                              gamma=param['gamma'],
                              epsilon=param['epsilon'],
                              degree=param['degree'],
                              kernel=param['kernel'])
                    svr.fit(X_train[index_base],
                            labels_train[index_base] + 1,
                            sample_weight=weight_train[index_base])
                    pred = svr.predict(X_valid)

                elif param['task'] == "reg_skl_ridge":
                    ## regression with sklearn ridge regression
                    ridge = Ridge(alpha=param["alpha"], normalize=True)
                    ridge.fit(X_train[index_base],
                              labels_train[index_base] + 1,
                              sample_weight=weight_train[index_base])
                    pred = ridge.predict(X_valid)

                elif param['task'] == "reg_skl_lasso":
                    ## regression with sklearn lasso
                    lasso = Lasso(alpha=param["alpha"], normalize=True)
                    lasso.fit(X_train[index_base],
                              labels_train[index_base] + 1)
                    pred = lasso.predict(X_valid)

                elif param['task'] == 'reg_libfm':
                    ## regression with factorization machine (libfm)
                    ## to array
                    X_train = X_train.toarray()
                    X_valid = X_valid.toarray()

                    ## scale
                    scaler = StandardScaler()
                    X_train[index_base] = scaler.fit_transform(
                        X_train[index_base])
                    X_valid = scaler.transform(X_valid)

                    ## dump feat
                    dump_svmlight_file(X_train[index_base],
                                       labels_train[index_base],
                                       feat_train_path + ".tmp")
                    dump_svmlight_file(X_valid, labels_valid,
                                       feat_valid_path + ".tmp")

                    ## train fm
                    cmd = "%s -task r -train %s -test %s -out %s -dim '1,1,%d' -iter %d > libfm.log" % ( \
                                libfm_exe, feat_train_path+".tmp", feat_valid_path+".tmp", raw_pred_valid_path, \
                                param['dim'], param['iter'])
                    os.system(cmd)
                    os.remove(feat_train_path + ".tmp")
                    os.remove(feat_valid_path + ".tmp")

                    ## extract libfm prediction
                    pred = np.loadtxt(raw_pred_valid_path, dtype=float)
                    ## labels are in [0,1,2,3]
                    pred += 1

                elif param['task'] == "reg_keras_dnn":
                    ## regression with keras' deep neural networks
                    model = Sequential()
                    ## input layer
                    model.add(Dropout(param["input_dropout"]))
                    ## hidden layers
                    first = True
                    hidden_layers = param['hidden_layers']
                    while hidden_layers > 0:
                        if first:
                            dim = X_train.shape[1]
                            first = False
                        else:
                            dim = param["hidden_units"]
                        model.add(
                            Dense(dim,
                                  param["hidden_units"],
                                  init='glorot_uniform'))
                        if param["batch_norm"]:
                            model.add(
                                BatchNormalization((param["hidden_units"], )))
                        if param["hidden_activation"] == "prelu":
                            model.add(PReLU((param["hidden_units"], )))
                        else:
                            model.add(Activation(param['hidden_activation']))
                        model.add(Dropout(param["hidden_dropout"]))
                        hidden_layers -= 1

                    ## output layer
                    model.add(
                        Dense(param["hidden_units"], 1, init='glorot_uniform'))
                    model.add(Activation('linear'))

                    ## loss
                    model.compile(loss='mean_squared_error', optimizer="adam")

                    ## to array
                    X_train = X_train.toarray()
                    X_valid = X_valid.toarray()

                    ## scale
                    scaler = StandardScaler()
                    X_train[index_base] = scaler.fit_transform(
                        X_train[index_base])
                    X_valid = scaler.transform(X_valid)

                    ## train
                    model.fit(X_train[index_base],
                              labels_train[index_base] + 1,
                              nb_epoch=param['nb_epoch'],
                              batch_size=param['batch_size'],
                              validation_split=0,
                              verbose=0)

                    ##prediction
                    pred = model.predict(X_valid, verbose=0)
                    pred.shape = (X_valid.shape[0], )

                elif param['task'] == "reg_rgf":
                    ## regression with regularized greedy forest (rgf)
                    ## to array
                    X_train, X_valid = X_train.toarray(), X_valid.toarray()

                    train_x_fn = feat_train_path + ".x"
                    train_y_fn = feat_train_path + ".y"
                    valid_x_fn = feat_valid_path + ".x"
                    valid_pred_fn = feat_valid_path + ".pred"

                    model_fn_prefix = "rgf_model"

                    np.savetxt(train_x_fn,
                               X_train[index_base],
                               fmt="%.6f",
                               delimiter='\t')
                    np.savetxt(train_y_fn,
                               labels_train[index_base],
                               fmt="%d",
                               delimiter='\t')
                    np.savetxt(valid_x_fn, X_valid, fmt="%.6f", delimiter='\t')
                    # np.savetxt(valid_y_fn, labels_valid, fmt="%d", delimiter='\t')

                    pars = [
                        "train_x_fn=",
                        train_x_fn,
                        "\n",
                        "train_y_fn=",
                        train_y_fn,
                        "\n",
                        #"train_w_fn=",weight_train_path,"\n",
                        "model_fn_prefix=",
                        model_fn_prefix,
                        "\n",
                        "reg_L2=",
                        param['reg_L2'],
                        "\n",
                        #"reg_depth=", 1.01, "\n",
                        "algorithm=",
                        "RGF",
                        "\n",
                        "loss=",
                        "LS",
                        "\n",
                        #"opt_interval=", 100, "\n",
                        "valid_interval=",
                        param['max_leaf_forest'],
                        "\n",
                        "max_leaf_forest=",
                        param['max_leaf_forest'],
                        "\n",
                        "num_iteration_opt=",
                        param['num_iteration_opt'],
                        "\n",
                        "num_tree_search=",
                        param['num_tree_search'],
                        "\n",
                        "min_pop=",
                        param['min_pop'],
                        "\n",
                        "opt_interval=",
                        param['opt_interval'],
                        "\n",
                        "opt_stepsize=",
                        param['opt_stepsize'],
                        "\n",
                        "NormalizeTarget"
                    ]
                    pars = "".join([str(p) for p in pars])

                    rfg_setting_train = "./rfg_setting_train"
                    with open(rfg_setting_train + ".inp", "wb") as f:
                        f.write(pars)

                    ## train rgf
                    cmd = "perl %s %s train %s >> rgf.log" % (
                        call_exe, rgf_exe, rfg_setting_train)
                    #print cmd
                    os.system(cmd)

                    model_fn = model_fn_prefix + "-01"
                    pars = [
                        "test_x_fn=", valid_x_fn, "\n", "model_fn=", model_fn,
                        "\n", "prediction_fn=", valid_pred_fn
                    ]

                    pars = "".join([str(p) for p in pars])

                    rfg_setting_valid = "./rfg_setting_valid"
                    with open(rfg_setting_valid + ".inp", "wb") as f:
                        f.write(pars)
                    cmd = "perl %s %s predict %s >> rgf.log" % (
                        call_exe, rgf_exe, rfg_setting_valid)
                    #print cmd
                    os.system(cmd)

                    pred = np.loadtxt(valid_pred_fn, dtype=float)

                ## weighted averaging over different models
                pred_valid = pred
                ## this bagging iteration
                preds_bagging[:, n] = pred_valid
                pred_raw = np.mean(preds_bagging[:, :(n + 1)], axis=1)
                # why do we need to average over different bagging samples?
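                # argsort().argsort() converts the averaged predictions to ranks;
                # getScore presumably uses the cdf to cut these ranks into discrete scores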
                pred_rank = pred_raw.argsort().argsort()
                pred_score, cutoff = getScore(pred_rank, cdf_valid, valid=True)
                kappa_valid = quadratic_weighted_kappa(pred_score, Y_valid)
                if (n + 1) != bagging_size:
                    print(
                        "              {:>3}   {:>3}   {:>3}   {:>6}   {} x {}"
                        .format(run, fold, n + 1, np.round(kappa_valid, 6),
                                X_train.shape[0], X_train.shape[1]))
                else:
                    print(
                        "                    {:>3}       {:>3}      {:>3}    {:>8}  {} x {}"
                        .format(run, fold, n + 1, np.round(kappa_valid, 6),
                                X_train.shape[0], X_train.shape[1]))
            kappa_cv[run - 1, fold - 1] = kappa_valid
            ## save this prediction
            dfPred = pd.DataFrame({"target": Y_valid, "prediction": pred_raw})
            dfPred.to_csv(raw_pred_valid_path,
                          index=False,
                          header=True,
                          columns=["target", "prediction"])
            ## save this prediction
            dfPred = pd.DataFrame({"target": Y_valid, "prediction": pred_rank})
            dfPred.to_csv(rank_pred_valid_path,
                          index=False,
                          header=True,
                          columns=["target", "prediction"])

    kappa_cv_mean = np.mean(kappa_cv)
    kappa_cv_std = np.std(kappa_cv)
    if verbose_level >= 1:
        print("              Mean: %.6f" % kappa_cv_mean)
        print("              Std: %.6f" % kappa_cv_std)

    ####################
    #### Retraining ####
    ####################
    #### all the path
    path = "%s/All" % (feat_folder)
    save_path = "%s/All" % output_path
    subm_path = "%s/Subm" % output_path
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    if not os.path.exists(subm_path):
        os.makedirs(subm_path)
    # feat
    feat_train_path = "%s/train.feat" % path
    feat_test_path = "%s/test.feat" % path
    # weight
    weight_train_path = "%s/train.feat.weight" % path
    # info
    info_train_path = "%s/train.info" % path
    info_test_path = "%s/test.info" % path
    # cdf
    cdf_test_path = "%s/test.cdf" % path
    # raw and rank prediction paths
    raw_pred_test_path = "%s/test.raw.pred.%s_[Id@%d].csv" % (
        save_path, feat_name, trial_counter)
    rank_pred_test_path = "%s/test.pred.%s_[Id@%d].csv" % (
        save_path, feat_name, trial_counter)
    # submission path (relevance as in [1,2,3,4])
    subm_path = "%s/test.pred.%s_[Id@%d]_[Mean%.6f]_[Std%.6f].csv" % (
        subm_path, feat_name, trial_counter, kappa_cv_mean, kappa_cv_std)

    #### load data
    ## load feat
    X_train, labels_train = load_svmlight_file(feat_train_path)
    X_test, labels_test = load_svmlight_file(feat_test_path)
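    ## the svmlight files can differ in column count (trailing all-zero features
    ## are dropped), so pad the narrower matrix with zeros to align dimensions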
    if X_test.shape[1] < X_train.shape[1]:
        X_test = hstack([
            X_test,
            np.zeros((X_test.shape[0], X_train.shape[1] - X_test.shape[1]))
        ])
    elif X_test.shape[1] > X_train.shape[1]:
        X_train = hstack([
            X_train,
            np.zeros((X_train.shape[0], X_test.shape[1] - X_train.shape[1]))
        ])
    X_train = X_train.tocsr()
    X_test = X_test.tocsr()
    ## load train weight
    weight_train = np.loadtxt(weight_train_path, dtype=float)
    ## load test info
    info_train = pd.read_csv(info_train_path)
    numTrain = info_train.shape[0]
    info_test = pd.read_csv(info_test_path)
    numTest = info_test.shape[0]
    id_test = info_test["id"]

    ## load cdf
    cdf_test = np.loadtxt(cdf_test_path, dtype=float)
    ##
    evalerror_regrank_test = lambda preds, dtrain: evalerror_regrank_cdf(
        preds, dtrain, cdf_test)
    evalerror_softmax_test = lambda preds, dtrain: evalerror_softmax_cdf(
        preds, dtrain, cdf_test)
    evalerror_softkappa_test = lambda preds, dtrain: evalerror_softkappa_cdf(
        preds, dtrain, cdf_test)
    evalerror_ebc_test = lambda preds, dtrain: evalerror_ebc_cdf(
        preds, dtrain, cdf_test, ebc_hard_threshold)
    evalerror_cocr_test = lambda preds, dtrain: evalerror_cocr_cdf(
        preds, dtrain, cdf_test)

    ## bagging
    preds_bagging = np.zeros((numTest, bagging_size), dtype=float)
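    ## each bagging round draws a base sample (with or without replacement);
    ## index_meta holds the rows left out of that sample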
    for n in range(bagging_size):
        if bootstrap_replacement:
            sampleSize = int(numTrain * bootstrap_ratio)
            #index_meta = rng.randint(numTrain, size=sampleSize)
            #index_base = [i for i in range(numTrain) if i not in index_meta]
            index_base = rng.randint(numTrain, size=sampleSize)
            index_meta = [i for i in range(numTrain) if i not in index_base]
        else:
            randnum = rng.uniform(size=numTrain)
            index_base = [
                i for i in range(numTrain) if randnum[i] < bootstrap_ratio
            ]
            index_meta = [
                i for i in range(numTrain) if randnum[i] >= bootstrap_ratio
            ]

        if param.has_key("booster"):
            dtest = xgb.DMatrix(X_test, label=labels_test)
            dtrain = xgb.DMatrix(X_train[index_base],
                                 label=labels_train[index_base],
                                 weight=weight_train[index_base])

            watchlist = []
            if verbose_level >= 2:
                watchlist = [(dtrain, 'train')]

        ## train
        if param["task"] in ["regression", "ranking"]:
            bst = xgb.train(param,
                            dtrain,
                            param['num_round'],
                            watchlist,
                            feval=evalerror_regrank_test)
            pred = bst.predict(dtest)

        elif param["task"] in ["softmax"]:
            bst = xgb.train(param,
                            dtrain,
                            param['num_round'],
                            watchlist,
                            feval=evalerror_softmax_test)
            pred = bst.predict(dtest)
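            ## convert the per-class probabilities into an expected rating:
            ## sum_k k * P(class = k), with classes labelled 1..numOfClass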
            w = np.asarray(range(1, numOfClass + 1))
            pred = pred * w[np.newaxis, :]
            pred = np.sum(pred, axis=1)

        elif param["task"] in ["softkappa"]:
            obj = lambda preds, dtrain: softkappaObj(
                preds, dtrain, hess_scale=param['hess_scale'])
            bst = xgb.train(param,
                            dtrain,
                            param['num_round'],
                            watchlist,
                            obj=obj,
                            feval=evalerror_softkappa_test)
            pred = softmax(bst.predict(dtest))
            w = np.asarray(range(1, numOfClass + 1))
            pred = pred * w[np.newaxis, :]
            pred = np.sum(pred, axis=1)

        elif param["task"] in ["ebc"]:
            obj = lambda preds, dtrain: ebcObj(preds, dtrain)
            bst = xgb.train(param,
                            dtrain,
                            param['num_round'],
                            watchlist,
                            obj=obj,
                            feval=evalerror_ebc_test)
            pred = sigmoid(bst.predict(dtest))
            pred = applyEBCRule(pred, hard_threshold=ebc_hard_threshold)

        elif param["task"] in ["cocr"]:
            obj = lambda preds, dtrain: cocrObj(preds, dtrain)
            bst = xgb.train(param,
                            dtrain,
                            param['num_round'],
                            watchlist,
                            obj=obj,
                            feval=evalerror_cocr_test)
            pred = bst.predict(dtest)
            pred = applyCOCRRule(pred)

        elif param['task'] == "reg_skl_rf":
            ## random forest regressor
            rf = RandomForestRegressor(n_estimators=param['n_estimators'],
                                       max_features=param['max_features'],
                                       n_jobs=param['n_jobs'],
                                       random_state=param['random_state'])
            rf.fit(X_train[index_base],
                   labels_train[index_base] + 1,
                   sample_weight=weight_train[index_base])
            pred = rf.predict(X_test)

        elif param['task'] == "reg_skl_etr":
            ## extra trees regressor
            etr = ExtraTreesRegressor(n_estimators=param['n_estimators'],
                                      max_features=param['max_features'],
                                      n_jobs=param['n_jobs'],
                                      random_state=param['random_state'])
            etr.fit(X_train[index_base],
                    labels_train[index_base] + 1,
                    sample_weight=weight_train[index_base])
            pred = etr.predict(X_test)

        elif param['task'] == "reg_skl_gbm":
            ## gradient boosting regressor
            gbm = GradientBoostingRegressor(
                n_estimators=param['n_estimators'],
                max_features=param['max_features'],
                learning_rate=param['learning_rate'],
                max_depth=param['max_depth'],
                subsample=param['subsample'],
                random_state=param['random_state'])
            gbm.fit(X_train.toarray()[index_base],
                    labels_train[index_base] + 1,
                    sample_weight=weight_train[index_base])
            pred = gbm.predict(X_test.toarray())

        elif param['task'] == "clf_skl_lr":
            lr = LogisticRegression(penalty="l2",
                                    dual=True,
                                    tol=1e-5,
                                    C=param['C'],
                                    fit_intercept=True,
                                    intercept_scaling=1.0,
                                    class_weight='auto',
                                    random_state=param['random_state'])
            lr.fit(X_train[index_base], labels_train[index_base] + 1)
            pred = lr.predict_proba(X_test)
            w = np.asarray(range(1, numOfClass + 1))
            pred = pred * w[np.newaxis, :]
            pred = np.sum(pred, axis=1)

        elif param['task'] == "reg_skl_svr":
            ## regression with sklearn support vector regression
            X_train, X_test = X_train.toarray(), X_test.toarray()
            scaler = StandardScaler()
            X_train[index_base] = scaler.fit_transform(X_train[index_base])
            X_test = scaler.transform(X_test)
            svr = SVR(C=param['C'],
                      gamma=param['gamma'],
                      epsilon=param['epsilon'],
                      degree=param['degree'],
                      kernel=param['kernel'])
            svr.fit(X_train[index_base],
                    labels_train[index_base] + 1,
                    sample_weight=weight_train[index_base])
            pred = svr.predict(X_test)

        elif param['task'] == "reg_skl_ridge":
            ridge = Ridge(alpha=param["alpha"], normalize=True)
            ridge.fit(X_train[index_base],
                      labels_train[index_base] + 1,
                      sample_weight=weight_train[index_base])
            pred = ridge.predict(X_test)

        elif param['task'] == "reg_skl_lasso":
            lasso = Lasso(alpha=param["alpha"], normalize=True)
            lasso.fit(X_train[index_base], labels_train[index_base] + 1)
            pred = lasso.predict(X_test)

        elif param['task'] == 'reg_libfm':
            ## to array
            X_train, X_test = X_train.toarray(), X_test.toarray()

            ## scale
            scaler = StandardScaler()
            X_train[index_base] = scaler.fit_transform(X_train[index_base])
            X_test = scaler.transform(X_test)

            ## dump feat
            dump_svmlight_file(X_train[index_base], labels_train[index_base],
                               feat_train_path + ".tmp")
            dump_svmlight_file(X_test, labels_test, feat_test_path + ".tmp")

            ## train fm
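            ## libFM's -dim 'k0,k1,k2' flag sets: use bias (k0), use 1-way
            ## interactions (k1), and k2 latent factors for pairwise interactions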
            cmd = "%s -task r -train %s -test %s -out %s -dim '1,1,%d' -iter %d > libfm.log" % ( \
                        libfm_exe, feat_train_path+".tmp", feat_test_path+".tmp", raw_pred_test_path, \
                        param['dim'], param['iter'])
            os.system(cmd)
            os.remove(feat_train_path + ".tmp")
            os.remove(feat_test_path + ".tmp")

            ## extract libfm prediction
            pred = np.loadtxt(raw_pred_test_path, dtype=float)
            ## labels are in [0,1,2,3]
            pred += 1

        elif param['task'] == "reg_keras_dnn":
            ## regression with keras deep neural networks
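            ## note: this uses the old Keras 0.x-era API (Dense(input_dim, output_dim),
            ## nb_epoch, BatchNormalization(shape)); newer Keras versions differ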
            model = Sequential()
            ## input layer
            model.add(Dropout(param["input_dropout"]))
            ## hidden layers
            first = True
            hidden_layers = param['hidden_layers']
            while hidden_layers > 0:
                if first:
                    dim = X_train.shape[1]
                    first = False
                else:
                    dim = param["hidden_units"]
                model.add(
                    Dense(dim, param["hidden_units"], init='glorot_uniform'))
                if param["batch_norm"]:
                    model.add(BatchNormalization((param["hidden_units"], )))
                if param["hidden_activation"] == "prelu":
                    model.add(PReLU((param["hidden_units"], )))
                else:
                    model.add(Activation(param['hidden_activation']))
                model.add(Dropout(param["hidden_dropout"]))
                hidden_layers -= 1

            ## output layer
            model.add(Dense(param["hidden_units"], 1, init='glorot_uniform'))
            model.add(Activation('linear'))

            ## loss
            model.compile(loss='mean_squared_error', optimizer="adam")

            ## to array
            X_train = X_train.toarray()
            X_test = X_test.toarray()

            ## scale
            scaler = StandardScaler()
            X_train[index_base] = scaler.fit_transform(X_train[index_base])
            X_test = scaler.transform(X_test)

            ## train
            model.fit(X_train[index_base],
                      labels_train[index_base] + 1,
                      nb_epoch=param['nb_epoch'],
                      batch_size=param['batch_size'],
                      verbose=0)

            ## prediction
            pred = model.predict(X_test, verbose=0)
            pred.shape = (X_test.shape[0], )

        elif param['task'] == "reg_rgf":
            ## to array
            X_train, X_test = X_train.toarray(), X_test.toarray()

            train_x_fn = feat_train_path + ".x"
            train_y_fn = feat_train_path + ".y"
            test_x_fn = feat_test_path + ".x"
            test_pred_fn = feat_test_path + ".pred"

            model_fn_prefix = "rgf_model"

            np.savetxt(train_x_fn,
                       X_train[index_base],
                       fmt="%.6f",
                       delimiter='\t')
            np.savetxt(train_y_fn,
                       labels_train[index_base],
                       fmt="%d",
                       delimiter='\t')
            np.savetxt(test_x_fn, X_test, fmt="%.6f", delimiter='\t')
            # np.savetxt(valid_y_fn, labels_valid, fmt="%d", delimiter='\t')

            pars = [
                "train_x_fn=",
                train_x_fn,
                "\n",
                "train_y_fn=",
                train_y_fn,
                "\n",
                #"train_w_fn=",weight_train_path,"\n",
                "model_fn_prefix=",
                model_fn_prefix,
                "\n",
                "reg_L2=",
                param['reg_L2'],
                "\n",
                #"reg_depth=", 1.01, "\n",
                "algorithm=",
                "RGF",
                "\n",
                "loss=",
                "LS",
                "\n",
                "test_interval=",
                param['max_leaf_forest'],
                "\n",
                "max_leaf_forest=",
                param['max_leaf_forest'],
                "\n",
                "num_iteration_opt=",
                param['num_iteration_opt'],
                "\n",
                "num_tree_search=",
                param['num_tree_search'],
                "\n",
                "min_pop=",
                param['min_pop'],
                "\n",
                "opt_interval=",
                param['opt_interval'],
                "\n",
                "opt_stepsize=",
                param['opt_stepsize'],
                "\n",
                "NormalizeTarget"
            ]
            pars = "".join([str(p) for p in pars])

            rfg_setting_train = "./rfg_setting_train"
            with open(rfg_setting_train + ".inp", "wb") as f:
                f.write(pars)

            ## train rgf
            cmd = "perl %s %s train %s >> rgf.log" % (call_exe, rgf_exe,
                                                      rfg_setting_train)
            #print cmd
            os.system(cmd)

            model_fn = model_fn_prefix + "-01"
            pars = [
                "test_x_fn=", test_x_fn, "\n", "model_fn=", model_fn, "\n",
                "prediction_fn=", test_pred_fn
            ]

            pars = "".join([str(p) for p in pars])

            rfg_setting_test = "./rfg_setting_test"
            with open(rfg_setting_test + ".inp", "wb") as f:
                f.write(pars)
            cmd = "perl %s %s predict %s >> rgf.log" % (call_exe, rgf_exe,
                                                        rfg_setting_test)
            #print cmd
            os.system(cmd)

            pred = np.loadtxt(test_pred_fn, dtype=float)

        ## weighted averaging over different models
        pred_test = pred
        preds_bagging[:, n] = pred_test
    pred_raw = np.mean(preds_bagging, axis=1)
    pred_rank = pred_raw.argsort().argsort()
    #
    ## write
    output = pd.DataFrame({"id": id_test, "prediction": pred_raw})
    output.to_csv(raw_pred_test_path, index=False)

    ## write
    output = pd.DataFrame({"id": id_test, "prediction": pred_rank})
    output.to_csv(rank_pred_test_path, index=False)

    ## write score
    pred_score = getScore(pred_rank, cdf_test)  # score the bagged rank, not just the last round's pred
    output = pd.DataFrame({"id": id_test, "prediction": pred_score})
    output.to_csv(subm_path, index=False)
    #"""

    return kappa_cv_mean, kappa_cv_std
Example No. 57
0
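# Sales were apparently modelled on a log1p scale, so predictions are inverted
# with np.exp(.) - 1 before writing the submission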
preds_RF_py = np.exp(clf_RF.predict(pte[feature_names]))-1
RF_py_sub = pd.DataFrame({'Id':ID.Id, 'Sales':preds_RF_py})
RF_py_sub.to_csv("F:/Kaggle/Rossman/Blends/Stacking/RF_subs.csv", index = False)

# Extremely Randomized Trees #
reg_ET = ExtraTreesRegressor(n_estimators = 1000,
                             max_features = 0.75,
                             max_depth = 8,
                             min_samples_split = 12,
                             n_jobs = -1,
                             random_state = 737,
                             verbose = 2)

reg_ET = reg_ET.fit(x_train, y_train)

preds_h = reg_ET.predict(pth[feature_names])
ET_holdout = pd.DataFrame({'Date': pth.Date,
                           'Dow': pth.DayOfWeek,
                           'Actual': np.exp(pth.Sales) - 1,
                           'Predicted': np.exp(preds_h) - 1})
ET_holdout.to_csv("F:/Kaggle/Rossman/Blends/Stacking/ET_holdout.csv", index = False)

preds_ET = np.exp(reg_ET.predict(pte[feature_names]))-1
ET_sub = pd.DataFrame({'Id':ID.Id, 'Sales':preds_ET})
ET_sub.to_csv("F:/Kaggle/Rossman/Blends/Stacking/ET_subs.csv", index = False)

# SVR with RBF kernel #
svr_rbf = SVR(kernel = 'rbf',
              C = 1e4,
              gamma = 0.05,
              epsilon = 0.03,
              max_iter = 10000)

svr_rbf = svr_rbf.fit(x_train, y_train)
Example No. 58
0
def train_for_atom(atom, dataset_path, pred_save_path):
    '''
    Function for training machine learning models for a single atom

    args:
        atom = the atom that the models are trained for (str)
        dataset_path = the path to which datasets can be found (expected to have three .csv files under the path, for train/validation/test)
        pred_save_path = the path for saving all the predictions for analysis
    '''
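    # Hypothetical usage (paths are placeholders):
    #   train_for_atom("CA", "datasets/", "predictions/")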
    print("  ======  Training model for:", atom, "under folder", dataset_path,
          "  ======  ")
    features, targets, metas = prep_data([
        dataset_path + "train_['%s'].csv" % atom,
        dataset_path + "val_['%s'].csv" % atom
    ],
                                         atom,
                                         "train",
                                         filter_outlier=True,
                                         notnull=True)
    features_test, targets_test, metas_test = prep_data(
        dataset_path + "test_['%s'].csv" % atom,
        atom,
        "test",
        filter_outlier=False,
        notnull=False)
    kf = KFold(n_splits=K)
    # Prepare per-fold parameters in a list and do out-of-fold training and testing on the training dataset across the K folds
    print("Training R0...")
    params = []
    for train_idx, test_idx in kf.split(range(len(features))):
        params.append([
            features.drop(["SHIFTY_" + atom, "MAX_IDENTITY", "AVG_IDENTITY"],
                          axis=1), targets, train_idx, test_idx
        ])
    pool = multiprocessing.Pool(processes=K)
    first_preds = pool.starmap(train_with_test, params)
    # first_preds=train_with_test(*params[0])

    # Combine results from the K parallel executions into a single list
    all_test_idx = []
    all_first_preds = []
    for i in range(K):
        all_test_idx.extend(params[i][-1])
        all_first_preds.extend(first_preds[i])
    first_preds = pd.Series(all_first_preds, index=all_test_idx)
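    # The out-of-fold predictions from R0 are stacked as an extra feature
    # ("FIRST_PRED") for the second-level models below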
    features["FIRST_PRED"] = first_preds
    evaluate(first_preds, targets, metas,
             pred_save_path + "first_pred_%s.csv" % atom)

    # Retrain the model on all training data
    model1 = ExtraTreesRegressor(bootstrap=False,
                                 max_features=0.3,
                                 min_samples_leaf=3,
                                 min_samples_split=15,
                                 n_estimators=500)
    model1.fit(
        features.drop(
            ["SHIFTY_" + atom, "MAX_IDENTITY", "AVG_IDENTITY", "FIRST_PRED"],
            axis=1), targets.values.ravel())

    # Write first predictions for the test dataset to the features of test
    features_test["FIRST_PRED"] = model1.predict(
        features_test.drop(["SHIFTY_" + atom, "MAX_IDENTITY", "AVG_IDENTITY"],
                           axis=1))

    # Save first level model (R0)
    if not DEBUG:
        joblib.dump(model1, "pipelines/%s_model1.sav" % atom)

    # Train and save second level model  (R1)
    print("Training second level model without SHIFTY++ with %d examples..." %
          len(features))
    model_2 = RandomForestRegressor(bootstrap=False,
                                    max_features=0.5,
                                    min_samples_leaf=7,
                                    min_samples_split=12,
                                    n_estimators=500)
    model_2.fit(
        features.drop(["SHIFTY_" + atom, "MAX_IDENTITY", "AVG_IDENTITY"],
                      axis=1), targets.values.ravel())
    pred_2 = model_2.predict(
        features_test.drop(["SHIFTY_" + atom, "MAX_IDENTITY", "AVG_IDENTITY"],
                           axis=1)).ravel()
    evaluate(pred_2, targets_test, metas_test,
             pred_save_path + "second_pred_%s_nosy.csv" % atom)
    if not DEBUG:
        joblib.dump(model_2, "pipelines/%s_model2_ny.sav" % atom)

    # Train and save second level model with SHIFTY++ predictions (R2)
    model_21 = RandomForestRegressor(bootstrap=False,
                                     max_features=0.5,
                                     min_samples_leaf=7,
                                     min_samples_split=12,
                                     n_estimators=500)
    not_null_idx = features["SHIFTY_" + atom].notnull()
    not_null_idx_test = features_test["SHIFTY_" + atom].notnull()

    print("Training second level model with SHIFTY++ with %d examples..." %
          np.sum(not_null_idx))
    model_21.fit(features[not_null_idx], targets[not_null_idx].values.ravel())
    pred_21 = pred_2.copy()
    pred_21[not_null_idx_test] = model_21.predict(
        features_test[not_null_idx_test])
    evaluate(pred_21, targets_test, metas_test,
             pred_save_path + "second_pred_%s_withsy.csv" % atom)
    if not DEBUG:
        joblib.dump(model_21, "pipelines/%s_model2_wy.sav" % atom)

    print("Finish for", atom)
def do_validation(data_path, steps=10):
    allfiles = initialize(data_path)
    gbm = GradientBoostingRegressor(n_estimators=100,
                                    learning_rate=0.05,
                                    max_depth=6,
                                    min_samples_leaf=5,
                                    subsample=0.5)
    ada = AdaBoostRegressor(n_estimators=200, learning_rate=1)
    etree = ExtraTreesRegressor(n_estimators=200,
                                n_jobs=-1,
                                min_samples_leaf=5)
    rf = RandomForestRegressor(n_estimators=200,
                               max_features=4,
                               min_samples_leaf=5)
    kn = KNeighborsRegressor(n_neighbors=25)
    logit = LogisticRegression(tol=0.05)
    enet = ElasticNetCV(l1_ratio=0.75, max_iter=1000, tol=0.05)
    svr = SVR(kernel="linear")  # SVR has no 'probability' parameter (that belongs to SVC)
    ridge = Ridge(alpha=18)
    bridge = BayesianRidge(n_iter=500)

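    # Each *_metrics accumulator sums per-driver AUC over the loop below;
    # averages (sum / steps) are printed at the end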
    gbm_metrics = 0.0
    ada_metrics = 0.0
    etree_metrics = 0.0
    rf_metrics = 0.0
    kn_metrics = 0.0
    logit_metrics = 0.0
    svr_metrics = 0.0
    ridge_metrics = 0.0
    bridge_metrics = 0.0
    enet_metrics = 0.0
    nnet_metrics = 0.0

    logistic = LogisticRegression()
    rbm = BernoulliRBM(random_state=0, verbose=True)
    classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])

    for i in xrange(steps):
        driver = allfiles[i]
        df, Y = create_merged_dataset(driver)
        df['label'] = Y
        # Shuffle DF.
        df = df.reindex(np.random.permutation(df.index))

        train = df[:100]
        label = train['label']
        del train['label']

        test = df[100:400]
        Y = test['label']
        del test['label']

        #to_drop = ['driver', 'trip', 'speed1', 'speed2', 'speed3', 'speed4', 'speed5', 'speed6', 'speed7', 'speed8', 'speed9',
        #        'speed10', 'speed11', 'speed12', 'speed13', 'speed14', 'speed15', 'speed16', 'speed17', 'speed18', 'speed19',
        #        'speed20', 'speed21', 'speed22', 'speed23', 'speed24', 'speed25', 'speed26', 'speed27', 'speed28', 'speed29',
        #        'speed30', 'speed31', 'speed32', 'speed33', 'speed34', 'speed35', 'speed36', 'speed37', 'speed38', 'speed39',
        #        'speed40', 'speed41', 'speed42', 'speed43', 'speed44', 'speed45', 'speed46', 'speed47', 'speed48', 'speed49',
        #        'speed50', 'speed51', 'speed52', 'speed53', 'speed54', 'speed55', 'speed56', 'speed57', 'speed58', 'speed59',
        #        'speed60', 'speed61', 'speed62', 'speed63', 'speed64', 'speed65', 'speed66', 'speed67', 'speed68', 'speed69',
        #        'speed70', 'speed71', 'speed72', 'speed73', 'speed74', 'speed75', 'speed76', 'speed77', 'speed78', 'speed79', 'speed80']
        to_drop = ['driver', 'trip']

        X_train = train.drop(to_drop, 1)
        X_test = test.drop(to_drop, 1)

        gbm.fit(X_train, label)
        Y_hat = gbm.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        gbm_metrics += metrics.auc(fpr, tpr)

        ada.fit(X_train, label)
        Y_hat = ada.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        ada_metrics += metrics.auc(fpr, tpr)

        etree.fit(X_train, label)
        Y_hat = etree.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        etree_metrics += metrics.auc(fpr, tpr)

        rf.fit(X_train, label)
        Y_hat = rf.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        rf_metrics += metrics.auc(fpr, tpr)

        kn.fit(X_train, label)
        Y_hat = kn.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        kn_metrics += metrics.auc(fpr, tpr)

        # Linear models.
        to_drop = [
            'driver', 'trip', 'distance', 'sd_acceleration', 'final_angle',
            'mean_acceleration', 'mean_avg_speed', 'sd_inst_speed',
            'sd_avg_speed', 'mean_inst_speed', 'points'
        ]

        X_train = train.drop(to_drop, 1)
        X_test = test.drop(to_drop, 1)

        logit.fit(X_train, label)
        Y_hat = [i[1] for i in logit.predict_proba(X_test)]
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        logit_metrics += metrics.auc(fpr, tpr)

        svr.fit(X_train, label)
        Y_hat = svr.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        svr_metrics += metrics.auc(fpr, tpr)

        ridge.fit(X_train, label)
        Y_hat = ridge.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        ridge_metrics += metrics.auc(fpr, tpr)

        bridge.fit(X_train, label)
        Y_hat = bridge.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        bridge_metrics += metrics.auc(fpr, tpr)

        enet.fit(X_train, label)
        Y_hat = enet.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        enet_metrics += metrics.auc(fpr, tpr)

        classifier.fit(X_train, label)
        Y_hat = classifier.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        nnet_metrics += metrics.auc(fpr, tpr)

    print ""
    print "GBM:", gbm_metrics / steps
    print "AdaBoost:", ada_metrics / steps
    print "Extra Trees:", etree_metrics / steps
    print "RF:", rf_metrics / steps
    print "KN:", kn_metrics / steps
    print ""
    print "Logit:", logit_metrics / steps
    print "SVR:", svr_metrics / steps
    print "Ridge:", ridge_metrics / steps
    print "BayesianRidge:", bridge_metrics / steps
    print "Elastic Net:", enet_metrics / steps
    print "Neural Networks:", nnet_metrics / steps
    print ""
#print('manual rescaledX\n', manual_scaled[1:5,0])

#Save Scaler
scaler_filename = variable + '_scaler.sav'
dump(scaler, scaler_filename)
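# The saved scaler should be reloaded and applied to any new data at prediction
# time so inputs are scaled exactly as during training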

#model = KNeighborsRegressor(n_neighbors = 3)
#model = LinearRegression()
#model = DecisionTreeRegressor()
#model = GradientBoostingRegressor()
model = ExtraTreesRegressor()
model.fit(rescaledX, Y_train)

# Transform the validation dataset
rescaledValidationX = scaler.transform(X_validation)
predictions = model.predict(rescaledValidationX)

print('\n', 'min: ', numpy.amin(Y_validation), 'max: ',
      numpy.amax(Y_validation))
print('\n', 'middle 50% of data in range: ',
      numpy.percentile(Y_validation, 25), ' - ',
      numpy.percentile(Y_validation, 75))

rmse = numpy.sqrt(mean_squared_error(Y_validation, predictions))
print('\n', 'RMSE: ', rmse,
      ' RMSE as % of median: ',
      100 * rmse / numpy.percentile(Y_validation, 50))
print('\nModel score: ', model.score(rescaledValidationX, Y_validation))

# Save the model to disk
filename = variable + '_model.sav'