def adaBoostModel(train_x, train_y, test_x, test_y, workOrFreeDay):
    
    rng = np.random.RandomState(1)
    
    adaBoost = AdaboostR(DTR(max_depth=5), n_estimators=300, random_state=rng)
    adaBoost.fit(train_x, train_y)
    predicted = adaBoost.predict(test_x)
    
    # show test results 
    printEvaluationScores(predicted, test_y, "AdaBoost model with MSFsc", workOrFreeDay)  
    
    # invoke the method that prints the structure of the 300 trained trees
    #saveTreeStrucutre(adaBoost)
    
    # Predict without MSFSC
    x_trainWithoutMSFSC = train_x.copy()
    x_testWithoutMSFSC = test_x.copy()
    
    del x_trainWithoutMSFSC['MSFSC']
    del x_testWithoutMSFSC['MSFSC']
    
    adaBoost.fit(x_trainWithoutMSFSC, train_y)
    predicted = adaBoost.predict(x_testWithoutMSFSC)
    
    # show test results 
    printEvaluationScores(predicted, test_y, "AdaBoost model with without MSFsc", workOrFreeDay) 
Example #2
def test_DecisionTreeRegressor():
    import numpy as np
    import pandas as pd
    from sklearn.datasets import load_boston
    dataset = load_boston()
    X, y, features = dataset['data'], dataset['target'], dataset[
        'feature_names']
    X = pd.DataFrame(X, columns=features)
    y = pd.DataFrame(y, columns=['target'])
    data = pd.concat([X, y], axis=1)

    features = data.columns[:-1]
    target = data.columns[-1]

    from sklearn.model_selection import train_test_split
    X_train, X_vali, y_train, y_vali = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=23)
    print('X_train shape: ', X_train.shape)
    print('X_vali shape: ', X_vali.shape)
    print('y_train shape: ', y_train.shape)
    print('y_vali shape: ', y_vali.shape)

    from ml.tree import DecisionTreeRegressor
    from sklearn.tree import DecisionTreeRegressor as DTR
    models = {}
    models['my_dtr'] = DecisionTreeRegressor(max_depth=5)
    models['sklearn_dtr'] = DTR(max_depth=5)

    for name, model in models.items():
        model.fit(X_train, y_train)
        print('%s score: %.8f' % (name, model.score(X_vali, y_vali)))
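# Note: load_boston was removed in scikit-learn 1.2. On newer versions the
# same test can be adapted to another regression dataset; a sketch using the
# California housing data (same flow, different columns):
def test_DecisionTreeRegressor_california():
    import pandas as pd
    from sklearn.datasets import fetch_california_housing
    dataset = fetch_california_housing()
    X = pd.DataFrame(dataset['data'], columns=dataset['feature_names'])
    y = pd.DataFrame(dataset['target'], columns=['target'])

    from sklearn.model_selection import train_test_split
    X_train, X_vali, y_train, y_vali = train_test_split(X, y, test_size=0.2, random_state=23)

    from sklearn.tree import DecisionTreeRegressor as DTR
    model = DTR(max_depth=5)
    model.fit(X_train, y_train)
    print('sklearn_dtr score: %.8f' % model.score(X_vali, y_vali))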
 def get_new_model(self):
     if (self.model_type.split("_")[-1] == "Regressor"):
         if (self.model_type == "Linear-Regressor"):
             from sklearn.linear_model import LinearRegression
             self.model = LinearRegression(**self.model_args)
         elif (self.model_type == "Support-Vector-Regressor"):
             from sklearn.svm import SVR
             self.model = SVR(**self.model_args)
         elif (self.model_type == "Decision-Tree-Regressor"):
             from sklearn.tree import DecisionTreeRegressor as DTR
             self.model = DTR(**self.model_args)
         elif (self.model_type == "Random-Forest-Regressor"):
             from sklearn.ensemble import RandomForestRegressor as RFR
             self.model = RFR(**self.model_args)
     else:
         if (self.model_type == "Logistic-Regression-Classifier"):
             from sklearn.linear_model import LogisticRegression
             self.model = LogisticRegression(**self.model_args)
         elif (self.model_type == "KNN-Classifier"):
             from sklearn.neighbors import KNeighborsClassifier as KNN
             self.model = KNN(**self.model_args)
         elif (self.model_type == "Support-Vector-Classifier"):
             from sklearn.svm import SVC
             self.model = SVC(**self.model_args)
         elif (self.model_type == "Naive-Bayes-Classifier"):
             from sklearn.naive_bayes import GaussianNB as GNB
             self.model = GNB(**self.model_args)
         elif (self.model_type == "Decision-Tree-Classifier"):
             from sklearn.tree import DecisionTreeClassifier as DTC
             self.model = DTC(**self.model_args)
         elif (self.model_type == "Random-Forest-Classifier"):
             from sklearn.ensemble import RandomForestClassifier as RFC
             self.model = RFC(**self.model_args)
Example #4
def show_bias_variance(feature_variables, n_test, n_train):
    # assumes: import random, statistics; import matplotlib.pyplot as plt;
    # from sklearn.tree import DecisionTreeRegressor as DTR;
    # filter_data is a project-specific helper
    target_variable = 'Class'
    X, Y = filter_data(feature_variables)

    n_x = len(X)
    random_test_index = random.sample(range(n_x), n_test)

    X_test = [X[i] for i in random_test_index]
    Y_test = [[Y[i]] for i in random_test_index]

    max_df = 30
    dfs = range(1, max_df+1) #max_depth
    result  = {i: [] for i in dfs}
    train_pool_index = [i for i in range(n_x) if i not in random_test_index]

    train_indexes = []
    for i in range(50):
        train_indexes_sample = random.sample(train_pool_index, n_train)
        as_list = sorted(train_indexes_sample)
        train_indexes.append(as_list)

    for df in dfs:
        model = DTR(max_depth = df, max_features=len(feature_variables))
        prediction_errors = []
        training_errors = []
        for i in range(50):
            X_train = [X[j] for j in train_indexes[i]]
            Y_train = [[Y[j]] for j in train_indexes[i]]
            model.fit(X_train, Y_train)
            y_predict = model.predict(X_test)
            y_predict_train = model.predict(X_train) 
            mse = statistics.mean([(y_predict[j]-Y_test[j][0])**2   for j in range(n_test)])
            mse_train = statistics.mean([(y_predict_train[j]-Y_train[j][0])**2   for j in range(n_train)])
            prediction_errors.append(mse)
            training_errors.append(mse_train)

        result[df].append(prediction_errors)
        result[df].append(statistics.mean(prediction_errors))
        result[df].append(training_errors)
        result[df].append(statistics.mean(training_errors))

    fig, ax = plt.subplots()
    sort_dfs = sorted(dfs)

    # one faint curve per resample (all 50, not max_df), then the mean curve drawn once
    for i in range(50):
        ax.plot(sort_dfs, [result[df][0][i] for df in sort_dfs], '-', color=(1, 0.2, 0, 0.25), lw=2)
    p = ax.plot(sort_dfs, [result[df][1] for df in sort_dfs], '-', color=(1, 0, 0, 1), lw=1.5)
    p[0].set_label("Expected Test Error Estimate")

    for i in range(50):
        ax.plot(sort_dfs, [result[df][2][i] for df in sort_dfs], '-', color=(0, 0.2, 1, 0.25), lw=2)
    p = ax.plot(sort_dfs, [result[df][3] for df in sort_dfs], '-', color=(0, 0, 1, 1), lw=1.5)
    p[0].set_label("Expected Training Error Estimate")

    ax.legend()
    ax.set_xlabel('Max Depth')
    ax.set_ylabel('Prediction Error')
    fig.show()
Example #5
def function(X_train, X_test, y_train, y_test, weight, dynamic=False):
    # note: the original signature (Xt, Xy, Yt, Yy) did not match the names
    # used in the body; ests and coef also need to be initialised here
    ests = []
    coef = []
    for i in range(50):
        estimator = DTR(max_depth=5, random_state=42)
        if i == 0:
            estimator.fit(X_train, y_train)
        else:
            estimator.fit(
                X_train, square_grad(y_train, gbm_predict(X_train, ests,
                                                          coef)))
        ests.append(estimator)
        if dynamic:
            coef.append(weight / (1 + i))
        else:
            coef.append(weight)
    return MSE(y_test, gbm_predict(X_test, ests, coef))
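# The snippet relies on helpers from the surrounding exercise (MSE is
# presumably sklearn's mean_squared_error). A minimal sketch of what the
# helpers likely look like -- hypothetical, not from the source:
def gbm_predict(X, ests, coef):
    # weighted sum of the base trees' predictions
    return sum(c * est.predict(X) for est, c in zip(ests, coef))

def square_grad(y_true, y_pred):
    # negative gradient of the squared loss, i.e. the current residuals
    return y_true - y_pred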
Example #6
def test_gbdt():
    import numpy as np
    import pandas as pd
    from sklearn.datasets import load_boston
    dataset = load_boston()
    X, y, features = dataset['data'], dataset['target'], dataset[
        'feature_names']
    X = pd.DataFrame(X, columns=features)
    y = pd.DataFrame(y, columns=['target'])
    data = pd.concat([X, y], axis=1)

    features = data.columns[:-1]
    target = data.columns[-1]

    from sklearn.model_selection import train_test_split
    X_train, X_vali, y_train, y_vali = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=25)
    print('X_train shape: ', X_train.shape)
    print('X_vali shape: ', X_vali.shape)
    print('y_train shape: ', y_train.shape)
    print('y_vali shape: ', y_vali.shape)

    from sklearn.tree import DecisionTreeRegressor as DTR
    dtr = DTR(max_depth=5)
    dtr.fit(X_train, y_train.values.reshape(-1))
    print('sklearn dtr score: ', dtr.score(X_vali, y_vali))

    from sklearn.ensemble import GradientBoostingRegressor as GBR
    import xgboost as xgb
    gbr = GBR(max_depth=5)
    gbr.fit(X_train, y_train)
    print('sklearn gbr score: ', gbr.score(X_vali, y_vali))

    from ml.tree import DecisionTreeRegressor
    mydtr = DecisionTreeRegressor(max_depth=5)
    mydtr.fit(X_train, y_train)
    print('my dtr score: ', mydtr.score(X_vali, y_vali))

    from ml.ensemble import GradientBoostingRegressor
    mygbr = GradientBoostingRegressor()
    mygbr.fit(X_train, y_train)
    print('my gbr score: ', mygbr.score(X_vali, y_vali))
def adaBoostModelWithCrossFoldValidation(inputData, outputData, workOrFreeDay):
      
    rng = np.random.RandomState(1)
    adaBoost = AdaboostR(DTR(max_depth=5), n_estimators=300, random_state=rng)
    
    # do leave one-out cross prediction
    adaBoostPredict = cross_val_predict(adaBoost, inputData, outputData, cv=len(inputData))   
    
    # show test results 
    printEvaluationScores(adaBoostPredict, outputData, "AdaBoost model with MSFsc and LOO prediction", workOrFreeDay)  
    
    # Predict without MSFSC
    dataWithoutMSFSC = inputData.copy()
    
    del dataWithoutMSFSC['MSFSC']
    adaBoostPredict = cross_val_predict(adaBoost, dataWithoutMSFSC, outputData, cv=len(inputData))
    
    # show test results 
    printEvaluationScores(adaBoostPredict, outputData, "AdaBoost model without MSFsc and LOO prediction", workOrFreeDay)
 def __init__(self,
              criterion='mse',
              splitter='best',
              max_depth=None,
              min_samples_split=2,
              min_samples_leaf=1,
              min_weight_fraction_leaf=0.0,
              max_features=None,
              random_state=None,
              max_leaf_nodes=None,
              min_impurity_decrease=0.0,
              min_impurity_split=None,
              ccp_alpha=0.0):
     self.max_leaf_nodes = max_leaf_nodes
     self.min_samples_split = min_samples_split
     self.random_state = random_state
     self.min_samples_leaf = min_samples_leaf
     self.ccp_alpha = ccp_alpha
     self.min_impurity_decrease = min_impurity_decrease
     self.max_features = max_features
     self.splitter = splitter
     self.max_depth = max_depth
     self.min_weight_fraction_leaf = min_weight_fraction_leaf
     self.min_impurity_split = min_impurity_split
     self.criterion = criterion
     self.model = DTR(  # note: min_impurity_split (removed in scikit-learn 1.0) and criterion='mse' (renamed in 1.2) pin this to an older sklearn
         ccp_alpha=self.ccp_alpha,
         min_impurity_decrease=self.min_impurity_decrease,
         min_weight_fraction_leaf=self.min_weight_fraction_leaf,
         min_impurity_split=self.min_impurity_split,
         splitter=self.splitter,
         min_samples_split=self.min_samples_split,
         max_leaf_nodes=self.max_leaf_nodes,
         max_depth=self.max_depth,
         min_samples_leaf=self.min_samples_leaf,
         max_features=self.max_features,
         criterion=self.criterion,
         random_state=self.random_state)
Example #9
def bagging(X_train, y_train, X_test, boot_count, depth):

    trees = np.array([DTR(max_depth=depth) for _ in range(0, boot_count)])
    X_train_bootstrap = np.array([])
    y_train_bootstrap = np.array([])

    # Build the training sets by bootstrap resampling:
    for i in range(0, boot_count):
        for j in range(0, X_train.shape[0]):
            random_index = random.randrange(X_train.shape[0])
            X_train_bootstrap = np.append(X_train_bootstrap,
                                          X_train[random_index])
            y_train_bootstrap = np.append(y_train_bootstrap,
                                          y_train[random_index])
    X_train_bootstrap = X_train_bootstrap.reshape(boot_count, X_train.shape[0],
                                                  X_train.shape[1])
    y_train_bootstrap = y_train_bootstrap.reshape(boot_count, X_train.shape[0])

    # Fit one tree per bootstrap sample:
    fitted_trees = np.array([
        trees[i].fit(X_train_bootstrap[i], y_train_bootstrap[i])
        for i in range(0, boot_count)
    ])

    # Predict with the ensemble of trees:
    y_predicts = np.array([tree.predict(X_test) for tree in fitted_trees])
    y_predicts = y_predicts.reshape(boot_count, X_test.shape[0])
    # Average the per-tree predictions
    y_pred = y_predicts.mean(axis=0)
    return y_pred
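# For comparison, scikit-learn ships the same bootstrap-aggregation idea as
# BaggingRegressor; a minimal sketch with matching settings (assuming the
# same X_train/y_train/X_test as above):
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor as DTR

bag = BaggingRegressor(DTR(max_depth=10), n_estimators=200, bootstrap=True)
y_pred_sklearn = bag.fit(X_train, y_train).predict(X_test)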
 PAR(C=1.0,
     fit_intercept=False,
     tol=None,
     shuffle=True,
     verbose=1,
     loss='epsilon_insensitive',
     epsilon=0.01,
     random_state=rng),
 'svr_rbf':
 SVR(kernel='rbf', C=1e3, shrinking=True, verbose=True),
 'svr_ply':
 SVR(kernel='poly', C=1e3, degree=3, shrinking=True, verbose=True),
 'gpr':
 GPR(kernel=None, alpha=1e-10, optimizer='fmin_l_bfgs_b', random_state=rng),
 'dtr':
 DTR(max_depth=10),
 'kr_rbf':
 KernelRidge(kernel='rbf', gamma=0.1, alpha=1e-2),
 'kr_ply':
 KernelRidge(kernel='poly', gamma=10.1, alpha=1e-2, degree=3),
 'mlp_r':
 MLPRegressor(
     hidden_layer_sizes=(
         10,
         8,
         5,
     ),
     activation='tanh',
     solver='adam',  # alternatively 'lbfgs'
     alpha=0.0001,
     batch_size=32,
Example #11
def getDTC(data, target, depth):
    Y = data[target]
    X = data.drop(target, axis=1)
    model = DTR(max_depth=depth)
    model.fit(X, Y)  # the original returned the model unfitted, leaving X and Y unused
    return model
Example #12
})
df_['Category of interaction'] = df_['Category of interaction'].map({
    'positive': 1,
    'negative': -1,
    'neutral': 0
})

#### separating dependent and independent variables
x = df_.drop(['Churn date'], axis=1)
y = df_['Churn date']

# splitting into training and test data
from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(x,
                                                    y,
                                                    random_state=23,
                                                    test_size=0.3)

from sklearn.tree import DecisionTreeRegressor as DTR

dtr = DTR(max_depth=41, random_state=23)
dtr.fit(train_x, train_y)

pickle.dump(dtr, open('model.pkl', 'wb'))

model = pickle.load(open('model.pkl', 'rb'))
print(model.predict([[737000, 737002, 0]]))
Example #13
for i in range(0, len(y)):
    if y[i] == 0:
        y[i] = 1

from sklearn.model_selection import train_test_split as tts
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=0)

from sklearn.linear_model import LinearRegression as LR
reg = LR()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
y_pred_train = reg.predict(X_train)
Table(y_pred, y_pred_train, y_train, y_test, 'LR', X_train, X_test)

from sklearn.tree import DecisionTreeRegressor as DTR
reg = DTR()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
y_pred_train = reg.predict(X_train)
Table(y_pred, y_pred_train, y_train, y_test, 'DTR', X_train, X_test)

from sklearn.ensemble import RandomForestRegressor as RF
reg = RF()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
y_pred_train = reg.predict(X_train)
Table(y_pred, y_pred_train, y_train, y_test, 'RFR', X_train, X_test)
plotting(y_pred, 'RFR')

from lightgbm import LGBMRegressor as lgb
reg = lgb()
Example #14
# In[21]:


from sklearn.tree import DecisionTreeRegressor as DTR
from sklearn.model_selection import RandomizedSearchCV

parameters={"splitter":["best","random"],
            "max_depth" : [1,3,5,7,9,11,12],
           "min_samples_leaf":[1,2,3,4,5,6,7,8,9,10],
           "min_weight_fraction_leaf":[0.1,0.2,0.3,0.4,0.5],
           "max_features":["auto","log2","sqrt",None],
           "max_leaf_nodes":[None,10,20,30,40,50,60,70,80,90] 
           }

randGrid = RandomizedSearchCV(DTR(), parameters, cv=10, scoring='r2', n_iter=1000, verbose=1, n_jobs=3)
randGrid.fit(xall, yall)
print(randGrid.best_params_)
print(randGrid.best_score_)


# In[22]:


parameters={"splitter":["best","random"],
            "max_depth" : [1,3,5,7,9,11,12],
           "min_samples_leaf":[1,2,3,4,5,6,7,8,9,10],
#            "min_weight_fraction_leaf":[0.1,0.2,0.3,0.4,0.5],
           "max_features":["auto","log2","sqrt",None],
#            "max_leaf_nodes":[None,10,20,30,40,50,60,70,80,90] 
           }
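# (truncated in the source; presumably the randomized search is then repeated
# with this reduced grid, mirroring the cell above:)
# randGrid = RandomizedSearchCV(DTR(), parameters, cv=10, scoring='r2', n_iter=1000, verbose=1, n_jobs=3)
# randGrid.fit(xall, yall)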
display(samples - np.round(data.mean()))
display(samples - np.round(data.median()))

from sklearn.tree import DecisionTreeRegressor as DTR
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in scikit-learn 0.20

# Make a copy of the DataFrame, using the 'drop' function to drop the given feature
new_data = data.drop(['Frozen'], axis=1, inplace=False)

# Split the data into training and testing sets using the given feature as the target
new_y = data.drop(['Fresh','Milk','Grocery','Detergents_Paper','Delicatessen'], axis=1, inplace=False)
X_train, X_test, y_train, y_test = train_test_split(new_data, new_y, test_size=0.25, random_state=42)

# Create a decision tree regressor and fit it to the training set
regressor = DTR(random_state=42)
regressor.fit(X_train, y_train)

# Report the score of the prediction using the testing set
score = regressor.score(X_test, y_test)
print(score)

# Scale the data using the natural logarithm
log_data = np.log(data)

# Scale the sample data using the natural logarithm
log_samples = np.log(samples)

# Produce a scatter matrix for each pair of newly-transformed features
pd.plotting.scatter_matrix(log_data, alpha = 0.3, figsize = (14,8), diagonal = 'kde');  # pd.scatter_matrix was removed in pandas 1.0
Example #16
# loading original dataset
data = pd.read_csv(path + 'SPP+.csv')
# arranging features from original dataset for model learning
x = data.drop([
    'Wavelength (nm)', 'Width (nm)', 'AspectRatio', 'Length (nm)',
    'Linewidth (nm)', 'MaxCscat'
],
              axis=1)
w_y = data['Width (nm)']
l_y = data['Length (nm)']

# parameters for GridSearchCV class
param_grid = {'max_depth': range(1, 31)}

# Initialize GridSearchCV class
width_gs = GridSearchCV(estimator=DTR(),
                        param_grid=param_grid,
                        cv=10,
                        scoring='neg_mean_squared_error')
length_gs = GridSearchCV(estimator=DTR(),
                         param_grid=param_grid,
                         cv=10,
                         scoring='neg_mean_squared_error')

width_gs.fit(x, w_y)
length_gs.fit(x, l_y)

joblib_width_file = "joblib_width_gs.pkl"
joblib.dump(width_gs, joblib_width_file)

joblib_length_file = "joblib_length_gs.pkl"
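# (the dump of the length model is cut off in the source; by symmetry with
# the width model it is presumably:)
joblib.dump(length_gs, joblib_length_file)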
 def __init__(self, featureset=None, target=None, mode='predict', path=''):
     if (mode == 'train'):
         self.__svm = SVC(C=1.0,
                          cache_size=200,
                          class_weight=None,
                          coef0=0.0,
                          decision_function_shape='ovr',
                          degree=3,
                          gamma='auto',
                          kernel='rbf',
                          max_iter=-1,
                          probability=False,
                          random_state=None,
                          shrinking=True,
                          tol=0.001,
                          verbose=False)
         self.__svr = SVR(C=1.0,
                          cache_size=200,
                          coef0=0.0,
                          degree=3,
                          epsilon=0.1,
                          gamma='auto',
                          kernel='rbf',
                          max_iter=-1,
                          shrinking=True,
                          tol=0.001,
                          verbose=False)
         self.__nusvm = NuSVC(cache_size=200,
                              class_weight=None,
                              coef0=0.0,
                              decision_function_shape='ovr',
                              degree=3,
                              gamma='auto',
                              kernel='rbf',
                              max_iter=-1,
                              nu=0.5,
                              probability=False,
                              random_state=None,
                              shrinking=True,
                              tol=0.001,
                              verbose=False)
         self.__nusvr = NuSVR(C=1.0,
                              cache_size=200,
                              coef0=0.0,
                              degree=3,
                              gamma='auto',
                              kernel='rbf',
                              max_iter=-1,
                              nu=0.5,
                              shrinking=True,
                              tol=0.001,
                              verbose=False)
         self.__linsvm = LinearSVC(C=1.0,
                                   class_weight=None,
                                   dual=True,
                                   fit_intercept=True,
                                   intercept_scaling=1,
                                   loss='squared_hinge',
                                   max_iter=1000,
                                   multi_class='ovr',
                                   penalty='l2',
                                   random_state=None,
                                   tol=0.0001,
                                   verbose=0)
         self.__linsvr = LinearSVR(C=1.0,
                                   dual=True,
                                   epsilon=0.0,
                                   fit_intercept=True,
                                   intercept_scaling=1.0,
                                   loss='epsilon_insensitive',
                                   max_iter=1000,
                                   random_state=None,
                                   tol=0.0001,
                                   verbose=0)
         self.__mlpc = MLPC(activation='relu',
                            alpha=1e-05,
                            batch_size='auto',
                            beta_1=0.9,
                            beta_2=0.999,
                            early_stopping=False,
                            epsilon=1e-08,
                            hidden_layer_sizes=(100, 25),
                            learning_rate='constant',
                            learning_rate_init=0.001,
                            max_iter=200,
                            momentum=0.9,
                            nesterovs_momentum=True,
                            power_t=0.5,
                            random_state=1,
                            shuffle=True,
                            solver='lbfgs',
                            tol=0.0001,
                            validation_fraction=0.1,
                            verbose=False,
                            warm_start=False)
         self.__mlpr = MLPR(activation='relu',
                            alpha=0.0001,
                            batch_size='auto',
                            beta_1=0.9,
                            beta_2=0.999,
                            early_stopping=False,
                            epsilon=1e-08,
                            hidden_layer_sizes=(100, 25),
                            learning_rate='constant',
                            learning_rate_init=0.001,
                            max_iter=200,
                            momentum=0.9,
                            nesterovs_momentum=True,
                            power_t=0.5,
                            random_state=None,
                            shuffle=True,
                            solver='adam',
                            tol=0.0001,
                            validation_fraction=0.1,
                            verbose=False,
                            warm_start=False)
         self.__dtc = DTC(class_weight=None,
                          criterion='gini',
                          max_depth=None,
                          max_features=None,
                          max_leaf_nodes=None,
                          min_impurity_decrease=0.0,
                          min_impurity_split=None,
                          min_samples_leaf=1,
                          min_samples_split=2,
                          min_weight_fraction_leaf=0.0,
                          presort=False,
                          random_state=None,
                          splitter='best')
         self.__dtr = DTR(criterion='mse',
                          max_depth=None,
                          max_features=None,
                          max_leaf_nodes=None,
                          min_impurity_decrease=0.0,
                          min_impurity_split=None,
                          min_samples_leaf=1,
                          min_samples_split=2,
                          min_weight_fraction_leaf=0.0,
                          presort=False,
                          random_state=None,
                          splitter='best')
         self.__rfc = RFC(bootstrap=True,
                          class_weight=None,
                          criterion='gini',
                          max_depth=100,
                          max_features='auto',
                          max_leaf_nodes=None,
                          min_impurity_decrease=0.0,
                          min_impurity_split=None,
                          min_samples_leaf=1,
                          min_samples_split=2,
                          min_weight_fraction_leaf=0.0,
                          n_estimators=50,
                          n_jobs=1,
                          oob_score=False,
                          random_state=None,
                          verbose=0,
                          warm_start=False)
         self.__rfr = RFR(bootstrap=True,
                          criterion='mse',
                          max_depth=None,
                          max_features='auto',
                          max_leaf_nodes=None,
                          min_impurity_decrease=0.0,
                          min_impurity_split=None,
                          min_samples_leaf=1,
                          min_samples_split=2,
                          min_weight_fraction_leaf=0.0,
                          n_estimators=10,
                          n_jobs=1,
                          oob_score=False,
                          random_state=None,
                          verbose=0,
                          warm_start=False)
         (self.__svm, self.__svr, self.__nusvm, self.__nusvr, self.__linsvm,
          self.__linsvr, self.__mlpc, self.__mlpr, self.__dtc, self.__dtr,
          self.__rfc, self.__rfr) = self.__trainAll(X=list(featureset),
                                                    Y=list(target))
         self.__saveModelsToFile(path)
     else:
         self.__svm = joblib.load(path + 'Mel_SVM.pkl')
         self.__svr = joblib.load(path + 'Mel_SVR.pkl')
         self.__nusvm = joblib.load(path + 'Mel_NuSVM.pkl')
         self.__nusvr = joblib.load(path + 'Mel_NuSVR.pkl')
         self.__linsvm = joblib.load(path + 'Mel_LinSVM.pkl')
         self.__linsvr = joblib.load(path + 'Mel_LinSVR.pkl')
         self.__mlpc = joblib.load(path + 'Mel_MLPC.pkl')
         self.__mlpr = joblib.load(path + 'Mel_MLPR.pkl')
         self.__dtc = joblib.load(path + 'Mel_DTC.pkl')
         self.__dtr = joblib.load(path + 'Mel_DTR.pkl')
         self.__rfc = joblib.load(path + 'Mel_RFC.pkl')
         self.__rfr = joblib.load(path + 'Mel_RFR.pkl')
Example #18


X, y = load_boston(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=123,
                                                    shuffle=True)
y_pred = bagging(X_train, y_train, X_test, boot_count=200, depth=10)
y_dt_pred = DTR().fit(X_train, y_train).predict(X_test)
y_rf_pred = RandomForestRegressor().fit(X_train, y_train).predict(X_test)

print(mean_squared_error(y_rf_pred, y_test))
print(mean_squared_error(y_dt_pred, y_test))
print(mean_squared_error(y_pred, y_test))
Example #19
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#Importing the dataset
dataset = pd.read_csv("D:\work\ML A to Z\Own\Regression\Faces.csv")

#Matrix and vector
X = dataset.iloc[:, 1:5].values
y = dataset.iloc[:, 5].values

#training and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

#Regressor
from sklearn.tree import DecisionTreeRegressor as DTR
regressor = DTR()
regressor.fit(X_train, y_train)

#prediction
y_pred = regressor.predict(X_test)

#Plotting (X has four feature columns, so plot against a single one;
#the original passed the full 2-D X_test, which scatter rejects)
plt.scatter(X_test[:, 0], y_test, color="red")
plt.scatter(X_test[:, 0], y_pred, color="blue")
plt.show()
Example #20
# some_prepared = full_pipeline.transform(some_data)
# print("Predictions: ", lr.predict(some_prepared))
# print("Original Labels: ", list(some_labels))
# the predictions are way off, which means the model underfits the data

# use Scikit-Learn's mean_squared_error to compute the regression model's RMSE on the full training set
from sklearn.metrics import mean_squared_error
housing_predictions = lr.predict(housing_prepared)
# lr_mse = mean_squared_error(housing_labels, housing_predictions)
# rmse = np.sqrt(lr_mse)
# print("RMSE: ", rmse) # 預測誤差 68628.19819848923 美元

# for an underfitting model, pick a more powerful model, provide better features, loosen the constraints, etc.
# here we switch to the more powerful DecisionTreeRegressor
from sklearn.tree import DecisionTreeRegressor as DTR
tree_reg = DTR()
tree_reg.fit(housing_prepared, housing_labels)
predicted = tree_reg.predict(housing_prepared)
# mse = mean_squared_error(housing_labels, predicted)
# rmse = np.sqrt(mse)
# print("RMSE: ", rmse) # RMSE:  0.0
# the fit here is 100% accurate (RMSE 0.0), but we must ask whether it is overfitting

# therefore, use cross-validation to evaluate the model properly
from sklearn.model_selection import cross_val_score as cvs
scores = cvs(tree_reg, housing_prepared, housing_labels, scoring = "neg_mean_squared_error", cv = 10)
rmse_scores = np.sqrt(-scores)
# Scikit-Learn's cross-validation expects a utility function (greater is better) rather than a cost function (smaller is better), so the computed scores are actually negative MSE
# inspect the results
def display_score(scores):
    print("Score: ", scores)
z = sqrt(mean_squared_error(y_test, predicted))
print('RMS Evaluation:  {}'.format(z))
print('Prediction/Fit Run Time: {}\n'.format(elapsed_time))

print("Bayesian Ridge:")
model = BR()
start_time = time.time()
model.fit(X_train, y_train)
predicted = model.predict(X_test)
elapsed_time = time.time() - start_time
z = sqrt(mean_squared_error(y_test, predicted))
print('RMS Evaluation:  {}'.format(z))
print('Prediction/Fit Run Time: {}\n'.format(elapsed_time))

print("Decision Tree Regression:")
model = DTR(max_depth=3)
start_time = time.time()
model.fit(X_train, y_train)
predicted = model.predict(X_test)
elapsed_time = time.time() - start_time
z = sqrt(mean_squared_error(y_test, predicted))
print('RMS Evaluation:  {}'.format(z))
print('Prediction/Fit Run Time: {}\n'.format(elapsed_time))

print("Linear Regression:")
model = LNR()
start_time = time.time()
model.fit(X_train, y_train)
predicted = model.predict(X_test)
elapsed_time = time.time() - start_time
z = sqrt(mean_squared_error(y_test, predicted))
def main():
    # ----------------------------
    # Training data
    # ----------------------------
    # Loading training data
    trainingDataFile = 'Training_set.csv'
    trainingData = pd.read_csv(trainingDataFile)

    # Obtaining unique cases of events (Note: This remains the same for both training and test data)
    myEventSet = []
    for x in trainingData.events:
        if x not in myEventSet:
            myEventSet.append(x)
    print('Unique events are as follows: \n', myEventSet,'\n')


    # Event string value reassignment based on unique event cases in 'myEventSet'
    newEvents = []
    for x in trainingData.events:
        for i in range(len(myEventSet)):
            if x == myEventSet[i]:
                newEvents.append(i)

    # Converting datetime to Seconds and saving day of the week
    day = []
    numDateTrainData = []
    for i in range(len(trainingData.date)):
        date_obj = datetime.strptime(str(trainingData.date[i]), '%Y-%m-%d')
        numDateTrainData.append(date_obj.timestamp())
        day.append(date_obj.weekday())

    #print(trainingData.date)
    dictReqCount = {}
    for i in range(len(trainingData.date)):
        if day[i] not in dictReqCount.keys():
            dictReqCount[day[i]] = []
        dictReqCount[day[i]].append(trainingData.request_count[i])
    #print(dictReqCount)

    dictAvgReqCount = {}
    for key,val in dictReqCount.items():
        dictAvgReqCount[key] = sum(val)/len(val)
    #print(dictAvgReqCount)

    maxValue = max(dictAvgReqCount.values())
    maxKey = [key for key,val in dictAvgReqCount.items() if val == maxValue]
    print('Day #{} of the week has the max mean request count'.format(maxKey[0]))

    minValue = min(dictAvgReqCount.values())
    minKey = [key for key, val in dictAvgReqCount.items() if val == minValue]
    print('Day #{} of the week has the min mean request count'.format(minKey[0]))


    # Assembling feature arrays
    features_trainingData = []
    for i in range(len(numDateTrainData)):
        row = [numDateTrainData[i], day[i], trainingData.calendar_code[i], trainingData.site_count[i], trainingData.max_temp[i], trainingData.min_temp[i], trainingData.precipitation[i], newEvents[i]]
        features_trainingData.append(row)

    #for i in range(len(features_trainingData)):
    #    print(len(features_trainingData[i]))

    #Y = list(trainingData.request_count)
    Y = trainingData.request_count
    X = features_trainingData

    #print('length of Y =', len(Y))
    #print(features_trainingData)

    # Models that work on both continuous and discrete data
    scoring = 'neg_mean_squared_error'
    models = [DTR(), GNB(), RFR(), KNR()]
    '''models = [[DTR(), DTR(max_depth=2), DTR(max_depth=5)],
              [GNB(), GNB(priors=None)],
              [RFR(), RFR(), RFR()],
              [KNR(), KNR(), KNR()]]
              '''
    seed = 7
    kfold = MS.KFold(n_splits=10, shuffle=True, random_state=seed)  # shuffle must be enabled when random_state is set
    i = 0
    mErr = []
    for model in models:
        results = MS.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
        mErr.append(results.mean())
        i += 1
    #print(mErr)

    best_model_index = 0
    minAbsErr = math.fabs(mErr[0])
    for i in range(1, len(mErr)):
        if math.fabs(mErr[i]) < minAbsErr:
            best_model_index = i
            minAbsErr = math.fabs(mErr[i])
    print('\nModel #%d (i.e. %s) performed best' % (best_model_index, str(models[best_model_index]).split('(')[0]))

    # -------------------------------------------------------
    # Test Data
    # -------------------------------------------------------
    # Loading test data
    testDataFile = 'Test_set.csv'
    testData = pd.read_csv(testDataFile)

    # Event string reassignment using myEventSet from training data
    newEvents = []
    for x in testData.events:
        for i in range(len(myEventSet)):
            if x == myEventSet[i]:
                newEvents.append(i)

    # Converting datetime to Seconds and determining days of the week
    day = []
    numDateTestData = []
    for i in range(len(testData.date)):
        date_obj = datetime.strptime(str(testData.date[i]), '%Y-%m-%d')
        numDateTestData.append(date_obj.timestamp())
        day.append(date_obj.weekday())

    # Assembling feature arrays
    features_testData = []
    for i in range(len(numDateTestData)):
        row = [numDateTestData[i], day[i], testData.calendar_code[i], testData.site_count[i], testData.max_temp[i],
               testData.min_temp[i], testData.precipitation[i], newEvents[i]]
        features_testData.append(row)

    # Test data features
    X_test = features_testData

    # Test data prediction
    bestModel = models[best_model_index].fit(X, Y)
    Y_pred = bestModel.predict(X_test)
    Y_pred_train = bestModel.predict(X)  # the original refit the identical model a second time
    print('\nThe predicted values for request count using the test data are as follows:\n', Y_pred)

    output_file = open('predicted_request_count.csv','w')
    for i in range(len(Y_pred)):
        output_file.write(str(Y_pred[i])+'\n')
    output_file.close()

    # Plot the results
    plt.figure(1)
    plt.scatter(numDateTrainData, Y, c="darkorange", label="Training data")
    plt.scatter(numDateTestData, Y_pred, c="cornflowerblue", label="Test data model prediction")
    plt.scatter(numDateTrainData, Y_pred_train, c="red", label="Training data model prediction")
    plt.xlabel("Numerical Date")
    plt.ylabel("Page Count")
    plt.title("Best Model")
    plt.legend()
    plt.show()
Example #23
for i in data1.columns:
    if data1[i].dtype == object:
        print(i)
        data1 = cat_to_num(data1, i)

data1.drop(['gender', 'ssc_b', 'hsc_b', 'degree_t'], inplace=True, axis=1)

x = data1.drop('salary', axis=1)  # drop the target so it does not leak into the features
y = data1['salary']

from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2)

from sklearn.tree import DecisionTreeRegressor as DTR
regr = DTR()
regr.fit(x_train,y_train)

from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(x_train,y_train)

from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
rf.fit(x_train, y_train)  # the original fit on the test split, i.e. trained on the evaluation data

from sklearn.metrics import r2_score
print(r2_score(y_test,regr.predict(x_test)))

ax1 = sns.distplot(y_test, hist=False, color="r", label="Actual Value")
sns.distplot(regr.predict(x_test), color="b", hist=False, label="Predicted Value", ax=ax1)
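# Note: seaborn.distplot is deprecated (removed in seaborn 0.14); a rough
# modern equivalent of the two calls above, assuming the same data:
# ax1 = sns.kdeplot(y_test, color="r", label="Actual Value")
# sns.kdeplot(regr.predict(x_test), color="b", label="Predicted Value", ax=ax1)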
def trainDecisionTreeModel(inputData, outputData, workOrFreeDay):
    
    regressor = DTR(random_state=0, max_depth=5)
    predicted = cross_val_predict(regressor, inputData, outputData, cv=10)
    printEvaluationScores(predicted, outputData, "Simple DecisionTree", workOrFreeDay)
plt.xlabel('Date');


# So this is as far as a linear regression will go. This gives us a baseline $R^2$ of 61.2% to build from with a more complicated model.
# 
# The most obvious thing that will need to be improved is the fact that negative radiation is impossible. Therefore, we will need a model that can deal with this. 

# ## Decision Tree Algorithm

# In[32]:


from sklearn.tree import DecisionTreeRegressor as DTR

# fit a decision tree
dt = DTR()
dt.fit(X_train, y_train)


# In[33]:


dt.score(X=X_test, y=y_test)


# In[34]:


# predict using the decision tree model
y_pred = dt.predict(X_test)
enet_msq = msq(y_test, enet.predict(X_test))
enet_r2 = r2(y_test, enet.predict(X_test))  # r2_score expects (y_true, y_pred); reversing them changes the value
print('\nThe mean squared error of the ElasticNet model is: \t\t\t%s' %
      enet_msq)
print('The R2 score of the ElasticNet model is: \t\t\t\t%s' % enet_r2)

# =============================================================================
# TREE CLASSIFICATIONS
# =============================================================================
from sklearn.tree import DecisionTreeRegressor as DTR
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.neighbors import KNeighborsRegressor as KNR
from sklearn.neighbors import KNeighborsClassifier as KNC

# DECISION TREE REGRESSOR
dtr = DTR()
dtr.fit(X_train, y_train)
dtr_msq = msq(y_test, dtr.predict(X_test))
dtr_r2 = r2(y_test, dtr.predict(X_test))
print(
    '\nThe mean squared error of the Decision Tree Regressor model is: \t%s' %
    dtr_msq)
print('The R2 score of the Decision Tree Regressor model is: \t\t\t%s' %
      dtr_r2)

# DECISION TREE CLASSIFIER
dtc = DTC()
dtc.fit(X_train, y_train)
dtc_msq = msq(y_test, dtc.predict(X_test))
dtc_r2 = r2(y_test, dtc.predict(X_test))
print(
Example #27
"""
##############################Regression##############################
"""
from sklearn.metrics import mean_squared_error

#k-Neighbors Regressor
from sklearn.neighbors import KNeighborsRegressor as KNR
knr_energy = KNR(weights='distance').fit(X_train_energy_pca, y_train_energy)
y_pred_knr = knr_energy.predict(X_test_energy_pca)
print("Mean squared error for kNN: {:.3f}.".format(
    mean_squared_error(y_pred_knr, y_test_energy)))

#Decision Tree Regressor
from sklearn.tree import DecisionTreeRegressor as DTR
dtr_energy = DTR(max_depth=11,
                 min_samples_split=16,
                 min_samples_leaf=2,
                 random_state=37).fit(X_train_energy_stand, y_train_energy)
y_pred_dtr = dtr_energy.predict(X_test_energy_stand)
print("Mean squared error for DTR: {:.3f}.".format(
    mean_squared_error(y_pred_dtr, y_test_energy)))

#Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor as RFR
rfr_energy = RFR(n_estimators=100,
                 min_samples_leaf=2,
                 max_leaf_nodes=1000,
                 random_state=37).fit(X_train_energy, y_train_energy)
y_pred_rfr = rfr_energy.predict(X_test_energy)
print("Mean squared error for RFR: {:.3f}.".format(
    mean_squared_error(y_pred_rfr, y_test_energy)))
Example #28
	mydata = mydata.loc[myreturn.index]

	myreturn = myreturn.sort_values(ascending=False)
	total = len(myreturn.index)
	#mark the first 1/3 stocks (by return) as 1, and the last 1/3 as 0
	top_index = myreturn.index[:int(total/3)]
	bottom_index = myreturn.index[int(total*2/3):]
	top = mydata.loc[top_index]
	bottom = mydata.loc[bottom_index]
	top['target'] = myreturn.loc[top_index]
	bottom['target'] = myreturn.loc[bottom_index]
	mydata = pd.concat([top,bottom],axis=0).dropna(axis=0)
	target = mydata.pop('target')

	#train a new regression tree of today (a fresh DTR per round; the original
	#reused one instance, so every entry of mytrees pointed to the same final fit)
	mytrees = []
	myscores = []
	for i in range(0,num_trees):
		dtr = DTR(max_depth=m_depth)
		train_X,test_X, train_y, test_y = train_test_split(mydata,target,test_size = 0.2,random_state = 11+2*i)
		newresult = dtr.fit(train_X,train_y)
		mytrees.append(newresult)
		y_predict = pd.Series(newresult.predict(test_X))
		test_y = pd.Series(test_y)

		test_y = test_y.sort_values(ascending=False)
		for rank in range(0,int(len(test_y)/2)):
			test_y.iloc[rank] = 1
		for rank in range(int(len(test_y)/2),len(test_y)):
			test_y.iloc[rank] = 0
		test_y = test_y.sort_index().tolist()
Example #29
print("\nMean squared error for Linear Regression: {:.3f}.".format(
    mean_squared_error(LinReg_y_pred, y_test)))
plt.scatter(LinReg_y_pred, y_test, c='green')
"""
####################Ridge Regression####################
"""
ridge = Ridge().fit(X_train, y_train)
Ridge_y_pred = ridge.predict(X_test)
print("\nMean squared error for Ridge Regression: {:.3f}.".format(
    mean_squared_error(Ridge_y_pred, y_test)))
plt.scatter(Ridge_y_pred, y_test, c='red')
"""
####################Regression Tree####################
"""
#No changes
R_tree = DTR(random_state=37).fit(X_train, y_train)
RT_y_pred = R_tree.predict(X_test)
print("\nMean squared error for Regression Tree: {:.3f}.".format(
    mean_squared_error(RT_y_pred, y_test)))

#Depth set to 11
R_tree = DTR(max_depth=11, random_state=37).fit(X_train, y_train)
RT_y_pred = R_tree.predict(X_test)
print("\tDepth set to 11: {:.3f}.".format(mean_squared_error(
    RT_y_pred, y_test)))

#Min samples split set to 16
R_tree = DTR(min_samples_split=16, random_state=37).fit(X_train, y_train)
RT_y_pred = R_tree.predict(X_test)
print("\tMin samples split set to 16: {:.3f}.".format(
    mean_squared_error(RT_y_pred, y_test)))
Example #30
def train_some_model(X, X_cat, y, model='linear', params=None):
    ada_scores = []
    ridge_scores = []
    gbr_scores = []
    linear_scores = []
    knn_scores = []

    k = 0

    K = 10
    for train, test in model_selection.KFold(K, shuffle=True).split(X, y):
        X_train, X_test, y_train, y_test = X.iloc[train], X.iloc[test], y.iloc[
            train], y.iloc[test]

        X_cat_train, X_cat_test = X_cat.iloc[train], X_cat.iloc[test]
        sale_price_mean = X_cat_train['SalePrice'].mean()

        for c in X_cat.columns:
            if c == 'SalePrice':
                continue
            hood_price = X_cat_train.groupby(
                c)['SalePrice'].mean().reset_index()
            hood_price.columns = [c, c + '_mean_price']
            merged_train = X_cat_train.reset_index().merge(
                hood_price, how='left',
                on=[c]).set_index('index')[c + '_mean_price']
            X_train = pd.concat((X_train, merged_train), axis=1)
            merged_test = X_cat_test.reset_index().merge(
                hood_price, how='left', on=[
                    c
                ]).set_index('index')[c +
                                      '_mean_price'].fillna(sale_price_mean)
            X_test = pd.concat((X_test, merged_test), axis=1)

        y_train = np.log(y_train)
        y_test = np.log(y_test)

        split = (X_train, y_train, X_test, y_test)

        dtr = DTR(criterion='mse', max_depth=None)  # note: 'mse' was renamed 'squared_error' in scikit-learn 1.2
        ada = ensemble.AdaBoostRegressor(base_estimator=dtr,
                                         n_estimators=100,
                                         learning_rate=1.0,
                                         loss='exponential')
        score = get_score(ada, split)
        ada_scores.append(score)

        ridge = linear_model.Ridge(250)
        score = get_score(ridge, split)
        ridge_scores.append(score)

        gbr = ensemble.GradientBoostingRegressor(n_estimators=100,
                                                 learning_rate=0.1,
                                                 max_depth=5)
        score = get_score(gbr, split)
        gbr_scores.append(score)

        linear = linear_model.LinearRegression()
        score = get_score(linear, split)
        linear_scores.append(score)

        #        KNN scored badly here, so it is left disabled:
        #        knn = KNR(n_neighbors=10)
        #        score = get_score(knn, split)
        #        knn_scores.append(score)

        #        print(dict(zip(X_train.columns, model.feature_importances_)))
        #        print(X_train.columns[np.argpartition(model.feature_importances_, -4)[-4:]])
        #        print(model.feature_importances_[np.argpartition(model.feature_importances_, -4)[-4:]])

        # print(model.feature_importances_)

        k += 1
    scores = [ada_scores, ridge_scores, gbr_scores, linear_scores, knn_scores]
    return scores, k
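# A hypothetical driver for the function above (get_score and the X/X_cat/y
# frames come from the surrounding project; names are assumptions):
# scores, k = train_some_model(X, X_cat, y)
# for name, s in zip(['ada', 'ridge', 'gbr', 'linear', 'knn'], scores):
#     if s:
#         print('%s mean score over %d folds: %.4f' % (name, k, np.mean(s)))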