Example #1
def linearSVR(data):
    X = data.drop(["id", "date", "price","long","lat", "zipcode","yr_renovated", "sqft_above", "sqft_basement"], axis=1)
    y = data["price"]
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.10, random_state=42)
    svr = LinearSVR(random_state=42)
    svr.fit(X_train, y_train)
    y_predict = svr.predict(X_test)
    print "r2-score for LinearSVR: %f" % r2_score(y_test, y_predict)
Example #2
def linearSVR(X, c_param, norm=2):
    if norm == 2:
        XX = normalizeL2(X)
    else:
        XX = X  # fall back to the raw features so XX is always defined

    T = X.shape[0] # temporal length
    clf = LinearSVR(C=c_param, dual=False, loss='squared_epsilon_insensitive', \
                    epsilon=0.1, tol=0.001, verbose=False)  # epsilon is "-p" in C's liblinear and tol is "-e"
    clf.fit(XX, np.linspace(1,T,T))

    return clf.coef_
Example #3
    def train(self, trainSet):
        pntNum = trainSet.meanShape.shape[0]
        treeNum = int(self.maxTreeNum/pntNum)
        
        ### Train the random forests
        begTime = time.time()
        for i in range(pntNum):
            rf = RandForest(treeDepth = self.treeDepth,
                            treeNum   = treeNum,
                            feaNum    = self.feaNum,
                            radius    = self.radius,
                            binNum    = self.binNum,
                            feaRange  = self.feaRange)
            rf.train(trainSet, i)
            self.rfs.append(rf)
        elapse = getTimeByStamp(begTime, time.time(), 'min')
        print("\t\tRandom Forest     : %f mins"%elapse)

        ### Extract the local binary features
        begTime = time.time()
        feas = self.genFeaOnTrainset(trainSet)
        elapse = getTimeByStamp(begTime, time.time(), 'min')
        print("\t\tExtract LBFs      : %f mins"%elapse)

        ### Global regression
        begTime = time.time()
        y = trainSet.residuals
        y = y.reshape(y.shape[0], y.shape[1]*y.shape[2])
        for i in range(pntNum*2):
            ### TODO Show the training result 
            reg=LinearSVR(epsilon=0.0, 
                          C = 1.0/feas.shape[0],
                          loss='squared_epsilon_insensitive',
                          fit_intercept = True)
            reg.fit(feas, y[:, i])
            self.regs.append(reg)
        elapse = getTimeByStamp(begTime, time.time(), 'min')
        print("\t\tGlobal Regression : %f mins"%elapse)

        ### Update the initshapes
        begTime = time.time()
        for i in range(pntNum):
            regX = self.regs[2*i]
            regY = self.regs[2*i+1]
            
            x = regX.predict(feas)
            y = regY.predict(feas)
            delta = NP.squeeze(NP.dstack((x,y)))
            delta = Affine.transPntsForwardWithDiffT(delta, 
                                                     trainSet.ms2reals)
            delta = NP.multiply(delta, 
                                trainSet.bndBoxs[:,[2,3]])
            trainSet.initShapes[:,i,:] = trainSet.initShapes[:,i,:] + delta
        elapse = getTimeByStamp(begTime, time.time(), 'min')
        print("\t\tUpdate Shape      : %f mins"%elapse)
Example #4
def main(train_file, model_file):
    train_x, train_y = load_trainingData(train_file)
    #LR = LinearRegression(normalize = True)
    #LR = Ridge(alpha=0.5)
    #LR = SVR(C=1.0, epsilon=0.2, verbose = True)
    LR = LinearSVR(verbose = 1, epsilon = 0.1)
    logging("training model...")
    starttime = datetime.now()
    LR.fit(train_x, train_y)
    logging("training model, eplased time:%s" % str(datetime.now() - starttime))
    logging("saving model")
    joblib.dump(LR, model_file)
Example #5
 def GlobalRegression(self, lbf, shape_residual):
     m = K
     n, f = lbf.shape
     # prepare linear regression X, Y
     X = lbf
     Y = shape_residual / img_o_width
     # parallel
     for i in range(landmark_n*2):
         reg = LinearSVR(epsilon=0.0, C=1.0/n, loss='squared_epsilon_insensitive', fit_intercept = True)
         reg.fit(X, Y[:, i])            
         self.w[i] = reg.coef_
     self.w = self.w * img_o_width        
Example #6
class SVRR(object):

    def __init__(self, C):
        self.regression = LinearSVR(C=C)

    def fit(self, xs, ys):
        xs = xs.values
        ys = ys['y']
        self.regression.fit(xs, ys)

    def predict(self, xs):
        xs = xs.values
        ys = self.regression.predict(xs)
        return ys
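# A hedged usage sketch for the SVRR wrapper above (toy pandas frames; the names are
# hypothetical and the class relies on the LinearSVR import of its original module):
import numpy as np
import pandas as pd
X_df = pd.DataFrame(np.random.rand(20, 3), columns=['a', 'b', 'c'])
y_df = pd.DataFrame({'y': np.random.rand(20)})
model = SVRR(C=1.0)
model.fit(X_df, y_df)
preds = model.predict(X_df)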
Example #7
    def globalRegress(self, posSet, negSet):
        self.feaDim = self.getFeaDim()
        ### Extract the local binary features
        begTime = time.time()
        posFeas = self.genFeaOnTrainset(posSet)
        negFeas = self.genFeaOnTrainset(negSet)
        t = getTimeByStamp(begTime, time.time(), 'min')
        print("\t\tExtract LBFs      : %f mins"%t)

        ### Global regression
        begTime = time.time()
        y = posSet.residuals
        y = y.reshape(y.shape[0], y.shape[1]*y.shape[2])
        for i in range(posSet.pntNum*2):
            ### TODO Show the training result 
            reg=LinearSVR(epsilon=0.0, 
                          C = 1.0/posFeas.shape[0],
                          loss='squared_epsilon_insensitive',
                          fit_intercept = True)
            reg.fit(posFeas, y[:, i])
            self.globalReg.append(reg)
        t = getTimeByStamp(begTime, time.time(), 'min')
        print("\t\tGlobal Regression : %f mins"%t)

        ### Update the initshapes
        begTime = time.time()
        for i in range(posSet.pntNum):
            regX = self.globalReg[2*i]
            regY = self.globalReg[2*i+1]
            
            x = regX.predict(posFeas)
            y = regY.predict(posFeas)
            delta = NP.squeeze(NP.dstack((x,y)))
            delta = NP.multiply(delta,
                                posSet.winSize)
            posSet.initShapes[:,i,:] = posSet.initShapes[:,i,:] + delta
            x = regX.predict(negFeas)
            y = regY.predict(negFeas)
            delta = NP.squeeze(NP.dstack((x,y)))
            delta = NP.multiply(delta,
                                negSet.winSize)
            negSet.initShapes[:,i,:] = negSet.initShapes[:,i,:] + delta
        t = getTimeByStamp(begTime, time.time(), 'min')

        self.applyPntOffsetIntoTree()
        print("\t\tUpdate Shape      : %f mins"%t)
Example #8
def meta_model_fit(X_train, y_train, svm_hardness, fit_intercept, number_of_threads, regressor_type="LinearSVR"):
    """
    Trains meta-labeler for predicting number of labels for each user.

    Based on: Tang, L., Rajan, S., & Narayanan, V. K. (2009, April).
              Large scale multi-label classification via metalabeler.
              In Proceedings of the 18th international conference on World wide web (pp. 211-220). ACM.
    """
    if regressor_type == "LinearSVR":
        if X_train.shape[0] > X_train.shape[1]:
            dual = False
        else:
            dual = True

        model = LinearSVR(C=svm_hardness, random_state=0, dual=dual,
                          fit_intercept=fit_intercept)
        y_train_meta = y_train.sum(axis=1)
        model.fit(X_train, y_train_meta)
    else:
        print("Invalid regressor type.")
        raise RuntimeError

    return model
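# A hedged usage sketch for the meta-labeler above (toy arrays; rounding the regression
# output to obtain a per-user label count is an assumption about the downstream step):
import numpy as np
X_toy = np.random.rand(60, 100)                      # 60 users, 100 features
Y_toy = (np.random.rand(60, 8) > 0.7).astype(float)  # binary label-indicator matrix
meta_model = meta_model_fit(X_toy, Y_toy, svm_hardness=1.0, fit_intercept=True,
                            number_of_threads=1, regressor_type="LinearSVR")
label_counts = np.clip(np.rint(meta_model.predict(X_toy)), 0, Y_toy.shape[1]).astype(int)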
Example #9
def build_svm(x_train, y_train, x_test, y_test, n_features):
    """
    Constructing a support vector regression model from input dataframe
    :param x_train: features dataframe for model training
    :param y_train: target dataframe for model training
    :param x_test: features dataframe for model testing
    :param y_test: target dataframe for model testing
    :return: None
    """

    clf = LinearSVR(random_state=1, dual=False, epsilon=0,
                    loss='squared_epsilon_insensitive')
    # Random state has int value for non-random sampling
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    # Mean absolute error regression loss
    mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred)
    # Mean squared error regression loss
    mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred)
    # Median absolute error regression loss
    median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred)
    # R^2 (coefficient of determination) regression score function
    r2 = sklearn.metrics.r2_score(y_test, y_pred)
    # Explained variance regression score function
    exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred)

    with open('../trained_networks/svm_%d_data.pkl' % n_features, 'wb') as results:
        pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL)

    return
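# A hedged usage sketch for build_svm above (toy arrays; assumes the '../trained_networks/'
# directory exists and that sklearn and pickle are imported in this module):
import numpy as np
from sklearn.model_selection import train_test_split
X_toy, y_toy = np.random.rand(200, 10), np.random.rand(200)
x_tr, x_te, y_tr, y_te = train_test_split(X_toy, y_toy, random_state=1)
build_svm(x_tr, y_tr, x_te, y_te, n_features=X_toy.shape[1])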
Example #10
    class LinearSVRPermuteCoef:
        def __init__(self, **kwargs):
            self.model = LinearSVR(**kwargs)

        def fit(self, X, y):
            self.model.fit(X, y)

            self.coef_ = self.model.coef_
            self.intercept_ = self.model.intercept_

            def add_coef(arr, fn):
                arr.append(fn(self.coef_))

            add_coef(coeffs_state['max'], np.max)
            add_coef(coeffs_state['min'], np.min)

            return self

        def get_params(self, deep=True):
            return self.model.get_params(deep)

        def set_params(self, **kwargs):
            self.model.set_params(**kwargs)
            return self

        def predict(self, X):
            return self.model.predict(X)

        def score(self, X, y, sample_weight=None):
            if sample_weight is not None:
                return self.model.score(X, y, sample_weight)
            else:
                return self.model.score(X, y)

        @staticmethod
        def permute_min_coefs():
            return coeffs_state['min']

        @staticmethod
        def permute_max_coefs():
            return coeffs_state['max']

        @staticmethod
        def reset_perm_coefs():
            coeffs_state['min'] = []
            coeffs_state['max'] = []
Example #11
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVR

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE',
                          delimiter='COLUMN_SEPARATOR',
                          dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1),
                     tpot_data.dtype.names.index('class'),
                     axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['class'], random_state=42)

exported_pipeline = LinearSVR(C=25.0,
                              dual=False,
                              epsilon=0.001,
                              loss="squared_epsilon_insensitive",
                              tol=1e-05)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #12
    gamma, C = hyperparams[i]
    plt.title(r"$\gamma = {}, C = {}$".format(gamma, C), fontsize=16)

#save_fig("moons_rbf_svc_plot")
plt.show()

#%% SVR Regression

np.random.seed(42)
m = 50
X = 2 * np.random.rand(m, 1)
y = (4 + 3 * X + np.random.randn(m, 1)).ravel()

from sklearn.svm import LinearSVR

svm_reg = LinearSVR(epsilon=1.5, random_state=42)
svm_reg.fit(X, y)

svm_reg1 = LinearSVR(epsilon=1.5, random_state=42)
svm_reg2 = LinearSVR(epsilon=0.5, random_state=42)
svm_reg1.fit(X, y)
svm_reg2.fit(X, y)


def find_support_vectors(svm_reg, X, y):
    y_pred = svm_reg.predict(X)
    off_margin = (np.abs(y - y_pred) >= svm_reg.epsilon)
    return np.argwhere(off_margin)


svm_reg1.support_ = find_support_vectors(svm_reg1, X, y)
Example #13
target_col = col[2]
features = col[3:len(col)]

X = data[features].values
y = data[target_col].values
y = np.log1p(y)
y = np.reshape(y, (-1,1))

###############################################################################

# Model configuration

base = make_pipeline(
    StackingEstimator(estimator=LassoLarsCV(normalize=True)),
    StackingEstimator(estimator=LinearSVR(C=0.01, dual=True, epsilon=0.001, loss="epsilon_insensitive", tol=0.1)),
    MaxAbsScaler(),
    StackingEstimator(estimator=RidgeCV()),
    Normalizer(norm="l2"),
    StackingEstimator(estimator=LinearSVR(C=0.5, dual=False, epsilon=0.1, loss="squared_epsilon_insensitive", tol=0.1)),
    StackingEstimator(estimator=ExtraTreesRegressor(bootstrap=False, max_features=0.4, min_samples_leaf=2, min_samples_split=4, n_estimators=100)),
    MinMaxScaler(),    
    StackingEstimator(estimator=RidgeCV()),
    StackingEstimator(estimator=LinearSVR(C=5.0, dual=True, epsilon=0.1, loss="epsilon_insensitive", tol=0.0001)),
    StackingEstimator(estimator=RidgeCV()),
    StackingEstimator(estimator=SGDRegressor()),
    RobustScaler(),
    StackingEstimator(estimator=LinearSVR(C=15.0, dual=True, epsilon=0.01, loss="epsilon_insensitive", tol=0.1)),
    StackingEstimator(estimator=ElasticNetCV(l1_ratio=0.75, tol=0.001)),
    StackingEstimator(estimator=XGBRegressor(learning_rate=0.1, max_depth=1, min_child_weight=6, n_estimators=100, nthread=1, objective="reg:squarederror", subsample=0.6500000000000001)),
    MinMaxScaler(),
Example #14
	combined = np.append(X, np.matrix(Y).T, axis=1)
	np.random.shuffle(combined)
	tail_size = -1 * size
	last_column = X.shape[1]
	training_labels = combined[:tail_size, last_column]
	training_data = combined[:tail_size, :-2]
	test_data = combined[tail_size:, :-2]
	actual_labels = combined[tail_size:, last_column]
	return training_data, np.ravel(training_labels), test_data, np.ravel(actual_labels)

training = open('author_features')
NO_TRAINING_SAMPLES = 6000
NO_OF_AUTHORS = 10000
matrix = dok_matrix((NO_TRAINING_SAMPLES, NO_OF_AUTHORS), dtype=int)
for line in training.readlines():
	values = line.rstrip().split()
	matrix[int(values[0]), int(values[1])] = 1

labels_file = open('year_training_labels')
labels = [int(x) for x in labels_file.readline().rstrip().split()]

training_matrix = matrix[:4498]
training_data, training_labels, test_data, actual_labels = sample(training_matrix, labels)
classifier = LinearSVR()
classifier.fit(training_data, training_labels)
output = classifier.predict(test_data)
for index, predicted in enumerate(output):
	print('%s %s' % (predicted, actual_labels[index]))

print(metrics.explained_variance_score(actual_labels, output))
Example #15
with open("C:/Users/sean/Desktop/SVR_DATA/edwademd.csv","rb") as data_file:
    data,target = [],[]
    for row in csv.reader(data_file):
        data += [[row[0],row[4],row[6],row[10]]]
        target += [row[9]]

data,target = Lin_clean_data(data[1:],target[1:],2)

point = 2000
X_train = data[:point-1]
X_test = data[point:point+int(point*0.2)]
y_train = target[:point-1]
y_test = target[point:point+int(point*0.2)]


svr = LinearSVR(C=0.1)
svr_model = svr.fit(X_train,y_train)
lin = svr.predict(X_train)
lin_test = svr.predict(X_test)

lin,lin_test = data_normalize(y_train,y_test,lin,lin_test)

print("Train score : ",score(y_train,lin))
print("Train average error : ",sum(abs(y_train-lin)) / float(len(y_train)))

print("Fit score : ",score(y_test,lin_test))
print("Fit average error : ",sum(abs(y_test-lin_test)) / float(len(y_test)))

figure1 = plt.figure(1,figsize=[20,10])
draw_pic(range(len(X_train)),range(len(X_test)),lin,lin_test,y_train,y_test,label='lin',figure=figure1)
figure1.savefig("C:/Users/sean/Desktop/SVR_DATA/linSVR.png",dpi=300,format="png")
Example #16
def linearSVR(train,trainLable,testData):
    clf = LinearSVR()  
    clf.fit(train,trainLable)  
    predict = clf.predict(testData)  
    return predict  
Example #17
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVR
import proc_temperature as pt

lr = LinearRegression(fit_intercept=False)
rf = RandomForestRegressor()
svr = LinearSVR()
import Train.split_by_season as ss

# Energy in bottle
tot_en = 2212  # in MJ, 45kg*1.856*26
base_cons = 4.5  # 9MJ burner burning for half an hour everyday
heater_max_config = 15  # MJ, can be 25 as well
heater_duration = 4  # hours
heater_energy_cons = heater_max_config * heater_duration
hot_water_max_config = 125  # MJ/hour for 16 L/Min hot water and 199 MJ/hour for 26L/Min hotwater
hot_water_duration = 5  # Minutes
hw_energy_cons = hot_water_max_config * hot_water_duration / 60
max_cons = hw_energy_cons + heater_energy_cons + base_cons

alpha = 12  # Temperature below which heater and hot water will start pushing the consumption up the slope
beta = 6  # Temperature at which max_cons is being consumed at the house
gamma = (max_cons - base_cons) / (alpha - beta)
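# Worked numbers for the default constants above (approximate): heating 15 MJ * 4 h = 60 MJ,
# hot water 125 MJ/h * (5/60) h ~= 10.4 MJ, so max_cons ~= 74.9 MJ and
# gamma ~= (74.9 - 4.5) / (12 - 6) ~= 11.7 MJ of extra consumption per degree below alpha.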


class MLE:
    def __init__(self, binsize, cols):
        self.binsize = binsize
Example #18
import numpy as np

from sklearn.linear_model import LassoLarsCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.svm import LinearSVR
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE',
                          delimiter='COLUMN_SEPARATOR',
                          dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1),
                     tpot_data.dtype.names.index('class'),
                     axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['class'], random_state=42)

exported_pipeline = make_pipeline(
    StackingEstimator(estimator=LassoLarsCV(normalize=True)),
    LinearSVR(C=0.0001,
              dual=False,
              epsilon=0.1,
              loss="squared_epsilon_insensitive",
              tol=0.0001))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #19
def dict_method_reg():
    dict_method = {}
    # 1st part
    """1SVR"""
    me1 = SVR(kernel='rbf', gamma='auto', degree=3, tol=1e-3, epsilon=0.1, shrinking=False, max_iter=2000)
    cv1 = 5
    scoring1 = 'r2'
    param_grid1 = [{'C': [1, 0.75, 0.5, 0.25, 0.1], 'epsilon': [0.01, 0.001, 0.0001]}]
    dict_method.update({"SVR-set": [me1, cv1, scoring1, param_grid1]})

    """2BayesianRidge"""
    me2 = BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False,
                        copy_X=True, fit_intercept=True, lambda_1=1e-06, lambda_2=1e-06,
                        n_iter=300, normalize=False, tol=0.01, verbose=False)
    cv2 = 5
    scoring2 = 'r2'
    param_grid2 = [{'alpha_1': [1e-07, 1e-06, 1e-05], 'alpha_2': [1e-07, 1e-05, 1e-03]}]
    dict_method.update({'BayR-set': [me2, cv2, scoring2, param_grid2]})

    """3SGDRL2"""
    me3 = SGDRegressor(alpha=0.0001, average=False,
                       epsilon=0.1, eta0=0.01, fit_intercept=True, l1_ratio=0.15,
                       learning_rate='invscaling', loss='squared_loss', max_iter=1000,
                       penalty='l2', power_t=0.25,
                       random_state=0, shuffle=True, tol=0.01,
                       verbose=0, warm_start=False)
    cv3 = 5
    scoring3 = 'r2'
    param_grid3 = [{'alpha': [100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 1e-05]}]
    dict_method.update({'SGDRL2-set': [me3, cv3, scoring3, param_grid3]})

    """4KNR"""
    me4 = neighbors.KNeighborsRegressor(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2,
                                        metric='minkowski')
    cv4 = 5
    scoring4 = 'r2'
    param_grid4 = [{'n_neighbors': [3, 4, 5, 6]}]
    dict_method.update({"KNR-set": [me4, cv4, scoring4, param_grid4]})

    """5kernelridge"""
    kernel = 1.0 * RBF(1.0)
    me5 = kernel_ridge.KernelRidge(alpha=1, kernel=kernel, gamma="scale", degree=3, coef0=1, kernel_params=None)
    cv5 = 5
    scoring5 = 'r2'
    param_grid5 = [{'alpha': [100, 10, 1, 0.1, 0.01, 0.001]}]
    dict_method.update({'KRR-set': [me5, cv5, scoring5, param_grid5]})

    """6GPR"""
    # kernel = 1.0 * RBF(1.0)
    kernel = Matern(length_scale=0.1, nu=0.5)
    me6 = gaussian_process.GaussianProcessRegressor(kernel=kernel, alpha=1e-10, optimizer='fmin_l_bfgs_b',
                                                    n_restarts_optimizer=10,
                                                    normalize_y=False, copy_X_train=True, random_state=0)
    cv6 = 5
    scoring6 = 'r2'
    param_grid6 = [{'alpha': [1e-11, 1e-10, 1e-9, 1e-8, 1e-7]}]
    dict_method.update({"GPR-set": [me6, cv6, scoring6, param_grid6]})

    # 2nd part

    """6RFR"""
    me7 = ensemble.RandomForestRegressor(n_estimators=100, max_depth=None, min_samples_split=2, min_samples_leaf=1,
                                         min_weight_fraction_leaf=0.0, max_leaf_nodes=None, min_impurity_decrease=0.0,
                                         min_impurity_split=None, bootstrap=True, oob_score=False,
                                         random_state=None, verbose=0, warm_start=False)
    cv7 = 5
    scoring7 = 'r2'
    param_grid7 = [{'max_depth': [3, 4, 5, 6]}]
    dict_method.update({"RFR-em": [me7, cv7, scoring7, param_grid7]})

    """7GBR"""
    me8 = ensemble.GradientBoostingRegressor(loss='ls', learning_rate=0.1, n_estimators=100,
                                             subsample=1.0, criterion='friedman_mse', min_samples_split=2,
                                             min_samples_leaf=1, min_weight_fraction_leaf=0.,
                                             max_depth=3, min_impurity_decrease=0.,
                                             min_impurity_split=None, init=None, random_state=None,
                                             max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None,
                                             warm_start=False, presort='auto')
    cv8 = 5
    scoring8 = 'r2'
    param_grid8 = [{'max_depth': [3, 4, 5, 6]}]
    dict_method.update({'GBR-em': [me8, cv8, scoring8, param_grid8]})

    "AdaBR"
    dt = DecisionTreeRegressor(criterion="mae", splitter="best", max_features=None, max_depth=3, min_samples_split=2)
    me9 = AdaBoostRegressor(dt, n_estimators=100, learning_rate=1, loss='square', random_state=0)
    cv9 = 5
    scoring9 = 'r2'
    param_grid9 = [{'n_estimators': [50, 120, 100, 200]}]
    dict_method.update({"AdaBR-em": [me9, cv9, scoring9, param_grid9]})

    '''TreeR'''
    me10 = DecisionTreeRegressor(
        criterion='mse', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1,
        min_weight_fraction_leaf=0.0, max_features=None, random_state=0, max_leaf_nodes=None,
        min_impurity_decrease=0.0, min_impurity_split=None, presort=False)
    cv10 = 5
    scoring10 = 'r2'
    param_grid10 = [{'max_depth': [3, 4, 5, 6], 'min_samples_split': [2, 3, 4]}]
    dict_method.update({'TreeC-em': [me10, cv10, scoring10, param_grid10]})

    'ElasticNet'
    me11 = ElasticNet(alpha=1.0, l1_ratio=0.7, fit_intercept=True, normalize=False, precompute=False, max_iter=1000,
                      copy_X=True, tol=0.0001, warm_start=False, positive=False, random_state=None)

    cv11 = 5
    scoring11 = 'r2'
    param_grid11 = [{'alpha': [0.0001, 0.001, 0.01, 0.1, 1], 'l1_ratio': [0.3, 0.5, 0.8]}]
    dict_method.update({"ElasticNet-L1": [me11, cv11, scoring11, param_grid11]})

    'Lasso'
    me12 = Lasso(alpha=1.0, fit_intercept=True, normalize=False, precompute=False, copy_X=True, max_iter=1000,
                 tol=0.001,
                 warm_start=False, positive=False, random_state=None, )

    cv12 = 5
    scoring12 = 'r2'
    param_grid12 = [{'alpha': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 10, 100, 1000]}, ]
    dict_method.update({"Lasso-L1": [me12, cv12, scoring12, param_grid12]})

    """SGDRL1"""
    me13 = SGDRegressor(alpha=0.0001, average=False,
                        epsilon=0.1, eta0=0.01, fit_intercept=True, l1_ratio=0.15,
                        learning_rate='invscaling', loss='squared_loss', max_iter=1000,
                        penalty='l1', power_t=0.25,
                        random_state=0, shuffle=True, tol=0.01,
                        verbose=0, warm_start=False)
    cv13 = 5
    scoring13 = 'r2'
    param_grid13 = [{'alpha': [100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 1e-5, 1e-6, 1e-7], "epsilon": [0.1, 0.2, 1]}]
    dict_method.update({'SGDR-L1': [me13, cv13, scoring13, param_grid13]})

    """LinearSVR"""
    me14 = LinearSVR(epsilon=0.0, tol=1e-4, C=1.0,
                     loss='epsilon_insensitive', fit_intercept=True,
                     intercept_scaling=1., dual=True, verbose=0,
                     random_state=3, max_iter=1000)
    cv14 = 5
    scoring14 = 'r2'
    param_grid14 = [{'C': [10, 6, 5, 3, 2.5, 1, 0.75, 0.5, 0.25, 0.1], 'epsilon': [0.0, 0.1]}]
    dict_method.update({"LinearSVR-set": [me14, cv14, scoring14, param_grid14]})

    return dict_method
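# A hedged sketch of consuming one entry of the dictionary above with GridSearchCV
# (X and y are hypothetical arrays; the project's own consumer appears to be score_muti in Example #20):
from sklearn.model_selection import GridSearchCV
estimator, cv, scoring, param_grid = dict_method_reg()["LinearSVR-set"]
search = GridSearchCV(estimator, param_grid, cv=cv, scoring=scoring)
# search.fit(X, y); print(search.best_params_, search.best_score_)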
Example #20
st = preprocessing.MinMaxScaler()
x = st.fit_transform(x)

method = ["SVR-set", "AdaBR-em", 'GBR-em', "LinearSVR-set"]

result = score_muti(x, y, me="reg", paras=True, method_name=method, shrink=1, str_name=False, param_grid=None)

from sklearn.model_selection import cross_val_predict

pre_y = cross_val_predict(
    SVR(C=1, cache_size=200, coef0=0.0, degree=3, epsilon=0.001, gamma='auto',
        kernel='rbf', max_iter=2000, shrinking=False, tol=0.001, verbose=False)
    , x, y, ) - cut
lin = LinearSVR(C=10, dual=True, epsilon=0.0, fit_intercept=True,
                intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
                random_state=3, tol=0.0001, verbose=0)

lin.fit(x, y)
pre_y2 = lin.predict(x) - cut

print(result[0])
print(result[1][-1].coef_)
print(result[1][-1].intercept_)

coef = lin.coef_
inter = lin.intercept_
data_max_ = st.data_max_
data_min_ = st.data_min_
data_range = st.data_range_
Example #21
# with data with a very low Signal-to-noise ratio as one would expect from financial data. It is also a very fast algorithm (liblinear is heavily optimized).
# We will do a grid search with 5-fold GroupKFold cross-validation. As mentioned earlier, because returns are not independent of Feature_7, we have
# to group our cross-validation in order to avoid data leakage and hence overestimation of the CV performance*.
#
# Ideally, we should optimize using a loss function suitable for optimizing Weighted Mean Absolute Error (which is non-differentiable at 0). We did not
# prioritize this, and we still got reasonable results in the model scoring.
#
# (*)See: https://stats.stackexchange.com/questions/95797/how-to-split-the-dataset-for-cross-validation-learning-curve-and-final-evaluat
# and http://www.jmlr.org/papers/volume11/cawley10a/cawley10a.pdf
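# A hedged sketch of the grouped grid search described above (the parameter grid, the
# grouping column name and the X / y frames are assumptions, not the notebook's objects):
from sklearn.model_selection import GridSearchCV, GroupKFold
from sklearn.svm import LinearSVR
example_search = GridSearchCV(
    LinearSVR(epsilon=0.0, loss='squared_epsilon_insensitive', random_state=0),
    param_grid={'C': [1e-4, 5e-4, 1e-3]},
    cv=GroupKFold(n_splits=5))
# example_search.fit(X, y, groups=X['Feature_7'])  # keeps correlated rows in the same fold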

# %%
print('Building model...')

# Define initial model
model = LinearSVR(epsilon=0.0,
                  C=0.0005,
                  loss='squared_epsilon_insensitive',
                  random_state=0)  # 1727.860

# Define model pipeline for multi output regression
multi_out_reg = MultiOutputRegressor(model)
model_pipeline = Pipeline(
    steps=[('preprocessor', preprocessor_X), ('multioutreg', multi_out_reg)])
estimator = TransformedTargetRegressor(regressor=model_pipeline,
                                       transformer=preprocessor_Y)


def WA(a, axis, weight):
    # Adapted from function_base.py
    a = np.asanyarray(a)
    wgt = np.asanyarray(weight)
    wgt = np.broadcast_to(wgt, (a.ndim - 1) * (1, ) + wgt.shape)
Example #22
import numpy as np
import pandas as pd
from sklearn.svm import LinearSVR, LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
# Because of the previous lesson, these snippets skip normalization and result analysis

# First, compare LinearSVR with LinearRegression on the same dataset

data = pd.read_csv("./Folds5x2_pp.csv", header=0, encoding="gbk")
X = data[['AT', 'V', 'AP', 'RH']]
y = data[['PE']]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=10)  # split into training and test sets
svr_Linear = LinearSVR(random_state=0)
svr_Linear.fit(X_train, y_train)
print("SVR_score:", svr_Linear.score(X_train, y_train))
liner = LinearRegression()
liner.fit(X_train, y_train)
print("Linearmodel_score:", liner.score(X_train, y_train))
# In this example you will see that LinearRegression fits better

# Next, compare LinearSVC with LogisticRegression on the same dataset
URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
wine_dataset = pd.read_csv(URL, header=None)
wine_dataset.columns = [
    'class label', 'F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9', 'F10',
    'F11', 'F12', 'F13'
]
X, y = wine_dataset.iloc[:, 1:].values, wine_dataset.iloc[:, 0].values
Example #23
    ax.set_xlim([2000, 2020.5])
    ax.set_ylim([-7500,60000])
    ax.set_xlabel("Year")
    ax.set_ylabel("Photos")
    fig.savefig(path+"images/svrprediction_cluster" + str(c) + ".jpg")
    
    
    
    
###########################################################
### PREDICTING VALUES WITH LINEAR SUPPORT VECTOR REGRESSION
###########################################################

# http://scikit-learn.org/stable/auto_examples/plot_kernel_ridge_regression.html

linsvr = LinearSVR(epsilon=0.0, tol=1e-4, C=1.0, loss='epsilon_insensitive')

param_grid ={'epsilon': [0.0, 0.1,0.2,0.3,0.4],
             'C': [1, 10, 100, 1000],
            'loss':['epsilon_insensitive','squared_epsilon_insensitive']}
linsvr_grid = GridSearchCV(linsvr, param_grid, cv=6, n_jobs=-1)
y_linsvr = linsvr_grid.fit(explanatory_df, response_series)
linbest_estimator = linsvr_grid.best_estimator_

print "Best epsilon: %s" %linbest_estimator.epsilon
print "Best C: %s" %linbest_estimator.C
print "Best Loss Function: %s" %linbest_estimator.loss
print "R-squared: %f" % linsvr_grid.score(explanatory_df,response_series)


# Create dataframe of number of points in each cluster per year
Example #24
def QuickML_Ensembling(X_train,
                       y_train,
                       X_test,
                       y_test='',
                       modeltype='Regression',
                       Boosting_Flag=False,
                       scoring='',
                       verbose=0):
    """
    Quickly builds and runs multiple models for a clean data set(only numerics).
    """
    start_time = time.time()
    seed = 99
    FOLDS = 5
    model_dict = {}
    model_tuples = []
    if len(X_train) <= 100000 and X_train.shape[1] < 50:
        NUMS = 100
    else:
        try:
            X_train = X_train.sample(frac=0.30, random_state=99)
            y_train = y_train[X_train.index]
        except:
            pass
        NUMS = 200
    if modeltype == 'Regression':
        if scoring == '':
            scoring = 'neg_mean_squared_error'
        #scv = ShuffleSplit(n_splits=FOLDS,random_state=seed)
        scv = KFold(n_splits=FOLDS, shuffle=False)  # random_state only applies when shuffle=True
        if Boosting_Flag is None:
            ## Create an ensemble model ####
            model5 = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(
                random_state=seed, max_depth=1, min_samples_leaf=2),
                                       n_estimators=NUMS,
                                       random_state=seed)
            model_tuples.append(('Adaboost', model5))
        elif not Boosting_Flag:
            model5 = LassoLarsCV(cv=scv)
            model_tuples.append(('LassoLarsCV', model5))
        else:
            model5 = LassoLarsCV(cv=scv)
            model_tuples.append(('LassoLarsCV', model5))
        if Boosting_Flag is None:
            model6 = BaggingRegressor(DecisionTreeRegressor(random_state=seed),
                                      n_estimators=NUMS,
                                      random_state=seed)
            model_tuples.append(('Bagging_Regressor', model6))
        elif not Boosting_Flag:
            model6 = LinearSVR()
            model_tuples.append(('Linear_SVR', model6))
        else:
            model6 = DecisionTreeRegressor(max_depth=5, min_samples_leaf=2)
            model_tuples.append(('Decision_Tree', model6))
        model7 = KNeighborsRegressor(n_neighbors=8)
        model_tuples.append(('KNN_Regressor', model7))
        if Boosting_Flag is None:
            #### If the Boosting_Flag is True, it means Boosting model is present.
            ###   So choose a different kind of classifier here
            model8 = DecisionTreeRegressor(max_depth=5, min_samples_leaf=2)
            model_tuples.append(('Decision_Tree', model8))
        elif not Boosting_Flag:
            #### If the Boosting_Flag is True, it means Boosting model is present.
            ###   So choose a different kind of classifier here
            model8 = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(
                random_state=seed, max_depth=1, min_samples_leaf=2),
                                       n_estimators=NUMS,
                                       random_state=seed)
            model_tuples.append(('Adaboost', model8))
        else:
            model8 = RandomForestRegressor(bootstrap=False,
                                           max_depth=10,
                                           max_features='auto',
                                           min_samples_leaf=2,
                                           n_estimators=200,
                                           random_state=99)
            model_tuples.append(('RF_Regressor', model8))
    else:
        if scoring == '':
            scoring = 'accuracy'
        num_classes = len(np.unique(y_test))
        scv = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=seed)
        if Boosting_Flag is None:
            ## Create an ensemble model ####
            model5 = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(
                random_state=seed, max_depth=1, min_samples_leaf=2),
                                        n_estimators=NUMS,
                                        random_state=seed)
            model_tuples.append(('Adaboost', model5))
        elif not Boosting_Flag:
            model5 = LinearDiscriminantAnalysis()
            model_tuples.append(('Linear_Discriminant', model5))
        else:
            model5 = LogisticRegressionCV(Cs=[0.001, 0.01, 0.1, 1, 10, 100],
                                          solver='liblinear',
                                          random_state=seed)
            model_tuples.append(('Logistic_Regression_CV', model5))
        if Boosting_Flag is None:
            model6 = DecisionTreeClassifier(max_depth=5, min_samples_leaf=2)
            model_tuples.append(('Decision_Tree', model6))
        elif not Boosting_Flag:
            model6 = LinearSVC()
            model_tuples.append(('Linear_SVC', model6))
        else:
            model6 = DecisionTreeClassifier(max_depth=5, min_samples_leaf=2)
            model_tuples.append(('Decision_Tree', model6))
        if modeltype == 'Binary_Classification':
            model7 = GaussianNB()
        else:
            model7 = MultinomialNB()
        model_tuples.append(('Naive_Bayes', model7))
        if Boosting_Flag is None:
            #### If the Boosting_Flag is True, it means Boosting model is present.
            ###   So choose a different kind of classifier here
            model8 = RandomForestClassifier(bootstrap=False,
                                            max_depth=10,
                                            max_features='auto',
                                            min_samples_leaf=2,
                                            n_estimators=200,
                                            random_state=99)
            model_tuples.append(('Bagging_Classifier', model8))
        elif not Boosting_Flag:
            #### If the Boosting_Flag is True, it means Boosting model is present.
            ###   So choose a different kind of classifier here
            sgd_best_model = SGDClassifier(alpha=1e-06,
                                           loss='log',
                                           max_iter=1000,
                                           penalty='l2',
                                           learning_rate='constant',
                                           eta0=.1,
                                           random_state=3,
                                           tol=None)
            model8 = OneVsRestClassifier(sgd_best_model)
            model_tuples.append(('One_vs_Rest_Classifier', model8))
        else:
            model8 = RandomForestClassifier(bootstrap=False,
                                            max_depth=10,
                                            max_features='auto',
                                            min_samples_leaf=2,
                                            n_estimators=200,
                                            random_state=99)
            model_tuples.append(('Bagging_Classifier', model8))
    model_dict = dict(model_tuples)
    models, results = run_ensemble_models(model_dict, X_train, y_train, X_test,
                                          y_test, scoring, modeltype)
    return models, results
    print "\n--------------------------------------------"
    print "----------- Fold %d -----------------------" %i
    print "--------------------------------------------"
    
    val_id = fold_ids.iloc[:, i].dropna()
    idx = train["Id"].isin(list(val_id))
    
    trainingSet = train[~idx]
    validationSet = train[idx]
    
    tr_X = np.matrix(trainingSet[feature_names])
    tr_Y = np.array(trainingSet["Response"])
    val_X = np.matrix(validationSet[feature_names])
    val_Y = np.array(validationSet["Response"])
    
    regm = LinearSVR(C = 0.06, epsilon = 0.45, tol = 1e-5,
                     dual = True, verbose = True, random_state = 133)
                     
    regm.fit(tr_X, tr_Y)    
    preds = regm.predict(val_X)
    
    df = pd.DataFrame(dict({"Id" : validationSet["Id"], "ground_truth" : validationSet["Response"], 
                            "linsvr_preds" : preds}))
    
    linsvr_val = pd.concat([linsvr_val, df], ignore_index=True)
    
    tpreds = regm.predict(test_X)
    cname = "Fold" + `i`
    linsvr_test[cname] = tpreds
    
linsvr_val.to_csv("ensemble2/linsvr_val.csv")
linsvr_test.to_csv("ensemble2/linsvr_test.csv")
Example #26
X_2020 = weightedRunStats2020[[
    'weightWRC', 'weightPA', 'weightH', 'weightAB', 'weightRBI', 'weightG',
    'weight2B'
]].values
y_pred = regressor.predict(X_2020)

# In[241]:

y_pred_list = []  # list of y_pred so we can add it to a dataframe
for i in range(len(y_pred)):
    y_pred_list.append(y_pred[i][0])

weightedRunStats2020['runsPredicted'] = y_pred_list

# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
regr = LinearSVR(random_state=0)
regr.fit(X, Y)

# In[240]:
y_pred = regr.predict(X_2020)
weightedRunStats2020['linearSVRRuns'] = y_pred

regr = SVR()
regr.fit(X, Y)

y_pred = regr.predict(X_2020)
weightedRunStats2020['svrRuns'] = y_pred

weightedRunStats2020 = weightedRunStats2020.sort_values(by=['linearSVRRuns'],
                                                        ascending=False)
print(weightedRunStats2020)
Example #27
 def trainModel(self,Model = "default"):
     if Model == "default":
         self.mlModel = LinearSVR(loss='squared_epsilon_insensitive',dual=False, tol=1e-3)
     else:
         self.mlModel = Model
     self.mlModel.fit(self.X_train, self.y_train)
Example #28
                                             max_depth=1,
                                             min_child_weight=3,
                                             n_estimators=100,
                                             n_jobs=1,
                                             objective="reg:squarederror",
                                             subsample=0.9500000000000001,
                                             verbosity=0)), MinMaxScaler(),
    StackingEstimator(estimator=SGDRegressor(alpha=0.01,
                                             eta0=0.01,
                                             fit_intercept=False,
                                             l1_ratio=0.0,
                                             learning_rate="constant",
                                             loss="huber",
                                             penalty="elasticnet",
                                             power_t=0.0)),
    StackingEstimator(estimator=LinearSVR(C=25.0,
                                          dual=True,
                                          epsilon=0.01,
                                          loss="epsilon_insensitive",
                                          tol=0.0001)),
    FeatureAgglomeration(affinity="l2", linkage="average"),
    SelectPercentile(score_func=f_regression, percentile=6),
    StackingEstimator(estimator=LinearSVR(C=20.0,
                                          dual=True,
                                          epsilon=0.1,
                                          loss="squared_epsilon_insensitive",
                                          tol=0.1)), RidgeCV())

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #29
col_dict = defaultdict(list)
tran_dict = {}
for idx, feature_name in enumerate(feature_names):
    short_name = re.findall('[^=]*',feature_name)[0] #get the part before the equals sign, if there is one
    col_dict[short_name].append(idx)
    pidx = use_cols.index(short_name)
    if predictors[pidx].norm_type in transformer_map:
        tran_dict[use_cols[pidx]] = transformer_map[predictors[pidx].norm_type]
        X[:,idx] = tran_dict[use_cols[pidx]].fit_transform(X[:,idx].reshape(-1,1)).squeeze()

dict_vect.tran_dict = tran_dict

#%% COMPILE LIST OF MODELS TO COMPARE  
Lin_est = Ridge()

svr_est = LinearSVR(epsilon=0)

max_depth=16 
min_samples_leaf=50
min_samples_split=100
n_trees=100 #100
RF_est = RandomForestRegressor(n_estimators=n_trees, max_depth=max_depth, 
                               min_samples_leaf=min_samples_leaf, 
                               min_samples_split=min_samples_split,n_jobs=-1)

GBR_est = GradientBoostingRegressor(learning_rate=0.1, n_estimators=n_trees, 
                                min_samples_leaf=min_samples_leaf,
                                min_samples_split=min_samples_split, 
                                max_depth=2)

#%% Run CV grid search if desired.
Example #30
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: -3.5348482165317705
exported_pipeline = make_pipeline(
    make_union(
        FunctionTransformer(copy),
        make_union(FunctionTransformer(copy), FunctionTransformer(copy))),
    SelectPercentile(score_func=f_regression, percentile=89),
    PolynomialFeatures(degree=2, include_bias=False, interaction_only=False),
    StackingEstimator(estimator=SGDRegressor(alpha=0.0,
                                             eta0=0.01,
                                             fit_intercept=True,
                                             l1_ratio=1.0,
                                             learning_rate="constant",
                                             loss="squared_loss",
                                             penalty="elasticnet",
                                             power_t=50.0)), MaxAbsScaler(),
    MaxAbsScaler(),
    LinearSVR(C=0.5,
              dual=True,
              epsilon=1.0,
              loss="epsilon_insensitive",
              tol=1e-05))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #31

cat_vars = ['DayOfWeek','Promo','StateHoliday','SchoolHoliday','StoreType','Assortment','CompetitionOpenSinceMonth',
            'CompetitionOpenSinceYear','Promo2','Promo2SinceWeek','Promo2SinceYear','PromoInterval','Day','Month','Year']


num_vars = ['Open','Store','CompetitionDistance','ratio1','ratio2']



X_trn, X_val = train_test_split(train, test_size=0.012, random_state=10)

print('Training Stage 1 Models')

#train svm
svm1 = LinearSVR(verbose=True)
svm1.fit(X_trn[cat_vars+num_vars],X_trn['Sales'])
svm1_feature = svm1.predict(train[cat_vars+num_vars])
preds = svm1.predict(X_val[cat_vars+num_vars])
print('svm ', (np.mean(((np.exp(preds)-np.exp(X_val['Sales']))/(np.exp(X_val['Sales'])+1))**2))**0.5)


#train xgb
dtrain = xgb.DMatrix(X_trn[cat_vars+num_vars],X_trn['Sales'])
dvalid = xgb.DMatrix(X_val[cat_vars+num_vars],X_val['Sales'])
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]

num_boost_round = 50
params1 = {"objective": "reg:linear","booster" : "gbtree",
"eta": 0.5,"max_depth": 2,"subsample": 0.5,"colsample_bytree": 0.4,
"nthread":4,"silent": 1,"seed": 1301}
Example #32
def SVMRClassifier(training_data_X, training_data_y, vocab, word_vocab,
                   svmr_type):
    if svmr_type == 'linearsvr':
        pos_vectors = CountVectorizer(vocabulary=vocab,
                                      analyzer='word',
                                      ngram_range=(1, 5),
                                      tokenizer=lambda x: x.split(' '),
                                      lowercase=False)
        text_vectors = CountVectorizer(analyzer='word',
                                       ngram_range=(1, 1),
                                       tokenizer=lambda x: x.split(' '),
                                       lowercase=True)

        classifier = LinearSVR(max_iter=100000)
        parameters = [{
            'C': [0.1, 1, 10],
            'epsilon': [0, 0.1, 1],
            'loss': ('epsilon_insensitive', 'squared_epsilon_insensitive')
        }]
        fclassifier = GridSearchCV(classifier, parameters, cv=5, n_jobs=7)

        feature_cat_1 = FeatureUnion([
            ('POS',
             Pipeline([
                 ('selector', ItemSelector(key='POS')),
                 ('vectorizer', pos_vectors),
                 ('tf', TfidfTransformer(norm='l2', use_idf=True)),
             ])),
            ('text',
             Pipeline([
                 ('selector', ItemSelector(key='text_norm')),
                 ('vectorizer', text_vectors),
                 ('tf', TfidfTransformer(norm='l2', use_idf=True)),
             ])),
            ('gf',
             Pipeline([
                 ('selector', ItemSelector(key='gf')),
                 ('toarray',
                  FunctionTransformer(returnNumpyMatrix, validate=False)),
                 ('tf', TfidfTransformer(norm='l2', use_idf=True)),
             ])),
            ('fa',
             Pipeline([
                 ('selector', ItemSelector(key='fa')),
                 ('toarray',
                  FunctionTransformer(returnNumpyMatrix, validate=False)),
                 ('tf', TfidfTransformer(norm='l2', use_idf=True)),
             ])),
            ('diag_act',
             Pipeline([
                 ('selector', ItemSelector(key='diag_act')),
                 ('toarray',
                  FunctionTransformer(returnNumpyMatrix, validate=False)),
                 ('tf', TfidfTransformer(norm='l2', use_idf=True)),
             ])),
        ])

        feature_cat_2 = FeatureUnion([
            ('word_count',
             Pipeline([
                 ('selector', ItemSelector(key='word_count')),
                 ('toarray',
                  FunctionTransformer(returnNumpyArray, validate=False)),
             ])),
            ('f_measure',
             Pipeline([
                 ('selector', ItemSelector(key='f_measure')),
                 ('toarray',
                  FunctionTransformer(returnNumpyArray, validate=False)),
             ])),
        ])

        text_clf = Pipeline([
            ('features',
             FeatureUnion([
                 ('pipeline', Pipeline([
                     ('features', feature_cat_1),
                 ])),
                 ('pipeline2',
                  Pipeline([
                      ('features', feature_cat_2),
                      ('scaler', MinMaxScaler()),
                  ])),
             ])),
            ('clf', fclassifier),
        ])
    text_clf.fit(training_data_X, training_data_y)
    return text_clf
Example #33
def StandardLinearSVR(C=10.0,epsilon=0.01):
	return Pipeline([
		("std_scaler",StandardScaler()),
		("linearSVR",LinearSVR(C = C,epsilon = epsilon))
		])
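# A hedged usage sketch for the pipeline factory above (toy arrays, not from the original source):
import numpy as np
X_toy, y_toy = np.random.rand(50, 4), np.random.rand(50)
reg = StandardLinearSVR(C=10.0, epsilon=0.01)
reg.fit(X_toy, y_toy)
print(reg.predict(X_toy[:3]))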
Example #34
y = notas['linguagem_codigo']

#  Train the model
# Select some samples to "teach" and others to "test" the quality of the model
x_treino, x_teste, y_treino, y_teste = train_test_split(x, y, random_state=326784)  # split the sample into training and test elements
print(f'Training data (x and y): x = {x_treino.shape} and y = {y_treino.shape}')  # random_state is another way to fix the random selection
#print(x_treino.shape)  # not very efficient, since one method may call another that uses random, and that will not follow this pattern ...
print(f'Test data (x and y): x = {x_teste.shape} and y = {y_teste.shape}')


# Create the machine learning model
print('Creating and training the artificial intelligence (AI)')
a = time.process_time()
# modelo = SVR()        # Creates a non-linear model (very "heavy" to run)
print('Model - Linear SVR')
modelo_svrl = LinearSVR(max_iter=1000)    # support vector machine (SVM)
modelo_svrl = modelo_svrl.fit(x_treino, y_treino)               # .fit - performs the training (the way it learns the rules, or tries to)
predicoes_svrl = modelo_svrl.predict(x_teste) # .predict - outputs the values estimated by the AI
# plot.figure(figsize=(10,10))
# sns.scatterplot(x=y_teste, y=(predicoes_svr - y_teste))  # plot the difference between predicted and actual
# plot.show()
#print(modelo_svr)
qualidade_svrl = mean_squared_error(y_teste, predicoes_svrl)
del modelo_svrl, predicoes_svrl
print(f'Elapsed time: {time.process_time() - a} s')

print('Model - SVR')           # Very heavy
a = time.process_time()
modelo_svr = SVR()
modelo_svr = modelo_svr.fit(x_treino, y_treino)
predicoes_svr = modelo_svr.predict(x_teste)
Example #35
            print('Classifier {}, fold {}, run {}'.format(
                classifier_name, fold, run_delay))
            print('Probably caused by a locked thread')
            return 'Timeout_{}_{}_{}'.format(classifier_name, fold, run_delay)
    except Exception as e:
        print('Exception {} occurred during threading of:'.format(e))
        print('Classifier {}, fold {}, run {}'.format(classifier_name, fold,
                                                      run_delay))
        return 'Caught exception: {}, Classifier: {}, fold: {}, run: {}'.format(
            e, classifier_name, fold, run_delay)


if __name__ == "__main__":
    # the classifiers
    classifiers = [
        LinearSVR(),
        #RandomForestRegressor(),
    ]
    # their parameters, should be in the correct order
    classifiers_params = [
        {
            'C': 0.25
        },
        #{'n_estimators' : 100},
    ]

    # number of genes selected per run
    n_genes = 250
    # number of folds per classifier
    n_folds = 2500
    # number of threads
Example #36
build_housing(
    AdaBoostRegressor(DecisionTreeRegressor(random_state=13,
                                            min_samples_leaf=5),
                      random_state=13,
                      n_estimators=17), "AdaBoostHousing")
build_housing(KNeighborsRegressor(), "KNNHousing", with_kneighbors=True)
build_housing(
    MLPRegressor(activation="tanh",
                 hidden_layer_sizes=(26, ),
                 solver="lbfgs",
                 random_state=13,
                 tol=0.001,
                 max_iter=1000), "MLPHousing")
build_housing(SGDRegressor(random_state=13), "SGDHousing")
build_housing(SVR(), "SVRHousing")
build_housing(LinearSVR(random_state=13), "LinearSVRHousing")
build_housing(NuSVR(), "NuSVRHousing")

#
# Anomaly detection
#


def build_iforest_housing_anomaly(iforest, name, **kwargs):
    mapper = DataFrameMapper([(housing_X.columns.values, ContinuousDomain())])
    pipeline = Pipeline([("mapper", mapper), ("estimator", iforest)])
    pipeline.fit(housing_X)
    pipeline = make_pmml_pipeline(pipeline, housing_X.columns.values)
    customize(iforest, **kwargs)
    store_pkl(pipeline, name + ".pkl")
    decisionFunction = DataFrame(pipeline.decision_function(housing_X),
    print("Data shape after feature creation: ", df_data.shape)
    print("Columns: ", " ".join(list(df_data.columns)))
    print('------------------------------------------')

    if args.regression:
        y = df_data['mmse'].values
    else:
        y = df_data['target'].values
    X = df_data.drop(['id', 'target'], axis=1)

    # build classification model
    if args.regression:
        lr_1 = LinearRegression(fit_intercept=True)
        svm_10 = LinearSVR(C=10,
                           fit_intercept=True,
                           random_state=123,
                           max_iter=10000)
        svm_100 = LinearSVR(C=100,
                            fit_intercept=True,
                            random_state=123,
                            max_iter=10000)
        xgb = xgb.XGBRegressor(max_depth=10,
                               subsample=0.8,
                               n_estimators=50,
                               colsample_bytree=0.8,
                               learning_rate=1,
                               nthread=8)
        rfc = RandomForestRegressor(random_state=123,
                                    n_estimators=50,
                                    max_depth=5)
        learners = [('xgb', xgb), ('rfc', rfc), ('svm_10', svm_10),
Example #38
plt.show()

###################################################################### Plot the tree regressor vs. test outputs
plt.figure(figsize=(10,6))
testTreeTargetHandle, = plt.plot(day, testTargets / 1000000, label = 'Target values')
testTreeOutputHandle, = plt.plot(day, treeTestingOutputs / 1000000, label = 'Decision tree', linestyle = 'dotted')
plt.xlabel('Day')
plt.ylabel(r'Incoming Solar Energy [$MJ / m^2$]')
plt.title('Comparison of Decision Tree Test Targets and Outputs')
plt.legend(handles = [testTreeTargetHandle, testTreeOutputHandle])
plt.show()


#INITIALIZE 
from sklearn.svm import LinearSVR
svm_clf = LinearSVR(C=0.6, loss='squared_epsilon_insensitive')
svm_clf.fit(scaledTrainingInputs, np.ravel(scaledTrainingTargets)) 

# PREDICT the training outputs and the test outputs
scaledTrainingOutputs = svm_clf.predict(scaledTrainingInputs)
scaledTestOutputs = svm_clf.predict(scaledTestInputs)


trainingOutputs = tScaler.inverse_transform(scaledTrainingOutputs)
testOutputs = tScaler.inverse_transform(scaledTestOutputs)

 #Calculate and display training and test root mean square error (RMSE)
trainingsvmRMSE = np.sqrt(np.sum((trainingOutputs - trainingTargets[:, 0]) ** 2) / len(trainingOutputs)) / 1000000  # Divide by 1e6 for MJ/m^2
testsvmRMSE = np.sqrt(np.sum((testOutputs - testTargets[:, 0]) ** 2) / len(testOutputs)) / 1000000

#### PLOTTING
Example #39
    StackingEstimator(estimator=XGBRegressor(learning_rate=0.001,
                                             max_depth=1,
                                             min_child_weight=3,
                                             n_estimators=50,
                                             n_jobs=1,
                                             objective="reg:squarederror",
                                             subsample=0.9500000000000001,
                                             verbosity=0)), MinMaxScaler(),
    StackingEstimator(estimator=SGDRegressor(alpha=0.01,
                                             eta0=0.01,
                                             fit_intercept=False,
                                             l1_ratio=0.0,
                                             learning_rate="constant",
                                             loss="huber",
                                             penalty="elasticnet",
                                             power_t=0.0)),
    StackingEstimator(estimator=LinearSVR(
        C=25.0, dual=True, epsilon=0.1, loss="epsilon_insensitive",
        tol=0.0001)), FeatureAgglomeration(affinity="l2", linkage="average"),
    SelectPercentile(score_func=f_regression, percentile=6),
    StackingEstimator(estimator=ExtraTreesRegressor(bootstrap=False,
                                                    max_features=0.8,
                                                    min_samples_leaf=19,
                                                    min_samples_split=10,
                                                    n_estimators=400)),
    ZeroCount(), FeatureAgglomeration(affinity="l2", linkage="complete"),
    StackingEstimator(estimator=RidgeCV()), RidgeCV())

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #40
0
import pandas as pd
from sklearn.svm import LinearSVR
import matplotlib.pyplot as plt

inputfile = './datasave/new_reg_data_GM11.csv'  # path of the data saved after the grey GM(1,1) prediction
data = pd.read_csv(inputfile)  # read the data
data.index = range(1994, 2016)
feature = ['x1', 'x4', 'x5', 'x6', 'x7', 'x8']
data_train = data.loc[range(1994, 2014)].copy()  # build the model on the data before 2014
data_mean = data_train.mean()
data_std = data_train.std()
data_train = (data_train - data_mean) / data_std  # standardize the data

x_train = data_train[feature].values  # feature matrix
y_train = data_train['y'].values  # target values
linearsvr = LinearSVR()  # instantiate the LinearSVR model
linearsvr.fit(x_train, y_train)
x = ((data[feature] - data_mean[feature]) / \
     data_std[feature]).values  # standardize all years with the training statistics
data[u'y_pred'] = linearsvr.predict(x) * \
    data_std['y'] + data_mean['y']  # predict and map back to the original scale
## results saved after the SVR prediction
outputfile = './datasave/new_reg_data_GM11_revenue.csv'
data.to_csv(outputfile)
print('Actual and predicted values:', data[['y', 'y_pred']])

p = data[['y', 'y_pred']].plot(style=['b-o', 'r-*'])
p.set_ylim(0, 2500)
p.set_xlim(1993, 2016)
plt.show()
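# The manual mean/std scaling above can also be expressed with StandardScaler; a minimal,
# equivalent sketch (note: StandardScaler uses the population std, ddof=0, while pandas'
# .std() uses ddof=1, so the scaled values differ very slightly):
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train_alt = scaler.fit_transform(data.loc[range(1994, 2014), feature])  # fit on the pre-2014 rows
x_all_alt = scaler.transform(data[feature])                               # transform every year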
import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNetCV, LassoLarsCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.svm import LinearSVR
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:-17.62015561497372
exported_pipeline = make_pipeline(
    make_union(
        StackingEstimator(estimator=make_pipeline(
            StackingEstimator(estimator=ElasticNetCV(l1_ratio=0.55, tol=0.001)),
            LassoLarsCV(normalize=True)
        )),
        FunctionTransformer(copy)
    ),
    LinearSVR(C=5.0, dual=True, epsilon=1.0, loss="epsilon_insensitive", tol=0.1)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #42
0
test_dataA.drop(['嗜碱细胞%'], axis=1, inplace=True)
# fill the missing values in each column with the column mean
for i in train_data.columns:
    train_data[i].fillna(train_data[i].mean(), inplace=True)
for i in test_dataA:
    test_dataA[i].fillna(test_dataA[i].mean(), inplace=True)
print(train_data.info())
train_data_y = train_data['血糖']
train_data.drop(['血糖'], axis=1, inplace=True)
print(test_dataA.info())
# standardize the features
scaler = StandardScaler()
train_data = scaler.fit_transform(train_data.astype(float))
test_dataA = scaler.transform(test_dataA.astype(float))
# build the model
lin_svr = LinearSVR(random_state=42, max_iter=5000)
lin_svr.fit(train_data, train_data_y)
test_features_labers = lin_svr.predict(test_dataA)
# evaluate the model
mse = mean_squared_error(test_labels, test_features_labers)
print(mse)
print(np.sqrt(mse))
# use RandomizedSearchCV to tune hyperparameters
param_distributions = {
    'gamma': reciprocal(0.001, 0.1),
    'C': uniform(1, 10)
}
rnd_search_cv = RandomizedSearchCV(SVR(),
                                   param_distributions,
                                   n_iter=10,
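# A minimal, self-contained sketch of fitting such a randomized search end to end
# (the cv, scoring and random_state values, and fitting on train_data, are illustrative assumptions):
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVR
rnd_search = RandomizedSearchCV(SVR(), param_distributions, n_iter=10, cv=3,
                                scoring='neg_mean_squared_error', random_state=42)
rnd_search.fit(train_data, train_data_y)
print(rnd_search.best_params_)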
svr_score_test = svr.score(smr_test.feature_matrix, smr_test.labels)
print 'SVR precision test: {}'.format(svr_score_test)
# plot_learning_curve(svr, 'SVR Curve', smr_train.feature_matrix, smr_train.labels, n_jobs=4)
print ''

lsvc = LinearSVC()
print 'LinearSVC config:'
print lsvc.get_params()
lsvc.fit(smr_train.feature_matrix, smr_train.labels)
lsvc_score_train = lsvc.score(smr_train.feature_matrix, smr_train.labels)
print 'LinearSVC precision train: {}'.format(lsvc_score_train)
lsvc_score_test = lsvc.score(smr_test.feature_matrix, smr_test.labels)
print 'LinearSVC precision test: {}'.format(lsvc_score_test)
print ''

lsvr = LinearSVR()
print 'LinearSVR config:'
print lsvr.get_params()
lsvr.fit(smr_train.feature_matrix, smr_train.labels)
lsvr_score_train = lsvr.score(smr_train.feature_matrix, smr_train.labels)
print 'LinearSVR precision train: {}'.format(lsvr_score_train)
lsvr_score_test = lsvr.score(smr_test.feature_matrix, smr_test.labels)
print 'LinearSVR precision test: {}'.format(lsvr_score_test)
print ''

nusvc = NuSVC()
print 'NuSVC config:'
print nusvc.get_params()
nusvc.fit(smr_train.feature_matrix, smr_train.labels)
nusvc_score_train = nusvc.score(smr_train.feature_matrix, smr_train.labels)
print 'NuSVC precision train: {}'.format(nusvc_score_train)
    NEIGHBOR = 400  # number of neighbors used when computing the local singular values
    randidx = np.random.permutation(data.shape[0])[:SAMPLE]
    knbrs = NearestNeighbors(n_neighbors=NEIGHBOR,
                             algorithm='ball_tree').fit(data)

    sing_vals = []
    for idx in randidx:
        dist, ind = knbrs.kneighbors(data[idx:idx + 1])
        nbrs = data[ind[0, 1:]]
        u, s, v = np.linalg.svd(nbrs - nbrs.mean(axis=0))
        s /= s.max()
        sing_vals.append(s)
    sing_vals = np.array(sing_vals).mean(axis=0)
    return sing_vals


# Train an SVR (note: SVR() defaults to an RBF kernel; use LinearSVR for a purely linear model)

npzfile = np.load('large_data.npz')
X = npzfile['X']
y = npzfile['y']

# we already normalize these values in gen.py
# X /= X.max(axis=0, keepdims=True)

svr = SVR(C=1)
svr.fit(X, y)
joblib.dump(svr, 'model.sav')
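# A minimal sketch of reloading the persisted model and predicting with it
# (assumes joblib is imported as above and reuses the same file name):
loaded_svr = joblib.load('model.sav')
print(loaded_svr.predict(X[:5]))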

Example #45
0
## Create K folds
k_fold = KFold(Y_train_raw.shape[0], n_folds=10)
for train, test in k_fold:
    X1 = X_train_reduced[train]
    Y1 = Y_train_raw[train]

    X2 = X_train_reduced[test]
    Y2 = Y_train_raw[test]

    ## Train Classifiers on fold
    rdg_clf = Ridge(alpha=0.5)
    rdg_clf.fit(X1, Y1)
    lso_clf = Lasso(alpha=0.6257)
    lso_clf.fit(X1, Y1)
    svr_clf = LinearSVR(C=1e3)
    svr_clf.fit(X1, Y1)

    ## Score Classifiers on fold
    rdg_clf_score = rdg_clf.score(X2, Y2)
    lso_clf_score = lso_clf.score(X2, Y2)
    svr_clf_score = svr_clf.score(X2, Y2)

    print "Ridge:  ", rdg_clf_score
    print "Lasso:  ", lso_clf_score
    print "SVR_RBF:  ", svr_clf_score


## Train final Classifiers
# clf = Ridge(alpha=.5)
clf = LinearSVR(C=1e3)  # LinearSVR does not accept a gamma parameter
Example #46
0
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVR
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

df = pd.read_csv("finalenc.csv")
y = df['price']
X = df.drop(columns=['price'])

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=1)

regr = make_pipeline(StandardScaler(), LinearSVR(random_state=0, tol=1e-03))
reg = LinearRegression().fit(X_train, y_train)

regr.fit(X_train, y_train)

y_pred = regr.predict(X_test)

plt.figure()
plt.plot(range(100000))
plt.scatter(y_test,
            y_pred,
            alpha=0.4,
            c='red',
            label='Ground Truth vs Predicted')
plt.savefig('SVR.png')
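# A hedged sketch of quantifying the fit shown in the scatter plot above
# (the metric choices are an assumption, not part of the original script):
from sklearn.metrics import r2_score, mean_squared_error
print("R2:", r2_score(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))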

(x_input, y_input) = get_training_data(feature_lin_lambda=feature_lin_lambda, feature_lin_var=feature_lin_var, data_exp=data_exp)

# normalize the features to [-1, 1]
x_scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
x_input_minmax = x_scaler.fit_transform(x_input)

### Does y need to be normalized as well? There is no strong reason to, but it affects the results!
# y_scaler = preprocessing.MinMaxScaler(feature_range=(-1,1))
# y_input_minmax = y_scaler.fit_transform(y_input.reshape(-1,1))
# y_input_minmax = y_input_minmax.reshape((len(y_input_minmax)))
# choose C by cross-validation
best_cv_score = -1e+30
for log2c in np.arange(-10, 30, 1):
    clf = LinearSVR(C=2**log2c, epsilon=0.0001)
    clf.fit(x_input_minmax, y_input)
    cv_score = cross_val_score(cv=sample_num, estimator=clf, X=x_input_minmax, y=y_input,
                               scoring='neg_mean_squared_error').mean()  # leave-one-out CV
    print(cv_score)
    if cv_score > best_cv_score:
        best_cv_score = cv_score
        bestc = 2**log2c


# predict with the selected parameter
clf = LinearSVR(C=bestc, epsilon=0.0001)
clf.fit(x_input_minmax, y_input)
y_pred = clf.predict(x_input_minmax)
# y_pred = y_scaler.inverse_transform(y_pred.reshape(-1,1))

view_point = 5;
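# The manual search over log2(C) above can also be written with GridSearchCV; a minimal
# sketch assuming the same x_input_minmax / y_input and leave-one-out cross-validation:
from sklearn.model_selection import GridSearchCV, LeaveOneOut
param_grid = {'C': 2.0 ** np.arange(-10, 30)}
grid = GridSearchCV(LinearSVR(epsilon=0.0001, max_iter=10000), param_grid,
                    scoring='neg_mean_squared_error', cv=LeaveOneOut())
grid.fit(x_input_minmax, y_input)
print(grid.best_params_['C'])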
Example #48
0
 def __init__(self, C):
     self.regression = LinearSVR(C=C)
class TextLearner(object):
    def __init__(self,data_path,model_path = "./",name = ""):
        self.name = name
        self.data_path = data_path
        self.model_path = model_path
        self.DesignMatrix = []
        self.TestMatrix = []
        self.X_train = []
        self.y_train = [] # not only train but general purpose too
        self.X_test = []
        self.y_test  = []
        self.y_pred = []
        self.vectorizer = None
        self.feature_names = None
        self.chi2 = None
        self.mlModel = None
        self.F = Filter()

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.DesignMatrix = []
        self.TestMatrix = []
        self.X_train = []
        self.y_train = []
        self.X_test = []
        self.y_test  = []
        self.y_pred = []
        self.vectorizer = None
        self.feature_names = None
        self.chi2 = None
        self.mlModel = None
        self.F = None

    def addModelDetails(self,model_p,name = ""):
        self.name = name
        self.model_path = model_p


    def load_data(self,TrTe = 0):               #TrTe => 0-Train  1-Test # returns the dimensions of vectors
        with open( self.data_path, 'rb') as f:
            if TrTe == 0:
                self.DesignMatrix = pickle.load(f)
                return len(self.DesignMatrix[1])
            if TrTe == 1:
                self.TestMatrix = pickle.load(f)
                return len(self.TestMatrix[1])

    def clearOld(self):
        self.X_train = []
        self.y_train = []
        self.X_test = []
        self.y_test  = []
        self.y_pred = []
        self.vectorizer = None
        self.feature_names = None
        self.chi2 = None
        self.mlModel = None


    def process(self,text,default = 0):
        if default == 0:
            text = text.strip().lower().encode("utf-8")
        else:
            text = self.F.process(text)
        return text


    def loadXY(self,TrTe = 0,feature_index = 0,label_index = 1):     #TrTe => 0-Train  1-Test
        if TrTe == 0:
            for i in self.DesignMatrix:
                self.X_train.append(self.process(i[feature_index]))
                self.y_train.append(i[label_index])
            self.X_train = np.array(self.X_train)
            self.y_train = np.array(self.y_train)

        elif TrTe == 1:
            for i in self.TestMatrix:
                self.X_test.append(self.process(i[feature_index]))
                self.y_test.append(i[label_index])
            self.X_test = np.array(self.X_test)
            self.y_test = np.array(self.y_test)


    def featurizeXY(self,only_train = 1):      # Extracts Features
        sw = ['a', 'across', 'am', 'an', 'and', 'any', 'are', 'as', 'at', 'be', 'been', 'being', 'but', 'by', 'can', 'could', 'did', 'do', 'does', 'each', 'for', 'from', 'had', 'has', 'have', 'in', 'into', 'is', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'of', 'on', 'or', 'that', "that's", 'thats', 'the', 'there', "there's", 'theres', 'these', 'this', 'those', 'to', 'under', 'until', 'up', 'were', 'will', 'with', 'would']
        self.vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,stop_words=sw)
        self.X_train = self.vectorizer.fit_transform(self.X_train)
        self.feature_names = self.vectorizer.get_feature_names()
        if only_train == 0:
            self.X_test = self.vectorizer.transform(self.X_test)


    def reduceDimension(self,only_train = 1, percent = 50):      # Reduce dimensions / self best of features
        n_samples, n_features = self.X_train.shape
        k = int(n_features * (percent / 100.0))

        self.chi2 = SelectKBest(chi2, k=k)
        self.X_train = self.chi2.fit_transform(self.X_train, self.y_train)
        self.feature_names = [self.feature_names[i] for i in self.chi2.get_support(indices=True)]
        self.feature_names = np.asarray(self.feature_names)
        if only_train == 0:
            self.X_test = self.chi2.transform(self.X_test)


    def trainModel(self,Model = "default"):
        if Model == "default":
            self.mlModel = LinearSVR(loss='squared_epsilon_insensitive',dual=False, tol=1e-3)
        else:
            self.mlModel = Model
        self.mlModel.fit(self.X_train, self.y_train)


    def testModel(self,approx = 1):        # returns score ONLY
        self.y_pred = np.array(self.mlModel.predict(self.X_test))

        if approx == 1:
            ### To convert real valued results to binary for scoring
            temp = []
            for y in self.y_pred:
                if y > 0.0:
                    temp.append(1.0)
                else:
                    temp.append(-1.0)
            self.y_pred = temp

        return metrics.accuracy_score(self.y_test, self.y_pred)


    def getReport(self,save = 1, get_top_words = 0):       # returns report
        report = ""
        if get_top_words == 1:
            if hasattr(self.mlModel, 'coef_'):
                    report += "Dimensionality: " + str(self.mlModel.coef_.shape[1])
                    report += "\nDensity: " +  str(density(self.mlModel.coef_))

                    rank = np.argsort(self.mlModel.coef_[0])
                    top10 = rank[-20:]
                    bottom10 = rank[:20]
                    report += "\n\nTop 10 keywords: "
                    report += "\nPositive: " + (" ".join(self.feature_names[top10]))
                    report += "\nNegative: " + (" ".join(self.feature_names[bottom10]))

        score = metrics.accuracy_score(self.y_test, self.y_pred)
        report += "\n\nAccuracy: " + str(score)
        report += "\nClassification report: "
        report += "\n\n" + str(metrics.classification_report(self.y_test, self.y_pred,target_names=["Negative","Positive"]))
        report += "\nConfusion matrix: "
        report += "\n\n" + str(metrics.confusion_matrix(self.y_test, self.y_pred)) + "\n\n"

        if save == 1:
            with open(self.model_path + "report.txt", "w") as text_file:
                text_file.write(report)

        return report


    def crossVal(self,folds = 5, dim_red = 50,full_iter = 0, save = 1):        # returns report # Caution: resets train and test X,y
        skf = cross_validation.StratifiedKFold(self.y_train, n_folds = folds,shuffle=True)
        print(skf)
        master_report = ""

        X_copy = self.X_train
        y_copy = self.y_train

        for train_index, test_index in skf:
            self.X_train, self.X_test = X_copy[train_index], X_copy[test_index]
            self.y_train, self.y_test = y_copy[train_index], y_copy[test_index]
            self.featurizeXY(0)
            self.reduceDimension(0,dim_red)
            self.trainModel()
            self.testModel()
            master_report += self.getReport(save = 0,get_top_words = 0)
            if full_iter == 1:
                continue
            else:
                break

        if save == 1:
            with open(self.model_path + "master_report.txt", "w") as text_file:
                text_file.write(master_report)

        return master_report


    def save_obj(self,obj, name ):
        with open(self.model_path + name + '.pkl', 'wb') as f:
            pickle.dump(obj, f,  protocol=2)


    def saveModel(self):        # saves in model path
        self.save_obj(self.mlModel, self.name + "_model")
        self.save_obj(self.vectorizer, self.name + "_vectorizer")
        self.save_obj(self.chi2, self.name + "_feature_selector")


    def plot(self):
        '''
        beta (Just plotting the model) (Not working)
        '''

        h = .02  # step size in the mesh
        # create a mesh to plot in
        x_min, x_max = self.X_train[:, 0].min() - 1, self.X_train[:, 0].max() + 1
        y_min, y_max = self.X_train[:, 1].min() - 1, self.X_train[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),np.arange(y_min, y_max, h))

        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, m_max]x[y_min, y_max].
        Z = self.mlModel.predict(np.c_[xx.ravel(), yy.ravel()])

        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        plt.contour(xx, yy, Z, cmap=plt.cm.Paired)

        plt.xlim(xx.min(), xx.max())
        plt.ylim(yy.min(), yy.max())
        plt.xticks(())
        plt.yticks(())
        plt.title(self.name)
        plt.savefig(self.model_path + 'plot.png')
Example #50
0
def GetRegModels(nums=None,
                 cats=None,
                 text=None,
                 scoring='neg_mean_squared_error',
                 split_option=None,
                 split_type=None,
                 max_train_time=3600):

    if nums is None and cats is None and text is None:
        return

    if nums is None:
        nums = []
    if cats is None:
        cats = []
    if text is None:
        text = []

    models = []
    """ Numeric preprocessing options """
    if len(nums) > 0:
        params_impute = [[
            'pre__nums__imputer', 'categoric', ['mean', 'median']
        ]]
        params_scale = [[
            'pre__nums__scaler', 'categoric',
            ['MinMax', 'MaxAbs', 'Standard', 'Robust']
        ]]
    else:
        params_impute = []
        params_scale = []
    """ Linear models """

    # Linear regression
    mod = Pipeline([('pre', GetPreprocessor(nums, cats, text)),
                    ('mod', LinearRegression())])

    params = []
    params += params_impute

    mod = BayesianOptEstimator(mod,
                               params,
                               scoring,
                               split_type,
                               split_option,
                               max_train_time=max_train_time)
    mod.name = 'Linear Regression'
    models += [mod]

    # Lasso regression
    mod = Pipeline([('pre', GetPreprocessor(nums, cats, text)),
                    ('mod', Lasso(copy_X=False))])

    params = [['mod__alpha', 'exponential', [-5, 5]]]
    params += params_impute + params_scale

    mod = BayesianOptEstimator(mod,
                               params,
                               scoring,
                               split_type,
                               split_option,
                               max_train_time=max_train_time)
    mod.name = 'Lasso Regression'
    models += [mod]

    # Ridge regression
    mod = Pipeline([('pre', GetPreprocessor(nums, cats, text)),
                    ('mod', Ridge(copy_X=False))])

    params = [['mod__alpha', 'exponential', [-5, 5]]]
    params += params_impute + params_scale

    mod = BayesianOptEstimator(mod,
                               params,
                               scoring,
                               split_type,
                               split_option,
                               max_train_time=max_train_time)
    mod.name = 'Ridge Regression'
    models += [mod]

    # ElasticNet regression
    mod = Pipeline([('pre', GetPreprocessor(nums, cats, text)),
                    ('mod', ElasticNet(copy_X=False))])

    params = [['mod__l1_ratio', 'float', [1e-3, 1 - 1e-3]],
              ['mod__alpha', 'exponential', [-5, 5]]]
    params += params_impute + params_scale

    mod = BayesianOptEstimator(mod,
                               params,
                               scoring,
                               split_type,
                               split_option,
                               max_train_time=max_train_time)
    mod.name = 'ElasticNet Regression'
    models += [mod]

    #    # SGD Regression
    #    mod = Pipeline([('pre', GetPreprocessor(nums, cats, text)),
    #                    ('mod', SGDRegressor())])
    #
    #    params = [['mod__alpha', 'exponential', [-5, 5]],
    #              ['mod__penalty', 'categoric', ['l2', 'l2', 'elasticnet']],
    #              ['mod__l1_ratio', 'float', [1e-5, 1-1e-5]]]
    #
    #    params += params_impute + params_scale
    #
    #    mod = BayesianOptEstimator(mod, params, scoring, split_type, split_option,max_train_time=max_train_time)
    #    mod.name = 'SGD Regression'
    #    models += [mod]

    #    # Lasso LARS
    #    mod = Pipeline([('pre', GetPreprocessor(nums, cats, text)),
    #                    ('mod', LassoLars(max_iter=500))])
    #
    #    params = [['mod__alpha', 'exponential', [-5, 5]]]
    #    params += params_impute + params_scale
    #
    #    mod = BayesianOptEstimator(mod, params, scoring, split_type, split_option,max_train_time=max_train_time)
    #    mod.name = 'LARS Regression'
    #    models += [mod]

    # Passive Aggressive Regressor
    mod = Pipeline([('pre', GetPreprocessor(nums, cats, text)),
                    ('mod', PassiveAggressiveRegressor())])

    params = [['mod__C', 'exponential', [-5, 5]]]
    params += params_impute + params_scale

    mod = BayesianOptEstimator(mod,
                               params,
                               scoring,
                               split_type,
                               split_option,
                               max_train_time=max_train_time)
    mod.name = 'Passive Aggressive Regression'
    models += [mod]
    """ Support Vector Machines """

    # Linear SVM
    mod = Pipeline([('pre', GetPreprocessor(nums, cats, text)),
                    ('mod',
                     LinearSVR(dual=False,
                               loss='squared_epsilon_insensitive'))])

    params = [['mod__C', 'exponential', [-5, 5]]]
    params += params_impute + params_scale

    mod = BayesianOptEstimator(mod,
                               params,
                               scoring,
                               split_type,
                               split_option,
                               max_train_time=max_train_time)
    mod.name = 'Linear SVM'
    models += [mod]

    # Kernel SVM
    mod = Pipeline([('pre', GetPreprocessor(nums, cats, text)),
                    ('krn', RBFSampler()),
                    ('mod',
                     LinearSVR(dual=False,
                               loss='squared_epsilon_insensitive'))])

    params = [['krn__gamma', 'exponential', [-10, 10]],
              ['krn__n_components', 'integer', [10, 200]],
              ['mod__C', 'exponential', [-5, 5]]]
    params += params_impute + params_scale

    mod = BayesianOptEstimator(mod,
                               params,
                               scoring,
                               split_type,
                               split_option,
                               max_train_time=max_train_time)
    mod.name = 'Kernel SVM'
    models += [mod]
    """ Tree based methods """
    # Decision Tree
    mod = Pipeline([('pre', GetPreprocessor(nums, cats, text)),
                    ('mod', DecisionTreeRegressor())])

    params = [['mod__criterion', 'categoric', ['mse', 'friedman_mse', 'mae']],
              ['mod__max_depth', 'integer', [1, 15]],
              ['mod__min_samples_split', 'integer', [2, 20]],
              ['mod__min_samples_leaf', 'integer', [1, 20]]]

    params += params_impute

    mod = BayesianOptEstimator(mod,
                               params,
                               scoring,
                               split_type,
                               split_option,
                               max_train_time=max_train_time)
    mod.name = 'Decision Tree'
    models += [mod]

    # Random Forest
    mod = Pipeline([('pre', GetPreprocessor(nums, cats, text)),
                    ('mod', RandomForestRegressor())])

    params = [['mod__criterion', 'categoric', ['mse', 'friedman_mse', 'mae']],
              ['mod__max_depth', 'integer', [1, 15]],
              ['mod__min_samples_split', 'integer', [2, 20]],
              ['mod__min_samples_leaf', 'integer', [1, 20]]]

    params += params_impute

    mod = BayesianOptEstimator(mod,
                               params,
                               scoring,
                               split_type,
                               split_option,
                               max_train_time=max_train_time)
    mod.name = 'Random Forest'
    models += [mod]

    # Extremely Random Forest
    mod = Pipeline([('pre', GetPreprocessor(nums, cats, text)),
                    ('mod', ExtraTreesRegressor())])

    params = [['mod__criterion', 'categoric', ['mse', 'friedman_mse', 'mae']],
              ['mod__max_depth', 'integer', [1, 15]],
              ['mod__min_samples_split', 'integer', [2, 20]],
              ['mod__min_samples_leaf', 'integer', [1, 20]]]

    params += params_impute

    mod = BayesianOptEstimator(mod,
                               params,
                               scoring,
                               split_type,
                               split_option,
                               max_train_time=max_train_time)
    mod.name = 'Extra Trees'
    models += [mod]

    # Boosting needs dense data
    if len(text) == 0:
        # Gradient Boosted Trees
        mod = Pipeline([('pre', GetPreprocessor(nums, cats, text)),
                        ('mod', GradientBoostingRegressor())])

        params = [[
            'mod__criterion', 'categoric', ['mse', 'friedman_mse', 'mae']
        ], ['mod__max_depth', 'integer', [1, 5]],
                  ['mod__min_samples_split', 'integer', [2, 20]],
                  ['mod__min_samples_leaf', 'integer', [1, 20]],
                  ['mod__learning_rate', 'exponential', [-5, 0]],
                  ['mod__n_estimators', 'integer', [10, 50]]]

        params += params_impute

        mod = BayesianOptEstimator(mod,
                                   params,
                                   scoring,
                                   split_type,
                                   split_option,
                                   max_train_time=max_train_time)
        mod.name = 'Gradient Boosted Trees'
        models += [mod]

        # AdaBoost
        mod = Pipeline([('pre', GetPreprocessor(nums, cats, text)),
                        ('mod', AdaBoostRegressor())])

        params = [[
            'mod__loss', 'categoric', ['linear', 'square', 'exponential']
        ], ['mod__learning_rate', 'exponential', [-5, 5]],
                  ['mod__n_estimators', 'integer', [10, 50]]]

        params += params_impute

        mod = BayesianOptEstimator(mod,
                                   params,
                                   scoring,
                                   split_type,
                                   split_option,
                                   max_train_time=max_train_time)
        mod.name = 'AdaBoost'
        models += [mod]

    # XGBoost!
    mod = Pipeline([('pre', GetPreprocessor(nums, cats, text)),
                    ('mod', XGBRegressor())])

    params = [['mod__max_depth', 'integer', [1, 5]],
              ['mod__learning_rate', 'exponential', [-5, 0]],
              ['mod__n_estimators', 'integer', [10, 50]]]

    params += params_impute

    mod = BayesianOptEstimator(mod,
                               params,
                               scoring,
                               split_type,
                               split_option,
                               max_train_time=max_train_time)
    mod.name = 'XGBoost'
    models += [mod]
    """ KNN """

    # KNN
    mod = Pipeline([('pre', GetPreprocessor(nums, cats, text)),
                    ('mod', KNeighborsRegressor())])

    params = [['mod__n_neighbors', 'integer', [1, 20]],
              ['mod__weights', 'categoric', ['uniform', 'distance']]]

    params += params_impute + params_scale

    mod = BayesianOptEstimator(mod,
                               params,
                               scoring,
                               split_type,
                               split_option,
                               max_train_time=max_train_time)
    mod.name = 'K Nearest Neighbors'
    models += [mod]
    """ Neural Network"""

    # NN
    mod = Pipeline([('pre', GetPreprocessor(nums, cats, text)),
                    ('mod', MLPRegressor(learning_rate_init=0.01))])

    params = [['mod__alpha', 'exponential', [-10, 10]],
              ['mod__hidden_layer_sizes', 'integer', [5, 50]]]

    params += params_impute + params_scale

    mod = BayesianOptEstimator(mod,
                               params,
                               scoring,
                               split_type,
                               split_option,
                               max_train_time=max_train_time)
    mod.name = 'Neural Network'
    models += [mod]

    return {mod.name: mod for mod in models}
 def __init__(self, **kwargs):
     self.model = LinearSVR(**kwargs)
'''
X_train_rank = np.linalg.matrix_rank(X_train)

#####################################################################
##          PARAMETER ESTIMATION WITH RAW DATA                     ##
#####################################################################
### Parameter estimation with least square linear matrix solver
beta_height, residual_height, rank, singu_value = np.linalg.lstsq(
    X_train, Y_height_train, rcond=None)

beta_thick, residual_thick, rank, singu_value = np.linalg.lstsq(X_train,
                                                                Y_thick_train,
                                                                rcond=None)

### Parameter estimation with svr
svr_lin = LinearSVR(verbose=True)
svr_lin.fit(X_train, Y_height_train)
beta_height_svr = svr_lin.coef_
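# A minimal side-by-side look at the two estimates (note: the SVR fit also learns an
# intercept, so the coefficient vectors are only roughly comparable):
print(np.c_[beta_height.ravel(), beta_height_svr.ravel()])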

#####################################################################
##                  MONTE CARLO SIMULATION                         ##
#####################################################################

### Monte Carlo Simulation
mu_vec = np.zeros((4, 1))
sigma_vec = np.ones((4, 1))

### Parameters -> number of samples and error percentage
num_samples_mc = 1000000
error_percentage = 0.2
md=dnn_reg(X_train,y_train,X_test,y_test)
reg_eval(X_test,y_test,md)

### LassoCV regression

def reg_eval2(y_test,model):
    y_pred=model.predict(X_test)
    print("evaluation the results for model:",model)
    print("MSE:",mean_squared_error(y_test,y_pred))
    print("R2:",r2_score(y_test,y_pred))
    print("EVS:",explained_variance_score(y_test,y_pred))

lasso = LassoCV(cv=5, random_state=0,max_iter=10000)
lasso.fit(X_train,y_train)
reg_eval2(y_test,lasso)

#ElasticNet Regression
ela = ElasticNetCV(l1_ratio=0.8,normalize=True,max_iter=5000,random_state=77)
ela.fit(X_train,y_train)
print("R square:",ela.score(X_test,y_test))
reg_eval2(y_test,ela)


#SVR Regression
from sklearn.svm import LinearSVR
LSVR=LinearSVR(epsilon=0.1,random_state=0, tol=1e-5,max_iter=10000)
# scaler=RobustScaler()
# pipe=Pipeline(steps=[("scaling",scaler),("rg",LSVR)])
LSVR.fit(X_train,y_train)
reg_eval2(y_test, LSVR)
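# A minimal sketch of the scaled variant hinted at by the commented-out lines above
# (RobustScaler + LinearSVR in a Pipeline; purely illustrative):
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
pipe = Pipeline(steps=[("scaling", RobustScaler()),
                       ("rg", LinearSVR(epsilon=0.1, random_state=0, tol=1e-5, max_iter=10000))])
pipe.fit(X_train, y_train)
reg_eval2(y_test, pipe)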
import numpy as np
import pandas as pd
from src.model_validation import ModelValidation
from personal.chris_farr.mixed_stepwise_selection import MixedStepSelect
from src.data.data_ks_filtered import DataKSFiltered
from random import randint
from sklearn.svm import LinearSVR
# Setup
data_class = DataKSFiltered()
x_train, x_test, y_train, y_scaler = data_class.load_data()
validation = ModelValidation()
predictions_file = "personal/chris_farr/predictions.csv"
features_file = "personal/chris_farr/features.csv"
model = LinearSVR(random_state=0)

# TODO Run mixed select
for _ in range(100):
    starting_features = randint(5, int(len(x_train.columns) / 2))
    ms = MixedStepSelect(corr_threshold=.99,
                         data_class=data_class,
                         n_start_feats=starting_features)
    ms.model = model
    ms.run(1000)
    in_features = ms.in_features

    # TODO Store results in CSV

    # TODO randomize attributes with each run

    # Final scoring and storage
Example #55
0
from sklearn.ensemble import VotingClassifier, VotingRegressor


@pytest.mark.parametrize(
    "X, y, estimator",
    [(*make_classification(n_samples=10),
      StackingClassifier(estimators=[('lr', LogisticRegression()),
                                     ('svm', LinearSVC()),
                                     ('rf', RandomForestClassifier())])),
     (*make_classification(n_samples=10),
      VotingClassifier(estimators=[('lr', LogisticRegression()),
                                   ('svm', LinearSVC()),
                                   ('rf', RandomForestClassifier())])),
     (*make_regression(n_samples=10),
      StackingRegressor(estimators=[('lr', LinearRegression()),
                                    ('svm', LinearSVR()),
                                    ('rf', RandomForestRegressor())])),
     (*make_regression(n_samples=10),
      VotingRegressor(estimators=[('lr', LinearRegression()),
                                  ('svm', LinearSVR()),
                                  ('rf', RandomForestRegressor())]))],
    ids=['stacking-classifier', 'voting-classifier',
         'stacking-regressor', 'voting-regressor']
)
def test_ensemble_heterogeneous_estimators_behavior(X, y, estimator):
    # check that the behavior of `estimators`, `estimators_`,
    # `named_estimators`, `named_estimators_` is consistent across all
    # ensemble classes and when using `set_params()`.

    # before fit
    assert 'svm' in estimator.named_estimators
# for i, svm_clf in enumerate(svm_clfs):
# 	plt.subplot(222 + i)
# 	plot_predictions(svm_clf, [-1.5, 2.5, -1, -1.5])
# 	plot_dataset(X, y, [-1.5, 2.5, -1, 1.5])
# 	gamma, C = hyperparams[i]
# 	plt.title(r"$\gamma = {}, C={}$".format(gamma, C), fontsize=16)

# plt.show()


rnd.seed(42)
m = 50
X = 2 * rnd.rand(m, 1)
y = (4 + 3 * X + rnd.randn(m, 1)).ravel()

svm_reg1 = LinearSVR(epsilon=1.5)
svm_reg2 = LinearSVR(epsilon=0.5)
svm_reg1.fit(X, y)
svm_reg2.fit(X, y)



def find_support_vectors(svm_reg, X, y):
	y_pred = svm_reg.predict(X)
	off_margin = (np.abs(y - y_pred) >= svm_reg.epsilon)
	return np.argwhere(off_margin)

svm_reg1.support_ = find_support_vectors(svm_reg1, X, y)
svm_reg2.support_ = find_support_vectors(svm_reg2, X, y)
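# A minimal check of what these off-margin indices imply: the wider tube (epsilon=1.5)
# should flag fewer training points than the narrower one (epsilon=0.5):
print(len(svm_reg1.support_), "points outside the epsilon=1.5 tube")
print(len(svm_reg2.support_), "points outside the epsilon=0.5 tube")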

eps_x1 = 1