def svr_main(X, Y):
    X_train = X[:TRAIN_SIZE]
    Y_train = Y[:TRAIN_SIZE]
    X_test = X[TRAIN_SIZE:]
    Y_test = Y[TRAIN_SIZE:]

    clf = SVR(kernel='rbf', C=1e3, gamma=0.00001)
    #clf.fit(X_train,Y_train)
    #y_pred = clf.predict(X_test)
    #plt.plot(X_test, y_pred, linestyle='-', color='red') 

    #clf = GradientBoostingRegressor(n_estimators=100,max_depth=1)
    #clf = DecisionTreeRegressor(max_depth=25)
    #clf = ExtraTreesRegressor(n_estimators=2000,max_depth=14)
    #clf = xgb.XGBRegressor(n_estimators=2000,max_depth=25)
    #clf = RandomForestRegressor(n_estimators=1000,max_depth=26,n_jobs=7)
    predict_list = []
    for i in xrange(TEST_SIZE):
        # refit on a sliding window of the series and predict the next point
        X_window = [[x] for x in xrange(i, TRAIN_SIZE+i)]
        clf.fit(X_window, Y[i:TRAIN_SIZE+i])
        y_pred = clf.predict([[TRAIN_SIZE+1+i]])  # predict() expects a 2-D array
        predict_list.append(y_pred)

    print "mean_squared_error:%s"%mean_squared_error(Y_test, predict_list)
    print "sqrt of mean_squared_error:%s"%np.sqrt(mean_squared_error(Y_test, predict_list))
    origin_data = Y_test
    print "origin data:%s"%origin_data
    plt.plot([ x for x in xrange(TRAIN_SIZE+1, TRAIN_SIZE+TEST_SIZE+1)], predict_list, linestyle='-', color='red', label='prediction model')  
    plt.plot(X_test, Y_test, linestyle='-', color='blue', label='actual model') 
    plt.legend(loc=1, prop={'size': 12})
    plt.show()
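A minimal usage sketch (assumption: TRAIN_SIZE and TEST_SIZE are module-level globals consumed by svr_main, and load_series() is a hypothetical loader for the raw series):

TRAIN_SIZE, TEST_SIZE = 200, 50                      # assumed globals used inside svr_main
Y = load_series()                                    # hypothetical: returns TRAIN_SIZE + TEST_SIZE values
X = [[i] for i in range(TRAIN_SIZE + TEST_SIZE)]     # simple time index as the single feature
svr_main(X, Y)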
def compute_mse(regressor, horizon):
    # get wind park and corresponding target. 
    windpark = NREL().get_windpark(NREL.park_id['tehachapi'], 3, 2004, 2005)
    target = windpark.get_target()

    # use power mapping for pattern-label mapping. 
    feature_window = 3
    mapping = PowerMapping()
    X = mapping.get_features_park(windpark, feature_window, horizon)
    y = mapping.get_labels_turbine(target, feature_window, horizon)

    # train roughly for the year 2004, test for 2005.
    train_to = int(math.floor(len(X) * 0.5))
    test_to = len(X)
    train_step, test_step = 25, 25
    X_train=X[:train_to:train_step]
    y_train=y[:train_to:train_step]
    X_test=X[train_to:test_to:test_step]
    y_test=y[train_to:test_to:test_step]

    if(regressor == 'svr'):
        reg = SVR(kernel='rbf', epsilon=0.1, C = 100.0,\
                gamma = 0.0001).fit(X_train,y_train)
        mse = mean_squared_error(reg.predict(X_test),y_test)
    elif(regressor == 'knn'):
        reg = KNeighborsRegressor(10, 'uniform').fit(X_train,y_train)
        mse = mean_squared_error(reg.predict(X_test),y_test)
    return mse
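A brief usage sketch, assuming the windml package that provides NREL and PowerMapping is available:

for reg in ('svr', 'knn'):
    print("%s MSE for a 3-step horizon: %f" % (reg, compute_mse(reg, 3)))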
def evaluate_learner(X_train, X_test, y_train, y_test):
    '''
    Run multiple times with different algorithms to get an idea of the
    relative performance of each configuration.
    Returns a sequence of tuples containing:
        (title, expected values, actual values)
    for each learner.
    '''

    # Use a support vector machine for regression
    from sklearn.svm import SVR

    # Train using a radial basis function
    svr = SVR(kernel='rbf', gamma=0.1)
    svr.fit(X_train, y_train)
    y_pred = svr.predict(X_test)
    r_2 = svr.score(X_test, y_test)
    yield 'RBF Model ($R^2={:.3f}$)'.format(r_2), y_test, y_pred

    # Train using a linear kernel
    svr = SVR(kernel='linear')
    svr.fit(X_train, y_train)
    y_pred = svr.predict(X_test)
    r_2 = svr.score(X_test, y_test)
    yield 'Linear Model ($R^2={:.3f}$)'.format(r_2), y_test, y_pred

    # Train using a polynomial kernel
    svr = SVR(kernel='poly', degree=2)
    svr.fit(X_train, y_train)
    y_pred = svr.predict(X_test)
    r_2 = svr.score(X_test, y_test)
    yield 'Polynomial Model ($R^2={:.3f}$)'.format(r_2), y_test, y_pred
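A minimal sketch of consuming the generator above, assuming matplotlib.pyplot is imported as plt and the train/test splits already exist:

for title, y_expected, y_actual in evaluate_learner(X_train, X_test, y_train, y_test):
    plt.figure()
    plt.plot(y_expected, label='expected')
    plt.plot(y_actual, label=title)
    plt.legend()
plt.show()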
Example #4
class SVR(PlayerModel):
    ### a wrapper for support vector regression using scikit-learn for this project
    def __init__(self):
        PlayerModel.__init__(self)
        # configure support vector regression and start training
        self.regr = SupportVectorRegression(kernel = 'linear', C = 1000)
        self.regr.fit(self.dataset_X_train, self.dataset_Y_train)
        print "Finish building player model."
        print "Parameters: ", self.regr.get_params()
        print "============================================================"

    def testScore(self, test_X):
        score = self.regr.predict(self.normalizeTest(test_X))
        return np.mean(score)

    def getParams(self):
        return self.regr.get_params()

    def visualize(self):
        x = np.zeros((10, self.col - 1))
        mean = self.dataset_X_train.mean(0)
        for i in range(10):
            x[i, :] = mean
        x[:, 0:1] = np.array([np.arange(0.0, 1.1, 0.11)]).T
        # print x
        y = self.regr.predict(x)
        # print y
        pyplot.scatter(self.dataset_X_train[:, 0:1], self.dataset_Y_train, c='k', label='data')
        # pyplot.hold('on')  # hold() was removed from matplotlib; successive plot calls overlay by default
        pyplot.plot(x[:, 0:1], y, c = "r", label='Support Vector Regression')
        pyplot.xlabel('data collect from player')
        pyplot.ylabel('score')
        pyplot.title('Support Vector Regression')
        pyplot.legend()
        pyplot.show()
def svr_predict(sampled_data_FC1,sampled_data_FC2,len_real=1800,len_train=1100):    
    sampled_data_FC1_minus_FC2=sampled_data_FC2
    sampled_data_FC1_minus_FC2=np.array(sampled_data_FC1_minus_FC2)
#    sampled_data_FC1_minus_FC2=FC1_minus_FC2.FC1_minus_FC2(sampled_data_FC1,sampled_data_FC2)
    
    ## solve method 2: normalize
    sampled_data_FC=[]
    for sdF in sampled_data_FC1_minus_FC2:
        sampled_data_FC.append(sdF[1])
    
    
    ## time normalize
    temp_time_end=sampled_data_FC1_minus_FC2[-1,0]
    sampled_data_FC1_minus_FC2[:,0]=sampled_data_FC1_minus_FC2[:,0]/temp_time_end
    
    sampled_data=np.column_stack((sampled_data_FC1_minus_FC2[:,0],sampled_data_FC)) 
    
    X = sampled_data[0:len_real,0]  # X is the all real  time_data
    Y =sampled_data[0:len_real,1]   # Y is the all real value_data
    
    
    pat_list=[]
    for i in range(len_train-regressors_num+1):
        pat_list.append(list(sampled_data[i:i+regressors_num,1])+[sampled_data[i+regressors_num,0],sampled_data[i+regressors_num,1]])  # make the value couple,like [[x1,x2,x3,x4],[x5]]
    pat_list=np.array(pat_list)
    
    X1=pat_list[:,0:regressors_num+1]
    X=X.reshape([len_real,1])
    Y1=pat_list[:,regressors_num+1]   # Y1 is the train real value_data
    
    ###############################################################################
    # Fit regression model
    
    svr_rbf = SVR(kernel='rbf', epsilon=0.0083, C=1000, gamma=0.05)
    svr_rbf.fit(X1, Y1)
    
    ########################################################################
    # prognostic phase
    y_rbf_prog=[]
    
    for i in range(len_real-len_train):
        if i ==0:
            X_temp1=list(X1[-1][:-1])+[sampled_data[len_train,0]]
            X_temp1=np.array(X_temp1)
            y_rbf_prog.append(float(svr_rbf.predict(X_temp1)))
        elif i < regressors_num:
    
            X_temp=list(X1[-1][-(regressors_num-i)-1:-1])+y_rbf_prog+[sampled_data[i+len_train,0]]
            X_temp=np.array(X_temp)
            y_rbf_prog.append(float(svr_rbf.predict(X_temp)))
        elif i >= regressors_num:
            X_temp=y_rbf_prog[-regressors_num:]+[sampled_data[i+len_train,0]]
            X_temp=np.array(X_temp)
            y_rbf_prog.append(float(svr_rbf.predict(X_temp)))
    
    FC2_prog_pred=sampled_data_FC1[len_train:len_real,1]-y_rbf_prog
#    return FC2_prog_pred 
    return y_rbf_prog
def SVM(Xtrain, ytrain, Xtest=None, C=1):
    model = SVR(C=C)  ## module imported from Scikit-Learn
    model.fit(Xtrain, ytrain)
    pred = model.predict(Xtrain)
    if Xtest is None:
        return pred
    else:
        pred_test = model.predict(Xtest)
    return pred, pred_test
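A short usage sketch; Xtrain, ytrain and Xtest stand for arbitrary training/test arrays:

train_pred, test_pred = SVM(Xtrain, ytrain, Xtest=Xtest, C=10)   # both sets of predictions when Xtest is given
train_pred_only = SVM(Xtrain, ytrain)                            # training predictions only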
def supportVectorRegression(X, Y_casual, Y_registered, testSet_final):
	svr1 = SVR(kernel='rbf', gamma=0.1)
	svr2 = SVR(kernel='rbf', gamma=0.1)
	svr1.fit(X, Y_casual)
	svr2.fit(X, Y_registered)
	svr1_Y = np.exp(svr1.predict(testSet_final))-1
	svr2_Y = np.exp(svr2.predict(testSet_final))-1
	final_prediction = np.intp(np.around(svr1_Y + svr2_Y))
	return final_prediction
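The np.exp(...) - 1 applied to the predictions implies the targets were log1p-transformed before fitting; a minimal sketch of that assumed preparation (casual_counts and registered_counts are placeholder arrays of raw counts):

Y_casual = np.log1p(casual_counts)            # assumption: raw counts, log1p-transformed
Y_registered = np.log1p(registered_counts)
final = supportVectorRegression(X, Y_casual, Y_registered, testSet_final)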
class W2VPool:
    def __init__(self, poolingDim = 20):
        self.clf = SVR(C = 0.5)
        self.model = Word2Vec.load("vectors.bin")
        self.poolingDim = poolingDim
    def getFeatures(self, data):
        sentenceAs = [row[0] for row in data]
        sentenceBs = [row[1] for row in data]
        scores = [float(row[2]) for row in data]
        features = []
        for i in range(len(sentenceAs)):
            mat = self.simMatrix(self.model, sentenceAs[i], sentenceBs[i])
            mat = self.dynamicPooling(mat, self.poolingDim)
            features.append(np.ndarray.flatten(mat))
        return features, scores
    def simMatrix(self, model, sentence1, sentence2):
        tokens1 = word_tokenize(sentence1)
        tokens2 = word_tokenize(sentence2)
        mat = np.zeros((len(tokens1), len(tokens2)))
        for index1, token1 in enumerate(tokens1):
            for index2, token2 in enumerate(tokens2):
                vec1 = model[token1] if token1 in model else np.zeros((len(model['the'])))
                vec2 = model[token2] if token2 in model else np.zeros((len(model['the'])))
                mat[index1][index2] = cosine(vec1, vec2)
        return mat
    def dynamicPooling(self, matrix, finalDim):
        finalMatrix = np.zeros((finalDim, finalDim))
        for i in range(finalDim):
            for j in range(finalDim):
                compressionArea = []
                for a in range(int(float(i) / finalDim * matrix.shape[0]), int(float(i + 1) / finalDim * matrix.shape[0])):
                    for b in range(int(float(j) / finalDim * matrix.shape[1]), int(float(j + 1) / finalDim * matrix.shape[1])):
                        compressionArea.append(matrix[a][b])
                if len(compressionArea) == 0:
                    finalMatrix[i][j] = matrix[int(float(i) / finalDim * matrix.shape[0])][int(float(j) / finalDim * matrix.shape[1])]
                else:
                    finalMatrix[i][j] = min(compressionArea)

        return np.nan_to_num(finalMatrix)

    def train(self, trainData):
        features, scores = self.getFeatures(trainData)
        self.clf.fit(features, scores)
        results = self.clf.predict(features)
        print("Training Error")
        print(sklearn.metrics.mean_squared_error(results, np.array(scores)))

    def test(self, test):
        features, scores = self.getFeatures(test)
        results = self.clf.predict(features)
        print("Testing Error")
        print(sklearn.metrics.mean_squared_error(results, np.array(scores)))
Example #9
def svr_model(x_train, y_train, x_test, x_valid, cache_name, use_cache=False):
    if use_cache:
        with open(cache_name, 'rb') as fhand:  # pickle files must be opened in binary mode
            data_dict = pickle.load(fhand)
        return data_dict['test_pred'], data_dict['valid_pred']
    np.random.seed(seed=123)
    model = SVR()
    model.fit(x_train, np.log(y_train))
    test_pred = np.exp(model.predict(x_test))
    valid_pred = np.exp(model.predict(x_valid))
    data_dict = {'test_pred': test_pred, 'valid_pred': valid_pred}
    with open(cache_name, 'wb') as fhand:  # binary mode for pickle
        pickle.dump(data_dict, fhand)
    return test_pred, valid_pred
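A brief usage sketch; the cache file name is arbitrary, and y_train must be strictly positive because the model is fit on np.log(y_train):

test_pred, valid_pred = svr_model(x_train, y_train, x_test, x_valid,
                                  cache_name='svr_preds.pkl', use_cache=False)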
Example #10
def method_laprls(vecX, vecy, train, test, states=2, params=[1.0, 0.1, 0.1], true_latent=None, plot=False):
    ks = 2
    A = np.zeros((vecX.shape[0], vecX.shape[0]))
    for k in range(1, ks):
        for i in range(vecX.shape[0]-k):
            A[i, i+k] = 1
            A[i+k, i] = 1
    print A.shape
    # A = A[train, :]
    # A = A[:, train]
    print A.shape
    D = np.diag(np.sum(A, axis=1))
    L = D - A

    # K_transd = get_kernel(vecX[train, :], vecX[train, :], type='rbf', sigma=params[2])
    K_transd = get_kernel(vecX, vecX, type='rbf', sigma=params[2])
    # deformation radius
    r = 0.01
    I = np.eye(K_transd.shape[0])

    M = L
    Ktilde = np.linalg.inv(I + r*K_transd.dot(M)).dot(K_transd)
    lap_param = r*M.dot(Ktilde)

    lap_param = lap_param[train, :]
    lap_param = lap_param[:, train]

    clf = SVR(C=params[0], epsilon=params[1], shrinking=False,
              kernel=partial(get_kernel,
                              type='lap',
                              sigma=params[2],
                              lap_param=lap_param))
    clf.fit(vecX[train, :], vecy[train])
    # clf.fit(vecX, vecy)
    return 'Laplacian reg. SVR (RBF)', clf.predict(vecX[test, :]), np.ones(len(test))
Example #11
def compute_rmse(features, labels, train_index, test_index):
    x_train, x_test = features[train_index], features[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

    r, c = x_train.shape
    if r < 15:
        return None

    if NORMALIZATION_FLAG:
        feature_scaler = StandardScaler().fit(x_train)
        x_train = feature_scaler.transform(x_train)
        x_test = feature_scaler.transform(x_test)
        label_scaler = StandardScaler().fit(y_train.reshape(-1, 1))  # StandardScaler expects a 2-D array
        y_train = label_scaler.transform(y_train.reshape(-1, 1)).ravel()

    clf = SVR(C=100, gamma=0.001, kernel='rbf').fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    if NORMALIZATION_FLAG:
        y_pred = y_pred*label_scaler.scale_ + label_scaler.mean_

    if LOG_FLAG:
        actual_pred = numpy.array([10 ** y for y in y_pred])
        actual_price = numpy.array([10 ** y for y in y_test])
    else:
        actual_pred = y_pred
        actual_price = y_test

    actual_rmse_pc = numpy.sqrt(numpy.mean(((actual_pred - actual_price) / actual_price) ** 2))
    actual_rmse = numpy.sqrt(numpy.mean((actual_pred - actual_price) ** 2))

    return actual_rmse, actual_rmse_pc
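A brief usage sketch, with features/labels as placeholder numpy arrays and a scikit-learn KFold supplying the index pairs:

from sklearn.model_selection import KFold
for train_index, test_index in KFold(n_splits=5).split(features):
    print(compute_rmse(features, labels, train_index, test_index))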
Example #12
def tecnicaSVR():
    
    parametros = [{'kernel':'linear', 'C':0.1, 'epsilon':0.2},
                  {'kernel':'linear', 'C':1.0, 'epsilon':0.2},
                  {'kernel':'rbf', 'degree':3, 'gamma':.0001, 'C':1.0, 'epsilon':0.2},
                  {'kernel':'rbf', 'degree':2, 'gamma':.01, 'C':0.1, 'epsilon':0.2}]

    mae=mse=r2=0
    
    for c in parametros:
        clf = SVR(**c)
        # cross-validation
        mae=mse=r2=0
        kf = KFold(len(boston_Y), n_folds=10, indices=True)
        for train, test in kf:
            trainX, testX, trainY, testY=boston_X[train], boston_X[test], boston_Y[train], boston_Y[test]
            
            clf.fit(trainX, trainY)
            prediccion=clf.predict(testX)
            
            mae+=metrics.mean_absolute_error(testY, prediccion)
            mse+=metrics.mean_squared_error(testY, prediccion)
            r2+=metrics.r2_score(testY, prediccion)
        
            if c['kernel'] == 'linear':
                print clf.coef_  # coef_ is only available for the linear kernel
        print "Parameters: ", c
        print 'Mean absolute error: ', mae/len(kf), 'Mean squared error: ', mse/len(kf), 'R squared: ', r2/len(kf)
        mae=mse=r2=0
def test4():
	'''
	We assume that, for each year, 7.1~9.30 belongs to the summer model (model-1),
		12.1~2.28 belongs to the winter model (model-3),
		and the rest, 3.1~6.30 and 10.1~11.30, belongs to the spring model (model-2).'''
	model_1_train_x = x[:15]+x[285:375]+x[645:745]+x[1015:1105]+x[1375:1465]
	model_1_train_y = y[:15]+y[285:375]+y[645:745]+y[1015:1105]+y[1375:1465]
	model_2_train_x = x[15:75]+x[375:435]+x[745:805]+x[1105:1165]
	model_2_train_y = y[15:75]+y[375:435]+y[745:805]+y[1105:1165]
	model_3_train_x = x[75:165]+x[435:525]+x[805:895]+x[1165:1255]
	model_3_train_y = y[75:165]+y[435:525]+y[805:895]+y[1165:1255]
	model_4_train_x = x[165:285]+x[525:645]+x[895:1015]+x[1255:1375]
	model_4_train_y = y[165:285]+y[525:645]+y[895:1015]+y[1255:1375]
	model_1, model_2, model_3, model_4 = SVR(), SVR(), SVR(), SVR()
	model_1.fit(model_1_train_x, model_1_train_y)
	model_2.fit(model_2_train_x, model_2_train_y)
	model_3.fit(model_3_train_x, model_3_train_y)
	model_4.fit(model_4_train_x, model_4_train_y)
	model_1_test_x = x[1735:1825]
	model_1_test_y = y[1735:1825]
	model_2_test_x = x[1465:1525]+x[1825:1885]
	model_2_test_y = y[1465:1525]+y[1825:1885]
	model_3_test_x = x[1525:1615]+x[1885:1975]
	model_3_test_y = y[1525:1615]+y[1885:1975]
	model_4_test_x = x[1615:1735]+x[1975:]
	model_4_test_y = y[1615:1735]+y[1975:]
	model_1_pred, model_2_pred, model_3_pred, model_4_pred = model_1.predict(model_1_test_x), model_2.predict(model_2_test_x), model_3.predict(model_3_test_x), model_4.predict(model_4_test_x)
	calc_err(model_1_pred, model_1_test_y)
	calc_err(model_2_pred, model_2_test_y)
	calc_err(model_3_pred, model_3_test_y)
	calc_err(model_4_pred, model_4_test_y)
	calc_err(list(model_1_pred)+list(model_2_pred)+list(model_3_pred)+list(model_4_pred), model_1_test_y+model_2_test_y+model_3_test_y+model_4_test_y)
Example #14
class HotTweets:
	''' Train and get tweet hotness '''

	def __init__(self, kernel='rbf', C=1e3, gamma=0.1, epsilon=0.1, n_comp=100):
		''' Prepare support vector regression ''' 
		self.svr = SVR(kernel=kernel, C=C, gamma=gamma, epsilon=epsilon, verbose=True)
		#self.svr = LogisticRegression(random_state=42, verbose=0)
		self.n_comp = n_comp

	def fit_scaler(self, dev, i_dev):
		''' Train normalizers for features and importances '''
		# importance scaler
		self.std_scaler_i = sklearn.preprocessing.StandardScaler()
		self.std_scaler_i.fit(i_dev)
		self.norm = sklearn.preprocessing.StandardScaler()
		self.norm.fit(dev[:,0:self.n_comp])
	
	def train(self, features, importances):
		''' Train regression '''
		importances = self.std_scaler_i.transform(importances)
		features = self.norm.transform(features[:,0:self.n_comp])
		self.svr.fit(features, importances)
		
		
	def predict(self, features):
		''' Predict importances '''
		features = self.norm.transform(features[:,0:self.n_comp])
		results = self.svr.predict(features)
		#print results[0:100:5]
		results = self.std_scaler_i.inverse_transform(results)
		#print results[0:100:5]
		return results
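A minimal usage sketch; dev and i_dev are placeholders for a development feature matrix and a matching column of importance scores (both assumptions):

ht = HotTweets(kernel='rbf', C=1e3, gamma=0.1, n_comp=100)
ht.fit_scaler(dev, i_dev)       # fit the feature/importance scalers first
ht.train(dev, i_dev)            # then train the SVR on the scaled data
hotness = ht.predict(dev)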
def svm_regressor(features,target,test_size_percent=0.2,cv_split=5):
    
    scale_X = preprocessing.MinMaxScaler()
    scale_y = preprocessing.MinMaxScaler()   # use a separate scaler for the target
    X_array = scale_X.fit_transform(features)
    y_array = scale_y.fit_transform(target)
    X_train, X_test, y_train, y_test = train_test_split(X_array, y_array.T.squeeze(), test_size=test_size_percent, random_state=4)
    svr = SVR(kernel='rbf',C=10,gamma=1)
    svr.fit(X_train,y_train.ravel())
    test_prediction = svr.predict(X_test)
    tscv = TimeSeriesSplit(cv_split)
    
    training_score = cross_val_score(svr,X_train,y_train,cv=tscv.n_splits) 
    testing_score = cross_val_score(svr,X_test,y_test,cv=tscv.n_splits)
    print"Cross-val Training score:", training_score.mean()
#    print"Cross-val Testing score:", testing_score.mean()
    training_predictions = cross_val_predict(svr,X_train,y_train,cv=tscv.n_splits)
    testing_predictions = cross_val_predict(svr,X_test,y_test,cv=tscv.n_splits)
    
    training_accuracy = metrics.r2_score(y_train,training_predictions) 
#    test_accuracy_model = metrics.r2_score(y_test,test_prediction_model)
    test_accuracy = metrics.r2_score(y_test,testing_predictions)
    
#    print"Cross-val predicted accuracy:", training_accuracy
    print"Test-predictions accuracy:",test_accuracy
    return svr
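A short usage sketch; features and target stand for arrays of shape (n_samples, n_features) and (n_samples, 1):

model = svm_regressor(features, target, test_size_percent=0.2, cv_split=5)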
Example #16
def CaSVRModel(X_train, Y_train, X_test, Y_test, cv_iterator):
#     
#     param_grid = {'C':[10000],
#                    'epsilon':[0.001, 0.01, 0.05, 0.1, 0.15, 1]
#                    }
#       
#     svr = SVR(random_state=42, cache_size=1000, verbose=2)
#     search = GridSearchCV(svr, param_grid, scoring="mean_squared_error", n_jobs= 1, iid=True, cv=cv_iterator)
#     search.fit(X_train, Y_train["Ca"])
#     #search.grid_scores_
#       
#     model = search.best_estimator_

    #scaler = StandardScaler()

    model = SVR(C=10000, epsilon = 0.01, cache_size=1000)
    model.fit(X_train, Y_train["Ca"])
    #model.fit(X_train, Y_train["Ca"])
    
    #model.fit(X_train, Y_train["Ca"])
    
    #test = cross_val_score(svr, X_train.astype('float64'), Y_train["Ca"].astype('float64'), scoring="mean_squared_error", cv=cv_iterator)
    
    yhat_svr = model.predict(X_test)
    test_error = math.sqrt(mean_squared_error(Y_test["Ca"], yhat_svr))
    
    return model, test_error
Example #17
    def fit(self, start_date, end_date):

        for ticker in self.tickers:
            self.stocks[ticker] = Stock(ticker)

        params_svr = [{
            'kernel': ['rbf', 'sigmoid', 'linear'],
            'C': [0.01, 0.1, 1, 10, 100],
            'epsilon': [0.0000001, 0.000001, 0.00001]
            }]
        params = ParameterGrid(params_svr)

        # Find the split for training and CV
        mid_date = train_test_split(start_date, end_date)
        for ticker, stock in self.stocks.items():

            X_train, y_train = stock.get_data(start_date, mid_date, fit=True)
            # X_train = self.pca.fit_transform(X_train.values)
            X_train = X_train.values
            # pdb.set_trace()
            X_cv, y_cv = stock.get_data(mid_date, end_date)
            # X_cv = self.pca.transform(X_cv.values)
            X_cv = X_cv.values

            lowest_mse = np.inf
            for i, param in enumerate(params):
                svr = SVR(**param)
                # ada = AdaBoostRegressor(svr)
                svr.fit(X_train, y_train.values)
                mse = mean_squared_error(
                    y_cv, svr.predict(X_cv))
                if mse <= lowest_mse:
                    lowest_mse = mse  # track the best score so the comparison is meaningful
                    self.models[ticker] = svr

        return self
def machinelearning(csv_file):
  # parse CSV
  d = {}
  d['date'] = []
  d['radiation'] = []
  d['humidity'] = []
  d['temperature'] = []
  d['wind'] = []
  d['demand'] = []

  dictreader = csv.DictReader(csv_file, fieldnames=['date', 'radiation', 'humidity', 'temperature', 'wind', 'demand'], delimiter=',')

  next(dictreader)
  for row in dictreader:
    for key in row:
      d[key].append(row[key])

  # interpolate weather data
  interpolate(d['radiation'])
  interpolate(d['humidity'])
  interpolate(d['temperature'])
  interpolate(d['wind'])

  # train machine learning algorithm
  training_x = np.array(list(zip(d['radiation'], d['humidity'], d['temperature'], d['wind']))[:32])
  training_y = np.array(d['demand'][:32])

  poly_svr = SVR(kernel='poly', degree=2)
  poly_svr.fit(training_x, training_y)

  prediction_x = np.array(list(zip(d['radiation'], d['humidity'], d['temperature'], d['wind']))[32:])
  demand_predictions = poly_svr.predict(prediction_x)

  return demand_predictions
Example #19
def Sand_SVR(X_train, Y_train, X_test, Y_test, cv_iterator):
    
    #===========================================================================
    # param_grid = {'C':[100,500,1000, 5000, 10000, 100000],
    #               'epsilon':[0.075,0.1, 0.125]
    #               }
    #  
    # svr = SVR(cache_size = 1000, random_state=42)
    # search = GridSearchCV(svr, param_grid, scoring="mean_squared_error", cv=cv_iterator)
    #===========================================================================
    #search.fit(X_train, Y_train["Sand"])
    #search.grid_scores_
    
    #svr = search.best_estimator_ 
    #svr.fit(X_train, Y_train["SAND"])
    
    #test = cross_val_score(svr, X_train.astype('float64'), Y_train["Ca"].astype('float64'), scoring="mean_squared_error", cv=cv_iterator)
    
    svr = SVR(C=10000)
    svr.fit(X_train, Y_train["Sand"])
    
    yhat_svr = svr.predict(X_test)
    test_error = math.sqrt(mean_squared_error(Y_test["Sand"], yhat_svr))
    
    return svr, test_error
Example #20
def train_model(train, test, labels):
    clf = SVR(C=1.0, epsilon=0.2)
    clf.fit(train, labels)
    #clf = GaussianNB()
    #clf.fit(train, labels)
    print "Good!"
    predictions = clf.predict(test)
    print predictions.shape
    predictions = pd.DataFrame(predictions, columns = ['relevance'])
    print "Good again!"
    print "Predictions head -------"
    print predictions.head()
    print predictions.shape
    print "TEST head -------"
    print test.head()
    print test.shape
    test['id'].to_csv("TEST_TEST.csv",index=False)
    predictions.to_csv("PREDICTIONS.csv",index=False)
    #test = test.reset_index()
    #predictions = predictions.reset_index()
    #test = test.groupby(level=0).first()
    #predictions = predictions.groupby(level=0).first()
    predictions = pd.concat([test['id'],predictions], axis=1, verify_integrity=False)
    print predictions
    return predictions
Example #21
def main(args):
    (training_file, label_file, test_file, test_label, c, e) = args
    svr = SVR(C=float(c), epsilon=float(e), kernel='rbf')
    X = load_feat(training_file)
    y = [float(line.strip()) for line in open(label_file)]
    
    X = np.asarray(X)
     
    y = np.asarray(y)
    
    test_X = load_feat(test_file)
    test_X = np.asarray(test_X)
    test_X[np.isnan(test_X)] = 0

    svr.fit(X, y)
    
    pred = svr.predict(test_X)
    if test_label != 'none':
        test_y = [float(line.strip()) for line in open(test_label)]
        test_y = np.asarray(test_y)
        print 'MAE: ', mean_absolute_error(test_y, pred)
        print 'RMSE: ', sqrt(mean_squared_error(test_y, pred))
        print 'corrpearson: ', sp.stats.pearsonr(test_y, pred)
        print 'r-sqr: ', sp.stats.linregress(test_y, pred)[2] ** 2
        print mquantiles(test_y, prob=[0.10, 0.90])
        print mquantiles(pred, prob=[0.10, 0.90])
    with open(test_file + '.svr.pred', 'w') as output:
        for p in pred:
            print >>output, p
    return
Example #22
def svr_rbf(X_train, Y_train, X_validate):
    """Support vector regression, using RBF kernel"""
    SVR_RBF = SVR(kernel='rbf')
    SVR_RBF.fit(X_train, Y_train)
    Y_pred = SVR_RBF.predict(X_validate)
    write_to_file("SVR_RBF_Y_pred.csv", Y_pred)
    return Y_pred
Example #23
def learn(X, y):
    # do pca
    pca = PCA(n_components=6)
    pca_6 = pca.fit(X)

    print('variance ratio')
    print(pca_6.explained_variance_ratio_)
    X = pca.fit_transform(X)

    # X = np.concatenate((X_pca[:, 0].reshape(X.shape[0], 1), X_pca[:, 5].reshape(X.shape[0], 1)), axis=1)
    # do svr
    svr_rbf = SVR(kernel='rbf', C=1)
    svr_rbf.fit(X, y)
    # print(model_rbf)

    y_rbf = svr_rbf.predict(X)
    print(y_rbf)
    print(y)

    # see difference
    y_rbf = np.transpose(y_rbf)
    deviation(y, y_rbf)

    # pickle model
    with open('rbfmodel.pkl', 'wb') as f:
        pickle.dump(svr_rbf, f)

    with open('pcamodel.pkl', 'wb') as f:
        pickle.dump(pca_6, f)
Example #24
def SVM(train, test, tunings=None, smoteit=True, bin=True, regress=False):
    "SVM "
    if not isinstance(train, pd.core.frame.DataFrame):
        train = csv2DF(train, as_mtx=False, toBin=bin)

    if not isinstance(test, pd.core.frame.DataFrame):
        test = csv2DF(test, as_mtx=False, toBin=True)

    if smoteit:
        train = SMOTE(train, resample=True)
        # except: set_trace()
    if not tunings:
        if regress:
            clf = SVR()
        else:
            clf = SVC()
    else:
        if regress:
            clf = SVR()
        else:
            clf = SVC()

    features = train.columns[:-1]
    klass = train[train.columns[-1]]
    # set_trace()
    clf.fit(train[features], klass)
    actual = test[test.columns[-1]].values  # as_matrix() has been removed from pandas
    try:
        preds = clf.predict(test[test.columns[:-1]])
    except:
        set_trace()
    return actual, preds
class SVMRegressor(Regressor):
   def findImportantFeatures(self, numFeatures = 1000):
      #Selecting the important features
      self.features = []
      count = 0
      for key in sorted(self.trainSet.getVocabulary(), key = lambda word: self.trainSet.getUniqueWeightOf(word), reverse=True):
         count += 1
         self.features.append(key)
         if count == numFeatures:
            break
   def train(self, numFeatures = 1000):
      self.findImportantFeatures(numFeatures)
      self.vectorizer = CountVectorizer(vocabulary = self.features,min_df = 1)
      self.regressor = SVR(kernel='linear', C=25, epsilon=10)
      strings = []
      Y = []
      for docKey in self.trainSet.getDocuments():
         document = self.trainSet.getDocument(docKey)
         strings.append(" ".join(document.getBagOfWords2("all")))
         Y.append(document.getSalary())
      X = self.vectorizer.fit_transform(strings)
      self.regressor.fit(X,Y)
      Coef = self.regressor.coef_
      coef_list = Coef.toarray()
      #for i in range(len(coef_list[0])):
      #   if math.fabs(coef_list[0][i]-0.0) > 0.1:
      #      print self.features[i],coef_list[0][i]


   def predict(self, document):
      strings = []
      strings.append(" ".join(document.getBagOfWords2("all")))
      Z = self.vectorizer.transform(strings)  # vocabulary is already fixed, so transform (not fit_transform) is enough
      return self.regressor.predict(Z)[0]
Example #26
def P_SVRModel(X_train, Y_train, X_test, Y_test, cv_iterator):
    
    #===========================================================================
    # scaler = StandardScaler()
    # X_train = scaler.fit_transform(X_train)
    # X_test = scaler.transform(X_test)
    # 
    # 
    # param_grid = {'C':[0.0001, 0.001, 0.01, 0.1],
    #               'epsilon':[0.1, 0.01]
    #               }
    #   
    # svr = SVR(random_state=42, verbose = 2)
    # search = GridSearchCV(svr, param_grid, scoring="mean_squared_error", n_jobs=1, cv=cv_iterator, iid=False)
    # search.fit(X_train, Y_train["P"])
    # #search.grid_scores_
    # #svr = search.best_estimator_
    #===========================================================================
    
    svr = SVR(C=10000, epsilon=0.1)
    svr.fit(X_train, Y_train["P"])
    #test = cross_val_score(svr, X_train.astype('float64'), Y_train["Ca"].astype('float64'), scoring="mean_squared_error", cv=cv_iterator)
    
    yhat_svr = svr.predict(X_test)
    test_error = math.sqrt(mean_squared_error(Y_test["P"], yhat_svr))
    
    return svr, test_error
Example #27
class SVMLearner(object):

    def __init__(self, kernel="linear", C=1e3, gamma=0.1, degree=2, verbose=False):
        self.name = "{} Support Vector Machine Learner".format(kernel.capitalize())
        self.kernel = kernel
        if kernel == "linear":
            self.svr = SVR(kernel=kernel, C=C)
        elif kernel == "rbf":
            self.svr = SVR(kernel=kernel, C=C, gamma=gamma)
        elif kernel == "poly":
            self.svr = SVR(kernel=kernel, C=C, degree=degree)

    def addEvidence(self,dataX,dataY):
        """
        @summary: Add training data to learner
        @param dataX: X values of data to add
        @param dataY: the Y training values
        """
        # build and save the model
        self.svr.fit(dataX, dataY)
        
    def query(self,points):
        """
        @summary: Estimate a set of test points given the model we built.
        @param points: should be a numpy array with each row corresponding to a specific query.
        @returns the estimated values according to the saved model.
        """
        return self.svr.predict(points)
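A short usage sketch; dataX and dataY are placeholder numpy arrays of features and targets:

learner = SVMLearner(kernel="rbf", C=1e3, gamma=0.1)
learner.addEvidence(dataX, dataY)     # train on the in-sample data
predY = learner.query(dataX)          # estimate values for the query points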
Example #28
def main(args):
    (training_file, label_file, test_file, u_file, e, c, output_file, components) = args
    X_training = load_feat(training_file)
    n = len(X_training)
    U = load_feat(u_file)
    y_training = [float(line.strip()) for line in open(label_file)]
   
    U = np.asarray(U)
    X_training = np.asarray(X_training)
    #X = preprocessing.normalize(X, norm='l2')
    y_training = np.asarray(y_training)
    
    X_test = load_feat(test_file)
    # y_test = [float(line.strip()) for line in open(test_label)]  # test_label is not among the unpacked args and y_test is unused below
    X_test = np.asarray(X_test)
    X_test[np.isnan(X_test)] = 0.0
    #test_X = preprocessing.normalize(test_X, norm='l2')
    # y_test = np.asarray(y_test)
    s = min(len(X_training), len(U))

    
    cca = CCA(n_components=components, max_iter=50)
    (X_cca, U_cca) = cca.fit_transform(X_training[:s], U[:s])
    X_test_cca = cca.transform(X_test)
    
    svr = SVR(C=c, epsilon=e, kernel='rbf')
    svr.fit(X_cca, y_training[:s])    
    pred = svr.predict(X_test_cca)
    
 
    with open(output_file, 'w') as output:
        for p in pred:
            print >>output, p
    return
Example #29
def predict_device_byday_SVR():
    X,Y_unique,Y_all,X_raw = load_device_counter_byday()

    from sklearn.svm import SVR
    model = SVR()
    # model = SVR(kernel='linear')
    training_size = 160
    # model.fit(X[:training_size],Y_unique[:training_size])
    model.fit(X[:training_size],Y_all[:training_size])

    start_index = 180
    end_index = 190
    X_to_predict = X[start_index:end_index]
    # X_to_predict.append([date_str_toordinal('2017-04-18')])
    # X_to_predict.append([date_str_toordinal('2017-03-27')])

    print X_to_predict
    # Y_real = Y_unique[start_index:end_index]
    Y_real = Y_all[start_index:end_index]
    print X_raw[start_index:end_index]
    y_predicted=model.predict(X_to_predict)
    # print y_predicted
    y_predicted = np.array(y_predicted).astype(int)
    print y_predicted
    print Y_real
    # print y_predicted - np.array(Y_real)

    # plt.subplot(111)
    # plt.scatter(X_to_predict,Y_real,c='r')
    plt.scatter(X_to_predict,y_predicted)
    # plt.plot(X_to_predict,y_predicted)
    plt.show()
Example #30
def build_model(titles, X1, X3, X4, titles_test, X1_test, X3_test, X4_test, y, weights=None, params=[400, 10, 0, 0], top_words=10):
    '''
    X1: query length, title length, description-present flag, number of words from the query that also occurred in the title,
        compression distance between query and title, 1 - edit distance between query and title,
        1 - average(maximum edit distance between each word from the query and every word from the title),
        last-word-of-query-present-in-title flag, ratio of words from the query that also occurred in the title
    X3: Stanislav's features
    X4: Mikhail's features
    params list: [Number of SVD components, C in SVR, gamma in SVR]
    '''
    if top_words == 10:
        X5 = np.loadtxt(config.path_features + 'train_ext_counts_top10.txt')
        X5_test = np.loadtxt(config.path_features +
                             'test_ext_counts_top10.txt')
        queries_ext = np.array(pd.read_csv(
            config.path_features + 'train_ext_top10.csv')['query'])
        queries_ext_test = np.array(pd.read_csv(
            config.path_features + 'test_ext_top10.csv')['query'])
    elif top_words == 15:
        X5 = np.loadtxt(config.path_features + 'train_ext_counts_top15.txt')
        X5_test = np.loadtxt(config.path_features +
                             'test_ext_counts_top15.txt')
        queries_ext = np.array(pd.read_csv(
            config.path_features + 'train_ext_top15.csv')['query'])
        queries_ext_test = np.array(pd.read_csv(
            config.path_features + 'test_ext_top15.csv')['query'])
    else:
        raise ValueError('top_words must be 10 or 15')

    df_train = pd.DataFrame(np.c_[queries_ext, titles], columns=[
                            'query', 'product_title'])
    df_test = pd.DataFrame(np.c_[queries_ext_test, titles_test], columns=[
                           'query', 'product_title'])
    train_qt = list(df_train.apply(lambda x: '%s %s' %
                                   (x['query'], x['product_title']), axis=1))
    test_qt = list(df_test.apply(lambda x: '%s %s' %
                                 (x['query'], x['product_title']), axis=1))

    tfv = text.TfidfVectorizer(min_df=10,  max_features=None,
                               strip_accents='unicode', analyzer='char', token_pattern=r'\w{1,}',
                               ngram_range=(1, 3), use_idf=1, smooth_idf=1, sublinear_tf=1,
                               stop_words='english')
    tfv.fit(train_qt)
    X2 = tfv.transform(train_qt)
    X2_test = tfv.transform(test_qt)
    svd = TruncatedSVD(n_components=params[0])
    mms = MinMaxScaler()

    X = np.c_[svd.fit_transform(X2), X1, X4, X3, X5]
    X_test = np.c_[svd.transform(X2_test), X1_test, X4_test, X3_test, X5_test]

    X = mms.fit_transform(X)
    X_test = mms.transform(X_test)

    # train model
    clf = SVR(C=params[1], gamma=params[2], cache_size=2048, kernel='rbf')
    clf.fit(X, y, sample_weight=weights)
    p = clf.predict(X_test)
    return p
Example #31
    # truncated snippet head; a plausible GPGO objective (2-fold CV score of an SVR) is assumed here
    def f(x, y):
        return np.mean(cross_val_score(SVR(C=x, gamma=y),
                                       x_train,
                                       y=y_train,
                                       scoring='neg_mean_squared_error',
                                       cv=2))

    sexp = squaredExponential()
    gp = GaussianProcess(sexp)
    acq = Acquisition(mode='ExpectedImprovement')
    param = OrderedDict()
    param['x'] = ('cont', [1, 100])
    param['y'] = ('cont', [1, 100])

    gpgo = GPGO(gp, acq, f, param)
    gpgo.run(max_iter=200)
    best_x, best_y = gpgo.getResult()
    print('best_x:', best_x)
    print('best_y:', best_y)
    model_SVR = SVR(C=best_x[0], gamma=best_x[1])

    model_SVR.fit(x_train, y_train)
    y_predict = model_SVR.predict(x_test)

    RMSE_SVR = np.sqrt(mean_squared_error(y_test, y_predict))
    R2_SVR = r2_score(y_test, y_predict)
    MAE_SVR = median_absolute_error(y_test, y_predict)

    print('****************' + 'SVR' + '****************')
    print('RMSE_SVR:', RMSE_SVR)
    print('R2_SVR:', R2_SVR)
    print('MAE_SVR:', MAE_SVR)
# SVR

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('c:\\stock_market.csv')
X = dataset.iloc[:, 1:30].values

y = dataset.iloc[:, 30].values

#print(y)
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in newer scikit-learn
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.5,
                                                    random_state=0)
# Fitting SVR to the dataset
from sklearn.svm import SVR
regressor = SVR(kernel='linear')
regressor.fit(X_train, y_train)  # fit on the training split only, not the full dataset

y_pred = regressor.predict(X_test)
print(" The Predicted Output is \n", y_pred)
print(" The Actual Values are \n", y_test)
Example #33
dfTrain.drop('Expected', axis=1, inplace=True)
dfTrain.drop('Id', axis=1, inplace=True)

print "Training SVR with rbf Kernel..."
model.fit(dfTrain, yTrain)

print "Test Sets:", testSets
for i in testSets:
    print "Loading dataset basic_no_" + str(i) + 'sur16 into testSets...'
    l_dfTest.append(
        pd.read_csv('./data/f_df_train_' + featuresSet + '_' + str(i) +
                    'sur16.csv',
                    index_col=0))
dfTest = pd.concat(l_dfTest)
dfTest.dropna(inplace=True)
yTest = dfTest.Expected
dfTest.drop('Expected', axis=1, inplace=True)
dfTest.drop('Id', axis=1, inplace=True)

print "Predicting values..."
pred = model.predict(dfTest)

print "Calculating MAE scores:"
scores = (np.abs(pred - yTest)).mean()
print scores
print ""

t_end = time.time()
print "Time to process: ", t_end - t_start
print "-----------------------------------------"
start_time = time.time()

#loading the model with the chosen hyperparameters
SVR_Model_chosen = SVR(kernel="rbf",
                       epsilon=485.8971280256051,
                       C=371.8743120624125,
                       gamma=76.01511064212842)

#training the model
SVR_Model_chosen.fit(X_train_fs, Y_train.ravel())

#calculating the time taken to train
print("--- %s seconds ---" % (time.time() - start_time))

# Training RMSE - calculate and print
Y_train_SVR = SVR_Model_chosen.predict(X_train_fs)
rmse = (np.sqrt(mean_squared_error(Y_train, Y_train_SVR))) * 100
print(rf"The RMSE is {rmse:2.4f}%")

# ### Testing the model using Validation Data

# In[102]:

# Use the model to make predictions about the test data
Y_predict_SVR = SVR_Model_chosen.predict(X_test_fs)

# Generating a plot for actual and predicted values
plt.figure(figsize=[10, 4])
plt.plot(Y_test, "b-", label="Actual Value of Y")
plt.plot(Y_predict_SVR, "g*", label="Predicted Value of Y")
plt.title("Predicition using Support Vector Regressor")
Example #35
### SVR Regression ###
################################

# ** NOTE - SVR does not do feature scaling internally, so we scale X and y explicitly
ss_x = StandardScaler()
ss_y = StandardScaler()
X_scaled = ss_x.fit_transform(X)
y_scaled = ss_y.fit_transform(y.reshape(-1, 1))

svr_regressor = SVR(kernel="rbf")
svr_regressor.fit(X_scaled, y_scaled.ravel())  # ravel() passes y as the 1-D array fit() expects

# Predict - since the features were scaled,
# the query value 6.5 has to be transformed the same way
position_val = ss_x.transform([[6.5]])
pred_val_scaled = svr_regressor.predict(position_val)
# The above statement returns the prediction on the scaled target axis,
# so convert it back with inverse_transform
svr_pred = ss_y.inverse_transform(pred_val_scaled)
print(
    'The predicted salary of a person at 6.5 Level with Support Vector Regression is ',
    svr_pred)

################################
### Decision Tree Regression ###
################################
tree_regressor = DecisionTreeRegressor(criterion="mse")
tree_regressor.fit(X, y)

# Predict
tree_pred = tree_regressor.predict([[6.5]])
def hyperopt_obj(param, feat_folder, feat_name, trial_counter):
    global loaded, split_data
    if loaded is None:
        split_data, X_train_all, labels_train_all = load_data(1, 1)
    log_loss_cv = np.zeros((split_data.shape[0], split_data.shape[1]),
                           dtype=float)
    year = datetime.datetime.now().year
    # for run in range(1, split_data.shape[0] + 1):  # range(start, end) includes start, excludes end
    #     for fold in range(1, split_data.shape[1] + 1):
    #         rng = np.random.RandomState(datetime.datetime.now().year + 1000 * run + 10 * fold)
    #         #### all the path
    #         path = "%s/Run%d/Fold%d" % (feat_folder, run, fold)
    #         save_path = "%s/Run%d/Fold%d" % (output_path, run, fold)
    #         if not os.path.exists(save_path):
    #             os.makedirs(save_path)
    #         # feat: combine feat file
    #         feat_train_path = "%s/valid.feat" % path
    #         feat_valid_path = "%s/train.feat" % path
    #         raw_pred_valid_path = "%s/valid.raw.pred.%s_[Id@%d].csv" % (save_path, feat_name, trial_counter)  #
    #         rank_pred_valid_path = "%s/valid.pred.%s_[Id@%d].csv" % (save_path, feat_name, trial_counter)  #
    #         X_train, labels_train, X_valid, labels_valid = split_data[run -1, fold -1]
    #         numTrain = X_train.shape[0]
    #         numValid = X_valid.shape[0]
    #         Y_valid = labels_valid
    #         # ## make evalerror func (evaluation functions)
    #         # evalerror_regrank_valid = lambda preds,dtrain: evalerror_regrank_cdf(preds, dtrain, cdf_valid)
    #         # evalerror_softmax_valid = lambda preds,dtrain: evalerror_softmax_cdf(preds, dtrain, cdf_valid)
    #         # evalerror_softkappa_valid = lambda preds,dtrain: evalerror_softkappa_cdf(preds, dtrain, cdf_valid)
    #         # evalerror_ebc_valid = lambda preds,dtrain: evalerror_ebc_cdf(preds, dtrain, cdf_valid, ebc_hard_threshold)
    #         # evalerror_cocr_valid = lambda preds,dtrain: evalerror_cocr_cdf(preds, dtrain, cdf_valid)
    #
    #         ##############
    #         ## Training ##
    #         ##############
    #         ## you can use bagging to stabilize the predictions (it makes the model more stable)
    #         preds_bagging = np.zeros((numValid, bagging_size), dtype=float)
    #         for n in range(bagging_size):
    #             if bootstrap_replacement:
    #                 sampleSize = int(numTrain * bootstrap_ratio)  # bootstrap_ratio: fraction of the training samples to use
    #                 index_base = rng.randint(numTrain, size=sampleSize)
    #                 index_meta = [i for i in range(numTrain) if i not in index_base]
    #             else:
    #                 randnum = rng.uniform(size=numTrain)  # draw uniformly distributed random numbers in [0, 1), one per training sample
    #                 index_base = [i for i in range(numTrain) if randnum[i] < bootstrap_ratio]
    #                 index_meta = [i for i in range(numTrain) if randnum[i] >= bootstrap_ratio]
    #
    #             # if using xgb, first convert the data into the format xgb expects
    #             if "booster" in param:
    #                 dvalid_base = xgb.DMatrix(X_valid, label=labels_valid)  # , weight=weight_valid
    #                 dtrain_base = xgb.DMatrix(X_train[index_base],
    #                                           label=labels_train[index_base])  # , weight=weight_train[index_base]
    #
    #                 watchlist = []
    #                 if verbose_level >= 2:
    #                     watchlist = [(dtrain_base, 'train'), (dvalid_base, 'valid')]
    #
    #             ## various models
    #             if param["task"] in ["regression", "ranking"]:
    #                 ## regression & pairwise ranking with xgboost
    #                 bst = xgb.train(param, dtrain_base, param['num_round'],
    #                                 watchlist)  # , feval=evalerror_regrank_valid
    #                 pred = bst.predict(dvalid_base)
    #
    #             if param["task"] in ["classification"]:
    #                 ## regression & pairwise ranking with xgboost
    #                 bst = xgb.train(param, dtrain_base, param['num_round'],
    #                                 watchlist)  # , feval=evalerror_regrank_valid
    #                 pred = bst.predict(dvalid_base)
    #
    #             elif param["task"] in ["softmax"]:
    #                 ## softmax regression with xgboost
    #                 bst = xgb.train(param, dtrain_base, param['num_round'],
    #                                 watchlist)  # , feval=evalerror_softmax_valid
    #                 pred = bst.predict(dvalid_base)
    #                 w = np.asarray(range(1, numValid))
    #                 pred = pred * w[np.newaxis, :]  # np.newaxis inserts a dimension (equivalent to w[np.newaxis]); here pred is an n*1 matrix and w[np.newaxis, :] is 1*n; note w was originally a 1-D array
    #                 pred = np.sum(pred, axis=1)
    #
    #             elif param["task"] in ["softkappa"]:
    #                 ## softkappa with xgboost (custom loss function)
    #                 # obj = lambda preds, dtrain: softkappaObj(preds, dtrain, hess_scale=param['hess_scale'])
    #                 bst = xgb.train(param, dtrain_base, param['num_round'],
    #                                 watchlist)  # , obj=obj, feval=evalerror_softkappa_valid
    #                 pred = softmax(bst.predict(dvalid_base))
    #                 w = np.asarray(range(1, numValid))
    #                 pred = pred * w[np.newaxis, :]
    #                 pred = np.sum(pred, axis=1)
    #
    #             elif param["task"] in ["ebc"]:
    #                 ## ebc with xgboost (custom loss function)
    #                 # obj = lambda preds, dtrain: ebcObj(preds, dtrain)
    #                 bst = xgb.train(param, dtrain_base, param['num_round'],
    #                                 watchlist)  # , obj=obj, feval=evalerror_ebc_valid
    #                 pred = sigmoid(bst.predict(dvalid_base))
    #                 pred = applyEBCRule(pred, hard_threshold=ebc_hard_threshold)
    #
    #             elif param["task"] in ["cocr"]:
    #                 ## cocr with xgboost (custom loss function)
    #                 # obj = lambda preds, dtrain: cocrObj(preds, dtrain)
    #                 bst = xgb.train(param, dtrain_base, param['num_round'],
    #                                 watchlist)  # , obj=obj, feval=evalerror_cocr_valid
    #                 pred = bst.predict(dvalid_base)
    #                 pred = applyCOCRRule(pred)
    #
    #             elif param['task'] == "reg_skl_rf":
    #                 ## regression with sklearn random forest regressor
    #                 rf = RandomForestRegressor(n_estimators=param['n_estimators'],
    #                                            max_features=param['max_features'],
    #                                            n_jobs=param['n_jobs'],
    #                                            random_state=param['random_state'])
    #                 rf.fit(X_train[index_base], labels_train[index_base])  # , sample_weight=weight_train[index_base]
    #                 pred = rf.predict(X_valid)
    #
    #             elif param['task'] == "reg_skl_etr":
    #                 ## regression with sklearn extra trees regressor
    #                 etr = ExtraTreesRegressor(n_estimators=param['n_estimators'],
    #                                           max_features=param['max_features'],
    #                                           n_jobs=param['n_jobs'],
    #                                           random_state=param['random_state'])
    #                 etr.fit(X_train[index_base], labels_train[index_base])  # , sample_weight=weight_train[index_base]
    #                 pred = etr.predict(X_valid)
    #
    #             elif param['task'] == "reg_skl_gbm":
    #                 ## regression with sklearn gradient boosting regressor
    #                 gbm = GradientBoostingRegressor(n_estimators=param['n_estimators'],
    #                                                 max_features=param['max_features'],
    #                                                 learning_rate=param['learning_rate'],
    #                                                 max_depth=param['max_depth'],
    #                                                 subsample=param['subsample'],
    #                                                 random_state=param['random_state'])
    #                 gbm.fit(X_train.toarray()[index_base],
    #                         labels_train[index_base])  # , sample_weight=weight_train[index_base]
    #                 pred = gbm.predict(X_valid.toarray())
    #
    #             elif param['task'] == "clf_skl_lr":
    #                 ## classification with sklearn logistic regression
    #                 lr = LogisticRegression(penalty="l2", dual=True, tol=1e-5,
    #                                         C=param['C'], fit_intercept=True, intercept_scaling=1.0,
    #                                         class_weight='auto', random_state=param['random_state'])
    #                 lr.fit(X_train[index_base], labels_train[index_base])
    #                 pred = lr.predict_proba(X_valid)
    #                 w = np.asarray(range(1, numValid))
    #                 pred = pred * w[np.newaxis, :]
    #                 pred = np.sum(pred, axis=1)
    #
    #             elif param['task'] == "reg_skl_svr":
    #                 ## regression with sklearn support vector regression
    #                 X_train, X_valid = X_train.toarray(), X_valid.toarray()
    #                 scaler = StandardScaler()
    #                 X_train[index_base] = scaler.fit_transform(X_train[index_base])
    #                 X_valid = scaler.transform(X_valid)
    #                 svr = SVR(C=param['C'], gamma=param['gamma'], epsilon=param['epsilon'],
    #                           degree=param['degree'], kernel=param['kernel'])
    #                 svr.fit(X_train[index_base], labels_train[index_base])  # , sample_weight=weight_train[index_base]
    #                 pred = svr.predict(X_valid)
    #
    #             elif param['task'] == "reg_skl_ridge":
    #                 ## regression with sklearn ridge regression
    #                 ridge = Ridge(alpha=param["alpha"], normalize=True)
    #                 ridge.fit(X_train[index_base], labels_train[index_base])  # , sample_weight=weight_train[index_base]
    #                 pred = ridge.predict(X_valid)
    #
    #             elif param['task'] == "reg_skl_lasso":
    #                 ## regression with sklearn lasso
    #                 lasso = Lasso(alpha=param["alpha"], normalize=True)
    #                 lasso.fit(X_train[index_base], labels_train[index_base])
    #                 pred = lasso.predict(X_valid)
    #
    #             elif param['task'] == 'reg_libfm':
    #                 ## regression with factorization machine (libfm)
    #                 ## to array
    #                 X_train = X_train.toarray()
    #                 X_valid = X_valid.toarray()
    #
    #                 ## scale
    #                 scaler = StandardScaler()
    #                 X_train[index_base] = scaler.fit_transform(X_train[index_base])
    #                 X_valid = scaler.transform(X_valid)
    #
    #                 ## dump feat
    #                 dump_svmlight_file(X_train[index_base], labels_train[index_base], feat_train_path + ".tmp")
    #                 dump_svmlight_file(X_valid, labels_valid, feat_valid_path + ".tmp")
    #
    #                 ## train fm
    #                 cmd = "%s -task r -train %s -test %s -out %s -dim '1,1,%d' -iter %d > libfm.log" % ( \
    #                     libfm_exe, feat_train_path + ".tmp", feat_valid_path + ".tmp", raw_pred_valid_path, \
    #                     param['dim'], param['iter'])
    #                 os.system(cmd)
    #                 os.remove(feat_train_path + ".tmp")
    #                 os.remove(feat_valid_path + ".tmp")
    #
    #                 ## extract libfm prediction
    #                 pred = np.loadtxt(raw_pred_valid_path, dtype=float)
    #                 ## labels are in [0,1,2,3]
    #                 pred += 1
    #
    #             # elif param['task'] == "reg_keras_dnn":
    #             #     ## regression with keras' deep neural networks
    #             #     model = Sequential()
    #             #     ## input layer
    #             #     model.add(Dropout(param["input_dropout"]))
    #             #     ## hidden layers
    #             #     first = True
    #             #     hidden_layers = param['hidden_layers']
    #             #     while hidden_layers > 0:
    #             #         if first:
    #             #             dim = X_train.shape[1]
    #             #             first = False
    #             #         else:
    #             #             dim = param["hidden_units"]
    #             #         model.add(Dense(dim, param["hidden_units"], init='glorot_uniform'))
    #             #         if param["batch_norm"]:
    #             #             model.add(BatchNormalization((param["hidden_units"],)))
    #             #         if param["hidden_activation"] == "prelu":
    #             #             model.add(PReLU((param["hidden_units"],)))
    #             #         else:
    #             #             model.add(Activation(param['hidden_activation']))
    #             #         model.add(Dropout(param["hidden_dropout"]))
    #             #         hidden_layers -= 1
    #             #
    #             #     ## output layer
    #             #     model.add(Dense(param["hidden_units"], 1, init='glorot_uniform'))
    #             #     model.add(Activation('linear'))
    #             #
    #             #     ## loss
    #             #     model.compile(loss='mean_squared_error', optimizer="adam")
    #             #
    #             #     ## to array
    #             #     X_train = X_train.toarray()
    #             #     X_valid = X_valid.toarray()
    #             #
    #             #     ## scale
    #             #     scaler = StandardScaler()
    #             #     X_train[index_base] = scaler.fit_transform(X_train[index_base])
    #             #     X_valid = scaler.transform(X_valid)
    #             #
    #             #     ## train
    #             #     model.fit(X_train[index_base], labels_train[index_base],
    #             #                 nb_epoch=param['nb_epoch'], batch_size=param['batch_size'],
    #             #                 validation_split=0, verbose=0)
    #             #
    #             #     ##prediction
    #             #     pred = model.predict(X_valid, verbose=0)
    #             #     pred.shape = (X_valid.shape[0],)
    #
    #             elif param['task'] == "reg_rgf":
    #                 ## regression with regularized greedy forest (rgf)
    #                 ## to array
    #                 X_train, X_valid = X_train.toarray(), X_valid.toarray()
    #
    #                 train_x_fn = feat_train_path + ".x"
    #                 train_y_fn = feat_train_path + ".y"
    #                 valid_x_fn = feat_valid_path + ".x"
    #                 valid_pred_fn = feat_valid_path + ".pred"
    #
    #                 model_fn_prefix = "rgf_model"
    #
    #                 np.savetxt(train_x_fn, X_train[index_base], fmt="%.6f", delimiter='\t')
    #                 np.savetxt(train_y_fn, labels_train[index_base], fmt="%d", delimiter='\t')
    #                 np.savetxt(valid_x_fn, X_valid, fmt="%.6f", delimiter='\t')
    #                 # np.savetxt(valid_y_fn, labels_valid, fmt="%d", delimiter='\t')
    #
    #
    #                 pars = [
    #                     "train_x_fn=", train_x_fn, "\n",
    #                     "train_y_fn=", train_y_fn, "\n",
    #                     # "train_w_fn=",weight_train_path,"\n",
    #                     "model_fn_prefix=", model_fn_prefix, "\n",
    #                     "reg_L2=", param['reg_L2'], "\n",
    #                     # "reg_depth=", 1.01, "\n",
    #                     "algorithm=", "RGF", "\n",
    #                     "loss=", "LS", "\n",
    #                     # "opt_interval=", 100, "\n",
    #                     "valid_interval=", param['max_leaf_forest'], "\n",
    #                     "max_leaf_forest=", param['max_leaf_forest'], "\n",
    #                     "num_iteration_opt=", param['num_iteration_opt'], "\n",
    #                     "num_tree_search=", param['num_tree_search'], "\n",
    #                     "min_pop=", param['min_pop'], "\n",
    #                     "opt_interval=", param['opt_interval'], "\n",
    #                     "opt_stepsize=", param['opt_stepsize'], "\n",
    #                     "NormalizeTarget"
    #                 ]
    #                 pars = "".join([str(p) for p in pars])
    #
    #                 rfg_setting_train = "./rfg_setting_train"
    #                 with open(rfg_setting_train + ".inp", "wb") as f:
    #                     f.write(pars)
    #
    #                 ## train rgf
    #                 cmd = "perl %s %s train %s >> rgf.log" % (
    #                     call_exe, rgf_exe, rfg_setting_train)
    #                 # print cmd
    #                 os.system(cmd)
    #
    #                 model_fn = model_fn_prefix + "-01"
    #                 pars = [
    #                     "test_x_fn=", valid_x_fn, "\n",
    #                     "model_fn=", model_fn, "\n",
    #                     "prediction_fn=", valid_pred_fn
    #                 ]
    #
    #                 pars = "".join([str(p) for p in pars])
    #
    #                 rfg_setting_valid = "./rfg_setting_valid"
    #                 with open(rfg_setting_valid + ".inp", "wb") as f:
    #                     f.write(pars)
    #                 cmd = "perl %s %s predict %s >> rgf.log" % (
    #                     call_exe, rgf_exe, rfg_setting_valid)
    #                 # print cmd
    #                 os.system(cmd)
    #
    #                 pred = np.loadtxt(valid_pred_fn, dtype=float)
    #
    #             ## weighted averaging over different models
    #             pred_valid = pred
    #             ## this bagging iteration
    #             preds_bagging[:, n] = pred_valid  # column n of preds_bagging holds this round's validation predictions
    #             pred_raw = np.mean(preds_bagging[:, :(n + 1)], axis=1)  # row-wise average over the bagging rounds so far
    #             # pred_rank = pred_raw.argsort().argsort()    # argsort twice yields each prediction's rank
    #             # pred_score, cutoff = getScore(pred_rank, cdf_valid, valid=True) # map ranks to scores via the cdf
    #             # kappa_valid = quadratic_weighted_kappa(pred_score, Y_valid) # compute the kappa score
    #             log_loss_valid = elementwise.log_loss(Y_valid, pred_raw)
    #             print('Y_valid mean:', np.mean(Y_valid))
    #             print('pred_raw mean:', np.mean(pred_raw))
    #             if (n + 1) != bagging_size:
    #                 print("              {:>3}   {:>3}   {:>3}   {:>6}   {} x {}".format(
    #                     run, fold, n + 1, np.round(log_loss_valid, 6), X_train.shape[0], X_train.shape[1]))
    #             else:
    #                 print("                    {:>3}       {:>3}      {:>3}    {:>8}  {} x {}".format(
    #                     run, fold, n + 1, np.round(log_loss_valid, 6), X_train.shape[0], X_train.shape[1]))
    #         log_loss_cv[run - 1, fold - 1] = log_loss_valid
    #         ## save this prediction (the raw per-sample predictions)
    #         dfPred = pd.DataFrame({"target": Y_valid, "prediction": pred_raw})
    #         dfPred.to_csv(raw_pred_valid_path, index=False, header=True, columns=["target", "prediction"])
    #         # save this prediction (predictions ranked and then mapped through the cdf)
    #         # dfPred = pd.DataFrame({"target": Y_valid, "prediction": pred_rank})
    #         # dfPred.to_csv(rank_pred_valid_path, index=False, header=True, columns=["target", "prediction"])
    #
    # log_loss_cv_mean = np.mean(log_loss_cv)
    # log_loss_cv_std = np.std(log_loss_cv)
    # if verbose_level >= 1:
    #     print("              Mean: %.6f" % log_loss_cv_mean)
    #     print("              Std: %.6f" % log_loss_cv_std)

    ####################
    #### Retraining ####
    ####################
    #### all the path
    path = "%s/All" % (feat_folder)
    save_path = "%s/All" % output_path
    subm_path = "%s/Subm" % output_path
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    if not os.path.exists(subm_path):
        os.makedirs(subm_path)
    # feat
    feat_train_path = "%s/train.feat" % path
    feat_test_path = "%s/test.feat" % path
    # weight
    # weight_train_path = "%s/train.feat.weight" % path
    # info
    info_train_path = "%s/train.info" % path
    info_test_path = "%s/test.info" % path
    # cdf
    # cdf_test_path = "%s/test.cdf" % path
    # raw prediction path (rank)
    raw_pred_test_path = "%s/test.raw.pred.%s_[Id@%d]_[Run_Time@%s].csv" % (
        save_path, feat_name, trial_counter,
        time.strftime("%Y%m%d%H%M%S", time.localtime()))
    rank_pred_test_path = "%s/test.pred.%s_[Id@%d].csv" % (
        save_path, feat_name, trial_counter)
    # submission path (is_duplicate as in [0, 1])
    # subm_path = "%s/test.pred.%s_[Id@%d]_[Mean%.6f]_[Std%.6f].csv" % (subm_path, feat_name, trial_counter, log_loss_cv_mean, log_loss_cv_std)

    #### load data
    ## load feat
    # X_train, labels_train = load_svmlight_file(feat_train_path)
    X_train, labels_train = X_train_all, labels_train_all
    print('X_train_all.shape:', X_train_all.shape)
    print('labels_train_all.mean:', np.mean(labels_train_all))
    X_test, labels_test = load_svmlight_file(feat_test_path)
    # if X_test.shape[1] < X_train.shape[1]:
    #     X_test = hstack([X_test, np.zeros((X_test.shape[0], X_train.shape[1]-X_test.shape[1]))])
    # elif X_test.shape[1] > X_train.shape[1]:
    #     X_train = hstack([X_train, np.zeros((X_train.shape[0], X_test.shape[1]-X_train.shape[1]))])
    # X_train = X_train.tocsr()
    # X_test = X_test.tocsr()
    # downsample the training data so the train/test ratio stays consistent

    ## load train weight
    # weight_train = np.loadtxt(weight_train_path, dtype=float)
    ## load test info
    info_train = pd.read_csv(info_train_path)
    numTrain = X_train.shape[0]
    info_test = pd.read_csv(info_test_path)
    numTest = info_test.shape[0]
    id_test = info_test["test_id"]
    numValid = info_test.shape[0]
    ## load cdf
    # cdf_test = np.loadtxt(cdf_test_path, dtype=float)
    # ## evaluation functions
    # evalerror_regrank_test = lambda preds,dtrain: evalerror_regrank_cdf(preds, dtrain, cdf_test)
    # evalerror_softmax_test = lambda preds,dtrain: evalerror_softmax_cdf(preds, dtrain, cdf_test)
    # evalerror_softkappa_test = lambda preds,dtrain: evalerror_softkappa_cdf(preds, dtrain, cdf_test)
    # evalerror_ebc_test = lambda preds,dtrain: evalerror_ebc_cdf(preds, dtrain, cdf_test, ebc_hard_threshold)
    # evalerror_cocr_test = lambda preds,dtrain: evalerror_cocr_cdf(preds, dtrain, cdf_test)
    ## bagging
    preds_bagging = np.zeros((numTest, bagging_size), dtype=float)
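    # each column of preds_bagging will hold one bagging round's predictions on the test set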
    for n in range(bagging_size):
        print("", n, " runs training start")
        rng = np.random.RandomState(datetime.datetime.now().year + 1000 * n +
                                    10 * 1)
        if bootstrap_replacement:
            sampleSize = int(numTrain * bootstrap_ratio)
            #index_meta = rng.randint(numTrain, size=sampleSize)
            #index_base = [i for i in range(numTrain) if i not in index_meta]
            index_base = rng.randint(numTrain, size=sampleSize)
            index_meta = [i for i in range(numTrain) if i not in index_base]
        else:
            randnum = rng.uniform(size=numTrain)
            index_base = [
                i for i in range(numTrain) if randnum[i] < bootstrap_ratio
            ]
            index_meta = [
                i for i in range(numTrain) if randnum[i] >= bootstrap_ratio
            ]
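        # index_base: rows used to fit this bagging round; index_meta: the remaining (out-of-bag) rows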

        # if the model is xgboost, convert the data to the DMatrix format it needs first
        if "booster" in param:
            dtest = xgb.DMatrix(X_test, label=labels_test)
            dtrain = xgb.DMatrix(X_train[index_base],
                                 label=labels_train[index_base]
                                 )  # , weight=weight_train[index_base]

            watchlist = []
            if verbose_level >= 2:
                watchlist = [(dtrain, 'train')]

        ## train
        if param["task"] in ["regression", "ranking"]:
            bst = xgb.train(param, dtrain, param['num_round'],
                            watchlist)  # , feval=evalerror_regrank_test
            pred = bst.predict(dtest)

        if param["task"] in ["classification"]:
            ## regression & pairwise ranking with xgboost
            bst = xgb.train(param, dtrain, param['num_round'],
                            watchlist)  # , feval=evalerror_softmax_test
            pred = bst.predict(dtest)

        elif param["task"] in ["softmax"]:
            bst = xgb.train(param, dtrain, param['num_round'],
                            watchlist)  # , feval=evalerror_softmax_test
            pred = bst.predict(dtest)
            w = np.asarray(range(1, numValid))
            pred = pred * w[np.newaxis, :]
            pred = np.sum(pred, axis=1)

        elif param["task"] in ["softkappa"]:
            # custom objective function
            # obj = lambda preds, dtrain: softkappaObj(preds, dtrain, hess_scale=param['hess_scale'])
            bst = xgb.train(
                param, dtrain, param['num_round'],
                watchlist)  # , obj=obj, feval=evalerror_softkappa_test
            pred = softmax(bst.predict(dtest))
            w = np.asarray(range(1, numValid))
            pred = pred * w[np.newaxis, :]
            pred = np.sum(pred, axis=1)

        elif param["task"] in ["ebc"]:
            # custom objective function
            # obj = lambda preds, dtrain: ebcObj(preds, dtrain)
            bst = xgb.train(param, dtrain, param['num_round'],
                            watchlist)  # , obj=obj, feval=evalerror_ebc_test
            pred = sigmoid(bst.predict(dtest))
            pred = applyEBCRule(pred, hard_threshold=ebc_hard_threshold)

        elif param["task"] in ["cocr"]:
            # custom objective function
            obj = lambda preds, dtrain: cocrObj(preds, dtrain)
            bst = xgb.train(param, dtrain, param['num_round'],
                            watchlist)  # , obj=obj, feval=evalerror_cocr_test
            pred = bst.predict(dtest)
            pred = applyCOCRRule(pred)

        elif param['task'] == "reg_skl_rf":
            ## random forest regressor
            rf = RandomForestRegressor(n_estimators=param['n_estimators'],
                                       max_features=param['max_features'],
                                       n_jobs=param['n_jobs'],
                                       random_state=param['random_state'])
            rf.fit(X_train[index_base], labels_train[index_base]
                   )  # , sample_weight=weight_train[index_base]
            pred = rf.predict(X_test)

        elif param['task'] == "reg_skl_etr":
            ## extra trees regressor
            etr = ExtraTreesRegressor(n_estimators=param['n_estimators'],
                                      max_features=param['max_features'],
                                      n_jobs=param['n_jobs'],
                                      random_state=param['random_state'])
            etr.fit(X_train[index_base], labels_train[index_base]
                    )  # , sample_weight=weight_train[index_base]
            pred = etr.predict(X_test)

        elif param['task'] == "reg_skl_gbm":
            ## gradient boosting regressor
            gbm = GradientBoostingRegressor(
                n_estimators=param['n_estimators'],
                max_features=param['max_features'],
                learning_rate=param['learning_rate'],
                max_depth=param['max_depth'],
                subsample=param['subsample'],
                random_state=param['random_state'])
            gbm.fit(X_train.toarray()[index_base], labels_train[index_base]
                    )  #, sample_weight=weight_train[index_base]
            pred = gbm.predict(X_test.toarray())

        elif param['task'] == "clf_skl_lr":
            lr = LogisticRegression(penalty="l2",
                                    dual=True,
                                    tol=1e-5,
                                    C=param['C'],
                                    fit_intercept=True,
                                    intercept_scaling=1.0,
                                    class_weight='auto',
                                    random_state=param['random_state'])
            lr.fit(X_train[index_base], labels_train[index_base])
            pred = lr.predict_proba(X_test)
            w = np.asarray(range(1, numValid))
            pred = pred * w[np.newaxis, :]
            pred = np.sum(pred, axis=1)

        elif param['task'] == "reg_skl_svr":
            ## regression with sklearn support vector regression
            X_train, X_test = X_train.toarray(), X_test.toarray()
            scaler = StandardScaler()
            X_train[index_base] = scaler.fit_transform(X_train[index_base])
            X_test = scaler.transform(X_test)
            svr = SVR(C=param['C'],
                      gamma=param['gamma'],
                      epsilon=param['epsilon'],
                      degree=param['degree'],
                      kernel=param['kernel'])
            svr.fit(X_train[index_base], labels_train[index_base]
                    )  # , sample_weight=weight_train[index_base]
            pred = svr.predict(X_test)

        elif param['task'] == "reg_skl_ridge":
            ridge = Ridge(alpha=param["alpha"], normalize=True)
            ridge.fit(X_train[index_base], labels_train[index_base]
                      )  # , sample_weight=weight_train[index_base]
            pred = ridge.predict(X_test)

        elif param['task'] == "reg_skl_lasso":
            lasso = Lasso(alpha=param["alpha"], normalize=True)
            lasso.fit(X_train[index_base], labels_train[index_base])
            pred = lasso.predict(X_test)

        elif param['task'] == 'reg_libfm':
            ## to array
            X_train, X_test = X_train.toarray(), X_test.toarray()

            ## scale
            scaler = StandardScaler()
            X_train[index_base] = scaler.fit_transform(X_train[index_base])
            X_test = scaler.transform(X_test)

            ## dump feat
            dump_svmlight_file(X_train[index_base], labels_train[index_base],
                               feat_train_path + ".tmp")
            dump_svmlight_file(X_test, labels_test, feat_test_path + ".tmp")

            ## train fm
            cmd = "%s -task r -train %s -test %s -out %s -dim '1,1,%d' -iter %d > libfm.log" % ( \
                        libfm_exe, feat_train_path+".tmp", feat_test_path+".tmp", raw_pred_test_path, \
                        param['dim'], param['iter'])
            os.system(cmd)
            os.remove(feat_train_path + ".tmp")
            os.remove(feat_test_path + ".tmp")

            ## extract libfm prediction
            pred = np.loadtxt(raw_pred_test_path, dtype=float)
            ## labels are in [0,1,2,3]
            pred += 1

        # elif param['task'] == "reg_keras_dnn":
        #     ## regression with keras deep neural networks
        #     model = Sequential()
        #     ## input layer
        #     model.add(Dropout(param["input_dropout"]))
        #     ## hidden layers
        #     first = True
        #     hidden_layers = param['hidden_layers']
        #     while hidden_layers > 0:
        #         if first:
        #             dim = X_train.shape[1]
        #             first = False
        #         else:
        #             dim = param["hidden_units"]
        #         model.add(Dense(dim, param["hidden_units"], init='glorot_uniform'))
        #         if param["batch_norm"]:
        #             model.add(BatchNormalization((param["hidden_units"],)))
        #         if param["hidden_activation"] == "prelu":
        #             model.add(PReLU((param["hidden_units"],)))
        #         else:
        #             model.add(Activation(param['hidden_activation']))
        #         model.add(Dropout(param["hidden_dropout"]))
        #         hidden_layers -= 1
        #
        #     ## output layer
        #     model.add(Dense(param["hidden_units"], 1, init='glorot_uniform'))
        #     model.add(Activation('linear'))
        #
        #     ## loss
        #     model.compile(loss='mean_squared_error', optimizer="adam")
        #
        #     ## to array
        #     X_train = X_train.toarray()
        #     X_test = X_test.toarray()
        #
        #     ## scale
        #     scaler = StandardScaler()
        #     X_train[index_base] = scaler.fit_transform(X_train[index_base])
        #     X_test = scaler.transform(X_test)
        #
        #     ## train
        #     model.fit(X_train[index_base], labels_train[index_base],
        #                 nb_epoch=param['nb_epoch'], batch_size=param['batch_size'], verbose=0)
        #
        #     ##prediction
        #     pred = model.predict(X_test, verbose=0)
        #     pred.shape = (X_test.shape[0],)

        elif param['task'] == "reg_rgf":
            ## to array
            X_train, X_test = X_train.toarray(), X_test.toarray()

            train_x_fn = feat_train_path + ".x"
            train_y_fn = feat_train_path + ".y"
            test_x_fn = feat_test_path + ".x"
            test_pred_fn = feat_test_path + ".pred"

            model_fn_prefix = "rgf_model"

            np.savetxt(train_x_fn,
                       X_train[index_base],
                       fmt="%.6f",
                       delimiter='\t')
            np.savetxt(train_y_fn,
                       labels_train[index_base],
                       fmt="%d",
                       delimiter='\t')
            np.savetxt(test_x_fn, X_test, fmt="%.6f", delimiter='\t')
            # np.savetxt(valid_y_fn, labels_valid, fmt="%d", delimiter='\t')

            pars = [
                "train_x_fn=",
                train_x_fn,
                "\n",
                "train_y_fn=",
                train_y_fn,
                "\n",
                #"train_w_fn=",weight_train_path,"\n",
                "model_fn_prefix=",
                model_fn_prefix,
                "\n",
                "reg_L2=",
                param['reg_L2'],
                "\n",
                #"reg_depth=", 1.01, "\n",
                "algorithm=",
                "RGF",
                "\n",
                "loss=",
                "LS",
                "\n",
                "test_interval=",
                param['max_leaf_forest'],
                "\n",
                "max_leaf_forest=",
                param['max_leaf_forest'],
                "\n",
                "num_iteration_opt=",
                param['num_iteration_opt'],
                "\n",
                "num_tree_search=",
                param['num_tree_search'],
                "\n",
                "min_pop=",
                param['min_pop'],
                "\n",
                "opt_interval=",
                param['opt_interval'],
                "\n",
                "opt_stepsize=",
                param['opt_stepsize'],
                "\n",
                "NormalizeTarget"
            ]
            pars = "".join([str(p) for p in pars])

            rfg_setting_train = "./rfg_setting_train"
            with open(rfg_setting_train + ".inp", "wb") as f:
                f.write(pars)

            ## train the rgf model
            cmd = "perl %s %s train %s >> rgf.log" % (call_exe, rgf_exe,
                                                      rfg_setting_train)
            #print cmd
            os.system(cmd)

            model_fn = model_fn_prefix + "-01"
            pars = [
                "test_x_fn=", test_x_fn, "\n", "model_fn=", model_fn, "\n",
                "prediction_fn=", test_pred_fn
            ]

            pars = "".join([str(p) for p in pars])

            rfg_setting_test = "./rfg_setting_test"
            with open(rfg_setting_test + ".inp", "wb") as f:
                f.write(pars)
            cmd = "perl %s %s predict %s >> rgf.log" % (call_exe, rgf_exe,
                                                        rfg_setting_test)
            #print cmd
            os.system(cmd)

            pred = np.loadtxt(test_pred_fn, dtype=float)

        ## weighted averaging over different models
        pred_test = pred
        # if abs(np.mean(pred_test) - 0.17426) < 0.1:
        preds_bagging[:, n] = pred_test
        print('pred_test mean:', np.mean(pred_test))

    # drop the bagging rounds whose error is too large
    # cols = []
    # for col in range(0, preds_bagging.shape[1]):
    #     if abs(np.mean(preds_bagging[:, col]) - 0.17426) < 0.1:
    #         cols.append(col)
    # if len(cols) > 0:
    pred_raw = np.mean(preds_bagging, axis=1)
    # pred_rank = pred_raw.argsort().argsort()
    #
    ## write
    output = pd.DataFrame({"test_id": id_test, "is_duplicate": pred_raw})
    output.to_csv(raw_pred_test_path, index=False)
X = df.drop(['Total_Feeder'], axis=1).values
y = df['Total_Feeder'].values

tscv = TimeSeriesSplit()

for train_index, test_index in tscv.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
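# note: after this loop only the final TimeSeriesSplit fold is left in X_train/X_test,
# so the SVR below is trained and scored on that last chronological split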

# train_split = 0.9
# num_train = int(len(X) * 0.9)
# X_train = X[:num_train]
# X_test = X[num_train:]
#
# y_train = y[:num_train]
# y_test = y[num_train:]

svr = SVR(kernel='rbf', C=40, gamma='auto')

svr.fit(X_train, y_train)
accuracy = svr.score(X_test, y_test)
print(accuracy)  ## only 20%
predictions = svr.predict(X)

df['Prediction'] = predictions
df['Total_Feeder'].plot()
df['Prediction'].plot()
plt.show()
class SVMRegression():
    def __init__(self, dataset, type_of_kernel = 'rbf', cross_validation_type = 'loo', split_for_validation = None, dataset_validation = None, svm_random_state = 1, 
                svm_degree=3, svm_gamma='scale', svm_coef=0.0, svm_tol=1e-10, svm_epsilon=0.1, svm_max_iter=-1):
        self.dataset = dataset
        self.kernel = type_of_kernel
        self.cross_validation_type = cross_validation_type
        self.split_for_validation = split_for_validation
        self.dataset_validation = dataset_validation
        self.svm_random_state = svm_random_state
        
        self.degree = svm_degree
        self.gamma = svm_gamma
        self.coef0 = svm_coef
        self.tol = svm_tol
        self.epsilon = svm_epsilon
        self.max_iter = svm_max_iter

        self._xCal = pd.DataFrame()
        self._xVal = pd.DataFrame()
        self._yCal = pd.DataFrame()
        self._yVal = pd.DataFrame()

        self._cv = None

        self.metrics = {}

        # check that the parameters were provided correctly

        if not isinstance(self.dataset, pd.DataFrame):
            raise ValueError('The dataset should be a pd.DataFrame.')

        if (self.dataset_validation is None) and (self.split_for_validation is None):
            raise ValueError('Either a validation dataset or a test-size split for validation must be provided.')

        # x = dataset.iloc[:, 2:]
        # y = dataset.iloc[:, 1]
        
        if (not self.split_for_validation is None) and (self.dataset_validation is None):
            if self.split_for_validation == 'all':
                self._xCal = self.dataset.iloc[:, 2:]
                self._yCal = self.dataset.iloc[:, 1]
            elif isinstance(self.split_for_validation, float):
                self._xCal, self._xVal, self._yCal, self._yVal = train_test_split(self.dataset.iloc[:, 2:], self.dataset.iloc[:, 1], test_size=split_for_validation, random_state=self.svm_random_state)
            else:
                raise ValueError("split_for_validation need be a float value between 0 and 1 for split dataset. Use 1 for calibrate with all samples of dataset.")


        if not self.dataset_validation is None:
            if isinstance(self.dataset_validation, pd.DataFrame):
                self._xCal = self.dataset.iloc[:, 2:]
                self._yCal = self.dataset.iloc[:, 1]
                self._xVal = self.dataset_validation.iloc[:, 2:]
                self._yVal = self.dataset_validation.iloc[:, 1]
            else:
                raise ValueError("dataset_validation need be a pd.DataFrame")


        if isinstance(cross_validation_type, str):
            if cross_validation_type == "loo":
                self._cv = LeaveOneOut()
        elif (type(cross_validation_type) in [int]) and (cross_validation_type > 0):
            cv = KFold(cross_validation_type, shuffle=True, random_state=self.svm_random_state)
            self._cv = cv
        else:
            raise ValueError("The cross_validation_type should be a positive integer for k-fold method ou 'loo' for leave one out cross validation.")
    

    def search_hyperparameters(self, kernel = ['rbf'], degree = [ 3 ], gamma=[ 'scale' ], coef0=[ 0.0, 0.1 ], epsilon=[ 0.1, 2.0 ], 
                               tol = [1e-3, 1e-10], max_iter = [ -1 ], n_processors = 1, verbose = 0, 
                               scoring = 'neg_root_mean_squared_error'):
        
        step_value = lambda list_of_values: 0.5 if (len(list_of_values) < 3) else list_of_values[2]
        epsilon = [round(x, 3) for x in np.arange(start = epsilon[0], stop = epsilon[1], step = step_value(epsilon))]
        coef0 = [round(x, 3) for x in np.arange(start = coef0[0], stop = coef0[1], step = step_value(coef0))]

        random_grid = { "kernel": kernel,
                        "degree": degree,
                        "gamma": gamma,
                        "coef0": coef0,
                        "epsilon": epsilon,
                        "max_iter": max_iter,
                        "tol": tol
                       }
    
        svm_regression = SVR()

        svm_regresion_grid = GridSearchCV(estimator = svm_regression, param_grid = random_grid, cv = self._cv, n_jobs = n_processors, verbose=verbose, scoring=scoring)
        svm_regresion_grid.fit(self._xCal, self._yCal)

        get_params = lambda dict_params, param, default_params: dict_params[param] if (param in dict_params) else default_params
        
        self._best_params = svm_regresion_grid.best_params_
        self.kernel = get_params(svm_regresion_grid.best_params_, 'kernel', self.kernel)
        self.degree = get_params(svm_regresion_grid.best_params_, 'degree', self.degree)
        self.gamma = get_params(svm_regresion_grid.best_params_, 'gamma', self.gamma)
        self.coef0 = get_params(svm_regresion_grid.best_params_, 'coef0', self.coef0)
        self.tol = get_params(svm_regresion_grid.best_params_, 'tol', self.tol)
        self.epsilon = get_params(svm_regresion_grid.best_params_, 'epsilon', self.epsilon)
        self.max_iter = get_params(svm_regresion_grid.best_params_, 'max_iter', self.max_iter)

    def calibrate(self):
        
        self.model = SVR(kernel = self.kernel, degree = self.degree, gamma = self.gamma, coef0 = self.coef0, tol = self.tol, 
                         epsilon = self.epsilon, max_iter = self.max_iter)

        self.model.fit(self._xCal, self._yCal)

        y_cal_predict = self.model.predict(self._xCal)
        r_correlation = np.corrcoef(self._yCal, y_cal_predict)[0][1]
        r2_cal = self.model.score(self._xCal, self._yCal)
        rmse = mean_squared_error(self._yCal, y_cal_predict, squared=False)

        nsamples = self._xCal.shape[0]

        calibration_metrics = {'n_samples': nsamples, 'R': r_correlation, 'R2': r2_cal, 'RMSE': rmse}

        self.metrics['calibration'] = calibration_metrics  
    


    def cross_validate(self):
        
        r_correlation, r2_cv, rmse_cv, bias, predicted_values = cross_validation(self.model, self._xCal, self._yCal, self._cv, correlation_based=False)

        method = 'Leave One Out'
        if isinstance(self._cv, KFold):
            method = "{}-fold".format(self._cv.n_splits)
        
        cross_validation_metrics = {'R': r_correlation, 'R2': r2_cv, 'RMSE': rmse_cv, 'bias': bias, 'method': method, 'predicted_values': predicted_values }
        
        self.metrics['cross_validation'] = cross_validation_metrics
    
    def validate(self):

        r_correlation, r2_ve, rmse_ve, bias, predicted_values = external_validation(self.model, self._xVal, self._yVal, correlation_based=False)

        nsamples = self._xVal.shape[0]
        validation = {'R': r_correlation, 'R2': r2_ve, 'RMSE': rmse_ve, 'bias': bias, 'n_samples': nsamples, 'predicted_values': predicted_values}

        self.metrics['validation'] = validation

    def create_model(self):

        self.calibrate()
        self.cross_validate()
        self.validate()
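
# A minimal usage sketch for the SVMRegression class above, under stated assumptions:
# 'samples.csv' is a hypothetical file whose layout matches the constructor (column 1
# holds the target, columns 2 onwards hold the features), and the sklearn imports used
# by the class are already in scope.
import pandas as pd

data = pd.read_csv('samples.csv')                      # hypothetical input file
svm_model = SVMRegression(data, type_of_kernel='rbf',
                          cross_validation_type=5,     # 5-fold cross validation
                          split_for_validation=0.3)    # 30% held out for validation
svm_model.search_hyperparameters(kernel=['rbf', 'linear'], n_processors=-1)
svm_model.calibrate()                                  # fits the SVR and stores calibration metrics
print(svm_model.metrics['calibration'])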
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()
X = sc_X.fit_transform(X)
y = sc_y.fit_transform(y.reshape(-1, 1))

# Fitting SVR to the dataset
from sklearn.svm import SVR
regressor = SVR(kernel='rbf')
regressor.fit(X, y)

# Predicting a new result
y_pred = regressor.predict(sc_X.transform(np.array([[6.5]])))
y_pred = sc_y.inverse_transform(y_pred)
"""# Applying k-Fold Cross Validation (model evaluation)
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = regressor, X = X, y = y, cv = 10)
accuracies.mean()
accuracies.std()"""

# Visualising the SVR results
plt.scatter(X, y, color='red')
plt.plot(X, regressor.predict(X), color='blue')
plt.title('Truth or Bluff (SVR)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()
Example #40
lr = LinearRegression()
lr.fit(x_train, y_train)
lr_confidence = lr.score(x_test, y_test)
lr_confidence = "{:.2%}".format(lr_confidence)
print("LR confidence: ", lr_confidence)

# Kernel Ridge Regression:
kridge = KernelRidge(alpha=1.0)
kridge.fit(x_train, y_train)
kridge_confidence = kridge.score(x_test, y_test)
kridge_confidence = "{:.2%}".format(kridge_confidence)
print("Kernel Ridge Confidence: ", kridge_confidence)

x_forecast = np.array(df.drop(['Prediction'], axis=1))[-forecast_out:]
lr_prediction = lr.predict(x_forecast)
svm_prediction = svr_rbf.predict(x_forecast)
kridge_prediction = kridge.predict(x_forecast)

last_date = datetime.datetime.strptime(df_full['Date'].iloc[-1], '%Y-%m-%d')
last_unix = last_date
next_unix = last_unix + datetime.timedelta(days=1)
df_full['LR Prediction'] = np.NaN
df_full['SVM Prediction'] = np.NaN
df_full['Ridge Prediction'] = np.NaN

for i, j, k in zip(lr_prediction, svm_prediction, kridge_prediction):
    next_date = next_unix
    next_unix += datetime.timedelta(days=1)
    next_date_str = next_date.strftime('%Y-%m-%d')
    df_full.loc[len(df_full)] = [
        next_date_str, np.NaN, np.NaN, np.NaN, np.NaN, np.NaN, np.NaN, i, j, k
    ]
features_train = train.iloc[:, :175]
target_train = train.iloc[:, [176]]

features_test = test.iloc[:, :175]
target_test = test.iloc[:, [176]]

#convert to a numpy array
features_train_np = np.array(features_train)
target_train_np = np.array(target_train)

features_test_np = np.array(features_test)
target_test_np = np.array(target_test)

# Our dataset and targets
X = features_train_np
y = target_train_np

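# 'imp' is assumed to be an imputer created earlier in the original script
# (e.g. a scikit-learn SimpleImputer); it fills missing values in X and y below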
imp.fit(X)
X = imp.transform(X)

imp.fit(y)
y = imp.transform(y).ravel()


print(X.shape, y.shape)

clf = SVR(C=1.0, epsilon=0.2)
clf.fit(X, y)
clf.score(X, y)
predicted= clf.predict(features_test_np)
Example #42
svr_model.fit(x,y)
print("        finished at", datetime.now())
print("  sv = ",svr_model.n_support_)

print("Generating plot...")
plt.figure(figsize=(14,5), dpi=100)
plt.plot(pd.to_datetime(dates),y, marker='x', markersize=3, linewidth=0, label=selected_feature+" - actual data")
plt.plot(pd.to_datetime(dates[:-3]),
         np.convolve(np.pad(y, 3, mode='edge'), np.ones(7)/7, mode='valid')[:-3],
         alpha=0.5, label=selected_feature + " - 7 days moving average")
# plt.plot(pd.to_datetime(dates), np.convolve(np.pad(y, 7, mode='edge'), np.ones(15)/15, mode='valid'),
#          alpha=0.5, label=selected_feature + " - 15-days window mean")
plt.plot(pd.to_datetime(dates)[:-15],
         np.convolve(np.pad(y, 15, mode='edge'), np.ones(31)/31, mode='valid')[:-15],
         alpha=0.5, label=selected_feature + " - 31 days moving average")
plt.plot(pd.to_datetime(dates),svr_model.predict(x), color='red', label=selected_feature+" - SVR model")

days_to_predict = 14
x_pred = np.arange(x[-1][0],x[-1][0]+days_to_predict).reshape(-1,1)
x_pred_dates = [pd.to_datetime(dates)[-1] + pd.Timedelta(i,'day') for i in range(0,days_to_predict)]
plt.plot(x_pred_dates,svr_model.predict(x_pred), ':', color='red', label=selected_feature+" - SVR prediction")

plt.legend()

plt.gcf().savefig('Figura-4-SVR.pdf')
print("Saving and opening export folder...")
os.system("open ./")
#plt.show()
plt.close()

Example #43
def WZ_result(X1, y1, X, y, wz):
    #X = X.values
    y = y.values
    rmses = []
    rf = []

    loo = LeaveOneOut()
    for train, test in loo.split(X):

        train_X, test_X, train_y, test_y = X[train], X[test], y[train], y[test]
        clf = SVR(kernel='rbf', C=10, gamma=0.01)
        clf.fit(train_X, train_y)
        predicted = clf.predict(test_X)

        rmse = mean_squared_error(test_y, predicted)**0.5
        rmses.append(rmse)

        rf.append(clf)

    index = rmses.index(min(rmses))
    predict = rf[index].predict(X1)

    # rmse  mae r2
    r2 = r2_score(y1, predict)
    print("R2", r2)

    mae = mean_absolute_error(y1, predict)
    print("mae", mae)

    rmse = mean_squared_error(y1, predict)**0.5
    print("rmse", rmse)

    figsize = 9, 9
    figure, ax = plt.subplots(figsize=figsize)

    p0, = plt.plot([0, 8], [0, 8],
                   '--',
                   color='black',
                   label='line',
                   linewidth=1.0)

    color = ['limegreen', 'mediumslateblue', 'cyan', 'gold']
    marker = ['*', 'o', 'd', '<']

    p1 = plt.scatter(y1[0:12],
                     predict[0:12],
                     c=color[0],
                     marker=marker[0],
                     label='NQ1',
                     s=280,
                     edgecolors='black')
    p2 = plt.scatter(y1[12:24],
                     predict[12:24],
                     c=color[1],
                     marker=marker[1],
                     label='NQ5',
                     s=150,
                     edgecolors='black')
    p3 = plt.scatter(y1[24:36],
                     predict[24:36],
                     c=color[2],
                     marker=marker[2],
                     label='NQ7',
                     s=180,
                     edgecolors='black')
    p4 = plt.scatter(y1[36:48],
                     predict[36:48],
                     c=color[3],
                     marker=marker[3],
                     label='NNQ9',
                     s=180,
                     edgecolors='black')

    ############# set the tick label size and font #############
    #plt.xlim(3, 7.5)
    #plt.ylim(3, 7.5)
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.tick_params(labelsize=18)

    labels = ax.get_yticklabels()
    [label.set_fontname('Times New Roman') for label in labels]

    labels = ax.get_xticklabels()
    [label.set_fontname('Times New Roman') for label in labels]

    ############# configure the legend font and size #############
    font1 = {
        'family': 'Times New Roman',
        'weight': 'normal',
        'size': 25,
    }

    plt.legend(prop=font1, frameon=False)  # legend
    plt.ylabel('      (g / 100g)', font1)
    plt.xlabel('      (g / 100g)', font1)

    ax = plt.gca()
    ax.set_aspect(1)

    #plt.savefig('C:\\Users\\shaoqi\\Desktop\\'+ wz +'.eps', dpi=2000)
    plt.show()
    return predict
X = np.array(ct.fit_transform(X))

# SPLITTING THE DATA
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

# FEATURE SCALING
sc_X = StandardScaler()
sc_y = StandardScaler()
X_train[:, 42:] = sc_X.fit_transform(X_train[:, 42:])
X_test[:, 42:] = sc_X.transform(X_test[:, 42:])

y_train = sc_y.fit_transform(y_train)
y_test = sc_y.transform(y_test)

# TRAIN THE SVR MODEL
regressor = SVR(
    kernel='rbf')  # rbf = radial basis function (non-linear kernel)
regressor.fit(X_train, y_train)

# PREDICTING TEST SET RESULTS
y_pred = sc_y.inverse_transform(regressor.predict(X_test))
np.set_printoptions(precision=2)
print(
    np.concatenate(
        (y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), 1))

# EVALUATING THE SVR PERFORMANCE
print(r2_score(sc_y.inverse_transform(y_test), y_pred))
Example #45
'''
Support vector regression.
Before using SVR it is recommended to standardize or normalize X and Y
(see ScaleTransform.py in DataPreprocessing); standardization is used below.
Tunable SVR parameters:
kernel: 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'
C: penalty term; the larger it is, the more easily the model overfits
gamma: larger values mean fewer support vectors, smaller values mean more support vectors
'''

# standardization
from sklearn.preprocessing import StandardScaler

scaler_X = StandardScaler()
scaler_Y = StandardScaler()
X = scaler_X.fit_transform(X)
Y = scaler_Y.fit_transform(Y.reshape(-1, 1))

# train the SVR
from sklearn.svm import SVR

SVR_regressor = SVR(kernel='rbf')
SVR_regressor.fit(X, Y.ravel())  # X_train is not defined in this snippet; fit on the scaled X, with Y flattened back to 1-D

# predict with the SVR; remember to inverse-transform the predicted Y afterwards
X_test = scaler_X.transform(X_test)
y_pred = SVR_regressor.predict(X_test)
y_pred = scaler_Y.inverse_transform(y_pred)
Example #46
# temp_max2 holds the corresponding maximum temperatures of the cities in dist2
temp_max2 = temp_max[5:10]

# call the SVR function, specifying a linear kernel in the parameters,
# and set C to 1000 to fit the data as closely as possible (precise prediction is not needed, so overfitting is not a concern)
svr_lin1 = SVR(kernel='linear', C=1e3)
svr_lin2 = SVR(kernel='linear', C=1e3)

# feed in the data and fit (this step may run for a long time, roughly ten minutes or more, so take a break :) )
svr_lin1.fit(dist1, temp_max1)
svr_lin2.fit(dist2, temp_max2)

# see the detailed discussion of the reshape function after the code
xp1 = np.arange(10, 100, 10).reshape((9, 1))
xp2 = np.arange(50, 400, 50).reshape((7, 1))
yp1 = svr_lin1.predict(xp1)
yp2 = svr_lin2.predict(xp2)

# restrict the value range of the X axis
ax.plot(xp1, yp1, c='b', label='Strong sea effect')
ax.plot(xp2, yp2, c='g', label='Light sea effect')

fig

print(svr_lin1.coef_)  # slope
print(svr_lin1.intercept_)  # intercept
print(svr_lin2.coef_)
print(svr_lin2.intercept_)


# defines the first fitted line
import numpy as np
from sklearn.svm import SVR
X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]])
# y = 1 * x_0 + 2 * x_1 + 3
y = np.dot(X, np.array([1, 2])) + 3

model = SVR()
model.fit(X, y)
pred = model.predict(X)

print(pred)
Example #48
for max_depth in range(1, 51):
    for min_samples_split in range(2, 102, 5):  # min_samples_split must be at least 2
        tree = DecisionTreeRegressor(max_depth=max_depth,
                                     min_samples_split=min_samples_split)
        tree.fit(X_train, y_train)
        prediction = tree.predict(X_test)
        mae.append(mean_absolute_error(y_test, prediction))
print "Minimum MAE TREE = ", min(mae)
test_maes_dictionary["Dec. Tree"] = min(mae)

## SUPPORT VECTORS MACHINE TRAINING
mae = []
for kernel in ["rbf", "linear", "poly", "sigmoid"]:
    svr = SVR(kernel=kernel)
    svr.fit(X_train, y_train)
    prediction = svr.predict(X_test)
    mae.append(mean_absolute_error(y_test, prediction))
print "Minimum MAE SVR = ", min(mae)
test_maes_dictionary["SVM"] = min(mae)

## RANDOM FOREST TRAINING
mae = []
for n_estimators in range(10, 1100, 100):
    rf = RandomForestRegressor(n_estimators=n_estimators)
    rf.fit(X_train, y_train)
    prediction = rf.predict(X_test)
    mae.append(mean_absolute_error(y_test, prediction))
print "Minimum MAE R.Forest = ", min(mae)
test_maes_dictionary["R. Forest"] = min(mae)

#############################################################################################
Example #49
y = dataset.iloc[:, -1].values
y = y.reshape(len(y), 1)  # keep y 2-D so StandardScaler below accepts it

from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X = sc_X.fit_transform(X)
sc_y = StandardScaler()
y = sc_y.fit_transform(y)

from sklearn.svm import SVR
svr = SVR(kernel='rbf')
svr.fit(X, y.ravel())

# no try to predict level 6.5
x_interest = sc_X.transform([[6.5]])
y_interest = svr.predict(x_interest)
# now inverse y
final_y = sc_y.inverse_transform(y_interest)

# scatter the real.
# plot the pred
X_org = dataset.iloc[:, 1:-1].values
plt.scatter(X_org, sc_y.inverse_transform(y), color='red')
plt.plot(X_org, sc_y.inverse_transform(svr.predict(X)), color='blue')
plt.show()

## same thing. but smoth lines
X_plot = np.arange(min(X_org), max(X_org), 0.1)
X_plot = X_plot.reshape(len(X_plot), 1)
plt.scatter(X_org, sc_y.inverse_transform(y), color='red')
plt.plot(X_plot,
         sc_y.inverse_transform(svr.predict(sc_X.transform(X_plot))),
         color='blue')  # continuation reconstructed: predict on the scaled grid, then inverse-transform
plt.show()
Example #50
def tune_cv(x_train, y_train, x_test, y_test, C, gamma):
    model = SVR(C=C, gamma=gamma).fit(x_train, y_train)
    predictions = model.predict(x_test)
    return optunity.metrics.mse(y_test, predictions)
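
# A sketch of how an objective like tune_cv is typically plugged into optunity
# (assumptions: the optunity package is installed, and X, y stand for the full
# feature matrix and target vector of this example; the search bounds below are
# illustrative, not tuned values).
import optunity
import optunity.metrics

@optunity.cross_validated(x=X, y=y, num_folds=5)
def tune_cv_wrapped(x_train, y_train, x_test, y_test, C, gamma):
    model = SVR(C=C, gamma=gamma).fit(x_train, y_train)
    return optunity.metrics.mse(y_test, model.predict(x_test))

result = optunity.minimize(tune_cv_wrapped, num_evals=100, C=[1, 100], gamma=[0.001, 10])
optimal_pars = result[0]                     # e.g. {'C': ..., 'gamma': ...}
tuned_svr = SVR(**optimal_pars).fit(X, y)    # refit on all data with the tuned parameters
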
# Scale X
sc_X = StandardScaler()
X = sc_X.fit_transform(X)
# Scale y
sc_y = StandardScaler()
y = sc_y.fit_transform(y)

# Fitting SVR to the dataset
# ==========================
from sklearn.svm import SVR
regressor = SVR(kernel="rbf", gamma="scale")
regressor.fit(X, y)

# Predicting the new result
# =========================
temp_pred = regressor.predict(sc_X.transform(np.array([[6.5]])))
y_pred = sc_y.inverse_transform(temp_pred)

# Visualising the SVR results
# ===========================
# Create higher precision for X axis.
x_grid = np.arange(min(X), max(X), 0.1)
x_grid = x_grid.reshape((len(x_grid), 1))

plt.scatter(X, y, color="red")
# Use x_grid in place of X for smoother line plot.
plt.plot(x_grid, regressor.predict(x_grid), color="blue")
plt.title("Salary Guide (Polynomial Regression)")
plt.xlabel("Position level")
plt.ylabel("Salary")
plt.show()
Example #52
y = diabetes.target

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=1234)
# Fit regression model
svr_lin = SVR(kernel='linear')
svr_rbf = SVR(kernel='rbf', gamma=0.1)
svr_poly = SVR(kernel='poly', degree=2)
y_lin = svr_lin.fit(X_train, y_train).predict(X_train)
y_rbf = svr_rbf.fit(X_train, y_train).predict(X_train)
y_poly = svr_poly.fit(X_train, y_train).predict(X_train)

print("Linear train error: ", mean_squared_error(y_train, y_lin),
      " test error: ", mean_squared_error(y_test, svr_lin.predict(X_test)))

print("RBF train error: ", mean_squared_error(y_train, y_rbf), " test error: ",
      mean_squared_error(y_test, svr_rbf.predict(X_test)))

print("Polynomial train error: ", mean_squared_error(y_train, y_poly),
      " test error: ", mean_squared_error(y_test, svr_rbf.predict(X_test)))

plt.figure(figsize=(20, 10))
plt.scatter(X_train[:, feature], y_train, color='darkorange', label='data')
plt.scatter(X_train[:, feature], y_lin, color='c', label='Linear model')
plt.scatter(X_train[:, feature], y_rbf, color='navy', label='RBF model')
plt.scatter(X_train[:, feature],
            y_poly,
            color='cornflowerblue',
            label='Polynomial model')
plt.scatter(X_svr, y_svr)
plt.show()

#%% SVR model implementation

# Split the "train" data into training and test sets to evaluate the algorithms
X_train, X_test, y_train, y_test = train_test_split(X_svr,
                                                    y_svr,
                                                    test_size=0.2)

# Define the algorithm to use
svr = SVR(kernel='linear', C=1.0, epsilon=0.2)
#svr = SVR()

# Train the model
svr.fit(X_train, y_train)

# Make a prediction
Y_pred = svr.predict(X_test)

#%% Model results

# Plot the data together with the model
plt.scatter(X_test, y_test)
plt.plot(X_test, Y_pred, color='red', linewidth=3)
plt.show()
print()
print('SUPPORT VECTOR REGRESSION MODEL RESULTS')
print()
print('Model score (R^2 on the training data):')
print(svr.score(X_train, y_train))
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()
X = sc_X.fit_transform(X)
y = sc_y.fit_transform(y)

# Fitting SVR to the dataset
from sklearn.svm import SVR
regressor = SVR(kernel='rbf')
regressor.fit(X, y)

# Predicting a new result
y_pred = regressor.predict(sc_X.transform([[6.5]]))  # predict expects a 2-D array in the scaled feature space
y_pred = sc_y.inverse_transform(y_pred)

# Visualising the SVR results
plt.scatter(X, y, color='red')
plt.plot(X, regressor.predict(X), color='blue')
plt.title('Truth or Bluff (SVR)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()

# Visualising the SVR results (for higher resolution and smoother curve)
X_grid = np.arange(
    min(X), max(X), 0.01
)  # choice of 0.01 instead of 0.1 step because the data is feature scaled
X_grid = X_grid.reshape((len(X_grid), 1))
def main():
    horses98 = HorseParserNoHandicaps('./../Data/born98.csv').horses
    horses05 = HorseParserNoHandicaps('./../Data/born05.csv').horses

    races98 = RaceParserNoHandicaps('./../Data/born98.csv').races
    races05 = RaceParserNoHandicaps('./../Data/born05.csv').races

    print 'HorsesBorn98 Dataset'
    horses_train_98, horses_test_98 = split_dataset(horses98)

    horses_98_X_train = []
    horses_98_y_train = []
    for h in horses_train_98:
        v, s = compute_vector(h)
        horses_98_X_train.append(v)
        horses_98_y_train.append(s)

    print 'No. of instances in training set:'
    print len(horses_98_X_train)
    print len(horses_98_y_train)
    print ''

    horses_98_X_test = []
    horses_98_y_test = []
    for h in horses_test_98:
        v, s = compute_vector(h)
        horses_98_X_test.append(v)
        horses_98_y_test.append(s)

    print 'No. of instances in testing set:'
    print len(horses_98_X_test)
    print len(horses_98_y_test)
    print ''

    print 'Create SVR object'
    # Create svr object
    svr98 = SVR(kernel='linear', C=1e3)  #, gamma=0.1)

    print 'Training SVR'
    # Train the model using the training sets
    svr98.fit(horses_98_X_train, horses_98_y_train)

    print 'Predicting'
    horses_98_y_pred = svr98.predict(horses_98_X_test)

    # Explained variance score: 1 is perfect prediction
    print 'Variance score:'
    print svr98.score(horses_98_X_test, horses_98_y_test)
    print ''

    print 'Mean absolute error:'
    print mean_absolute_error(horses_98_y_test, horses_98_y_pred)
    print ''

    print 'Explained variance:'
    print explained_variance_score(horses_98_y_test, horses_98_y_pred)
    print ''

    print 'Mean squared error:'
    print mean_squared_error(horses_98_y_test, horses_98_y_pred)
    print ''

    print 'R2 score:'
    print r2_score(horses_98_y_test, horses_98_y_pred)
    print ''
Example #56
print(r2_score(Y , lin_reg2.predict(poly_reg.fit_transform(X))))



# SVR 
from sklearn.preprocessing import StandardScaler
sc1 = StandardScaler()
sc2 = StandardScaler()
x_olcekli = sc1.fit_transform(X)
y_olcekli = sc2.fit_transform(Y)

from sklearn.svm import SVR
svr_reg = SVR(kernel = 'rbf')
svr_reg.fit(x_olcekli, y_olcekli)
print("SVR OLS:")
model3 = sm.OLS(svr_reg.predict(x_olcekli),x_olcekli)
print(model3.fit().summary())
print("SVR R-square value:")
print(r2_score(Y , svr_reg.predict(x_olcekli)))



# Decision Tree
from sklearn.tree import DecisionTreeRegressor
dt_r = DecisionTreeRegressor(random_state=0)
dt_r.fit(X,Y)
print("Decision Tree OLS:")
model4 = sm.OLS(dt_r.predict(X),X)
print(model4.fit().summary())
print("Decision Tree R-square value:")
print(r2_score(Y , dt_r.predict(X)))
Example #57
        (start_date + datetime.timedelta(days=i)).strftime('%m/%d/%Y'))

X_train_confirmed, X_test_confirmed, y_train_confirmed, y_test_confirmed = train_test_split(
    days_since_1_22, turkey_cases, test_size=0.36, shuffle=False)

# In[6]:

# svm_confirmed = svm_search.best_estimator_
svm_confirmed = SVR(shrinking=True,
                    kernel='poly',
                    gamma=0.01,
                    epsilon=1,
                    degree=4,
                    C=0.1)
svm_confirmed.fit(X_train_confirmed, y_train_confirmed)
svm_pred = svm_confirmed.predict(future_forcast)

# check against testing data
svm_test_pred = svm_confirmed.predict(X_test_confirmed)

# In[7]:

# transform our data for polynomial regression
poly = PolynomialFeatures(degree=3)
poly_X_train_confirmed = poly.fit_transform(X_train_confirmed)
poly_X_test_confirmed = poly.fit_transform(X_test_confirmed)
poly_future_forcast = poly.fit_transform(future_forcast)

bayesian_poly = PolynomialFeatures(degree=4)
bayesian_poly_X_train_confirmed = bayesian_poly.fit_transform(
    X_train_confirmed)
Example #58
# Shuffle the data
X, y = shuffle(data.data, data.target, random_state=7)

# Split the data into training and testing datasets
num_training = int(0.8 * len(X))
X_train, y_train = X[:num_training], y[:num_training]
X_test, y_test = X[num_training:], y[num_training:]

# Create Support Vector Regression model
sv_regressor = SVR(kernel='linear', C=1.0, epsilon=0.1)

# Train Support Vector Regressor
sv_regressor.fit(X_train, y_train)

# Evaluate performance of Support Vector Regressor
y_test_pred = sv_regressor.predict(X_test)
mse = mean_squared_error(y_test, y_test_pred)
evs = explained_variance_score(y_test, y_test_pred)
print("\n#### Performance ####")
print("Mean squared error =", round(mse, 2))
ai02_3_url1 = round(mse, 2)
print("Explained variance score =", round(evs, 2))
ai02_3_url2 = round(evs, 2)

# Test the regressor on test datapoint
test_data = [
    3.7, 0, 18.4, 1, 0.87, 5.95, 91, 2.5052, 26, 666, 20.2, 351.34, 15.27
]
print("\nPredicted price:", sv_regressor.predict([test_data])[0])
ai02_3_url3 = sv_regressor.predict([test_data])[0]
Example #59
#%%
from sklearn.preprocessing import StandardScaler

sc1 = StandardScaler()
x_olcekli = sc1.fit_transform(X)
sc2 = StandardScaler()
y_olcekli = sc2.fit_transform(Y)

from sklearn.svm import SVR

svrReg = SVR(kernel='rbf')

svrReg.fit(x_olcekli, y_olcekli)

plt.scatter(x_olcekli, y_olcekli, color="red")
plt.plot(x_olcekli, svrReg.predict(x_olcekli), color="black")
plt.xlabel("SVR")
plt.show()

print(svrReg.predict(np.array([6.6]).reshape(-1, 1)))
#%% Decision Tree
from sklearn.tree import DecisionTreeRegressor

r_dt = DecisionTreeRegressor(random_state=0)

r_dt.fit(X, Y)

plt.scatter(X, Y, color="red")
plt.plot(X, r_dt.predict(X))
plt.show()
Example #60
#Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
sc_y = StandardScaler()
x = sc_x.fit_transform(x)
y = sc_y.fit_transform(y.reshape(-1, 1))

# SVR regression
from sklearn.svm import SVR
regressor = SVR(kernel='rbf')
regressor.fit(x, y)

#prediction
y_pred = sc_y.inverse_transform(
    regressor.predict(sc_x.transform(np.array([[6.5]]))))
print(y_pred)

#visualising SVR result
plt.scatter(x, y, color='red')
plt.plot(x, regressor.predict(x), color='blue')
plt.title('SVR Model')
plt.xlabel('Position Level')
plt.ylabel('Salary')
plt.show()

## more continuous graph
x_grid = np.arange(min(x), max(x), 0.1)
x_grid = x_grid.reshape(-1, 1)
plt.scatter(x, y, color='red')
plt.plot(x_grid, regressor.predict(x_grid), color='blue')