Example #1
def svr_main(X, Y):
    X_train = X[:TRAIN_SIZE]
    Y_train = Y[:TRAIN_SIZE]
    X_test = X[TRAIN_SIZE:]
    Y_test = Y[TRAIN_SIZE:]

    clf = SVR(kernel='rbf', C=1e3, gamma=0.00001)
    #clf.fit(X_train,Y_train)
    #y_pred = clf.predict(X_test)
    #plt.plot(X_test, y_pred, linestyle='-', color='red') 

    #clf = GradientBoostingRegressor(n_estimators=100,max_depth=1)
    #clf = DecisionTreeRegressor(max_depth=25)
    #clf = ExtraTreesRegressor(n_estimators=2000,max_depth=14)
    #clf = xgb.XGBRegressor(n_estimators=2000,max_depth=25)
    #clf = RandomForestRegressor(n_estimators=1000,max_depth=26,n_jobs=7)
    predict_list = []
    for i in range(TEST_SIZE):
        # walk-forward retraining: slide the training window one step per test point
        X = [[x] for x in range(i, TRAIN_SIZE + i)]
        clf.fit(X, Y[i:TRAIN_SIZE + i])
        y_pred = clf.predict([[TRAIN_SIZE + 1 + i]])  # predict() expects a 2-D input
        predict_list.append(y_pred)

    print "mean_squared_error:%s"%mean_squared_error(Y_test, predict_list)
    print "sqrt of mean_squared_error:%s"%np.sqrt(mean_squared_error(Y_test, predict_list))
    origin_data = Y_test
    print "origin data:%s"%origin_data
    plt.plot([ x for x in xrange(TRAIN_SIZE+1, TRAIN_SIZE+TEST_SIZE+1)], predict_list, linestyle='-', color='red', label='prediction model')  
    plt.plot(X_test, Y_test, linestyle='-', color='blue', label='actual model') 
    plt.legend(loc=1, prop={'size': 12})
    plt.show()
Example #2
def SVM(train, test, tunings=None, smoteit=True, bin=True, regress=False):
    "SVM "
    if not isinstance(train, pd.core.frame.DataFrame):
        train = csv2DF(train, as_mtx=False, toBin=bin)

    if not isinstance(test, pd.core.frame.DataFrame):
        test = csv2DF(test, as_mtx=False, toBin=True)

    if smoteit:
        train = SMOTE(train, resample=True)
        # except: set_trace()
    if not tunings:
        if regress:
            clf = SVR()
        else:
            clf = SVC()
    else:
        # NOTE: `tunings` is accepted but currently ignored; the same
        # default models are built in both branches.
        if regress:
            clf = SVR()
        else:
            clf = SVC()

    features = train.columns[:-1]
    klass = train[train.columns[-1]]
    # set_trace()
    clf.fit(train[features], klass)
    actual = test[test.columns[-1]].values  # .as_matrix() was removed in pandas 1.0
    try:
        preds = clf.predict(test[test.columns[:-1]])
    except:
        set_trace()
    return actual, preds
Example #3
def test_check_is_fitted():
    # Check that ValueError is raised when a non-estimator instance is passed
    assert_raises(ValueError, check_is_fitted, ARDRegression, "coef_")
    assert_raises(TypeError, check_is_fitted, "SVR", "support_")

    ard = ARDRegression()
    svr = SVR()

    try:
        assert_raises(NotFittedError, check_is_fitted, ard, "coef_")
        assert_raises(NotFittedError, check_is_fitted, svr, "support_")
    except ValueError:
        assert False, "check_is_fitted failed with ValueError"

    # NotFittedError is a subclass of both ValueError and AttributeError
    try:
        check_is_fitted(ard, "coef_", "Random message %(name)s, %(name)s")
    except ValueError as e:
        assert_equal(str(e), "Random message ARDRegression, ARDRegression")

    try:
        check_is_fitted(svr, "support_", "Another message %(name)s, %(name)s")
    except AttributeError as e:
        assert_equal(str(e), "Another message SVR, SVR")

    ard.fit(*make_blobs())
    svr.fit(*make_blobs())

    assert_equal(None, check_is_fitted(ard, "coef_"))
    assert_equal(None, check_is_fitted(svr, "support_"))
Example #4
def getError1(signal, normedDay, period, phase):
    '''
    Gets the error for a list of points across a normed day given a sklearn
    model, the period, and the phase of the fitted signal.
    
    Here I'm using the Euclidean distance as the error measurement.  This 
    requires a little more computation due to the need to fit an inverse
    model, but provides better fits.
    
    Returns the squared Euclidean error.
    '''
    
    if rank(normedDay.index[0]) > 0:
        t0= round((array(normedDay.index.get_level_values(0))- phase)%period,3)
    else:
        t0 = round((array(normedDay.index,dtype=float) - phase)%period,3)
    nD = Series(normedDay, index=t0)
    
    tUp = array([arange(0,period+.1,.1)]).T
    invSignal = SVR(kernel='rbf', C=signal.C, gamma=signal.gamma, 
                    epsilon=signal.epsilon)
    
    invSignal.fit(array([signal.predict(tUp)]).T, tUp.flatten())
    
    xDiff = nD - signal.predict(array([array(nD)]).T)
    yDiff = nD - signal.predict(array([nD.index]).T)
    
    error = sum(pow(xDiff/period,2) + pow(yDiff/2,2))
    return error
Example #5
def train_svm(train_file):
    test_X, test_Y, weight = load_data(train_file, get_avg(train_file))
    svr = SVR(kernel='rbf', C=100, gamma=1)
    print("start train")
    svr.fit(test_X, test_Y)
    print("train finish")
    return svr
Example #6
def train_svm(data):
    test_X, test_Y = load_data(data)
    svr = SVR(kernel='rbf', C=100, gamma=1)
    print("start train")
    svr.fit(test_X, test_Y)
    print("train finish")
    return svr
Example #7
    def RunSVRScikit():
      totalTimer = Timer()

      # Load input dataset.
      Log.Info("Loading dataset", self.verbose)
      # Use the last row of the training set as the responses.
      X, y = SplitTrainData(self.dataset)

      # Get all the parameters.
      opts = {}
      if "c" in options:
        opts["C"] = float(options.pop("c"))
      if "epsilon" in options:
        opts["epsilon"] = float(options.pop("epsilon"))
      if "gamma" in options:
        opts["gamma"] = float(options.pop("gamma"))
      opts["kernel"] = "rbf"

      if len(options) > 0:
        Log.Fatal("Unknown parameters: " + str(options))
        raise Exception("unknown parameters")

      try:
        with totalTimer:
          # Perform SVR.
          model = SSVR(**opts)
          model.fit(X, y)
      except Exception as e:
        return -1

      return totalTimer.ElapsedTime()
Example #8
class HotTweets:
	''' Train and get tweet hotness '''

	def __init__(self, kernel='rbf', C=1e3, gamma=0.1, epsilon=0.1, n_comp=100):
		''' Prepare support vector regression ''' 
		self.svr = SVR(kernel=kernel, C=C, gamma=gamma, epsilon=epsilon, verbose=True)
		#self.svr = LogisticRegression(random_state=42, verbose=0)
		self.n_comp = n_comp

	def fit_scaler(self, dev, i_dev):
		''' Train normalizers for features and importances '''
		# importance scaler
		self.std_scaler_i = sklearn.preprocessing.StandardScaler()
		self.std_scaler_i.fit(i_dev)
		self.norm = sklearn.preprocessing.StandardScaler()
		self.norm.fit(dev[:,0:self.n_comp])
		self.n_comp = self.n_comp
	
	def train(self, features, importances):
		''' Train regression '''
		importances = self.std_scaler_i.transform(importances)
		features = self.norm.transform(features[:,0:self.n_comp])
		self.svr.fit(features, importances)
		
		
	def predict(self, features):
		''' Predict importances '''
		features = self.norm.transform(features[:,0:self.n_comp])
		results = self.svr.predict(features)
		#print results[0:100:5]
		results = self.std_scaler_i.inverse_transform(results)
		#print results[0:100:5]
		return results
Example #9
    def fit(self, start_date, end_date):

        for ticker in self.tickers:
            self.stocks[ticker] = Stock(ticker)

        params_svr = [{
            'kernel': ['rbf', 'sigmoid', 'linear'],
            'C': [0.01, 0.1, 1, 10, 100],
            'epsilon': [0.0000001, 0.000001, 0.00001]
            }]
        params = ParameterGrid(params_svr)

        # Find the split for training and CV
        mid_date = train_test_split(start_date, end_date)
        for ticker, stock in self.stocks.items():

            X_train, y_train = stock.get_data(start_date, mid_date, fit=True)
            # X_train = self.pca.fit_transform(X_train.values)
            X_train = X_train.values
            # pdb.set_trace()
            X_cv, y_cv = stock.get_data(mid_date, end_date)
            # X_cv = self.pca.transform(X_cv.values)
            X_cv = X_cv.values

            lowest_mse = np.inf
            for i, param in enumerate(params):
                svr = SVR(**param)
                # ada = AdaBoostRegressor(svr)
                svr.fit(X_train, y_train.values)
                mse = mean_squared_error(
                    y_cv, svr.predict(X_cv))
                if mse <= lowest_mse:
                    lowest_mse = mse
                    self.models[ticker] = svr

        return self
Example #10
def train_single_model(train_data, train_labels, algo):
	"""
	Train the model for a single label dimension
	"""
	if algo == 'svr_rbf':
		"""
		SVM regression, RBF kernel
		"""
		svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
		svr_rbf.fit(train_data, train_labels)
		return svr_rbf

	if algo == 'svr_lin':
		"""
		SVM regression, linear
		"""
		svr_lin = SVR(kernel='linear')
		svr_lin.fit(train_data, train_labels)
		return svr_lin

	if algo == 'ridge':
		"""
		Ridge regression
		"""
		clf = Ridge(alpha = 0.5)
		clf.fit(train_data, train_labels)
		return clf

	# No matching algorithm
	print("unimplemented model type")
	return None
Example #11
    def train(self, x, y, param_names, random_search=100,
              kernel_cache_size=2000, **kwargs):
        if self._debug:
            print("First training sample\n", x[0])
        start = time.time()
        scaled_x = self._set_and_preprocess(x=x, param_names=param_names)

        # Check that each input is between 0 and 1
        self._check_scaling(scaled_x=scaled_x)

        if self._debug:
            print("Shape of training data: ", scaled_x.shape)
            print("Param names: ", self._used_param_names)
            print("First training sample\n", scaled_x[0])
            print("Encode: ", self._encode)

        # Do a random search
        c, gamma = self._random_search(random_iter=random_search, x=scaled_x,
                                       y=y, kernel_cache_size=kernel_cache_size)

        # Now train model
        try:
            svr = SVR(gamma=gamma, C=c, random_state=self._rng,
                      cache_size=kernel_cache_size)
            svr.fit(scaled_x, y)
            self._model = svr
        except Exception as e:
            print("Training failed", e)
            svr = None
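The helper `_random_search` used above is not shown in this example. A minimal standalone sketch of such a search over C and gamma (the function name and sampling ranges are assumptions, not taken from the source):

import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVR

def random_search_c_gamma(x, y, random_iter=100, seed=None):
    # Hypothetical stand-in for _random_search: sample C and gamma
    # log-uniformly and keep the pair with the best 3-fold CV score.
    rng = np.random.default_rng(seed)
    best, best_score = (1.0, 0.1), -np.inf
    for _ in range(random_iter):
        c = 10.0 ** rng.uniform(-2, 3)
        gamma = 10.0 ** rng.uniform(-4, 1)
        score = cross_val_score(SVR(C=c, gamma=gamma), x, y, cv=3).mean()
        if score > best_score:
            best_score, best = score, (c, gamma)
    return best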
Example #12
def predict_device_byday_SVR():
    X,Y_unique,Y_all,X_raw = load_device_counter_byday()

    from sklearn.svm import SVR
    model = SVR()
    # model = SVR(kernel='linear')
    training_size = 160
    # model.fit(X[:training_size],Y_unique[:training_size])
    model.fit(X[:training_size],Y_all[:training_size])

    start_index = 180
    end_index = 190
    X_to_predict = X[start_index:end_index]
    # X_to_predict.append([date_str_toordinal('2017-04-18')])
    # X_to_predict.append([date_str_toordinal('2017-03-27')])

    print(X_to_predict)
    # Y_real = Y_unique[start_index:end_index]
    Y_real = Y_all[start_index:end_index]
    print(X_raw[start_index:end_index])
    y_predicted=model.predict(X_to_predict)
    # print y_predicted
    y_predicted = np.array(y_predicted).astype(int)
    print(y_predicted)
    print(Y_real)
    # print y_predicted - np.array(Y_real)

    # plt.subplot(111)
    # plt.scatter(X_to_predict,Y_real,c='r')
    plt.scatter(X_to_predict,y_predicted)
    # plt.plot(X_to_predict,y_predicted)
    plt.show()
Example #13
def main(args):
    (training_file, label_file, test_file, test_label, c, e) = args
    svr = SVR(C=float(c), epsilon=float(e), kernel='rbf')
    X = load_feat(training_file)
    y = [float(line.strip()) for line in open(label_file)]
    
    X = np.asarray(X)
     
    y = np.asarray(y)
    
    test_X = load_feat(test_file)
    test_X = np.asarray(test_X)
    test_X[np.isnan(test_X)] = 0

    svr.fit(X, y)
    
    pred = svr.predict(test_X)
    if test_label != 'none':
        test_y = [float(line.strip()) for line in open(test_label)]
        test_y = np.asarray(test_y)
        print('MAE: ', mean_absolute_error(test_y, pred))
        print('RMSE: ', sqrt(mean_squared_error(test_y, pred)))
        print('corrpearson: ', sp.stats.pearsonr(test_y, pred))
        print('r-sqr: ', sp.stats.linregress(test_y, pred)[2] ** 2)
        print(mquantiles(test_y, prob=[0.10, 0.90]))
        print(mquantiles(pred, prob=[0.10, 0.90]))
    with open(test_file + '.svr.pred', 'w') as output:
        for p in pred:
            print(p, file=output)
    return
Example #14
def train_learning_model_svm(df):
    X_all, y_all = preprocess_data(df)
    X_train, X_test, y_train, y_test = split_data(X_all, y_all)

    regressor = SVR()
    regressor.fit(X_train, y_train)
    calculate_results(regressor, X_train, X_test, y_train, y_test)
Example #15
def train_SVR(viper):

	from sklearn.svm import SVR
	model = SVR(C=10, kernel='rbf', shrinking=False, verbose=True)
	model.fit(viper.train_feat, viper.train_y)

	return model
Example #16
    def svr(self, X, y):
        """ Train support vector regression model

        Parameters
        ----------
        X : numpy ndarray with numeric values
            Array containing input parameters
            for the model. Model will try to
            learn the output y[i] in terms of
            inputs X[i]

        y : columnar numpy array with numeric values
            Array containing single column of
            output values. Entry at y[i] corresponds
            to value of the underlying experiment
            for input parameters X[i]

        Returns
        -------
        result : model
                Model learnt from the incoming
                inputs and outputs

        """
        clf = SVR(C=1.0, epsilon=0.2)
        clf.fit(X, y)
        return clf
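A hypothetical usage sketch for the method above, assuming the enclosing class (not shown in this example) is instantiated as `tuner`:

import numpy as np

X = np.random.rand(50, 3)                        # 50 experiments, 3 input parameters
y = np.sin(X[:, 0]) + 0.1 * np.random.randn(50)  # noisy scalar output per experiment

model = tuner.svr(X, y)        # fits SVR(C=1.0, epsilon=0.2) as defined above
print(model.predict(X[:5]))    # predicted outputs for the first five rows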
Example #17
def draw_svr_single(real_data, name):
    history = []
    for i in range(1, 32):
        h = [i]
        history.append(h)
    
    from sklearn.svm import SVR
    svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
    svr_lin = SVR(kernel='linear', C=1e3)
    svr_poly = SVR(kernel='poly', C=1e3, degree=2)
    y_rbf = svr_rbf.fit(history, real_data).predict(history)
    y_lin = svr_lin.fit(history, real_data).predict(history)
    y_poly = svr_poly.fit(history, real_data).predict(history)
    
    import pylab as pl
    pl.scatter(history, real_data, c='k', label='data')
    # pl.hold('on')  # hold() was removed in matplotlib 3.0; overlaying is now the default
    pl.plot(history, y_rbf, c='g', label='RBF model')
    pl.plot(history, y_lin, c='r', label='Linear model')
    pl.plot(history, y_poly, c='b', label='Polynomial model')
    pl.xlabel('data')
    pl.ylabel('target')
    pl.title('Support Vector Regression: ' + name)
    pl.legend()
    pl.show()
Example #18
def machinelearning(csv_file):
  # parse CSV
  d = {}
  d['date'] = []
  d['radiation'] = []
  d['humidity'] = []
  d['temperature'] = []
  d['wind'] = []
  d['demand'] = []

  dictreader = csv.DictReader(csv_file, fieldnames=['date', 'radiation', 'humidity', 'temperature', 'wind', 'demand'], delimiter=',')

  next(dictreader)
  for row in dictreader:
    for key in row:
      d[key].append(row[key])

  # interpolate weather data
  interpolate(d['radiation'])
  interpolate(d['humidity'])
  interpolate(d['temperature'])
  interpolate(d['wind'])

  # train machine learning algorithm
  training_x = np.array(list(zip(d['radiation'], d['humidity'], d['temperature'], d['wind']))[:32])  # list() needed in Python 3
  training_y = np.array(d['demand'][:32])

  poly_svr = SVR(kernel='poly', degree=2)
  poly_svr.fit(training_x, training_y)

  prediction_x = np.array(list(zip(d['radiation'], d['humidity'], d['temperature'], d['wind']))[32:])
  demand_predictions = poly_svr.predict(prediction_x)

  return demand_predictions
Example #19
class SVR(PlayerModel):
    ### a wrapper for support vector regression using scikit-learn for this project
    def __init__(self):
        PlayerModel.__init__(self)
        # configure support vector regression and start training
        self.regr = SupportVectorRegression(kernel = 'linear', C = 1000)
        self.regr.fit(self.dataset_X_train, self.dataset_Y_train)
        print "Finish building player model."
        print "Parameters: ", self.regr.get_params()
        print "============================================================"

    def testScore(self, test_X):
        score = self.regr.predict(self.normalizeTest(test_X))
        return np.mean(score)

    def getParams(self):
        return self.regr.get_params()

    def visualize(self):
        x = np.zeros((10, self.col - 1))
        mean = self.dataset_X_train.mean(0)
        for i in range(10):
            x[i, :] = mean
        x[:, 0:1] = np.array([np.arange(0.0, 1.1, 0.11)]).T
        # print x
        y = self.regr.predict(x)
        # print y
        pyplot.scatter(self.dataset_X_train[:, 0:1], self.dataset_Y_train, c='k', label='data')
        # pyplot.hold('on')  # hold() was removed in matplotlib 3.0
        pyplot.plot(x[:, 0:1], y, c = "r", label='Support Vector Regression')
        pyplot.xlabel('data collect from player')
        pyplot.ylabel('score')
        pyplot.title('Support Vector Regression')
        pyplot.legend()
        pyplot.show()
Example #20
def CaSVRModel(X_train, Y_train, X_test, Y_test, cv_iterator):
#     
#     param_grid = {'C':[10000],
#                    'epsilon':[0.001, 0.01, 0.05, 0.1, 0.15, 1]
#                    }
#       
#     svr = SVR(random_state=42, cache_size=1000, verbose=2)
#     search = GridSearchCV(svr, param_grid, scoring="mean_squared_error", n_jobs= 1, iid=True, cv=cv_iterator)
#     search.fit(X_train, Y_train["Ca"])
#     #search.grid_scores_
#       
#     model = search.best_estimator_

    #scaler = StandardScaler()

    model = SVR(C=10000, epsilon = 0.01, cache_size=1000)
    model.fit(X_train, Y_train["Ca"])
    
    #test = cross_val_score(svr, X_train.astype('float64'), Y_train["Ca"].astype('float64'), scoring="mean_squared_error", cv=cv_iterator)
    
    yhat_svr = model.predict(X_test)
    test_error = math.sqrt(mean_squared_error(Y_test["Ca"], yhat_svr))
    
    return model, test_error
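The commented-out block at the top of CaSVRModel sketches the grid search this model presumably came from. A runnable equivalent under current scikit-learn (where the scoring string is "neg_mean_squared_error", `iid` is gone, and SVR accepts no `random_state`) might look like:

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

param_grid = {'C': [10000],
              'epsilon': [0.001, 0.01, 0.05, 0.1, 0.15, 1]}

svr = SVR(cache_size=1000)
search = GridSearchCV(svr, param_grid,
                      scoring="neg_mean_squared_error",
                      n_jobs=1, cv=cv_iterator)  # cv_iterator as passed to CaSVRModel
search.fit(X_train, Y_train["Ca"])
model = search.best_estimator_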
Example #21
def train_svm(train_file, avg={}):
    test_X, test_Y, weight = load_data(train_file, avg)
    svr = SVR(kernel='rbf', C=100, gamma=1, verbose=True, cache_size=1024)
    print("start train")
    svr.fit(test_X, test_Y)
    print("train finish")
    return svr
Example #22
def Sand_SVR(X_train, Y_train, X_test, Y_test, cv_iterator):
    
    #===========================================================================
    # param_grid = {'C':[100,500,1000, 5000, 10000, 100000],
    #               'epsilon':[0.075,0.1, 0.125]
    #               }
    #  
    # svr = SVR(cache_size = 1000, random_state=42)
    # search = GridSearchCV(svr, param_grid, scoring="mean_squared_error", cv=cv_iterator)
    #===========================================================================
    #search.fit(X_train, Y_train["Sand"])
    #search.grid_scores_
    
    #svr = search.best_estimator_ 
    #svr.fit(X_train, Y_train["SAND"])
    
    #test = cross_val_score(svr, X_train.astype('float64'), Y_train["Ca"].astype('float64'), scoring="mean_squared_error", cv=cv_iterator)
    
    svr = SVR(C=10000)
    svr.fit(X_train, Y_train["Sand"])
    
    yhat_svr = svr.predict(X_test)
    test_error = math.sqrt(mean_squared_error(Y_test["Sand"], yhat_svr))
    
    return svr, test_error
Example #23
    def RunSVRScikit(q):
      totalTimer = Timer()

      # Load input dataset.
      Log.Info("Loading dataset", self.verbose)
      # Use the last row of the training set as the responses.
      X, y = SplitTrainData(self.dataset)

      # Get all the parameters.
      c = re.search(r"-c (\d+\.\d+)", options)
      e = re.search(r"-e (\d+\.\d+)", options)
      g = re.search(r"-g (\d+\.\d+)", options)

      C = 1.0 if not c else float(c.group(1))
      epsilon = 1.0 if not e else float(e.group(1))
      gamma = 0.1 if not g else float(g.group(1))

      try:
        with totalTimer:
          # Perform SVR.
          model = SSVR(kernel='rbf', C=C, epsilon=epsilon, gamma=gamma)
          model.fit(X, y)
      except Exception as e:
        q.put(-1)
        return -1

      time = totalTimer.ElapsedTime()
      q.put(time)
      return time
Example #24
def train_model(train, test, labels):
    clf = SVR(C=1.0, epsilon=0.2)
    clf.fit(train, labels)
    #clf = GaussianNB()
    #clf.fit(train, labels)
    print "Good!"
    predictions = clf.predict(test)
    print predictions.shape
    predictions = pd.DataFrame(predictions, columns = ['relevance'])
    print "Good again!"
    print "Predictions head -------"
    print predictions.head()
    print predictions.shape
    print "TEST head -------"
    print test.head()
    print test.shape
    test['id'].to_csv("TEST_TEST.csv",index=False)
    predictions.to_csv("PREDICTIONS.csv",index=False)
    #test = test.reset_index()
    #predictions = predictions.reset_index()
    #test = test.groupby(level=0).first()
    #predictions = predictions.groupby(level=0).first()
    predictions = pd.concat([test['id'],predictions], axis=1, verify_integrity=False)
    print(predictions)
    return predictions
Example #25
def learn(X, y):
    # do pca
    pca = PCA(n_components=6)
    pca_6 = pca.fit(X)

    print('variance ratio')
    print(pca_6.explained_variance_ratio_)
    X = pca.fit_transform(X)

    # X = np.concatenate((X_pca[:, 0].reshape(X.shape[0], 1), X_pca[:, 5].reshape(X.shape[0], 1)), axis=1)
    # do svr
    svr_rbf = SVR(kernel='rbf', C=1)
    svr_rbf.fit(X, y)
    # print(model_rbf)

    y_rbf = svr_rbf.predict(X)
    print(y_rbf)
    print(y)

    # see difference
    y_rbf = np.transpose(y_rbf)
    deviation(y, y_rbf)

    # pickle model
    with open('rbfmodel.pkl', 'wb') as f:
        pickle.dump(svr_rbf, f)

    with open('pcamodel.pkl', 'wb') as f:
        pickle.dump(pca_6, f)
Example #26
class SVMLearner(object):

    def __init__(self, kernel="linear", C=1e3, gamma=0.1, degree=2, verbose = False):
		self.name = "{} Support Vector Machine Learner".format(kernel.capitalize())
		self.kernel=kernel
		if kernel=="linear":
			self.svr = SVR(kernel=kernel, C=C)
		elif kernel=="rbf":
			self.svr = SVR(kernel=kernel, C=C, gamma=gamma)
		elif kernel=="poly":
			self.svr = SVR(kernel=kernel, C=C, degree=degree)

    def addEvidence(self,dataX,dataY):
        """
        @summary: Add training data to learner
        @param dataX: X values of data to add
        @param dataY: the Y training values
        """
        # build and save the model
        self.svr.fit(dataX, dataY)
        
    def query(self,points):
        """
        @summary: Estimate a set of test points given the model we built.
        @param points: should be a numpy array with each row corresponding to a specific query.
        @returns the estimated values according to the saved model.
        """
        return self.svr.predict(points)
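A minimal standalone sketch of the addEvidence/query pattern above, leaving out the project-specific PlayerModel plumbing (which is not shown in this example):

import numpy as np
from sklearn.svm import SVR

svr = SVR(kernel="rbf", C=1e3, gamma=0.1)
dataX = np.random.rand(100, 4)
dataY = dataX @ np.array([1.0, -2.0, 0.5, 0.0]) + 0.05 * np.random.randn(100)
svr.fit(dataX, dataY)             # addEvidence: build and save the model
print(svr.predict(dataX[:3]))     # query: estimate a set of test points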
Example #27
def test_regression_custom_mse():

    X, y = make_regression(n_samples=1000,
                           n_features=5,
                           n_informative=2,
                           n_targets=1,
                           random_state=123,
                           shuffle=False)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=123)

    svm = SVR(kernel='rbf', gamma='auto')
    svm.fit(X_train, y_train)

    imp_vals, imp_all = feature_importance_permutation(
        predict_method=svm.predict,
        X=X_test,
        y=y_test,
        metric=mean_squared_error,
        num_rounds=1,
        seed=123)

    norm_imp_vals = imp_vals / np.abs(imp_vals).max()

    assert imp_vals.shape == (X_train.shape[1], )
    assert imp_all.shape == (X_train.shape[1], 1)
    assert norm_imp_vals[0] == -1.
Example #28
 def train(self, pairings):
     X, Y = self.getXY(pairings)
     self.svms = []
     for i in range(self.wine_feat_len):
         svm = SVR(kernel='rbf')
         svm.fit(X, Y[:, i])
         self.svms.append(svm)
Example #29
def test_regression():

    X, y = make_regression(n_samples=1000,
                           n_features=5,
                           n_informative=2,
                           n_targets=1,
                           random_state=123,
                           shuffle=False)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=123)

    svm = SVR(kernel='rbf')
    svm.fit(X_train, y_train)

    imp_vals, imp_all = feature_importance_permutation(
        predict_method=svm.predict,
        X=X_test,
        y=y_test,
        metric='r2',
        num_rounds=1,
        seed=123)

    assert imp_vals.shape == (X_train.shape[1], )
    assert imp_all.shape == (X_train.shape[1], 1)
    assert imp_vals[0] > 0.2
    assert imp_vals[1] > 0.2
    assert sum(imp_vals[3:]) <= 0.01
Example #30
Y = Y.reshape(-1, 1)

# In[37]:

from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_Y = StandardScaler()
X = sc_X.fit_transform(X)
Y = sc_Y.fit_transform(Y)

# In[39]:

from sklearn.svm import SVR
regressor = SVR(kernel='rbf')
regressor.fit(X, Y.ravel())  # ravel() avoids a column-vector DataConversionWarning

# In[40]:

Y_Pred = sc_Y.inverse_transform(
    regressor.predict(sc_X.transform(np.array([[6.5]]))).reshape(-1, 1))

# In[42]:

import matplotlib.pyplot as plt
plt.scatter(X, Y, color='red')
plt.plot(X, regressor.predict(X), color='blue')
plt.title('Regression Results')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()
Example #31
def run_vary_cutoff(arg):
    k, cat = arg

    ibp = IBP(cutoff=k, enable_cluster=True, n_max_iter=10000)
    ibp.fit(training_votes[2], training_votes[1], cats=training_cats[:, 0])
    y_per = training_votes[1] / training_votes[2].astype(float)
    y_ibp = ibp(training_votes[2], training_votes[1], training_cats[:, 0])

    clf_per = SVR(**cat_parameters['PER'][cat])
    clf_ibp = SVR(**cat_parameters['IBP'][cat])

    X = np.array(
        list(
            map(
                lambda x: x[1],
                filter(lambda x: x[0] == cat,
                       zip(training_cats[:, 0], training_x)))))
    X_tsb = np.array(
        list(
            map(lambda x: x[1],
                filter(lambda x: x[0] == cat, zip(tsb_cats[:, 0], tsb_x)))))
    y_tsb = np.array(
        list(
            map(lambda x: x[1],
                filter(lambda x: x[0] == cat, zip(tsb_cats[:, 0],
                                                  tsb_truth)))))

    clf_per.fit(
        X,
        np.array(
            list(
                map(
                    lambda x: x[1],
                    filter(lambda x: x[0] == cat,
                           zip(training_cats[:, 0], y_per))))))
    clf_ibp.fit(
        X,
        np.array(
            list(
                map(
                    lambda x: x[1],
                    filter(lambda x: x[0] == cat,
                           zip(training_cats[:, 0], y_ibp))))))

    tsb_y_hat_per = clf_per.predict(X_tsb)
    tsb_y_hat_ibp = clf_ibp.predict(X_tsb)

    mse_tsb_per = ((tsb_y_hat_per - y_tsb)**2).mean()
    mae_tsb_per = abs(tsb_y_hat_per - y_tsb).mean()
    rmse_tsb_per = mse_tsb_per**0.5
    mse_tsb_ibp = ((tsb_y_hat_ibp - y_tsb)**2).mean()
    mae_tsb_ibp = abs(tsb_y_hat_ibp - y_tsb).mean()
    rmse_tsb_ibp = mse_tsb_ibp**0.5

    print(2, cat, 'tsb', (training_cats[:, 0] == cat).astype(int).sum(),
          (tsb_cats[:, 0] == cat).astype(int).sum(), mse_tsb_per, mse_tsb_ibp,
          (mse_tsb_per - mse_tsb_ibp) / mse_tsb_per, mae_tsb_per, mae_tsb_ibp,
          (mae_tsb_per - mae_tsb_ibp) / mae_tsb_per, rmse_tsb_per,
          rmse_tsb_ibp, (rmse_tsb_per - rmse_tsb_ibp) / rmse_tsb_per)

    return [[
        2, cat, 'tsb', (training_cats[:, 0] == cat).astype(int).sum(),
        (tsb_cats[:, 0] == cat).astype(int).sum(), mse_tsb_per, mse_tsb_ibp,
        (mse_tsb_per - mse_tsb_ibp) / mse_tsb_per, mae_tsb_per, mae_tsb_ibp,
        (mae_tsb_per - mae_tsb_ibp) / mae_tsb_per, rmse_tsb_per, rmse_tsb_ibp,
        (rmse_tsb_per - rmse_tsb_ibp) / rmse_tsb_per,
        ttest_rel(tsb_y_hat_per, tsb_y_hat_ibp).pvalue
    ]]
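The map/filter chains above simply select the rows whose category equals `cat`. If the underlying containers were numpy arrays, the same selection could be written with boolean masks (a sketch, not the author's code):

mask_train = training_cats[:, 0] == cat   # training rows in this category
X = training_x[mask_train]
y_per_cat = y_per[mask_train]
mask_tsb = tsb_cats[:, 0] == cat          # test rows in this category
X_tsb = tsb_x[mask_tsb]
y_tsb = tsb_truth[mask_tsb]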
Example #32
def home():
    """Renders the home page."""
    if request.method == 'POST':
        ticker = request.form.get("ticker")
        ticker = ticker.upper()
        sns.set_style("whitegrid")

        files = []
        files_SMA = []
        ticker_list = []
        #global ticker_list
        # ticker_list = ticker.split()
        ticker_list = ticker

        #global ticker_list
        #global files
        today = date.today()

        start_date = "2016-01-01"
        end_date = today

        def getData(ticker):  # downloading data
            try:
                data = pdr.get_data_yahoo(ticker,
                                          start=start_date,
                                          end=end_date)
                # dataname = ticker + '_' + str(start_date) + '-' + str(end_date)
                data['SMA_200'] = data.iloc[:, 5].rolling(window=200).mean()
                data['SMA_50'] = data.iloc[:, 5].rolling(window=50).mean()
                files_SMA.append(ticker)
                files.append(ticker)
                SaveData(data, ticker)
            except RemoteDataError:
                pass

        def SaveData(df, filename):
            # df.to_csv('./data/' + filename + ".csv")
            dnew = df.iloc[200:]
            dnew.to_csv(filename + '.csv')

        # def SMA(filename):
        #     df = pd.read_csv(filename + ".csv")
        #     df['SMA_200'] = df.iloc[:, 5].rolling(window=200).mean()
        #     df['SMA_50'] = df.iloc[:, 5].rolling(window=50).mean()
        #     dataname = filename + "_with_SMA"
        #     files_SMA.append(filename)
        #     SaveData(df, filename)

        #for tik in ticker_list:
        #getData(tik)
        getData(ticker_list)
        # for i in files:
        #     SMA(i)

        filename = ticker

        # filename = input('enter ticker symbol: ')
        df = pd.read_csv(filename + '.csv')

        # Remove the date
        del df['Date']

        # A variable for predicting 'n' days out into the future
        forecast_out = 30  # 'n=30' days
        # Create another column (the target ) shifted 'n' units up
        df['Prediction'] = df[['Adj Close']].shift(-forecast_out)
        # print(df.tail())

        # Convert the dataframe to a numpy array
        X = np.array(df.drop(['Prediction'], axis=1))

        # Remove the last '30' rows
        X = X[:-forecast_out]

        # Convert the data frame to a numpy array
        y = np.array(df['Prediction'])
        # Get all of the y values except the last '30' rows
        y = y[:-forecast_out]

        # Split the data into 80% training and 20% testing
        x_train, x_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2)

        # Create and train the Support Vector Machine (Regressor)
        svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
        svr_rbf.fit(x_train, y_train)
        svr_confidence = 0
        x_train, x_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2)
        stop = 1
        # while loop to get the best results
        while svr_confidence <= stop:
            x_train, x_test, y_train, y_test = train_test_split(X,
                                                                y,
                                                                test_size=0.2)
            svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
            svr_rbf.fit(x_train, y_train)
            svr_confidence = svr_rbf.score(x_test, y_test)
            stop -= 0.01

        # Create and train the Linear Regression  Model
        lr = LinearRegression()
        lr.fit(x_train, y_train)
        lr_confidence = 0
        stop = 1
        # while loop to get the best results
        while lr_confidence <= stop:
            lr.fit(x_train, y_train)
            lr_confidence = lr.score(x_test, y_test)
            stop -= 0.01

        # Set x_forecast equal to the last 30 rows of the original data set from Adj. Close column
        x_forecast = np.array(df.drop(['Prediction'], axis=1))[-forecast_out:]

        # Print linear regression model predictions for the next '30' days
        lr_prediction = lr.predict(x_forecast)
        old = df[['Adj Close']]
        for x in lr_prediction:
            new_row = {
                'Open': 0,
                'High': 0,
                'Low': 0,
                'Close': 0,
                'Adj Close': x,
                'Volume': 0
            }
            df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)  # DataFrame.append was removed in pandas 2.0

        # svm_prediction = svr_rbf.predict(x_forecast)

        plt.rcParams.update({'font.size': 18})
        plt.figure(figsize=(15, 11))
        plt.xlim([len(df) - 100, len(df) - 1])
        plt.ylim([(df['Adj Close'].tail(100).min() -
                   df['Adj Close'].tail(100).min() * 0.1),
                  (df['Adj Close'].tail(100).max() +
                   df['Adj Close'].tail(100).max() * 0.1)])
        plt.plot(df['Adj Close'], color='red', label='Predicted Price')
        plt.plot(old['Adj Close'], color='k', label="Past Data")
        plt.plot(df['SMA_200'], color='b', label='SMA 200')
        plt.plot(df['SMA_50'], color='g', label='SMA 50')
        # plt.yticks(np.arange(int(df['Adj Close'].tail(100).min() * 1.1), int(df['Adj Close'].tail(100).max() * 1.1), step=(int(df['Adj Close'].tail(100).max() * 1.1))/10))
        plt.title("30 Day Prediction of " + filename)
        plt.xlabel("Days")
        plt.ylabel("Adj. Close Price $")
        plt.legend()
        plt.savefig("FlaskWebProject1\\static\\images\\_graph.png")

        # plt.show()
        return render_template(
            'index.html',
            title='Home Page',
            url='static/images/_graph.png',
            year=datetime.now().year,
        )
    else:
        return render_template(
            'index.html',
            title='Home Page',
            year=datetime.now().year,
        )
Example #33
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('Position_Salaries.csv')
X = dataset.iloc[:, 1:2].values
y = dataset.iloc[:, 2].values

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()
X = sc_X.fit_transform(X)
y = sc_y.fit_transform(y.reshape(-1, 1)).ravel()  # StandardScaler expects a 2-D array

# Fitting SVR to the dataset
from sklearn.svm import SVR
regressor = SVR(kernel='rbf')
regressor.fit(X, y)

# Predicting a new result (scale the input, then invert the target scaling)
y_pred = sc_y.inverse_transform(
    regressor.predict(sc_X.transform(np.array([[6.5]]))).reshape(-1, 1))

# Visualising the SVR results
plt.scatter(X, y, color='red')
plt.plot(X, regressor.predict(X), color='blue')
plt.title('Truth or Bluff (SVR)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()
Example #34
def clip_to_100(val):
    if val < 0:
        return 0
    if val > 100:
        return 100
    return val


# In[6]:

train_df[input_var_names] = train_df.word.apply(get_features)

# In[7]:

valid_df[input_var_names] = valid_df.word.apply(get_features)

# In[11]:

predict_df = valid_df.copy()
for feat_name in output_var_names:
    #model = LinearRegression()
    model = SVR()

    model.fit(train_df[input_var_names], train_df[feat_name])
    predict_df[feat_name] = model.predict(predict_df[input_var_names])
    predict_df[feat_name] = predict_df[feat_name].apply(clip_to_100)

# In[12]:

src.eval_metric.evaluate(predict_df, valid_df)
Example #35
# Viewing and describing the dataset
df.describe()

df.head(5)

# Defining the independent and dependent variables
X = df.loc[:, 'LotArea'].values.reshape(-1,1)
y = df.loc[:, 'SalePrice'].values.reshape(-1,1)

# Splitting the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Feature scaling
X_train = feature_scaling(X_train)
X_test = feature_scaling(X_test)

# Training the regression model on the training set
regressor = SVR(kernel = 'rbf')
regressor.fit(X_train, y_train)

# Evaluating the model with the r2 metric
regressor.score(X_test, y_test)

# Predicting the results on the test set
y_pred = regressor.predict(X_test)

# Plotting the training-set results
plot_results_reg(X_train, y_train, regressor, 'SVR (Conj. de Treinamento)')

# Plotting the test-set results
plot_results_reg(X_test, y_test, regressor, 'SVR (Conj. de Testes)')
Example #36
target = 'G3'
X = np.array(df_new.drop([target], axis=1))
y = np.array(df_new[target])

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=42)

linear = linear_model.LinearRegression()
tree = DecisionTreeRegressor()
svr = SVR(kernel='rbf', C=8)

linear.fit(X_train, y_train)
tree.fit(X_train, y_train)
svr.fit(X_train, y_train)

linear_predict = pd.DataFrame(linear.predict(X_test))
tree_predict = pd.DataFrame(tree.predict(X_test))
svr_predict = pd.DataFrame(svr.predict(X_test))
y_real = pd.DataFrame(y_test)

new_df = pd.DataFrame()

new_df[['y_real']] = y_real
new_df[['linear_predict']] = linear_predict
new_df[['tree_predict']] = tree_predict
new_df[['svr_predict']] = svr_predict

print(new_df.head())
Example #37
def apply_regression(filename, inCol, outCol, predValues):
	# read input output columns from filename
	degree,rank,isString = load_all(filename, inCol, outCol);

	printV(list(zip(degree,rank)))
	#TODO : check that output column can't be string
	
	if (len(degree)==0 or len(rank)==0 ):
		print('ERROR : Input or Output data is empty.')
		return [];
	
	
	# REPLACE STRINGS with numbers
	#degree, uniqDegree = removeStrings(degree,isString);
	
	degree = array(degree); rank = array(rank);
	#degree = addStrings(degree, uniqDegree);
	
	"""
	print 'Degree: ';
	printV(degree);
	print 'uniqDegree: ';
	printV(uniqDegree);
	"""
	
	
	degree,Dscalers = normalizeColumns(degree);
	print(degree);
	rank,Rscalers = normalizeColumns(rank);
	
	#printV(zip(degree,rank));
	"""
	degree= deNormalizeColumns(degree,Dscalers);
	rank= deNormalizeColumns(rank,Rscalers);
	
	printV(zip(degree,rank));
	"""
	
	# generate prediction inputs
	order = [];
	pv = [];
	for e in predValues:
		v = linspace(e[1], (e[1]+(e[2]-1)*e[3]), e[2] );
		v=v.reshape(-1,1);
		if (len(pv)==0):
			pv=v;
		else:
			pv=hstack((pv,v));
		order.append(e[0]);
	
	#print 'Order:',order
	#print 'Predicted Values:\n',pv
	
	runs=1;

	# convert both degree,rank to log scale for better prediction accuracy
	#rank = [ log(x) for x in rank ]
	#degree = [ log(x) for x in degree ]

	degree = array(degree);
	rank = array(rank);
	#freq = array(freq);
	
	"""
	# normalize degrees and ranks between [0,1]
	MaxDegree = max(degree)*2;
	MinDegree = min(degree);
	MaxRank = max(rank);
	degree = array( [(x)/float(MaxDegree) for x in degree] );
	rank = rank/float(MaxRank);
	"""
	if (pri>3):
		printV(list(zip(degree,rank)))
	
	print(degree.shape, rank.shape)

	N = len(degree)

	AvgErr = 0;
	AvgWerr = 0;

	for rr in range(0,runs):

		degree,rank=doShuffle(degree,rank);
		
		"""
		ff = open('in.txt','w');
		
		ff.write(str(MaxRank)+'\n');
		
		for e in zip(degree,rank):
			ff.write(str(e[0])+' '+str(e[1])+'\n');
		"""
		
		if (len(inCol)==1):
			degree = degree.reshape(-1,1);
		if (len(rank.shape)==1):
			rank = rank.reshape(-1,1);
		
		printV( list(zip(degree,rank)) )
		
		
		# split data into training and testing instances
		splitRatio=0.9 # splitRatio determines how many instances are used for training and testing. eg: 0.2 means 20% train, 80% test
		spl= int(splitRatio*N); #split location for train-test
		
		trI=array(degree[:spl]); trL=array(rank[:spl]); # trI - training instances, trL - training labels
		teI=array(degree[spl:]); teL=array(rank[spl:]); # teI - testing instances, teL - testing labels
		
		trI=trI.astype('float'); teI=teI.astype('float');
		
		
		"""
		print 'Train data:\n'
		printV(zip(trI,trL));
		print 'Test data:\n'
		printV(zip(teI,teL));
		print '\n\n\n'
		"""
		print(trI.shape, trL.shape)

		print("Train : ", int(splitRatio*N), "\t Test: ", int((1.0-splitRatio)*N))
		
		
		useSVM=1;
		NoInputs = 1;
		ignoreExtra =1;
		svr=SVR();
		
		"""
		if (useSVM==0):
		
			# set parameters of neural network regression model
			nn = MLPRegressor(hidden_layer_sizes=(20), activation='tanh', solver='lbfgs', alpha=0.0001, batch_size='auto', learning_rate='adaptive', learning_rate_init=0.001, max_iter=500, shuffle=True, random_state=None, tol=0.00001, verbose=False, momentum=0.5, early_stopping=True, validation_fraction=0.15)
		else:
		"""
		svr = SVR(C=100, cache_size=200, epsilon=0.00001, gamma=3,kernel='rbf', max_iter=-1, shrinking=True, tol=0.000001, verbose=False)
		# gamma is fitting parameter, small gamma-> simpler curve, >gamma-> complex curve
		
			
		# train NN regression model
		  
		svr.fit(trI,trL);
		
		
		# test model to get accuracy
		
		res = svr.score(teI,teL);
		# 'res' represents how well regression model is learned. 
		# It is defined as (1 - u/v), where u is the residual sum of squares ((y_true - y_pred) ** 2).sum() 
		# and v is the total sum of squares ((y_true - y_true.mean()) ** 2).sum()
		
		if (pri>2):
			print('Accuracy measure: ', res)
			
		# predict label/rank for test instances/degrees for calculating error
		
		yres = svr.predict(teI);
		
		sum=0;
		wsum=0;
		if (pri>2):
			print('Predicted', '\t', 'Actual Rank')
				
		linSum = 0;
		"""
		# calculate deviation from true vaue for each test instance
		for e in sorted(zip(yres,teL,teI)):
			
			prank = max(1, (e[0]*MaxRank)) # predicted rank
			trank =  (e[1]*MaxRank) # true rank
			
			sum+=abs(prank-trank)
						
			lrank =0 ;
						
			if (pri>2):
				print int(prank),"\t", trank, '\t',e[2],'\t',lrank
			
		if (pri>2):	
			print 'Avg error: ',(sum/len(yres))    
		
		AvgErr+=(sum/len(yres)) 
		AvgWerr+=(wsum/len(yres))
	if (pri>2):	
		print 'Avg error: ',(AvgErr/runs)
	"""
	
	#pv = array( [(x)/float(MaxDegree) for x in pv] );
	pv = normalizeValues(pv,Dscalers);
	
	Pred = svr.predict(pv);
	#printV(zip(pv,Pred));
	
	#print(pv)
	
	"""
	Pred = ( [(x)*MaxRank for x in Pred] );
	yres = ( [(x)*MaxRank for x in yres] );
	pv = array( [(x)*float(MaxDegree) for x in pv] );
	teI = array( [(x)*float(MaxDegree) for x in teI] );
	trI = array( [(x)*float(MaxDegree) for x in trI] );
	teL = ( [(x)*MaxRank for x in teL] );
	trL = ( [(x)*MaxRank for x in trL] );
	"""
	if (len(Pred.shape)==1):
		Pred = Pred.reshape(-1,1);
	
	Pred = deNormalizeColumns(Pred,Rscalers);
	trI = deNormalizeColumns(trI,Dscalers);
	trL = deNormalizeColumns(trL,Rscalers);
	teI = deNormalizeColumns(teI,Dscalers);
	teL = deNormalizeColumns(teL,Rscalers);
	pv = deNormalizeColumns(pv,Dscalers);
	
	#printV(zip(pv,Pred));
	#printV(zip(teI,teL));
	
	# show plot of predicted rank(dotted line) and actual rank (continuous line). 
	# NOTE : x-axis is degree. Both degree(x-axis) and rank(y-axis) are on log scale and normalized
	
	#z=array(sorted(zip(teL,yres,teI[:,0])));
	#plt.plot(z[:,2],z[:,1],'x',ms=3)
	#plt.plot(z[:,2],z[:,0],'-',ms=2)
	
	plt.plot(trI[:,0],trL,'o',ms=2)
	
	#plt.plot(teI[:,0],teL,'o',ms=5)
	
	plt.plot(pv,Pred,'x',ms=3)
	
	#plt.xlabel('Year')
	#plt.ylabel('Mortality Rate')
	#plt.show()
	savefig('C:\\xampp1\\htdocs\\ogd\\visual.jpg');
	
	pv = pv.tolist();
	Pred = [e[0] for e in Pred]
	pv = [e[0] for e in pv]
	result = [ [e[0],e[1]] for e in zip(pv,Pred) ]
	#result = result.tolist();
	return result;

#Pr = apply_regression('data.txt',[0],[1],[[0,2013,20,1]]);
#printV (Pr);
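The commented note above `svr.score` describes scikit-learn's R² (coefficient of determination). A minimal check of that definition on toy arrays:

import numpy as np
from sklearn.metrics import r2_score

y_true = np.array([1.0, 2.0, 3.0, 4.0])
y_pred = np.array([1.1, 1.9, 3.2, 3.8])

u = ((y_true - y_pred) ** 2).sum()         # residual sum of squares
v = ((y_true - y_true.mean()) ** 2).sum()  # total sum of squares
print(1 - u / v)                           # same value as r2_score(y_true, y_pred)
print(r2_score(y_true, y_pred))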
Example #38
def impute_regression(x_train, y_train, x_test):
    svr_rbf = SVR(kernel='rbf', C=1, gamma=0.15)
    model = svr_rbf.fit(x_train, y_train)
    y_test = model.predict(x_test)

    return y_test
Example #39
# The mathematical definition of "kernels" and "support vector machines" is
# beyond the scope of this course. We encourage interested readers with a
# mathematical training to have a look at the scikit-learn [documentation on
# SVMs](https://scikit-learn.org/stable/modules/svm.html) for more details.
#
# For the rest of us, let us just develop some intuitions on the relative
# expressive power of support vector machines with linear and non-linear
# kernels by fitting them on the same dataset.
#
# First, consider a support vector machine with a linear kernel:

# %%
from sklearn.svm import SVR

svr = SVR(kernel="linear")
svr.fit(data, target)
target_predicted = svr.predict(data)
mse = mean_squared_error(target, target_predicted)

# %%
ax = sns.scatterplot(data=full_data, x="input_feature", y="target")
ax.plot(data, target_predicted, color="tab:orange")
_ = ax.set_title(f"Mean squared error = {mse:.2f}")

# %% [markdown]
#
# The predictions of our SVR with a linear kernel are all aligned on a straight
# line. `SVR(kernel="linear")` is indeed yet another example of a linear model.
#
# The estimator can also be configured to use a non-linear kernel. Then, it can
# learn a prediction function that computes non-linear interactions between
# samples.
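A sketch of what that non-linear fit could look like in this notebook's style; the choice of a polynomial kernel of degree 3 is an assumption, not taken from the source:

# %%
svr = SVR(kernel="poly", degree=3)
svr.fit(data, target)
target_predicted = svr.predict(data)
mse = mean_squared_error(target, target_predicted)

ax = sns.scatterplot(data=full_data, x="input_feature", y="target")
ax.plot(data, target_predicted, color="tab:orange")
_ = ax.set_title(f"Mean squared error = {mse:.2f}")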
Example #40
def train(DO_x, DO_y):
    DO_net = SVR(kernel='rbf')
    DO_net.fit(DO_x, DO_y)
    return DO_net
Example #41
sc_y_DR.fit(y_DR_train)
sc_y_VT.fit(y_VT_train)
sc_y_VV.fit(y_VV_train)
# transform training dataset
y_DR_train = sc_y_DR.transform(y_DR_train)
y_VT_train = sc_y_VT.transform(y_VT_train)
y_VV_train = sc_y_VV.transform(y_VV_train)
# transform test dataset
y_DR_test = sc_y_DR.transform(y_DR_test)
y_VT_test = sc_y_VT.transform(y_VT_test)
y_VV_test = sc_y_VV.transform(y_VV_test)

regr = SVR(kernel='rbf', gamma='scale', C=100., epsilon=0.01, coef0=0.0)
regr = MultiOutputRegressor(estimator=regr)

regr.fit(x_DR_train, y_DR_train)
y_DR_regr = regr.predict(x_DR_test)
regr.fit(x_VT_train, y_VT_train)
y_VT_regr = regr.predict(x_VT_test)
regr.fit(x_VV_train, y_VV_train)
y_VV_regr = regr.predict(x_VV_test)

# open a file to append
#outF = open("output_MO.txt", "a")
#print("Complexity and bandwidth selected and model fitted in %.6f s" % regr_fit, file=outF)
#print("Prediction for %d inputs in %.6f s" % (x_test.shape[0], regr_predict),file=outF)
#print('Mean Absolute Error (MAE):', metrics.mean_absolute_error(y_test, y_regr), file=outF)
#print('Mean Squared Error (MSE):', metrics.mean_squared_error(y_test, y_regr), file=outF)
#print('Root Mean Squared Error (RMSE):', np.sqrt(metrics.mean_squared_error(y_test, y_regr)), file=outF)
#outF.close()
Example #42
y_train = target[:480]

x_test = data[480:]
y_true = target[480:]

line = LinearRegression()
lasso = Lasso()
ridge = Ridge()
tree = DecisionTreeRegressor()
svr = SVR()

line.fit(x_train, y_train)
lasso.fit(x_train, y_train)
ridge.fit(x_train, y_train)
tree.fit(x_train, y_train)
svr.fit(x_train, y_train)

line_y_pre = line.predict(x_test)
lasso_y_pre = lasso.predict(x_test)
ridge_y_pre = ridge.predict(x_test)
tree_y_pre = tree.predict(x_test)
svr_y_pre = svr.predict(x_test)

# NOTE: these assignments shadow each estimator's .score() method with a float
line.score = r2_score(y_true, line_y_pre)
lasso.score = r2_score(y_true, lasso_y_pre)
ridge.score = r2_score(y_true, ridge_y_pre)
tree.score = r2_score(y_true, tree_y_pre)
svr.score = r2_score(y_true, svr_y_pre)
print(line.score)
print(lasso.score)
print(ridge.score)
Example #43
# Feature scaling, since SVR does not apply it internally

from sklearn.preprocessing import StandardScaler

sc_x = StandardScaler()
sc_y = StandardScaler()

x = sc_x.fit_transform(x)
y = sc_y.fit_transform(y.reshape(-1, 1)).ravel()  # StandardScaler expects a 2-D array



#fitting svr
from sklearn.svm import SVR
regressor = SVR(kernel='rbf')
#regressor = SVR(kernel='linear')
regressor.fit(x,y)
y_pred = regressor.predict(x)

print(y)
print(y_pred)

plt.scatter(x, y, c='green', label="data")
plt.plot(x,y_pred,label="predicted line")
plt.xlabel("X parameters")
plt.ylabel("Y parameters")
plt.legend()
plt.show()


Example #44
save_fig("svm_regression_plot")
plt.show()

# In[30]:

np.random.seed(42)
m = 100
X = 2 * np.random.rand(m, 1) - 1
y = (0.2 + 0.1 * X + 0.5 * X**2 + np.random.randn(m, 1) / 10).ravel()

# In[31]:

from sklearn.svm import SVR

svm_poly_reg = SVR(kernel="poly", degree=2, C=100, epsilon=0.1)
svm_poly_reg.fit(X, y)

# In[32]:

from sklearn.svm import SVR

svm_poly_reg1 = SVR(kernel="poly", degree=2, C=100, epsilon=0.1)
svm_poly_reg2 = SVR(kernel="poly", degree=2, C=0.01, epsilon=0.1)
svm_poly_reg1.fit(X, y)
svm_poly_reg2.fit(X, y)

# In[33]:

plt.figure(figsize=(9, 4))
plt.subplot(121)
plot_svm_regression(svm_poly_reg1, X, y, [-1, 1, 0, 1])
Example #45
 X_data = data_after_lag.iloc[:, 2:]
 y_data = transformation_fn(tran_type=tran_type,
                            data=data_after_lag["Total_Daily_Trnx"])
 X_train, X_test, y_train, y_test = split_train_test(
     X_data=X_data,
     y_data=y_data,
     split_type=split_type,
     test_size=test_size)
 for k in ker:
     for g in gam:
         for costi in cost:
             svmFit = SVR(kernel=str(k),
                          gamma=g,
                          C=costi,
                          verbose=False)
             svmFit.fit(X_train, y_train)
             MAPE = accuracy_metric(metric="MAPE",
                                    actual=y_test,
                                    pred=svmFit.predict(X_test),
                                    tran_type=tran_type)
             result_MAPE.append([
                 s_type, fc, l, svmFit.kernel, svmFit.gamma, svmFit.C,
                 svmFit.epsilon, svmFit.tol, svmFit.degree, MAPE
             ])
             print(
                 str(ind) + "/" + str(Total_run), [
                     s_type, fc, l, svmFit.kernel, svmFit.gamma,
                     svmFit.C, svmFit.epsilon, svmFit.tol,
                     svmFit.degree, MAPE
                 ])
             ind = ind + 1
Example #46
class TimeSeriesSVR(TimeSeriesSVMMixin, RegressorMixin,
                    TimeSeriesBaseEstimator):
    """Time-series specific Support Vector Regressor.

    Parameters
    ----------
    C : float, optional (default=1.0)
        Penalty parameter C of the error term.

    kernel : string, optional (default='gak')
         Specifies the kernel type to be used in the algorithm.
         It must be one of 'gak' or a kernel accepted by ``sklearn.svm.SVC``.
         If none is given, 'gak' will be used. If a callable is given it is
         used to pre-compute the kernel matrix from data matrices; that matrix
         should be an array of shape ``(n_samples, n_samples)``.

    degree : int, optional (default=3)
        Degree of the polynomial kernel function ('poly').
        Ignored by all other kernels.

    gamma : float, optional (default='auto')
        Kernel coefficient for 'gak', 'rbf', 'poly' and 'sigmoid'.
        If gamma is 'auto' then:

        - for 'gak' kernel, it is computed based on a sampling of the training
          set (cf :ref:`tslearn.metrics.gamma_soft_dtw <fun-tslearn.metrics.gamma_soft_dtw>`)
        - for other kernels (eg. 'rbf'), 1/n_features will be used.

    coef0 : float, optional (default=0.0)
        Independent term in kernel function.
        It is only significant in 'poly' and 'sigmoid'.

    tol : float, optional (default=1e-3)
        Tolerance for stopping criterion.

    epsilon : float, optional (default=0.1)
         Epsilon in the epsilon-SVR model. It specifies the epsilon-tube
         within which no penalty is associated in the training loss function
         with points predicted within a distance epsilon from the actual
         value.

    shrinking : boolean, optional (default=True)
        Whether to use the shrinking heuristic.

    cache_size :  float, optional (default=200.0)
        Specify the size of the kernel cache (in MB).

    n_jobs : int or None, optional (default=None)
        The number of jobs to run in parallel for GAK cross-similarity matrix
        computations.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See scikit-learns'
        `Glossary <https://scikit-learn.org/stable/glossary.html#term-n-jobs>`_
        for more details.

    verbose : int, default: 0
        Enable verbose output. Note that this setting takes advantage of a
        per-process runtime setting in libsvm that, if enabled, may not work
        properly in a multithreaded context.

    max_iter : int, optional (default=-1)
        Hard limit on iterations within solver, or -1 for no limit.

    Attributes
    ----------
    support_ : array-like, shape = [n_SV]
        Indices of support vectors.
        
    support_vectors_ : array of shape [n_SV, sz, d]
        Support vectors in tslearn dataset format

    dual_coef_ : array, shape = [1, n_SV]
        Coefficients of the support vector in the decision function.

    coef_ : array, shape = [1, n_features]
        Weights assigned to the features (coefficients in the primal
        problem). This is only available in the case of a linear kernel.
        `coef_` is readonly property derived from `dual_coef_` and
        `support_vectors_`.

    intercept_ : array, shape = [1]
        Constants in decision function.

    sample_weight : array-like, shape = [n_samples]
        Individual weights for each sample

    svm_estimator_ : sklearn.svm.SVR
        The underlying sklearn estimator

    Examples
    --------
    >>> from tslearn.generators import random_walk_blobs
    >>> X, y = random_walk_blobs(n_ts_per_blob=10, sz=64, d=2, n_blobs=2)
    >>> import numpy
    >>> y = y.astype(float) + numpy.random.randn(20) * .1
    >>> reg = TimeSeriesSVR(kernel="gak", gamma="auto")
    >>> reg.fit(X, y).predict(X).shape
    (20,)
    >>> sv = reg.support_vectors_
    >>> sv.shape  # doctest: +ELLIPSIS
    (..., 64, 2)
    >>> sv.shape[0] <= 20
    True


    References
    ----------
    Fast Global Alignment Kernels.
    Marco Cuturi.
    ICML 2011.
    """
    def __init__(self,
                 C=1.0,
                 kernel="gak",
                 degree=3,
                 gamma="auto",
                 coef0=0.0,
                 tol=0.001,
                 epsilon=0.1,
                 shrinking=True,
                 cache_size=200,
                 n_jobs=None,
                 verbose=0,
                 max_iter=-1):
        self.C = C
        self.kernel = kernel
        self.degree = degree
        self.gamma = gamma
        self.coef0 = coef0
        self.tol = tol
        self.epsilon = epsilon
        self.shrinking = shrinking
        self.cache_size = cache_size
        self.n_jobs = n_jobs
        self.verbose = verbose
        self.max_iter = max_iter

    @property
    def n_iter_(self):
        warnings.warn('n_iter_ is always set to 1 for TimeSeriesSVR, since '
                      'it is non-trivial to access the underlying libsvm')
        return 1

    @deprecated
    def support_vectors_time_series_(self, X=None):
        """Support vectors as time series.

        Parameters
        ----------
        X : array-like of shape=(n_ts, sz, d)
            Training time series dataset.
        """
        if X is not None:
            warnings.warn('The use of '
                          '`support_vectors_time_series_` is deprecated in '
                          'tslearn v0.4 and will be removed in v0.6. Use '
                          '`support_vectors_` property instead.')
        check_is_fitted(self, '_X_fit')
        return self._X_fit[self.svm_estimator_.support_]

    @property
    def support_vectors_(self):
        check_is_fitted(self, '_X_fit')
        return self._X_fit[self.svm_estimator_.support_]

    def fit(self, X, y, sample_weight=None):
        """Fit the SVM model according to the given training data.

        Parameters
        ----------
        X : array-like of shape=(n_ts, sz, d)
            Time series dataset.
            
        y : array-like of shape=(n_ts, )
            Time series labels.
            
        sample_weight : array-like of shape (n_samples,), default=None
            Per-sample weights. Rescale C per sample. Higher weights force the 
            classifier to put more emphasis on these points.
        """
        sklearn_X, y = self._preprocess_sklearn(X, y, fit_time=True)

        self.svm_estimator_ = SVR(C=self.C,
                                  kernel=self.estimator_kernel_,
                                  degree=self.degree,
                                  gamma=self.gamma_,
                                  coef0=self.coef0,
                                  shrinking=self.shrinking,
                                  tol=self.tol,
                                  epsilon=self.epsilon,
                                  cache_size=self.cache_size,
                                  verbose=self.verbose,
                                  max_iter=self.max_iter)
        self.svm_estimator_.fit(sklearn_X, y, sample_weight=sample_weight)
        return self

    def predict(self, X):
        """Predict class for a given set of time series.

        Parameters
        ----------
        X : array-like of shape=(n_ts, sz, d)
            Time series dataset.

        Returns
        -------
        array of shape=(n_ts, ) or (n_ts, dim_output), depending on the shape
        of the target vector provided at training time.
            Predicted targets
        """
        sklearn_X = self._preprocess_sklearn(X, fit_time=False)
        return self.svm_estimator_.predict(sklearn_X)

    def _more_tags(self):
        return {
            'non_deterministic': True,
            'allow_nan': True,
            'allow_variable_length': True
        }
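The `sample_weight` argument documented in `fit` above rescales C per sample. A minimal usage sketch, reusing the doctest's data; the linearly increasing weights are purely illustrative:

import numpy as np
from tslearn.generators import random_walk_blobs
from tslearn.svm import TimeSeriesSVR

X, y = random_walk_blobs(n_ts_per_blob=10, sz=64, d=2, n_blobs=2)
y = y.astype(float) + np.random.randn(20) * .1
weights = np.linspace(0.5, 2.0, num=len(y))  # hypothetical: emphasize later series
reg = TimeSeriesSVR(kernel="gak", gamma="auto")
reg.fit(X, y, sample_weight=weights)  # heavier-weighted series cost more to miss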
Ejemplo n.º 47
0
def filter_genes_dispersion(data,
                            flavor='seurat',
                            min_disp=None,
                            max_disp=None,
                            min_mean=None,
                            max_mean=None,
                            n_bins=20,
                            n_top_genes=None,
                            log=True,
                            copy=False):
    """Extract highly variable genes.
    The normalized dispersion is obtained by scaling with the mean and standard
    deviation of the dispersions for genes falling into a given bin for mean
    expression of genes. This means that for each bin of mean expression, highly
    variable genes are selected.

    Parameters
    ----------
    data : :class:`~anndata.AnnData`, `np.ndarray`, `sp.sparse`
        The (annotated) data matrix of shape `n_obs` × `n_vars`. Rows correspond
        to cells and columns to genes.
    flavor : {'seurat', 'cell_ranger', 'svr'}, optional (default: 'seurat')
        Choose the flavor for computing normalized dispersion. If choosing
        'seurat', this expects non-logarithmized data - the logarithm of mean
        and dispersion is taken internally when `log` is at its default value
        `True`. For 'cell_ranger', this is usually called for logarithmized data
        - in this case you should set `log` to `False`. In their default
        workflows, Seurat passes the cutoffs whereas Cell Ranger passes
        `n_top_genes`.
    min_mean=0.0125, max_mean=3, min_disp=0.5, max_disp=`None` : `float`, optional
        If `n_top_genes` is not `None`, these cutoffs for the means and the
        normalized dispersions are ignored.
    n_bins : `int` (default: 20)
        Number of bins for binning the mean gene expression. Normalization is
        done with respect to each bin. If just a single gene falls into a bin,
        the normalized dispersion is artificially set to 1. You'll be informed
        about this if you set `settings.verbosity = 4`.
    n_top_genes : `int` or `None` (default: `None`)
        Number of highly-variable genes to keep.
    log : `bool`, optional (default: `True`)
        Use the logarithm of the mean to variance ratio.
    copy : `bool`, optional (default: `False`)
        If an :class:`~anndata.AnnData` is passed, determines whether a copy
        is returned.

    Returns
    -------
    If an AnnData `adata` is passed, returns or updates `adata` depending on \
    `copy`. The data matrix is filtered to the highly variable genes and the
    corresponding annotations are added.
    """
    adata = data.copy() if copy else data
    set_initial_size(adata)
    if n_top_genes is not None and adata.n_vars < n_top_genes:
        logg.info(
            'Skip filtering by dispersion since the number of variables is less than `n_top_genes`'
        )
    else:
        if flavor == 'svr':  # string comparison; `is` only worked by interning accident
            mu = adata.X.mean(0).A1 if issparse(adata.X) else adata.X.mean(0)
            sigma = np.sqrt(adata.X.multiply(adata.X).mean(0).A1 -
                            mu**2) if issparse(adata.X) else adata.X.std(0)
            log_mu = np.log2(mu)
            log_cv = np.log2(sigma / mu)

            from sklearn.svm import SVR
            clf = SVR(gamma=150. / len(mu))
            clf.fit(log_mu[:, None], log_cv)
            score = log_cv - clf.predict(log_mu[:, None])
            nth_score = np.sort(score)[::-1][n_top_genes - 1]  # the n_top_genes-th largest score
            adata._inplace_subset_var(score >= nth_score)
        else:
            from scanpy.api.pp import filter_genes_dispersion
            filter_genes_dispersion(adata,
                                    flavor=flavor,
                                    min_disp=min_disp,
                                    max_disp=max_disp,
                                    min_mean=min_mean,
                                    max_mean=max_mean,
                                    n_bins=n_bins,
                                    n_top_genes=n_top_genes,
                                    log=log)
    return adata if copy else None
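For reference, a minimal sketch (not the scanpy implementation) of the binned normalization the docstring describes: genes are binned by mean expression and each gene's dispersion is z-scored against the other genes in its bin.

import numpy as np
import pandas as pd

def normalized_dispersion(mean, dispersion, n_bins=20):
    # Bin genes by mean expression, then z-score dispersions within each bin.
    df = pd.DataFrame({'mean': mean, 'disp': dispersion})
    df['bin'] = pd.cut(df['mean'], bins=n_bins)
    grouped = df.groupby('bin')['disp']
    mu, sigma = grouped.transform('mean'), grouped.transform('std')
    # A single-gene bin yields std = NaN; per the docstring, set its value to 1.
    return ((df['disp'] - mu) / sigma).fillna(1.0).values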
Ejemplo n.º 48
0
# feature scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()
X = sc_X.fit_transform(X)
y = sc_y.fit_transform(y.reshape(-1, 1))  # StandardScaler needs a 2-D array

# splitting data into train_test_split
"""
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 1/3, random_state = 0)"""

# fitting SVR in dataset
from sklearn.svm import SVR
regressor = SVR(kernel='rbf')
regressor.fit(X, y.ravel())

# predict the model
y_pred = sc_y.inverse_transform(
    regressor.predict(sc_X.transform(np.array([[6.5]]))).reshape(-1, 1))
# Note: scale the prediction back to the original representation with inverse_transform

# visualize the SVR
plt.scatter(X, y, c='red')
plt.plot(X, regressor.predict(X), c='green')
plt.title('Truth or Bluff (SVR)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()
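A sketch of the same workflow without manual target scaling, using scikit-learn's `TransformedTargetRegressor` to scale `y` on fit and unscale predictions automatically; `X` and `y` here are assumed to be the raw, unscaled arrays from before the scaling step above.

import numpy as np
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

model = TransformedTargetRegressor(
    regressor=make_pipeline(StandardScaler(), SVR(kernel='rbf')),
    transformer=StandardScaler())
model.fit(X, y)  # y is scaled internally before fitting
y_pred = model.predict(np.array([[6.5]]))  # predictions come back unscaled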
Ejemplo n.º 49
print(type(y_val))
print(y_val.shape)

# Change value
#x_val = countryGDI
#y_val = countryHDI_f

# Define Regression Function
svr_lin = SVR(kernel='linear', C=1e3)
svr_poly = SVR(kernel='poly', C=1e3, degree=3)
svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
rvm = RVR(kernel='rbf', gamma=1)  ### CHANGE TO RVR (RVR is not in scikit-learn; assumed imported from a third-party RVM package)

# Proceed regression using Support Vector Regression (SVR)
t1 = time.time()
y_rbf = svr_rbf.fit(x_val, y_val).predict(x_val)
t2 = time.time()
t_svr_rbf = t2 - t1
print('Support Vector Regression with RBF kernel takes {} s'.format(t_svr_rbf))

t1 = time.time()
y_lin = svr_lin.fit(x_val, y_val).predict(x_val)
t2 = time.time()
t_svr_lin = t2 - t1
print('Support Vector Regression with linear kernel takes {} s'.format(
    t_svr_lin))

t1 = time.time()
y_poly = svr_poly.fit(x_val, y_val).predict(x_val)
t2 = time.time()
t_svr_poly = t2 - t1
Ejemplo n.º 50
                     cv=cross_validation)  # grid-search settings
gs_cv.fit(autoscaled_x_train, autoscaled_y_train)  # run grid search + cross-validation
optimal_linear_svr_c = gs_cv.best_params_['C']  # optimal C
optimal_linear_svr_epsilon = gs_cv.best_params_['epsilon']  # optimal epsilon

# check the results
print('Optimized C : {0} (log2(C)={1})'.format(optimal_linear_svr_c,
                                               np.log2(optimal_linear_svr_c)))
print('Optimized epsilon : {0} (log2(epsilon)={1})'.format(
    optimal_linear_svr_epsilon, np.log2(optimal_linear_svr_epsilon)))

# build the model
model = SVR(kernel='linear',
            C=optimal_linear_svr_c,
            epsilon=optimal_linear_svr_epsilon)  # declare the SVR model
model.fit(autoscaled_x_train, autoscaled_y_train)  # train the model

# standardized regression coefficients
standard_regression_coefficients = pd.DataFrame(
    model.coef_.T,
    index=x_train.columns,
    columns=['standard_regression_coefficients'])  # convert to a pandas DataFrame
standard_regression_coefficients.to_csv(
    'standard_regression_coefficients_svr_linear.csv'
)  # save to CSV; note that an existing file with the same name is overwritten

# predictions on the training data
autoscaled_estimated_y_train = model.predict(autoscaled_x_train)  # estimate y
estimated_y_train = autoscaled_estimated_y_train * y_train.std(
) + y_train.mean()  # undo the autoscaling
estimated_y_train = pd.DataFrame(estimated_y_train,
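The top of this snippet is cut off at the `cv=cross_validation)` line. A hedged sketch of the grid search it implies; the candidate grids `svr_cs` and `svr_epsilons` are assumptions, not the original values:

import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

svr_cs = 2.0 ** np.arange(-5, 11)        # hypothetical candidates for C
svr_epsilons = 2.0 ** np.arange(-10, 1)  # hypothetical candidates for epsilon
gs_cv = GridSearchCV(SVR(kernel='linear'),
                     {'C': svr_cs, 'epsilon': svr_epsilons},
                     cv=cross_validation)  # grid-search settings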
Ejemplo n.º 51
0
    test_x = np.reshape(test_x, (3 * 480, 2))
    test_y = np.reshape(test_y, (3 * 480, 1))
    train_x = test_x
    train_y = test_y
    """
    train_x.shape :  (1261, 3)
    train_y.shape :  (1261, 1)  
    test_x.shape :  (620, 3)
    test_y.shape :  (620, 1)
    """

    #todo svr method
    svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.01)
    #svr_lin = SVR(kernel='linear', C=1e3)
    #svr_poly = SVR(kernel='poly', C=1e3, degree=2)
    y_rbf = svr_rbf.fit(train_x, train_y.ravel()).predict(test_x)  # ravel: sklearn wants 1-D targets
    #y_lin = svr_lin.fit(train_x, train_y).predict(test_x)
    #y_poly = svr_poly.fit(train_x, train_y).predict(test_x)

    # undo the standardization (inverse transform)
    #y_rbf=np.reshape(y_rbf,(len(y_rbf),-1))
    #y_rbf=scaler.inverse_transform(y_rbf)
    #test_y=scaler.inverse_transform(test_y)

    plt.plot(y_rbf, label='y_rbf')
    #plt.plot(y_lin,label='y_lin')
    #plt.plot(y_poly,label='y_poly')
    plt.plot(test_y, label='true')
    plt.legend(loc='upper right')
    plt.show()
Ejemplo n.º 52
0
mse = mean_squared_error(test_label, predict_r)
sgd_score = np.sqrt(mse)
sgd_score
#cross_val_Stochastic_gradient
sgd = SGDRegressor(penalty='l2', n_iter_no_change=100, alpha=0.05)
score = cross_val_score(sgd,
                        train,
                        train_label,
                        cv=10,
                        scoring='neg_mean_squared_error')
sgd_score_cross = np.sqrt(-score)
np.mean(sgd_score_cross), np.std(sgd_score_cross)

from sklearn.svm import SVR
svm = SVR(epsilon=15, kernel='linear')
svm.fit(train, train_label)
predict_r = svm.predict(test)
mse = mean_squared_error(test_label, predict_r)
svm_score = np.sqrt(mse)
svm_score
#cross_val_SVR
svm = SVR(epsilon=15, kernel='linear')
score = cross_val_score(svm,
                        train,
                        train_label,
                        cv=10,
                        scoring='neg_mean_squared_error')
svm_score_cross = np.sqrt(-score)
np.mean(svm_score_cross), np.std(svm_score_cross)

from sklearn.tree import DecisionTreeRegressor
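As an aside, scikit-learn 0.22+ exposes RMSE directly as a scorer, so the `np.sqrt(-score)` step above can be skipped; a sketch using the same variables:

score = cross_val_score(svm, train, train_label, cv=10,
                        scoring='neg_root_mean_squared_error')
svm_score_cross = -score  # already RMSE; just flip the sign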
Ejemplo n.º 53
0
#import dataset
dataset = pa.read_csv('Position_Salaries.csv')
X = dataset.iloc[:, 1:2].values
y = dataset.iloc[:, 2].values

#feature scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()
X = sc_X.fit_transform(X)
y = sc_y.fit_transform(y.reshape(-1, 1))

#reshape is important

#Regressor
from sklearn.svm import SVR
Regressor = SVR(kernel='rbf')
Regressor.fit(X, y.ravel())  # sklearn expects a 1-D target vector

y_pred = Regressor.predict(sc_X.transform([[6.5]]))
y_pred = sc_y.inverse_transform(y_pred.reshape(-1, 1))  # back to the original salary scale
#Feature scaling should be done

#plot
X_grid = np.arange(min(X), max(X), 0.1)
X_grid = X_grid.reshape(len(X_grid), 1)
plt.scatter(X, y, color='red')
plt.plot(X_grid, Regressor.predict(X_grid), color='blue')
plt.show()
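The `#reshape is important` note above exists because scikit-learn transformers expect 2-D feature arrays while estimators expect 1-D targets; a quick illustration:

import numpy as np
from sklearn.preprocessing import StandardScaler

v = np.array([1.0, 2.0, 3.0])   # shape (3,): 1-D, rejected by fit_transform
col = v.reshape(-1, 1)          # shape (3, 1): three samples, one feature
scaled = StandardScaler().fit_transform(col)  # works on the 2-D column
flat = scaled.ravel()           # back to 1-D, e.g. for an estimator's y argument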
Ejemplo n.º 54
0
"""from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)"""

# Feature Scaling
from sklearn.preprocessing import StandardScaler

sc_X = StandardScaler()
sc_y = StandardScaler()
X = sc_X.fit_transform(X)
y = sc_y.fit_transform(y.reshape(-1, 1))  # StandardScaler needs a 2-D array

# Fitting SVR to the dataset
from sklearn.svm import SVR

regressor = SVR(kernel='rbf')
regressor.fit(X, y.ravel())  # train the SVR on the scaled dataset

# Predicting a new result
#un-scale the salary
y_pred = sc_y.inverse_transform(
    regressor.predict(sc_X.transform(np.array([[6.5]]))).reshape(-1, 1))

# Visualising the SVR results
plt.scatter(X, y, color='red')
plt.plot(X, regressor.predict(X), color='blue')
plt.title('Truth or Bluff (SVR)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()

# Visualising the Regression results (for higher resolution and smoother curve)
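The code for this step is missing here; a sketch following the same `X_grid` pattern used in Ejemplo n.º 53 above, with a finer step for a smoother curve:

X_grid = np.arange(min(X), max(X), 0.01)  # finer step than the 0.1 used earlier
X_grid = X_grid.reshape(len(X_grid), 1)
plt.scatter(X, y, color='red')
plt.plot(X_grid, regressor.predict(X_grid), color='blue')
plt.title('Truth or Bluff (SVR, high resolution)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()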
Ejemplo n.º 55
forest_rmse
# forest_rmse = 21933.31414779769

#CrossValueScore_RandomForest
from sklearn.model_selection import cross_val_score
forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
                                scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)
scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
pd.Series(np.sqrt(-scores)).describe()

# SupportVectorRegression_SVR
from sklearn.svm import SVR
svm_reg = SVR(kernel="linear")
svm_reg.fit(housing_prepared, housing_labels)
housing_predictions = svm_reg.predict(housing_prepared)
svm_mse = mean_squared_error(housing_labels, housing_predictions)
svm_rmse = np.sqrt(svm_mse)
svm_rmse
#rmse 111094.6308539982 - much higher than the random forest's ~21933, so not a great fit

# __________________________________________________________________________________________
# Fine-tune the model - GridSearch RandomForest

# GridSearch: Searching for the hyperparameters (instead of manually) using RandomForest
from sklearn.model_selection import GridSearchCV
# These hyperparameters are stated within dictionaries
param_grid = [
    # try 12 (3×4) combinations of hyperparameters
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
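The parameter grid is truncated at this point. A hedged sketch of how such a search is typically completed; the second dictionary and the `GridSearchCV` settings are assumptions, not the original code:

param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    # hypothetical second grid: bootstrap disabled, 2x3 = 6 combinations
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error')
grid_search.fit(housing_prepared, housing_labels)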
Ejemplo n.º 56
0
 def createSupportVectorMachineModel(mode, ticker, startDate, endDate,
                                     days):
     df = mode.createInitialDataFrame(ticker, startDate, endDate)
     df.fillna(value=-99999, inplace=True)
     df, xTrain, yTrain = mode.trainingData(df, "Close", 1)
     svrClose = SVR(kernel="rbf", C=1e3, gamma=0.1)
     svrClose.fit(xTrain, yTrain)
     df, xTrain, yTrain = mode.trainingData(df, "MACD", 1)
     svrMACD = SVR(kernel="rbf", C=1e3, gamma=0.1)
     svrMACD.fit(xTrain, yTrain)
     df, xTrain, yTrain = mode.trainingData(df, "WR", 1)
     svrWR = SVR(kernel="rbf", C=1e3, gamma=0.1)
     svrWR.fit(xTrain, yTrain)
     df, xTrain, yTrain = mode.trainingData(df, "UO", 1)
     svrUO = SVR(kernel="rbf", C=1e3, gamma=0.1)
     svrUO.fit(xTrain, yTrain)
     df, xTrain, yTrain = mode.trainingData(df, "SMA", 1)
     svrSMA = SVR(kernel="rbf", C=1e3, gamma=0.1)
     svrSMA.fit(xTrain, yTrain)
     df, xTrain, yTrain = mode.trainingData(df, "ROCP", 1)
     svrROCP = SVR(kernel="rbf", C=1e3, gamma=0.1)
     svrROCP.fit(xTrain, yTrain)
     df, xTrain, yTrain = mode.trainingData(df, "ROCV", 1)
     svrROCV = SVR(kernel="rbf", C=1e3, gamma=0.1)
     svrROCV.fit(xTrain, yTrain)
     df = mode.createInitialDataFrame(ticker, startDate, endDate)
     lastDate = df.iloc[-1].name
     predictionData = np.array(df.iloc[-1])
     predictionData = predictionData.reshape(1, len(predictionData))
     for i in range(days):
         lastClosePrediction = svrClose.predict(predictionData)
         lastMACDPrediction = svrMACD.predict(predictionData)
         lastWRPrediction = svrWR.predict(predictionData)
         lastUOPrediction = svrUO.predict(predictionData)
         lastSMAPrediction = svrSMA.predict(predictionData)
         lastROCPPrediction = svrROCP.predict(predictionData)
         lastROCVPrediction = svrROCV.predict(predictionData)
         newDate = lastDate + datetime.timedelta(days=1)
         newRow=pd.Series(data={"Close": float(lastClosePrediction),\
         "MACD": float(lastMACDPrediction), "WR": float(lastMACDPrediction),\
         "UO": float(lastUOPrediction), "SMA": float(lastSMAPrediction),\
         "ROCP": float(lastROCPPrediction),\
         "ROCV": float(lastROCVPrediction)}, name=newDate)
         df = pd.concat([df, newRow.to_frame().T])  # DataFrame.append was removed in pandas 2.0
         df, predictionData, svrClose, svrMACD, svrWR, svrUO, svrSMA,\
         svrROCP, svrROCV=mode.updateSVMModel(df)
         lastDate = newDate
     plt.plot(df["Close"][:-days])
     plt.plot(df["Close"][-days - 1:])
     axi = plt.axes()
     axi.xaxis.set_major_locator(plt.MaxNLocator(4))
     for tick in axi.xaxis.get_major_ticks():
         tick.label.set_fontsize(8)
     plt.xticks(rotation=30)
     plt.show()
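The seven near-identical training blocks above differ only in the target column; an illustrative refactor (not the original code) expressing them as a loop:

indicators = ["Close", "MACD", "WR", "UO", "SMA", "ROCP", "ROCV"]
models = {}
for name in indicators:
    df, xTrain, yTrain = mode.trainingData(df, name, 1)
    svr = SVR(kernel="rbf", C=1e3, gamma=0.1)
    svr.fit(xTrain, yTrain)
    models[name] = svr  # e.g. models["Close"] plays the role of svrClose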
Ejemplo n.º 57
0
class SVM(object):
    def __split(self):
        """
        Splits data into training and test data for the SVM
        :return: Xtrain, ytrain, Xtest, ytest
        """
        Xtrain = self.scaledData[:self.startDay]
        ytrain = pd.DataFrame(self.scaledData[1:self.startDay + 1]).iloc[:, 4]
        Xtest = self.scaledData[self.startDay:-1]
        ytest = pd.DataFrame(self.scaledData[self.startDay + 1:]).iloc[:, 4]
        return Xtrain, ytrain.values, Xtest, ytest.values

    def __init__(self, C, epsilon, ticker, manager, startDay, kernel='rbf'):
        self.scaler = MinMaxScaler(feature_range=(0, 1))
        self.model = SVR(C=C, epsilon=epsilon, kernel=kernel)
        self.ticker = ticker
        self.manager = manager
        self.startDay = startDay
        self.data = get_multifeature_data(manager, ticker)
        self.stationaryData = diff_multifeature(self.data)
        self.scaledData = self.scale(self.stationaryData)
        self.Xtrain, self.ytrain, self.Xtest, self.ytest = self.__split()
        self.raw_prices = list(self.data['vwap'])

    def scale(self, df):
        """
        Normalizes the data between 0 and 1
        :param df: dataframe
        :return: scaled dataframe
        """
        values = df.values
        scaled = self.scaler.fit_transform(values)
        return scaled

    def unscale(self, series):
        """
        Unnormalizes the data from the output
        :param series: series of scaled points
        :return: unscaled series
        """
        padded = pd.DataFrame()
        reshaped = series.reshape(1, len(series))[0]
        for i in range(4):
            padded[i] = [0 for j in range(len(series))]
        padded['unscaled'] = reshaped
        padded[5] = [0 for j in range(len(series))]
        unscaled = pd.DataFrame(self.scaler.inverse_transform(padded.values))
        unscaled = unscaled.iloc[:, 4]
        return list(unscaled)

    def test_and_error(self):
        """
        Used in development for deciding the architecture
        :return: None
        """
        self.fit()
        raw_predictions = self.model.predict(self.Xtest)
        unscaled_predictions = self.unscale(raw_predictions)
        predictions = undifference(self.data.iloc[self.startDay, 4],
                                   unscaled_predictions)
        print(mean_squared_error(self.ytest, raw_predictions))

        days = create_timeseries(self.manager, self.ticker)[1]
        days = [days[x] for x in range(0, len(days), 2)]
        actual = list(self.data['vwap'])

        plt.plot(days, actual, color='black', label='Actual')
        plt.plot(days[self.startDay + 3:],
                 predictions[1:],
                 color='red',
                 label='SVM predictions')
        plt.xlabel('day')
        plt.title(self.ticker)
        plt.ylabel('price')
        plt.legend(loc=2)
        plt.savefig('plots/SVM/SVM_{0}_predictions.pdf'.format(self.ticker))
        plt.show()

    def fit(self):
        """
        Trains the model
        :return: None
        """
        self.model.fit(self.Xtrain, self.ytrain)

    def predict(self, D):
        """
        Predicts the next price
        :param D: day index
        :return: prediction
        """
        d = D - len(self.Xtrain) - 1

        if d == -1:
            x = self.Xtrain[len(self.Xtrain) - 1].reshape(1, 6)
        else:
            x = self.Xtest[d].reshape(1, 6)
        previousPrice = self.raw_prices[D - 1]
        diff_pred = self.unscale(self.model.predict(x))
        prediction = previousPrice + diff_pred[0]
        return prediction
Ejemplo n.º 58
0
 def updateSVMModel(mode, df):
     df, xTrain, yTrain = mode.trainingData(df, "Close", 1)
     svrClose = SVR(kernel="rbf", C=1e3, gamma=0.1)
     svrClose.fit(xTrain, yTrain)
     df, xTrain, yTrain = mode.trainingData(df, "MACD", 1)
     svrMACD = SVR(kernel="rbf", C=1e3, gamma=0.1)
     svrMACD.fit(xTrain, yTrain)
     df, xTrain, yTrain = mode.trainingData(df, "WR", 1)
     svrWR = SVR(kernel="rbf", C=1e3, gamma=0.1)
     svrWR.fit(xTrain, yTrain)
     df, xTrain, yTrain = mode.trainingData(df, "UO", 1)
     svrUO = SVR(kernel="rbf", C=1e3, gamma=0.1)
     svrUO.fit(xTrain, yTrain)
     df, xTrain, yTrain = mode.trainingData(df, "SMA", 1)
     svrSMA = SVR(kernel="rbf", C=1e3, gamma=0.1)
     svrSMA.fit(xTrain, yTrain)
     df, xTrain, yTrain = mode.trainingData(df, "ROCP", 1)
     svrROCP = SVR(kernel="rbf", C=1e3, gamma=0.1)
     svrROCP.fit(xTrain, yTrain)
     df, xTrain, yTrain = mode.trainingData(df, "ROCV", 1)
     svrROCV = SVR(kernel="rbf", C=1e3, gamma=0.1)
     svrROCV.fit(xTrain, yTrain)
     if "result" in df.columns:
         df = df.drop(["result"], 1)
     predictionData = np.array(df.iloc[-1])
     predictionData = predictionData.reshape(1, len(predictionData))
     return df, predictionData, svrClose, svrMACD, svrWR, svrUO, svrSMA,\
     svrROCP, svrROCV
Ejemplo n.º 59
0
def run_particular(arg):
    i, cat = arg

    ibp = IBP(enable_cluster=False)
    ibp.fit(training_votes[2], training_votes[1])
    y_per = training_votes[1] / training_votes[2].astype(float)
    y_ibp = ibp(training_votes[2], training_votes[1])

    X = np.array(
        list(
            map(
                lambda x: x[1],
                filter(lambda x: x[0] == cat,
                       zip(training_cats[:, i], training_x)))))
    X_tsb = np.array(
        list(
            map(lambda x: x[1],
                filter(lambda x: x[0] == cat, zip(tsb_cats[:, i], tsb_x)))))
    y_tsb = np.array(
        list(
            map(lambda x: x[1],
                filter(lambda x: x[0] == cat, zip(tsb_cats[:, i],
                                                  tsb_truth)))))

    clf_per = SVR(C=1, gamma=0.001)
    clf_ibp = SVR(C=1000, gamma=0.0001)
    clf_per.fit(
        X,
        np.array(
            list(
                map(
                    lambda x: x[1],
                    filter(lambda x: x[0] == cat,
                           zip(training_cats[:, i], y_per))))))
    clf_ibp.fit(
        X,
        np.array(
            list(
                map(
                    lambda x: x[1],
                    filter(lambda x: x[0] == cat,
                           zip(training_cats[:, i], y_ibp))))))

    tsb_y_hat_per = clf_per.predict(X_tsb)
    tsb_y_hat_ibp = clf_ibp.predict(X_tsb)

    mse_tsb_per = ((tsb_y_hat_per - y_tsb)**2).mean()
    mae_tsb_per = abs(tsb_y_hat_per - y_tsb).mean()
    rmse_tsb_per = mse_tsb_per**0.5
    mse_tsb_ibp = ((tsb_y_hat_ibp - y_tsb)**2).mean()
    mae_tsb_ibp = abs(tsb_y_hat_ibp - y_tsb).mean()
    rmse_tsb_ibp = mse_tsb_ibp**0.5

    print(2**(i + 1), cat, 'tsb',
          (training_cats[:, i] == cat).astype(int).sum(),
          (tsb_cats[:, i] == cat).astype(int).sum(), mse_tsb_per, mse_tsb_ibp,
          (mse_tsb_per - mse_tsb_ibp) / mse_tsb_per, mae_tsb_per, mae_tsb_ibp,
          (mae_tsb_per - mae_tsb_ibp) / mae_tsb_per, rmse_tsb_per,
          rmse_tsb_ibp, (rmse_tsb_per - rmse_tsb_ibp) / rmse_tsb_per)

    return [[
        2**(i + 1), cat, 'tsb', (training_cats[:, i] == cat).astype(int).sum(),
        (tsb_cats[:, i] == cat).astype(int).sum(), mse_tsb_per, mse_tsb_ibp,
        (mse_tsb_per - mse_tsb_ibp) / mse_tsb_per, mae_tsb_per, mae_tsb_ibp,
        (mae_tsb_per - mae_tsb_ibp) / mae_tsb_per, rmse_tsb_per, rmse_tsb_ibp,
        (rmse_tsb_per - rmse_tsb_ibp) / rmse_tsb_per,
        ttest_rel(tsb_y_hat_per, tsb_y_hat_ibp).pvalue
    ]]
Ejemplo n.º 60
-19
def analyze(data, label, num_folds):
    # Partition data into folds
    n = len(data) // num_folds
    data_folds = [data[i:i+n] for i in range(0, len(data), n)]
    label_folds = [label[i:i+n] for i in range(0, len(label), n)]

    lin_reg_error = 0
    
    cs = [4**c for c in range(-10, 0, 1)]
    svm_error = [0] * len(cs)
    svm_std = [0] * len(cs)
    # for i in range(0, num_folds):
    #     test_data = data_folds[i]
    #     test_label = label_folds[i]
    #     train_data = []
    #     train_label = []
    #     for j in range(num_folds):
    #         if i != j:
    #             train_data += data_folds[j]
    #             train_label += label_folds[j]

    # model = linear_model.LinearRegression()
    # model.fit(data, label)
    # return model
        # lin_reg_error += np.mean(abs(model.predict(test_data) - test_label))
        #
        # for i2 in range(len(cs)):
        #     svm_classifier = SVR(gamma=cs[i2])
        #     svm_classifier.fit(train_data, train_label)
        #     svm_error[i2] += np.mean(abs(svm_classifier.predict(test_data) - test_label))
        #     svm_std[i2] += np.std(abs(svm_classifier.predict(test_data) - test_label))

    svm_c = SVR(gamma=4**-7)
    svm_c.fit(data, label)
    return svm_c
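The commented-out loop above hand-rolls a k-fold search over the gamma grid `cs`; a sketch of the same search using scikit-learn's built-in utilities (an interpretation of the intent, not the original code):

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

def analyze_cv(data, label, num_folds):
    # Hypothetical helper: search the same gamma grid with built-in k-fold CV.
    grid = GridSearchCV(SVR(), {'gamma': [4**c for c in range(-10, 0)]},
                        cv=num_folds, scoring='neg_mean_absolute_error')
    grid.fit(data, label)
    return grid.best_estimator_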