def evaluate_learner(X_train, X_test, y_train, y_test):
    '''
    Run multiple times with different algorithms to get an idea of the
    relative performance of each configuration.

    Returns a sequence of tuples containing:
        (title, expected values, actual values)
    for each learner.
    '''
    # Use a support vector machine for regression
    from sklearn.svm import SVR

    # Train using a radial basis function
    svr = SVR(kernel='rbf', gamma=0.1)
    svr.fit(X_train, y_train)
    y_pred = svr.predict(X_test)
    r_2 = svr.score(X_test, y_test)
    yield 'RBF Model ($R^2={:.3f}$)'.format(r_2), y_test, y_pred

    # Train using a linear kernel
    svr = SVR(kernel='linear')
    svr.fit(X_train, y_train)
    y_pred = svr.predict(X_test)
    r_2 = svr.score(X_test, y_test)
    yield 'Linear Model ($R^2={:.3f}$)'.format(r_2), y_test, y_pred

    # Train using a polynomial kernel
    svr = SVR(kernel='poly', degree=2)
    svr.fit(X_train, y_train)
    y_pred = svr.predict(X_test)
    r_2 = svr.score(X_test, y_test)
    yield 'Polynomial Model ($R^2={:.3f}$)'.format(r_2), y_test, y_pred
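# A minimal sketch (not part of the original) of how the generator above
# might be consumed; the subplot layout and matplotlib usage are assumptions,
# not the author's code.
import matplotlib.pyplot as plt

def plot_learners(X_train, X_test, y_train, y_test):
    results = list(evaluate_learner(X_train, X_test, y_train, y_test))
    fig, axes = plt.subplots(1, len(results), figsize=(5 * len(results), 4))
    for ax, (title, y_true, y_pred) in zip(axes, results):
        ax.scatter(y_true, y_pred, s=10)  # expected vs. predicted values
        ax.set_title(title)
        ax.set_xlabel('expected')
        ax.set_ylabel('predicted')
    plt.show()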
def train_SVM(X, Y, kernel='rbf', shrinking=True, tol=0.001, cache_size=1500,
              verbose=True, max_iter=-1):
    """Learns several hundred SVMs, one grid-searched SVR per output column.

    Assumes all irrelevant features have been removed from X and Y.
    """
    clf = SVR(kernel=kernel, tol=tol, cache_size=cache_size,
              verbose=verbose, max_iter=max_iter)
    pipeline = Pipeline(list(zip(["impute", "vart", "scale", "svm"],
                                 [Imputer(), VarianceThreshold(),
                                  StandardScaler(), clf])))
    param_grid = dict(svm__C=[0.1, 1, 10, 100, 1000],
                      svm__gamma=[0.001, 0.01, 1, 10])
    grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=3)

    results = []
    for i in range(Y[0].shape[1]):
        Y_new = np.fromiter((x[:, i][0, 0] for x in Y), np.double)
        X_new = np.array([np.matrix(x.data).flatten().tolist() for x in X],
                         np.double)
        #X_new = np.fromiter((np.matrix(x.data) for x in X), np.double)
        X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(
            X_new, Y_new, test_size=0.2)
        X_train = flatten(X_train)
        X_test = flatten(X_test)
        grid_search.fit(X_train, Y_train)
        # Score the fitted grid search, not the unfitted `clf` template.
        test_score = grid_search.score(X_test, Y_test)
        results.append((grid_search.best_estimator_, test_score))
        print("Best estimator: {0}, Score: {1}".format(
            grid_search.best_estimator_, test_score))
    return results
def test():
    n_samples, n_features = 10, 5
    np.random.seed(0)
    y = np.random.randn(n_samples)
    X = np.random.randn(n_samples, n_features)
    print y
    print X
    clf = SVR(C=1.0, epsilon=0.2)
    clf.fit(X, y)
    print clf.score(X, y)
def svmRegressorStudy(X, Y, setSize, comment):
    # Runs the SVM regressor on the given data.
    X_train = X[:setSize]
    X_test = X[setSize:]
    Y_train = Y[:setSize]
    Y_test = Y[setSize:]
    svm = SVR()
    svm.fit(X_train, Y_train)
    print 'svm regressor ' + comment
    s1 = svm.score(X_train, Y_train)
    s2 = svm.score(X_test, Y_test)
    print 'svm score (train, test): ', s1, s2
class SVRegression:
    def __init__(self, kernel_value, c_value, iter_value):
        self.kernel = kernel_value
        self.c = c_value
        self.iter = iter_value
        self.svr_lin = None

    def fit_predict(self, x_train, y_train, x_test):
        self.svr_lin = SVR(kernel=self.kernel, C=self.c, max_iter=self.iter)
        y_lin = self.svr_lin.fit(x_train, y_train).predict(x_test)
        return y_lin

    def computeC(self, x_train):
        print "ARRAY ", type(x_train)
        print x_train
        array = x_train.todense()
        print "ARRAY ", type(array)
        print array
        # C heuristic: inverse of the mean squared row sum of the features.
        result = array.sum(axis=1, dtype='float')
        result = pow(result, 2)
        total = result.sum(axis=0, dtype='float')
        rows, columns = x_train.shape
        total = float(total) / float(rows)
        total = pow(total, -1)
        print "C", total
        self.c = total

    def computeAccuracy(self, x, y):
        return self.svr_lin.score(x, y)
def trainmodel(train_data, dev_data, train_label=train_labels,
               dev_label=test_labels, params={"kernel": "rbf"}):
    rf = SVR().set_params(**params)
    rf.fit(train_data, train_label)
    rf_pred = rf.predict(dev_data)
    #r2 = mean_squared_error(dev_label, rf_pred)
    r2 = rf.score(dev_data, dev_label)
    return r2, rf
def correlation_search_map(line, variable, lag_variable, kernel_variable):
    predictor_data = line.split(';')
    predictor_key = predictor_data[0]
    predictor_name = predictor_data[1]
    predictor = cjson.decode(predictor_data[2])
    predictor = transform_serie(predictor)
    key = str(predictor_key)
    original_predictor = predictor.values()
    original_X = np.array(original_predictor, ndmin=2)
    original_X = original_X.reshape((-1, 1))
    #original_X = scale(original_X)
    results = {}
    for i in range(lag_variable.value + 1):
        lagged_predictor = lag_serie(predictor, i)
        (list_key, list_variable, list_predictor) = serie_join(
            variable.value, lagged_predictor)
        if len(list_predictor) < 6:
            results[str(i)] = {'r2': 0}
            continue
        y = np.array(serie_std(list_variable), ndmin=1)
        X = np.array(serie_std(list_predictor), ndmin=2)
        X = X.reshape((-1, 1))
        clf = SVR(kernel=kernel_variable.value)
        clf.fit(X, y)
        r_squared = clf.score(X, y)
        if r_squared < 0.5:
            results[str(i)] = {'r2': 0}
            continue
        result = {}
        result["r2"] = r_squared
        results[str(i)] = result
    return {'id': key, 'results': results, 'name': predictor_name}
class SvrBlockade(BlockadeModel):
    def __init__(self):
        super(SvrBlockade, self).__init__()
        self.name = "SVR"
        self.svr_cache = {}

    def _svr_predict(self, feature_vec):
        """Predicts signal for a feature vector"""
        if feature_vec not in self.svr_cache:
            np_feature = np.array(feature_vec).reshape(1, -1)
            self.svr_cache[feature_vec] = self.predictor.predict(np_feature)[0]
        return self.svr_cache[feature_vec]

    def train(self, peptides, signals, C=1000, gamma=0.001, epsilon=0.01):
        """Trains SVR model"""
        self.predictor = SVR(kernel="rbf", C=C, gamma=gamma, epsilon=epsilon)
        features = map(lambda p: self._peptide_to_features(p), peptides)
        train_features = np.array(sum(features, []))
        train_signals = np.array(sum(signals, []))
        assert len(train_features) == len(train_signals)
        self.predictor.fit(train_features, train_signals)
        print(self.predictor.score(train_features, train_signals))

    def peptide_signal(self, peptide):
        """Generates theoretical signal for a given peptide"""
        assert self.predictor is not None
        features = self._peptide_to_features(peptide)
        signal = np.array(map(lambda x: self._svr_predict(x), features))
        # Normalize the signal's amplitude.
        signal = signal / np.std(signal)
        return signal

    def _peptide_to_features(self, peptide):
        """Converts peptide into a list of feature vectors"""
        aa_weights = _aa_to_weights(peptide)
        num_peaks = len(aa_weights) + self.window - 1
        flanked_peptide = ("-" * (self.window - 1) + aa_weights +
                           "-" * (self.window - 1))
        features = []
        for i in xrange(0, num_peaks):
            kmer = flanked_peptide[i : i + self.window]
            feature = _kmer_to_features(kmer)
            features.append(feature)
        return features
def obj(self, cfg):
    # Create the regressor with the given (log10-scaled) params.
    clsf = SVR(C=10.0 ** cfg['C'],
               epsilon=10.0 ** cfg['epsilon'],
               gamma=10.0 ** cfg['gamma'],
               kernel=cfg['kernel'])
    # Fit the regressor.
    clsf.fit(self.X, self.Y)
    # Return the validation score.
    score = clsf.score(self.Xv, self.Yv)
    return score
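# Hedged usage sketch (not from the original): obj() above expects
# log10-scaled C/epsilon/gamma in cfg, so a caller could evaluate a small
# grid like this. `model` is any object exposing obj(); the ranges are
# illustrative assumptions.
from itertools import product

def grid_eval(model, kernels=('rbf', 'linear')):
    best = (-float('inf'), None)
    for C, eps, gamma, k in product([-1, 0, 1], [-3, -2], [-2, -1], kernels):
        cfg = {'C': C, 'epsilon': eps, 'gamma': gamma, 'kernel': k}
        score = model.obj(cfg)  # validation R^2 for this configuration
        if score > best[0]:
            best = (score, cfg)
    return best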
def main():
    log.info('main start')

    dfs = []
    for year in range(YEAR_START + 1, YEAR_END + 1):
        dfs.append(loadHistory(year))
    # print dfs
    df = pd.concat(dfs, ignore_index=True)
    # print df.head(1)
    # print df['score']
    log.info('{} total rows'.format(len(df)))

    # clean player data
    df_cleaned = cleanPlayerData(df)
    df_cleaned.to_csv('{}/data.csv'.format(FOLDER))

    # get labels and features
    log.info('getting labels and features...')
    labels = df_cleaned['score']
    print 'labels\n', labels
    df_cleaned = df_cleaned.drop('score', axis=1).astype(float)
    features = scale(df_cleaned)
    print 'features\n', features[0]
    X_train, X_test, y_train, y_test = train_test_split(features, labels)

    # CV
    # clf = GradientBoostingRegressor()
    clf = SVR(kernel='linear')
    # clf = ExtraTreesRegressor(n_estimators=1000)
    cv = cross_val_score(clf, X_train, y_train, cv=5, scoring='r2')
    cv = [abs(n) for n in cv]
    log.info('CV mean {} std {}'.format(np.mean(cv), np.std(cv)))
    # print 'cv', cv

    # train
    clf.fit(X_train, y_train)
    print 'score', clf.score(X_test, y_test)

    # predict
    prediction = predictCurrent(clf, YEAR_END + 1, df_cleaned.columns)

    # calculate winnings
    # calculateWinnings(prediction)

    log.info('main end')
def CalculateSVR(data=None):
    """
    Score review ratings using a Support Vector Regression model over
    TF-IDF features.
    :param data: Review text with the rating from the data set
    :return: prints the accuracy score
    """
    vectorizer = TfidfVectorizer(tokenizer=pre_process)
    classifier = SVR(kernel='linear')
    train, test = train_test_split([(i['text'], i['stars']) for i in data],
                                   test_size=.2, random_state=10)
    x_train = vectorizer.fit_transform(i[0] for i in train)
    x_test = vectorizer.transform(i[0] for i in test)
    classifier.fit(x_train, [i[1] for i in train])
    score = classifier.score(x_test, [i[1] for i in test])
    print score
def train_SVM(trainingdataX, trainingdataY, testdataX, testdataY):
    # C and epsilon are expected to be defined at module level.
    clf = SVR(C=C, epsilon=epsilon, kernel='rbf')
    clf.fit(trainingdataX, trainingdataY)
    samples = []
    labels = []
    for sample in range(len(testdataX)):
        samples.append(testdataX[sample])
        labels.append(float(testdataY[sample][0]))
    pred = clf.predict(samples)
    errs = pred - labels
    r2 = clf.score(samples, labels)
    print("SVM: R^2: {} C: {} eps: {}".format(r2, C, epsilon))
    return clf, r2
def make_encoding_model(X, y1, y2, movie_idx):
    ####
    # Encoding model pipeline
    ####
    # 1. Fit linear regression at each voxel of training data
    # 2. Get model fit at each voxel of testing data
    # 3. Find max loading PE for each voxel (assuming these are sorted)
    #    and project these to a new brain volume
    #
    # Experimenter knobs:
    # 1. Crossval is being done on run_1/run_2. Could change this to 90%/10%
    #    having averaged together 1st/2nd half.
    # 2. Should be trained on 90% of the movie and tested on the other 10%

    # Fit the voxel timecourse Y with X
    #clf = linear_model.RidgeCV(alphas=[0.001, 0.1, 1, 10], fit_intercept=True)
    #clf = linear_model.Ridge(alpha=0.1, fit_intercept=True)
    clf = SVR(kernel='linear', C=1e3, gamma=0.1)
    #clf = linear_model.Ridge(alpha=1, fit_intercept=True).fit(X, rh_1_array)
    #clf.fit(X, y1)  # Fit to first half
    #coeffs = clf.coef_
    #y2_hat = clf.predict(X)
    #MSE = metrics.mean_squared_error(y2, y2_hat)  # compare [predicted y2] to y2
    #r2 = clf.score(object_model, y2)  # compare [predicted y2] to y2
    #return coeffs, r2, MSE, y2_hat

    ntimes = movie_idx.shape[0]
    coeffs = np.zeros((ntimes, X.shape[1]))
    r2_array = np.zeros(ntimes)
    r_array = np.zeros(ntimes)
    mse_array = np.zeros(ntimes)
    y2_hat_array = []
    for idx in range(0, ntimes - 1):
        # Separate out this run from the rest.
        start_idx = movie_idx[idx]
        end_idx = movie_idx[idx + 1] - 1
        tx = X[start_idx:end_idx, :]
        ty1 = y1[start_idx:end_idx]
        ty2 = y2[start_idx:end_idx]
        clf = SVR(kernel='linear', C=10, gamma=0.1, verbose=False,
                  max_iter=1000)
        clf.fit(tx, ty1)  # Fit to first half
        coeffs[idx, :] = clf.coef_
        y2_hat = clf.predict(tx)
        y2_hat_array.append(y2_hat)
        r2_array[idx] = clf.score(tx, ty2)
        r_array[idx] = np.corrcoef(ty2, y2_hat)[0, 1]
        mse_array[idx] = metrics.mean_squared_error(ty2, y2_hat)
    #y2_hat_array = np.vstack(y2_hat_array)
    return coeffs, r2_array, mse_array, y2_hat_array, r_array
def main(): """Load images, train classifier, score classifier.""" parser = argparse.ArgumentParser(description="Train an SVM model to locate cat faces in images.") parser.add_argument("--dataset", required=True, help="Path to your 10k cats dataset directory") args = parser.parse_args() # initialize dataset subdir_names = ["CAT_00", "CAT_01", "CAT_02", "CAT_03", "CAT_04", "CAT_05", "CAT_06"] subdirs = [os.path.join(args.dataset, subdir) for subdir in subdir_names] dataset = Dataset(subdirs) # load images and labels print("Loading images...") X, y = load_xy(dataset, NB_CROPS, NB_AUGMENTATIONS) assert X.dtype == np.float32 assert np.max(X) <= 1.0 assert np.min(X) >= 0.0 # split train and val """ nb_images = X.shape[0] nb_train = int(nb_images * (1 - SPLIT)) X_train = X[0:nb_train, ...] y_train = y[0:nb_train, ...] X_val = X[nb_train:, ...] y_val = y[nb_train:, ...] """ X_val, X_train = X[0:NB_VALIDATION, ...], X[NB_VALIDATION:, ...] y_val, y_train = y[0:NB_VALIDATION, ...], y[NB_VALIDATION:, ...] print("%d of %d values in y_train are 1, %d of %d values in y_val" % (np.count_nonzero(y_train), y_train.shape[0], np.count_nonzero(y_val), y_val.shape[0])) print("Training...") #svc = SVC(C=0.1, class_weight="auto", kernel="poly") svc = SVR(C=0.1, verbose=True) svc.fit(X_train, y_train) print("Predictions...") preds = svc.predict(X_val) for i in range(preds.shape[0]): print("%d: pred=%.2f, label=%.2f" % (i, preds[i], y_val[i])) print("Scoring...") acc = svc.score(X_val, y_val) print("accuracy = %.4f" % (acc))
def main():
    if debug: print "\n\n\tdrugBind.py"

    # obtain training data
    try:
        train_x, train_y, newData = getFeatures(featuresFilename)
    except IOError:
        makeFeatures(featuresFilename)
        train_x, train_y, newData = getFeatures(featuresFilename)

    # machine learning steps:
    # fit a SVM model to the data
    model = SVR()
    model.fit(train_x, train_y)
    if debug:
        print model
        print "\nUsing training data to test model accuracy:"

    # make predictions
    expected = train_y
    predicted = model.predict(train_x)

    # summarize the fit of the model
    mse = numpy.mean((predicted - expected) ** 2)  # mean of squared errors
    if debug: print("\n\tMean of squared errors: {}".format(mse))

    '''
    score() returns the coefficient of determination R^2 of the prediction.
    R^2 is defined as (1 - u/v), where u is the residual sum of squares
    ((y_true - y_pred) ** 2).sum() and v is the total sum of squares
    ((y_true - y_true.mean()) ** 2).sum().
    Best possible score is 1.0; lower values are worse.
    '''
    if debug: print("\tModel score: {}".format(model.score(train_x, train_y)))
def _random_search(self, random_iter, x, y, kernel_cache_size):
    # Default values
    c = 1.0
    gamma = 0.0
    best_score = -sys.maxint
    if random_iter > 0:
        sys.stdout.write("Do a random search %d times" % random_iter)
        param_dist = {"C": numpy.power(2.0, range(-5, 16)),
                      "gamma": numpy.power(2.0, range(-15, 4))}
        param_list = [{"C": c, "gamma": gamma}, ]
        param_list.extend(list(ParameterSampler(
            param_dist, n_iter=random_iter - 1, random_state=self._rng)))
        train_x, test_x, train_y, test_y = \
            train_test_split(x, y, test_size=0.5, random_state=self._rng)
        for idx, d in enumerate(param_list):
            svr = SVR(kernel='rbf', gamma=d['gamma'], C=d['C'],
                      random_state=self._rng, cache_size=kernel_cache_size)
            svr.fit(train_x, train_y)
            sc = svr.score(test_x, test_y)
            # Tiny progress output
            m = "."
            if idx % 10 == 0:
                m = "#"
            if sc > best_score:
                m = "<"
                best_score = sc
                c = d['C']
                gamma = d['gamma']
            sys.stdout.write(m)
            sys.stdout.flush()
    sys.stdout.write("Using C: %f and Gamma: %f\n" % (c, gamma))
    return c, gamma
def linear_regression_multivariant(X_train, X_test, Y_train, Y_test,
                                   cost_fun='ordinary_least_squares'):
    if cost_fun == 'ordinary_least_squares':
        regr = linear_model.LinearRegression()
    elif cost_fun == 'Ridge_Regression':
        regr = linear_model.Ridge(alpha=1)
    elif cost_fun == 'Bayesian_Regression':
        regr = linear_model.BayesianRidge()
    elif cost_fun == 'SVR':
        regr = SVR(C=1.0, epsilon=0.2, kernel='linear')
    elif cost_fun == 'KNN_Reg':
        regr = neighbors.KNeighborsRegressor(5, weights='distance')
    else:
        raise Exception('The type of cost function is not specified.')

    # Train the model using the training sets.
    regr.fit(X_train, Y_train)
    predict = regr.predict(X_test)

    # Record the experiment performance; a score of 1 is a perfect prediction.
    np.seterr(invalid='ignore')
    print(list(predict)[:100])
    print(Y_test[:100])
    evaluate(list(predict), np.array(Y_test),
             'linear regression ' +
             'Explained variance score: %.2f' % regr.score(X_test, Y_test))
print(x_test.shape)
print(y_test.shape)

# Define the algorithm to be used
alg = SVR(kernel='linear', C=1.0, epsilon=0.2)

# Fit the model
alg.fit(x_train, y_train)

# Verify the prediction model using the test data
y_pred = alg.predict(x_test)
print(y_pred.shape)

# Graph the test data with the regression line
plt.scatter(x_test, y_test)
plt.plot(x_test, y_pred, color='red', linewidth=3)
plt.title('Support Vector Regression')
plt.xlabel('Rooms Number')
plt.ylabel('Mean Value')
plt.show()

# Obtain the parameters ai for this model
a0 = alg.intercept_
print('a0:', a0)
a = alg.coef_
print('ai:', a)
print('y =', a0, '+', a[0], '* x')

# Verify the model error based on R²
print('certainty:', alg.score(x_train, y_train) * 100, '%')
def support_vector_regression(data):
    """
    Main method for support vector regression.
    Trains an RBF-Model, Polynomial-Model and Linear-Model.

    Args:
        data (DataML): The dataset to perform the regression on.
    """
    global dataset
    dataset = data
    global feature_mapping
    feature_mapping = create_dict()

    # Configure regression models
    svr_rbf = SVR(kernel='rbf', C=1, gamma=0.5, epsilon=0.165,
                  cache_size=1000)
    svr_lin = SVR(kernel='linear', C=1, epsilon=0.165, cache_size=1000)
    svr_poly = SVR(kernel='poly', C=100000, degree=2, epsilon=0.165,
                   cache_size=1000)

    output_dir = create_output_dir()
    results = ''
    results += calculate_crossvalidation(svr_lin, svr_poly, svr_rbf,
                                         Config.CV_FOLDS_REGRESSION)

    rbf_count_exact = []
    rbf_count_close = []
    poly_count_exact = []
    poly_count_close = []
    lin_count_exact = []
    lin_count_close = []

    test_set_start = Config.TRAIN_SET_START
    while test_set_start <= Config.TEST_SET_STOP:
        if test_set_start >= Config.TEST_SET_STOP:
            test_set_end = Config.TRAIN_SET_END
        else:
            test_set_end = test_set_start + Config.TEST_SET_SIZE
        X = np.concatenate(
            [data.data[Config.TRAIN_SET_START:test_set_start],
             data.data[test_set_end:Config.TRAIN_SET_END]])
        y = np.concatenate(
            [data.target[Config.TRAIN_SET_START:test_set_start],
             data.target[test_set_end:Config.TRAIN_SET_END]])
        X_test = data.data[test_set_start:test_set_end]
        y_test = data.target[test_set_start:test_set_end]

        # Do the mapping for target values
        do_mapping(y_test)
        do_mapping(y)

        rbf = svr_rbf.fit(X, y)
        lin = svr_lin.fit(X, y)
        poly = svr_poly.fit(X, y)
        score_rbf = svr_rbf.score(X_test, y_test)
        score_poly = svr_poly.score(X_test, y_test)
        score_lin = svr_lin.score(X_test, y_test)
        y_rbf_predicted = rbf.predict(X_test)
        y_lin_predicted = lin.predict(X_test)
        y_poly_predicted = poly.predict(X_test)

        results += add_to_results(test_set_end, test_set_start)
        count_rbf_exact, count_rbf_close, results = calculate_metrics(
            y_test, y_rbf_predicted, "RBF-Kernel", score_rbf, results)
        count_poly_exact, count_poly_close, results = calculate_metrics(
            y_test, y_poly_predicted, "Poly-Kernel", score_poly, results)
        count_lin_exact, count_lin_close, results = calculate_metrics(
            y_test, y_lin_predicted, "Linear-Kernel", score_lin, results)
        rbf_count_exact.append(count_rbf_exact)
        rbf_count_close.append(count_rbf_close)
        poly_count_exact.append(count_poly_exact)
        poly_count_close.append(count_poly_close)
        lin_count_exact.append(count_lin_exact)
        lin_count_close.append(count_lin_close)

        graph_dir = '%s/%s_%s_predicted_graph.png' % (
            output_dir, test_set_start, test_set_end)
        x_axis = np.arange(test_set_start, test_set_end)
        draw_results(y_test, y_lin_predicted, y_poly_predicted,
                     y_rbf_predicted, x_axis, graph_dir)
        test_set_start = test_set_start + Config.TEST_SET_SIZE

    results = add_counts_to_results(lin_count_close, lin_count_exact,
                                    poly_count_close, poly_count_exact,
                                    rbf_count_close, rbf_count_exact, results)

    # save file
    with open(output_dir + "/scoring_results.txt", 'w') as file:
        file.write(results)
######################## Data Scaling ########################
scaler_x = StandardScaler()
x_scaled = scaler_x.fit_transform(x)
scaler_y = StandardScaler()
y_scaled = scaler_y.fit_transform(y)

# Split
x_sc_treinamento, x_sc_teste, y_sc_treinamento, y_sc_teste = train_test_split(
    x_scaled, y_scaled, test_size=0.5, random_state=101)

######################## SVR ########################
regressor_SVM = SVR(kernel='rbf', C=100)
# Train and score on the scaled splits created above.
regressor_SVM.fit(x_sc_treinamento, y_sc_treinamento.ravel())
regressor_SVM.score(x_sc_treinamento, y_sc_treinamento)
regressor_SVM.score(x_sc_teste, y_sc_teste)

previsao = regressor_SVM.predict(x_sc_teste)

n_toPlot = -365
plt.scatter(Pivot_Data.index[n_toPlot:],
            Pivot_Data['USDBRL Curncy'].values[n_toPlot:],
            s=5, c="b", label="USDBRL Curncy")
plt.plot(Pivot_Data.index[n_toPlot:],
         regressor_SVM.predict(x_scaled)[n_toPlot:],
         c="r", label="Estimado")
y_train = ss_y.fit_transform(y_train)
y_test = ss_y.transform(y_test)

# Linear kernel
linear_svr = SVR(kernel='linear')
linear_svr.fit(X_train, y_train)
linear_svr_y_predict = linear_svr.predict(X_test)

# Polynomial kernel
poly_svr = SVR(kernel='poly')
poly_svr.fit(X_train, y_train)
poly_svr_y_predict = poly_svr.predict(X_test)

# RBF kernel
rbf_svr = SVR(kernel='rbf')
rbf_svr.fit(X_train, y_train)
rbf_svr_y_predict = rbf_svr.predict(X_test)

print('-------------The result of linear SVR-------------')
print('R-squared', linear_svr.score(X_test, y_test))
print('MSE:',
      mean_squared_error(ss_y.inverse_transform(y_test),
                         ss_y.inverse_transform(linear_svr_y_predict)))
print('MAE:',
      mean_absolute_error(ss_y.inverse_transform(y_test),
                          ss_y.inverse_transform(linear_svr_y_predict)))

print('-------------The result of poly SVR-------------')
print('R-squared', poly_svr.score(X_test, y_test))
print('MSE:',
      mean_squared_error(ss_y.inverse_transform(y_test),
                         ss_y.inverse_transform(poly_svr_y_predict)))
print(
trainData.drop(2744604, inplace=True)
print("Total Train Data: ", len(trainData))
#print(trainData.isnull().any())

trainData['matchType'] = trainData['matchType'].astype('category')
trainData['groupId'] = trainData['groupId'].astype('category')
trainData['matchId'] = trainData['matchId'].astype('category')
trainData['groupId_cat'] = trainData['groupId'].cat.codes
trainData['matchId_cat'] = trainData['matchId'].cat.codes
trainData['matchType_cat'] = trainData['matchType'].cat.codes
trainData.drop(columns=['Id', 'groupId', 'matchId', 'matchType'],
               inplace=True)
#print(trainData.head())

x = trainData.drop(['winPlacePerc'], axis=1)
y = trainData['winPlacePerc']
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2,
                                                random_state=4)
#print(xtrain.values)
print("Train: ", len(ytrain), " Test: ", len(ytest))

print("Model Training...")
svmModel = SVR(gamma=0.001, C=1.0, epsilon=0.2)
svmModel.fit(xtrain.values[:50000], ytrain.values[:50000])
print("Score: ", svmModel.score(xtest.values[:200], ytest.values[:200]))
import pandas as pd
base = pd.read_csv('plano-saude2.csv')

X = base.iloc[:, 0:1].values
y = base.iloc[:, 1:2].values

# Linear kernel
from sklearn.svm import SVR
regressor_linear = SVR(kernel='linear')
regressor_linear.fit(X, y)

import matplotlib.pyplot as plt
plt.scatter(X, y)
plt.plot(X, regressor_linear.predict(X), color='red')
regressor_linear.score(X, y)

# Polynomial kernel
regressor_poly = SVR(kernel='poly', degree=3)
regressor_poly.fit(X, y)
plt.scatter(X, y)
plt.plot(X, regressor_poly.predict(X), color='red')
regressor_poly.score(X, y)

# RBF kernel (scale the data first)
from sklearn.preprocessing import StandardScaler
scaler_x = StandardScaler()
X = scaler_x.fit_transform(X)
scaler_y = StandardScaler()
y = scaler_y.fit_transform(y)
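# A likely continuation, sketched under the assumption that the RBF model is
# trained and plotted like the linear and poly models above (this block is
# not from the source).
regressor_rbf = SVR(kernel='rbf')
regressor_rbf.fit(X, y.ravel())  # ravel to pass a 1-D target
plt.scatter(X, y)
plt.plot(X, regressor_rbf.predict(X), color='red')
regressor_rbf.score(X, y.ravel())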
X = np.array(dataset.connectivity)
y = np.array(dataset.scores['age'])
yr = np.ceil(y / 10).astype(int)

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
r2s, maes = [], []
ytrue, ypred = [], []
svr = SVR(kernel='linear')
for iteration, (train, test) in enumerate(skf.split(X, yr)):
    svr.fit(X[train], y[train])
    yp = svr.predict(X[test])
    ytrue.extend(y[test])
    ypred.extend(yp)
    r2 = svr.score(X[test], y[test])
    mae = mean_absolute_error(y[test], yp)
    print('%u: R^2 %.2f - MAE %.2f' % (iteration, r2, mae))
    maes.append(mae)
    r2s.append(r2)

f = plt.figure(figsize=(6, 6))
ax = f.gca()
ax.scatter(ytrue, ypred)
ax.plot([0, 100], [0, 100], 'r', linewidth=2)
ax.grid(linestyle='--')
xlabels = ['%u' % x for x in ax.get_xticks()]
ax.set_xticklabels(xlabels, fontsize=26)
labels = ['%u' % x for x in ax.get_yticks()]
ax.set_yticklabels(labels, fontsize=26)
ax.set_ylabel('Predicted Age', fontsize=24)
### Create the dependent data set ###
# Convert the dataframe column to a numpy array
y = np.array(df['Prediction'])
# Keep all of the y values except the last 'n' rows
y = y[:-forecast_out]

# Split data into 80% training and 20% testing
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Create and train the SVM model
svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
svr_rbf.fit(x_train, y_train)

# Testing the model: score returns the coefficient of determination R^2
# of the prediction
svm_confidence = svr_rbf.score(x_test, y_test)
print("svm confidence: ", svm_confidence)

# Create and train the Linear Regression model
lr = LinearRegression()
lr.fit(x_train, y_train)

# Test the LR model
lr_confidence = lr.score(x_test, y_test)
print("lr confidence: ", lr_confidence)

# Set x_forecast equal to the last 30 rows of the original data set
# from the Adj. Close column
x_forecast = np.array(df.drop(['Prediction'], 1))[-forecast_out:]

# Print the LR model predictions for the next 'n' days
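# Side note (an assumption based on general scikit-learn practice, not taken
# from the source): RBF-kernel SVR is sensitive to feature scale, so wrapping
# it in a pipeline with StandardScaler is a common safeguard before comparing
# its score against the linear model above.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

scaled_svr = make_pipeline(StandardScaler(), SVR(kernel='rbf', C=1e3, gamma=0.1))
scaled_svr.fit(x_train, y_train)
print("scaled svm confidence: ", scaled_svr.score(x_test, y_test))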
y_pred_gb = clf_gb.predict(x_test)

f, (ax1, ax2) = plt.subplots(1, 2, figsize=(30, 10))
# Linear Regression
ax1.scatter(range(len(y_test)), y_test, label='data')
ax1.plot(range(len(y_test)), y_pred_lr, color='green', label='LR model')
ax1.legend()
# Support Vector Machine
ax2.scatter(range(len(y_test)), y_test, label='data')
ax2.plot(range(len(y_test)), y_pred_svr, color='orange',
         label='SVM-RBF model')
ax2.legend()

f1, (ax3, ax4) = plt.subplots(1, 2, figsize=(30, 10))
# Random Forest Regressor
ax3.scatter(range(len(y_test)), y_test, label='data')
ax3.plot(range(len(y_test)), y_pred_rf, color='red', label='RF model')
ax3.legend()
# Gradient Boosting Regressor
ax4.scatter(range(len(y_test)), y_test, label='data')
ax4.plot(range(len(y_test)), y_pred_gb, color='black', label='GB model')
ax4.legend()

print("Accuracy of Linear Regression Model:", clf_lr.score(x_test, y_test))
print("Accuracy of SVM-RBF Model:", clf_svr.score(x_test, y_test))
print("Accuracy of Random Forest Model:", clf_rf.score(x_test, y_test))
print("Accuracy of Gradient Boosting Model:", clf_gb.score(x_test, y_test))
linear_svr = SVR(kernel='linear')
linear_svr.fit(x_train, y_train)
linear_svr_y_predict = linear_svr.predict(x_test)

# Polynomial kernel configuration
poly_svr = SVR(kernel='poly')
poly_svr.fit(x_train, y_train)
poly_svr_y_predict = poly_svr.predict(x_test)

# RBF kernel configuration
rbf_svr = SVR(kernel='rbf')
rbf_svr.fit(x_train, y_train)
rbf_svr_y_predict = rbf_svr.predict(x_test)

# Model evaluation
print 'the value of default measurement of Linear SVR is ', linear_svr.score(
    x_test, y_test)

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
print 'the value of R-squared of Linear SVR is ', r2_score(
    y_test, linear_svr_y_predict)
print 'the value of mean squared error of Linear SVR is ', mean_squared_error(
    ss_y.inverse_transform(y_test),
    ss_y.inverse_transform(linear_svr_y_predict))
print 'the value of mean absolute error of Linear SVR is ', mean_absolute_error(
    ss_y.inverse_transform(y_test),
    ss_y.inverse_transform(linear_svr_y_predict))
print ''

print 'the value of default measurement of Poly SVR is ', poly_svr.score(
    x_test, y_test)
print 'the value of R-squared of Poly SVR is ', r2_score(
X_test = test[features].dropna()
y_test = test[target].dropna()

svr = SVR(kernel='rbf', C=1e3, gamma=0.1)
# svr = SVR(kernel='linear', C=1e3)
# svr = SVR(kernel='poly', C=1e3, degree=2)

# train the model on the training set
svr.fit(X_train, y_train)
y_pred = svr.predict(X_test)

plt.scatter(y_test, y_pred, color='blue')
plt.xlabel("Real revenue")
plt.ylabel("Predicted revenue")
plt.show()

svr_score_train = svr.score(X_train, y_train)
svr_score_test = svr.score(X_test, y_test)
print("Training score: ", svr_score_train)
print("Testing score: ", svr_score_test)

# y = movies.revenue.values
# length = 4083
# y = y.reshape(-1, 1)
# x = preprocessing.scale(x)
# y = preprocessing.scale(y)
# regr = linear_model.LinearRegression()
# regr.fit(x, y)
# step 1. model
mod = SVR()

# step 2. learning
mod.fit(X_train, y_train)
# y_pred = mod.predict(X_train)
# print_prediction_score(y_train, y_pred)  # = training score

# step 3. predict
y_pred = mod.predict(X_test)

# step 4. score
print_prediction_score(y_test, y_pred)
print('R^2 train : %.3f, test : %.3f' % (mod.score(X_train, y_train),
                                         mod.score(X_test, y_test)))
# }}}

# 2. parameter optimization (Grid Search)
#{{{
print('')
print('')
print('# 2. parameter optimization (Grid Search)')

# step 1. model
mod = SVR()

# step 2. learning with optimized parameters
# search range
range_c = [i * 10**j for j in range(-2, 2) for i in range(1, 10)]
range_g = [i * 10**j for j in range(-2, 2) for i in range(1, 10)]
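# A sketch of how the search ranges above might feed GridSearchCV; the cv
# value and the decision to search only C and gamma are assumptions, not
# taken from the original.
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

param_grid = {'C': range_c, 'gamma': range_g}
search = GridSearchCV(SVR(), param_grid, cv=5)
search.fit(X_train, y_train)
print('best params:', search.best_params_)
print('best CV score:', search.best_score_)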
pred1 = clss.predict(X_tst)
pred1 = pd.DataFrame(pred1)

###################################################################################
#################### Decision Tree Regressor ######################################
clss = DecisionTreeRegressor()
clss.fit(X_trn, y_trn)
scoreOfModel2 = clss.score(X_trn, y_trn)
print("Model Score DTR: ", scoreOfModel2)
pred2 = clss.predict(X_tst)
pred2 = pd.DataFrame(pred2)

####################################################################################
###################### Support vector regressor ####################################
sv = SVR(kernel='rbf', C=1.0)  # radial basis function (rbf)
sv.fit(X_trn, y_trn)
scoreOfModel3 = sv.score(X_trn, y_trn)
print("Model Score SVR: ", scoreOfModel3)
pred3 = sv.predict(X_tst)
pred3 = pd.DataFrame(pred3)

####################################################################################
################### Multiple Linear Regression #####################################
reg = LinearRegression()
reg.fit(X_trn, y_trn)
scoreOfModel4 = reg.score(X_trn, y_trn)
pred4 = reg.predict(X_tst)
pred4 = pd.DataFrame(pred4)
print('r2 score MLR:', {r2_score(y_tst, pred4)})  # model evaluation

###################################################################################
######################### Ridge Regression (L2) ###################################
# Best parameters for cuxhaven.de (~5 yr. data)
# score: 0.831, Best parameters: {'C': 10, 'gamma': 0.001}
#svr_rbf = SVR(kernel='rbf', C=10, gamma=0.001)

# Best parameters for 0.2 split
# score: 0.834, Best parameters: {'C': 20, 'gamma': 0.001, 'kernel': 'rbf'}
#svr_rbf = SVR(kernel='rbf', C=20, gamma=0.001)

# Unshuffled best parameters for 0.2 split
# score: 0.812, Best parameters: {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
svr_rbf = SVR(kernel='rbf', C=10, gamma=0.001)
eps = 0.1

# RBF
svr_rbf.fit(lx_norm_train, ly_train['surge'])
pred_svr_rbf = svr_rbf.predict(lx_norm_test)  # surge predictions by the svr_rbf model
print(svr_rbf.score(lx_norm_train, ly_train['surge']))  # model score (R^2 of 0.777)

# Compare the surge values from the test dataset to the predicted surge values
SR_rmse = np.sqrt(metrics.mean_squared_error(ly_test['surge'], pred_svr_rbf))

# Plot results
plt.figure(figsize=(14, 7))
plt.plot(surge_w1['date'], surge_w1['surge'], 'black')  # un-split surge dataset
plt.plot(ly_test['date'], ly_test['surge'], 'blue')  # test data (target: surge)
plt.plot(ly_test['date'], pred_svr_rbf, 'red')
#plt.scatter(horizontal[svr_rbf.support_], retry[svr_rbf.support_],
#            facecolor='none', edgecolor='red')  # support vectors
plt.plot(ly_test['date'], pred_svr_rbf + eps, color='g', linestyle='--')
plt.plot(ly_test['date'], pred_svr_rbf - eps, color='g', linestyle='--')
plt.xlabel('Time')
clf_A.fit(features, arousal)

clf_V = SVR(C=1.0, cache_size=200, coef0=0.0, degree=2, epsilon=0.5,
            gamma='auto', kernel='linear', max_iter=-1, shrinking=True,
            tol=0.001, verbose=False)
clf_V.fit(features, valence)

print('Arousal Training R^2: %0.3f' % clf_A.score(features, arousal))
print('Valence Training R^2: %0.3f' % clf_V.score(features, valence))

# Cross Validation
scores_A = cross_val_score(clf_A, features, arousal, cv=5,
                           scoring='neg_mean_squared_error')
print("Arousal CV MSE: %0.2f (+/- %0.2f)" % (-scores_A.mean(),
                                             scores_A.std() * 2))
scores_V = cross_val_score(clf_V, features, valence, cv=5,
linear_svr.fit(X_train, y_train.ravel())
linear_svr_y_predict = linear_svr.predict(X_test)

# Train an SVR with a polynomial kernel and predict on the test samples.
poly_svr = SVR(kernel='poly')
poly_svr.fit(X_train, y_train.ravel())
poly_svr_y_predict = poly_svr.predict(X_test)

# Train an SVR with an RBF kernel and predict on the test samples.
rbf_svr = SVR(kernel='rbf')
rbf_svr.fit(X_train, y_train.ravel())
rbf_svr_y_predict = rbf_svr.predict(X_test)

print ''
print '************************************************************************************************************'
# Evaluate the three SVR configurations on the same test set using
# R-squared, MSE and MAE.
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
print 'R-squared value of linear SVR is', linear_svr.score(X_test, y_test)
print 'The mean squared error of linear SVR is', mean_squared_error(
    ss_y.inverse_transform(y_test),
    ss_y.inverse_transform(linear_svr_y_predict))
print 'The mean absolute error of linear SVR is', mean_absolute_error(
    ss_y.inverse_transform(y_test),
    ss_y.inverse_transform(linear_svr_y_predict))
print ''
print '************************************************************************************************************'
print 'R-squared value of Poly SVR is', poly_svr.score(X_test, y_test)
print 'The mean squared error of Poly SVR is', mean_squared_error(
    ss_y.inverse_transform(y_test),
    ss_y.inverse_transform(poly_svr_y_predict))
print 'The mean absolute error of Poly SVR is', mean_absolute_error(
    ss_y.inverse_transform(y_test),
    ss_y.inverse_transform(poly_svr_y_predict))
def main():
    horses98 = HorseParserNoHandicaps('./../Data/born98.csv').horses
    horses05 = HorseParserNoHandicaps('./../Data/born05.csv').horses

    races98 = RaceParserNoHandicaps('./../Data/born98.csv').races
    races05 = RaceParserNoHandicaps('./../Data/born05.csv').races

    print 'HorsesBorn98 Dataset'
    horses_train_98, horses_test_98 = split_dataset(horses98)

    horses_98_X_train = []
    horses_98_y_train = []
    for h in horses_train_98:
        v, s = compute_vector(h)
        horses_98_X_train.append(v)
        horses_98_y_train.append(s)

    print 'No. of instances in training set:'
    print len(horses_98_X_train)
    print len(horses_98_y_train)
    print ''

    horses_98_X_test = []
    horses_98_y_test = []
    for h in horses_test_98:
        v, s = compute_vector(h)
        horses_98_X_test.append(v)
        horses_98_y_test.append(s)

    print 'No. of instances in testing set:'
    print len(horses_98_X_test)
    print len(horses_98_y_test)
    print ''

    print 'Create SVR object'
    svr98 = SVR(kernel='linear', C=1e3)  #, gamma=0.1)

    print 'Training SVR'
    # Train the model using the training sets.
    svr98.fit(horses_98_X_train, horses_98_y_train)

    print 'Predicting'
    horses_98_y_pred = svr98.predict(horses_98_X_test)

    # Explained variance score: 1 is perfect prediction.
    print 'Variance score:'
    print svr98.score(horses_98_X_test, horses_98_y_test)
    print ''

    print 'Mean absolute error:'
    print mean_absolute_error(horses_98_y_test, horses_98_y_pred)
    print ''

    print 'Explained variance:'
    print explained_variance_score(horses_98_y_test, horses_98_y_pred)
    print ''

    print 'Mean squared error:'
    print mean_squared_error(horses_98_y_test, horses_98_y_pred)
    print ''

    print 'R2 score:'
    print r2_score(horses_98_y_test, horses_98_y_pred)
    print ''
from sklearn.svm import SVR

erb = 4

# SVR
svr_rbf = SVR(kernel='rbf', C=44.424063740, epsilon=0.0056846371,
              gamma=1 / 2000)
svr_rbf.fit(bts_base[erb][:1500], pathloss[erb]['plreal'][:1500])
y = svr_rbf.predict(bts_base[erb][1500:])
plreal = pathloss[erb]['plreal'][1500:]

# The mean squared error
print("Mean squared error: %.2f" % np.mean((y - plreal) ** 2))
# Explained variance score: 1 is perfect prediction.
# Score against the held-out true values, not the model's own predictions.
print('SVR score: %.2f' % svr_rbf.score(bts_base[erb][1500:], plreal))

# Plot
lw = 2
plt.plot([plreal.min(), plreal.max()], [plreal.min(), plreal.max()],
         lw=2, color="black")
plt.scatter(plreal, y, color='darkorange', label='data', edgecolors="black")
plt.xlabel("Real Path Loss")
plt.ylabel("Predicted Path Loss")
plt.show()
def svmRegressor(self):
    trainingData, desiredLabel = self.normalizeColumnwiseData()
    #trainingData, desiredLabel = self.loadExperimentData()
    test_size = 0.3
    coordinates_train, coordinates_test, windspeed_train, windspeed_test = \
        cross_validation.train_test_split(trainingData, desiredLabel,
                                          test_size=test_size)
    _, coordinates_predict, _, windspeed_predict = \
        cross_validation.train_test_split(coordinates_test, windspeed_test,
                                          test_size=0.04)

    kernel = 'rbf'
    c = 21.0
    epsilon = 0.2
    gamma = 1.6
    curveFit = SVR(kernel=kernel, C=c, epsilon=epsilon, gamma=gamma)
    print curveFit
    print "kernel : ", kernel, "C : ", c, "epsilon : ", epsilon, \
        "gamma : ", gamma, "test % : ", test_size, \
        "no of train data : ", len(coordinates_train)
    curveFit = curveFit.fit(coordinates_train, windspeed_train)
    print "Number of support vectors used:", len(curveFit.support_vectors_)
    print "Prediction Score :", curveFit.score(coordinates_test,
                                               windspeed_test)
    predicted_speed = curveFit.predict(coordinates_predict)

    predicted_speed_random_number_generator = []
    for i in coordinates_predict:
        predicted_speed_random_number_generator.append(random.uniform(10, 43))
    predicted_speed_random_number_generator2 = []
    for i in coordinates_predict:
        predicted_speed_random_number_generator2.append(random.uniform(10, 43))

    mse = mean_squared_error(windspeed_test,
                             curveFit.predict(coordinates_test))
    rms = sqrt(mse)
    print "mse : ", mse

    errorbarValues = []
    #errorbins = [-4, -3, -2, -1, 0, 1, 2, 3, 4, 5]
    errorbins = np.arange(-30, 30, 1)
    for threshold in errorbins:
        correct_estimation = 0
        for i in range(len(predicted_speed)):
            if (windspeed_predict[i] - predicted_speed[i] < threshold) and \
               (windspeed_predict[i] - predicted_speed[i] > threshold - 1):
                correct_estimation += 1
        print "for threshold between: ", threshold, " and ", threshold - 1, \
            " estimation: ", correct_estimation, " out of : ", \
            len(windspeed_predict)
        errorbarValues.append(correct_estimation)

    """for threshold in [1, 2, 3, 4, 5]:
        correct_estimation = 0
        for i in range(len(predicted_speed_random_number_generator)):
            if np.abs(windspeed_predict[i] - predicted_speed_random_number_generator[i]) < threshold:
                correct_estimation += 1
        print "for threshold : ", threshold, "Fake Correct estimation: ", correct_estimation, " out of : ", len(windspeed_predict)

    for threshold in [1, 2, 3, 4, 5]:
        correct_estimation = 0
        for i in range(len(predicted_speed_random_number_generator)):
            if np.abs(predicted_speed_random_number_generator[i] - predicted_speed_random_number_generator2[i]) < threshold:
                correct_estimation += 1
        print "for threshold : ", threshold, "Total Fake Correct estimation: ", correct_estimation, " out of : ", len(predicted_speed_random_number_generator)"""

    ###############################################################################
    # Plot the error bar
    fig = plt.figure()
    ax = fig.add_subplot(111)
    width = 0.4
    ax.bar([i - width for i in errorbins], errorbarValues, width,
           color="y", alpha=0.7)
    #ax.bar(errorbins, errorbarValues, width, color="y", alpha=0.7)
    plt.xlabel("Estimation error(kmph)")
    plt.ylabel("Number of observation")
    plt.title("Error histogram SVR")
    ax.set_xlim(-25, 25)
    plt.grid()

    # look at the results
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(np.arange(0, len(predicted_speed), 1), predicted_speed,
               c='g', marker='+', label='predicted speed')
    ax.scatter(np.arange(0, len(windspeed_predict), 1), windspeed_predict,
               c='r', marker='x', label='Actual data')
    ax.set_xlim(-2, len(windspeed_predict))
    ax.set_ylim(8, 45)
    plt.xlabel('Number of test cases')
    plt.ylabel('wind speed')
    plt.title('Support Vector Regression')
    ax.legend()
    for i in range(len(predicted_speed)):
        ax.annotate('', xy=(i, windspeed_predict[i]),
                    xytext=(i, predicted_speed[i]),
                    arrowprops=dict(facecolor='b', alpha=0.5, shrink=0.03,
                                    headwidth=4.5, width=1.5, frac=0.4))
    plt.show()
w = w[:-forecast]
print(w)
q = y['Prediction']
q = q[:-forecast]
print(q)

from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(w, q, test_size=0.20)
svr_rbf = SVR(kernel='rbf', C=1e3)
svr_rbf.fit(x_train, y_train)
svr_out = svr_rbf.score(x_test, y_test)
print(svr_out)

x_forecast = y.drop(['Prediction'], 1)[-forecast:]
print(x_forecast)
svr_prediction = svr_rbf.predict(x_forecast)
print(svr_prediction)

import matplotlib.pyplot as plt
z = y['2018-11-30':]
plt.figure(figsize=(16, 8))
plt.title('Bajaj Auto')
plt.xlabel('Days')
dim = 128
f1 = f1(dim)

nbTrain, nbTest = 10000, 100
xTrain = np.random.rand(nbTrain, dim)
xTest = np.random.rand(nbTest, dim)

#%%
yTrain = f1.compute(xTrain)
yTest = f1.compute(xTest)

svr = SVR()
print 'SVR'
print 'Learning...'
svr.fit(xTrain, yTrain)
print 'Scoring...'
print svr.score(xTest, yTest)

#%%
yTrain = f1.computeC(xTrain)
yTest = f1.computeC(xTest)

svc = SVC()
print 'SVC'
print 'Learning...'
svc.fit(xTrain, yTrain)
print 'Scoring...'
print svc.score(xTest, yTest)
X = data.iloc[:, 1:2].values
y = data.iloc[:, 2].values
length_old = len(data.columns)

sc_X = StandardScaler()
sc_y = StandardScaler()
X = sc_X.fit_transform(X.reshape(-1, 1))
y = sc_y.fit_transform(y.reshape(-1, 1))

reg = SVR(kernel='rbf')
reg.fit(X, y.ravel())
# Scale the query point with the same scaler the model was trained on.
y_pred = reg.predict(sc_X.transform(np.array([[5]])))
print(y_pred)

r2 = reg.score(X, y)
print(r2)

plt.scatter(X, y, color='r')
plt.plot(X, reg.predict(X), color='b')
plt.show()

# Importing dataset
dataSet = pd.read_csv(
    '/home/admin1/Desktop/Gayatri/Week2/ML_Problems/DecisionTreeRegression/Position_Salaries.csv'
)
length_old = len(dataSet.columns)

# Handling categorical data
positions = pd.get_dummies(dataSet['Position'])
dataSet = dataSet.drop('Position', axis=1)
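# Hedged aside on the SVR block above (an addition, not in the source):
# reg.predict() returns values in the scaled target space, so mapping the
# result back through sc_y recovers the original units.
y_pred_original = sc_y.inverse_transform(y_pred.reshape(-1, 1))
print(y_pred_original)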
import pandas as pd
base = pd.read_csv('house-prices.csv')

X = base.iloc[:, 3:19].values
y = base.iloc[:, 2:3].values

from sklearn.preprocessing import StandardScaler
scaler_x = StandardScaler()
X = scaler_x.fit_transform(X)
scaler_y = StandardScaler()
y = scaler_y.fit_transform(y)

from sklearn.model_selection import train_test_split
X_treinamento, X_teste, y_treinamento, y_teste = train_test_split(
    X, y, test_size=0.3, random_state=0)

from sklearn.svm import SVR
regressor = SVR(kernel='rbf')
regressor.fit(X_treinamento, y_treinamento.ravel())

score = regressor.score(X_treinamento, y_treinamento)
regressor.score(X_teste, y_teste)

previsoes = regressor.predict(X_teste)
y_teste = scaler_y.inverse_transform(y_teste)
previsoes = scaler_y.inverse_transform(previsoes.reshape(-1, 1))

from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(y_teste, previsoes)
      .format(tree.score(training_encoded_data, training_targets)))
print(' VGG16 Tree MaxLevel=3 R2 testing score = {}'
      .format(tree.score(testing_encoded_data, testing_targets)))

tree = DecisionTreeRegressor(max_depth=2)
tree.fit(training_encoded_data, training_targets)
print(' VGG16 Tree MaxLevel=2 R2 training score = {}'
      .format(tree.score(training_encoded_data, training_targets)))
print(' VGG16 Tree MaxLevel=2 R2 testing score = {}'
      .format(tree.score(testing_encoded_data, testing_targets)))

svm = SVR()
scaler = preprocessing.MinMaxScaler().fit(training_encoded_data)
svm.fit(scaler.transform(training_encoded_data), training_targets)
print(' VGG16 MinMaxScale Svm R2 training score = {}'
      .format(svm.score(scaler.transform(training_encoded_data),
                        training_targets)))
print(' VGG16 MinMaxScale Svm R2 testing score = {}'
      .format(svm.score(scaler.transform(testing_encoded_data),
                        testing_targets)))

svm = SVR()
svm.fit(training_encoded_data, training_targets)
print(' VGG16 Svm R2 training score = {}'
      .format(svm.score(training_encoded_data, training_targets)))
print(' VGG16 Svm R2 testing score = {}'
      .format(svm.score(testing_encoded_data, testing_targets)))
print()

for index in range(11):
    print('ResNet50 index={}'.format(index))
    extractor = ResNet50ImageFeature(index)
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.20, random_state=42)

# Applying different models: a classifier and a regressor.
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVR
cfl1 = RandomForestClassifier()
cfl2 = SVR()

cfl1.fit(features_train, labels_train)
acc_test = cfl1.score(features_train, labels_train)
acc_test

cfl2.fit(features_train, labels_train)
cfl2.score(features_train, labels_train)

cfl1.score(features_test, labels_test)
cfl2.score(features_test, labels_test)

# Predicting a flower type
flower = [[5.7, 0.5]]
class_code = cfl1.predict(flower)
decoded_class = le.inverse_transform(class_code)
print(decoded_class)
class_code
# dimension should be n_bins
target = geo_data[:, column_feature_mapping[pair[0]]]
print target

# create data (a 2D numpy array based on the classifiers the user provides)
# dimensions should be n_bins x n_classifiers
data = geo_data[:, [column_feature_mapping[pair[1]]]]
print data

clf = SVR()
training_data, testing_data, training_target, testing_target = \
    cross_validation.train_test_split(data, target, test_size=0.4,
                                      random_state=0)
clf.fit(training_data, training_target)
print clf.score(testing_data, testing_target)

data = geo_data[:, column_feature_mapping[pair[1]]]
if pair[1] == "farMarket":
    print "farmarket data:"
    print data
corr_coefficent = stat.pearsonr(data, target)
print corr_coefficent[0]

plt.figure(i)
plt.scatter(data, target)
plt.xlabel(pair[1])
plt.ylabel(pair[0])
plt.title(
    "Pearson Correlation: " + str(corr_coefficent[0]) +
    " SVR Score: "
# Train the models.
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_y_predict = lr.predict(X_test)
print("Closed-form solution score:", lr.score(X_test, y_test))

sgdr = SGDRegressor()
sgdr.fit(X_train, y_train)
sgdr_y_predict = sgdr.predict(X_test)
print("Stochastic gradient descent score:", sgdr.score(X_test, y_test))

linear_svr = SVR(kernel='linear')  # SVR with a linear kernel
linear_svr.fit(X_train, y_train)
linear_y_predict = linear_svr.predict(X_test)
print("Linear kernel score:", linear_svr.score(X_test, y_test))

poly_svr = SVR(kernel='poly')
poly_svr.fit(X_train, y_train)
poly_y_predict = poly_svr.predict(X_test)
print("Polynomial kernel score:", poly_svr.score(X_test, y_test))

rbf_svr = SVR(kernel='rbf')
rbf_svr.fit(X_train, y_train)
rbf_y_predict = rbf_svr.predict(X_test)
print("RBF kernel score:", rbf_svr.score(X_test, y_test))

knr = KNeighborsRegressor()
knr.fit(X_train, y_train)
knr_y_predict = knr.predict(X_test)  # predict on the test set, not X_train
print("K-nearest neighbors score:", knr.score(X_test, y_test))
######################
# cross validating
#####################
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    train, train_count, test_size=0.4, random_state=0)

#############################################################################################
#                                   Training Section                                        #
#############################################################################################
print "Training"

# Support Vector Regression
rbf = SVR(kernel="rbf", C=1e3, gamma=0.1)
# rbf.fit(train, train_count)
rbf.fit(X_train, y_train)
print "svr with rbf ", rbf.score(X_test, y_test)

# Bayesian Ridge Regression
clf = linear_model.BayesianRidge(compute_score=True)
# clf.fit(train, train_count)
clf.fit(X_train, y_train)
print "Bayesian Ridge Regression ", clf.score(X_test, y_test)

# Linear Regression
ols = linear_model.LinearRegression()
# ols.fit(train, train_count)
ols.fit(X_train, y_train)
print "Linear Regressor ", ols.score(X_test, y_test)

# Gradient Boosting Regression
sc_X = StandardScaler()
sc_y = StandardScaler()
x_tr = sc_X.fit_transform(x_tr)
x_ts = sc_X.transform(x_ts)
y_tr = sc_y.fit_transform(y_tr)
y_ts = sc_y.transform(y_ts)  # transform only; the scaler was fit on the training targets

# SVR algorithm for training purposes
from sklearn.svm import SVR
regressor = SVR(kernel='rbf')
regressor.fit(x_tr, y_tr)

# SVR algorithm for testing purposes
y_pred = regressor.predict(x_ts)
regressor.score(x_tr, y_tr)
regressor.score(x_ts, y_ts)

# Applying k-fold cross validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator=regressor, X=x_tr, y=y_tr, cv=5,
                             scoring='neg_mean_squared_error')
accuracies = accuracies * (-1)
accuracies.mean()
accuracies.std()
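# Hedged aside (not in the original): 'neg_mean_squared_error' returns
# negated MSEs, which is why the sign flip above is needed; per-fold RMSE in
# the scaled target units then follows directly.
import numpy as np
rmse_per_fold = np.sqrt(accuracies)
print("mean RMSE (scaled units):", rmse_per_fold.mean())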
def svr_trading(context, data):
    # Historical data: get the past days' close prices.
    pastPrice = history(bar_count=context.history_len, frequency='1d',
                        field='price')

    # Make predictions on the universe.
    for stock in data:
        # Make sure this stock has no existing orders or positions to
        # simplify our portfolio handling.
        if check_if_no_conflicting_orders(stock) and \
                context.portfolio.positions[stock].amount == 0:

            # This is a scoring system for our model; we only trade when
            # confident our model is wicked awesome.
            full_series = np.array(pastPrice[stock].values)
            l = context.out_of_sameple_bin_size
            power = 1  # N where X^N for the weight function

            # Create bins of length l to hold out as out-of-sample data; the
            # average score (error) of these is a decent measure of fit.
            prediction_history = []
            for i in np.arange(context.history_len / context.out_of_sameple_bin_size):
                # Index the current in-sample (ISD) and out-of-sample (OOSD)
                # data. Three cases of this slicing:
                if i == 0:
                    # First run: only two bins to work with (first OOSD bin
                    # and the rest of the data).
                    ISD = full_series[l:]
                    OOSD = full_series[:l]
                    X = np.arange(l, len(full_series))
                    # Use a variable weight (~0 - 1.0).
                    weight_training = \
                        np.power(np.arange(l, len(full_series), dtype=float), power)[::-1] / \
                        np.power(np.arange(l, len(full_series), dtype=float), power)[::-1].max()
                    # Use a variable weight, focused on next-day prediction
                    # (~0 - 1.0 - ~0).
                    weight_score = np.concatenate((
                        np.power(np.arange(1, l + 1, dtype=float), power) /
                        np.power(np.arange(1, l + 1, dtype=float), power).max(),
                        np.power(np.arange(l + 1, len(full_series) + 1, dtype=float), power)[::-1] /
                        np.power(np.arange(l + 1, len(full_series) + 2, dtype=float), power)[::-1].max()))
                elif i == context.history_len / context.out_of_sameple_bin_size - 1:
                    # Last run: only two bins to work with (last OOSD bin and
                    # the rest of the data).
                    ISD = full_series[:-l]
                    OOSD = full_series[-l:]
                    X = np.arange(0, len(full_series) - l)
                    # Use a variable weight (~0 - 1.0).
                    weight_training = \
                        np.power(np.arange(l, len(full_series), dtype=float) + 1, power) / \
                        np.power(np.arange(l, len(full_series), dtype=float) + 1, power).max()
                    # Use a variable weight, focused on next-day prediction
                    # (~0 - 1.0 - ~0).
                    weight_score = np.concatenate((
                        np.power(np.arange(1, len(full_series) - l + 1, dtype=float), power) /
                        np.power(np.arange(1, len(full_series) - l + 2, dtype=float), power).max(),
                        np.power(np.arange(1, l + 1, dtype=float), power)[::-1] /
                        np.power(np.arange(1, l + 1, dtype=float), power)[::-1].max()))
                else:
                    # Any other run: we have a sandwich of OOSD in the middle
                    # of two ISD sets, so we need to aggregate.
                    ISD = np.concatenate((full_series[:(l * i)],
                                          full_series[l * (i + 1):]))
                    OOSD = full_series[l * i:l * (i + 1)]
                    X = np.concatenate((np.arange(0, (l * i)),
                                        np.arange(l * (i + 1), len(full_series))))
                    # Use a variable weight (~0 - 1.0).
                    weight_training = np.concatenate((
                        np.power(np.arange(1, l * i + 1, dtype=float), power) /
                        np.power(np.arange(1, l * i + 1, dtype=float), power).max(),
                        np.power(np.arange(l * (i + 1), len(full_series), dtype=float), power)[::-1] /
                        np.power(np.arange(l * (i + 1), len(full_series), dtype=float), power)[::-1].max()))
                    # Use a variable weight, focused on next-day prediction
                    # (~0 - 1.0 - ~0).
                    weight_score = np.concatenate((
                        np.power(np.arange(1, l * (i + 1) + 1, dtype=float), power) /
                        np.power(np.arange(1, l * (i + 1) + 1, dtype=float), power).max(),
                        np.power(np.arange(l * (i + 1), len(full_series), dtype=float), power)[::-1] /
                        np.power(np.arange(l * (i + 1), len(full_series) + 1, dtype=float), power)[::-1].max()))

                # Domain and range of training data.
                #X = np.arange(len(ISD))
                X = np.atleast_2d(X).T
                y = ISD

                # Domain of the prediction set.
                x = np.atleast_2d(np.linspace(0, len(full_series) - 1,
                                              len(full_series))).T

                # epsilon-Support Vector Regression using scikit-learn.
                # Read more here:
                # http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html
                SVR_model = SVR(kernel='rbf', C=100, gamma=.01)
                SVR_model.fit(X, y, weight_training)
                y_predSVR = SVR_model.predict(x)
                if np.isnan(full_series).any() or np.isinf(full_series).any():
                    print(stock + " Failed due to data INF or NAN")
                    y_score = 0
                    break
                else:
                    y_score = SVR_model.score(x, full_series)  #, sample_weight=weight_score)
                #log.debug(y_score)
                prediction_history.append(y_score)

            score = np.mean(prediction_history)  # average the per-bin scores

            # If we are studying one stock, plot its correlation regression results.
            if len(data) == 1:
                record(Ideal=1.0, Score=score)  #Slope=slope, R_value=r

            # Store the prediction for comparison with the rest of the universe.
            # Measure accuracy as the mean of the distance to the ideal value
            # of the r2 and slope from past vs predicted price correlation
            # regression.
            if score >= context.score_filter:
                # The model was accepted; make a forecast. Form the domain and
                # range of the full data (we leave no out-of-sample data out,
                # since we already scored the model).
                X = np.arange(context.history_len)
                X = np.atleast_2d(X).T
                y = np.array(pastPrice[stock].values)

                # Domain of the prediction set. We only need to predict the
                # next close price.
                x = np.atleast_2d(np.linspace(len(y), len(y), 1)).T

                # epsilon-Support Vector Regression using scikit-learn.
                SVR_model = SVR(kernel='rbf', C=100, gamma=.01)
                SVR_model.fit(X, y)  #, weight_training)
                y_predSVR = SVR_model.predict(x)
                context.next_pred_price[stock] = y_predSVR[-1]
            else:
                # Case where the stock is left in the dict and we don't want
                # to use it, so remove it.
if stock in context.next_pred_price: del context.next_pred_price[stock] # Count number of trades so we can split the availible cash properly number_of_trades_today = 0 for stock in data: # Make sure this stock has no existing orders or positions to simplify our portfolio handling # Also check that we have a prediction stored in the dict if check_if_no_conflicting_orders(stock) and \ context.portfolio.positions[stock].amount == 0 and \ stock in context.next_pred_price: # If we plan to move on this stock, take count of it(explained more in actual buy statement below)(Make sure these match both buy statements. if (percent_change(context.next_pred_price[stock], pastPrice[stock][-1]) >= context.action_to_move_percent and \ percent_change(context.next_pred_price[stock], data[stock]['price']) >= context.action_to_move_percent) or \ (percent_change(context.next_pred_price[stock], pastPrice[stock][-1]) <= -context.action_to_move_percent and \ percent_change(context.next_pred_price[stock], data[stock]['price']) <= -context.action_to_move_percent): number_of_trades_today += 1 # #Lets use record to plot how many securities are traded on each day. if len(data) >= 2: record(number_of_stocks_traded=number_of_trades_today) #Make buys and shorts if the predicted close change is bigger than our tollerance, same with current price to avoid opening gaps. for stock in data: # Make sure this stock has no existing orders or positions to simplify our portfolio handling # Also check that we have a prediction stored in the dict if check_if_no_conflicting_orders(stock) and context.portfolio.positions[stock].amount == 0 and stock in context.next_pred_price: #Go long if we predict the close price will change more(upward) than our tollerance, # apply same filter against current price vs predicted close in case of gap up/down. if percent_change(context.next_pred_price[stock], pastPrice[stock][-1]) >= context.action_to_move_percent and \ percent_change(context.next_pred_price[stock], data[stock]['price']) >= context.action_to_move_percent: # Place an order, and store the ID to fetch order info orderId = order_target_percent(stock, 1.0/number_of_trades_today) # How many shares did we just order, since we used target percent of availible cash to place order not share count. shareCount = get_order(orderId).amount # We can add a timeout time on the order. #context.duration[orderId] = exchange_time + timedelta(minutes=5) # We need to calculate our own inter cycle portfolio snapshot as its not updated till next cycle. value_of_open_orders(context, data) availibleCash = context.portfolio.cash-context.cashCommitedToBuy-context.cashCommitedToSell print("+ BUY {0:,d} of {1:s} at ${2:,.2f} for ${3:,.2f} / ${4:,.2f} @ {5:s}"\ .format(shareCount, stock,data[stock]['price'], data[stock]['price']*shareCount, availibleCash, context.exchange_time)) #Go short if we predict the close price will change more(downward) than our tollerance, # apply same filter against current price vs predicted close incase of gap up/down. elif percent_change(context.next_pred_price[stock], pastPrice[stock][-1]) <= -context.action_to_move_percent and \ percent_change(context.next_pred_price[stock], data[stock]['price']) <= -context.action_to_move_percent: #orderId = order_target_percent(stock, -1.0/len(data)) orderId = order_target_percent(stock, -1.0/number_of_trades_today) # How many shares did we just order, since we used target percent of availible cash to place order not share count. 
shareCount = get_order(orderId).amount # We can add a timeout time on the order. #context.duration[orderId] = exchange_time + timedelta(minutes=5) # We need to calculate our own inter cycle portfolio snapshot as its not updated till next cycle. value_of_open_orders(context, data) availibleCash = context.portfolio.cash-context.cashCommitedToBuy+context.cashCommitedToSell print("- SHORT {0:,d} of {1:s} at ${2:,.2f} for ${3:,.2f} / ${4:,.2f} @ {5:s}"\ .format(shareCount, stock,data[stock]['price'], data[stock]['price']*shareCount, availibleCash, context.exchange_time))
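# The snippet above relies on a percent_change(new, old) helper that is not
# defined in this excerpt. A minimal sketch of what it presumably computes
# (the signed percentage move from old to new); the name and signature are
# taken from the calls above, the body is an assumption:
def percent_change(new, old):
    """Signed percent difference between a predicted and a reference price."""
    return (new - old) / old * 100.0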
test_y_list = []
for x, y in testExamples:
    test_feature_list.append(tweetFeatureExtractor(x))
    test_y_list.append(y)

test = False
if test:
    n_samples, n_features = 10, 5
    np.random.seed(0)
    y = np.random.randn(n_samples)
    X = np.random.randn(n_samples, n_features)
    print y
    print X
    clf = SVR(C=1.0, epsilon=0.2)
    clf.fit(X, y)
    print clf.score(X, y)

## Straight SVR
#y = np.array(y_list)
#X = np.matrix(feature_list)
#test_y = np.array(test_y_list)
#test_X = np.matrix(test_feature_list)
#print y
#print X
#clf = SVR(C=1.0, epsilon=0.4)
#clf.fit(X, y)
#print clf.score(test_X, test_y)
#print test_y_list
#pred_output = [int(round(y)) for y in clf.predict(test_X).tolist()]
#print pred_output
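# An uncommented, hedged version of the "Straight SVR" block above: fit on the
# training tweets and score on the held-out ones. y_list and feature_list are
# assumed to come from earlier in the script.
y = np.array(y_list)
X = np.matrix(feature_list)
test_y = np.array(test_y_list)
test_X = np.matrix(test_feature_list)
clf = SVR(C=1.0, epsilon=0.4)
clf.fit(X, y)
print clf.score(test_X, test_y)
# Round the continuous SVR outputs to the nearest integer label:
pred_output = [int(round(p)) for p in clf.predict(test_X).tolist()]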
class predictRealData:

    def __init__(self, learningProblem, seriesLengthInSeconds, readPath,
                 writePath, test_size, slidingWindow=False, flag=True):
        self.frameRatePerSecond = 120
        self.seriesLengthInSeconds = seriesLengthInSeconds  #0.5 #0.0625
        self.featuresPerSeries = np.round(self.seriesLengthInSeconds * self.frameRatePerSecond)
        self.framesInTheSlidingWindow = int(self.featuresPerSeries/3)
        self.learningProblem = learningProblem
        # This is the path from which the experiment results will be read
        self.readPath = readPath
        self.writePath = writePath
        self.slidingWindow = slidingWindow
        #print "features per series", 2*self.featuresPerSeries
        self.numberOfFeatures = 6
        self.test_size = test_size
        if learningProblem != "regression":
            self.clf = SVC(C=1.6, gamma=0.002)
        else:
            self.clf = SVR(kernel='rbf', C=1.2, epsilon=1.38)
        self.flagPredict = flag
        if flag:
            self.writePath = './tuft_real_data/17June/extractedFeatures/'

    def outlierDetection(self, listItem):
        # Reference: http://www.itl.nist.gov/div898/handbook/prc/section1/prc16.htm
        listItem = copy.deepcopy(listItem)
        # Step 1: Sort the list
        sortedListItem = sorted(listItem)
        # Step 2: Find the median value
        medianValue = self.calculateMedian(sortedListItem)
        # Step 3: Find the lower quartile
        lowerQuartile = sortedListItem[:len(listItem)//2]
        lowerQuartileValue = self.calculateMedian(lowerQuartile)
        # Step 4: Find the upper quartile
        upperQuartile = sortedListItem[len(listItem)//2:]
        upperQuartileValue = self.calculateMedian(upperQuartile)
        # Step 5: Find the interquartile range; the inner fences sit 1.5 IQRs out
        interquartileRange = (upperQuartileValue - lowerQuartileValue)*1.5
        innerFences = ((lowerQuartileValue - interquartileRange),
                       (upperQuartileValue + interquartileRange))
        # Step 6: The outer fences sit 3 IQRs out
        interquartileRange2 = (upperQuartileValue - lowerQuartileValue)*3.0
        outerFences = ((lowerQuartileValue - interquartileRange2),
                       (upperQuartileValue + interquartileRange2))
        # Step 7: Collect the major and minor outliers
        majorOutliers = []
        minorOutliers = []
        #print "outerFences", outerFences
        for i in listItem:
            if i < outerFences[0] or i > outerFences[1]:
                majorOutliers.append(i)
            if (i < innerFences[0] or i > innerFences[1]) and i not in majorOutliers:
                minorOutliers.append(i)
        return majorOutliers, minorOutliers

    def calculateMedian(self, listItem2):
        listItem2 = copy.deepcopy(listItem2)
        if len(listItem2) == 0:
            print "Empty array"
            return None
        if len(listItem2) % 2 == 0:
            # Even-length list: average the two middle elements
            middleValue = len(listItem2)//2
            medianValue = (listItem2[middleValue-1] + listItem2[middleValue])/2.0
        else:
            medianValue = listItem2[len(listItem2)//2]
        #print medianValue
        return medianValue

    def rangeFinder(self, listItem3, outliers=None):
        listItem3 = copy.deepcopy(listItem3)
        outliers = copy.deepcopy(outliers)
        withoutOutliersList = []
        if outliers is not None and len(outliers) > 0:
            for v in listItem3:
                if v not in outliers:
                    withoutOutliersList.append(v)
            valueRange = (min(withoutOutliersList), max(withoutOutliersList))
        else:
            valueRange = (min(listItem3), max(listItem3))
            withoutOutliersList = listItem3
        #print "range", valueRange[0], valueRange[1]
        medianValue = self.calculateMedian(withoutOutliersList)
        std = np.std(withoutOutliersList)
        # Note: returns the standard deviation, not the median computed above.
        return valueRange, std

    '''This function reads files and creates a feature list which is sent to
    the writeFeatures() function, which creates .csv files containing features
    and separate files containing the related labels'''
    def processFiles(self):
        list_of_files = glob.glob(self.readPath + '*.txt')
        #list_of_files = glob.glob('./tuft_real_data/22April/*.txt')
        colorCode = -1
        colors = ["ro", "bo", "mo", "go", "yo"]
        numberOfFiles = 1
        # For the median plot
        mediansX = []
        mediansY = []
        speeds = []
        averagesX = []
        averagesY = []
        '''For the speed vs outlier analysis'''
        speedsStack = []
        outlierStackX = []
        outlierStackY = []
        for fileName in list_of_files:
            print os.path.splitext(fileName)[0].split("/")[-1]
            #data_list = open(fileName, "r").readlines()
            featureSet = []
            normalisedX = []
            normalisedY = []
            typeOfFlow = None
            speed = 0.0
            # Saving x and y coordinates separately for range analysis
            xValues = []
            yValues = []
            colorCode += 1
            # For one file there should be one origin x and y
            originX = None
            originY = None
            flagOrigin = True
            with open(fileName, 'r') as f:
                for line in f:
                    if "TYPE" in line and line != "":
                        typeOfFlow = line.split()
                        typeOfFlow = typeOfFlow[1]
                        #print typeOfFlow
                    elif "SPEED" in line and line != "":
                        speed = line.split()
                        speed = speed[1]
                        #print speed
                    elif "VECTOR" in line and line != "":
                        coordinates = line.split()
                        #print coordinates[0]
                        # TODO: Only fixed start coordinates can be used.
                        if flagOrigin:
                            originX = float(coordinates[1])
                            originY = float(coordinates[2])
                            flagOrigin = False
                        #normalisedX = float(coordinates[3]) - float(coordinates[1])
                        #normalisedY = float(coordinates[4]) - float(coordinates[2])
                        #xValues.append(coordinates[1])
                        xValues.append(float(coordinates[3]) - float(coordinates[1]))
                        #yValues.append(coordinates[2])
                        yValues.append(float(coordinates[4]) - float(coordinates[2]))
                        #featureSet.append(normalisedX)
                        #featureSet.append(normalisedY)
                    elif "FRAME" in line and line != "":
                        frames = line.split()
                        vectors = len(xValues) + 1
                        '''if vectors in [3660, 3690, 3900, 4230, 4470, 4500, 5730, 9900]:
                            print frames[1]'''
            featureFileName = os.path.splitext(fileName)[0].split("/")[-1]
            numberOfFiles += 1
            # Calculating the outliers
            majorOutliersX, minorOutliersX = self.outlierDetection(xValues)
            majorOutliersY, minorOutliersY = self.outlierDetection(yValues)
            outlierStackX.append(len(minorOutliersX))
            outlierStackY.append(len(minorOutliersY))
            speedsStack.append(speed)
            # Calculating the range of the vectors
            valueRangeX, medianValueX = self.rangeFinder(xValues, majorOutliersX)
            valueRangeY, medianValueY = self.rangeFinder(yValues, majorOutliersY)
            # Plotting the median and range values
            mediansX.append(medianValueX)
            mediansY.append(medianValueY)
            speeds.append(speed)
            '''Note: the way this logic is implemented, the outliers in y do
            not come into play at all'''
            withoutOutliersListX = []
            withoutOutliersIndex = []
            for i in range(len(xValues)):
                if xValues[i] not in majorOutliersX:
                    withoutOutliersListX.append(xValues[i])
                    withoutOutliersIndex.append(i)
            withoutOutliersListY = []
            for i in range(len(yValues)):
                if i in withoutOutliersIndex:
                    withoutOutliersListY.append(yValues[i])
                    #featureSet.append(xValues[i])
                    #featureSet.append(yValues[i])
                    normalisedX.append(xValues[i])
                    normalisedY.append(yValues[i])
            # Function call supplying the polar coordinates
            featureSet2 = []
            featureSet2 = self.supplyStats(normalisedX, normalisedY)
            if self.flagPredict:
                self.predictResult(featureSet2)
                typeOfFlow = ['NA']
                speed = 'NA'
            self.writeFeatures(featureSet2, featureFileName, typeOfFlow, speed)
        return featureSet

    '''This function takes input of the xdiff, ydiff values and returns blocks
    of stats in place of the raw data'''
    def supplyStats(self, xDiff, yDiff):
        xDiff = copy.deepcopy(xDiff)
        yDiff = copy.deepcopy(yDiff)
        xDiffBlocks = self.cutThelengthOfdata(xDiff, self.featuresPerSeries)
        yDiffBlocks = self.cutThelengthOfdata(yDiff, self.featuresPerSeries)
        #print xDiffBlocks
        statsAsFeatures = []
        for i in range(len(xDiffBlocks)):
            # At this index the block of x values is stored; it will be
            # processed for stats collection
            distances, angles, featureSet2 = self.supplyPolarCoordinates(xDiffBlocks[i], yDiffBlocks[i])
            statsAsFeatures.append(np.mean(distances))
            statsAsFeatures.append(np.mean(angles))
            valueRange, std = self.rangeFinder(xDiffBlocks[i])
            statsAsFeatures.append(std)
            statsAsFeatures.append(valueRange[1] - valueRange[0])
            valueRange, std = self.rangeFinder(yDiffBlocks[i])
            statsAsFeatures.append(std)
            statsAsFeatures.append(valueRange[1] - valueRange[0])
        #print "Mean vector length, mean angle, std x, range diff x, std y, range diff y"
        return statsAsFeatures

    '''This function returns the blocks of the feature to be exchanged with
    the stats data'''
    def cutThelengthOfdata(self, dataInput, lengthOfVector):
        counter = 0
        eachLine = []
        dataOutput = []
        for i in range(len(dataInput)):  #1,2,3,4,5,6,4,3,2,1
            if counter < lengthOfVector:
                eachLine.append(dataInput[i])
                counter += 1
            else:
                counter = 1
                dataOutput.append(eachLine)
                eachLine = []
                eachLine.append(dataInput[i])
        return dataOutput

    '''Distance calculator'''
    def distanceCalculator(self, diff1, diff2):
        distance = np.sqrt(pow(diff1, 2) + pow(diff2, 2))
        return distance

    '''This function calculates the polar coordinates for a supplied list of
    cartesian coordinates'''
    def supplyPolarCoordinates(self, xDiff, yDiff):
        xDiff = copy.deepcopy(xDiff)
        yDiff = copy.deepcopy(yDiff)
        polarDistances = []
        angles = []
        features = []
        for i in range(len(xDiff)):
            dist = self.distanceCalculator(xDiff[i], yDiff[i])
            polarDistances.append(dist)
            #theta = math.degrees(math.atan2(yDiff[i], xDiff[i]))
            theta = math.atan2(yDiff[i], xDiff[i])
            angles.append(theta)
            features.append(dist)
            features.append(theta)
        return polarDistances, angles, features

    '''This function writes the vector series data and labels in .csv format.
    The length of the series depends on the parameters in __init__'''
    def writeFeatures(self, fileContent, fileName, typeOfFlow, speed):
        features = copy.deepcopy(fileContent)
        #print features
        #speed = float(speed)
        if speed == 'NA' and self.learningProblem != "classification":
            print "speed not available"
            return None
        else:
            print typeOfFlow, speed
        #path = './tuft_real_data/3May/extractedFeatures/'
        path = self.writePath
        with open(path + 'data/' + fileName + '.csv', 'w') as csvfile:
            writer = csv.writer(csvfile, delimiter=',')
            eachLine = []
            counter = 0
            counter2 = 0
            for i in features:
                eachLine.append(i)
                counter += 1
                if counter % self.numberOfFeatures == 0:
                    writer.writerow(eachLine)
                    eachLine = []
                    counter2 += 1
        with open(path + 'labels/' + fileName + 'label.csv', 'w') as csvfile2:
            writer2 = csv.writer(csvfile2, delimiter=',')
            counter = 0
            for i in features:
                counter += 1
                if counter % self.numberOfFeatures == 0:
                    if self.learningProblem == "classification":
                        writer2.writerow(typeOfFlow)
                    else:
                        #print "regression"
                        writer2.writerow([speed])

    def normalizeColumnwiseData(self):
        trainingData, desiredLabel = self.loadExperimentData()
        # Step 1: Check how many columns there are
        noOfColumns = len(trainingData[0])
        trainDataArray = np.asarray(trainingData)
        #print trainingData.shape, noOfColumns
        normalizedData = np.zeros(trainingData.shape)
        for col in range(noOfColumns):
            columnVal = np.asarray(trainingData[:, col])
            #print len(columnVal), len(trainingData)
            # Step 2: For all rows of this column, normalize to zero mean and
            # unit variance
            meanSubtracted = columnVal - np.mean(columnVal)
            normalizedColumn = meanSubtracted/np.std(columnVal)
            # Step 3: Stack the columns one by one
            normalizedData[:, col] = normalizedColumn
        #print normalizedData
        #print normalizedData.shape
        return normalizedData, desiredLabel

    def loadExperimentData(self):
        path = "./tuft_real_data/13June/extractedFeatures/"
        list_of_data_files = glob.glob(path + 'data/*.csv')
        list_of_data_files = sorted(list_of_data_files)
        flagInitial = True
        for file_name in list_of_data_files:
            featureFileName = os.path.splitext(file_name)[0].split("/")[-1]
            #print featureFileName
            data = np.loadtxt(fname=file_name, delimiter=',')
            if flagInitial:
                flagInitial = False
                trainData = data
            else:
                trainData = np.vstack((trainData, data))
        # For reading the labels
        list_of_label_files = glob.glob(path + 'labels/*.csv')
        list_of_label_files = sorted(list_of_label_files)
        flagInitial = True
        for file_name in list_of_label_files:
            featureFileName = os.path.splitext(file_name)[0].split("/")[-1]
            #print featureFileName
            labels = np.loadtxt(fname=file_name, delimiter=',')
            if flagInitial:
                flagInitial = False
                trainLabel = labels
            else:
                trainLabel = np.concatenate((trainLabel, labels), axis=0)
        return trainData, trainLabel

    def svmClassifier(self):
        trainData, trainLabel = self.normalizeColumnwiseData()
        print "total available data", trainData.shape[0]
        data_train, data_test, label_train, label_test = cross_validation.train_test_split(
            trainData, trainLabel, test_size=self.test_size)
        #self.clf = SVC(C=1.6, gamma=0.002)
        self.clf = self.clf.fit(data_train, label_train)
        print "prediction accuracy", self.clf.score(data_test, label_test)
        print "Number of support vectors used:", len(self.clf.support_vectors_)
        '''# Use the cross_validation score
        clf2 = SVC(C=1.6, gamma=0.002)
        cv = cross_validation.ShuffleSplit(len(trainData), n_iterations=3,
                                           test_size=self.test_size, random_state=0)
        scores = cross_validation.cross_val_score(clf2, trainData, trainLabel, cv=cv)
        print "Accuracy: %0.2f (+/- %0.2f)" % (scores.mean()*100, scores.std()*100 / 2)'''

    def svmRegressor(self):
        trainData, trainLabel = self.loadExperimentData()
        print "total available data", len(trainData)
        data_train, data_test, label_train, label_test = cross_validation.train_test_split(
            trainData, trainLabel, test_size=self.test_size)
        self.clf = self.clf.fit(data_train, label_train)
        print "prediction R^2", self.clf.score(data_test, label_test)
        print "Number of support vectors used:", len(self.clf.support_vectors_)

    def predictResult(self, features):
        testData = copy.deepcopy(features)
        results = []
        eachLine = []
        counter = 0
        confidence = []
        for i in testData:
            eachLine.append(i)
            counter += 1
            if counter % self.numberOfFeatures == 0:
                results.append(self.clf.predict(eachLine)[0])
                #confidence.append(self.clf.predict_proba(eachLine))
                eachLine = []
        #print results
        #print confidence
        self.returnFrames(results)

    def returnFrames(self, results):
        results = copy.deepcopy(results)
        lastResult = 1
        frameCounter = 0
        for r in results:
            if r != lastResult:
                print frameCounter*self.featuresPerSeries, lastResult
                lastResult = r
            frameCounter += 1
        print frameCounter*self.featuresPerSeries, lastResult
class Trainer():

    def __init__(self):
        with open('credentials.json') as credentials_file:
            credentials = json.load(credentials_file)
        passwd = credentials['mysql']['password']
        self.con = mdb.connect(host='127.0.0.1', port=3306, user='******',
                               passwd=passwd, db='insight', autocommit=True)
        self.cur = self.con.cursor()
        print "Connected to database"
        self.load_data()

    def load_data(self):
        f = open('./pickles/mysql_dump.pickle', 'rb')
        self.loanData = pickle.load(f)
        self.loanData = pd.DataFrame(self.loanData)
        f.close()

    def drop_na(self):
        self.loanData = self.loanData.dropna()
        self.loanData.index = range(len(self.loanData))

    def drop_columns(self):
        # drop the columns with malformed data in the mysql db
        self.loanData = self.loanData.drop(['none', 'educational', 'IA', 'IDAHO',
                                            'ME', 'NE', 'other_housing',
                                            'issue_year'], 1)

    def drop_prepaid_loans(self):
        indices_to_drop = []
        for i in range(len(self.loanData)):
            if self.loanData['loan_status'][i] == 1 and self.loanData['days_to_zero_dollars'][i] < 1000:
                indices_to_drop.append(i)
        self.loanData = self.loanData.drop(indices_to_drop, 0)
        print "Number of prepaid loans: ", len(indices_to_drop)
        print "Number of loans after dropping prepaids: ", len(self.loanData)

    def define_features_targets(self, kind="regression"):
        # take out 1000 random loans with 36-month terms for testing;
        # the ids are already populated in test_loans for consistency
        test_ids = []
        sql_query = "select id from test_loans;"
        self.cur.execute(sql_query)
        sql_resp = self.cur.fetchall()
        print "length of sql response: ", len(sql_resp)
        for val in sql_resp:
            test_ids.append(val[0])
        print "length of test_ids: ", len(test_ids)
        # make the test and train data frames
        self.testLoanData = self.loanData[self.loanData['id'].isin(test_ids)]
        self.trainLoanData = self.loanData[~self.loanData['id'].isin(test_ids)]
        self.testLoanData.index = range(len(self.testLoanData))
        self.trainLoanData.index = range(len(self.trainLoanData))
        print "Train Loan Data: ", len(self.trainLoanData)
        print "Test Loan Data: ", len(self.testLoanData)
        self.features = self.trainLoanData.drop(['loan_status', 'days_to_zero_dollars', 'id'], 1)
        self.features = self.features.values
        # choose different target variables for regression vs classification
        if kind == "regression":
            self.targets = self.trainLoanData['days_to_zero_dollars'].values
            self.y_test = self.testLoanData['days_to_zero_dollars'].values
        elif kind == "classification":
            self.targets = self.trainLoanData['loan_status'].values
            self.y_test = self.testLoanData['loan_status'].values

    def preprocess(self):
        (self.X_train, self.X_cv,
         self.y_train, self.y_cv) = dm.split_train_test(features=self.features,
                                                        targets=self.targets,
                                                        test_size=0.1)
        self.X_test = self.testLoanData.drop(['loan_status', 'days_to_zero_dollars', 'id'], 1).values
        (self.X_train, self.X_cv) = dm.standardize_samples(self.X_train, self.X_cv)
        (self.X_train, self.X_cv) = dm.scale_samples_to_range(self.X_train, self.X_cv)
        (self.X_test, _) = dm.standardize_samples(self.X_test, self.X_test)
        (self.X_test, _) = dm.scale_samples_to_range(self.X_test, self.X_test)

    def define_dummy_classifier(self):
        self.clf = DummyClassifier()

    def define_rfr(self, n_estimators=10):
        self.regr = RandomForestRegressor(n_estimators=n_estimators, oob_score=True)
        print self.regr.get_params()

    def define_linear_regressor(self):
        self.regr = LinearRegression()
        print self.regr.get_params()

    def define_SVR(self, C=1, gamma=0.1):
        self.regr = SVR(C=C, gamma=gamma, verbose=3)
        print self.regr.get_params()

    def define_logistic_regressor(self, penalty="l2", C=1.0, class_weight=None):
        self.clf = LogisticRegression(penalty=penalty, C=C, class_weight=class_weight)
        print self.clf.get_params()

    def define_rfc(self, n_estimators=10):
        self.clf = RandomForestClassifier(n_estimators=n_estimators, oob_score=True)
        print self.clf.get_params()

    def train(self, kind="regression"):
        print "Fitting training data"
        if kind == "regression":
            self.regr.fit(self.X_train, self.y_train)
        elif kind == "classification":
            self.clf.fit(self.X_train, self.y_train)

    def predict(self, X, kind="regression"):
        if kind == "regression":
            self.prediction = self.regr.predict(X)
        elif kind == "classification":
            self.prediction = self.clf.predict(X)

    def score(self, X, y, kind="regression"):
        if kind == "regression":
            score_val = self.regr.score(X, y)
            print "R2 Score: ", score_val
        elif kind == "classification":
            score_val = self.clf.score(X, y)
            print "Accuracy: ", score_val
            print classification_report(y, self.prediction)
            self.precision = precision_score(y, self.prediction, labels=[0, 1, 2], average=None)
            print "\n\nPrecision Score: ", self.precision, "\n\n"
            self.accuracy = accuracy_score(y, self.prediction)

    def test(self, kind="regression"):
        # run clf and regr on the test data to determine the top 100 loans;
        # the top loans are the ones least likely to default
        if kind == "regression":
            pred = self.regr.predict(self.X_test)
            print "length of regression pred: ", len(pred)
            for i, loan in enumerate(self.testLoanData['id']):
                sql_query = "UPDATE test_loans SET pred_days_to_zero_dollars=%s where id='%s';" % (
                    pred[i], self.testLoanData['id'][i])
                self.cur.execute(sql_query)
                print i
        elif kind == "classification":
            pred_proba = self.clf.predict_proba(self.X_test)
            for i, loan in enumerate(self.testLoanData['id']):
                sql_query = "UPDATE test_loans SET pred_default=%s, pred_paid=%s, pred_prepaid=%s where id='%s';" % (
                    pred_proba[i][0], pred_proba[i][1], pred_proba[i][2],
                    self.testLoanData['id'][i])
                self.cur.execute(sql_query)
        self.con.close()

    def run_pca(self, n_components=20):
        self.pca = PCA(n_components=n_components)
        self.X_train = self.pca.fit_transform(self.X_train)
        print "Reduced data down to ", self.pca.n_components_, " dimensions: "
        print "Transforming cv data ..."
        self.X_cv = self.pca.transform(self.X_cv)
        print "Transforming test data ..."
        self.X_test = self.pca.transform(self.X_test)

    def plot_prediction(self):
        plt.scatter(self.prediction, self.y_cv)
        plt.xlabel('prediction')
        plt.ylabel('y_cv')
        plt.show()

    def runSVRGridSearch(self):
        C_vals = [0.01, 0.1, 1, 10, 100]
        gamma_vals = [1E-2, 1E-1, 1, 1E1, 1E2, 1E3, 1E4]
        for C in C_vals:
            for gamma in gamma_vals:
                print "\n\n C: ", C, " gamma: ", gamma
                self.define_SVR(C=C, gamma=gamma)
                self.train()
                print "Training Scores:"
                self.predict(self.X_train)
                self.score(self.X_train, self.y_train)
                print "Testing Scores:"
                self.predict(self.X_cv)
                self.score(self.X_cv, self.y_cv)

    def roc(self):
        '''Compute ROC curves using the one-vs-all technique'''
        pred_proba = self.clf.predict_proba(self.X_cv)
        fpr = []
        tpr = []
        thresholds = []
        for i in [0, 1, 2]:
            fpr_i, tpr_i, thresholds_i = roc_curve(self.y_cv, pred_proba[:, i], pos_label=i)
            fpr.append(fpr_i)
            tpr.append(tpr_i)
            thresholds.append(thresholds_i)
            print "AUC: ", auc(fpr_i, tpr_i)
        plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6))
        plt.plot(fpr[0], tpr[0], label="Default", linewidth=3)
        plt.xlim([-0.05, 1.05])
        plt.ylim([-0.05, 1.05])
        plt.show()

    def pickle_algo(self, X, fileName):
        print "pickling algorithm"
        f = open(fileName, 'wb')
        pickle.dump(X, f)
        f.close()
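# A hedged end-to-end sketch of how the Trainer above appears intended to be
# driven for the SVR regression path; the call order is inferred from the
# method definitions, not documented in the source:
trainer = Trainer()                  # connects to MySQL and loads the pickle
trainer.drop_na()
trainer.drop_columns()
trainer.drop_prepaid_loans()
trainer.define_features_targets(kind="regression")
trainer.preprocess()
trainer.define_SVR(C=10, gamma=0.1)
trainer.train(kind="regression")
trainer.predict(trainer.X_cv, kind="regression")
trainer.score(trainer.X_cv, trainer.y_cv, kind="regression")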
regr_rbf_3 = SVR(kernel="rbf", C=1.0, gamma=0.0002, epsilon=0.1)

# Train the models using the training sets
regr_linear.fit(annual_index_feature, annual_temp)
regr_rbf_1.fit(annual_index_feature, annual_temp)
regr_rbf_2.fit(annual_index_feature, annual_temp)
regr_rbf_3.fit(annual_index_feature, annual_temp)

# The coefficients
#print 'Coefficients:', regr.coef_
# The mean squared error
print("Residual sum of squares: %.2f"
      % np.mean((regr_rbf_1.predict(annual_index_feature) - annual_temp) ** 2))
# R^2 score: 1 is perfect prediction
print('score1: %.2f' % regr_rbf_1.score(annual_index_feature, annual_temp))
print('score2: %.2f' % regr_rbf_2.score(annual_index_feature, annual_temp))
print('score3: %.2f' % regr_rbf_3.score(annual_index_feature, annual_temp))

# Plot outputs
plt.figure(figsize=(20, 5))
plt.bar(annual_index, annual_temp, width=0.7, edgecolor="none",
        color=(annual_temp > 0).map({True: 'r', False: 'b'}),
        label="Annual Average Global Anomaly", alpha=0.3)
plt.plot(prediction_annual_index[:], regr_linear.predict(prediction_annual_index[:]),
         color='green', linewidth=3, alpha=0.5, label="Linear Prediction")
plt.plot(prediction_annual_index[:], regr_rbf_1.predict(prediction_annual_index[:]),
         color='blue', linewidth=3, alpha=0.5, label="RBF1 Prediction")
plt.plot(prediction_annual_index[:], regr_rbf_2.predict(prediction_annual_index[:]),
         color='orange', linewidth=3, alpha=0.5, label="RBF2 Prediction")
plt.plot(prediction_annual_index[:], regr_rbf_3.predict(prediction_annual_index[:]),
         color='red', linewidth=3, alpha=0.5, label="RBF3 Prediction")
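# The excerpt cuts off mid-plot; under the same conventions a plausible
# (assumed, not from the source) finishing touch is a legend and render call:
plt.legend(loc='best')
plt.show()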
# Splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                    random_state=44, shuffle=True)
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)
print("=" * 25)

# ----------------------------------------------------
# Applying SVR Model
SVRModel = SVR(kernel='rbf', C=20.0)
SVRModel.fit(X_train, y_train)

# ----------------------------------------------------
# Calculating Details
print('SVRModel Train Score is : ', SVRModel.score(X_train, y_train))
print('SVRModel Test Score is : ', SVRModel.score(X_test, y_test))
print("=" * 25)

# ----------------------------------------------------
# Calculating Prediction
y_pred = SVRModel.predict(X_test)
print('Predicted Value for SVRModel is : ', y_pred[:5])
print('True Value for SVRModel is : ', y_test[:5])
print("=" * 25)

# ----------------------------------------------------
# Calculating Mean Absolute Error
MAEValue = mean_absolute_error(y_test, y_pred, multioutput='uniform_average')
print('Mean Absolute Error Value is : ', MAEValue)

# ----------------------------------------------------
# Calculating Mean Squared Error
MSEValue = mean_squared_error(y_test, y_pred, multioutput='uniform_average')
print('Mean Squared Error Value is : ', MSEValue)
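# ----------------------------------------------------
# A small follow-on sketch (not in the source): the median absolute error is
# often reported alongside MAE/MSE because it is robust to outliers, and it is
# available from the same sklearn.metrics module used above.
from sklearn.metrics import median_absolute_error
MdAEValue = median_absolute_error(y_test, y_pred)
print('Median Absolute Error Value is : ', MdAEValue)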
from sklearn.naive_bayes import GaussianNB  # GaussianNB is not in sklearn.linear_model


class LinearModels(object):

    def __init__(self, train_df, test_df):
        self.train_df = train_df
        self.test_df = test_df

    def fittingModels(self, predictors, out_fv, kernel="linear"):
        self.predictors = predictors
        self.out_fv = out_fv
        self.kernel = kernel
        # linear regression
        self.lr_model = lm.LinearRegression().fit(
            self.train_df.loc[:, self.predictors].values,
            self.train_df.loc[:, self.out_fv].values
        )
        # simple lasso model
        self.lasso_model = lm.Lasso(alpha=0.1).fit(
            self.train_df.loc[:, self.predictors].values,
            self.train_df.loc[:, self.out_fv].values
        )
        # Gaussian Naive Bayes (a classifier, imported from sklearn.naive_bayes)
        self.nb_model = GaussianNB().fit(
            self.train_df.loc[:, self.predictors].values,
            self.train_df.loc[:, self.out_fv].values
        )
        # Bayesian Ridge model - adapts to the data at hand via a regularized parameter
        self.br_model = lm.BayesianRidge().fit(
            self.train_df.loc[:, self.predictors].values,
            self.train_df.loc[:, self.out_fv].values
        )
        # ARD Regression model
        self.ard_model = lm.ARDRegression().fit(
            self.train_df.loc[:, self.predictors].values,
            self.train_df.loc[:, self.out_fv].values
        )
        # SVR with a linear kernel by default
        self.svm = SVR(C=1.0, epsilon=0.2, kernel=self.kernel).fit(
            self.train_df.loc[:, self.predictors].values,
            self.train_df.loc[:, self.out_fv].values
        )
        # if the number of dimensions is significantly larger than the number
        # of points, LARS Lasso can be used
        self.lars_model = lm.LassoLars(alpha=0.1).fit(
            self.train_df.loc[:, self.predictors].values,
            self.train_df.loc[:, self.out_fv].values
        )

    def predictions(self):
        print "Simple Linear Regression Prediction"
        print self.lr_model.score(self.test_df.loc[:, self.predictors].values,
                                  self.test_df.loc[:, self.out_fv].values)
        self.lr_model_predict = self.lr_model.predict(self.test_df.loc[:, self.predictors].values)
        print "####################################"
        print "Simple LASSO Regression Prediction"
        print self.lasso_model.score(self.test_df.loc[:, self.predictors].values,
                                     self.test_df.loc[:, self.out_fv].values)
        self.lasso_model_predict = self.lasso_model.predict(self.test_df.loc[:, self.predictors].values)
        print "####################################"
        print "Bayesian Ridge Regression Prediction"
        print self.br_model.score(self.test_df.loc[:, self.predictors].values,
                                  self.test_df.loc[:, self.out_fv].values)
        self.br_model_predict = self.br_model.predict(self.test_df.loc[:, self.predictors].values)
        print "####################################"
        print "ARD Regression Prediction"
        print self.ard_model.score(self.test_df.loc[:, self.predictors].values,
                                   self.test_df.loc[:, self.out_fv].values)
        self.ard_model_predict = self.ard_model.predict(self.test_df.loc[:, self.predictors].values)
        print "####################################"
        print "Support Vector Regression Prediction"
        print self.svm.score(self.test_df.loc[:, self.predictors].values,
                             self.test_df.loc[:, self.out_fv].values)
        self.svm_predict = self.svm.predict(self.test_df.loc[:, self.predictors].values)
        print "####################################"
        print "LARS LASSO Regression Prediction"
        print self.lars_model.score(self.test_df.loc[:, self.predictors].values,
                                    self.test_df.loc[:, self.out_fv].values)
        self.lars_model_predict = self.lars_model.predict(self.test_df.loc[:, self.predictors].values)
        print "####################################"
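# A hedged usage sketch for LinearModels: train_df and test_df are assumed to
# be pandas DataFrames whose columns include the predictor names and the
# output-feature column (out_fv). The column names here are placeholders.
models = LinearModels(train_df, test_df)
models.fittingModels(predictors=['feat_a', 'feat_b'], out_fv='target', kernel='linear')
models.predictions()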
Y = Y[:-predict_Price]
#print(Y)

# Now split the data. Note that test_size=0.8 puts 20% of the rows into
# training and 80% into testing. More training data -> a better model; more
# testing data -> a more reliable estimate of accuracy.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.8)

# Now using SVM (Regressor). C is the regularization parameter and gamma
# controls the influence of a single training example: smaller gamma ~ wider
# influence ~ a more constrained model.
s_Vector_Reg = SVR(kernel='rbf', C=1e3, gamma=0.1)
s_Vector_Reg.fit(x_train, y_train)

# Testing the model: score() returns the coefficient of determination R^2 of
# the prediction (best possible score = 1).
svm_Confidence_Value = s_Vector_Reg.score(x_test, y_test)
print("svm confidence value : ", svm_Confidence_Value)

# we create the linear regression model
linear_Regression = LinearRegression()
# now we train the model
linear_Regression.fit(x_train, y_train)
# the confidence score indicates how much a prediction can be trusted
linear_Regression_Confidence_Value = linear_Regression.score(x_test, y_test)
print("linear regression confidence value : ", linear_Regression_Confidence_Value)
# HENCE THE LINEAR REGRESSION MODEL SCORES BETTER THAN THE SVM MODEL HERE

# Set x_Predict_ equal to the last predict_Price rows of the original data set
# (all columns except 'Prediction_Price'); this is the data we will predict on
x_Predict_ = np.array(data_Stocks.drop(['Prediction_Price'], 1))[-predict_Price:]
#print(x_Predict_)
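# A hedged continuation (not in the source): feed the held-back rows through
# whichever model scored higher; the comment above concludes the linear model
# won for this run.
forecast = linear_Regression.predict(x_Predict_)
print("forecast for the next", predict_Price, "periods : ", forecast)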
print('X : ', len(X), '*', len(X[0]), ' Y : ', len(y), '*', '1')

# Train on the first train_factor fraction of the data
train_idx = int(len(df) * train_factor)

# create train and test data
X_train, y_train, X_test, y_test = X[:train_idx], y[:train_idx], X[train_idx:], y[train_idx:]
ts_train, ts_test = ts[:train_idx], ts[train_idx:][seq_lag:]

# fit and predict
clf = SVR(kernel='rbf', C=1, epsilon=0.1)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# score against the true targets (scoring against y_pred itself would
# trivially return ~1.0)
score = clf.score(X_test, y_test)
print(score)
print('length of output data are : ')
print(len(X_test), len(y_pred), len(y_test), len(ts_test), len(ts_train),
      len(X_train), len(y_train))
print('-----')

from sklearn.metrics import mean_squared_error
from math import sqrt

ms = mean_squared_error(y_test, y_pred)
rms = sqrt(ms)
print('MSE is ', ms)
print(' ------------------------------------')


def mean_absolute_percentage_error(y_true, y_pred):
    # standard MAPE: mean of |(actual - predicted) / actual|, in percent
    ape = []
    for actual, predicted in zip(y_true, y_pred):
        ape.append(abs((actual - predicted) / actual))
    return np.mean(ape) * 100
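# Usage of the helper above on the arrays already in scope:
mape = mean_absolute_percentage_error(y_test, y_pred)
print('MAPE is ', mape)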
#parse_data = parse_csv(Train_File_name)
#X, Y = create_dataset(parse_data)  # input features & output label
failure = [12, 15, 28, 39, 53, 53, 60, 60, 60, 63, 68, 68, 82, 91, 97, 97,
           102, 103, 103, 104, 105, 109, 109, 113, 125, 126, 131, 158, 165,
           166, 166, 173, 183, 189, 193, 194, 202, 204, 214, 229, 230, 235,
           235, 237, 238, 238, 239, 243, 251, 253, 257, 260, 263, 266, 268,
           271, 271, 272, 274, 279, 284, 288, 288, 291, 293, 299, 305, 308,
           323, 323, 327, 328, 333, 336, 347, 349, 369, 389, 392, 393, 405,
           410, 411, 411, 417, 435, 435, 435, 435, 441, 441, 453, 467, 468,
           488, 509, 512, 517, 558, 559, 573, 587, 644, 644, 655, 728, 734,
           769, 783, 994, 1064]
Y = []
X = []
for i in range(1, len(failure) + 1):
    X.append([i])
for fail in failure:
    Y.append(fail)

#**************
# Now create & train our regressor
clsf = SVR(kernel='rbf', C=1e2, gamma=0.1)
print 'Training Started ..'
clsf.fit(X, Y)

# Now load the testing dataset
# Print the accuracy test
print 'Time    Real failure    Predicted failure'
prediction = clsf.predict(X)
for i in range(len(X)):
    print X[i], '  ', Y[i], '  ', prediction[i]
print 'R-squared score', clsf.score(X, Y)
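# A hedged one-step extrapolation (not in the source): ask the fitted model
# for the failure time at the next, unseen index. RBF SVRs extrapolate poorly
# outside the training range, so treat this as illustrative only.
next_index = len(failure) + 1
print 'predicted failure time for index', next_index, ':', clsf.predict([[next_index]])[0]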
lm.fit(x_train, y_train)
print lm.score(x_test, y_test)
print zip(features, lm.coef_)
##### score = 0.691

# random forest (tree ensembles expose feature_importances_, not coef_)
rf = RandomForestRegressor(n_estimators=1000, n_jobs=-1, max_features='sqrt')
rf.fit(x_train, y_train)
print rf.score(x_test, y_test)
print zip(features, rf.feature_importances_)
##### score = 0.667

# svm - linear kernel (coef_ has shape (1, n_features), hence the [0])
svr_linear = SVR(kernel='linear', C=.5)
svr_linear.fit(x_train, y_train)
print svr_linear.score(x_train, y_train)
print zip(features, svr_linear.coef_[0])
##### score = 0.686

# svm - rbf kernel (no coef_ here: the RBF kernel has no primal weights)
svr_rbf = SVR(kernel='rbf', C=.5)
svr_rbf.fit(x_train, y_train)
print svr_rbf.score(x_train, y_train)
##### score = 0.700

# let's transform our dependent variable
y_train_log = np.log(y_train)
y_test_log = np.log(y_test)
# fill in nan and inf
y_test_log = np.nan_to_num(y_test_log)
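# A hedged continuation of the log-transform idea the block above sets up:
# refit the linear-kernel SVR on log(y) and score it against the log-scale
# targets (svr_log is our name, not the source's).
svr_log = SVR(kernel='linear', C=.5)
svr_log.fit(x_train, y_train_log)
print svr_log.score(x_test, y_test_log)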
X2.append(list(get_composition_descriptors(c).values()))

X = np.array(X)
X2 = np.array(X2)
y = np.array(y)

#pca = PCA(n_components=10, whiten=True)
#X = pca.fit_transform(X)

train_x, test_x, train_y, test_y = cross_validation.train_test_split(
    X, y, train_size=0.5)

# fit a linear baseline, then train the SVR on its residuals
lin = linear_model.LinearRegression()
lin.fit(train_x, train_y)
train_y -= lin.predict(train_x)
test_y -= lin.predict(test_x)  # score the residual model against residuals too

parameters = {
    'n_estimators': [10, 100, 500],
    'max_depth': [2, 3, 4],
    'min_samples_split': [1, 2, 3, 4],
    'learning_rate': [0.001, 0.01, 0.1]
}
#gbr = GradientBoostingRegressor()
#clf = grid_search.GridSearchCV(gbr, parameters)

clf = SVR()
clf.fit(train_x, train_y)
print(clf.score(test_x, test_y))
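# Hedged follow-up (not in the source): the full-scale prediction recombines
# the linear baseline with the SVR's residual estimate.
full_pred = lin.predict(test_x) + clf.predict(test_x)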
def SupportVectorRegression(symbol, seriesSet, c=100, Gamma=0.01, Epsilon=0.1,
                            oosd_bin_size=10, oosd_lookback=100):  # returnList, indexId,
    """
    Fit an epsilon-SVR to each price series, score it walk-forward on
    out-of-sample bins, and make a next-day prediction.

    Parameters
    ----------
    seriesSet : sequence of array-like
        The price series (high, low, open, close, ...) to fit.

    Returns
    -------
    tuple
        One next-day prediction array and one mean walk-forward score per
        series in seriesSet.
    """
    fileName = "Images/SVR/%s.png" % (symbol)
    labels = ["High", "Low", "Open", "Close"]
    colors = ["r", "g", "b", "c"]
    # import matplotlib.pyplot as plt
    predictionSets = []
    scoreSets = []
    count = 0
    for series in seriesSet:
        # Perform an analysis of the model with ISD (in-sample data) and
        # OOSD (out-of-sample data).
        full_series = np.array(series)
        # H = len(series) - oosd_lookback  # Length of history data
        # L = oosd_lookback - oosd_bin_size  # Length of analysis lookback
        l = oosd_bin_size  # Length of out-of-sample AND prediction domain
                           # (how many days are forecasted)
        power = 1  # N where X^N for the weight function
        prediction_history = []
        for i in np.arange(oosd_lookback / oosd_bin_size):
            # Index of the current in-sample and out-of-sample data.
            # There are 3 cases of this slicing.
            if i == 0:
                # First run: only two bins to work with (the first OOSD bin
                # and the rest of the data).
                ISD = full_series[l:]
                OOSD = full_series[:l]
                X = np.arange(l, len(full_series))
                # use a variable weight (~0 - 1.0)
                weight_training = (
                    np.power(np.arange(l, len(full_series), dtype=float), power)[::-1]
                    / np.power(np.arange(l, len(full_series), dtype=float), power)[::-1].max()
                )
                # use a variable weight, focused on next-day prediction (~0 - 1.0 - ~0)
                weight_score = np.concatenate(
                    (
                        np.power(np.arange(1, l + 1, dtype=float), power)
                        / np.power(np.arange(1, l + 1, dtype=float), power).max(),
                        np.power(np.arange(l + 1, len(full_series) + 1, dtype=float), power)[::-1]
                        / np.power(np.arange(l + 1, len(full_series) + 2, dtype=float), power)[::-1].max(),
                    )
                )
            elif i == oosd_lookback / oosd_bin_size - 1:
                # Last run: only two bins to work with (the last OOSD bin and
                # the rest of the data).
                ISD = full_series[:-l]
                OOSD = full_series[-l:]
                X = np.arange(0, len(full_series) - l)
                # use a variable weight (~0 - 1.0)
                weight_training = (
                    np.power(np.arange(l, len(full_series), dtype=float) + 1, power)
                    / np.power(np.arange(l, len(full_series), dtype=float) + 1, power).max()
                )
                # use a variable weight, focused on next-day prediction (~0 - 1.0 - ~0)
                weight_score = np.concatenate(
                    (
                        np.power(np.arange(1, len(full_series) - l + 1, dtype=float), power)
                        / np.power(np.arange(1, len(full_series) - l + 2, dtype=float), power).max(),
                        np.power(np.arange(1, l + 1, dtype=float), power)[::-1]
                        / np.power(np.arange(1, l + 1, dtype=float), power)[::-1].max(),
                    )
                )
            else:
                # Any other run: we have a sandwich of OOSD in the middle of
                # two ISD sets, so we need to aggregate.
                ISD = np.concatenate((full_series[:(l * i)], full_series[l * (i + 1):]))
                OOSD = full_series[l * i:l * (i + 1)]
                X = np.concatenate((np.arange(0, (l * i)), np.arange(l * (i + 1), len(full_series))))
                # use a variable weight (~0 - 1.0)
                weight_training = np.concatenate(
                    (
                        np.power(np.arange(1, l * i + 1, dtype=float), power)
                        / np.power(np.arange(1, l * i + 1, dtype=float), power).max(),
                        np.power(np.arange(l * (i + 1), len(full_series), dtype=float), power)[::-1]
                        / np.power(np.arange(l * (i + 1), len(full_series), dtype=float), power)[::-1].max(),
                    )
                )
                # use a variable weight, focused on next-day prediction (~0 - 1.0 - ~0)
                weight_score = np.concatenate(
                    (
                        np.power(np.arange(1, l * (i + 1) + 1, dtype=float), power)
                        / np.power(np.arange(1, l * (i + 1) + 1, dtype=float), power).max(),
                        np.power(np.arange(l * (i + 1), len(full_series), dtype=float), power)[::-1]
                        / np.power(np.arange(l * (i + 1), len(full_series) + 1, dtype=float), power)[::-1].max(),
                    )
                )
            # Domain and range of training data
            # X = np.arange(len(ISD))
            X = np.atleast_2d(X).T
            y = ISD
            # Domain of the prediction set
            # x = np.atleast_2d(np.linspace(0, len(ISD)+len(OOSD)-1, len(ISD)+len(OOSD))).T
            # x = np.atleast_2d(np.linspace(len(ISD), len(ISD)+len(OOSD)-1, len(OOSD))).T
            x = np.atleast_2d(np.linspace(0, len(full_series) - 1, len(full_series))).T
            # epsilon-Support Vector Regression using scikit-learn
            # Read more here: http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html
            SVR_model = SVR(kernel="rbf", C=c, gamma=Gamma, epsilon=Epsilon)
            SVR_model.fit(X, y, weight_training)
            y_predSVR = SVR_model.predict(x)
            if np.isnan(full_series).any() or np.isinf(full_series).any():
                log.debug(symbol + " Failed due to data INF or NAN")
                y_score = 0
                break
            else:
                y_score = SVR_model.score(x, full_series, weight_score)
            # log.debug(y_score)
            prediction_history.append(y_score)
        score = np.mean(y_score)

        # Make the next day's prediction
        X = np.arange(series.shape[0])
        X = np.atleast_2d(X).T
        y = series
        startTime = time.time()
        # Mesh the input space: a single point, one step past the series end
        x = np.atleast_2d(np.linspace(len(series), len(series), 1)).T
        # kernel options: 'rbf', 'linear', 'poly', 'sigmoid', 'precomputed'
        SVR_model = SVR(kernel="rbf", C=c, gamma=Gamma, epsilon=Epsilon)
        # use a variable weight, focused on next-day prediction (~0 - 1.0)
        weight_training = (
            np.power(np.arange(1, len(X) + 1, dtype=float), power)
            / np.power(np.arange(1, len(X) + 1, dtype=float), power).max()
        )
        # Fit to the data with the linearly increasing sample weights
        SVR_model.fit(X, y, weight_training)
        # Make the prediction on the meshed x-axis
        y_pred = SVR_model.predict(x)
        # print SVR_model.score(x, y_pred)
        predictionSets.append(y_pred)
        scoreSets.append(score)
        # print "{0:0.1f} minutes to compute the SVR fit.".format((time.time() - startTime)/60.0)
        count += 1
    lookBack = -1
    return (
        predictionSets[0],
        predictionSets[1],
        predictionSets[2],
        predictionSets[3],
        predictionSets[4],
        scoreSets[0],
        scoreSets[1],
        scoreSets[2],
        scoreSets[3],
        scoreSets[4],
    )
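# A hedged call sketch for the function above. seriesSet is assumed to hold
# five aligned price series (the function indexes predictionSets[0..4], even
# though only four labels are listed); the symbol is a placeholder.
(high_p, low_p, open_p, close_p, extra_p,
 s0, s1, s2, s3, s4) = SupportVectorRegression(
    "AAPL", seriesSet, c=100, Gamma=0.01, Epsilon=0.1,
    oosd_bin_size=10, oosd_lookback=100)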
from sklearn.metrics import mean_squared_error

# In[54]:

def rmse(Y_Test, Y_Pred):
    return np.sqrt(mean_squared_error(Y_Test, Y_Pred))

# In[55]:

from sklearn.svm import SVR

svr = SVR()
svr.fit(X_Train, Y_Train)
svr_score = svr.score(X_Test, Y_Test)
svr_rmse = rmse(Y_Test, svr.predict(X_Test))
svr_score, svr_rmse

# In[62]:

from sklearn.linear_model import LinearRegression

# In[63]:

lr = LinearRegression()

# In[65]:

lr.fit(X_Train, Y_Train)
lr_score = lr.score(X_Test, Y_Test)
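# In[ ]:

# To mirror the SVR cell above, the linear model's RMSE can be computed the
# same way (a hedged completion; the lr_rmse name is ours, not the source's):
lr_rmse = rmse(Y_Test, lr.predict(X_Test))
lr_score, lr_rmse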