def nonlinear_covariate_mat(dset, labels, meta, tissue, cov_matrix):
    '''Calculates non-linear covariates for a Dataset based on nucleotide
    resolution RNA-seq.

    -dset: Pandas DataFrame with RNA-seq expression values.
    -labels: DataFrame used for assigning variables on meta.
    -meta: DataFrame of potential covariates.
    -cov_matrix: DataFrame where R^2 values are stored.
    -tissue: class used.

    For each covariate, samples of the given tissue are projected with an
    RBF KernelPCA and a KernelRidge model is fit from the covariate to the
    principal components; the model's R^2 is written into cov_matrix.

    REFACTOR: the original had three elif branches with byte-identical
    bodies (categorical handling); they are merged into one condition.
    Behavior is unchanged.
    '''
    y_model = copy.deepcopy(dset)
    x_model = copy.deepcopy(labels)
    cov = copy.deepcopy(meta)
    cov_list = cov.columns
    # Restrict to samples of the requested tissue.
    x_model = x_model[x_model[0] == tissue]
    y_model = y_model[y_model.index.isin(x_model.index)]
    pca = KernelPCA(n_components=None, kernel='rbf', random_state=0)
    pc = pca.fit_transform(y_model)
    # Pristine copy of the tissue labels, re-used for every covariate.
    x_ = copy.deepcopy(x_model)
    for w in cov_list:
        sys.stderr.write(tissue + " " + w + "\n")
        x_model = copy.deepcopy(x_)
        covariate = pd.DataFrame(cov.loc[:, w])
        # A covariate is treated as categorical when it is an 'MH'-prefixed
        # float column, an object column, or a non-AGE integer column.
        is_categorical = (
            (w.startswith('MH') and cov[w].dtype == 'float64')
            or covariate[w].dtype == object
            or (covariate[w].dtype == 'int64' and w != 'AGE')
        )
        if is_categorical:
            covariate[w] = covariate.loc[:, w].astype('category').cat.codes
            x_model = assign_val(x_model, covariate, w, 0)
            x_model = pd.get_dummies(x_model)
            lm = KernelRidge(alpha=1, kernel='rbf')
            lm.fit(x_model, pc)
            r2 = lm.score(x_model, pc)
        else:
            # Continuous covariate: scale to [0, 1] (when non-degenerate)
            # and regress the single column against the components.
            x_model = assign_val(x_model, covariate, w, 0)
            if x_model[0].max() != 0.0:
                x_model = x_model / x_model.max()
            lm = KernelRidge(alpha=1, kernel='rbf')
            lm.fit(x_model.values.reshape(-1, 1), pc)
            r2 = lm.score(x_model.values.reshape(-1, 1), pc)
        cov_matrix.loc[w, tissue] = r2
    return cov_matrix
def kernel_ridge(g):
    """Fit an RBF kernel ridge model with gamma *g* on the project's
    train/test split, print train and test scores, and plot the test
    predictions against the true labels."""
    x_train, x_test, y_train, y_test = split_data()
    model = KernelRidge(kernel='rbf', gamma=g)
    model.fit(x_train, y_train)
    print("Kernel score training: ", model.score(x_train, y_train))
    print("Kernel score test: ", model.score(x_test, y_test))
    plot_assignments(model.predict(x_test), y_test)
def choose_krr_kernel(train_x, test_x, train_y, test_y):
    """Score KernelRidge with several kernels and bar-plot the results.

    Trains one KernelRidge per candidate kernel, records its test R^2,
    prints the best kernel/score, and shows a bar chart.

    BUGFIX: the original called krr.predict(test_x) and discarded the
    result — pure wasted work; removed.
    """
    kernels = ['linear', 'rbf', 'laplacian', 'polynomial', 'sigmoid']
    kernel_scores = []
    best_k_score = 0.0
    best_k = ""
    for k in kernels:
        krr = KernelRidge(kernel=k)
        krr.fit(train_x, train_y)
        score = krr.score(test_x, test_y)
        if score > best_k_score:
            best_k_score = score
            best_k = k
        kernel_scores.append(score)
    print(kernel_scores)
    print("Best kernel: " + str(best_k))
    print("Score received: " + str(best_k_score))
    plt.bar(kernels, kernel_scores)
    plt.xlabel('Kernel')
    plt.ylabel('Score')
    plt.xticks(np.arange(len(kernels)), kernels)
    plt.title('Tuning Kernel Hyperparameter for KRR')
    plt.show()
def train_krrl_linear(self, data):
    """Train a KernelRidge model on (train, validation) data, report R^2 and
    validation MAE, pickle the model, and return test-set predictions.

    -data: tuple of ((x_tr, y_tr), (x_val, y_val)).
    Returns predictions for self.x_test.
    """
    train, validacion = data
    x_tr, y_tr = train
    x_val, y_val = validacion
    #print("El set de train tiene {} filas y {} columnas".format(x_tr.shape[0],x_tr.shape[1]))
    #print("El set de validacion tiene {} filas y {} columnas".format(x_val.shape[0],x_val.shape[1]))
    print('Start training KernerRidge with linear kernel...')
    start_time = self.timer()
    krrl = KernelRidge(alpha=1)
    krrl.fit(x_tr, y_tr)
    print("The R2 is: {}".format(krrl.score(x_tr, y_tr)))
    # print("The alpha choose by CV is:{}".format(krrl.alpha_))
    self.timer(start_time)
    print("Making prediction on validation data")
    # Targets appear to be log1p-transformed upstream; invert with expm1
    # before computing the MAE — TODO confirm against the preprocessing step.
    y_val = np.expm1(y_val)
    y_val_pred = np.expm1(krrl.predict(x_val))
    mae = mean_absolute_error(y_val, y_val_pred)
    print("El mean absolute error de es {}".format(mae))
    print('Saving model into a pickle')
    # BUGFIX: was `try: os.mkdir(...) except: pass`, which silently swallowed
    # ALL errors (permissions, bad path, even KeyboardInterrupt). makedirs
    # with exist_ok=True only tolerates the directory already existing.
    os.makedirs('pickles', exist_ok=True)
    with open('pickles/krrlLinearK.pkl', 'wb') as f:
        pickle.dump(krrl, f)
    print('Making prediction and saving into a csv')
    y_test = krrl.predict(self.x_test)
    return y_test
def prin(X,y,file,dic): t=100 #clf = MLPRegressor(solver=dic['solver'],activation=dic['activation'],hidden_layer_sizes=eval(dic['hls']), batch_size = dic['batch_size'], max_iter=dic['max_iter']) #clf = LinearRegression() clf=KernelRidge(alpha=0.001,kernel='laplacian',degree=18) X_train, X_test, y_train, y_test= cross_validation.train_test_split(X,y,test_size=float(dic['test_size'])) clf.fit(X_train, y_train) print 'Training size',len(X_train) print 'Testing size',len(X_test) #scores = cross_val_score(clf, X, y, cv=5) #print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) accuracy = clf.score(X_train,y_train) print 'accuracy',accuracy,'\n' print 'RMSE',math.sqrt(metrics.mean_squared_error(y_test,clf.predict(X_test))) MAE=metrics.mean_absolute_error(y_test,clf.predict(X_test)) print 'MAE',MAE #X_test,y_test=X[-t:],y[-t:] #file=file[-t:] pr=clf.predict(X_test) print 'Filename Percentage Error Actual Value Predicted Value Difference\n' for i in range (len(y_test)): if y_test[i]==0.0: y_test[i]=0.0000001 predi=str(round(((pr[i]-y_test[i])/y_test[i])*100,2))+' %' print file[i]+' '*(20-len(file[i])),' '*(20-len(predi))+ predi, ' '*(20-len(str(y_test[i])))+str(y_test[i]) , ' '*(20-len(str(round(pr[i],2))))+str(round(pr[i],2)),' '*(20-len(str(round((y_test[i]-pr[i]),4))))+str(round((y_test[i]-pr[i]),4)) #print 'Mean square Error',mean_squared_error(X,pr) #print 'R2 score',r2_score(X,pr) #test(X,y,file,clf.coef_[0],clf.intercept_[0]) #plot_g(clf) return MAE
def choose_krr_gamma(train_x, test_x, train_y, test_y):
    """Score laplacian-kernel KernelRidge over candidate gammas and plot.

    Trains one model per gamma, records its test R^2, prints the best
    gamma/score, and line-plots score vs gamma.

    BUGFIX: the original called krr.predict(test_x) and discarded the
    result — pure wasted work; removed.
    """
    gammas = [0.001, 0.01, 0.1, 0.25, 0.5, 0.75, 1.0, 2.0]
    gamma_scores = []
    best_g_score = 0.0
    best_g = ""
    for g in gammas:
        krr = KernelRidge(kernel="laplacian", gamma=g)
        krr.fit(train_x, train_y)
        score = krr.score(test_x, test_y)
        if score > best_g_score:
            best_g_score = score
            best_g = g
        gamma_scores.append(score)
    print(gamma_scores)
    print("Best gamma: " + str(best_g))
    print("Score received: " + str(best_g_score))
    plt.plot(gammas, gamma_scores)
    plt.xlabel('Gamma')
    plt.ylabel('Score')
    plt.title('Tuning Gamma Hyperparameter for KRR')
    plt.show()
def choose_krr_alpha(train_x, test_x, train_y, test_y):
    """Score laplacian-kernel KernelRidge over candidate alphas and plot.

    Trains one model per alpha, records its test R^2, prints the best
    alpha/score, and line-plots score vs alpha.

    BUGFIX: the original called krr.predict(test_x) and discarded the
    result — pure wasted work; removed.
    """
    alphas = [0.01, 0.1, 0.25, 0.5, 0.75, 1.0, 2.0]
    alpha_scores = []
    best_a_score = 0.0
    best_a = ""
    for a in alphas:
        krr = KernelRidge(kernel="laplacian", alpha=a)
        krr.fit(train_x, train_y)
        score = krr.score(test_x, test_y)
        if score > best_a_score:
            best_a_score = score
            best_a = a
        alpha_scores.append(score)
    print(alpha_scores)
    print("Best alpha: " + str(best_a))
    print("Score received: " + str(best_a_score))
    plt.plot(alphas, alpha_scores)
    plt.xlabel('Alpha')
    plt.ylabel('Score')
    plt.title('Tuning Alpha Hyperparameter for KRR')
    plt.show()
def krr_predict(krr_params, train_x, test_x, train_y, test_y):
    """Train a KernelRidge with the given hyperparameters and report scores.

    -krr_params: dict with 'alpha', 'gamma' and 'kernel' keys.
    Prints the test R^2, a 10-fold cross-validation std-dev, and returns
    the test R^2.

    IMPROVEMENT: the original recomputed the identical (deterministic)
    krr.score(test_x, test_y) three times; it is now computed once.
    """
    print("Starting KRR prediction")
    a = krr_params['alpha']
    g = krr_params['gamma']
    k = krr_params['kernel']
    krr = KernelRidge(alpha=a, kernel=k, gamma=g)
    krr.fit(train_x, train_y)
    test_score = krr.score(test_x, test_y)
    print("KRR Score: ", test_score)
    cv_score = cross_val_score(krr, test_x, test_y, cv=10)
    print("Cross-Val Standard Deviation: ", np.std(cv_score))
    print("Scores:", test_score)
    return test_score
def KRR(self):
    # Kernel ridge regression baseline: train on the first half of the
    # loaded data, predict the second half, save (actual, predicted) pairs
    # to krr_acc_pred.dat, and print train/test R^2. Python 2 code.
    from sklearn.kernel_ridge import KernelRidge
    ti, to = self.LoadData(True)  # ti: inputs, to: targets
    print "KRR: input shape", ti.shape, " output shape", to.shape
    #krr = KernelRidge()
    krr = KernelRidge(alpha=0.0001, kernel='rbf')
    # 50/50 sequential split (no shuffling).
    trainsize = int(ti.shape[0] * 0.5)
    krr.fit(ti[0:trainsize, :], to[0:trainsize])
    predict = krr.predict(ti[trainsize:, :])
    print predict.shape
    # Column 0: actual targets; column 1: predictions.
    krr_acc_pred = np.zeros((predict.shape[0], 2))
    krr_acc_pred[:, 0] = to[trainsize:].reshape(to[trainsize:].shape[0])
    krr_acc_pred[:, 1] = predict.reshape(predict.shape[0])
    np.savetxt("krr_acc_pred.dat", krr_acc_pred)
    print "KRR train R^2:", krr.score(ti[0:trainsize, :], to[0:trainsize])
    print "KRR test R^2:", krr.score(ti[trainsize:, :], to[trainsize:])
    return
def train_model(df, featureset_keys, kernel="linear", alpha=1.0, gamma=None, degree=None, coef0=None):
    """Train KernelRidge models with repeated 5-fold CV and keep the best.

    -df: DataFrame containing the features and a 'happiness_score' column.
    -featureset_keys: column names used as model inputs.
    Returns (best_model, plt, scores), or None when kernel='polynomial'
    is requested without degree/coef0.

    BUGFIXES vs original:
    - plt.scatter referenced an undefined name `p` (NameError); now pred_y.
    - `baseline` was never updated, so "best_model" was merely the LAST
      model with score > 0, contradicting the "save if better" intent.
    - removed an unreachable `print(pred_scores)` after the return
      (pred_scores was also undefined).
    """
    # Setup Parameters for Model
    kr_args = {"kernel": kernel, "alpha": alpha}
    # Validate parameters for polynomial
    if kernel == "polynomial":
        if degree is None or coef0 is None:
            print("Must provide a parameter for degree and coef0")
            return None
        else:
            kr_args["gamma"] = gamma
            kr_args["degree"] = degree
            kr_args["coef0"] = coef0
    # Initialize the figure size
    plt.figure(figsize=(20, 10))
    # Store the results of each training run
    predictions = []
    scores = []
    # Save the best model to return
    best_model = None
    baseline = 0.0
    i = 0
    for train, test in RepeatedKFold(n_splits=5, n_repeats=1).split(df):
        # Split dataset
        train_x, train_y = (df.iloc[train])[featureset_keys], (df.iloc[train])['happiness_score']
        test_x, test_y = (df.iloc[test])[featureset_keys], (df.iloc[test])['happiness_score']
        # Initialise and train model
        kr_model = KernelRidge(**kr_args)
        kr_model.fit(train_x, train_y)
        # Evaluate model
        pred_y = kr_model.predict(test_x)
        score = kr_model.score(test_x, test_y)
        # Save if better than previous best
        if score > baseline:
            best_model = kr_model
            baseline = score
            predictions.append(pred_y)
            plt.scatter(test_y, pred_y, label=f"iter {i}")
        scores.append(score)
        i = i + 1
    # Identity line: perfect predictions would fall on it.
    plt.plot(df['happiness_score'], df['happiness_score'], label='actual')
    plt.xlabel('True Values')
    plt.ylabel('Predictions')
    plt.legend(loc="upper left", bbox_to_anchor=(1.05, 1))
    return best_model, plt, scores
def regression_biking(X_train, X_test, y_train, y_test):
    """Fit KernelRidge (RBF), Lasso and LinearRegression on the bike data,
    print test and training R^2 for each, then run the project's grid-search
    helper over Ridge and Lasso alphas and print its summary."""
    # allez on apprend !
    fitted = {
        'Ridge': KernelRidge(kernel='rbf'),
        'Lasso': Lasso(),
        'Linear': LinearRegression(),
    }
    for model in fitted.values():
        model.fit(X_train, y_train)

    # show me the numbers !
    print('[ERREUR DE PREDICTION]')
    for name, model in fitted.items():
        print('Classifier {} : {}'.format(name, model.score(X_test, y_test)))

    # erreurs d'apprentissage
    print('[ERREUR D\'APPRENTISSAGE]')
    for name, model in fitted.items():
        print('Classifier {} : {}'.format(name, model.score(X_train, y_train)))

    print('[RUNNING GRID SEARCH]')
    # cross validation
    models = {
        'Ridge': Ridge(),
        'Lasso': Lasso()
    }
    params = {
        'Lasso': {
            'alpha': [1, 5, 10, 20, 30, 50, 70, 100, 1_000, 10_000]
        },
        'Ridge': {
            'alpha': [1, 5, 10, 20, 30, 50, 70, 100, 1_000, 10_000]
        }
    }
    grid = EstimatorSelectionHelper(models, params)
    grid.fit(X_train, y_train, n_jobs=2)
    print(grid.score_summary(sort_by='mean_score', num_rows_per_estimator=5))
def regr(stock, show=False, save=False): training_set, test_set, training_label, test_label = get_train_test_data( stock, l_max=500, l_ratio=4.0, s_max=5, binary=False) # Decision Tree regressor ''' clf = tree.DecisionTreeRegressor() #clf = tree.DecisionTreeClassifier(criterion='entropy') #clf.fit(data_set[0:training_size],b_label[0:training_size]) clf.fit(training_set,training_label) #print 'Decision Tree:',clf.score(data_set[training_size+1:training_size+1+ length_of_validation], b_label[training_size+1:training_size+1+length_of_validation]) print 'Decision Tree:',clf.score(test_set,test_label) ''' kernel_rgr = KernelRidge(alpha=1.0, kernel='linear') kernel_rgr.fit(training_set, training_label) kernel_score = kernel_rgr.score(test_set, test_label) print 'Kernel Regression:', kernel_score # Gaussian Process Regressor ''' gpr = GaussianProcessRegressor(alpha = 1e-3,n_restarts_optimizer = 10,normalize_y=True) gpr.fit(training_set,training_label) print "Gaussain Process Regression:",gpr.score(test_set,test_label) ''' # SVM with Polynomial model ''' #clf_svm = svm.SVC() clf_svm = svm.SVR(kernel='poly', C=1e2, degree=3) clf_svm.fit(training_set,training_label) #print 'SVM:',clf_svm.score(data_set[training_size+1:training_size+1+ length_of_validation], b_label[training_size+1:training_size+1+length_of_validation]) svm_score = clf_svm.score(test_set,test_label) print 'SVM:',svm_score ''' if show: fig = plt.figure() ax = fig.add_subplot(1, 1, 1) #ax.plot(clf_svm.predict(test_set),label='SVM') #ax.plot(clf.predict(test_set),label = 'DT') ax.plot(test_label, label='Actual data') ax.plot(kernel_rgr.predict(test_set), label='Kernel Rgr') ax.set_xlabel("Relative Time Stamp") ax.set_ylabel("Normalized Price") ax.legend() if save: #if kernel_score >=0.3: fig.savefig("regr_output\\" + stock + '_regr_output_' + str(round(kernel_score, 4)) + '.png', dpi=500) plt.close('all') return kernel_score #,svm_score]
def execute(self, dataset):
    # Fit a KernelRidge on a 2D (first-two-feature) view of the dataset and
    # plot its decision surface with train/test points overlaid, using the
    # instance's hyperparameters (alpha, kernel, degree, coef0, gamma).
    # X - feature matrix, y - target depending on X (translated from Russian)
    X, y = dataset
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=self.validation_fraction)
    # Mesh over the first two (standardized) features for the contour plot.
    X0, X1 = X[:, 0], X[:, 1]
    xx, yy = self.make_meshgrid(X0, X1)
    # One rainbow color per distinct label.
    labels = set(y)
    colors = ListedColormap([
        plt.get_cmap(name="rainbow")(each)
        for each in np.linspace(0, 1, len(labels))
    ])
    classifier = KernelRidge(alpha=self.alpha,
                             kernel=self.kernel,
                             degree=self.degree,
                             coef0=self.coef0,
                             gamma=self.gamma)
    classifier.fit(X_train, y_train)
    # NOTE(review): xxr/yyr are computed but never used afterwards.
    xxr, yyr = xx.ravel(), yy.ravel()
    cxy = np.c_[xx.ravel(), yy.ravel()]
    Z = classifier.predict(cxy)
    Z = Z.reshape(xx.shape)
    plt.clf()
    plt.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.8)
    # Train points opaque, test points semi-transparent.
    plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=colors, s=20,
                edgecolors='k')
    plt.scatter(X_test[:, 0], X_test[:, 1], alpha=0.5, c=y_test, cmap=colors,
                s=20, edgecolors='k')
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    score = classifier.score(X_test, y_test)
    plt.title('Kernel Ridge Reduction Classification\n score: ' +
              str(round(score, 5)))
    plt.show()
def _function(self, x):
    """Objective for hyperparameter/weight optimization.

    x[0] and x[1] are log10 of the RBF gamma and the ridge alpha; the
    remaining entries are per-sample weights. Returns the mean 10-fold
    cross-validated R^2 of a weighted KernelRidge fit.
    """
    gamma_value = np.power(10, x[0])
    alpha_value = np.power(10, x[1])
    sample_weights = x[2:]
    (Features, targets) = self._prepareDataset()
    folds = KFold(n_splits=10, shuffle=True)
    fold_scores = []
    for fit_idx, eval_idx in folds.split(Features):
        model = KernelRidge(alpha=alpha_value, kernel='rbf',
                            gamma=gamma_value)
        model.fit(X=Features[fit_idx], y=targets[fit_idx],
                  sample_weight=sample_weights[fit_idx])
        fold_scores.append(model.score(X=Features[eval_idx],
                                       y=targets[eval_idx]))
    return np.mean(fold_scores)
def seleccionarMejorAlgoritmoRegresion(X_train, y_train, X_test, y_test):
    """Train several regression algorithms and return the fitted model with
    the highest test-set R^2 score."""
    '''Importacion de todos los algoritmos que vamos a implementar'''
    from sklearn.kernel_ridge import KernelRidge
    from sklearn import linear_model
    from sklearn.linear_model import Ridge
    from sklearn.tree import DecisionTreeRegressor

    '''Declaracion de los algoritmos y entrenamiento de los algortimos'''
    # Kernel Ridge, Bayesian Ridge, Linear Regression, Ridge Regression,
    # Decision Tree Regression — same candidates as before, same settings.
    candidatos = [
        KernelRidge(kernel="polynomial"),
        linear_model.BayesianRidge(),
        linear_model.LinearRegression(),
        Ridge(alpha=1.0),
        DecisionTreeRegressor(random_state=0),
    ]
    mejor_modelo = None
    mejor_score = None
    for modelo in candidatos:
        modelo.fit(X_train, y_train)
        score = modelo.score(X_test, y_test)
        # Keep the first model achieving the maximum (ties resolved like
        # max() over the original dict: first winner stays).
        if mejor_modelo is None or score > mejor_score:
            mejor_modelo = modelo
            mejor_score = score
    return mejor_modelo
# Boston housing: 70/30 split, then a linear-kernel ridge regression.
X, Y = boston.data, boston.target
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.3)
''' 核岭回归: 在l2正则化的线性模型(岭回归)的基础上,引入了核技术的概念 在岭回归中,用w* = ∑β*z,也就是β代替w 代价函数随之替换一下即可 使用梯度下降求解,β = (λI + K)^-1 * y 特点: 对于中型数据集较快,但对于大数据集就很吃力了 训练时间复杂度O(n^3),挺高的 预测时间复杂度O(n) '''
# (The bare string above is the author's note on kernel ridge regression:
# it extends l2-regularized ridge with the kernel trick; kept verbatim.)
rg = KernelRidge(alpha=1, kernel='linear', gamma=None, degree=3, coef0=1, kernel_params=None)
rg.fit(X_train, Y_train)
Y_pre = rg.predict(X_test)
# NOTE(review): the R^2 returned here is discarded — probably meant to be
# printed or stored; confirm intent.
rg.score(X_test, Y_test)
''' alpha 惩罚项系数 kernel 核函数的选定 gamma 核函数的中的一个参数项 degree 多项式核的程度 coef0 多项式核和sigmoid核中一个参数设定 kernel_params 核函数的附加参数 '''
# K-fold KernelRidge ensembling: per-fold predictions on the full train
# matrix X and on `test` are stored column-wise; out-of-fold predictions
# fill oof_predictions. kf, predictions0/1, test, n_splits and r2_score
# come from earlier in the file.
score = 0
oof_predictions = np.zeros(X.shape[0])
for fold, (train_index, test_index) in enumerate(kf.split(X)):
    X_train, X_valid = X[train_index, :], X[test_index, :]
    y_train, y_valid = y[train_index], y[test_index]
    clf = KernelRidge(alpha=.6)
    clf.fit(X_train, y_train)
    pred0 = clf.predict(X)     # this fold's predictions on all of X
    pred1 = clf.predict(test)  # this fold's predictions on the test matrix
    oof_predictions[test_index] = clf.predict(X_valid)
    #, ntree_limit=model.best_ntree_limit
    predictions0[:, fold] = pred0
    predictions1[:, fold] = pred1
    # NOTE(review): this accumulates TRAINING R^2 per fold, not validation.
    score += clf.score(X_train, y_train)  #.best_score
    print('Fold %d: Score %f'%(fold, clf.score(X_train, y_train)))
#    print('Fold %d: Score %f'%(fold, clf))
# prediction0 = predictions0.mean(axis=1)
prediction1 = predictions1.mean(axis=1)
score /= n_splits
oof_score = r2_score(y, oof_predictions)
print('=====================')
print('Final Score %f'%score)
print ('Final Out-of-Fold Score %f'%oof_score)
print ('=====================')
print("Creating layer 1 prediction CSV files for training and test")
submission = pd.read_csv('T:/RNA/Baltimore/Jason/ad_hoc/mb/input/sample_submission.csv')
# Linear-regression baseline scores, then KernelRidge alpha sweep and a
# GridSearchCV over alpha/gamma. regr, data, inputVariables and the
# X/y_*_sum splits come from earlier in the file.
scores = cross_val_score(regr, data.df[inputVariables].values, data.df['count'].values)
print("Linear Regression cross validation score: ", scores.mean())
regr.fit(X_train_sum, y_train_sum)
print("Linear Regression training score: ", regr.score(X_train_sum, y_train_sum))
print("Linear Regression testing score: ", regr.score(X_test_sum, y_test_sum))

##### Kernel Ridge and Support Vector Regression #####
## Finding the best parameters
alpha = [1, 1e-1, 1e-2, 1e-3]
for a in alpha:
    kr = KernelRidge(kernel='rbf', alpha=a)
    kr.fit(X_train_sum, y_train_sum)
    print("Kernel Ridge train score: ", kr.score(X_train_sum, y_train_sum), " for alpha = %s" % a)
    print("Kernel Ridge test score: ", kr.score(X_test_sum, y_test_sum), " for alpha = %s" % a)

### Using GridSearchCV
param_grid = {
    # BUGFIX: a comma was missing between the two dict entries, which is a
    # SyntaxError in Python.
    'alpha': [1, 1e-1, 1e-2],
    "gamma": np.logspace(-2, 2, 5),
}
GSKernelRidge = GridSearchCV(KernelRidge(kernel='rbf'), param_grid=param_grid)
GSKernelRidge.fit(X_train_sum, y_train_sum)
clfknn.fit(X_train, y_train) # Ridge regression clfridge = Ridge(alpha=2) clfridge.fit(X_train, y_train) # Kernel Ridge regression clfkrr = KernelRidge(alpha=0.5) clfkrr.fit(X_train, y_train) X_test = X[1000:1800] y_test = y[1000:1800] confidenceknn = clfknn.score(X_test, y_test) confidenceridge = clfridge.score(X_test, y_test) confidencekrr = clfkrr.score(X_test, y_test) winner_clf = max(confidenceknn, confidenceridge, confidencekrr) print('Score for KNN confidence is', confidenceknn) print('Score for Ridge Regression confidence is', confidenceridge) print('Score for Kernel Ridge Regression confidence is', confidencekrr) print('The highest score is', winner_clf) forecast_set = clfknn.predict(X_lately) dfreg['Forecast'] = np.nan last_date = dfreg.iloc[-1].name last_unix = last_date next_unix = last_unix + datetime.timedelta(days=1)
# Evaluate QDA, KernelRidge (rounded to class labels) and SVC on the same
# split, appending (name, score, confusion_matrix) tuples to
# no_selection_performance (defined earlier in the file).
print('Quadratic Discriminant Analysis')
quadDisc = QuadraticDiscriminantAnalysis()
quadDisc.fit(X_train, y_train)
y_test_pred = quadDisc.predict(X_test)
matrix = confusion_matrix(y_test, y_test_pred)
score = quadDisc.score(X_test, y_test)
no_selection_performance.append(
    ('Quadratic Discriminant Analysis', score, matrix))
print('Kernel Ridge Regression')
kerRid = KernelRidge(alpha=1.0)
kerRid.fit(X_train, y_train)
y_test_pred = kerRid.predict(X_test)
# Regression output is rounded to the nearest integer so a confusion
# matrix can be built; note `score` stays the regressor's R^2, not accuracy.
y_test_pred = [int(round(x)) for x in y_test_pred]
matrix = confusion_matrix(y_test, y_test_pred)
score = kerRid.score(X_test, y_test)
no_selection_performance.append(('Kernel Ridge Regression', score, matrix))
print('SVC')
svc = svm.SVC(C=1, class_weight=None, coef0=0, gamma='scale', kernel='rbf',
              shrinking=True, tol=1e-1)
svc.fit(X_train, y_train)
y_test_pred = svc.predict(X_test)
matrix = confusion_matrix(y_test, y_test_pred)
score = svc.score(X_test, y_test)
no_selection_performance.append(('SVC', score, matrix))
import seaborn as sns
import numpy as np

seed = 45
# data prep=====
# Wine-quality regression: RandomForest and KernelRidge train/test R^2
# (as "% variance explained") are written to metrics.txt.
df= pd.read_csv("wine_quality.csv")
y= df.pop("quality")
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size= 0.3, random_state= seed)
# Random forest baseline.
model1= RandomForestRegressor(max_depth=3, random_state=seed)
model1.fit(X_train, y_train)
train_score_rf= model1.score(X_train, y_train)*100
test_score_rf= model1.score(X_test, y_test)*100
# "w" truncates any previous metrics file.
with open("metrics.txt", "w") as outfile:
    outfile.write("Random Forest Training variance explained: %2.1f%% \n" % train_score_rf)
    outfile.write("Random Forest Test variance explained: %2.1f%% \n" % test_score_rf)
# Kernel ridge comparison, appended to the same file.
model2= KernelRidge(alpha=1)
model2.fit(X_train, y_train)
train_score_kr= model2.score(X_train, y_train)*100
test_score_kr= model2.score (X_test, y_test)*100
with open("metrics.txt", "a") as outfile:
    outfile.write("Kernel Ridge Training variance explained: %2.1f%% \n" %train_score_kr)
    outfile.write("Kernel Ridge Test variance explained: %2.1f%% \n" %test_score_kr)
from sklearn.kernel_ridge import KernelRidge
import numpy as np
from importation_pandas import importcsv
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# Load the dataset via the project helper and hold out 30% for testing.
setX, setY = importcsv()
X_train, X_test, y_train, y_test = train_test_split(setX, setY, test_size=0.3)
clf = KernelRidge(alpha=1.0)
clf.fit(X_train, y_train)
result1 = clf.predict(X_test)
print(clf.score(X_test, y_test))
# NOTE(review): KernelRidge predictions are continuous, and
# confusion_matrix raises on non-integer labels — confirm that the
# targets/predictions here are integer-valued, or round first.
print(confusion_matrix(y_test,result1))
# Plot training points plus the kNN / KernelRidge / dummy-baseline
# prediction curves; knn, kridge, dummy and the X/y arrays come from
# earlier in the file.
fig = plt.figure()
ax = fig.add_subplot(111)
red = ax.scatter(Xtrain, ytrain, color='red', marker='+')
knn_plot = ax.plot(Xtest, knn.predict(Xtest), color='green')
kridge_plot = ax.plot(Xtest, kridge.predict(Xtest), color='blue')
base = ax.plot(Xtest, dummy.predict(Xtest), color='orange', linestyle='--')
ax.set_ylabel("output Y", fontsize=20)
ax.set_xlabel("input X", fontsize=20)
fig.legend(["kNN", "KernelRidge", "baseline", "train"],
           scatterpoints=1, loc='right', ncol=2, fontsize=15)
ax.set_title(
    "kNN & KernelRidge Predictions", fontsize=20)
# Compute percentage of accuracy for each predictions
# NOTE(review): these are R^2 scores computed on the TRAINING set, not
# classification accuracy on held-out data.
knn_accuracy = knn.score(Xtrain, ytrain)
kridge_accuracy = kridge.score(Xtrain, ytrain)
baseline_accuracy = dummy.score(Xtrain, ytrain)
# Print outputs
print("base model accuracy score: ", baseline_accuracy,
      " - knn model accuracy score: ", knn_accuracy,
      " - kridge accuracy: ", kridge_accuracy)
plt.show()
# (continuation) closing arguments of a grid-search constructed above this
# chunk — the opening of the parameter dict is outside this view:
    np.logspace(-5, 5)
    })
estimator.fit(X_train, Y_train)
best_alpha = estimator.best_params_['alpha']
best_gamma = estimator.best_params_['gamma']
# Train with the best parameters
estimator2 = KernelRidge(alpha=best_alpha, coef0=1, gamma=best_gamma,
                         kernel='laplacian', kernel_params=None)
estimator2.fit(X_train, Y_train)
y_predicted = estimator2.predict(X_test)
r2 = estimator2.score(X_test, Y_test, sample_weight=None)
print('r^2 = ', r2)
mae = mean_absolute_error(y_predicted, Y_test)
print(mae)
# Plotting
# Collect the reference values whose first column matches each test row;
# Y_sp, sg_sp and N_data come from earlier in the file.
n_test = len(Y_test)
Y_testing = []
sg_testing = []
for i in range(n_test):
    for j in range(N_data):
        if Y_test[i, 0] == Y_sp[j, 0]:
            sg_testing.append(sg_sp[j])
            break
# backarr2=backarr2.reshape(-1, 1) # regressor2.fit(backarr2, outarr) # mid=regressor2.predict(x) # #mid2=classificator.score(x, y) # #print (x, y) # backarr2=np.append(backarr2, x) # outarr=np.append(outarr, y) # out=np.append(out, mid) # #scores=np.append(scores,mid2) regressor.fit(backarr2,outarr) out=regressor.predict(backarr2) regressor2.fit(backarr2,outarr) out2=regressor2.predict(backarr2) res=regressor.score(backarr2,outarr) res2=regressor2.score(backarr2,outarr) time1,time2=[],[] for i in range(dt.date(2008,2,25).toordinal(),dt.date(2014,2,3).toordinal(),7): #make an array of mondays time1=np.append(time1,i) for i in range(dt.date(2014,10,27).toordinal(),dt.date(2018,8,21).toordinal(),7): time2=np.append(time2,i) for i in np.nditer(time1, op_flags=['readwrite']): i[...]=i-733000 for i in np.nditer(time2, op_flags=['readwrite']): i[...]=i-733000 time1=time1.reshape(-1,1) time2=time2.reshape(-1,1) interpol=regressor.predict(time1) #interpolate the original data that has gaps in it to fill the said gaps interpol2=regressor.predict(time2) time=np.concatenate((time1,time2)) result=np.concatenate((interpol,interpol2))
# (continuation) closing arguments of a print(...) opened above this chunk:
      np.mean(cv_results2['test_neg_median_absolute_error']))
print('r2:', np.mean(cv_results['test_r2']))
scaler2 = StandardScaler()
scaler2.fit(X_train)
# NOTE(review): scaler2 is fitted but the transforms below use `scaler`
# (fitted earlier, outside this chunk) — looks like a bug; confirm intent.
transformed_X_train = scaler.transform(X_train)
transformed_X_test = scaler.transform(X_test)
# Kernel ridge with an explicit RBF kernel object.
rbf_kernel = RBF(length_scale=10)
ker_regr_rbf = KernelRidge(kernel=rbf_kernel)
ker_regr_rbf.fit(transformed_X_train, y_train)
ker_rbf_pred = ker_regr_rbf.predict(transformed_X_test)
print("score: %.5f" % (ker_regr_rbf.score(transformed_X_test, y_test)))
print("Error cuadratico medio: %.5f" % mean_squared_error(y_test, ker_rbf_pred))
# GRAFICOS PARA COMPARAR
# Predictions vs. data over two consecutive windows of 500 samples.
plt.rcParams["figure.figsize"] = (20, 10)
plt.plot(ker_rbf_pred[1:500])
plt.plot(y_reg[1:500])
plt.xlabel('Datos')
plt.ylabel('Tiempo [s]')
plt.legend(['predicciones', 'datos'], loc='upper left')
plt.show()
plt.rcParams["figure.figsize"] = (20, 10)
plt.plot(ker_rbf_pred[501:1000])
plt.plot(y_reg[501:1000])
# Overlay the ensemble regressors' test predictions computed above.
pylab.plot(x,y_test116[:n],lw=2,label='Gradient Boosting')
pylab.plot(x,y_test117[:n],lw=2,label='Random Forest')
pylab.plot(x,y_test114[:n],lw=2,label='Bagging')
pylab.plot(x,y_test115[:n],lw=2,label='Ada Boost')
pylab.plot(x,y_test113[:n],lw=2,label='ExtraTree')
pylab.xlabel('Observations'); pylab.ylabel('Targets')
pylab.title('Regressors. Test Results. Boston')
pylab.legend(loc=2,fontsize=10); pylab.show()
"""combining regression with kernels"""
# Kernel Ridge; Toy regression 2
# Default KernelRidge on the 2-output toy dataset; both target columns
# are plotted against the first 30 test observations.
reg25=KernelRidge(); reg25.fit(X_train8,y_train8)
y_train825=reg25.predict(X_train8)
y_test825=reg25.predict(X_test8)
print(reg25.score(X_test8,y_test8))
pylab.figure(figsize=(12,5)); n=30; x=range(n)
pylab.scatter(x,y_test8[:n,0],marker='*',s=200, color='darkblue',label='Real data 1')
pylab.scatter(x,y_test8[:n,1],marker='*',s=200, color='darkgreen',label='Real data 2')
pylab.plot(x,y_test825[:n,0],lw=2, color='steelblue',label='Kernel Ridge 1')
pylab.plot(x,y_test825[:n,1],lw=2, color='seagreen',label='Kernel Ridge 2')
pylab.xlabel('Observations'); pylab.ylabel('Targets')
pylab.title('Kernel Ridge Regressor. Test Results. Toy Regression 2')
pylab.legend(loc=2,fontsize=10); pylab.show()
"""# Unsupervised Learning"""
def main():
    # Explore the BlackFriday dataset: clean and encode the columns, then
    # fit a series of classifiers/regressors, printing train/validation
    # errors for each. (Doc-only pass; behavior unchanged.)
    X = pd.read_csv(
        '../data/BlackFriday.csv'
    )  # names =("User_ID", "Product_ID", "Gender", "Age", "Occupation", "City_Category", "Stay_In_Current_City_Years", "Marital_Status,", "Product_Category_1","Product_Category_2","Product_Category_3", "Purchase" ))
    N, d = X.shape
    print(N, d)
    # fill missing values with 0
    # (?) need to calculate percentage of missing value?
    X = X.fillna(0)
    # change gender to 0 and 1
    X['Gender'] = X['Gender'].apply(change_gender)
    # change age to 0 to 6
    X['Age'] = X['Age'].apply(change_age)
    # change city categories to 0 to 2
    X['City_Category'] = X['City_Category'].apply(change_city)
    # change the year to integer
    X['Stay_In_Current_City_Years'] = X['Stay_In_Current_City_Years'].apply(
        change_year)
    #predict gender
    y = np.zeros((N, 1))
    y = X.values[:, 2]
    y = y.astype('int')
    X1 = X
    ID = ['User_ID', 'Product_ID', 'Gender']
    X1 = X1.drop(ID, axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X1,
                                                        y,
                                                        test_size=0.20,
                                                        random_state=42)
    model = LogisticRegression(C=1,
                               fit_intercept=False,
                               solver='lbfgs',
                               multi_class='multinomial')
    model.fit(X_train, y_train)
    print("LogisticRegression(softmax) Training error %.3f" %
          utils.classification_error(model.predict(X_train), y_train))
    print("LogisticRegression(softmax) Validation error %.3f" %
          utils.classification_error(model.predict(X_test), y_test))
    model = linear_model.SGDClassifier(max_iter=1000, tol=1e-3)
    model.fit(X_train, y_train)
    print("logLinearClassifier Training error %.3f" %
          utils.classification_error(model.predict(X_train), y_train))
    print("logLinearClassifier Validation error %.3f" %
          utils.classification_error(model.predict(X_test), y_test))
    #predict the product category1 based on other information.
    y2 = np.zeros((N, 1))
    y2 = X.values[:, 8]
    y2 = y2.astype('int')
    X2 = X
    ID = [
        'User_ID', 'Product_ID', 'Product_Category_1', 'Product_Category_2',
        'Product_Category_3'
    ]
    X2 = X2.drop(ID, axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X2,
                                                        y2,
                                                        test_size=0.2,
                                                        random_state=42)
    model = KNeighborsClassifier(n_neighbors=5, metric='cosine')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_train)
    tr_error = np.mean(y_pred != y_train)
    y_pred = model.predict(X_test)
    te_error = np.mean(y_pred != y_test)
    print("Training error of KNN to predict age: %.3f" % tr_error)
    print("Testing error of KNN to predict age: %.3f" % te_error)
    # Training error of KNN to predict age: 0.363
    #Testing error of KNN to predict age: 0.496
    # Use decision tree to predict
    # Sweep tree depth and record train/test error at each depth.
    e_depth = 20
    s_depth = 1
    train_errors = np.zeros(e_depth - s_depth)
    test_errors = np.zeros(e_depth - s_depth)
    for i, d in enumerate(range(s_depth, e_depth)):
        print("\nDepth: %d" % d)
        model = DecisionTreeClassifier(max_depth=d,
                                       criterion='entropy',
                                       random_state=1)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_train)
        tr_error = np.mean(y_pred != y_train)
        y_pred = model.predict(X_test)
        te_error = np.mean(y_pred != y_test)
        print("Training error: %.3f" % tr_error)
        print("Testing error: %.3f" % te_error)
        train_errors[i] = tr_error
        test_errors[i] = te_error
    x_vals = np.arange(s_depth, e_depth)
    plt.title("The effect of tree depth on testing/training error")
    plt.plot(x_vals, train_errors, label="training error")
    plt.plot(x_vals, test_errors, label="testing error")
    plt.xlabel("Depth")
    plt.ylabel("Error")
    plt.legend()
    fname = os.path.join("..", "figs", "trainTest_category1.pdf")
    plt.savefig(fname)
    print("\nFigure saved as '%s'" % fname)
    model = RandomForestClassifier(criterion="entropy",
                                   n_estimators=5,
                                   max_features=5)
    model.fit(X_train, y_train)
    print("RandomForest Training error %.3f" %
          utils.classification_error(model.predict(X_train), y_train))
    print("RandomForest Validation error %.3f" %
          utils.classification_error(model.predict(X_test), y_test))
    #RandomForest Training error 0.027
    #RandomForest Validation error 0.157
    tree = DecisionTreeClassifier(max_depth=13,
                                  criterion='entropy',
                                  random_state=1)
    tree.fit(X_train, y_train)
    y_pred = tree.predict(X_train)
    tr_error = np.mean(y_pred != y_train)
    y_pred = tree.predict(X_test)
    te_error = np.mean(y_pred != y_test)
    print("Decision Tree Training error : %.3f" % tr_error)
    print("Decision Tree Validation error: %.3f" % te_error)
    #Depth: 11
    #Training error: 0.127
    #Testing error: 0.131
    #use softmaxClassifier to predict occputation
    model = LogisticRegression(C=10000,
                               fit_intercept=False,
                               solver='lbfgs',
                               multi_class='multinomial')
    model.fit(X_train, y_train)
    print("LogisticRegression(softmax) Training error %.3f" %
          utils.classification_error(model.predict(X_train), y_train))
    print("LogisticRegression(softmax) Validation error %.3f" %
          utils.classification_error(model.predict(X_test), y_test))
    #LogisticRegression(softmax) Training error 0.651
    #LogisticRegression(softmax) Validation error 0.652
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.linear_model import LinearRegression
    from sklearn.gaussian_process.kernels import ConstantKernel, RBF
    from sklearn.kernel_ridge import KernelRidge
    from sklearn.gaussian_process import GaussianProcessRegressor
    from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
    from sklearn.metrics import mean_squared_error
    # Polynomial regression on a 1000-row subsample.
    poly = PolynomialFeatures(degree=4)
    X_train_sub = X_train[:1000]
    y_train_sub = y_train[:1000]
    X_train_ = poly.fit_transform(X_train_sub)
    model = LinearRegression()
    model.fit(X_train_, y_train_sub)
    model.score(X_train_, y_train_sub, sample_weight=None)
    y_pred = model.predict(X_train_)
    tr_error = mean_squared_error(y_pred, y_train_sub)
    # NOTE(review): X_test is NOT polynomial-transformed before predict,
    # and `!=` on continuous predictions is not a meaningful error — both
    # look like bugs; confirm intent.
    y_pred = model.predict(X_test)
    te_error = np.mean(y_pred != y_test)
    print("Training error : %.3f" % tr_error)
    print("Validation error: %.3f" % te_error)
    #kernel = DotProduct() + WhiteKernel()
    y2 = np.zeros((N, 1))
    y2 = X.values[:, 8]
    y2 = y2.astype('int')
    X2 = X
    ID = [
        'User_ID', 'Product_ID', 'Product_Category_1', 'Product_Category_2',
        'Product_Category_3'
    ]
    X2 = X2.drop(ID, axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X2,
                                                        y2,
                                                        test_size=0.02,
                                                        random_state=42)
    # Gaussian process and kernel ridge baselines on the resplit data.
    gpr = GaussianProcessRegressor(kernel=None,
                                   random_state=0).fit(X_train, y_train)
    gpr.score(X_train, y_train)
    y_pred = gpr.predict(X_train)
    tr_error = mean_squared_error(y_pred, y_train)
    y_pred = gpr.predict(X_test)
    te_error = mean_squared_error(y_pred, y_test)
    clf = KernelRidge(alpha=0.5)
    clf.fit(X_train_sub, y_train_sub)
    clf.score(X_train_sub, y_train_sub, sample_weight=None)
# Steel composition: fit polynomial-kernel ridge models of degree 2 and 3
# on the training set and compute the training RMSE for each.
train = pandas.read_csv("steel_composition_train.csv", sep=",")
test = pandas.read_csv("steel_composition_test.csv", sep=",")
names = [
    "id", "Carbon", "Nickel", "Manganese", "Sulfur", "Chromium", "Iron",
    "Phosphorus", "Silicon"
]
data_train = train[names]
targets_train = train["Strength"]
tr_len = len(targets_train)
# Degree-2 polynomial kernel.
krr2 = KernelRidge(alpha=1, kernel="polynomial", degree=2, coef0=1)
krr2.fit(data_train, targets_train)
krr2_score = krr2.score(data_train, targets_train)
K2TR = krr2.predict(data_train)
E = K2TR - targets_train
E = np.asarray(E)
# RMSE via E·E / n, i.e. sqrt(mean squared residual).
RMSE2 = np.sqrt(np.dot(np.transpose(E), E) / tr_len)
# Degree-3 polynomial kernel.
krr3 = KernelRidge(alpha=1, kernel="polynomial", degree=3, coef0=1)
krr3.fit(data_train, targets_train)
krr3_score = krr3.score(data_train, targets_train)
K3TR = krr3.predict(data_train)
E = K3TR - targets_train
E = np.asarray(E)
RMSE3 = np.sqrt(np.dot(np.transpose(E), E) / tr_len)
def splitData(dataMat,dataLabel):
    # Split into train (rows 100+) and test (first 100 rows).
    # NOTE(review): `indices` is computed but never used, so the split is
    # NOT randomized despite the permutation — confirm whether shuffling
    # was intended.
    indices = np.random.permutation(len(dataMat))
    dataTrainMat = np.array(dataMat)[100:]; dataTrainLabel = np.array(dataLabel)[100:]
    dataTestMat = np.array(dataMat)[:100]; dataTestLabel = np.array(dataLabel)[:100]
    return dataTrainMat ,dataTrainLabel,dataTestMat,dataTestLabel


# Abalone dataset: kernel ridge regression scored against float labels.
# Python 2 code (print statement, map() passed directly to score).
dataMat,dataLabel = createAbaloneData()
dataTrainMat ,dataTrainLabel,dataTestMat,dataTestLabel = splitData(dataMat,dataLabel)
#knn
"""
knn = KNeighborsClassifier()
knn.fit(dataTrainMat,dataTrainLabel)
print 'knn accucy',knn.score(dataTestMat,dataTestLabel)
"""
#line regression
clf = KernelRidge(alpha=2.0)
clf.fit(dataTrainMat,dataTrainLabel)
print 'linear regression accucy',clf.score(dataTestMat,map(float,dataTestLabel))
#tree
"""
tree = DecisionTreeClassifier(random_state=2)
tree.fit(dataTrainMat,dataTrainLabel)
print 'tree accucy',tree.score(dataTestMat,dataTestLabel)
"""
        testSet.append(trainSet[randIndex])
        del trainSet[randIndex]
    #训练集
    for dataIndex in trainSet:
        x_train.append(xArr[dataIndex])
        y_train.append(yArr[dataIndex])
    #测试集
    for dataIndex in testSet:
        x_test.append(xArr[dataIndex])
        y_test.append(yArr[dataIndex])
    print(x_train)
    print(y_train)
    print(x_test)
    print(y_test)
"""
# NOTE(review): the text above appears to sit inside a triple-quoted
# disabled block whose opening quotes precede this chunk — confirm against
# the full file before touching it.
# Fit a default KernelRidge on the FULL dataset (no hold-out) and print
# actuals, predictions, and the in-sample R^2.
clf = KernelRidge()
clf.fit(xArr, yArr)
y_predict = clf.predict(xArr)
#y_predict_int = []
#for i in range(len(y_predict)):
#    y_predict_int.append(int(y_predict[i]))
print(yArr)
print(y_predict)
print(clf.score(xArr, yArr))