def _omp(*, train, test, x_predict=None, metrics, n_nonzero_coefs=None, tol=None,
         fit_intercept=True, normalize=True, precompute='auto'):
    """Fit an Orthogonal Matching Pursuit regressor and score it on a test split.

    For more info visit :
    https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.OrthogonalMatchingPursuit.html#sklearn.linear_model.OrthogonalMatchingPursuit

    Args:
        train: pair ``(X_train, y_train)`` used to fit the model.
        test: pair ``(X_test, y_test)`` used to compute the error metric.
        x_predict: optional extra samples to predict after scoring.
        metrics: one of ``'mse'``, ``'rmse'`` or ``'mae'``.
        n_nonzero_coefs, tol, fit_intercept, normalize, precompute: forwarded
            unchanged to ``OrthogonalMatchingPursuit``.

    Returns:
        ``(model_name, accuracy, y_predict)`` where ``y_predict`` is ``None``
        when ``x_predict`` is ``None``.

    Raises:
        ValueError: if ``metrics`` is not a supported metric name.
    """
    # Validate before fitting: previously an unknown metric left `accuracy`
    # unbound and only failed (with UnboundLocalError) after the model had
    # already been trained.
    if metrics not in ('mse', 'rmse', 'mae'):
        raise ValueError(
            "metrics must be one of 'mse', 'rmse' or 'mae', got %r" % (metrics,))

    # NOTE(review): `normalize` has been deprecated/removed in newer
    # scikit-learn releases -- confirm against the pinned version.
    model = OrthogonalMatchingPursuit(n_nonzero_coefs=n_nonzero_coefs,
                                      tol=tol,
                                      fit_intercept=fit_intercept,
                                      normalize=normalize,
                                      precompute=precompute)
    model.fit(train[0], train[1])
    model_name = 'OrthogonalMatchingPursuit'

    y_hat = model.predict(test[0])
    if metrics == 'mse':
        accuracy = _mse(test[1], y_hat)
    elif metrics == 'rmse':
        accuracy = _rmse(test[1], y_hat)
    else:  # 'mae' -- guaranteed by the validation above
        accuracy = _mae(test[1], y_hat)

    if x_predict is None:
        return (model_name, accuracy, None)

    y_predict = model.predict(x_predict)
    return (model_name, accuracy, y_predict)
def fit_model_14(self, toWrite=False):
    """Fit an OrthogonalMatchingPursuit model on every fold of ``self.cv_data``.

    The same estimator instance is re-fitted on each fold, so after the loop
    it holds the fit from the last fold. The log-loss of each fold's
    predictions is printed.

    Args:
        toWrite: when true, pickle the (last-fitted) model to
            ``model14/model.pkl``.
    """
    model = OrthogonalMatchingPursuit()

    for data in self.cv_data:
        X_train, X_test, Y_train, Y_test = data
        model.fit(X_train, Y_train)
        pred = model.predict(X_test)
        print("Model 14 score %f" % (logloss(Y_test, pred),))

    if toWrite:
        # pickle writes bytes: the file must be opened in binary mode, and the
        # context manager guarantees the handle is closed even if dump fails.
        with open('model14/model.pkl', 'wb') as f2:
            pickle.dump(model, f2)
def classify_OMP(train, test):
    """Fit OMP on vector-encoded training labels and predict for the test inputs.

    ``train`` and ``test`` are ``(inputs, labels)`` pairs. Each training label
    is converted with ``tovec(label, n_classes)`` (presumably a one-hot style
    encoding -- verify against ``tovec``), where ``n_classes`` is the number of
    distinct training labels. The test labels are not used.

    Returns the raw output of ``predict`` on the test inputs.
    """
    from sklearn.linear_model import OrthogonalMatchingPursuit as OMP

    train_inputs, train_labels = train
    n_classes = np.unique(train_labels).shape[0]
    encoded_labels = [tovec(label, n_classes) for label in train_labels]

    model = OMP()
    model.fit(train_inputs, encoded_labels)

    test_inputs, _unused_labels = test
    return model.predict(test_inputs)
class _OrthogonalMatchingPursuitImpl:
    """Thin adapter that stores hyperparameters and delegates to ``Op``."""

    def __init__(self, **hyperparams):
        """Keep the hyperparameters and build the wrapped ``Op`` from them."""
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model; ``y`` is forwarded only when provided.

        Returns ``self`` so calls can be chained.
        """
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def predict(self, X):
        """Return the wrapped model's predictions for ``X``."""
        predictions = self._wrapped_model.predict(X)
        return predictions
def test_OMP():
    """
    find the 3 best nodes in the set [0, 0.1, ..., 0.9, 1.0] and their weights
    using Orthogonal Matching Pursuit

    A sparse linear model (3 non-zero coefficients) is trained to map the
    values of a random function at the 11 nodes to its integral over [0, 1].
    The learned quadrature rule is then compared against Simpson's rule on
    fresh random functions, and the summary is printed.
    """
    kernel = Matern(length_scale=0.8, nu=1.2)
    nodes = np.linspace(0, 1, 11)  # candidate quadrature nodes, built once

    set_size = 100
    x = []
    y = []
    for _ in range(set_size):
        f = GPRealization(kernel)
        # Evaluate at the nodes before integrating, as in the original order.
        x.append([f(num) for num in nodes])
        y.append(quad(f, 0, 1)[0])

    # build OMP model
    # The constructor parameters are keyword-only in current scikit-learn;
    # passing the sparsity positionally raises TypeError there.
    reg = OrthogonalMatchingPursuit(n_nonzero_coefs=3).fit(x, y)
    print(reg.coef_)
    print(reg.intercept_)

    # test against simpsons rule
    num_tests = 100
    reg_better = 0
    total_err_simps = 0.0
    total_err_reg = 0.0
    for _ in range(num_tests):
        f = GPRealization(kernel)
        data = [f(num) for num in nodes]
        int_reg = reg.predict([data])[0]
        # Float literals keep the Simpson weights correct even under
        # Python 2 integer division (where 1 / 6 == 0).
        int_simpsons = 1.0 / 6 * f(0) + 4.0 / 6 * f(.5) + 1.0 / 6 * f(1)
        int_true = quad(f, 0, 1)[0]

        err_simps = abs(int_simpsons - int_true)
        err_reg = abs(int_reg - int_true)
        total_err_simps += err_simps
        total_err_reg += err_reg
        if err_reg < err_simps:
            reg_better += 1

    print("The Regression Model was better in {} of {} cases".format(
        reg_better, num_tests))
    print("The average error of the Regression model was {}".format(
        total_err_reg / num_tests))
    print("The average error of the simpsons rule was {}".format(
        total_err_simps / num_tests))
def predict(self):
    """
    Train scikit-learn's OrthogonalMatchingPursuit (https://scikit-learn.org)
    on ``self.X_train`` / ``self.y_train``, predict ``self.X_test`` and compare
    the predictions with ``self.y_test``.

    The estimator is built with its default settings. A former local
    ``n_nonzero_coefs = 17`` was never passed to it and has been removed;
    behaviour is unchanged.

    Returns the accuracy computed by ``OneHotPredictor.get_accuracy``, which
    is also stored in ``self.acc``.
    """
    algorithm = OrthogonalMatchingPursuit()
    algorithm.fit(self.X_train, self.y_train)
    y_pred = list(algorithm.predict(self.X_test))
    self.acc = OneHotPredictor.get_accuracy(y_pred, self.y_test)
    return self.acc
def task2(data):
    """Train ten regressors on price features and forecast 'Adj Close'.

    Features are 'Adj Close', 'Volume', the high-low percentage and the daily
    percentage change. The label is 'Adj Close' shifted ``forecast_out`` rows
    into the future, where ``forecast_out`` is 1% of the rows (rounded up).
    Each model's test score is printed, and every model forecasts the final
    ``forecast_out`` rows; forecasts are placed on consecutive calendar days
    after the last row of ``data``.

    Args:
        data: DataFrame with 'Adj Close', 'Volume', 'High', 'Low', 'Close' and
            'Open' columns; its index must support ``+ timedelta`` and
            ``strftime`` (i.e. date-like).

    Returns:
        A 12-tuple of lists, all of equal length: formatted dates
        ('%Y-%m-%d'), 'Adj Close', then the forecasts of the linear,
        quadratic-2, quadratic-3, KNN, Lasso, Bayesian Ridge, Lasso LARS, OMP,
        ARD and SGD models (NaN outside the forecast window).
    """
    df = data
    # Explicit copy so the column assignments below never write into a view.
    dfreg = df.loc[:, ['Adj Close', 'Volume']].copy()
    dfreg['HL_PCT'] = (df['High'] - df['Low']) / df['Close'] * 100.0
    dfreg['PCT_change'] = (df['Close'] - df['Open']) / df['Open'] * 100.0

    # Replace missing values with an out-of-range sentinel (rows are kept).
    dfreg.fillna(value=-99999, inplace=True)

    # We want to separate 1 percent of the data to forecast
    forecast_out = int(math.ceil(0.01 * len(dfreg)))

    # Separating the label here, we want to predict the AdjClose
    forecast_col = 'Adj Close'
    dfreg['label'] = dfreg[forecast_col].shift(-forecast_out)
    X = np.array(dfreg.drop(columns=['label']))

    # Scale the X so that everyone can have the same distribution for linear regression
    X = preprocessing.scale(X)

    # Late X (to forecast) and early X (for model generation and evaluation)
    X_lately = X[-forecast_out:]
    X = X[:-forecast_out]

    # Separate label and identify it as y
    y = np.array(dfreg['label'])
    y = y[:-forecast_out]

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)

    # (output column suffix, label used in the printed report, estimator),
    # in the order of the returned tuple.
    models = [
        ('reg', 'linear regression', LinearRegression(n_jobs=-1)),
        ('pol2', 'quadratic regression 2',
         make_pipeline(PolynomialFeatures(2), Ridge())),
        ('pol3', 'quadratic regression 3',
         make_pipeline(PolynomialFeatures(3), Ridge())),
        ('knn', 'knn regression', KNeighborsRegressor(n_neighbors=2)),
        ('las', 'lasso regression', Lasso()),
        ('byr', 'Bayesian Ridge regression', BayesianRidge()),
        ('lar', 'Lasso LARS regression', LassoLars(alpha=.1)),
        ('omp', 'OMP regression', OrthogonalMatchingPursuit(n_nonzero_coefs=2)),
        ('ard', 'ARD regression', ARDRegression(compute_score=True)),
        ('sgd', 'SGD regression',
         SGDRegressor(random_state=0, max_iter=1000, tol=1e-3)),
    ]

    for _suffix, _label, model in models:
        model.fit(X_train, y_train)

    # Create confidence scores and report them
    for _suffix, label, model in models:
        confidence = model.score(X_test, y_test)
        print('The %s confidence is:' % label, confidence * 100)

    forecasts = [(suffix, model.predict(X_lately))
                 for suffix, _label, model in models]

    # Append one all-NaN row per forecast step, on consecutive calendar days
    # after the last observed row. Every model shares these dates; the
    # previous code re-derived the anchor with a hard-coded iloc[-26], which
    # is only correct when forecast_out == 25.
    forecast_dates = []
    next_date = dfreg.index[-1]
    for _ in range(len(X_lately)):
        next_date = next_date + datetime.timedelta(days=1)
        dfreg.loc[next_date] = [np.nan for _ in range(len(dfreg.columns))]
        forecast_dates.append(next_date)

    for suffix, values in forecasts:
        column = 'Forecast_' + suffix
        dfreg[column] = np.nan
        # Single .loc call instead of chained assignment, which may write to
        # a temporary copy.
        dfreg.loc[forecast_dates, column] = values

    dates = [stamp.strftime('%Y-%m-%d') for stamp in dfreg.index]
    result = [dates, dfreg['Adj Close'].to_list()]
    result.extend(dfreg['Forecast_' + suffix].to_list()
                  for suffix, _values in forecasts)
    return tuple(result)
# Create linear regression object regrmavg = linear_model.LinearRegression() regomp = OrthogonalMatchingPursuit() regsgd = linear_model.SGDRegressor(max_iter=1000, tol=1e-3) # Train the model using the training sets regomp.fit(mavg_date_train, mavg_train) regrmavg.fit(mavg_date_train, mavg_train) regsgd.fit(mavg_date_train, mavg_train) # Make predictions using the testing set mavg_pred = regrmavg.predict(mavg_date_test) omp_pred = regomp.predict(mavg_date_test) sgd_pred = regsgd.predict(mavg_date_test) # The coefficients print('Coefficients: \n', regrmavg.coef_) print('Coefficients: \n', regomp.coef_) # The mean squared error print("Mov Avg mean squared error: %.2f" % mean_squared_error(mavg_test, mavg_pred)) # Explained variance score: 1 is perfect prediction print('move avg Variance score: %.2f' % r2_score(mavg_test, mavg_pred)) print("omp Mov Avg mean squared error: %.2f" %
sumsum = sumsum + (y[i] - y_test[i]) * (y[i] - y_test[i]) rank_result['LassoLars_pca'] = sumsum / float(result_row) rs_score['LassoLars_pca'] = r2_score(y_test, y) LassoLarsModel = LassoLars() LassoLarsModel.fit(X_train_std, y_train) y = LassoLarsModel.predict(X_test_std) [result_row] = y.shape sumsum = 0 #print y for i in range(result_row): sumsum = sumsum + (y[i] - y_test[i]) * (y[i] - y_test[i]) rank_result['LassoLars_std'] = sumsum / float(result_row) rs_score['LassoLars_std'] = r2_score(y_test, y) ompModel = OrthogonalMatchingPursuit() ompModel.fit(X_train_pca, y_train) y = ompModel.predict(X_test_pca) [result_row] = y.shape sumsum = 0 #print y for i in range(result_row): sumsum = sumsum + (y[i] - y_test[i]) * (y[i] - y_test[i]) rank_result['OM_pca'] = sumsum / float(result_row) rs_score['OM_pca'] = r2_score(y_test, y) ompModel = OrthogonalMatchingPursuit() ompModel.fit(X_train_std, y_train) y = ompModel.predict(X_test_std) [result_row] = y.shape sumsum = 0 #print y for i in range(result_row): sumsum = sumsum + (y[i] - y_test[i]) * (y[i] - y_test[i])
feature_selection = SelectKBest(f_classif, k=50) anova_svc = Pipeline([('anova', feature_selection), ('svc', clf)]) anova_svc.fit(X_train, y_train[i, :]) pipelines.append(anova_svc) """ """ f_classif 100 + Ridge """ from sklearn.linear_model import OrthogonalMatchingPursuit as OMP clf = OMP(n_nonzero_coefs=20) clf.fit(X_train, y_train) y_pred = clf.predict(X_test) """ clf.fit(X_train, y_train_tall.T) y_pred_tall = clf.predict(X_test) clf.fit(X_train, y_train_large.T) y_pred_large = clf.predict(X_test) clf.fit(X_train, y_train_big.T) y_pred_big = clf.predict(X_test) """ """
# Tail of the preceding model's report (its setup is outside this section).
print 'MAE:', mean_absolute_error(testing_labels,preds), '\n'

# PCA + LARS
# Fit LARS on the reduced features and report R2 (estimator .score) and MAE.
lars = Lars()
lars.fit(reduced_training_features, training_labels)
preds = lars.predict(reduced_testing_features)
score = lars.score(reduced_testing_features,testing_labels)
print 'PCA + LARS Results:'
print 'R2 score:', score
print 'MAE:', mean_absolute_error(testing_labels,preds)

# Orthogonal Matching Pursuit
# Same report, fitted on the original (non-reduced) features.
from sklearn.linear_model import OrthogonalMatchingPursuit
omp = OrthogonalMatchingPursuit()
omp.fit(training_features, training_labels)
preds = omp.predict(testing_features)
score = omp.score(testing_features,testing_labels)
print 'Orthogonal Matching Pursuit Regression Results:'
print 'R2 score:', score
print 'MAE:', mean_absolute_error(testing_labels,preds), '\n'

# PCA + Orthogonal Matching Pursuit
# A fresh estimator fitted on the reduced features; `omp`, `preds` and
# `score` are rebound, so the values from the section above are lost here.
omp = OrthogonalMatchingPursuit()
omp.fit(reduced_training_features, training_labels)
preds = omp.predict(reduced_testing_features)
score = omp.score(reduced_testing_features,testing_labels)
print 'PCA + Orthogonal Matching Pursuit Results:'
print 'R2 score:', score
print 'MAE:', mean_absolute_error(testing_labels,preds)

# Bayesian Ridge Regression
def _evaluate(self, datasets, **kwargs):
    """
      Main method of PCM. It collects the response values from Feature and Target models,
      and Measurements from experiment, maps the biases and uncertainties from Feature to
      Target side, and calculates the uncertainty reduction fraction using Feature to
      validate Target.

      For every target, one entry is written to the output:
      1.0 - (standard deviation of the predicted target PDF) / (sample standard
      deviation of the standardized target responses).

      @ In, datasets, list, list of datasets (data1,data2,etc.) to used.
      @ In, kwargs, dict, keyword arguments; 'dataobjectNames' is read from it
      @ Out, outputDict, dict, dictionary containing the results {"pri_post_stdReduct_<targName>":value}
    """
    names = kwargs.get('dataobjectNames')
    outputDict = {}

    # Create empty list for multiple Exp responses
    featData = []
    msrData = []
    featPW = []
    msrPW = []

    # Features and measurements are consumed pairwise, in order.
    for feat, msr in zip(self.features, self.measurements):
        featDataProb = self._getDataFromDataDict(datasets, feat, names)
        msrDataProb = self._getDataFromDataDict(datasets, msr, names)
        # M>=1 Feature arrays (1D) to 2D array with dimension (N, M)
        featData.append(featDataProb[0].flatten())
        msrData.append(msrDataProb[0].flatten())
        # Probability Weights for future use
        featPW.append(featDataProb[1])
        msrPW.append(msrDataProb[1])

    # *Data of size (num_of_samples, num_of_features)
    featData = np.array(featData).T
    msrData = np.array(msrData).T
    # Probability weights are collected but not used anywhere below
    # (kept for future use).
    featPW = np.array(featPW).T
    msrPW = np.array(msrPW).T

    yExp = np.array(featData)
    yMsr = np.array(msrData)

    # Reference values of Experiments, yExpRef in M
    # Sample mean as reference value for simplicity
    # Can be user-defined in the future
    yExpRef = np.mean(yExp, axis=0)
    # Usually the reference value is given,
    # and will not be zero, e.g. reference fuel temperature.
    # NOTE(review): a zero sample mean makes the divisions below divide by
    # zero -- confirm inputs rule that out.
    # Standardization (measurements are standardized with the Experiment
    # reference, not with their own mean)
    yExpStd = (yExp - yExpRef) / yExpRef
    yMsrStd = (yMsr - yExpRef) / yExpRef

    # For each Target/Application model/response, calculate an uncertainty reduction fraction
    # using all available Features/Experiments
    for targ in self.targets:
        targDataProb = self._getDataFromDataDict(datasets, targ, names)
        # Data values in <x>Data, <x>=targ, feat, msr
        targData = targDataProb[0]
        # Probability Weights values in <x>PW, <x>=targ, feat, msr
        # (unused below)
        targPW = targDataProb[1]

        # Application responses yApp in Nx1
        yApp = np.array(targData)
        # Reference values of Application, yAppRef is a scalar
        yAppRef = np.mean(yApp)
        # Standardization
        yAppStd = (yApp - yAppRef) / yAppRef

        # Single Experiment response
        if yExpStd.shape[1] == 1:
            yExpReg = yExpStd.flatten()
            yMsrReg = yMsrStd.flatten()
        # Pseudo response of multiple Experiment responses
        # OrthogonalMatchingPursuit from sklearn used here
        # Possibly change to other regressors
        # NOTE(review): when yExpStd has zero columns neither branch runs and
        # yExpReg/yMsrReg stay unbound -- confirm that cannot happen.
        elif yExpStd.shape[1] > 1:
            regrExp = OrthogonalMatchingPursuit(fit_intercept=False).fit(
                yExpStd, yAppStd)
            yExpReg = regrExp.predict(yExpStd)
            # Combine measurements by multiple Experiment regression
            yMsrReg = regrExp.predict(yMsrStd)

        # Measurement PDF with KDE
        knlMsr = stats.gaussian_kde(yMsrReg)

        # KDE for joint PDF between Exp and App
        m1 = yExpReg[:]
        m2 = yAppStd.flatten()
        xmin = m1.min()
        xmax = m1.max()
        ymin = m2.min()
        ymax = m2.max()
        # Grid of Experiment (X), grid of Application (Y)
        # presumably self.binKDE is a complex step (number of grid points);
        # verify where binKDE is set
        X, Y = np.mgrid[xmin:xmax:self.binKDE, ymin:ymax:self.binKDE]
        psts = np.vstack([X.ravel(), Y.ravel()])
        vals = np.vstack([m1, m2])
        # Measurement PDF over Exp range
        pdfMsr = knlMsr(X[:, 0])

        # Condition number of matrix of feature and target
        condNum = np.linalg.cond(vals)
        # Threshold on the condition number (100)
        invErr = 100
        # Ill-conditioned samples: skip the joint KDE
        if condNum >= invErr:
            # If singular matrix, measurement of Experiment is directly transferred
            # as predicted Application
            pdfAppPred = knlMsr(Y[0, :])
        else:
            # If not, KDE of Experiment and Application
            knl = stats.gaussian_kde(vals)
            # Joint PDF of Experiment and Application
            Z = np.reshape(knl(psts).T, X.shape)
            # yAppPred by integrating p(yexp, yapp)p(ymsr) over [yexp.min(), yexp.max()]
            # NOTE(review): np.dot(Z, ...) contracts Z's second axis, which
            # the mgrid layout maps to the Application grid, while pdfMsr is
            # sampled on the Experiment grid (X[:, 0]) -- confirm whether Z.T
            # was intended.
            pdfAppPred = np.dot(Z, pdfMsr.reshape(pdfMsr.shape[0], 1))

        # Normalized PDF of predicted application (divided by its sum and by
        # the Application grid spacing)
        pdfAppPredNorm = pdfAppPred.flatten() / pdfAppPred.sum() / np.diff(
            Y[0, :])[0]

        # Calculate Expectation (average value) of predicted application
        # by integrating xf(x), where f(x) is PDF of x
        predMean = 0.0
        for i in range(len(Y[0, :])):
            predMean += Y[0, i] * pdfAppPredNorm[i] * (Y[0, 1] - Y[0, 0])

        # Calculate Variance of predicted application
        # by integrating (x-mu_x)^2f(x), where f(x) is PDF of x
        predVar = 0.0
        for i in range(len(Y[0, :])):
            predVar += (Y[0, i] - predMean)**2.0 * pdfAppPredNorm[i] * (
                Y[0, 1] - Y[0, 0])

        # Predicted standard deviation is square root of variance
        predStd = np.sqrt(predVar)
        # Prior standard deviation is the sample standard deviation
        # Consider probability weights in the future
        priStd = np.std(yAppStd)

        # Uncertainty reduction fraction is 1.0-sigma_pred/sigma_pri
        # Only the part of the target name after the last '|' is used as key.
        name = "pri_post_stdReduct_" + targ.split('|')[-1]
        outputDict[name] = (1.0 - predStd / priStd)

    return outputDict
class InfluenzaNetwork:
    """Regression models predicting the "Percent" column of the influenza data.

    On construction the CSV "influenza_data_by_year_by_county.csv" is read
    from the current working directory. Each ``train*`` method re-draws a
    random train/test split and fits a fresh model into ``self.model``.
    """

    def __init__(self, fields, testPercentage):
        """Load the data set and store the configuration.

        fields: list of CSV column names used as model inputs; ``None``
            selects the built-in list of "EP_*" columns.
        testPercentage: fraction (0..1) of samples held out for testing;
            ``None`` selects 0.20.
        """
        self.data = self.getDataFromFile("influenza_data_by_year_by_county.csv")
        self.fields = fields
        self.model = None
        self.trainingInput = None
        self.trainingOutput = None
        self.trainingInfo = None
        self.testInput = None
        self.testOutput = None
        self.testInfo = None
        self.testPercentage = testPercentage

        if self.fields is None:
            self.fields = ["EP_POV", "EP_UNEMP", "EP_PCI", "EP_NOHSDP",
                           "EP_AGE65", "EP_AGE17", "EP_DISABL", "EP_SNGPNT",
                           "EP_MINRTY", "EP_LIMENG", "EP_MUNIT", "EP_MOBILE",
                           "EP_CROWD", "EP_NOVEH", "EP_GROUPQ", "EP_UNINSUR"]
        if self.testPercentage is None:
            self.testPercentage = 0.20

    def getDataFromFile(self, fileName):
        '''
        Parses the CSV file into a dictionary of the form
        {year : {county: {field: float, ...} } } and returns it.

        fileName: Relative Path to "influenza_data_by_year_by_county.csv"

        The first row is the header. "Year" and "County" are used as string
        keys; every other column is converted with float(). Empty rows are
        skipped. When a header name is repeated, its first column wins.
        '''
        yearSet = {}
        # newline='' is what the csv module expects for files it reads.
        with open(fileName, 'r', newline='') as rp:
            csvreader = csv.reader(rp)
            fieldDictionary = {}
            for position, fieldName in enumerate(next(csvreader)):
                fieldDictionary.setdefault(fieldName, position)
            valueFields = [fieldName for fieldName in fieldDictionary
                           if fieldName not in ("Year", "County")]

            for row in csvreader:
                if len(row) == 0:
                    continue
                year = row[fieldDictionary["Year"]]
                county = row[fieldDictionary["County"]]
                record = yearSet.setdefault(year, {}).setdefault(county, {})
                for parsedField in valueFields:
                    record[parsedField] = float(row[fieldDictionary[parsedField]])
        return yearSet

    def getIOFromData(self, testPercentage):
        '''
        Assumes existence of self.data formatted as "{year : {county: {...} } }"

        Builds one sample per (year, county): the inputs are the values of
        self.fields, the output is "Percent", and the metadata is
        (year, county, "Population"). Samples are then moved at random into
        the training set until it holds int(N * (1 - testPercentage)) of
        them; the remainder is the test set. A testPercentage outside [0, 1]
        falls back to 0.20. Results are stored on self as numpy arrays
        (inputs/outputs) and lists (metadata).
        '''
        inputList, outputList, IOMetadata = [], [], []
        for yearKey in self.data:
            for countyKey in self.data[yearKey]:
                record = self.data[yearKey][countyKey]
                outputList.append(record["Percent"])
                inputList.append([record[field] for field in self.fields])
                IOMetadata.append((yearKey, countyKey, record["Population"]))

        # Split into test and training sets based on "testPercentage"
        if testPercentage > 1 or testPercentage < 0:
            testPercentage = 0.20
        trainingSplit = int(float(len(inputList)) * (1 - testPercentage))

        trainingInput, trainingOutput, trainingMetadata = [], [], []
        while len(trainingInput) < trainingSplit:
            # Same index is removed from all three parallel lists.
            randomPos = random.randint(0, len(inputList) - 1)
            trainingInput.append(inputList.pop(randomPos))
            trainingOutput.append(outputList.pop(randomPos))
            trainingMetadata.append(IOMetadata.pop(randomPos))

        self.trainingInput = np.array(trainingInput)
        self.trainingOutput = np.array(trainingOutput)
        self.testInput = np.array(inputList)
        self.testOutput = np.array(outputList)
        self.trainingInfo = trainingMetadata
        self.testInfo = IOMetadata

    def trainLinearElasticNet(self, alpha, l1):
        """Re-split the data and fit ElasticNet(alpha, l1_ratio=l1)."""
        self.getIOFromData(self.testPercentage)
        self.model = ElasticNet(alpha=alpha, l1_ratio=l1)
        self.model.fit(self.trainingInput, self.trainingOutput)

    def trainLinearRegression(self):
        """Re-split the data and fit an ordinary LinearRegression."""
        self.getIOFromData(self.testPercentage)
        self.model = LinearRegression()
        self.model.fit(self.trainingInput, self.trainingOutput)

    def trainSVRLinear(self, cValue, gammaValue):
        """Re-split the data and fit a linear-kernel SVR."""
        self.getIOFromData(self.testPercentage)
        self.model = SVR(kernel='linear', C=cValue, gamma=gammaValue)
        self.model.fit(self.trainingInput, self.trainingOutput)

    def trainSVRRadial(self, cValue, gammaValue, epsilonValue):
        """Re-split the data and fit an RBF-kernel SVR."""
        self.getIOFromData(self.testPercentage)
        self.model = SVR(kernel='rbf', C=cValue, gamma=gammaValue,
                         epsilon=epsilonValue)
        self.model.fit(self.trainingInput, self.trainingOutput)

    def trainLinearRidge(self, alpha, fit_intercept):
        """Re-split the data and fit Ridge(alpha, fit_intercept)."""
        self.getIOFromData(self.testPercentage)
        self.model = Ridge(alpha=alpha, fit_intercept=fit_intercept)
        self.model.fit(self.trainingInput, self.trainingOutput)

    def trainLars(self):
        """Re-split the data and fit Lars with default settings."""
        self.getIOFromData(self.testPercentage)
        self.model = Lars()
        self.model.fit(self.trainingInput, self.trainingOutput)

    def trainLinearOrthogonalMatchingPursuit(self):
        """Re-split the data and fit OrthogonalMatchingPursuit with defaults."""
        self.getIOFromData(self.testPercentage)
        self.model = OrthogonalMatchingPursuit()
        self.model.fit(self.trainingInput, self.trainingOutput)

    def trainMLPRegressor(self, layerSizes, tolerance, max_iterations,
                          activationFunction='relu'):
        """Re-split the data and fit StandardScaler + MLPRegressor (seed 0)."""
        self.getIOFromData(self.testPercentage)
        self.model = make_pipeline(
            StandardScaler(),
            MLPRegressor(hidden_layer_sizes=layerSizes, tol=tolerance,
                         max_iter=max_iterations, random_state=0,
                         activation=activationFunction))
        self.model.fit(self.trainingInput, self.trainingOutput)

    def testModel_statistics(self):
        """Return the mean absolute percent error of the model on the test set.

        NOTE(review): a test output of exactly 0 makes the percent error
        divide by zero -- confirm the data excludes that.
        """
        results = self.model.predict(self.testInput)
        percentErrors = []
        for index, predicted in enumerate(results):
            expected = self.testOutput[index]
            percentErrors.append(abs((expected - predicted) / expected) * 100)
        return statistics.mean(percentErrors)

    def testModel_output(self):
        """Return the model's predictions for the stored test inputs."""
        return self.model.predict(self.testInput)

    def testModel_custom(self, customInputList):
        """Return the model's predictions for caller-supplied inputs."""
        return self.model.predict(customInputList)

    # Static Methods to load/dump models from/into files
    @staticmethod
    def importModel(filename):
        """Load a pickled InfluenzaNetwork and copy its state into a new one.

        A new instance is constructed (which re-reads the CSV data set) and
        the model, the train/test arrays and, when present, the metadata
        lists are copied over from the unpickled object.

        SECURITY: pickle.load executes arbitrary code from the file; only
        load files from a trusted source.
        """
        with open(filename, 'rb') as fp:
            model = pickle.load(fp)
        toReturn = InfluenzaNetwork(model.fields, model.testPercentage)
        toReturn.model = model.model
        toReturn.trainingInput = model.trainingInput
        toReturn.trainingOutput = model.trainingOutput
        toReturn.testInput = model.testInput
        toReturn.testOutput = model.testOutput
        # Older pickles may predate the metadata attributes.
        if hasattr(model, 'trainingInfo'):
            toReturn.trainingInfo = model.trainingInfo
        if hasattr(model, 'testInfo'):
            toReturn.testInfo = model.testInfo
        return toReturn

    @staticmethod
    def exportModel(influenzaNetworkInstance, filename=None):
        """Pickle the given instance to ``filename``.

        When ``filename`` is missing, empty, or does not contain ".pickle",
        a timestamped name "<YYYYmmddHHMMSS>_model.pickle" is used instead.
        """
        if (filename is None) or (len(filename) == 0) or (".pickle" not in filename):
            filename = datetime.datetime.now().strftime("%Y%m%d%H%M%S") + "_model.pickle"
        with open(filename, 'wb') as wp:
            pickle.dump(influenzaNetworkInstance, wp, protocol=pickle.HIGHEST_PROTOCOL)
def mySRC(X_train_array, Y_train_array, X_test_array):
    """Fit OrthogonalMatchingPursuit on the training data and predict the test data.

    Prints the tag 'SRC' first, then returns the predictions for
    ``X_test_array`` from an estimator built with default settings.
    """
    # Parenthesised form prints the same text on Python 2 and Python 3; the
    # bare `print 'SRC'` statement is a SyntaxError on Python 3.
    print('SRC')
    src1 = OrthogonalMatchingPursuit()
    src1.fit(X_train_array, Y_train_array)
    predict = src1.predict(X_test_array)
    return predict
def train_error_data(n, J, x, y, train_size, nb_features, my_alphas):
    '''
    Plot the average training MSE against sparsity for OMP, Lasso and Lars.

    Parameters
    ----------
    n : number of repetitions (random train/test splits).
    J : number of sparsity levels (1 to J non-zero coefficients).
    x : data.
    y : desired output.
    train_size : number of training points.
    nb_features : number of features (forwarded to ``alpha``).
    my_alphas : array of different values for alpha (forwarded to ``alpha``).

    Returns
    -------
    None. The curves are drawn on the current matplotlib axes: MSE on the
    training points, averaged over the n repetitions, for each sparsity.
    '''
    # One row per repetition, one column per sparsity level.
    omp_mse = np.zeros((n, J))
    lasso_mse = np.zeros((n, J))
    lars_mse = np.zeros((n, J))

    # The x axis must follow J; it used to be fixed to np.arange(1, 11),
    # which made the plot calls fail for any J other than 10.
    axes = np.arange(1, J + 1)

    # Average training squared error : n iterations and sparsity (1 to J)
    for i in range(n):
        X_train, X_test, y_train, y_test = train_test_split(
            x, y, train_size=train_size)
        for j in range(J):
            # NOTE(review): `alpha` is called with identical arguments for
            # every j; it is left inside the loop because it cannot be
            # verified from here that the call is deterministic.
            alpha_coef = alpha(X_train, train_size=train_size,
                               nb_features=nb_features, my_alphas=my_alphas)
            lasso = Lasso(alpha=alpha_coef[j]).fit(X_train, y_train)
            omp = OrthogonalMatchingPursuit(n_nonzero_coefs=j + 1).fit(
                X_train, y_train)
            lars = Lars(n_nonzero_coefs=j + 1).fit(X_train, y_train)

            omp_mse[i, j] = np.sum(
                (y_train - omp.predict(X_train))**2) / train_size
            lasso_mse[i, j] = np.sum(
                (y_train - lasso.predict(X_train))**2) / train_size
            lars_mse[i, j] = np.sum(
                (y_train - lars.predict(X_train))**2) / train_size

    # plot the results (mean over the n repetitions)
    plt.plot(axes, omp_mse.sum(axis=0) / n, label='OMP')
    plt.plot(axes, lasso_mse.sum(axis=0) / n, label='Lasso')
    plt.plot(axes, lars_mse.sum(axis=0) / n, label='Lars')
    plt.xlabel('sparsity')
    plt.ylabel('train error')
    plt.title('Performance comparison on simulation data')
    plt.legend()
# Sum-of-squares report for the previously fitted elasticNetCV model on the
# full data set (X, Y); `xss` is defined outside this section.
tss, rss, ess, r2 = xss(Y, elasticNetCV.predict(X))
print "TSS(Total Sum of Squares): ", tss
print "RSS(Residual Sum of Squares): ", rss
print "ESS(Explained Sum of Squares): ", ess
print "R^2: ", r2

print "\n**********测试OrthogonalMatchingPursuit类**********"
# When constructing OrthogonalMatchingPursuit, specify n_nonzero_coefs; its
# default value is None.
omp = OrthogonalMatchingPursuit(n_nonzero_coefs=3)
# Fit on the training set.
omp.fit(train_X, train_Y)
# Print the model coefficients and intercept.
print "系数:", omp.coef_
print "截距:", omp.intercept_
print '训练集R2: ', r2_score(train_Y, omp.predict(train_X))

# For linear regression models, the mean squared error (MSE) or root mean
# squared error (RMSE) on the test set is generally used to judge how good
# the model is.
test_Y_pred = omp.predict(test_X)
print "测试集得分:", omp.score(test_X, test_Y)
print "测试集MSE:", mean_squared_error(test_Y, test_Y_pred)
print "测试集RMSE:", np.sqrt(mean_squared_error(test_Y, test_Y_pred))
print "测试集R2:", r2_score(test_Y, test_Y_pred)

# Same sum-of-squares report as above, now for the OMP model on the full
# data set; `tss`, `rss`, `ess` and `r2` are rebound.
tss, rss, ess, r2 = xss(Y, omp.predict(X))
print "TSS(Total Sum of Squares): ", tss
print "RSS(Residual Sum of Squares): ", rss
print "ESS(Explained Sum of Squares): ", ess
print "R^2: ", r2