Example #1
0
def _omp(*,
         train,
         test,
         x_predict=None,
         metrics,
         n_nonzero_coefs=None,
         tol=None,
         fit_intercept=True,
         normalize=True,
         precompute='auto'):
    """Fit an Orthogonal Matching Pursuit regressor and score it on ``test``.

    For more info visit :
        https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.OrthogonalMatchingPursuit.html#sklearn.linear_model.OrthogonalMatchingPursuit

    Args:
        train: indexable pair; ``train[0]`` is passed to ``fit`` as the
            features and ``train[1]`` as the targets.
        test: indexable pair; ``test[0]`` is predicted and compared to
            ``test[1]``.
        x_predict: optional extra samples to predict after scoring.
        metrics: one of ``'mse'``, ``'rmse'`` or ``'mae'``.
        n_nonzero_coefs, tol, fit_intercept, normalize, precompute:
            forwarded unchanged to ``OrthogonalMatchingPursuit``.

    Returns:
        Tuple ``(model_name, accuracy, y_predict)`` where ``y_predict`` is
        ``None`` when ``x_predict`` is ``None``.

    Raises:
        ValueError: if ``metrics`` is not a supported metric name.
    """
    # Fail fast: previously an unknown metric surfaced only after fitting,
    # as an UnboundLocalError on ``accuracy``.
    if metrics not in ('mse', 'rmse', 'mae'):
        raise ValueError(
            "metrics must be one of 'mse', 'rmse' or 'mae', got %r" %
            (metrics,))

    # NOTE(review): scikit-learn deprecated ``normalize`` in 1.0 and removed
    # it in 1.2 - confirm the pinned version still accepts this argument.
    model = OrthogonalMatchingPursuit(n_nonzero_coefs=n_nonzero_coefs,
                                      tol=tol,
                                      fit_intercept=fit_intercept,
                                      normalize=normalize,
                                      precompute=precompute)
    model.fit(train[0], train[1])
    model_name = 'OrthogonalMatchingPursuit'
    y_hat = model.predict(test[0])

    if metrics == 'mse':
        accuracy = _mse(test[1], y_hat)
    elif metrics == 'rmse':
        accuracy = _rmse(test[1], y_hat)
    else:
        accuracy = _mae(test[1], y_hat)

    y_predict = None if x_predict is None else model.predict(x_predict)
    return (model_name, accuracy, y_predict)
Example #2
0
    def fit_model_14(self,toWrite=False):
        """Fit OrthogonalMatchingPursuit on every CV fold and print its log loss.

        Each item of ``self.cv_data`` is unpacked as
        ``(X_train, X_test, Y_train, Y_test)``. The same estimator object is
        refitted on each fold, so the model that gets persisted is the one
        fitted on the last fold.

        Args:
            toWrite: when true, pickle the model to ``model14/model.pkl``.
        """
        model = OrthogonalMatchingPursuit()

        for X_train, X_test, Y_train, Y_test in self.cv_data:
            model.fit(X_train, Y_train)
            pred = model.predict(X_test)
            print("Model 14 score %f" % (logloss(Y_test, pred),))

        if toWrite:
            # Pickle output is binary: text mode ('w') raises TypeError on
            # Python 3. The context manager also closes the handle if dump fails.
            with open('model14/model.pkl', 'wb') as f2:
                pickle.dump(model, f2)
Example #3
0
def classify_OMP(train, test):
    """Fit OMP on vectorised training labels and return predictions for ``test``.

    ``train`` and ``test`` are each unpacked as a ``(features, labels)`` pair.
    Every training label is converted with ``tovec(label, n_classes)``, where
    ``n_classes`` is the number of distinct training labels (presumably a
    one-hot style encoding - confirm against ``tovec``). The test labels are
    not used.
    """
    from sklearn.linear_model import OrthogonalMatchingPursuit as OMP

    train_features, train_labels = train
    n_classes = np.unique(train_labels).shape[0]
    targets = [tovec(label, n_classes) for label in train_labels]

    regressor = OMP()
    regressor.fit(train_features, targets)

    test_features, _ = test
    return regressor.predict(test_features)
class _OrthogonalMatchingPursuitImpl:
    """Thin adapter that forwards ``fit`` / ``predict`` to a wrapped ``Op`` estimator."""

    def __init__(self, **hyperparams):
        # Keep the raw keyword arguments; the wrapped estimator is built from them.
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped estimator, omitting ``y`` when it is ``None``; returns ``self``."""
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def predict(self, X):
        """Return the wrapped estimator's predictions for ``X``."""
        wrapped = self._wrapped_model
        return wrapped.predict(X)
Example #5
0
def test_OMP():
    """
    Find the 3 best nodes in the set [0, 0.1, ..., 0.9, 1.0] and their weights
    using Orthogonal Matching Pursuit, then compare the fitted quadrature rule
    against Simpson's rule on fresh realizations.

    Training set: 100 realizations ``f = GPRealization(kernel)``; the features
    are ``f`` evaluated on the 11 grid nodes and the target is
    ``quad(f, 0, 1)[0]``. Results are printed, nothing is returned.
    """
    kernel = Matern(length_scale=0.8, nu=1.2)
    nodes = np.linspace(0, 1, 11)
    set_size = 100
    x = []
    y = []
    for _ in range(set_size):
        f = GPRealization(kernel)
        # Grid values first, integral second - same call order as before.
        x.append([f(num) for num in nodes])
        y.append(quad(f, 0, 1)[0])

    # build OMP model
    # Estimator parameters are keyword-only in current scikit-learn, so the
    # sparsity level must be passed by name.
    reg = OrthogonalMatchingPursuit(n_nonzero_coefs=3).fit(x, y)
    print(reg.coef_)
    print(reg.intercept_)

    # test against simpsons rule
    num_tests = 100
    reg_better = 0
    total_err_simps = 0.0
    total_err_reg = 0.0
    for _ in range(num_tests):
        f = GPRealization(kernel)
        data = [f(num) for num in nodes]
        int_reg = reg.predict([data])[0]
        int_simpsons = 1 / 6 * f(0) + 4 / 6 * f(.5) + 1 / 6 * f(1)
        int_true = quad(f, 0, 1)[0]
        err_reg = abs(int_reg - int_true)
        err_simps = abs(int_simpsons - int_true)
        total_err_simps += err_simps
        total_err_reg += err_reg
        if err_reg < err_simps:
            reg_better += 1

    print("The Regression Model was better in {} of {} cases".format(
        reg_better, num_tests))
    print("The average error of the Regression model was {}".format(
        total_err_reg / num_tests))
    print("The average error of the simpsons rule was {}".format(
        total_err_simps / num_tests))
Example #6
0
    def predict(self):
        """
        Train scikit-learn's OrthogonalMatchingPursuit and score it.

        The model is fitted on ``self.X_train`` / ``self.y_train`` and used to
        predict ``self.X_test``. The predictions (as a list) are compared with
        ``self.y_test`` through ``OneHotPredictor.get_accuracy``; the result is
        stored in ``self.acc`` and returned.
        """
        # NOTE(review): the original assigned ``n_nonzero_coefs = 17`` here but
        # never passed it to the estimator, so the default sparsity has always
        # been used. The dead local is removed; pass it explicitly if 17 was
        # the intended setting.
        algorithm = OrthogonalMatchingPursuit()
        algorithm.fit(self.X_train, self.y_train)
        y_pred = list(algorithm.predict(self.X_test))
        self.acc = OneHotPredictor.get_accuracy(y_pred, self.y_test)
        return self.acc
Example #7
0
def task2(data):
    """Fit ten regressors on price features and forecast 'Adj Close'.

    Features are 'Adj Close', 'Volume', the high-low spread and the
    open-close change (both in percent of the close / open price). The label
    is 'Adj Close' shifted ``forecast_out`` rows into the future, where
    ``forecast_out`` is 1% of the rows (rounded up). Each model's ``score``
    on a 20% hold-out split is printed, then every model forecasts the last
    ``forecast_out`` rows; the forecasts are stored on new rows dated one
    calendar day apart, starting the day after the last observed date.

    Args:
        data: DataFrame with 'Adj Close', 'Volume', 'High', 'Low', 'Close'
            and 'Open' columns. The index must support ``+ timedelta`` and
            ``strftime`` (assumed to be a DatetimeIndex - TODO confirm).

    Returns:
        Tuple of 12 lists: the formatted dates ('%Y-%m-%d'), the 'Adj Close'
        values, then one forecast list per model in the order reg, pol2,
        pol3, knn, las, byr, lar, omp, ard, sgd. Forecast columns are NaN on
        observed rows and 'Adj Close' is NaN on forecast rows.
    """
    df = data

    dfreg = df.loc[:, ['Adj Close', 'Volume']].copy()
    dfreg['HL_PCT'] = (df['High'] - df['Low']) / df['Close'] * 100.0
    dfreg['PCT_change'] = (df['Close'] - df['Open']) / df['Open'] * 100.0

    # Missing values are replaced by an outlier sentinel (rows are not dropped)
    dfreg.fillna(value=-99999, inplace=True)
    # We want to separate 1 percent of the data to forecast
    forecast_out = int(math.ceil(0.01 * len(dfreg)))
    # Separating the label here, we want to predict the AdjClose
    forecast_col = 'Adj Close'
    dfreg['label'] = dfreg[forecast_col].shift(-forecast_out)
    # ``columns=`` instead of a positional axis, which pandas 2 no longer accepts
    X = np.array(dfreg.drop(columns=['label']))
    # Scale the X so that everyone can have the same distribution for linear regression
    X = preprocessing.scale(X)
    # The last rows have no label yet: they are the ones to forecast
    X_lately = X[-forecast_out:]
    X = X[:-forecast_out]
    # Separate label and identify it as y
    y = np.array(dfreg['label'])[:-forecast_out]

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=0)

    models = _task2_fit_models(X_train, y_train)

    # Score every model first, then report, keeping the original output order
    confidences = [model.score(X_test, y_test) for _, _, model in models]
    for (_, message, _), confidence in zip(models, confidences):
        print(message, confidence * 100)

    forecasts = [(column, model.predict(X_lately))
                 for column, _, model in models]

    _task2_append_forecasts(dfreg, forecasts, len(X_lately))

    # Index.format(formatter=...) is deprecated; format each stamp directly
    dates = [stamp.strftime('%Y-%m-%d') for stamp in dfreg.index]
    return tuple([dates, dfreg['Adj Close'].to_list()] +
                 [dfreg[column].to_list() for column, _ in forecasts])


def _task2_fit_models(X_train, y_train):
    """Build and fit the regressors compared by ``task2``, in report order.

    Returns a list of ``(forecast_column, report_message, fitted_model)``.
    The MultiTaskLasso and LogisticRegression variants that were commented
    out in the original are not fitted.
    """
    models = [
        ('Forecast_reg', 'The linear regression confidence is:',
         LinearRegression(n_jobs=-1)),
        ('Forecast_pol2', 'The quadratic regression 2 confidence is:',
         make_pipeline(PolynomialFeatures(2), Ridge())),
        ('Forecast_pol3', 'The quadratic regression 3 confidence is:',
         make_pipeline(PolynomialFeatures(3), Ridge())),
        ('Forecast_knn', 'The knn regression confidence is:',
         KNeighborsRegressor(n_neighbors=2)),
        ('Forecast_las', 'The lasso regression confidence is:', Lasso()),
        ('Forecast_byr', 'The Bayesian Ridge regression confidence is:',
         BayesianRidge()),
        ('Forecast_lar', 'The Lasso LARS regression confidence is:',
         LassoLars(alpha=.1)),
        ('Forecast_omp', 'The OMP regression confidence is:',
         OrthogonalMatchingPursuit(n_nonzero_coefs=2)),
        ('Forecast_ard', 'The ARD regression confidence is:',
         ARDRegression(compute_score=True)),
        ('Forecast_sgd', 'The SGD regression confidence is:',
         SGDRegressor(random_state=0, max_iter=1000, tol=1e-3)),
    ]
    for _, _, model in models:
        model.fit(X_train, y_train)
    return models


def _task2_append_forecasts(dfreg, forecasts, n_forecast):
    """Append ``n_forecast`` daily rows to ``dfreg`` and fill in each forecast column.

    The start date is read once, before any row is appended. The original
    looked it up again with a hard-coded ``iloc[-26]``, which is only right
    when exactly 25 rows are forecast.
    """
    last_date = dfreg.index[-1]
    future_dates = [last_date + datetime.timedelta(days=offset)
                    for offset in range(1, n_forecast + 1)]

    for column, _ in forecasts:
        dfreg[column] = np.nan
    for next_date in future_dates:
        dfreg.loc[next_date] = [np.nan for _ in range(len(dfreg.columns))]
    for column, values in forecasts:
        for next_date, value in zip(future_dates, values):
            # Single .loc call: chained ``df[col].loc[...] = v`` may write to a copy
            dfreg.loc[next_date, column] = value
# Build the three regressors compared on the moving-average data
regrmavg, regomp, regsgd = (
    linear_model.LinearRegression(),
    OrthogonalMatchingPursuit(),
    linear_model.SGDRegressor(max_iter=1000, tol=1e-3),
)

# Train every model on the same training split (OMP, then linear, then SGD)
regomp.fit(mavg_date_train, mavg_train)
regrmavg.fit(mavg_date_train, mavg_train)
regsgd.fit(mavg_date_train, mavg_train)

# Predict on the testing set, in the order linear / OMP / SGD
mavg_pred, omp_pred, sgd_pred = (
    fitted.predict(mavg_date_test) for fitted in (regrmavg, regomp, regsgd)
)

# Fitted coefficients of the linear and OMP models (both lines share a label)
print('Coefficients: \n', regrmavg.coef_)
print('Coefficients: \n', regomp.coef_)

# Mean squared error of the linear model
print("Mov Avg mean squared error: %.2f" % mean_squared_error(mavg_test, mavg_pred))
# R^2 score of the linear model: 1 is perfect prediction
print('move avg Variance score: %.2f' % r2_score(mavg_test, mavg_pred))

print("omp Mov Avg mean squared error: %.2f" %
     sumsum = sumsum + (y[i] - y_test[i]) * (y[i] - y_test[i])
 rank_result['LassoLars_pca'] = sumsum / float(result_row)
 rs_score['LassoLars_pca'] = r2_score(y_test, y)
 # --- LassoLars on the X_*_std features ---
 LassoLarsModel = LassoLars()
 LassoLarsModel.fit(X_train_std, y_train)
 y = LassoLarsModel.predict(X_test_std)
 # Single-element unpacking: this only works when y is one-dimensional
 [result_row] = y.shape
 sumsum = 0
 #print y
 # Sum of squared errors; dividing by the row count below gives the mean squared error
 for i in range(result_row):
     sumsum = sumsum + (y[i] - y_test[i]) * (y[i] - y_test[i])
 rank_result['LassoLars_std'] = sumsum / float(result_row)
 rs_score['LassoLars_std'] = r2_score(y_test, y)
 # --- OrthogonalMatchingPursuit on the X_*_pca features ---
 ompModel = OrthogonalMatchingPursuit()
 ompModel.fit(X_train_pca, y_train)
 y = ompModel.predict(X_test_pca)
 [result_row] = y.shape
 sumsum = 0
 #print y
 for i in range(result_row):
     sumsum = sumsum + (y[i] - y_test[i]) * (y[i] - y_test[i])
 rank_result['OM_pca'] = sumsum / float(result_row)
 rs_score['OM_pca'] = r2_score(y_test, y)
 # --- OrthogonalMatchingPursuit on the X_*_std features ---
 ompModel = OrthogonalMatchingPursuit()
 ompModel.fit(X_train_std, y_train)
 y = ompModel.predict(X_test_std)
 [result_row] = y.shape
 sumsum = 0
 #print y
 # NOTE(review): the statements that store this last result are not visible in this chunk
 for i in range(result_row):
     sumsum = sumsum + (y[i] - y_test[i]) * (y[i] - y_test[i])
Example #10
0
    feature_selection = SelectKBest(f_classif, k=50)
    anova_svc = Pipeline([('anova', feature_selection), ('svc', clf)])
    anova_svc.fit(X_train, y_train[i, :])
    pipelines.append(anova_svc)

"""
"""
    f_classif 100 + Ridge
"""

from sklearn.linear_model import OrthogonalMatchingPursuit as OMP

# OMP limited to 20 non-zero coefficients; fit on the training split and
# keep the test-split predictions in y_pred.
clf = OMP(n_nonzero_coefs=20)
y_pred = clf.fit(X_train, y_train).predict(X_test)

"""
clf.fit(X_train, y_train_tall.T)
y_pred_tall = clf.predict(X_test)

clf.fit(X_train, y_train_large.T)
y_pred_large = clf.predict(X_test)

clf.fit(X_train, y_train_big.T)
y_pred_big = clf.predict(X_test)
"""



"""
Example #11
0
print 'MAE:', mean_absolute_error(testing_labels,preds), '\n'
 
# PCA + LARS
# Fit LARS on the reduced_* features (PCA output, per the section title) and
# report the R2 score and mean absolute error on the reduced test features.
lars = Lars()
lars.fit(reduced_training_features, training_labels)
preds = lars.predict(reduced_testing_features)
score = lars.score(reduced_testing_features,testing_labels)
print 'PCA + LARS Results:'
print 'R2 score:', score
print 'MAE:', mean_absolute_error(testing_labels,preds)
 
# Orthogonal Matching Pursuit
# Same report on the full (non-reduced) features; `preds` and `score` are
# overwritten by each section in turn.
from sklearn.linear_model import OrthogonalMatchingPursuit
omp = OrthogonalMatchingPursuit()
omp.fit(training_features, training_labels)
preds = omp.predict(testing_features)
score = omp.score(testing_features,testing_labels)
print 'Orthogonal Matching Pursuit Regression Results:'
print 'R2 score:', score
print 'MAE:', mean_absolute_error(testing_labels,preds), '\n'
 
# PCA + Orthogonal Matching Pursuit
# A fresh estimator fitted on the reduced_* features; rebinds `omp`.
omp = OrthogonalMatchingPursuit()
omp.fit(reduced_training_features, training_labels)
preds = omp.predict(reduced_testing_features)
score = omp.score(reduced_testing_features,testing_labels)
print 'PCA + Orthogonal Matching Pursuit Results:'
print 'R2 score:', score
print 'MAE:', mean_absolute_error(testing_labels,preds)
 
# Bayesian Ridge Regression
    def _evaluate(self, datasets, **kwargs):
        """
          Main method of PCM. It collects the response values from Feature and Target models,
          and Measurements from experiment, maps the biases and uncertainties from Feature to
          Target side, and calculates the uncertainty reduction fraction using Feature to
          validate Target.
          @ In, datasets, list, list of datasets (data1,data2,etc.) to be used.
          @ In, kwargs, dict, keyword arguments; 'dataobjectNames' is read from it
          @ Out, outputDict, dict, dictionary containing the results {"pri_post_stdReduct_<targName>":value}
        """
        # None when the caller did not supply 'dataobjectNames'
        names = kwargs.get('dataobjectNames')
        outputDict = {}

        # Create empty list for multiple Exp responses
        featData = []
        msrData = []
        featPW = []
        msrPW = []
        # Features and measurements are paired positionally
        for feat, msr in zip(self.features, self.measurements):
            featDataProb = self._getDataFromDataDict(datasets, feat, names)
            msrDataProb = self._getDataFromDataDict(datasets, msr, names)
            # M>=1 Feature arrays (1D) to 2D array with dimension (N, M)
            featData.append(featDataProb[0].flatten())
            msrData.append(msrDataProb[0].flatten())
            # Probability Weights for future use
            featPW.append(featDataProb[1])
            msrPW.append(msrDataProb[1])
        # *Data of size (num_of_samples, num_of_features)
        featData = np.array(featData).T
        msrData = np.array(msrData).T
        # The probability weights are collected but not used further in this method
        featPW = np.array(featPW).T
        msrPW = np.array(msrPW).T

        # Array copies of the feature (experiment) and measurement samples
        yExp = np.array(featData)
        yMsr = np.array(msrData)
        # Reference values of Experiments, yExpRef in M
        # Sample mean as reference value for simplicity
        # Can be user-defined in the future
        yExpRef = np.mean(yExp, axis=0)
        # Usually the reference value is given,
        # and will not be zero, e.g. reference fuel temperature.
        # Standardization: relative deviation from the experiment reference.
        # Measurements are standardized with the experiment reference as well.
        # NOTE: a zero reference value would divide by zero here.
        yExpStd = (yExp - yExpRef) / yExpRef
        yMsrStd = (yMsr - yExpRef) / yExpRef

        # For each Target/Application model/response, calculate an uncertainty reduction fraction
        # using all available Features/Experiments
        for targ in self.targets:
            targDataProb = self._getDataFromDataDict(datasets, targ, names)
            # Data values in <x>Data, <x>=targ, feat, msr
            targData = targDataProb[0]
            # Probability Weights values in <x>PW, <x>=targ, feat, msr (not used below)
            targPW = targDataProb[1]

            # Application responses yApp in Nx1
            yApp = np.array(targData)
            # Reference values of Application, yAppRef is a scalar
            yAppRef = np.mean(yApp)
            # Standardization
            yAppStd = (yApp - yAppRef) / yAppRef

            # NOTE(review): if there are no features (shape[1] == 0) neither branch
            # below runs and yExpReg / yMsrReg are left undefined.
            # Single Experiment response
            if yExpStd.shape[1] == 1:
                yExpReg = yExpStd.flatten()
                yMsrReg = yMsrStd.flatten()
            # Pseudo response of multiple Experiment responses
            # OrthogonalMatchingPursuit from sklearn used here
            # Possibly change to other regressors
            elif yExpStd.shape[1] > 1:
                regrExp = OrthogonalMatchingPursuit(fit_intercept=False).fit(
                    yExpStd, yAppStd)
                yExpReg = regrExp.predict(yExpStd)
                # Combine measurements by multiple Experiment regression
                yMsrReg = regrExp.predict(yMsrStd)

            # Measurement PDF with KDE
            knlMsr = stats.gaussian_kde(yMsrReg)

            # KDE for joint PDF between Exp and App
            m1 = yExpReg[:]
            m2 = yAppStd.flatten()
            xmin = m1.min()
            xmax = m1.max()
            ymin = m2.min()
            ymax = m2.max()
            # Grid of Experiment (X), grid of Application (Y)
            # self.binKDE is used as the slice step; np.mgrid treats a complex
            # step as a number of points (NOTE(review): confirm which form binKDE holds)
            X, Y = np.mgrid[xmin:xmax:self.binKDE, ymin:ymax:self.binKDE]
            psts = np.vstack([X.ravel(), Y.ravel()])
            vals = np.vstack([m1, m2])
            # Measurement PDF over Exp range
            pdfMsr = knlMsr(X[:, 0])

            # Condition number of the stacked experiment/application sample matrix
            condNum = np.linalg.cond(vals)
            # Threshold at or above which the joint KDE is skipped
            invErr = 100
            # An ill-conditioned sample matrix is handled by the fallback branch
            if condNum >= invErr:
                # If (near-)singular, the measurement PDF of the Experiment is
                # directly transferred as the predicted Application PDF
                pdfAppPred = knlMsr(Y[0, :])
            else:
                # If not, KDE of Experiment and Application
                knl = stats.gaussian_kde(vals)
                # Joint PDF of Experiment and Application
                Z = np.reshape(knl(psts).T, X.shape)
                # yAppPred by integrating p(yexp, yapp)p(ymsr) over [yexp.min(), yexp.max()]
                pdfAppPred = np.dot(Z, pdfMsr.reshape(pdfMsr.shape[0], 1))

            # Normalized PDF of predicted application: dividing by the sum and by
            # the grid spacing makes the discrete integral over Y equal to one
            pdfAppPredNorm = pdfAppPred.flatten() / pdfAppPred.sum() / np.diff(
                Y[0, :])[0]

            # Calculate Expectation (average value) of predicted application
            # by integrating xf(x), where f(x) is PDF of x
            predMean = 0.0
            for i in range(len(Y[0, :])):
                predMean += Y[0, i] * pdfAppPredNorm[i] * (Y[0, 1] - Y[0, 0])

            # Calculate Variance of predicted application
            # by integrating (x-mu_x)^2f(x), where f(x) is PDF of x
            predVar = 0.0
            for i in range(len(Y[0, :])):
                predVar += (Y[0, i] - predMean)**2.0 * pdfAppPredNorm[i] * (
                    Y[0, 1] - Y[0, 0])

            # Predicted standard deviation is square root of variance
            predStd = np.sqrt(predVar)
            # Prior standard deviation is the sample standard deviation
            # Consider probability weights in the future
            priStd = np.std(yAppStd)
            # Uncertainty reduction fraction is 1.0-sigma_pred/sigma_pri
            # The result key uses the part of the target name after the last '|'
            name = "pri_post_stdReduct_" + targ.split('|')[-1]
            outputDict[name] = (1.0 - predStd / priStd)

        return outputDict
Example #13
0
class InfluenzaNetwork:
    """Regression harness over per-year, per-county influenza data.

    The constructor loads ``influenza_data_by_year_by_county.csv`` from the
    current working directory. Each ``train*`` method draws a fresh random
    train/test split, builds one scikit-learn model and fits it; the
    ``testModel_*`` methods evaluate the most recently trained model.
    """

    def __init__(self, fields, testPercentage):
        """
        fields: names of the CSV columns used as model inputs; ``None``
            selects the default list of "EP_*" columns below.
        testPercentage: fraction of samples held out for testing; ``None``
            selects 0.20.
        """
        self.data = self.getDataFromFile("influenza_data_by_year_by_county.csv")
        self.fields = fields
        self.model = None
        self.trainingInput = None
        self.trainingOutput = None
        self.trainingInfo = None
        self.testInput = None
        self.testOutput = None
        self.testInfo = None
        self.testPercentage = testPercentage
        if self.fields is None:
            self.fields = ["EP_POV", "EP_UNEMP", "EP_PCI", "EP_NOHSDP", "EP_AGE65", "EP_AGE17", "EP_DISABL", "EP_SNGPNT", "EP_MINRTY", "EP_LIMENG", "EP_MUNIT", "EP_MOBILE", "EP_CROWD", "EP_NOVEH", "EP_GROUPQ", "EP_UNINSUR"]
        if self.testPercentage is None:
            self.testPercentage = 0.20

    def getDataFromFile(self, fileName):
        '''
        getDataFromFile(): Parses the CSV file and returns a dictionary in the
        form {year : {county: {field: float, ...} } }. The caller stores it;
        this method does not set "self.data" itself.
        fileName: Path to "influenza_data_by_year_by_county.csv"
        The header must contain "Year" and "County"; every other column is
        converted with float(). Empty rows are skipped.
        '''
        yearSet = {}
        # newline='' is what the csv module documents for files given to a reader
        with open(fileName, 'r', newline='') as rp:
            csvreader = csv.reader(rp)
            header = next(csvreader)
            # Column name -> position; the first occurrence wins for repeated names
            fieldDictionary = {}
            for position, fieldName in enumerate(header):
                fieldDictionary.setdefault(fieldName, position)
            for row in csvreader:
                if not row:
                    continue
                year = row[fieldDictionary["Year"]]
                county = row[fieldDictionary["County"]]
                countyRecord = yearSet.setdefault(year, {}).setdefault(county, {})
                for parsedField, position in fieldDictionary.items():
                    if parsedField in ("Year", "County"):
                        continue
                    countyRecord[parsedField] = float(row[position])
        return yearSet

    def getIOFromData(self, testPercentage):
        '''
        Assumes existence of self.data formatted as "{year : {county: {...} } }"
        Builds one sample per (year, county): the inputs are the values of
        self.fields, the output is "Percent" and the metadata is
        (year, county, "Population"). The samples are then split at random
        and stored in self.training* / self.test*.
        testPercentage: fraction held out for testing; values outside [0, 1]
        fall back to 0.20.
        '''
        inputList, outputList, trainingInput, trainingOutput = [], [], [], []
        IOMetadata, trainingMetadata = [], []
        for yearKey, counties in self.data.items():
            for countyKey, record in counties.items():
                outputList.append(record["Percent"])
                inputList.append([record[field] for field in self.fields])
                IOMetadata.append((yearKey, countyKey, record["Population"]))

        # Split into test and training sets based on "testPercentage"
        if testPercentage > 1 or testPercentage < 0:
            testPercentage = 0.20
        trainingSplit = int(float(len(inputList)) * (1-testPercentage))
        # Move randomly chosen samples into the training set; whatever is
        # left in the three parallel lists becomes the test set.
        while len(trainingInput) < trainingSplit:
            randomPos = random.randint(0, len(inputList)-1)
            trainingInput.append(inputList.pop(randomPos))
            trainingOutput.append(outputList.pop(randomPos))
            trainingMetadata.append(IOMetadata.pop(randomPos))
        self.trainingInput = np.array(trainingInput)
        self.trainingOutput = np.array(trainingOutput)
        self.testInput = np.array(inputList)
        self.testOutput = np.array(outputList)
        self.trainingInfo = trainingMetadata
        self.testInfo = IOMetadata

    def _fitModel(self, model):
        '''Draw a fresh train/test split, then store and fit ``model`` on the training part.'''
        self.getIOFromData(self.testPercentage)
        self.model = model
        self.model.fit(self.trainingInput, self.trainingOutput)

    def trainLinearElasticNet(self, alpha, l1):
        self._fitModel(ElasticNet(alpha=alpha, l1_ratio=l1))

    def trainLinearRegression(self):
        self._fitModel(LinearRegression())

    def trainSVRLinear(self, cValue, gammaValue):
        self._fitModel(SVR(kernel='linear', C=cValue, gamma=gammaValue))

    def trainSVRRadial(self, cValue, gammaValue, epsilonValue):
        self._fitModel(SVR(kernel='rbf', C=cValue, gamma=gammaValue, epsilon=epsilonValue))

    def trainLinearRidge(self, alpha, fit_intercept):
        self._fitModel(Ridge(alpha=alpha, fit_intercept=fit_intercept))

    def trainLars(self):
        self._fitModel(Lars())

    def trainLinearOrthogonalMatchingPursuit(self):
        self._fitModel(OrthogonalMatchingPursuit())

    def trainMLPRegressor(self, layerSizes, tolerance, max_iterations, activationFunction='relu'):
        self._fitModel(make_pipeline(StandardScaler(), MLPRegressor(hidden_layer_sizes=layerSizes, tol=tolerance, max_iter=max_iterations, random_state=0, activation=activationFunction)))

    def testModel_statistics(self):
        '''Return the mean absolute percent error of the model on the test set.'''
        results = self.model.predict(self.testInput)
        # NOTE: a test output of exactly 0 makes its percent error a division by zero
        percentErrors = [abs((actual - predicted) / actual) * 100
                         for actual, predicted in zip(self.testOutput, results)]
        return statistics.mean(percentErrors)

    def testModel_output(self):
        '''Return the model predictions for the stored test inputs.'''
        return self.model.predict(self.testInput)

    def testModel_custom(self, customInputList):
        '''Return the model predictions for caller-supplied inputs.'''
        return self.model.predict(customInputList)

    # Static Methods to load/dump models from/into files
    @staticmethod
    def importModel(filename):
        '''
        Load a pickled InfluenzaNetwork and copy its state into a new instance.
        Constructing the new instance re-reads the CSV file from the current
        working directory.
        SECURITY: pickle.load can execute arbitrary code; only load files
        from a trusted source.
        '''
        with open(filename, 'rb') as fp:
            model = pickle.load(fp)
        toReturn = InfluenzaNetwork(model.fields, model.testPercentage)
        toReturn.model = model.model
        toReturn.trainingInput = model.trainingInput
        toReturn.trainingOutput = model.trainingOutput
        toReturn.testInput = model.testInput
        toReturn.testOutput = model.testOutput
        # Metadata attributes may be missing on some pickles, hence the checks
        if hasattr(model, 'trainingInfo'):
            toReturn.trainingInfo = model.trainingInfo
        if hasattr(model, 'testInfo'):
            toReturn.testInfo = model.testInfo
        return toReturn

    @staticmethod
    def exportModel(influenzaNetworkInstance, filename=None):
        '''
        Pickle the given instance. When filename is missing, empty or does
        not contain ".pickle", a timestamped "<YYYYmmddHHMMSS>_model.pickle"
        name is used instead.
        '''
        if not filename or ".pickle" not in filename:
            filename = datetime.datetime.now().strftime("%Y%m%d%H%M%S") + "_model.pickle"
        with open(filename, 'wb') as wp:
            pickle.dump(influenzaNetworkInstance, wp, protocol=pickle.HIGHEST_PROTOCOL)
Example #14
0
def mySRC(X_train_array, Y_train_array, X_test_array):
    """Fit OrthogonalMatchingPursuit on the training arrays and predict the test array.

    Prints the tag 'SRC' first. Returns whatever ``predict`` returns for
    ``X_test_array``.
    """
    # Parenthesised form prints the same text on Python 2 and Python 3;
    # the bare print statement is a SyntaxError on Python 3.
    print('SRC')
    src1 = OrthogonalMatchingPursuit()
    src1.fit(X_train_array, Y_train_array)
    predict = src1.predict(X_test_array)
    return predict
Example #15
0
def train_error_data(n, J, x, y, train_size, nb_features, my_alphas):
    '''
    Plot the average training MSE against sparsity for OMP, Lasso and Lars.

    Parameters
    ----------
    n : number of repetitions.
    J : number of sparsity levels (1 to J).
    x : data.
    y : desired output.
    train_size : number of training points.
    nb_features : number of features.
    my_alphas : array of different values for alpha.

    Returns
    -------
    None. The three curves (MSE on the training points, averaged over the
    n repetitions) are drawn on the current matplotlib figure.

    '''
    # Per-sample squared errors: one column per sparsity level
    vec = np.zeros((train_size, J))    # OMP
    vec2 = np.zeros((train_size, J))   # Lasso
    vec3 = np.zeros((train_size, J))   # Lars
    # Training MSE: one row per repetition, one column per sparsity level
    res = np.zeros((n, J))
    res2 = np.zeros((n, J))
    res3 = np.zeros((n, J))
    # x axis follows J; it used to be fixed to 1..10, which broke plotting
    # for any other J
    axes = np.arange(1, J + 1)

    # Average training squared error : n iterations and sparsity (1 to J)
    for i in range(n):
        X_train, _, y_train, _ = train_test_split(x, y, train_size=train_size)
        for j in range(J):
            # NOTE(review): none of these arguments depends on j; if ``alpha``
            # is deterministic this call can be hoisted out of the inner loop.
            alpha_coef = alpha(X_train, train_size=train_size, nb_features=nb_features,
                               my_alphas=my_alphas)
            reg2 = Lasso(alpha=alpha_coef[j]).fit(X_train, y_train)
            reg = OrthogonalMatchingPursuit(n_nonzero_coefs=j + 1).fit(X_train, y_train)
            reg3 = Lars(n_nonzero_coefs=j + 1).fit(X_train, y_train)
            vec[:, j] = (y_train - reg.predict(X_train))**2
            res[i, j] = np.sum(vec[:, j]) / train_size
            vec2[:, j] = (y_train - reg2.predict(X_train))**2
            res2[i, j] = np.sum(vec2[:, j]) / train_size
            vec3[:, j] = (y_train - reg3.predict(X_train))**2
            res3[i, j] = np.sum(vec3[:, j]) / train_size

    # plot the results, averaged over the n repetitions
    plt.plot(axes, res.sum(axis=0) / n, label='OMP')
    plt.plot(axes, res2.sum(axis=0) / n, label='Lasso')
    plt.plot(axes, res3.sum(axis=0) / n, label='Lars')

    plt.xlabel('sparsity')
    plt.ylabel('train error')
    plt.title('Performance comparison on simulation data')
    plt.legend()
Example #16
0
    tss, rss, ess, r2 = xss(Y, elasticNetCV.predict(X))
    print "TSS(Total Sum of Squares): ", tss
    print "RSS(Residual Sum of Squares): ", rss
    print "ESS(Explained Sum of Squares): ", ess
    print "R^2: ", r2

    print "\n**********测试OrthogonalMatchingPursuit类**********"
    # When constructing OrthogonalMatchingPursuit, set the n_nonzero_coefs parameter (its default is None).
    omp = OrthogonalMatchingPursuit(n_nonzero_coefs=3)
    # Fit on the training set
    omp.fit(train_X, train_Y)
    # Print the model coefficients, the intercept and the R2 on the training set
    print "系数:", omp.coef_
    print "截距:", omp.intercept_
    print '训练集R2: ', r2_score(train_Y, omp.predict(train_X))

    # For linear regression models, the mean squared error (MSE) or the
    # root mean squared error (RMSE) on the test set is usually used to judge model quality.
    test_Y_pred = omp.predict(test_X)
    print "测试集得分:", omp.score(test_X, test_Y)
    print "测试集MSE:", mean_squared_error(test_Y, test_Y_pred)
    print "测试集RMSE:", np.sqrt(mean_squared_error(test_Y, test_Y_pred))
    print "测试集R2:", r2_score(test_Y, test_Y_pred)

    # Sum-of-squares breakdown computed by xss on X / Y
    tss, rss, ess, r2 = xss(Y, omp.predict(X))
    print "TSS(Total Sum of Squares): ", tss
    print "RSS(Residual Sum of Squares): ", rss
    print "ESS(Explained Sum of Squares): ", ess
    print "R^2: ", r2