Example 1
def create_model_LARS(state_matrix, transcription_factors):
    regulators = {}

    for i in range(len(transcription_factors)):
        #Declaration for training set for the Target Gene
        X = []
        y = []

        for j in range(1, len(state_matrix)):
            X.append(state_matrix[j - 1].tolist())
            y.append(state_matrix[j][i] - state_matrix[j - 1][i])

        #Initialise the LARS Model
        lars = Lars()

        #Fit the training data into the Model
        lars.fit(X, y)

        #Extract the important features corresponding to a particular gene
        coefficients = lars.coef_

        #Add to the dictionary
        regulators[transcription_factors[i]] = coefficients

    return regulators
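A minimal usage sketch for the function above (the toy data below is hypothetical, and assumes state_matrix is a NumPy array whose rows are successive expression states and whose columns are transcription factors):

import numpy as np
from sklearn.linear_model import Lars

# Hypothetical toy data: 5 time points x 3 transcription factors
state_matrix = np.random.rand(5, 3)
transcription_factors = ["TF_A", "TF_B", "TF_C"]

regulators = create_model_LARS(state_matrix, transcription_factors)
for tf, coefs in regulators.items():
    print(tf, coefs)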
Example 2
    def fit(self, X, y):
        assert y is not None, f'y:{y}'
        k = X.shape[1]
        self.k_ = k
        if self.max_k is None:
            if self.k_share is None:
                self.max_k = 500
            else:
                self.max_k = int(k * self.k_share)

        if self.selector is None:
            self.selector = 'Lars'
        if self.selector == 'Lars':
            selector = Lars(fit_intercept=True, normalize=True, n_nonzero_coefs=self.max_k)
        elif self.selector == 'elastic-net':
            selector = ElasticNet(fit_intercept=True, selection='random', tol=0.001, max_iter=5000, warm_start=True,
                                  random_state=0)
        else:
            selector = self.selector

        selector.fit(X, y)
        self.col_select_ = np.arange(k)[np.abs(selector.coef_) > 0.0001]
        if self.col_select_.size < 1:
            self.col_select_ = np.arange(1)
        return self
Example 3
def LarsRegressorGS(X_train, X_test, y_train, y_test):
    reg = Lars()
    grid_values = {
        'n_nonzero_coefs': list(range(100, 500, 100)),
    }
    grid_reg = GridSearchCV(
        reg,
        param_grid=grid_values,
        scoring=['neg_mean_squared_error', 'neg_mean_absolute_error', 'r2'],
        refit='r2',
        n_jobs=-1,
        cv=2,
        verbose=100)
    grid_reg.fit(X_train, y_train)
    reg = grid_reg.best_estimator_
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)
    printMetrics(y_true=y_test, y_pred=y_pred)

    val_metrics = getMetrics(y_true=y_test, y_pred=y_pred)
    y_pred = reg.predict(X=X_train)
    metrics = getMetrics(y_true=y_train, y_pred=y_pred)

    printMetrics(y_true=y_train, y_pred=y_pred)

    best_params: dict = grid_reg.best_params_
    saveBestParams(nameOfModel="LarsRegressorGS", best_params=best_params)
    logSave(nameOfModel="LarsRegressorGS",
            reg=reg,
            metrics=metrics,
            val_metrics=val_metrics)
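The helpers printMetrics, getMetrics, saveBestParams and logSave are not part of the snippet above; hypothetical minimal stand-ins along these lines would make it runnable:

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Hypothetical stand-ins for the helpers referenced by LarsRegressorGS
def getMetrics(y_true, y_pred):
    return {"mse": mean_squared_error(y_true, y_pred),
            "mae": mean_absolute_error(y_true, y_pred),
            "r2": r2_score(y_true, y_pred)}

def printMetrics(y_true, y_pred):
    print(getMetrics(y_true, y_pred))

def saveBestParams(nameOfModel, best_params):
    print(nameOfModel, best_params)

def logSave(nameOfModel, reg, metrics, val_metrics):
    print(nameOfModel, metrics, val_metrics)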
Example 4
    def runLarsRegressor(self):
        lm = Lars(fit_intercept=True, normalize=True)

        print("Lars Regressor\n")
        lm.fit(self.m_X_train, self.m_y_train)
        predictY = lm.predict(self.m_X_test)
        score = lm.score(self.m_X_test, self.m_y_test)
        predictTraingY = lm.predict(self.m_X_train)

        self.displayPredictPlot(predictY)
        self.displayResidualPlot(predictY, predictTraingY)
        self.dispalyModelResult(lm, predictY, score)
Example 5
    def run(self, X, y=None):
        """
            Fits filter

            Parameters
            ----------
            X : numpy array, shape (n_samples, n_features)
                The training input samples.
            y : numpy array, optional
                The target values (ignored).

            Returns
            ----------
            W : array-like, shape (n_features, k)
                Feature weight matrix.

            See Also
            --------

            Examples
            --------
            >>> from ITMO_FS.filters.sparse import MCFS
            >>> from sklearn.datasets import make_classification
            >>> import numpy as np
            >>> dataset = make_classification(n_samples=100, n_features=20, n_informative=4, n_redundant=0, shuffle=False)
            >>> data, target = np.array(dataset[0]), np.array(dataset[1])
            >>> model = MCFS(d=5, k=2, scheme='heat')
            >>> weights = model.run(data, target)
            >>> print(model.feature_ranking(weights))

        """
        n_samples, n_features = X.shape
        graph = NearestNeighbors(n_neighbors=self.p + 1, algorithm='ball_tree').fit(X).kneighbors_graph(X).toarray()
        graph = graph + graph.T

        indices = [[(i, j) for j in range(n_samples)] for i in range(n_samples)]
        func = np.vectorize(lambda xy: graph[xy[0]][xy[1]] * self.scheme(X[xy[0]], X[xy[1]]), signature='(1)->()')
        W = func(indices)

        D = np.diag(W.sum(axis=0))
        L = D - W
        eigvals, Y = eigh(type=1, a=L, b=D, eigvals=(0, self.k - 1))

        weights = np.zeros((n_features, self.k))
        for i in range(self.k):
            clf = Lars(n_nonzero_coefs=self.d)
            clf.fit(X, Y[:, i])
            weights[:, i] = clf.coef_

        return weights
Example 6
class _LarsImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)
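A minimal usage sketch for the wrapper above, assuming Op is bound to sklearn's Lars estimator (the toy data is hypothetical):

import numpy as np
from sklearn.linear_model import Lars as Op

X = np.random.rand(20, 5)
y = X @ np.array([1.0, 0.0, 2.0, 0.0, 0.0])

model = _LarsImpl(n_nonzero_coefs=2).fit(X, y)
print(model.predict(X[:3]))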
Example 7
def LarsRegressor(X_train, X_test, y_train, y_test):
    reg = Lars()
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)

    printMetrics(y_true=y_test, y_pred=y_pred)
    val_metrics = getMetrics(y_true=y_test, y_pred=y_pred)
    y_pred = reg.predict(X=X_train)
    metrics = getMetrics(y_true=y_train, y_pred=y_pred)

    printMetrics(y_true=y_train, y_pred=y_pred)

    logSave(nameOfModel="LarsRegressor",
            reg=reg,
            metrics=metrics,
            val_metrics=val_metrics)
Example 8
def perform_LARS(normalized_matrix,genes):
	#Number of Genes
	no_genes = len(genes)

	#Dictionary for top regulators for each gene
	regulators = {}
    
	for i in range(0,no_genes):
		#Current Gene for which the Top Regulators are being found
		current_y = normalized_matrix[:,i]

		#Create a copy of the matrix
		temp_matrix = normalized_matrix.copy()

		#Remove the current feature
		temp_matrix = np.delete(temp_matrix,i,axis=1)		

		#Computation of the coefficients after training with Least Angle Regression Method
		coefficients = Lars()

		#Fit the Model
		coefficients.fit(temp_matrix,current_y)

		#Coefficient values
		coeff_values = coefficients.coef_

		#Copy the genes into a temporary list
		gene_copy = list(genes)

		#Remove the Gene to create the appropriate indexes
		gene_copy.remove(genes[i])
        
        #Perform Stability Selection to get an effective rank of the top regulators
		rank_dict_score = stability_selection(temp_matrix,genes,2000,current_y,gene_copy)

		#Top Regulators
		top_regulators = find_top_regulators(rank_dict_score)

		#Append to regulators
		regulators[genes[i]] = top_regulators	


	return regulators
Example 9
def create_model_LARS(state_matrix, transcription_factors):
    regulators = {}

    for i in range(0, len(transcription_factors)):
        #Create the training set
        X = []
        y = []
        for j in range(1, len(state_matrix)):
            #Append the expression level of the previous step
            X.append(state_matrix[j - 1].tolist())

            #The output value is the difference / rate of change of expression
            y.append(state_matrix[j][i] - state_matrix[j - 1][i])

        #Copy the list of Transcription Factors
        tf_copy = list(transcription_factors)

        #Remove the current transcription factor
        tf_copy.pop(i)

        #Remove the corresponding column from the training set
        for expression in X:
            expression.pop(i)
        """ Feature Selection using Least Angle Regression """

        #Initialise the model using Least Angle Regression
        lars = Lars()

        #Fit the training data into the Model
        lars.fit(X, y)

        #Extract the important features corresponding to a particular gene
        coefficients = lars.coef_

        #Regulators for the Network
        regulators[transcription_factors[i]] = coefficients

    return regulators
Example 10
def main():
    from_root = "~/Documents/School/ComputerScience/ahcompsci/Scikit-Learning-StanleyWei/scikit-utkproject/dataset/fiftytwo"
    path = "dataset/whitemensmall/"
    dirs = os.listdir(path)
    main_df = add_images_from_dirs(dirs, path)

    # Split the whole dataframe so the image, gender and age columns stay aligned
    train_df, test_df = train_test_split(main_df)

    # train_df = train_df.loc[train_df['ethnicity'] == "0"]
    # test_df = test_df.loc[test_df['ethnicity'] == "0"]
    train_x = flatten_image_df(train_df.loc[:, "image"])
    test_x = flatten_image_df(test_df.loc[:, "image"])

    clf = Lars()
    # train_x = np.array(train_df.loc[:, "image"])
    # x_train = train_x.flatten().reshape(len(train_df), -1)
    clf.fit(train_x, train_df.loc[:, "age"].to_numpy())

    coefficients = clf.coef_
    # print(coefficients)
    coefficients_array = np.array(coefficients).reshape(
        len(train_df.image[0]), -1)
    # print(coefficients_array)
    # heatmap = plt.imshow(coefficients_array, cmap = "hot", interpolation = "nearest")
    coefficients_abs = np.abs(coefficients)
    coefficients_array_abs = np.array(coefficients_abs).reshape(
        len(train_df.image[0]), -1)
    heatmap = plt.imshow(coefficients_array_abs,
                         cmap="hot",
                         interpolation="nearest")
    # heatmap_extremes = plt.imshow(coefficients_array_abs, vmax = 0.025, cmap = "hot", interpolation = "nearest")
    plt.colorbar(heatmap)
    # plt.colorbar(heatmap_extremes)
    plt.show()
Example 11
# LARS Regression
import numpy as np
from sklearn import datasets
from sklearn.linear_model import Lars
# load the diabetes datasets
dataset = datasets.load_diabetes()
# fit a LARS model to the data
model = Lars()
model.fit(dataset.data, dataset.target)
print(model)
# make predictions
expected = dataset.target
predicted = model.predict(dataset.data)
# summarize the fit of the model
mse = np.mean((predicted-expected)**2)
print(mse)
print(model.score(dataset.data, dataset.target))
Example 12
    cdg = CDG.CollinearDataGenerator(p = 20,sparsity=.8)
    X = cdg.getX(n)
    p = X.shape[1]
    y = cdg.getY(X)

    print(cdg.gamma)

    val_size = int(0.1 * X.shape[0])
    X_val = X[0:val_size,:]
    y_val = y[0:val_size,:]
    X_train = X[val_size:,:]
    y_train = y[val_size:,:]

    lars = Lars(n_nonzero_coefs=2)
    lars.fit(X,y)
    # print lars.coef_

    alphas, order, coefs = lars_path(X,y.T[0],verbose=True)
    # print alphas
    print(order)
    magnitudes = sorted(list(enumerate(coefs[:,-1])),key=lambda x: x[1])
    magnitudes = [x[0] for x in magnitudes]
    print(magnitudes)
    # print coefs
    quantities = coefs[:,-1]
    quantities = np.array([quantities[i] for i in order])
    # print quantities
    total = sum(abs(quantities))
    # # print total
    cumsum = np.cumsum(np.abs(quantities))
Example 13
def larsLearn(kap):
    lars = Lars(n_nonzero_coefs=kap, fit_intercept=False)
    lars.fit(X_train, y_train)
    return lars
Example 14
model = sm.OLS(housing['labels'], housing['data'])

results = model.fit()

print(results.summary())

# Part B
preds_train = lin.predict(housing['data'])
preds_test = lin.predict(housing['testdata'])

ave_sq_loss_train = ((housing['labels'] - preds_train) ** 2).sum()/len(housing['data'][:,1])

ave_sq_loss_test = ((housing['testlabels'] - preds_test) ** 2).sum()/len(housing['testdata'][:,1])

print(ave_sq_loss_train)
print(ave_sq_loss_test)

# Part C
housing['data'] = housing['data'][:,1:14]
housing['testdata'] = housing['testdata'][:,1:14]

from sklearn.linear_model import Lars

reduced = Lars(fit_intercept = True, n_nonzero_coefs = 3)

reduced.fit(housing['data'], housing['labels'])

print(reduced.intercept_)
print(reduced.coef_)
Example 15
 for i in range(result_row):
     sumsum = sumsum + (y[i] - y_test[i]) * (y[i] - y_test[i])
 rank_result['Elastic_pca'] = sumsum / float(result_row)
 rs_score['Elastic_pca'] = r2_score(y_test, y)
 ElasticModel = ElasticNetCV()
 ElasticModel.fit(X_train_std, y_train)
 y = ElasticModel.predict(X_test_std)
 [result_row] = y.shape
 sumsum = 0
 #print y
 for i in range(result_row):
     sumsum = sumsum + (y[i] - y_test[i]) * (y[i] - y_test[i])
 rank_result['Elastic_std'] = sumsum / float(result_row)
 rs_score['Elastic_std'] = r2_score(y_test, y)
 LarsModel = Lars()
 LarsModel.fit(X_train_pca, y_train)
 y = LarsModel.predict(X_test_pca)
 [result_row] = y.shape
 sumsum = 0
 #print y
 for i in range(result_row):
     sumsum = sumsum + (y[i] - y_test[i]) * (y[i] - y_test[i])
 rank_result['Lars_pca'] = sumsum / float(result_row)
 rs_score['Lars_pca'] = r2_score(y_test, y)
 LarsModel = Lars()
 LarsModel.fit(X_train_std, y_train)
 y = LarsModel.predict(X_test_std)
 [result_row] = y.shape
 sumsum = 0
 #print y
 for i in range(result_row):
Example 16
def _sparse_encode(X, dictionary, gram, cov=None, algorithm='lasso_lars',
                   regularization=None, copy_cov=True,
                   init=None, max_iter=1000):
    """Generic sparse coding

    Each column of the result is the solution to a Lasso problem.

    Parameters
    ----------
    X: array of shape (n_samples, n_features)
        Data matrix.

    dictionary: array of shape (n_components, n_features)
        The dictionary matrix against which to solve the sparse coding of
        the data. Some of the algorithms assume normalized rows.

    gram: None | array, shape=(n_components, n_components)
        Precomputed Gram matrix, dictionary * dictionary'
        gram can be None if method is 'threshold'.

    cov: array, shape=(n_components, n_samples)
        Precomputed covariance, dictionary * X'

    algorithm: {'lasso_lars', 'lasso_cd', 'lars', 'omp', 'threshold'}
        lars: uses the least angle regression method (linear_model.lars_path)
        lasso_lars: uses Lars to compute the Lasso solution
        lasso_cd: uses the coordinate descent method to compute the
        Lasso solution (linear_model.Lasso). lasso_lars will be faster if
        the estimated components are sparse.
        omp: uses orthogonal matching pursuit to estimate the sparse solution
        threshold: squashes to zero all coefficients less than regularization
        from the projection dictionary * data'

    regularization : int | float
        The regularization parameter. It corresponds to alpha when
        algorithm is 'lasso_lars', 'lasso_cd' or 'threshold'.
        Otherwise it corresponds to n_nonzero_coefs.

    init: array of shape (n_samples, n_components)
        Initialization value of the sparse code. Only used if
        `algorithm='lasso_cd'`.

    max_iter: int, 1000 by default
        Maximum number of iterations to perform if `algorithm='lasso_cd'`.

    copy_cov: boolean, optional
        Whether to copy the precomputed covariance matrix; if False, it may be
        overwritten.

    Returns
    -------
    code: array of shape (n_components, n_features)
        The sparse codes

    See also
    --------
    sklearn.linear_model.lars_path
    sklearn.linear_model.orthogonal_mp
    sklearn.linear_model.Lasso
    SparseCoder
    """
    if X.ndim == 1:
        X = X[:, np.newaxis]
    n_samples, n_features = X.shape
    if cov is None and algorithm != 'lasso_cd':
        # overwriting cov is safe
        copy_cov = False
        cov = np.dot(dictionary, X.T)

    if algorithm == 'lasso_admm':
        alpha = float(regularization) / n_features  # account for scaling
        try:
            err_mgt = np.seterr(all='ignore')

            code, dictionary = lasso_admm(X.T, dictionary.T,
                                          gamma=alpha,
                                          gram=gram, cov=cov,
                                          max_iter=max_iter)

            new_code = code.T
        finally:
            np.seterr(**err_mgt)

    elif algorithm == 'lasso_lars':
        alpha = float(regularization) / n_features  # account for scaling
        try:
            err_mgt = np.seterr(all='ignore')
            lasso_lars = LassoLars(alpha=alpha, fit_intercept=False,
                                   verbose=False, normalize=False,
                                   precompute=gram, fit_path=False)
            lasso_lars.fit(dictionary.T, X.T, Xy=cov)
            new_code = lasso_lars.coef_
        finally:
            np.seterr(**err_mgt)

    elif algorithm == 'lasso_cd':
        alpha = float(regularization) / n_features  # account for scaling
        clf = Lasso(alpha=alpha, fit_intercept=False, precompute=gram,
                    max_iter=max_iter, warm_start=True)
        clf.coef_ = init
        clf.fit(dictionary.T, X.T)
        new_code = clf.coef_

    elif algorithm == 'lars':
        try:
            err_mgt = np.seterr(all='ignore')
            lars = Lars(fit_intercept=False, verbose=False, normalize=False,
                        precompute=gram, n_nonzero_coefs=int(regularization),
                        fit_path=False)
            lars.fit(dictionary.T, X.T, Xy=cov)
            new_code = lars.coef_
        finally:
            np.seterr(**err_mgt)

    elif algorithm == 'threshold':
        new_code = ((np.sign(cov) *
                    np.maximum(np.abs(cov) - regularization, 0)).T)

    elif algorithm == 'omp':
        new_code = orthogonal_mp_gram(gram, cov, regularization, None,
                                      row_norms(X, squared=True),
                                      copy_Xy=copy_cov).T
    else:
        raise ValueError('Sparse coding method must be "lasso_admm", '
                         '"lasso_lars", "lasso_cd", "lars", "threshold" or '
                         '"omp", got %s.' % algorithm)
    return new_code
Example 17
print(new_reg_data.shape)
#(200, 11)

#Taking a more fundamental approach to regularization with LARS

#Least-angle regression (LARS) is a regression technique that is well suited for 
#high-dimensional problems, that is, p >> n, where p denotes the columns or features 
#and n is the number of samples.

from sklearn.datasets import make_regression
reg_data, reg_target = make_regression(n_samples=200,
                                           n_features=500, n_informative=10, noise=2)
                                           
from sklearn.linear_model import Lars
lars = Lars(n_nonzero_coefs=10)
lars.fit(reg_data, reg_target)
print(np.sum(lars.coef_ != 0))
#10

train_n = 100
lars_12 = Lars(n_nonzero_coefs=12)
lars_12.fit(reg_data[:train_n], reg_target[:train_n])
lars_500 = Lars() # it's 500 by default
lars_500.fit(reg_data[:train_n], reg_target[:train_n]);
#Now, to see how well each feature fit the unknown data, do the following:
np.mean(np.power(reg_target[train_n:] - lars_12.predict(reg_data[train_n:]), 2))
#31.527714163321001
np.mean(np.power(reg_target[train_n:] - lars_500.predict(reg_data[train_n:]), 2))
#9.6198147535136237e+30

from sklearn.linear_model import LarsCV
Example 18
print('R2 score:', score)
print('MAE:', mean_absolute_error(testing_labels,preds), '\n')
 
# PCA + Elastic Net
elasticnet = ElasticNet(l1_ratio=0.5)
elasticnet.fit(reduced_training_features, training_labels)
preds = elasticnet.predict(reduced_testing_features)
score = elasticnet.score(reduced_testing_features,testing_labels)
print('PCA + ElasticNet Results:')
print('R2 score:', score)
print('MAE:', mean_absolute_error(testing_labels,preds))
 
# Least-Angle Regression (LARS)
from sklearn.linear_model import Lars
lars = Lars()
lars.fit(training_features, training_labels)
preds = lars.predict(testing_features)
score = lars.score(testing_features,testing_labels)
print('LARS Results:')
print('R2 score:', score)
print('MAE:', mean_absolute_error(testing_labels,preds), '\n')
 
# PCA + LARS
lars = Lars()
lars.fit(reduced_training_features, training_labels)
preds = lars.predict(reduced_testing_features)
score = lars.score(reduced_testing_features,testing_labels)
print('PCA + LARS Results:')
print('R2 score:', score)
print('MAE:', mean_absolute_error(testing_labels,preds))
 
Example 19
                    # ('ppru', 'ppr_submission_user.csv', 'ppr_fitted_user.csv'),
                    # ('pprg', 'ppr_submission_global.csv', 'ppr_fitted_global.csv'),
                    ]

    fitted = pd.DataFrame(index=review_data.index)
    submission = pd.DataFrame(index=review_data_final.index)
    for name, sub_name, fit_name in blend_inputs:
        f_df = pd.read_csv(os.path.join('..', fit_name))
        f_df.index = review_data.index
        fitted[name] = f_df['stars']
        s_df = pd.read_csv(os.path.join('..', sub_name))
        s_df.index = review_data_final.index
        submission[name] = s_df['stars']

    gbr = GradientBoostingRegressor(max_depth=3,verbose=2)
    gbr.fit(fitted, review_data['stars'])
    pred = gbr.predict(submission)
    pd.DataFrame({'review_id' : submission.index, 'stars' : np.maximum(1, np.minimum(5, pred))}).to_csv('../gbr_submission.csv', index=False)

    lar = Lars(fit_intercept=True, verbose=2, normalize=True, fit_path=True)
    lar.fit(fitted, review_data['stars'])
    pred = lar.predict(submission)
    pd.DataFrame({'review_id' : submission.index, 'stars' : np.maximum(1, np.minimum(5, pred))}).to_csv('../lar_submission.csv', index=False)

    ridge = Ridge()
    ridge.fit(fitted, review_data['stars'])
    pred = ridge.predict(submission)
    pd.DataFrame({'review_id' : submission.index, 'stars' : np.maximum(1, np.minimum(5, pred))}).to_csv('../ridge_submission.csv', index=False)
    
    ## TODO: blend based on size of rating neighborhood
Example 20
    y_train = ml_outs.loc[train_index]
    x_test = ml.loc[test_index]
    y_test = ml_outs.loc[test_index]

    # Scale
    scaler = StandardScaler().fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)

    # Implement Model
    linreg = Lars()  # Better
    linreg = LarsCV()
    # one Better
    linreg = LassoLarsCV()  # Same
    linreg = LinearRegression()
    linreg.fit(x_train, y_train)
    predictions = linreg.predict(x_test)

    # Plot predictions and y_test
    plt.figure()
    plt.plot(predictions, label='Predictions')
    plt.plot(pd.Series(predictions).rolling(5).mean(),
             label='rolling predictions')
    plt.plot(y_test.values,
             label='Shifted Currencies (y_test values)',
             color='grey')
    plt.plot(cu.loc[test_index, currency].values, label='UNSHIFTED')
    plt.legend()
    plt.show()

    # Print Score and summary
Example 21
    def _fit(self, X, y):
        """
            Fits the filter.

            Parameters
            ----------
            X : array-like, shape (n_samples, n_features)
                The training input samples.
            y : array-like
                The target values (ignored).

            Returns
            ----------
            None
        """
        if self.scheme == '0-1':
            scheme = self.__scheme_01
        elif self.scheme == 'heat':
            scheme = self.__scheme_heat
        elif self.scheme == 'dot':
            scheme = self.__scheme_dot
        else:
            getLogger(__name__).error(
                "scheme should be either '0-1', 'heat' or 'dot'; %s passed",
                self.scheme)
            raise KeyError(
                "scheme should be either '0-1', 'heat' or 'dot'; %s passed" %
                self.scheme)

        n_samples = X.shape[0]

        if self.k > n_samples:
            getLogger(__name__).error(
                "Cannot find %d clusters with n_samples = %d", self.k,
                n_samples)
            raise ValueError("Cannot find %d clusters with n_samples = %d" %
                             (self.k, n_samples))

        if self.p >= n_samples:
            getLogger(__name__).error(
                "Cannot select %d nearest neighbors with n_samples = %d",
                self.p, n_samples)
            raise ValueError(
                "Cannot select %d nearest neighbors with n_samples = %d" %
                (self.p, n_samples))

        if self.full_graph:
            graph = np.ones((n_samples, n_samples))
        else:
            graph = NearestNeighbors(
                n_neighbors=self.p,
                algorithm='ball_tree').fit(X).kneighbors_graph().toarray()
            graph = np.minimum(1, graph + graph.T)

        getLogger(__name__).info("Nearest neighbors graph: %s", graph)

        W = graph * pairwise_distances(X, metric=lambda x, y: scheme(x, y))
        getLogger(__name__).info("W: %s", W)
        D = np.diag(W.sum(axis=0))
        getLogger(__name__).info("D: %s", D)
        L = D - W
        getLogger(__name__).info("L: %s", L)
        eigvals, Y = eigh(type=1, a=L, b=D, subset_by_index=[1, self.k])
        getLogger(__name__).info("Eigenvalues: %s, classes: %s", eigvals, Y)

        weights = np.zeros((self.n_features_, self.k))
        for i in range(self.k):
            clf = Lars(n_nonzero_coefs=self.n_features)
            clf.fit(X, Y[:, i])
            weights[:, i] = np.abs(clf.coef_)
            getLogger(__name__).info("Weights for eigenvalue %d: %s", i,
                                     weights[:, i])

        self.feature_scores_ = weights.max(axis=1)
        getLogger(__name__).info("Feature scores: %s", self.feature_scores_)
        ranking = np.argsort(self.feature_scores_)[::-1]
        self.selected_features_ = ranking[:self.n_features]
Example 22
def stability_selection(expression_matrix,genes,R,y,gene_copy):
	#Final Score for each of the transcription factors
	score = []

	#Coefficients for each iteration
	coefficients = []

	#Run the Selection Algorithm for R/2 times
	for i in range(0, R // 2):
		#Indexes for Randomly splitting the data into equal halves
		indices = list(range(0, len(genes) - 1))

		#Randomly Shuffle the indices
		random.shuffle(indices)

		#Split into two parts
		first_half = indices[:len(genes) // 2]
		second_half = indices[len(genes) // 2:]

		#First Half of the Expression Matrix
		extract_first_half = expression_matrix[:,first_half]

		#Second Half of the Expression Matrix
		extract_second_half = expression_matrix[:,second_half]

		#Randomly Perturb Data by multiplying the expression of candidate TF's with a number b/w (alpha,1), where alpha belongs to (0,1)
		alpha = 0.19

		#Perturbation
		perturbation = random.uniform(alpha,1)

		#Multiply the expression matrix
		perturbed_first_half = extract_first_half * perturbation

		perturbed_second_half = extract_second_half * perturbation

		#Run LARS on each of them to get the score
		coeff = Lars()

		#Fit the First Half
		coeff.fit(perturbed_first_half,y)
        
        #Result for the first half of the split
		result_first_half = coeff.coef_

		#Fit the second half
		coeff.fit(perturbed_second_half,y)

		#Result for the second half of the split
		result_second_half = coeff.coef_

		temp_dict = {}

		#Creation of Singular Score Array
		for i in range(0,len(first_half)):
			temp_dict[first_half[i]] = result_first_half[i]


		for i in range(0,len(second_half)):
			temp_dict[second_half[i]] = result_second_half[i]


        #Append the values into the empty list
		coeff_list = []

		for val in temp_dict.values():
			coeff_list.append(val)

		#Append to main coeff list
		coefficients.append(coeff_list)

	
    #Ranks for Each Regulator Gene
	ranks = get_ranks(coefficients,gene_copy)

	return ranks
Example 23
            Relatively low time complexity
            Can easily be adapted into a lasso
        Drawbacks:
            Because the model iteratively fits the residuals, it is sensitive to noise

'''
rg = Lars(fit_intercept=True,
          verbose=False,
          normalize=True,
          precompute='auto',
          n_nonzero_coefs=500,
          eps=2.2204460492503131e-16,
          copy_X=True,
          fit_path=True,
          positive=False)
rg.fit(X_train, Y_train)
Y_pre = rg.predict(X_test)
rg.score(X_test, Y_test)
rg.coef_
rg.intercept_
'''
    fit_intercept                       whether to fit the intercept
    verbose                             verbosity level
    normalize                           whether to normalize
    precompute                          whether to use a Gram matrix to speed things up
    n_nonzero_coefs                     target number of non-zero coefficients
    eps                                 precision, used when computing certain values
    copy_X                              whether to copy X rather than overwrite it
    fit_path                            not fully understood; probably not needed for now
    positive                            force the coefficients to be positive?
'''
Example 24
class LarsClass:
    """
    Name      : Lars
    Attribute : None
    Method    : predict, predict_by_cv, save_model
    """
    def __init__(self):
        # Algorithm name
        self._name = 'lars'

        # Base path
        self._f_path = os.path.abspath(
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         os.pardir))

        # Suppress warning messages
        warnings.filterwarnings('ignore')

        # Load the raw data
        data = pd.read_csv(self._f_path +
                           "/regression/resource/regression_sample.csv",
                           sep=",",
                           encoding="utf-8")

        # Split into training and test data
        self._x = (data["year"] <= 2017)
        self._y = (data["year"] >= 2018)

        # Training data
        self._x_train, self._y_train = self.preprocessing(data[self._x])
        # Test data
        self._x_test, self._y_test = self.preprocessing(data[self._y])

        # Declare the model
        self._model = Lars(normalize=False)

        # Train the model
        self._model.fit(self._x_train, self._y_train)

    # Data preprocessing
    def preprocessing(self, data):
        # Feature rows
        x = []
        # Labels
        y = []
        # Base interval (7 days)
        base_interval = 7
        # Temperatures
        temps = list(data["temperature"])

        for i in range(len(temps)):
            if i < base_interval:
                continue
            y.append(temps[i])

            xa = []

            for p in range(base_interval):
                d = i + p - base_interval
                xa.append(temps[d])
            x.append(xa)
        return x, y

    # Standard prediction
    def predict(self, save_img=False, show_chart=False):
        # Predict
        y_pred = self._model.predict(self._x_test)

        # Score
        score = r2_score(self._y_test, y_pred)

        # Check the report
        if hasattr(self._model, 'coef_') and hasattr(self._model,
                                                     'intercept_'):
            print(f'Coef = {self._model.coef_}')
            print(f'intercept = {self._model.intercept_}')

        print(f'Score = {score}')

        # Whether to save the chart image
        if save_img:
            self.save_chart_image(y_pred, show_chart)

        # Predicted values & score
        return [list(y_pred), score]

    # Cross-validation prediction
    def predict_by_cv(self):
        # For regression, implement cross validation as appropriate for the actual project
        return False

    # GridSearchCV prediction
    def predict_by_gs(self):
        pass

    # Save or refresh the model
    def save_model(self, renew=False):
        # Save the model
        if not renew:
            # First save
            joblib.dump(self._model,
                        self._f_path + f'/model/{self._name}_rg.pkl')
        else:
            # Replace the existing model
            if os.path.isfile(self._f_path + f'/model/{self._name}_rg.pkl'):
                os.rename(
                    self._f_path + f'/model/{self._name}_rg.pkl',
                    self._f_path +
                    f'/model/{str(self._name) + str(time.time())}_rg.pkl')
            joblib.dump(self._model,
                        self._f_path + f'/model/{self._name}_rg.pkl')

    # Save the regression chart
    def save_chart_image(self, data, show_chart):
        # Figure size
        plt.figure(figsize=(15, 10), dpi=100)

        # Ground-truth labels
        plt.plot(self._y_test, c='r')

        # Predicted values
        plt.plot(data, c='b')

        # Save as an image
        plt.savefig('./chart_images/tenki-kion-lr.png')

        # Show the chart (optional)
        if show_chart:
            plt.show()

    def __del__(self):
        del self._x_train, self._x_test, self._y_train, self._y_test, self._x, self._y, self._model
Example 25
def TIGRESS(X,
            y,
            nsplit=100,
            nstepsLARS=5,
            alpha=0.4,
            scoring="area"):
    """
    TIGRESS score predictor based on stability selection.

    Args:
        X (pandas.DataFrame): Transcriptor factor gene expressions where rows
            are experimental conditions and columns are transcription factors
        y (pandas.Series): Target gene expression vector where rows are
            experimental conditions
        nsplit (int): number of splits applied,
            i.e., randomization tests; the higher the better
        nstepsLARS (int): number of steps of LARS algorithm,
            i.e., number of non zero coefficients to keep (Lars parameter)
        alpha: Noise multiplier coefficient,
            Each transcription factor expression is multiplied by a
            random variable $\in [\alpha,1]$
        scoring (str): option used to score each possible link;
            only the "area" and "max" options are available

    Returns:
        numpy.array: co-regulation scores

        The i-th element of the score array represents the score assigned by the
        sklearn randomizedlasso stability selection to the regulatory
        relationship between the target gene and transcription factor i.

    Examples:
        >>> import pandas as pd
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> tfs = pd.DataFrame(np.random.randn(5,3),
                               index =["c1","c2","c3","c4","c5"],
                               columns=["tf1","tf2","tf3"])
        >>> tg = pd.Series(np.random.randn(5),index=["c1","c2","c3","c4","c5"])
        >>> scores = TIGRESS(tfs,tg)
        >>> scores
        array([349.   , 312.875, 588.125])
    """
    n,p = X.shape
    halfsize = int(n/2)
    if nstepsLARS > p:
        nstepsLARS = p-1
    freq = np.zeros((p, nstepsLARS))
    i = 0
    while i < nsplit:
        # Randomly reweight each variable (TF expression)
        random_perturbation = np.random.uniform(low=alpha, high=1.0, size=p)
        X *= random_perturbation
        # Randomly split the sample in two sets
        X_1,X_2,y_1,y_2 = train_test_split(X,y,test_size=halfsize, shuffle=True)
        for X_i,y_i in [[X_1, y_1],[X_2,y_2]]:
            if y_i.std() > 0:
                # run LARS on each subsample and record which variables are selected
                lars = Lars(normalize=False, n_nonzero_coefs=nstepsLARS)
                lars.fit(X_i,y_i)
                # collect the presence of the coefficients along the path
                path = lars.coef_path_
                if path.shape[1] < nstepsLARS+1:
                    path_add = np.tile(path[:,-1],(nstepsLARS+1 - path.shape[1],1)).T
                    path = np.hstack((path,path_add))
                freq += np.abs(np.sign(path[:,1:]))
                i += 1
        X /= random_perturbation
    # normalize frequencies into [0,1] to get stability curves
    freq /= 2*halfsize
    if (scoring=="area"):
        score = np.cumsum(freq,axis=1)/np.arange(1,nstepsLARS+1,1)
    if (scoring=="max"):
        score = np.maximum.accumulate(freq,axis=1)
    return(score[:,nstepsLARS - 1])
Example 26
def main():
    feature_vectors = []
    movie_features = []
    features_train = []
    data = None
    with open("movie-data/ratings-train.csv") as f:
        data = f.readlines()
    data = data[1:]
    for val in data:
        line = val[:-1]
        line = line.split(",")
        line = [float(val) for val in line]
        feature_vectors.append(line)
    feature_vectors = np.array(feature_vectors)
    ratings_train = feature_vectors[:, 2]
    movie_ids = feature_vectors[:, 1]
    data = None
    with open("movie-data/movie-features.csv") as f:
        data = f.readlines()
    data = data[1:]
    for val in data:
        line = val[:-1]
        line = line.split(",")
        line = [float(val) for val in line]
        movie_features.append(line)
    movie_features = np.array(movie_features)
    error = []
    error_lars = []
    error_lasso = []
    for i in range(671):
        person_features = feature_vectors[(feature_vectors[:,0] - 1) == i]
        for k in range(len(person_features[:, 2])):
            person_features[:,2][k] = (person_features[:,2][k] - np.mean(person_features[:, 2]))/(np.std(person_features[:, 2]))
        MOVIE_IDS = person_features[:, 1]
        features_train = movie_features[np.array(MOVIE_IDS, int) - 1]
        features_train[:, 0] = 1.0
        for p in range(1, features_train.shape[1]):
            features_train[:, p] = (features_train[:, p] - np.mean(features_train[:, p]))/(np.std(features_train[:, p]) + 10**-8)
        lasso = Lasso(alpha = 0.01, normalize=True, fit_intercept = True)
        alphas, coeff, _ = lasso_path(features_train, person_features[:, 2], 5e-3, positive = True, fit_intercept = False)
        alphas = -np.log10(alphas)
        colors = cycle(['b', 'r', 'g', 'c', 'k'])
        plt.xlabel('alphas')
        plt.ylabel('coeff')
        k = 0
        for val, c in zip(coeff, colors):
            print(k)
            k = k + 1
            print(np.array(val).shape)
            print(np.array(alphas).shape)
            plt.plot(alphas, val, c=c)
        plt.show()
        pred = lasso.fit(features_train, person_features[:, 2]).predict(features_train)
        error_lasso.append(np.mean((pred - person_features[:, 2])**2))
        clf = Ridge(alpha = 0.1, normalize=True, fit_intercept = True)
        reg = Lars(n_nonzero_coefs = 5)
        pred = clf.fit(features_train, person_features[:, 2]).predict(features_train)
        error.append(np.mean((pred - person_features[:, 2])**2))
        pred = reg.fit(features_train, person_features[:, 2]).predict(features_train)
        #pred[pred < 0] = 0
        error_lars.append(np.mean((pred - person_features[:, 2])**2))
        print(np.mean((pred - person_features[:, 2])**2))
	#print(features_train)
	#print(person_features)
    plt.figure(1)
    #plt.title('least angles regression')
    plt.xlabel('users')
    plt.ylabel('error')
    line_up, = plt.plot(error_lars, label='least angles regression')
    #plt.figure(2)
    #plt.xlabel('users')
    #plt.ylabel('error')
    #plt.title('ridge regression')
    line_down, = plt.plot(error, label = 'ridge regression')
    #plt.figure(3)
    #plt.xlabel('users')
    #plt.ylabel('error')
    #plt.title('lasso regression')
    line_hoz, = plt.plot(error_lasso, label = 'lasso regression')
    plt.legend(handles=[line_up, line_down, line_hoz])
    plt.show()
    print("lar error: " + str(np.mean(error_lars)))
    print("ridge error: " + str(np.mean(error)))
    print("lasso error: " + str(np.mean(error_lasso)))
    #print(movie_ids)
    #print(np.array(movie_features[:, 0], int) == movie_ids)
    features_train = movie_features[np.array(movie_ids, int) - 1]
    features_train[:, 0] = 1.0
    #for i in range(1, features_train.shape[1]):
    #    features_train[:, i] = (features_train[:, i] - np.mean(features_train[:, i]))/(np.std(features_train[:, i]))
    features_train = (features_train - np.mean(features_train))/np.std(features_train)
    clf = Ridge(alpha = 0.1, normalize=True, fit_intercept = True)
    pred = clf.fit(features_train, ratings_train).predict(features_train)
    reg = Lars(n_nonzero_coefs = np.inf)
    #pred = reg.fit(features_train, ratings_train).predict(features_train)
    error = (pred - ratings_train)**2
    plt.plot(error)
    plt.show()
Example 27
# LARS Regression
# The Least Angle Regression (LARS) method is a computationally efficient algorithm for fitting a regression model.
# It is useful for high-dimensional data and is commonly used in conjunction with regularization (such as LASSO).
import numpy as np
from sklearn import datasets
from sklearn.linear_model import Lars

# load the diabetes datasets
dataset = datasets.load_diabetes()

# fit a LARS model to the data
model = Lars()
model.fit(dataset.data, dataset.target)
print(model)

# make predictions
expected = dataset.target
predicted = model.predict(dataset.data)

# summarize the fit of the model
mse = np.mean((predicted-expected)**2)
print(mse)
print(model.score(dataset.data, dataset.target))

Example 28
#LarsCV: fit_intercept, verbose, normalize, cv

from sklearn.linear_model import LarsCV, Lars
from sklearn.datasets import make_regression
import matplotlib.pyplot as plt
X, y = make_regression(n_samples=200, noise=4.0, random_state=0)
reg = LarsCV(cv=5).fit(X, y)
reg.score(X, y)
reg.alpha_
pred = reg.predict(X[:, ])

plt.scatter(X[:, 0], y, color='black')
plt.scatter(X[:, 0], pred, color='red')
plt.show()

reg2 = Lars().fit(X, y)
reg2.score(X, y)
reg2.alphas_  # plain Lars exposes alphas_ rather than alpha_
pred = reg2.predict(X[:, ])

#%% LassoLars: alpha, fit_intercept, normalize
#LassoLarsCV: alpha, fit_intercept, normalize, cv
from sklearn import linear_model
reg = linear_model.LassoLars(alpha=0.01)
reg.fit([[-1, 1], [0, 0], [1, 1]], [-1, 0, -1])

print(reg.coef_)

reg2 = linear_model.LassoLarsCV()
reg2.fit([[-1, 1], [0, 0], [1, 1]], [-1, 0, -1])
Example 29
# In[3]:

from sklearn.linear_model import Lasso  # AdaptiveLasso is not available
# LASSO regression performs variable selection and complexity control while fitting a generalized
# linear model, so it can be used for modelling and prediction whether the target variable is
# continuous, binary or multi-class. Variable selection here means that not all variables are put
# into the model; instead, variables are added selectively to obtain better performance.
model = Lasso(alpha=0.1)
model.fit(data[['x1', 'x2', 'x3', 'x4', 'x5', 'x7']],
          data['y'])  # data.iloc[:, 0:13]
print(model.coef_)  # coefficients of each feature
print(model.intercept_)

# In[4]:

from sklearn.linear_model import Lars  # least angle regression
model1 = Lars(n_nonzero_coefs=7)
model1.fit(data.iloc[:, 0:13], data['y'])
print(model1.coef_)  # coefficients of each feature

# In[5]:

# Determine the most suitable alpha
from sklearn.linear_model import LarsCV  # cross-validated least angle regression model
model1 = LarsCV()
model1.fit(data.iloc[:, 0:13], data['y'])
print(model1.coef_)  # coefficients of each feature
print(model1.alpha_)

# In[6]:

from sklearn.linear_model import LassoCV  # cross-validated lasso regression model
model1 = LassoCV()
Example 30
    def fit(self):

        # 1. construct a placeholder called 'qhat_k_container' for the list of all q_hat^k (defined in Algorithm 2) of each subsample
        qhat_k_container = list()

        # 2. estimate q_hat^k (for the solution path) on each subsample and save them as elements of the placeholder
        for j in range(self.n_repeat):

            # a. randomly choose a subset of sample points (whose index is 'index_subsample') that is used to generate a subsample in each repeat
            index_subsample = np.random.choice(self.train_size,
                                               self.subsample_size,
                                               replace=False)
            # b. based on 'index_subsample', take the corresponding observations of X out and save them as the subsample
            X_subsample = self.X_so[index_subsample]
            # c. based on 'index_subsample', take the corresponding observations of Y out and save them as the subsample
            y_subsample = self.y_so[index_subsample]

            # d. scikit-learn requires 'y_subsample' to be a one-dimensional array
            y_subsample.shape = (y_subsample.shape[0], )

            # e. given a subsample, compute q_hat^k (the solution path) using lars

            # e(1). call the class 'Lars'
            trial_1 = Lars(n_nonzero_coefs=min(X_subsample.shape[1] +
                                               1, X_subsample.shape[0] + 1))
            # e(2). fit lars on the subsample
            trial_1.fit(X_subsample, y_subsample)
            # e(3). save the active set of lars (indices of variables selected by lars) as 'active'.
            active = trial_1.active_

            # f. The active set of lars is ranked based on the chronology of variable inclusion at different stages of lars. For example [2,1,3] means x_2 is included at stage 1, x_1 is included at stage 2 and x_3 is included at stage 3. Based on the active set of lars, we compute q_hat^k (defined as 'qhat_k' in code) as defined in Algorithm 2

            # f(1). we generate 'qhat_k' as an array of zeros;
            qhat_k = np.zeros((1, self.n_dim))
            # f(2). we compute the i-th value of q_hat^k for the corresponding variable based on Algorithm 2; replace i-th term in 'qhat_k' with the value we just compute
            for i in active:

                qhat_k[0, i] = 1 - \
                    (np.where(np.array(active) == i)[0][0]) / (self.n_dim)

            # f(3). we append the result into 'qhat_k_container' as one element of the list
            qhat_k_container.append(qhat_k)

        # 3. if self.lasso == True, we compute CV-lars-lasso and CV-cd on the original sample X and Y (not on the subsample)
        if (self.lasso == True):

            # a(1). call the class for CV-lars-lasso (called LassoLarsCV in Scikit-learn)
            # a(2). we set the number of folds in CV as 10
            trial_2 = LassoLarsCV(cv=10)
            # b. change y into a one-dimensional array (required by Scikit-learn)
            yy = self.y
            yy.shape = (self.sample_size, )
            # c.  fit CV-lars-lasso on X and Y
            trial_2.fit(self.X, yy)

            # d. save 'la_list' as the number of variables in the active set of CV-lars-lasso
            la_list = len(trial_2.active_)
            # e. save 'la_vari_list' as the active set of CV-lars-lasso
            la_vari_list = trial_2.active_

            # f. call the class for CV-cd (called LassoCV in Scikit-learn)
            # f(1). we set the number of folds in CV as 10
            # f(2). for reproduction, we fix the random seed of training-validation split in CV (random_state=0)
            trial_3 = LassoCV(cv=10, random_state=0)

            # g.  fit cv-cd on X and Y
            trial_3.fit(self.X, yy)

            # h. save 'cd_list' as the number of variables in the active set of CV-cd
            cd_list = np.count_nonzero(trial_3.coef_)
            # i. save 'cd_vari_list' as the active set of CV-cd
            cd_vari_list = np.nonzero(trial_3.coef_)[0]

        # 4. compute q_hat and Q(c) (defined in Algorithm 2)
        # a(1). we transform the list of all q_hat^k ('qhat_k_container') into a matrix ('qhat_k_container_matrix')
        # a(2). row of the matrix: the q_hat^k on a given subsample for all variables
        # a(3). column of the matrix: the corresponding value of q_hat^k for a given variable on all subsamples
        qhat_k_container_matrix = np.concatenate(qhat_k_container, axis=0)
        # b.  compute the value of qhat for each variable (qhat defined in Algorithm 2 of the paper)
        qhat_value = np.mean(qhat_k_container_matrix, axis=0)

        # c. set 'Qc_list' as the container of Q(c) for all value of c
        Qc_list = list()
        # d. set 'c_seq' as the sequence of c for the grid search of c* in solar
        c_seq = np.arange(max(qhat_value), 0.1, self.step_size)

        # e. generate Q(c) for each value of c
        for j in c_seq:
            # e(1). define 'container' as the placeholder of Q(c) when c == j;
            container = list()

            for i in range(self.X.shape[1]):
                # e(2). include all variables into 'container' if their corresponding values in q-hat are larger or equal to j;
                if (qhat_value[i] >= j):

                    container.append(i)
            # e(3). append 'container' (Q(c) when c == j) into 'Qc_list' (the container of Q(c) for all value of c);
            Qc_list.append(container)

        # 5. compute the test error of each value of c
        # we use grid search on test set to choose c*;
        # for each value of c in the grid search, train a OLS of Y_so on the variables of Q(c) in X_so (Y_so and X_so defined at the begining);

        # a. container for test errors
        test_error = list()

        # b. compute the test error of each Q(c) on test set
        # b(0). set i as the indices of all variables in Q(c) for a given value of c;
        for i in Qc_list:
            # b(1). call the LinearRegression class;
            OLS_1 = LinearRegression()
            # b(2). compute OLS of Y_so on the variables in Q(c) in X_so;
            OLS_1.fit(self.X_so[:, i], self.y_so)
            # b(3). compute the L2 prediction error of OLS on test set (X_test, y_test);
            s1 = costs_com(self.X_test[:, i], self.y_test, OLS_1)
            loss_test_1, _ = s1.L2()
            # b(4). save the L2 error as the test error of Q(c) for each value of c; append it into the container of test errors;
            test_error.append(loss_test_1)

        # 6. tuning c via grid search
        # 6(a). transform 'test_error' from a list into an array;
        test_error = np.asarray(test_error)
        # 6(b). save the location of minimum of 'test_error' as 'min_loc_val';
        min_loc_val = np.where(test_error == min(test_error))[0]
        # 6(c). save the corresponding value of c (c*) as 'opt_c';
        opt_c = c_seq[min_loc_val]
        # 6(d). find Q(c*) and save it as 'Q_opt_c';
        Q_opt_c = Qc_list[max(min_loc_val)]

        # 7. Regression of Y onto the selected variables ( Q(c*) ) in X
        # 7(a). call the LinearRegression class;
        OLS_2 = LinearRegression()
        # 7(b). fit OLS of Y on the variables of Q(c*) in X;
        OLS_2.fit(self.X[:, Qc_list[max(min_loc_val)]], self.y)
        # 7(c). set 'solar_coef' (an array of zeros) as the placeholder of solar regression coefficients
        solar_coef = np.zeros([self.n_dim, 1])
        # 7(d). put the estimated regression coefficients into their corresponding place of 'solar_coef'
        solar_coef[Q_opt_c, 0] = OLS_2.coef_

        # 8. define la_list, la_vari_list as empty list if self.lasso != True (if we don't want to compute cv-lars-lasso and cv-cd)
        if (self.lasso != True):

            la_list = []
            la_vari_list = []
            cd_list = []
            cd_vari_list = []

        return solar_coef, opt_c, test_error, Qc_list, la_list, la_vari_list, cd_list, cd_vari_list
Example 31
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
# Author: Quan Pan <*****@*****.**>
# License: MIT License
# Create: 2016-12-02

from sklearn.linear_model import Lars

# X = [[0., 0.], [1., 1.], [10., 10.]]
X = [[0.0], [1.0], [10.0]]
y = [0.0, 1.0, 10.0]
# x_preb = [[5., 5.], [-10., -10.]]
x_preb = [[5.], [-10.]]

clf = Lars(n_nonzero_coefs=1)
clf.fit(X, y)
print(clf.coef_)
y_pred = clf.predict(x_preb)
print(y_pred)
Example 32
def Lar_regr(features, labels):
    from sklearn.linear_model import Lars
    model = Lars()
    model.fit(features, labels)
    pred = model.predict(features)
    AsGraph(labels, pred)
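The AsGraph helper is not shown above; a hypothetical stand-in that plots the predictions against the true labels could look like this:

import matplotlib.pyplot as plt

# Hypothetical stand-in for the AsGraph helper used by Lar_regr
def AsGraph(labels, pred):
    plt.scatter(labels, pred)
    plt.xlabel('true labels')
    plt.ylabel('predictions')
    plt.show()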
Example 33
#!/usr/bin/env python
'''
Input variables:
    - X_TRAIN: path of a numpy array with x.
    - Y_TRAIN: path of a numpy array with y.
    - C: number of features to select.
Output files:
    - features_lars.npy: numpy array with the 0-based index of 
    the selected features.
'''

import numpy as np
from sklearn.linear_model import Lars
from sklearn.feature_selection import SelectFromModel

x_train = np.load('${X_TRAIN}')
y_train = np.load('${Y_TRAIN}')

clf = Lars(n_nonzero_coefs = ${C})
clf.fit(x_train, y_train)

sfm = SelectFromModel(clf, prefit = True)
features = np.where(sfm.get_support())[0]
np.save('features_lars.npy', features)
Example 34
    lasso_gamma = np.array([[0. if abs(x) < 1e-100 else 1. for x in lasso.coef_]]).T
    # P = lambda X: lasso.predict(X)
    lasso_predictor = PredictorWrapper.PredictorWrapper(lasso_beta,lasso_gamma,lasso.predict)
    dill.dump(lasso_predictor,open('%sLASSO.p' % logDir,'wb'))
    with open(logFile,'a') as f:
        f.write('Lasso c: %15.10f        alpha: %15.10f\n' % (1./(2.* X_tr.shape[0]), optLam))



    ##############
    ## LARS_SET ##
    ##############
    kappa = [2,4,10]
    for k in kappa:
        lars = Lars(n_nonzero_coefs=k,fit_intercept=False)
        lars.fit(X_tr,y_tr)
        lars_beta = np.array([lars.coef_]).T
        lars_gamma = np.zeros((X_tr.shape[1],1))
        lars_gamma[lars.active_] = 1.
        lars_predictor = PredictorWrapper.PredictorWrapper(lars_beta,lars_gamma,lars.predict)
        dill.dump(lars_predictor,open('%sLARS_%02d.p' % (logDir,k),'wb'))

    ##############
    ## LARS_OPT ##
    ##############
    larsKappas = np.linspace(0,40,41,dtype=int)

    def larsEval(learned):
        learned_yhat = np.array([learned.predict(X_val)]).T
        learned_mse = sum((y_val - learned_yhat) ** 2)[0]
        return learned_mse
Example 35
from sklearn.linear_model import Lars
from sklearn.linear_model import LarsCV

# Load the data
reg_data, reg_target = make_regression(n_samples=200,
                                       n_features=500,
                                       n_informative=10,
                                       noise=2)

# 1 Fitting with LARS --------------------------------------------------------------------------

# Create the estimator
lars = Lars(n_nonzero_coefs=10)

# Fit the model
lars.fit(reg_data, reg_target)

# Number of non-zero coefficients
np.sum(lars.coef_ != 0)

# 2 Comparing LARS model runs ---------------------------------------------------------------------

# <Key point>
# - Hold out half of the data and fit the LARS model on the remainder

# Variable definition
# --- number of training samples
train_n = 100

# Create the estimator and fit
# --- set the number of non-zero coefficients to 12
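# The snippet breaks off here; a plausible continuation, mirroring the
# pattern shown in Example 17 above, would be:
lars_12 = Lars(n_nonzero_coefs=12)
lars_12.fit(reg_data[:train_n], reg_target[:train_n])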
Example 36
def sparse_encode(X,
                  dictionary,
                  algorithm='mp',
                  fit_tol=None,
                  P_cum=None,
                  l0_sparseness=10,
                  C=0.,
                  do_sym=True,
                  verbose=0):
    """Generic sparse coding

    Each column of the result is the solution to a sparse coding problem.

    Parameters
    ----------
    X : array of shape (n_samples, n_pixels)
        Data matrix.

    dictionary : array of shape (n_dictionary, n_pixels)
        The dictionary matrix against which to solve the sparse coding of
        the data. Some of the algorithms assume normalized rows.


    algorithm : {'mp', 'lasso_lars', 'lasso_cd', 'lars', 'omp', 'threshold'}
        mp :  Matching Pursuit
        lars: uses the least angle regression method (linear_model.lars_path)
        lasso_lars: uses Lars to compute the Lasso solution
        lasso_cd: uses the coordinate descent method to compute the
        Lasso solution (linear_model.Lasso). lasso_lars will be faster if
        the estimated dictionary are sparse.
        omp: uses orthogonal matching pursuit to estimate the sparse solution
        threshold: squashes to zero all coefficients less than regularization
        from the projection dictionary * data'

    max_iter : int, 1000 by default
        Maximum number of iterations to perform if `algorithm='lasso_cd'`.

    verbose : int
        Controls the verbosity; the higher, the more messages. Defaults to 0.

    Returns
    -------
    code : array of shape (n_samples, n_dictionary)
        The sparse codes

    """
    if X.ndim == 1:
        X = X[:, np.newaxis]
    #n_samples, n_pixels = X.shape

    if algorithm == 'lasso_lars':
        alpha = float(regularization) / n_pixels  # account for scaling

        from sklearn.linear_model import LassoLars

        # Not passing in verbose=max(0, verbose-1) because Lars.fit already
        # corrects the verbosity level.
        cov = np.dot(dictionary, X.T)
        lasso_lars = LassoLars(alpha=fit_tol,
                               fit_intercept=False,
                               verbose=verbose,
                               normalize=False,
                               precompute=None,
                               fit_path=False)
        lasso_lars.fit(dictionary.T, X.T, Xy=cov)
        sparse_code = lasso_lars.coef_.T

    elif algorithm == 'lasso_cd':
        alpha = float(regularization) / n_pixels  # account for scaling

        # TODO: Make verbosity argument for Lasso?
        # sklearn.linear_model.coordinate_descent.enet_path has a verbosity
        # argument that we could pass in from Lasso.
        from sklearn.linear_model import Lasso
        clf = Lasso(alpha=fit_tol,
                    fit_intercept=False,
                    normalize=False,
                    precompute=None,
                    max_iter=max_iter,
                    warm_start=True)

        if init is not None:
            clf.coef_ = init

        clf.fit(dictionary.T, X.T, check_input=check_input)
        sparse_code = clf.coef_.T

    elif algorithm == 'lars':

        # Not passing in verbose=max(0, verbose-1) because Lars.fit already
        # corrects the verbosity level.
        from sklearn.linear_model import Lars
        cov = np.dot(dictionary, X.T)
        lars = Lars(fit_intercept=False,
                    verbose=verbose,
                    normalize=False,
                    precompute=None,
                    n_nonzero_coefs=l0_sparseness,
                    fit_path=False)
        lars.fit(dictionary.T, X.T, Xy=cov)
        sparse_code = lars.coef_.T

    elif algorithm == 'threshold':
        cov = np.dot(dictionary, X.T)
        sparse_code = ((np.sign(cov) *
                        np.maximum(np.abs(cov) - regularization, 0))).T

    elif algorithm == 'omp':
        # TODO: Should verbose argument be passed to this?
        from sklearn.linear_model import orthogonal_mp_gram
        from sklearn.utils.extmath import row_norms

        cov = np.dot(dictionary, X.T)
        gram = np.dot(dictionary, dictionary.T)
        sparse_code = orthogonal_mp_gram(Gram=gram,
                                         Xy=cov,
                                         n_nonzero_coefs=l0_sparseness,
                                         tol=None,
                                         norms_squared=row_norms(X,
                                                                 squared=True),
                                         copy_Xy=False).T

    elif algorithm == 'mp':
        sparse_code = mp(X,
                         dictionary,
                         l0_sparseness=l0_sparseness,
                         fit_tol=fit_tol,
                         P_cum=P_cum,
                         C=C,
                         do_sym=do_sym,
                         verbose=verbose)
    else:
        raise ValueError(
            'Sparse coding method must be "mp", "lasso_lars", '
            '"lasso_cd", "lars", "threshold" or "omp", got %s.' % algorithm)
    return sparse_code
Example 37
## Ridge regression
ridge = Ridge(alpha=0.8)
ridge.fit(train_X, train_y)
predictions = ridge.predict(test_X)
print('MAE is ', mean_absolute_error(np.expm1(predictions), np.expm1(test_y)))

## Lasso regression
lasso = Lasso(alpha=0.9)
lasso.fit(train_X, train_y)
predictions = lasso.predict(test_X)
print('MAE is ', mean_absolute_error(np.expm1(predictions), np.expm1(test_y)))

## Least angle regression
lars = Lars(n_nonzero_coefs=100)
lars.fit(train_X, train_y)
predictions = lars.predict(test_X)
print('MAE is ', mean_absolute_error(np.expm1(predictions), np.expm1(test_y)))

## Linear regression
lr = LinearRegression()
lr.fit(train_X, train_y)
predictions = lr.predict(test_X)
print('MAE is ', mean_absolute_error(np.expm1(predictions), np.expm1(test_y)))

## Decision tree regression
dtr = DecisionTreeRegressor(criterion='mae',
                            max_depth=5,
                            min_samples_split=4,
                            max_features='sqrt',
                            min_samples_leaf=2)
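# The snippet ends with the constructor; a plausible continuation, following
# the same pattern as the models above, would be:
dtr.fit(train_X, train_y)
predictions = dtr.predict(test_X)
print('MAE is ', mean_absolute_error(np.expm1(predictions), np.expm1(test_y)))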