Example #1
0
def test_sparse_lasso_not_as_toy_dataset():
    """Compare sparse and dense Lasso on a generated sparse regression problem.

    Both estimators are fitted on the second half of the samples and must
    converge (dual gap equal to zero to 4 decimals) and score above 0.85 on
    the first half. The sparse fit must keep exactly ``n_informative``
    non-zero coefficients.
    """
    n_samples, n_features, max_iter = 100, 100, 1000
    n_informative = 10

    X, y = make_sparse_data(n_samples, n_features, n_informative)

    # Slice bounds must be integers: `/` produces a float on Python 3, which
    # is rejected as an index, so use floor division instead.
    half = n_samples // 2
    X_train, X_test = X[half:], X[:half]
    y_train, y_test = y[half:], y[:half]

    s_clf = SparseLasso(alpha=0.1,
                        fit_intercept=False,
                        max_iter=max_iter,
                        tol=1e-7)
    s_clf.fit(X_train, y_train)
    assert_almost_equal(s_clf.dual_gap_, 0, 4)
    assert s_clf.score(X_test, y_test) > 0.85

    # check the convergence is the same as the dense version
    d_clf = DenseLasso(alpha=0.1,
                       fit_intercept=False,
                       max_iter=max_iter,
                       tol=1e-7)
    d_clf.fit(X_train, y_train)
    assert_almost_equal(d_clf.dual_gap_, 0, 4)
    assert d_clf.score(X_test, y_test) > 0.85

    # check that the coefs are sparse
    assert_equal(np.sum(s_clf.coef_ != 0.0), n_informative)
    def __init__(self,
                 *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0,
                 docker_containers: Dict[str, DockerContainer] = None) -> None:
        """Create the wrapped Lasso estimator and reset all training state.

        The three keyword arguments are forwarded unchanged to the parent
        constructor.
        """
        super().__init__(hyperparams=hyperparams,
                         random_seed=random_seed,
                         docker_containers=docker_containers)

        # Only alpha and the seed are passed on; every other Lasso option is
        # left at the library default.
        lasso_options = {
            'alpha': self.hyperparams['alpha'],
            'random_state': self.random_seed,
        }
        self._clf = Lasso(**lasso_options)

        # Nothing has been supplied or learned yet.
        self._fitted = False
        self._training_inputs = None
        self._training_outputs = None
        self._training_indices = None
        self._target_names = None
        self._target_column_indices = None
        self._target_columns_metadata: List[OrderedDict] = None
Example #3
0
class LassoImpl():
    """Thin wrapper that stores Lasso hyperparameters and delegates to SKLModel."""

    def __init__(self, alpha=1.0, fit_intercept=True, normalize=False,
                 precompute=False, copy_X=True, max_iter=1000, tol=0.0001,
                 warm_start=False, positive=False, random_state=None,
                 selection='cyclic'):
        """Record every constructor argument and build the wrapped model."""
        self._hyperparams = dict(
            alpha=alpha,
            fit_intercept=fit_intercept,
            normalize=normalize,
            precompute=precompute,
            copy_X=copy_X,
            max_iter=max_iter,
            tol=tol,
            warm_start=warm_start,
            positive=positive,
            random_state=random_state,
            selection=selection,
        )
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model; ``y`` is only passed on when it is given.

        Returns ``self`` so calls can be chained.
        """
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def predict(self, X):
        """Return the wrapped model's predictions for ``X``."""
        predictions = self._wrapped_model.predict(X)
        return predictions
def test_lasso_zero():
    """Check that the lasso can handle zero data without crashing."""
    zero_features = [[0], [0], [0]]
    zero_targets = [0, 0, 0]
    queries = [[1], [2], [3]]

    estimator = Lasso(alpha=0.1)
    clf = estimator.fit(zero_features, zero_targets)
    pred = clf.predict(queries)

    # With all-zero data the coefficient, the predictions and the dual gap
    # are all expected to be zero.
    assert_array_almost_equal(clf.coef_, [0])
    assert_array_almost_equal(pred, [0, 0, 0])
    assert_almost_equal(clf.dual_gap_, 0)
def test_lasso_zero():
    """Check that the lasso can handle zero data without crashing."""
    zero_features = [[0], [0], [0]]
    zero_targets = [0, 0, 0]
    queries = [[1], [2], [3]]

    estimator = Lasso(alpha=0.1)
    clf = estimator.fit(zero_features, zero_targets)
    pred = clf.predict(queries)

    # With all-zero data the coefficient, the predictions and the dual gap
    # are all expected to be zero.
    assert_array_almost_equal(clf.coef_, [0])
    assert_array_almost_equal(pred, [0, 0, 0])
    assert_almost_equal(clf.dual_gap_, 0)
def test_lasso_zero():
    """Check that the sparse lasso can handle zero data without crashing"""
    # An empty 3x1 CSC matrix stands in for all-zero training features.
    empty_features = sp.csc_matrix((3, 1))
    zero_targets = [0, 0, 0]
    queries = np.array([[1], [2], [3]])

    estimator = Lasso()
    clf = estimator.fit(empty_features, zero_targets)
    pred = clf.predict(queries)

    assert_array_almost_equal(clf.coef_, [0])
    assert_array_almost_equal(pred, [0, 0, 0])
    assert_almost_equal(clf.dual_gap_, 0)
Example #7
0
def test_lasso_zero():
    """Check that the sparse lasso can handle zero data without crashing"""
    # An empty 3x1 CSC matrix stands in for all-zero training features.
    empty_features = sp.csc_matrix((3, 1))
    zero_targets = [0, 0, 0]
    queries = np.array([[1], [2], [3]])

    estimator = Lasso()
    clf = estimator.fit(empty_features, zero_targets)
    pred = clf.predict(queries)

    assert_array_almost_equal(clf.coef_, [0])
    assert_array_almost_equal(pred, [0, 0, 0])
    assert_almost_equal(clf.dual_gap_, 0)
def test_lasso_readonly_data():
    """Fit Lasso on the arrays handed back by ``TempMemmap``.

    NOTE(review): presumably TempMemmap yields read-only memory-mapped
    copies, as the test name suggests; confirm against its definition.
    """
    features = np.array([[-1], [0], [1]])
    targets = np.array([-1, 0, 1])   # just a straight line
    queries = np.array([[2], [3], [4]])  # test sample

    with TempMemmap((features, targets)) as (mm_features, mm_targets):
        clf = Lasso(alpha=0.5)
        clf.fit(mm_features, mm_targets)
        pred = clf.predict(queries)

        assert_array_almost_equal(clf.coef_, [.25])
        assert_array_almost_equal(pred, [0.5, 0.75, 1.])
        assert_almost_equal(clf.dual_gap_, 0)
def test_lasso_readonly_data():
    """Fit Lasso on the arrays handed back by ``TempMemmap``.

    NOTE(review): presumably TempMemmap yields read-only memory-mapped
    copies, as the test name suggests; confirm against its definition.
    """
    features = np.array([[-1], [0], [1]])
    targets = np.array([-1, 0, 1])   # just a straight line
    queries = np.array([[2], [3], [4]])  # test sample

    with TempMemmap((features, targets)) as (mm_features, mm_targets):
        clf = Lasso(alpha=0.5)
        clf.fit(mm_features, mm_targets)
        pred = clf.predict(queries)

        assert_array_almost_equal(clf.coef_, [.25])
        assert_array_almost_equal(pred, [0.5, 0.75, 1.])
        assert_almost_equal(clf.dual_gap_, 0)
def test_lasso_alpha_warning():
    """Fitting Lasso with ``alpha=0`` must emit at least one warning."""
    check_warnings()  # Skip if unsupported Python version

    line_X = [[-1], [0], [1]]
    line_Y = [-1, 0, 1]  # just a straight line

    with warnings.catch_warnings(record=True) as caught:
        # Record every warning, including ones that were already shown.
        warnings.simplefilter('always')

        clf = Lasso(alpha=0)
        clf.fit(line_X, line_Y)

        assert_greater(len(caught), 0)  # warnings should be raised
def test_lasso_alpha_warning():
    """Fitting Lasso with ``alpha=0`` must emit at least one warning."""
    check_warnings()  # Skip if unsupported Python version

    line_X = [[-1], [0], [1]]
    line_Y = [-1, 0, 1]  # just a straight line

    with warnings.catch_warnings(record=True) as caught:
        # Record every warning, including ones that were already shown.
        warnings.simplefilter("always")

        clf = Lasso(alpha=0)
        clf.fit(line_X, line_Y)

        assert_greater(len(caught), 0)  # warnings should be raised
Example #12
0
 def __init__(self,
              alpha=1.0,
              fit_intercept=True,
              normalize=False,
              precompute=False,
              copy_X=True,
              max_iter=1000,
              tol=0.0001,
              warm_start=False,
              positive=False,
              random_state=None,
              selection='cyclic'):
     """Record every constructor argument and instantiate ``Op`` with them."""
     self._hyperparams = dict(
         alpha=alpha,
         fit_intercept=fit_intercept,
         normalize=normalize,
         precompute=precompute,
         copy_X=copy_X,
         max_iter=max_iter,
         tol=tol,
         warm_start=warm_start,
         positive=positive,
         random_state=random_state,
         selection=selection,
     )
     self._wrapped_model = Op(**self._hyperparams)
Example #13
0
 def fit(self, X, y=None):
     """Build a fresh ``SKLModel`` from the stored hyperparameters and fit it.

     ``y`` is only passed on when it is given. Returns ``self``.
     """
     self._sklearn_model = SKLModel(**self._hyperparams)
     fit_args = (X,) if y is None else (X, y)
     self._sklearn_model.fit(*fit_args)
     return self
def test_deprection_precompute_enet():
    """Test that setting precompute="auto" gives a Deprecation Warning."""
    X, y, _, _ = build_dataset(n_samples=20, n_features=10)

    # Same expectation for both estimators, checked in the original order.
    for estimator_cls in (ElasticNet, Lasso):
        clf = estimator_cls(precompute="auto")
        assert_warns(DeprecationWarning, clf.fit, X, y)
Example #15
0
def test_sparse_enet_coordinate_descent():
    """Test that a warning is issued if model does not converge"""
    n_samples, n_features = 5, 2

    # Only two iterations are allowed, on an empty sparse matrix scaled by 1e50.
    clf = Lasso(max_iter=2)
    scaled_X = sp.csc_matrix((n_samples, n_features)) * 1e50
    targets = np.ones(n_samples)

    assert_warns(ConvergenceWarning, clf.fit, scaled_X, targets)
Example #16
0
    def run(self):
        """Read the Lasso settings from the UI widgets.

        Returns a ``(params, changed)`` pair, where ``changed`` is whatever
        ``getChangedValues`` reports for ``params`` against a default
        ``Lasso()``.
        """
        alpha = float(self.alpha_text.text())
        fit_intercept = self.fitInterceptCheckBox.isChecked()
        max_iter = int(self.maxNumOfIterationsSpinBox.value())
        tol = self.toleranceDoubleSpinBox.value()
        positive = self.forcePositiveCoefficientsCheckBox.isChecked()

        # 'selection' is fixed to 'random'; the cross-validation checkbox
        # ('CV') is currently not forwarded.
        params = dict([
            ('alpha', alpha),
            ('fit_intercept', fit_intercept),
            ('max_iter', max_iter),
            ('tol', tol),
            ('positive', positive),
            ('selection', 'random'),
        ])
        changed = self.getChangedValues(params, Lasso())
        return params, changed
def test_lasso_fit_intercept():
    """The coefficient vector has shape (1,) with and without an intercept."""
    X = [[-1], [0], [1]]
    Y = [-1, 0, 1]

    clf = Lasso(fit_intercept=False)
    clf.fit(X, Y)
    assert_equal(clf.coef_.shape, (1,))

    clf2 = Lasso(fit_intercept=True)
    clf2.fit(X, Y)
    # Check the estimator that was just fitted; the original re-checked
    # `clf`, so the fit_intercept=True case was never verified.
    assert_equal(clf2.coef_.shape, (1,))
def test_lasso_positive_constraint():
    """With ``positive=True`` no fitted coefficient may be negative."""
    X = [[-1], [0], [1]]
    y = [1, 0, -1]       # just a straight line with negative slope

    # First the default solver settings, then again with precompute=True.
    for extra_options in ({}, {'precompute': True}):
        lasso = Lasso(alpha=0.1, max_iter=1000, positive=True,
                      **extra_options)
        lasso.fit(X, y)
        assert_true(min(lasso.coef_) >= 0)
def test_sparse_input_convergence_warning():
    """Sparse float32 input warns only when the solver cannot converge.

    ElasticNet limited to one iteration with ``tol=0`` must raise a
    ConvergenceWarning; Lasso with 1000 iterations must emit no warning.
    """
    # Local import so this block does not rely on a module-level import.
    import warnings

    X, y, _, _ = build_dataset(n_samples=1000, n_features=500)

    with pytest.warns(ConvergenceWarning):
        ElasticNet(max_iter=1, tol=0).fit(
            sparse.csr_matrix(X, dtype=np.float32), y)

    # check that the model converges w/o warnings
    # `pytest.warns(None)` is deprecated (and rejected by newer pytest
    # releases); record warnings with the standard library instead.
    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter("always")
        Lasso(max_iter=1000).fit(sparse.csr_matrix(X, dtype=np.float32), y)

    assert not record
Example #20
0
def test_fit_simple_backupsklearn(backend='auto'):
    """Compare h2o4gpu's Lasso wrapper against scikit-learn's Lasso.

    The last column of ``./open_data/simple.txt`` is the target and the
    remaining columns are the features. Three models are fitted and their
    predictions and scores printed. Unless ``backend`` is ``'h2o4gpu'``, the
    wrapper's coefficients, intercept and iteration count must match
    scikit-learn's.
    """
    df = pd.read_csv("./open_data/simple.txt", delim_whitespace=True)
    X = np.array(df.iloc[:, :df.shape[1] - 1], dtype='float32', order='C')
    y = np.array(df.iloc[:, df.shape[1] - 1], dtype='float32', order='C')
    Solver = h2o4gpu.Lasso

    enet = Solver(glm_stop_early=False, backend=backend)
    print("h2o4gpu fit()")
    enet.fit(X, y)
    print("h2o4gpu predict()")
    print(enet.predict(X))
    print("h2o4gpu score()")
    print(enet.score(X, y))

    enet_wrapper = Solver(positive=True, random_state=1234, backend=backend)
    print("h2o4gpu scikit wrapper fit()")
    enet_wrapper.fit(X, y)
    print("h2o4gpu scikit wrapper predict()")
    print(enet_wrapper.predict(X))
    print("h2o4gpu scikit wrapper score()")
    print(enet_wrapper.score(X, y))

    # Import from the public package: the private `coordinate_descent`
    # module path is not available in newer scikit-learn releases, while
    # `sklearn.linear_model.Lasso` works on old and new versions alike.
    from sklearn.linear_model import Lasso
    enet_sk = Lasso(positive=True, random_state=1234)
    print("Scikit fit()")
    enet_sk.fit(X, y)
    print("Scikit predict()")
    print(enet_sk.predict(X))
    print("Scikit score()")
    print(enet_sk.score(X, y))

    # Dense float32 copies of the reference coefficients for the comparison.
    enet_sk_coef = csr_matrix(enet_sk.coef_, dtype=np.float32).toarray()
    enet_sk_sparse_coef = csr_matrix(enet_sk.sparse_coef_,
                                     dtype=np.float32).toarray()

    if backend != 'h2o4gpu':
        print(enet_sk.coef_)
        print(enet_sk.sparse_coef_)

        print(enet_sk_coef)
        print(enet_sk_sparse_coef)

        print(enet_wrapper.coef_)
        print(enet_wrapper.sparse_coef_)

        print(enet_sk.intercept_)
        print(enet_wrapper.intercept_)

        print(enet_sk.n_iter_)
        print(enet_wrapper.n_iter_)

        print(enet_wrapper.time_prepare)
        print(enet_wrapper.time_upload_data)
        print(enet_wrapper.time_fitonly)

        assert np.allclose(enet_wrapper.coef_, enet_sk_coef)
        assert np.allclose(enet_wrapper.intercept_, enet_sk.intercept_)
        assert np.allclose(enet_wrapper.n_iter_, enet_sk.n_iter_)
def test_sparse_lasso_not_as_toy_dataset():
    """Compare sparse and dense Lasso on a generated sparse regression problem.

    Both estimators are fitted on the second half of the samples and must
    converge (dual gap equal to zero to 4 decimals) and score above 0.85 on
    the first half. The sparse fit must keep exactly ``n_informative``
    non-zero coefficients.
    """
    n_samples, n_features, max_iter = 100, 100, 1000
    n_informative = 10

    X, y = make_sparse_data(n_samples, n_features, n_informative)

    # Slice bounds must be integers: `/` produces a float on Python 3, which
    # is rejected as an index, so use floor division instead.
    half = n_samples // 2
    X_train, X_test = X[half:], X[:half]
    y_train, y_test = y[half:], y[:half]

    s_clf = SparseLasso(alpha=0.1, fit_intercept=False,
                        max_iter=max_iter, tol=1e-7)
    s_clf.fit(X_train, y_train)
    assert_almost_equal(s_clf.dual_gap_, 0, 4)
    assert s_clf.score(X_test, y_test) > 0.85

    # check the convergence is the same as the dense version
    d_clf = DenseLasso(alpha=0.1, fit_intercept=False,
                       max_iter=max_iter, tol=1e-7)
    d_clf.fit(X_train, y_train)
    assert_almost_equal(d_clf.dual_gap_, 0, 4)
    assert d_clf.score(X_test, y_test) > 0.85

    # check that the coefs are sparse
    assert_equal(np.sum(s_clf.coef_ != 0.0), n_informative)
def test_lasso_positive_constraint():
    """With ``positive=True`` no fitted coefficient may be negative."""
    X = [[-1], [0], [1]]
    y = [1, 0, -1]       # just a straight line with negative slope

    # First the default solver settings, then again with precompute=True.
    for extra_options in ({}, {'precompute': True}):
        lasso = Lasso(alpha=0.1, max_iter=1000, positive=True,
                      **extra_options)
        lasso.fit(X, y)
        assert min(lasso.coef_) >= 0
def K_fold_CrossValidation(k, dataFrame, target, regressorType):
    """Find the held-out fold with the lowest RMSE and refit without it.

    The rows are cut into ``k`` contiguous folds of ``floor(len / k)`` rows.
    Any trailing remainder rows are never held out, only trained on. For
    each fold the regressor is fitted on the other rows and the root mean
    squared error on the held-out fold is computed. The regressor is then
    refitted on all rows except the best-scoring fold and returned.

    Parameters
    ----------
    k : int
        Number of folds.
    dataFrame : anything accepted by ``pd.DataFrame``
        Feature rows.
    target : pandas object exposing ``.values``
        Target values aligned row-for-row with ``dataFrame``.
    regressorType : str
        One of "GDB", "LN", "SVR" or "LS".

    Returns
    -------
    The regressor fitted on everything except the best fold.
    """
    trainDataSet = pd.DataFrame(dataFrame)
    # NOTE(review): `Regression` is not defined in this block; it is the
    # fallback when `regressorType` matches none of the codes below.
    regressor = Regression
    if regressorType == "GDB":
        regressor = ensemble.GradientBoostingRegressor(
            n_estimators=1000, max_depth=4, min_samples_split=2,
            learning_rate=0.001, loss='ls')
    elif regressorType == "LN":
        regressor = LinearRegression()
    elif regressorType == "SVR":
        regressor = SVR(kernel='linear', C=1e3)
    elif regressorType == "LS":
        regressor = Lasso(alpha=0.001, normalize=True)

    part_size = int(np.floor(len(trainDataSet) / float(k)))
    best_part = 0
    # Start from infinity so that folds whose error exceeds an arbitrary cap
    # (the original used 1000) can still be selected.
    min_error = float('inf')

    for fold in range(k):
        start, stop = fold * part_size, (fold + 1) * part_size
        # pd.concat replaces DataFrame.append, which newer pandas removed.
        trainSubSet = pd.concat([trainDataSet[:start], trainDataSet[stop:]])
        testSubSet = trainDataSet[start:stop]
        targetSubSet = pd.concat([target[:start], target[stop:]])
        desireValue = target[start:stop]

        regressor.fit(trainSubSet, targetSubSet.values.ravel())
        predictedValue = regressor.predict(testSubSet)

        # Accumulate the squared error without reusing the fold counter: the
        # original inner loop reused `i`, so `best_part` ended up holding a
        # row index instead of the fold number.
        value = 0.00
        for predicted, desired in zip(predictedValue, desireValue.values):
            print(predicted)
            print(desired)
            value += (predicted - desired) ** 2
            print("value  --  %s" % (value,))
        error = math.sqrt(value / part_size)

        print("error =  %s" % (error,))
        if error < min_error:
            min_error = error
            best_part = fold

    print("min_error =    %s" % (min_error,))
    best_start, best_stop = best_part * part_size, (best_part + 1) * part_size
    trainSubSet = pd.concat([trainDataSet[:best_start],
                             trainDataSet[best_stop:]])
    targetSubSet = pd.concat([target[:best_start], target[best_stop:]])
    regressor.fit(trainSubSet, targetSubSet.values.ravel())
    return regressor
def test_lasso_toy():
    """Test Lasso on a toy example for various values of alpha."""
    # When validating this against glmnet, note that glmnet divides it
    # by nobs.

    X = [[-1], [0], [1]]
    Y = [-1, 0, 1]       # just a straight line
    T = [[2], [3], [4]]  # test sample

    # (alpha, expected coefficients, expected predictions on T)
    expectations = (
        (1e-8, [1], [2, 3, 4]),
        (0.1, [.85], [1.7, 2.55, 3.4]),
        (0.5, [.25], [0.5, 0.75, 1.]),
        (1, [.0], [0, 0, 0]),
    )

    for alpha, expected_coef, expected_pred in expectations:
        clf = Lasso(alpha=alpha)
        clf.fit(X, Y)
        pred = clf.predict(T)
        assert_array_almost_equal(clf.coef_, expected_coef)
        assert_array_almost_equal(pred, expected_pred)
        assert_almost_equal(clf.dual_gap_, 0)
def test_coef_shape_not_zero():
    """A no-intercept Lasso fitted on one feature has ``coef_`` shape (1,)."""
    ones_column = np.c_[np.ones(3)]
    ones_target = np.ones(3)

    est_no_intercept = Lasso(fit_intercept=False)
    est_no_intercept.fit(ones_column, ones_target)

    expected_shape = (1,)
    assert est_no_intercept.coef_.shape == expected_shape
Example #26
0
# NOTE(review): this fragment uses Python 2 print statements, and X_train,
# X_test, y_train and y_test are defined outside this excerpt.
# Debug output: shape of the test targets and one training row.
print y_test.shape

print X_train[123, :]
# The string literal below is disabled code that would scale both target
# vectors by the norm of y_train; it is never executed.
'''
norm1 =  np.linalg.norm(y_train)    
if norm1 != 0:   
    y_train, y_test =  y_train/norm1, y_test/norm1
print norm1
'''

print y_train.shape

# `model` is assigned twice and is not used in the rest of this excerpt.
model = SVR(C=1.0, gamma=1.0)
model = LinearRegression()

# Fit both regularised models on the training split.
lasso = Lasso(alpha=0.1).fit(X_train, y_train)
enet = ElasticNet(alpha=0.1, l1_ratio=0.7).fit(X_train, y_train)

# Only the Lasso predictions are scored below.
y_pred = lasso.predict(X_test)

print "MSE", mean_squared_error(y_test, y_pred)
# Baseline for comparison: always predict the mean of the test targets.
m = np.mean(y_test)
print "MSE (Mean)", mean_squared_error(y_test, m * np.ones(len(y_test)))

print "r^2 on test data", r2_score(y_test, y_pred)

# Plot the coefficients of both models; the title reports each model's
# R^2 on the test split.
plt.plot(enet.coef_, label='Elastic net coefficients')
plt.plot(lasso.coef_, label='Lasso coefficients')
plt.legend(loc='best')
plt.title("Lasso R^2: %f, Elastic Net R^2: %f" % (r2_score(
    y_test, lasso.predict(X_test)), r2_score(y_test, enet.predict(X_test))))
Example #27
0
# Module-level estimator instances. Each is created once at import time, so
# every user of a constant shares the same object.
K_N_N = KNeighborsClassifier()
SUPPORT_VECTOR = svm.SVC(kernel="linear")

# Ensemble classifiers (100 estimators each)
RANDOM_FOREST = RandomForestClassifier(n_estimators=100)
GRADIENT_BOOST_CL = GradientBoostingClassifier(n_estimators=100)
ADA_BOOST = AdaBoostClassifier(n_estimators=100)
EXTRA_TREE = ExtraTreesClassifier(n_estimators=100)


# Regressors
GRADIENT_BOOST_RG = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1)
LINEAR_RG = LinearRegression()
RIDGE_RG = Ridge()
LASSO_RG = Lasso()
SVR_RG = SVR()

def getClassifierMap():
    """Return a dict mapping classifier names to the module-level instances.

    A new dict is built on every call, but its values are the shared
    module-level estimator objects.
    """
    CLASSIFIER_MAP = {
        "DECISION_TREE": DECISION_TREE,
        "LOGISTIC_REGRESSION": LOGISTIC_REGRESSION,
        "NAIVE_BAYS": NAIVE_BAYS,
        "K_N_N": K_N_N,
        "SUPPORT_VECTOR": SUPPORT_VECTOR,
        "RANDOM_FOREST": RANDOM_FOREST,
        "GRADIENT_BOOST": GRADIENT_BOOST_CL,
        # Was mapped to GRADIENT_BOOST_CL, which made the AdaBoost entry an
        # alias of gradient boosting.
        "ADA_BOOST": ADA_BOOST,
        "EXTRA_TREE": EXTRA_TREE,
    }
    return CLASSIFIER_MAP
                             n_estimators=10), ['predict_proba', 'predict'],
  create_weird_classification_problem_1()),
 (LogisticRegression(), ['predict_proba', 'predict'],
  create_weird_classification_problem_1()),
 (IsotonicRegression(out_of_bounds='clip'), ['predict'],
  create_isotonic_regression_problem_1()),
 (Earth(), ['predict', 'transform'], create_regression_problem_1()),
 (Earth(allow_missing=True), ['predict', 'transform'],
  create_regression_problem_with_missingness_1()),
 (ElasticNet(), ['predict'], create_regression_problem_1()),
 (ElasticNetCV(), ['predict'], create_regression_problem_1()),
 (LassoCV(), ['predict'], create_regression_problem_1()),
 (Ridge(), ['predict'], create_regression_problem_1()),
 (RidgeCV(), ['predict'], create_regression_problem_1()),
 (SGDRegressor(), ['predict'], create_regression_problem_1()),
 (Lasso(), ['predict'], create_regression_problem_1()),
 (Pipeline([('earth', Earth()), ('logistic', LogisticRegression())]),
  ['predict', 'predict_proba'], create_weird_classification_problem_1()),
 (FeatureUnion([('earth', Earth()), ('earth2', Earth(max_degree=2))],
               transformer_weights={
                   'earth': 1,
                   'earth2': 2
               }), ['transform'], create_weird_classification_problem_1()),
 (RandomForestRegressor(), ['predict'], create_regression_problem_1()),
 (CalibratedClassifierCV(LogisticRegression(),
                         'isotonic'), ['predict_proba'],
  create_weird_classification_problem_1()),
 (AdaBoostRegressor(), ['predict'], create_regression_problem_1()),
 (BaggingRegressor(), ['predict'], create_regression_problem_1()),
 (BaggingClassifier(), ['predict_proba'],
  create_weird_classification_problem_1()),
Example #29
0
			'IncrementalPCA':IncrementalPCA(),
			'IsolationForest':IsolationForest(),
			'Isomap':Isomap(),
			'KMeans':KMeans(),
			'KNeighborsClassifier':KNeighborsClassifier(),
			'KNeighborsRegressor':KNeighborsRegressor(),
			'KernelCenterer':KernelCenterer(),
			'KernelDensity':KernelDensity(),
			'KernelPCA':KernelPCA(),
			'KernelRidge':KernelRidge(),
			'LSHForest':LSHForest(),
			'LabelPropagation':LabelPropagation(),
			'LabelSpreading':LabelSpreading(),
			'Lars':Lars(),
			'LarsCV':LarsCV(),
			'Lasso':Lasso(),
			'LassoCV':LassoCV(),
			'LassoLars':LassoLars(),
			'LassoLarsCV':LassoLarsCV(),
			'LassoLarsIC':LassoLarsIC(),
			'LatentDirichletAllocation':LatentDirichletAllocation(),
			'LedoitWolf':LedoitWolf(),
			'LinearDiscriminantAnalysis':LinearDiscriminantAnalysis(),
			'LinearRegression':LinearRegression(),
			'LinearSVC':LinearSVC(),
			'LinearSVR':LinearSVR(),
			'LocallyLinearEmbedding':LocallyLinearEmbedding(),
			'LogisticRegression':LogisticRegression(),
			'LogisticRegressionCV':LogisticRegressionCV(),
			'MDS':MDS(),
			'MLPClassifier':MLPClassifier(),
def test_lasso_toy():
    """Test Lasso on a toy example for various values of alpha."""
    # When validating this against glmnet, note that glmnet divides it
    # by nobs.

    X = [[-1], [0], [1]]
    Y = [-1, 0, 1]       # just a straight line
    T = [[2], [3], [4]]  # test sample

    # (alpha, expected coefficients, expected predictions on T)
    expectations = (
        (1e-8, [1], [2, 3, 4]),
        (0.1, [.85], [1.7, 2.55, 3.4]),
        (0.5, [.25], [0.5, 0.75, 1.]),
        (1, [.0], [0, 0, 0]),
    )

    for alpha, expected_coef, expected_pred in expectations:
        clf = Lasso(alpha=alpha)
        clf.fit(X, Y)
        pred = clf.predict(T)
        assert_array_almost_equal(clf.coef_, expected_coef)
        assert_array_almost_equal(pred, expected_pred)
        assert_almost_equal(clf.dual_gap_, 0)
class TrimRegressor(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params,
                                                   Hyperparams]):
    """
    Primitive using Trim in combination with Lasso. Code based on JPL's implementation of Lasso. 
    Trim deconfounding paper: https://arxiv.org/pdf/1811.05352.pdf
    `sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html>`_
    """

    __author__ = "ISI"
    metadata = metadata_base.PrimitiveMetadata({
        "id":
        "de250522-5edb-4697-8945-56d04baba0e4",
        "version":
        "1.0.0",
        "name":
        "TrimRegressor",
        "description":
        "Lasso enhanced by spectral deconfounding",
        "python_path":
        "d3m.primitives.regression.trim_regressor.TrimRegressor",
        "source": {
            "name": "ISI",
            "contact": "mailto:[email protected]",
            "uris": ["https://github.com/serbanstan/trim-regressor"]
        },
        "algorithm_types": ["REGULARIZED_LEAST_SQUARES", 'FEATURE_SCALING'],
        "primitive_family":
        "REGRESSION",
        "installation": [config.INSTALLATION]

        # "algorithm_types": [metadata_base.PrimitiveAlgorithmType.LASSO, ],
        # "name": "sklearn.linear_model.coordinate_descent.Lasso",
        # "primitive_family": metadata_base.PrimitiveFamily.REGRESSION,
        # "python_path": "d3m.primitives.regression.lasso.SKlearn",
        # "source": {'name': 'JPL', 'contact': 'mailto:[email protected]', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html']},
        # "version": "v2019.2.27",
        # "id": "a7100c7d-8d8e-3f2a-a0ee-b4380383ed6c",
        # 'installation': [
        #                # TODO : Will update based on https://gitlab.com/datadrivendiscovery/d3m/issues/137
        #                #{
        #                #    "type": "PIP",
        #                #    "package_uri": "git+https://gitlab.com/datadrivendiscovery/common-primitives.git@26419dde2f660f901066c896a972ae4c438ee236#egg=common_primitives"
        #                #},
        #                {'type': metadata_base.PrimitiveInstallationType.PIP,
        #                   'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format(
        #                       git_commit=utils.current_git_commit(os.path.dirname(__file__)),
        #                    ),
        #                   }]
    })

    def __init__(self,
                 *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0,
                 docker_containers: Dict[str, DockerContainer] = None) -> None:
        """Create the wrapped Lasso estimator and reset all training state.

        The three keyword arguments are forwarded unchanged to the parent
        constructor.
        """
        super().__init__(hyperparams=hyperparams,
                         random_seed=random_seed,
                         docker_containers=docker_containers)

        # Only alpha and the seed are passed on; every other Lasso option is
        # left at the library default.
        lasso_options = {
            'alpha': self.hyperparams['alpha'],
            'random_state': self.random_seed,
        }
        self._clf = Lasso(**lasso_options)

        # Nothing has been supplied or learned yet.
        self._fitted = False
        self._training_inputs = None
        self._training_outputs = None
        self._training_indices = None
        self._target_names = None
        self._target_column_indices = None
        self._target_columns_metadata: List[OrderedDict] = None

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        """Store the training columns and targets and mark the model unfitted.

        Column selection is delegated to ``_get_columns_to_fit`` and target
        extraction to ``_get_targets``, both driven by the hyperparameters.
        """
        selected_inputs = self._get_columns_to_fit(inputs, self.hyperparams)
        self._training_inputs, self._training_indices = selected_inputs

        selected_targets = self._get_targets(outputs, self.hyperparams)
        (self._training_outputs,
         self._target_names,
         self._target_column_indices) = selected_targets

        # New data invalidates any earlier fit.
        self._fitted = False

    # Computes the linear transform F, so we work in the system (FX, FY) to recover the
    # true betas.
    def _compute_F(self, X_data):
        """Build the trimming transform ``F`` and its companion ``F_inv``.

        The singular values ``d`` of ``X_data`` are capped at ``tau``, the
        value found at position ``int(r * trim_perc)`` of the ascending-sorted
        spectrum, where ``r = len(d)``. With ``d_hat = min(d, tau) / d``:

        * ``F     = U diag(d_hat)     U^T``
        * ``F_inv = U diag(1 / d_hat) U^T``

        Both diagonal matrices have ``U``'s shape and are zero outside their
        first ``r`` diagonal entries.

        Returns
        -------
        tuple
            ``(F, F_inv)``.
        """
        X = numpy.array(X_data)

        # The right singular vectors are not needed.
        U, d, _ = numpy.linalg.svd(X)

        r = len(d)

        # Clamp the position so that trim_perc == 1.0 selects the largest
        # singular value (no trimming) instead of indexing one past the end.
        tau_index = min(int(r * self.hyperparams['trim_perc']), r - 1)
        tau = sorted(d)[tau_index]

        # NOTE(review): a singular value of exactly zero gives 0/0 = nan here.
        d_hat = numpy.array([min(x, tau) / x for x in d])

        D_hat = numpy.zeros(U.shape)
        D_hat[:r, :r] = numpy.diag(d_hat)

        D_hat_inv = numpy.zeros(U.shape)
        D_hat_inv[:r, :r] = numpy.diag(1 / d_hat)

        F = numpy.dot(U, numpy.dot(D_hat, U.T))
        F_inv = numpy.dot(U, numpy.dot(D_hat_inv, U.T))

        return F, F_inv

    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        """Fit the two-stage model on the stored training data.

        Stage one fits the Lasso on the transformed system ``(F X, F y)`` and
        keeps its coefficients as ``_beta``. Stage two refits the same
        estimator on the untransformed inputs against the residual
        ``y - X beta`` and keeps those coefficients as ``_delta``.

        The first column of both the inputs and the outputs is dropped before
        fitting (the original comment identifies it as the d3mIndex column).
        ``timeout`` and ``iterations`` are accepted but not used. A call on an
        already fitted primitive returns immediately.

        Raises
        ------
        ValueError
            If no training data has been set.
        """
        if self._fitted:
            return CallResult(None)

        if self._training_inputs is None or self._training_outputs is None:
            raise ValueError("Missing training data.")
        self._target_columns_metadata = self._get_target_columns_metadata(
            self._training_outputs.metadata)

        # Don't want to use the d3mIndex column
        X = numpy.array(
            self._training_inputs[self._training_inputs.columns[1:]])
        y = numpy.array(
            self._training_outputs[self._training_outputs.columns[1:]])
        # A single target column is flattened to a 1-D vector.
        if y.shape[1] == 1:
            y = y.ravel()

        F, _ = self._compute_F(X)

        new_inputs = numpy.dot(F, X)
        new_outputs = numpy.dot(F, y)

        self._beta = self._clf.fit(new_inputs, new_outputs).coef_

        # What the first-stage coefficients leave unexplained.
        remainder = y - numpy.dot(X, self._beta)

        self._delta = self._clf.fit(X, remainder).coef_

        self._fitted = True

        return CallResult(None)

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """Predict targets as the dot product of the features with beta + delta.

        When ``use_semantic_types`` is set, the inputs are first restricted to
        the column indices selected at training time. The first column is
        treated as the index: it is left out of the dot product and prepended
        to the predictions afterwards. The result is wrapped via
        ``_wrap_predictions`` and merged with the inputs by
        ``common_utils.combine_columns``.
        """
        if self.hyperparams['use_semantic_types']:
            sk_inputs = inputs.iloc[:, self._training_indices]
        else:
            sk_inputs = inputs

        # do prediction without index column
        features = sk_inputs[sk_inputs.columns[1:]]
        coefficients = self._beta + self._delta
        predictions = numpy.dot(features, coefficients)
        if len(predictions.shape) == 1:
            predictions = predictions.reshape(predictions.shape[0], 1)

        # but add it back in afterwards
        index_values = sk_inputs[sk_inputs.columns[0]].values
        if len(index_values.shape) == 1:
            index_values = index_values.reshape(index_values.shape[0], 1)

        sk_output = numpy.concatenate((index_values, predictions), axis=1)

        if sparse.issparse(sk_output):
            sk_output = sk_output.toarray()

        output = self._wrap_predictions(inputs, sk_output)
        output.columns = self._target_names

        combined = common_utils.combine_columns(
            return_result=self.hyperparams['return_result'],
            add_index_columns=self.hyperparams['add_index_columns'],
            inputs=inputs,
            column_indices=self._target_column_indices,
            columns_list=[output])
        return CallResult(combined)

    def get_params(self) -> Params:
        """Return the learned coefficients together with the column bookkeeping.

        ``beta`` and ``delta`` are ``None`` until the primitive has been
        fitted; the remaining fields are reported in either state.
        """
        # The coefficient attributes are only read once a fit has happened.
        learned_beta = self._beta if self._fitted else None
        learned_delta = self._delta if self._fitted else None

        return Params(
            beta=learned_beta,
            delta=learned_delta,
            training_indices_=self._training_indices,
            target_names_=self._target_names,
            target_column_indices_=self._target_column_indices,
            target_columns_metadata_=self._target_columns_metadata)

    def set_params(self, *, params: Params) -> None:
        """Restore the state produced by ``get_params``.

        The primitive is marked as fitted only when both ``beta`` and
        ``delta`` are present, because ``produce`` needs their sum.
        """
        # No trailing commas here: the original wrapped each value in a
        # 1-tuple, which broke the `beta + delta` arithmetic in produce().
        self._beta = params['beta']
        self._delta = params['delta']
        self._training_indices = params['training_indices_']
        self._target_names = params['target_names_']
        self._target_column_indices = params['target_column_indices_']
        self._target_columns_metadata = params['target_columns_metadata_']

        # Derive the fitted flag from the fields get_params() actually
        # writes; the previous checks read keys ('coef_', 'intercept_', ...)
        # that get_params() never stores.
        self._fitted = (params['beta'] is not None
                        and params['delta'] is not None)

    @classmethod
    def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
        """Choose the input columns to use and return them with their indices.

        When ``hyperparams['use_semantic_types']`` is falsy, every column is
        kept and the indices are ``0 .. len(inputs.columns) - 1``.
        Otherwise the choice is delegated to
        ``common_utils.get_columns_to_use``, with
        ``cls._can_produce_column`` as the per-column filter, honouring
        ``use_input_columns`` and ``exclude_input_columns``.

        Returns:
            A ``(frame, column_indices)`` pair.
        """
        if hyperparams['use_semantic_types']:
            metadata = inputs.metadata
            # The second element (columns that were rejected) is not needed.
            selected_indices, _ = common_utils.get_columns_to_use(
                metadata,
                use_columns=hyperparams['use_input_columns'],
                exclude_columns=hyperparams['exclude_input_columns'],
                can_use_column=lambda column_index: cls._can_produce_column(
                    metadata, column_index, hyperparams))
            return inputs.iloc[:, selected_indices], selected_indices

        return inputs, list(range(len(inputs.columns)))

    @classmethod
    def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata,
                            column_index: int,
                            hyperparams: Hyperparams) -> bool:
        """Tell whether the column at ``column_index`` may be used as input.

        A column qualifies when its ``structural_type`` is a subclass of
        ``int``, ``float``, ``numpy.integer`` or ``numpy.float64`` and its
        semantic types include the ``Attribute`` type. A column with no
        semantic types at all is rejected with a logged warning.
        ``hyperparams`` is accepted but not consulted here.
        """
        required_semantic_types = {
            "https://metadata.datadrivendiscovery.org/types/Attribute"
        }
        numeric_types = (int, float, numpy.integer, numpy.float64)

        column_info = inputs_metadata.query(
            (metadata_base.ALL_ELEMENTS, column_index))

        # Structural type is checked first, before looking at semantic types.
        if not issubclass(column_info['structural_type'], numeric_types):
            return False

        found_semantic_types = set(column_info.get('semantic_types', []))
        if not found_semantic_types:
            cls.logger.warning("No semantic types found in column metadata")
            return False

        # Every required semantic type must be present on the column.
        return required_semantic_types.issubset(found_semantic_types)

    @classmethod
    def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams):
        """Extract the target columns from ``data``.

        When ``hyperparams['use_semantic_types']`` is falsy, the whole frame
        is returned with all of its column names and an empty index list.
        Otherwise columns whose semantic types include ``TrueTarget`` are
        picked through ``common_utils.get_columns_to_use`` (honouring
        ``use_output_columns`` and ``exclude_output_columns``) and extracted
        with ``common_utils.select_columns``.

        Returns:
            A ``(targets, target_column_names, target_column_indices)``
            triple.
        """
        if not hyperparams['use_semantic_types']:
            return data, list(data.columns), []

        frame_metadata = data.metadata
        required_semantic_types = {
            "https://metadata.datadrivendiscovery.org/types/TrueTarget"
        }

        def is_true_target(column_index: int) -> bool:
            column_info = frame_metadata.query(
                (metadata_base.ALL_ELEMENTS, column_index))
            present = set(column_info.get('semantic_types', []))
            if not present:
                cls.logger.warning(
                    "No semantic types found in column metadata")
                return False
            # Every required semantic type must be present on the column.
            return required_semantic_types.issubset(present)

        # The second element (columns that were rejected) is not needed.
        chosen_indices, _ = common_utils.get_columns_to_use(
            frame_metadata,
            use_columns=hyperparams['use_output_columns'],
            exclude_columns=hyperparams['exclude_output_columns'],
            can_use_column=is_true_target)
        selected = common_utils.select_columns(data, chosen_indices)
        chosen_names = [data.columns[index] for index in chosen_indices]
        return selected, chosen_names, chosen_indices

    @classmethod
    def _get_target_columns_metadata(
            cls,
            outputs_metadata: metadata_base.DataMetadata) -> List[OrderedDict]:
        """Build per-column metadata describing predicted targets.

        For each column of ``outputs_metadata`` the column metadata is
        copied into an ``OrderedDict`` and its ``semantic_types`` entry is
        rewritten: ``TrueTarget``, ``SuggestedTarget`` and ``Attribute`` are
        dropped, and ``PredictedTarget`` is appended if it was not already
        there.

        Returns:
            One ``OrderedDict`` per column, in column order.
        """
        predicted_target = 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'
        dropped_types = (
            "https://metadata.datadrivendiscovery.org/types/TrueTarget",
            "https://metadata.datadrivendiscovery.org/types/SuggestedTarget",
            "https://metadata.datadrivendiscovery.org/types/Attribute",
        )

        column_count = outputs_metadata.query(
            (metadata_base.ALL_ELEMENTS, ))['dimension']['length']

        result: List[OrderedDict] = []
        for index in range(column_count):
            entry = OrderedDict(outputs_metadata.query_column(index))

            # Keep the surviving types in their original order; the
            # predicted-target marker goes last when it has to be added.
            kept_types = [
                type_uri for type_uri in entry.get('semantic_types', [])
                if type_uri not in dropped_types
            ]
            if predicted_target not in kept_types:
                kept_types.append(predicted_target)

            entry['semantic_types'] = kept_types
            result.append(entry)

        return result

    @classmethod
    def _update_predictions_metadata(
        cls, inputs_metadata: metadata_base.DataMetadata,
        outputs: Optional[Outputs], target_columns_metadata: List[OrderedDict]
    ) -> metadata_base.DataMetadata:
        """Produce metadata for ``outputs`` from the given column metadata.

        Starts from ``inputs_metadata.clear(for_value=outputs,
        generate_metadata=True)`` and then applies each entry of
        ``target_columns_metadata`` to the column at the same position via
        ``update_column``.

        Returns:
            The metadata object returned by the last ``update_column`` call,
            or the cleared metadata when ``target_columns_metadata`` is
            empty.
        """
        result = inputs_metadata.clear(for_value=outputs,
                                       generate_metadata=True)

        # update_column returns a metadata object, so rebind on every step.
        for position, per_column in enumerate(target_columns_metadata):
            result = result.update_column(position, per_column)

        return result

    def _wrap_predictions(self, inputs: Inputs,
                          predictions: ndarray) -> Outputs:
        """Wrap raw ``predictions`` in a ``d3m_dataframe`` with metadata.

        The frame is created without generated metadata; its metadata is
        then set from ``self._update_predictions_metadata`` using the input
        metadata and ``self._target_columns_metadata``.
        """
        input_metadata = inputs.metadata
        frame = d3m_dataframe(predictions, generate_metadata=False)
        frame.metadata = self._update_predictions_metadata(
            input_metadata, frame, self._target_columns_metadata)
        return frame

    @classmethod
    def _add_target_columns_metadata(
            cls, outputs_metadata: metadata_base.DataMetadata):
        """Create fresh predicted-target metadata for every output column.

        Each column gets an ``OrderedDict`` with two entries, in this order:
        ``semantic_types`` (a list holding only ``PredictedTarget``) and
        ``name``. The name is the column's ``name`` from
        ``outputs_metadata`` converted to ``str``, or ``output_<index>``
        when no name is recorded.

        Returns:
            One ``OrderedDict`` per column, in column order.
        """
        column_count = outputs_metadata.query(
            (metadata_base.ALL_ELEMENTS, ))['dimension']['length']

        result: List[OrderedDict] = []
        for index in range(column_count):
            name = outputs_metadata.query(
                (metadata_base.ALL_ELEMENTS, index)).get("name")
            if name is None:
                name = "output_{}".format(index)

            # Build a new list per column so entries never share state.
            entry = OrderedDict([
                ("semantic_types", [
                    'https://metadata.datadrivendiscovery.org/types/PredictedTarget'
                ]),
                ("name", str(name)),
            ])
            result.append(entry)

        return result


# TrimRegressor.__doc__ = Lasso.__doc__
Example #32
0
# Print one training row as a quick look at the feature matrix.
print(X_train[123, :])

# Disabled step, kept as an inert string literal: scaling both target
# vectors by the L2 norm of the training targets.
'''
norm1 =  np.linalg.norm(y_train)    
if norm1 != 0:   
    y_train, y_test =  y_train/norm1, y_test/norm1
print norm1
'''

print(y_train.shape)

# NOTE(review): `model` is rebound immediately and is not used again in the
# visible part of this script; kept in case later code relies on it.
model = SVR(C=1.0, gamma=1.0)
model = LinearRegression()

# Fit both regularised linear models on the same training split.
lasso = Lasso(alpha=0.1).fit(X_train, y_train)
enet = ElasticNet(alpha=0.1, l1_ratio=0.7).fit(X_train, y_train)

# Only the Lasso model is scored below.
y_pred = lasso.predict(X_test)

print("MSE", mean_squared_error(y_test, y_pred))
# Baseline for comparison: always predict the mean of the test targets.
m = np.mean(y_test)
print("MSE (Mean)", mean_squared_error(y_test, m * np.ones(len(y_test))))


print("r^2 on test data", r2_score(y_test, y_pred))

# Overlay the coefficient vectors of the two fitted models.
plt.plot(enet.coef_, label='Elastic net coefficients')
plt.plot(lasso.coef_, label='Lasso coefficients')
plt.legend(loc='best')
plt.title("Lasso R^2: %f, Elastic Net R^2: %f"
def test_lasso_alpha_warning():
    """Check that fitting ``Lasso(alpha=0)`` emits a ``UserWarning``."""
    # Three points lying on a straight line through the origin.
    samples = [[-1], [0], [1]]
    targets = [-1, 0, 1]

    unregularized = Lasso(alpha=0)
    assert_warns(UserWarning, unregularized.fit, samples, targets)
Example #34
0
def test_coef_shape_not_zero():
    """Check that ``coef_`` has shape ``(1,)`` for a single-feature fit.

    The model is fitted without an intercept on one all-ones column.
    """
    model = Lasso(fit_intercept=False)
    single_column = np.c_[np.ones(3)]
    targets = np.ones(3)
    model.fit(single_column, targets)
    assert model.coef_.shape == (1, )