def fit(self, X_train, y_train):
    # fit one linear quantile regression per requested quantile
    print("training")
    for q in tqdm(self.quantiles):
        reg = QuantileLinearRegression(quantile=q)
        reg.fit(X_train, y_train)
        self.estimators.append(reg)
    print("Done")
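# A minimal usage sketch around the fit method above. The wrapper class name
# (MultiQuantileModel) and its constructor are hypothetical, since the original
# class definition is not shown; only QuantileLinearRegression and tqdm are real.
import numpy
from tqdm import tqdm
from mlinsights.mlmodel import QuantileLinearRegression

class MultiQuantileModel:
    def __init__(self, quantiles=(0.1, 0.5, 0.9)):
        self.quantiles = list(quantiles)
        self.estimators = []

    def fit(self, X_train, y_train):
        # one model per requested quantile, as in the method above
        for q in tqdm(self.quantiles):
            reg = QuantileLinearRegression(quantile=q)
            reg.fit(X_train, y_train)
            self.estimators.append(reg)
        return self

X = numpy.linspace(0., 1., 200).reshape(-1, 1)
y = 3.4 * X.ravel() + 5.6 + numpy.random.uniform(-0.5, 0.5, 200)
model = MultiQuantileModel().fit(X, y)
# one prediction band per quantile
bands = {q: est.predict(X) for q, est in zip(model.quantiles, model.estimators)}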
def test_quantile_regression_intercept_D2(self):
    X = numpy.array([[0.1, 0.2], [0.2, 0.3], [0.3, 0.3]])
    Y = numpy.array([[1., 0.], [1.1, 0.1], [1.2, 0.19]])
    clr = LinearRegression(fit_intercept=True)
    clr.fit(X, Y)
    clq = QuantileLinearRegression(verbose=False, fit_intercept=True)
    # a two-dimensional target is rejected: quantile regression
    # only supports a single output
    self.assertRaise(lambda: clq.fit(X, Y), ValueError)
def test_quantile_regression_diff_quantile(self):
    X = numpy.array([[0.1], [0.2], [0.3], [0.4], [0.5], [0.6]])
    # the target contains one outlier (10)
    Y = numpy.array([1., 1.11, 1.21, 10, 1.29, 1.39])
    clqs = []
    scores = []
    for q in [0.25, 0.4999, 0.5, 0.5001, 0.75]:
        clq = QuantileLinearRegression(
            verbose=False, fit_intercept=True, quantile=q)
        clq.fit(X, Y)
        clqs.append(clq)
        sc = clq.score(X, Y)
        scores.append(sc)
        self.assertGreater(sc, 0)
    # quantiles close to the median produce nearly identical models
    self.assertLesser(abs(clqs[1].intercept_ - clqs[2].intercept_), 0.01)
    self.assertLesser(abs(clqs[2].intercept_ - clqs[3].intercept_), 0.01)
    self.assertLesser(abs(clqs[1].coef_[0] - clqs[2].coef_[0]), 0.01)
    self.assertLesser(abs(clqs[2].coef_[0] - clqs[3].coef_[0]), 0.01)
    # distant quantiles produce distinct models
    self.assertGreater(abs(clqs[0].intercept_ - clqs[1].intercept_), 0.01)
    # self.assertGreater(abs(clqs[3].intercept_ - clqs[4].intercept_), 0.01)
    self.assertGreater(abs(clqs[0].coef_[0] - clqs[1].coef_[0]), 0.05)
    # self.assertGreater(abs(clqs[3].coef_[0] - clqs[4].coef_[0]), 0.05)
    self.assertLesser(abs(scores[1] - scores[2]), 0.01)
    self.assertLesser(abs(scores[2] - scores[3]), 0.01)
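# Why the test tolerates the outlier: a median fit (quantile=0.5) is barely
# moved by the outlying target 10, while ordinary least squares is pulled
# toward it. A standalone sketch on the same data as the test above:
import numpy
from sklearn.linear_model import LinearRegression
from mlinsights.mlmodel import QuantileLinearRegression

X = numpy.array([[0.1], [0.2], [0.3], [0.4], [0.5], [0.6]])
Y = numpy.array([1., 1.11, 1.21, 10, 1.29, 1.39])
ols = LinearRegression().fit(X, Y)
med = QuantileLinearRegression(quantile=0.5).fit(X, Y)
print(ols.coef_, ols.intercept_)   # strongly affected by the value 10
print(med.coef_, med.intercept_)   # close to the slope of the clean points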
def test_quantile_regression_pandas(self):
    X = pandas.DataFrame(numpy.array([[0.1, 0.2], [0.2, 0.3]]))
    Y = numpy.array([1., 1.1])
    clr = LinearRegression(fit_intercept=False)
    clr.fit(X, Y)
    clq = QuantileLinearRegression(fit_intercept=False)
    clq.fit(X, Y)
    self.assertEqual(clr.intercept_, 0)
    self.assertEqualArray(clr.coef_, clq.coef_)
    self.assertEqual(clq.intercept_, 0)
    self.assertEqualArray(clr.intercept_, clq.intercept_)
def test_quantile_regression_intercept(self):
    X = numpy.array([[0.1, 0.2], [0.2, 0.3], [0.3, 0.3]])
    Y = numpy.array([1., 1.1, 1.2])
    clr = LinearRegression(fit_intercept=True)
    clr.fit(X, Y)
    clq = QuantileLinearRegression(verbose=False, fit_intercept=True)
    clq.fit(X, Y)
    self.assertNotEqual(clr.intercept_, 0)
    self.assertNotEqual(clq.intercept_, 0)
    self.assertEqualArray(clr.intercept_, clq.intercept_)
    self.assertEqualArray(clr.coef_, clq.coef_)
def test_quantile_regression_list2(self):
    X = random(1000)
    eps1 = (random(900) - 0.5) * 0.1
    eps2 = random(100) * 2
    eps = numpy.hstack([eps1, eps2])
    X = X.reshape((1000, 1))  # pylint: disable=E1101
    # X has shape (1000, 1) and eps has shape (1000,), so Y broadcasts
    # to a 2D array, which QuantileLinearRegression rejects
    Y = X * 3.4 + 5.6 + eps
    clq = QuantileLinearRegression(verbose=False, fit_intercept=True)
    self.assertRaise(lambda: clq.fit(X, Y), ValueError)
    Y = X.ravel() * 3.4 + 5.6 + eps
    clq = QuantileLinearRegression(verbose=False, fit_intercept=True)
    clq.fit(X, Y)
    clr = LinearRegression(fit_intercept=True)
    clr.fit(X, Y)
    self.assertNotEqual(clr.intercept_, 0)
    self.assertNotEqual(clq.intercept_, 0)
    self.assertNotEqualArray(clr.coef_, clq.coef_)
    self.assertNotEqualArray(clr.intercept_, clq.intercept_)
    self.assertLesser(clq.n_iter_, 10)
    pr = clr.predict(X)
    pq = clq.predict(X)
    self.assertEqual(pr.shape, pq.shape)
def test_quantile_regression_no_intercept_positive(self):
    X = numpy.array([[0.1, 0.2], [0.2, 0.3]])
    Y = numpy.array([1., 1.1])
    clr = LinearRegression(fit_intercept=False, positive=True)
    clr.fit(X, Y)
    clq = QuantileLinearRegression(fit_intercept=False, positive=True)
    clq.fit(X, Y)
    self.assertEqual(clr.intercept_, 0)
    self.assertEqual(clq.intercept_, 0)
    self.assertGreater(clr.coef_.min(), 0)
    self.assertGreater(clq.coef_.min(), 0)
    self.assertEqualArray(clr.intercept_, clq.intercept_)
    self.assertEqualArray(clr.coef_[0], clq.coef_[0])
    self.assertGreater(clr.coef_[1:].min(), 3)
    self.assertGreater(clq.coef_[1:].min(), 3)
def test_quantile_regression_grid_search(self):
    X = random(100)
    eps1 = (random(90) - 0.5) * 0.1
    eps2 = random(10) * 2
    eps = numpy.hstack([eps1, eps2])
    X = X.reshape((100, 1))  # pylint: disable=E1101
    Y = X.ravel() * 3.4 + 5.6 + eps
    self.assertRaise(
        lambda: test_sklearn_grid_search_cv(
            lambda: QuantileLinearRegression(), X, Y),
        ValueError)
    res = test_sklearn_grid_search_cv(
        lambda: QuantileLinearRegression(), X, Y, delta=[0.1, 0.001])
    self.assertIn('model', res)
    self.assertIn('score', res)
    self.assertGreater(res['score'], 0)
    self.assertLesser(res['score'], 1)
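# An equivalent sketch with scikit-learn's own GridSearchCV, assuming the
# test helper above wraps something similar. 'delta' is the smoothing
# parameter of QuantileLinearRegression being searched; an explicit scorer
# is used because the estimator's own score returns an error (lower is better).
import numpy
from numpy.random import random
from sklearn.model_selection import GridSearchCV
from mlinsights.mlmodel import QuantileLinearRegression

X = random(100).reshape((100, 1))
Y = X.ravel() * 3.4 + 5.6 + (random(100) - 0.5) * 0.1
grid = GridSearchCV(QuantileLinearRegression(),
                    param_grid={'delta': [0.1, 0.001]},
                    scoring='neg_mean_absolute_error', cv=3)
grid.fit(X, Y)
print(grid.best_params_, grid.best_score_)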
def test_quantile_regression_pickle(self):
    X = random(100)
    eps1 = (random(90) - 0.5) * 0.1
    eps2 = random(10) * 2
    eps = numpy.hstack([eps1, eps2])
    X = X.reshape((100, 1))  # pylint: disable=E1101
    Y = X.ravel() * 3.4 + 5.6 + eps
    test_sklearn_pickle(lambda: LinearRegression(), X, Y)
    test_sklearn_pickle(lambda: QuantileLinearRegression(), X, Y)
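# A sketch of what a pickle round-trip check looks like with the standard
# library (assuming test_sklearn_pickle verifies something along these lines):
# a fitted QuantileLinearRegression serializes and predicts identically.
import pickle
import numpy
from numpy.random import random
from mlinsights.mlmodel import QuantileLinearRegression

X = random(100).reshape((100, 1))
Y = X.ravel() * 3.4 + 5.6 + (random(100) - 0.5) * 0.1
clq = QuantileLinearRegression().fit(X, Y)
clq2 = pickle.loads(pickle.dumps(clq))
assert numpy.allclose(clq.predict(X), clq2.predict(X))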
def test_quantile_regression_quantile_check(self):
    n = 100
    X = (numpy.arange(n) / n)
    Y = X + X * X / n
    X = X.reshape((n, 1))
    for q in [0.1, 0.5, 0.9]:
        clq = QuantileLinearRegression(
            verbose=False, fit_intercept=True, quantile=q, max_iter=10)
        clq.fit(X, Y)
        y = clq.predict(X)
        diff = y - Y
        sign = numpy.sign(diff)  # pylint: disable=E1111
        pos = (sign > 0).sum()  # pylint: disable=W0143
        neg = (sign < 0).sum()  # pylint: disable=W0143
        if q < 0.5:
            self.assertGreater(neg, pos * 4)
        if q > 0.5:
            self.assertLesser(neg * 7, pos)
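# The property the test checks, stated directly: minimizing the pinball loss
# at quantile q leaves roughly a fraction q of the targets below the fitted
# line. A standalone sketch on synthetic data:
import numpy
from mlinsights.mlmodel import QuantileLinearRegression

n = 1000
X = numpy.random.uniform(size=(n, 1))
Y = X.ravel() * 3.0 + numpy.random.normal(size=n)
for q in [0.1, 0.5, 0.9]:
    clq = QuantileLinearRegression(quantile=q, fit_intercept=True, max_iter=20)
    clq.fit(X, Y)
    below = (Y < clq.predict(X)).mean()
    print(q, below)  # 'below' should land near q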
def test_quantile_regression_list(self):
    # a plain Python list is not accepted as input
    X = [[0.1, 0.2], [0.2, 0.3]]
    Y = numpy.array([1., 1.1])
    clq = QuantileLinearRegression(fit_intercept=False)
    self.assertRaise(lambda: clq.fit(X, Y), TypeError)
def test_quantile_regression_clone(self):
    test_sklearn_clone(lambda: QuantileLinearRegression(delta=0.001))
# Trend of the call-center series: ordinary least squares (red) versus
# least absolute deviation, i.e. median regression (green).
plt.plot(X, y)

ols_model = lm.LinearRegression()
ols_model.fit(X, y)
ols_trend = ols_model.predict(X)
print(ols_model.coef_)
print(ols_trend[-1] - ols_trend[0])
plt.plot(X, ols_trend, color="r")

X_lad = np.array(call_center_data.index).reshape(-1, 1)
y_lad = np.array(call_center_data["calls"])
# print(X_lad)
lad_model = QuantileLinearRegression(verbose=True)  # default quantile=0.5
lad_model.fit(X_lad, y_lad)
lad_trend = lad_model.predict(X_lad)
print(lad_model.coef_)
print(lad_trend[-1] - lad_trend[0])
plt.plot(X, lad_trend, color="g")
plt.show()
def voronoi_estimation_from_lr(L, B, C=None, D=None, cl=0, qr=True,
                               max_iter=None, verbose=False):
    """
    Determines a Voronoi diagram close to a convex partition
    defined by a logistic regression in *n* classes,
    where :math:`L \\in \\mathbb{M}_{nd}` is the matrix of rows
    :math:`(L_1, ..., L_n)`. Every border between two classes
    *i* and *j* is defined by:
    :math:`\\scal{L_i}{X} + B_i = \\scal{L_j}{X} + B_j`.

    The function looks for a set of points from which the Voronoi
    diagram can be inferred. It is done through a linear regression
    with the *L1* norm. See :ref:`l-lrvor-connection`.

    @param      L           matrix
    @param      B           vector
    @param      C           additional conditions (see below)
    @param      D           additional conditions (see below)
    @param      cl          class on which the additional conditions apply
    @param      qr          use quantile regression
    @param      max_iter    number of conditions to remove until convergence
    @param      verbose     display information while training
    @return                 matrix :math:`P \\in \\mathbb{M}_{nd}`

    The function solves the linear system:

    .. math::

        \\left\\{\\begin{array}{l}
        \\scal{\\frac{L_i-L_j}{\\norm{L_i-L_j}}}{P_i + P_j} +
        2 \\frac{B_i - B_j}{\\norm{L_i-L_j}} = 0 \\\\
        \\scal{P_i - P_j}{u_{ij}} -
        \\scal{P_i - P_j}{\\frac{L_i-L_j}{\\norm{L_i-L_j}}}
        \\scal{\\frac{L_i-L_j}{\\norm{L_i-L_j}}}{u_{ij}} = 0
        \\end{array}\\right.

    If the number of dimensions is large and the number of classes
    is small, the system has multiple solutions. Additional conditions
    must be added, such as :math:`CP_i = D` where *i=cl* and :math:`P_i`
    is the Voronoi point attached to class *cl*.

    `Quantile regression
    <https://fr.wikipedia.org/wiki/R%C3%A9gression_quantile>`_
    is not implemented in :epkg:`scikit-learn`. We use
    `QuantileLinearRegression
    <http://www.xavierdupre.fr/app/mlinsights/helpsphinx/mlinsights/mlmodel/quantile_regression.html>`_.

    After the first iteration, the function determines the farthest
    pair of points and removes it from the list of equations.
    If *max_iter* is None, the system goes until the number of equations
    is equal to the number of points * 2, otherwise it stops after
    *max_iter* removals. This is not the optimal pair to remove as the
    two points could still be neighbors, but it is a good heuristic.
    """
    labels_inv = {}
    nb_constraints = numpy.zeros((L.shape[0], ))
    matL = []
    matB = []
    for i in range(0, L.shape[0]):
        for j in range(i + 1, L.shape[0]):
            li = L[i, :]
            lj = L[j, :]
            c = (li - lj)
            nc = (c.T @ c) ** 0.5

            # first condition
            mat = numpy.zeros((L.shape))
            mat[i, :] = c
            mat[j, :] = c
            d = -2 * (B[i] - B[j])
            matB.append(d)
            matL.append(mat.ravel())
            labels_inv[i, j, 'eq1'] = len(matL) - 1
            nb_constraints[i] += 1
            nb_constraints[j] += 1

            # condition 2 - hides multiple equations, we pick one
            coor = 0
            found = False
            while not found and coor < len(c):
                if c[coor] == 0:
                    coor += 1
                    continue
                if c[coor] == nc:
                    coor += 1
                    continue
                found = True
            if not found:
                raise ValueError(
                    "Matrix L has two similar rows {0} and {1}. "
                    "Problem cannot be solved.".format(i, j))

            c /= nc
            c2 = c * c[coor]
            mat = numpy.zeros((L.shape))
            mat[i, :] = -c2
            mat[j, :] = c2
            mat[i, coor] += 1
            mat[j, coor] -= 1
            matB.append(0)
            matL.append(mat.ravel())
            labels_inv[i, j, 'eq2'] = len(matL) - 1
            nb_constraints[i] += 1
            nb_constraints[j] += 1

    nbeq = (L.shape[0] * (L.shape[0] - 1)) // 2
    matL = numpy.array(matL)
    matB = numpy.array(matB)
    if max_iter is None:
        max_iter = matL.shape[0] - matL.shape[1]
    if nbeq * 2 <= L.shape[0] * L.shape[1]:
        if C is None and D is None:
            warnings.warn(
                "[voronoi_estimation_from_lr] Additional conditions "
                "are required.")
    if C is not None and D is not None:
        matL = numpy.vstack([matL, numpy.zeros((1, matL.shape[1]))])
        a = cl * L.shape[1]
        b = a + L.shape[1]
        matL[-1, a:b] = C
        if not isinstance(D, float):
            raise TypeError("D must be a float not {0}".format(type(D)))
        matB = numpy.hstack([matB, [D]])
    elif C is None and D is None:
        pass
    else:
        raise ValueError("C and D must be None together or not None together.")

    sample_weight = numpy.ones((matL.shape[0], ))
    tol = numpy.abs(matL.ravel()).max() * 1e-8 / matL.shape[0]
    order_removed = []
    removed = set()
    for it in range(0, max(max_iter, 1)):
        if qr:
            clr = QuantileLinearRegression(
                fit_intercept=False, max_iter=max(matL.shape))
        else:
            clr = LinearRegression(fit_intercept=False)
        clr.fit(matL, matB, sample_weight=sample_weight)
        score = clr.score(matL, matB, sample_weight)
        res = clr.coef_
        res = res.reshape(L.shape)

        # early stopping
        if score < tol:
            if verbose:
                print('[voronoi_estimation_from_lr] iter={0}/{1} '
                      'score={2} tol={3}'.format(it + 1, max_iter, score, tol))
            break

        # defines the best pair of points to remove
        dist2 = pairwise_distances(res, res)
        dist = [(d, n // dist2.shape[0], n % dist2.shape[1])
                for n, d in enumerate(dist2.ravel())]
        dist = [_ for _ in dist if _[1] < _[2]]
        dist.sort(reverse=True)

        # test equal points
        if dist[-1][0] < tol:
            _, i, j = dist[-1]
            eq1 = labels_inv[i, j, 'eq1']
            eq2 = labels_inv[i, j, 'eq2']
            if sample_weight[eq1] == 0 and sample_weight[eq2] == 0:
                sample_weight[eq1] = 1
                sample_weight[eq2] = 1
                nb_constraints[i] += 1
                nb_constraints[j] += 1
            else:
                keep = (i, j)
                pos = len(order_removed) - 1
                while pos >= 0:
                    i, j = order_removed[pos]
                    if i in keep or j in keep:
                        eq1 = labels_inv[i, j, 'eq1']
                        eq2 = labels_inv[i, j, 'eq2']
                        if sample_weight[eq1] == 0 and sample_weight[eq2] == 0:
                            sample_weight[eq1] = 1
                            sample_weight[eq2] = 1
                            nb_constraints[i] += 1
                            nb_constraints[j] += 1
                            break
                    pos -= 1
                if pos < 0:
                    forma = ('Two classes have been merged in a single '
                             'Voronoi point (dist={0} < {1}). max_iter '
                             'should be lower than {2}')
                    raise VoronoiEstimationError(
                        forma.format(dist[-1][0], tol, it))

        dmax, i, j = dist[0]
        pos = 0
        while ((i, j) in removed or nb_constraints[i] == 0
                or nb_constraints[j] == 0):
            pos += 1
            if pos == len(dist):
                break
            dmax, i, j = dist[pos]
        if pos == len(dist):
            break

        removed.add((i, j))
        order_removed.append((i, j))
        eq1 = labels_inv[i, j, 'eq1']
        eq2 = labels_inv[i, j, 'eq2']
        sample_weight[eq1] = 0
        sample_weight[eq2] = 0
        nb_constraints[i] -= 1
        nb_constraints[j] -= 1

        if verbose:
            print('[voronoi_estimation_from_lr] iter={0}/{1} score={2:.3g} '
                  'tol={3:.3g} del P{4},{5} d={6:.3g}'.format(
                      it + 1, max_iter, score, tol, i, j, dmax))

    return res
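# A minimal usage sketch for voronoi_estimation_from_lr: train a multinomial
# logistic regression on toy blobs and recover one Voronoi point per class
# from its coefficients (synthetic data, arbitrary parameters).
import numpy
from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression

X, y = make_blobs(n_samples=300, centers=4, n_features=2, random_state=0)
clr = LogisticRegression(max_iter=1000).fit(X, y)
# L is the coefficient matrix (one row per class), B the intercept vector
points = voronoi_estimation_from_lr(clr.coef_, clr.intercept_,
                                    qr=True, verbose=False)
print(points)  # estimated Voronoi points, shape (n_classes, n_features)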
print('Training Root Mean Squared Error:',
      np.sqrt(metrics.mean_squared_error(Y_tre, model_lasso.predict(X_tre))))

preds.plot(x="preds", y="Standardize residuals", kind="scatter")

df_results = pd.DataFrame({
    'Predicted MSRP': model_lasso.predict(X_tee),
    'Actual MSRP': Y_tee
})
df_results.plot('Actual MSRP', 'Predicted MSRP', kind='scatter')

# In[ ]:

# from mlinsights.mlmodel import PiecewiseRegressor
# from sklearn.tree import DecisionTreeRegressor

clqs = {}
for qu in [0.25, 0.5, 0.85]:
    clq = QuantileLinearRegression(quantile=qu)
    clq.fit(X_tr, Y_tr)
    clqs['q=%1.2f' % qu] = clq
    print(clq)
    print('Training Mean Absolute Error:',
          metrics.mean_absolute_error(Y_tr, clq.predict(X_tr)))
    print('Testing Mean Squared Error:',
          metrics.mean_squared_error(Y_te, clq.predict(X_te)))
    print('Training Root Mean Squared Error:',
          np.sqrt(metrics.mean_squared_error(Y_tr, clq.predict(X_tr))))
    print('Testing Root Mean Squared Error:',
          np.sqrt(metrics.mean_squared_error(Y_te, clq.predict(X_te))))
    R2_tr = r2_score(Y_tr, clq.predict(X_tr))
    print(R2_tr)
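# Follow-up sketch: score each fitted model with the pinball (quantile) loss
# it actually optimizes; the MSE/MAE printed above do not reflect a quantile
# objective. Assumes clqs, X_te and Y_te from the cell above are in scope.
import numpy as np

def pinball_loss(y_true, y_pred, q):
    # mean of q*e for under-predictions and (q-1)*e for over-predictions
    e = y_true - y_pred
    return np.mean(np.maximum(q * e, (q - 1) * e))

for name, model in clqs.items():
    q = float(name.split('=')[1])
    print(name, pinball_loss(Y_te, model.predict(X_te), q))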