def test_condense_around_knots_matches_mse_with_random_pwl_curves(self):
    # For any two PWLCurves defined on the same knot xs, the difference of
    # their errors must be the same whether it is measured on the full data
    # or on the points condensed around those knots.
    np.random.seed(5)
    num_points = 100
    x = np.sort(np.random.normal(size=num_points))
    y = x + np.random.normal(size=num_points)
    w = np.random.uniform(size=num_points)

    # Four distinct data xs act as the knots.
    knot_xs = np.sort(np.random.choice(x, size=4, replace=False))
    num_knots = len(knot_xs)

    # Draw both sets of knot ys before building the curves so the random
    # stream is consumed in a fixed order.
    ys_a = np.random.normal(size=num_knots)
    ys_b = np.random.normal(size=num_knots)
    curve_a = pwlcurve.PWLCurve(list(zip(knot_xs, ys_a)))
    curve_b = pwlcurve.PWLCurve(list(zip(knot_xs, ys_b)))

    cond_x, cond_y, cond_w = linear_condense.condense_around_knots(
        x, y, w, knot_xs)
    self.assertLessEqual(len(cond_x), 2 * num_knots - 2)

    def error_delta(px, py, pw):
        # Error of curve_a minus error of curve_b on the given points.
        error_a = _curve_error_on_points(px, py, pw, curve_a)
        error_b = _curve_error_on_points(px, py, pw, curve_b)
        return error_a - error_b

    full_delta = error_delta(x, y, w)
    condensed_delta = error_delta(cond_x, cond_y, cond_w)
    self.assertAlmostEqual(full_delta, condensed_delta)
def test_condense_around_knots_with_repeated_points_on_knots(self):
    # The xs are integers, so many samples land exactly on a knot. Such a
    # sample may be assigned to the range below the knot or to the range
    # above it, but it must be counted in exactly one of them. If it were
    # dropped or double-counted, the error deltas checked below would differ.
    np.random.seed(12)
    knot_xs = np.array([1., 5., 8., 10.])
    x = np.sort(np.random.randint(10, size=1000))
    num_points = len(x)
    y = x + np.random.normal(size=num_points)
    w = np.random.uniform(size=num_points)

    # Two random curves on the fixed knots; ys are drawn before either curve
    # is built so the random stream is consumed in a fixed order.
    num_knots = len(knot_xs)
    ys_a = np.random.normal(size=num_knots)
    ys_b = np.random.normal(size=num_knots)
    curve_a = pwlcurve.PWLCurve(list(zip(knot_xs, ys_a)))
    curve_b = pwlcurve.PWLCurve(list(zip(knot_xs, ys_b)))

    cond_x, cond_y, cond_w = linear_condense.condense_around_knots(
        x, y, w, knot_xs)
    self.assertLessEqual(len(cond_x), 2 * num_knots - 2)

    def error_delta(px, py, pw):
        # Error of curve_a minus error of curve_b on the given points.
        error_a = _curve_error_on_points(px, py, pw, curve_a)
        error_b = _curve_error_on_points(px, py, pw, curve_b)
        return error_a - error_b

    full_delta = error_delta(x, y, w)
    condensed_delta = error_delta(cond_x, cond_y, cond_w)
    self.assertAlmostEqual(full_delta, condensed_delta)
def test_condense_around_knots_matches_mse_with_random_pwl_curves(self):
    # Exercises linear_condense.sample_condense_points, which returns a set
    # of knot xs together with condensed (x, y, w) points. For any two
    # PWLCurves defined on the returned knots, the difference of their errors
    # must match between the full data and the condensed points.
    # NOTE(review): this method shares its name with another test in this
    # file; if both live in the same class the later definition shadows the
    # earlier one. Consider renaming after sample_condense_points.
    np.random.seed(13)
    x = np.sort(np.random.normal(size=937))
    num_points = len(x)
    y = x + np.random.normal(size=num_points)
    w = np.random.uniform(size=num_points)

    sampled = linear_condense.sample_condense_points(x, y, w, 100)
    knot_xs, cond_x, cond_y, cond_w = sampled

    # Two random curves on the sampled knots; ys are drawn before either
    # curve is built so the random stream is consumed in a fixed order.
    num_knots = len(knot_xs)
    ys_a = np.random.normal(size=num_knots)
    ys_b = np.random.normal(size=num_knots)
    curve_a = pwlcurve.PWLCurve(list(zip(knot_xs, ys_a)))
    curve_b = pwlcurve.PWLCurve(list(zip(knot_xs, ys_b)))

    def error_delta(px, py, pw):
        # Error of curve_a minus error of curve_b on the given points.
        error_a = _curve_error_on_points(px, py, pw, curve_a)
        error_b = _curve_error_on_points(px, py, pw, curve_b)
        return error_a - error_b

    full_delta = error_delta(x, y, w)
    condensed_delta = error_delta(cond_x, cond_y, cond_w)
    self.assertAlmostEqual(full_delta, condensed_delta)
def test_fit_pwl_with_four_segment_unimodal(self):
    # Noise-free samples of a curve that rises to a peak and then falls.
    # A four-segment bitonic fit is expected to reproduce them.
    x = np.arange(51, dtype=float) / 10
    true_curve = pwlcurve.PWLCurve([(0, 0), (1, 2), (2, 5), (3, 2), (4, 0)])
    y = true_curve.eval(x)

    predicted = pwl_predict(
        x,
        y,
        num_segments=4,
        fx=transform.identity,
        mono=fitter.MonoType.bitonic)
    self.assert_allclose(y, predicted)
def test_fit_pwl_points_non_mono_two_segment(self):
    # Data sampled from a two-segment "tent"; fit_pwl_points with two
    # segments is expected to recover the three generating control points.
    x = np.arange(51, dtype=float)
    tent = pwlcurve.PWLCurve([(0, 0), (25, 25), (50, 0)])
    y = tent.eval(x)
    w = np.ones_like(x)

    # x is passed both as the first argument and as the data xs.
    fitted = fitter.fit_pwl_points(x, x, y, w, 2)
    fitted_xs, fitted_ys = fitted

    self.assert_allclose(fitted_xs, [0, 25, 50])
    self.assert_allclose(fitted_ys, [0, 25, 0])
def test_fit_pwl_with_four_segment_unimodal_with_slope_restrictions(self):
    x = np.arange(51, dtype=float) / 10
    true_curve = pwlcurve.PWLCurve([(0, 0), (1, 2), (2, 5), (3, 2), (4, 0)])
    y = true_curve.eval(x)

    def bitonic_predict(**slope_limits):
        # Four-segment bitonic fit on (x, y) with the given slope bounds.
        return pwl_predict(
            x,
            y,
            num_segments=4,
            fx=transform.identity,
            mono=fitter.MonoType.bitonic,
            **slope_limits)

    # The generating segments have slopes 2, 3, -3 and -2, so bounds of
    # exactly [-3, 3] still admit a perfect fit.
    self.assert_allclose(y, bitonic_predict(min_slope=-3, max_slope=3))

    # A minimum slope above -3 rules out the steepest falling segment.
    self.assert_notallclose(y, bitonic_predict(min_slope=-2))

    # A maximum slope below 3 rules out the steepest rising segment.
    self.assert_notallclose(y, bitonic_predict(max_slope=2))
def test_fit_pwl_unimodal_with_forced_direction(self):
    x = np.arange(51, dtype=float) / 10
    y = pwlcurve.PWLCurve([(0, 0), (1, 2), (2, 5), (3, 2), (4, 0)]).eval(x)

    def fit(target, mono_type):
        # Four-segment fit of target on x under the given shape restriction.
        return fitter.fit_pwl(
            x, target, num_segments=4, fx=transform.identity, mono=mono_type)

    down = fitter.MonoType.bitonic_concave_down
    up = fitter.MonoType.bitonic_concave_up

    # y rises then falls, so only the concave-down fit can match it.
    down_fit = fit(y, down)
    up_fit = fit(y, up)
    self.assert_allclose(y, down_fit.eval(x))
    self.assert_notallclose(y, up_fit.eval(x))

    # -y falls then rises, so only the concave-up fit can match it.
    down_fit = fit(-y, down)
    up_fit = fit(-y, up)
    self.assert_allclose(-y, up_fit.eval(x))
    self.assert_notallclose(-y, down_fit.eval(x))
def test_simple_slope_restrictions(self):
    # The data lies on a single line from (0, 0) to (50, 75), i.e. slope 1.5.
    x = np.arange(51, dtype=float)
    y = pwlcurve.PWLCurve([(0, 0), (50, 75)]).eval(x)
    w = np.ones_like(x)

    # Restrictions that still allow slope 1.5 leave the perfect fit reachable.
    compatible = (
        dict(min_slope=1),
        dict(max_slope=2),
        dict(min_slope=1, mono=False),
        dict(max_slope=2, mono=False),
    )
    for restrictions in compatible:
        self.assert_allclose(y, pwl_predict(x, y, w, 1, **restrictions))

    # Restrictions that exclude slope 1.5 make a perfect fit impossible.
    incompatible = (
        dict(max_slope=1),
        dict(min_slope=2),
        dict(max_slope=1, mono=False),
        dict(min_slope=2, mono=False),
    )
    for restrictions in incompatible:
        self.assert_notallclose(y, pwl_predict(x, y, w, 1, **restrictions))
def test_eval_clamping_with_differing_float_precision(self):
    # The knots are Python floats while the inputs are float32. Inputs below
    # the first knot or above the last knot are expected to evaluate to the
    # y of the nearest end knot.
    first_y, last_y = -0.07331, -0.1255
    curve = pwlcurve.PWLCurve([(1.0, first_y), (2.0, last_y)], fx=np.log1p)

    xs = np.array([0., 1., 2., 3.], dtype=np.float32)
    expected_ys = [first_y, first_y, last_y, last_y]
    self.assert_allclose(expected_ys, curve.eval(xs))
def test_mono_increasing_three_segment_pwl(self):
    # Data from an increasing three-segment curve; fits with three or four
    # segments are both expected to reproduce it.
    x = np.arange(51, dtype=float)
    true_curve = pwlcurve.PWLCurve([(0, 0), (10, 1), (25, 25), (50, 60)])
    y = true_curve.eval(x)
    w = np.ones_like(x)

    for num_segments in (3, 4):
        self.assert_allclose(y, pwl_predict(x, y, w, num_segments))
def test_learn_ends_has_no_effect_when_endpoints_are_ideal(self):
    # The generating curve already has knots at both ends of the data, so
    # fitting with and without learn_ends is expected to give equal curves.
    x = np.arange(51, dtype=float)
    true_curve = pwlcurve.PWLCurve([(0, 0), (10, 1), (25, 25), (50, 60)])
    y = true_curve.eval(x)
    w = np.ones_like(x)

    learned_ends = fitter.fit_pwl(x, y, w, 3, learn_ends=True)
    fixed_ends = fitter.fit_pwl(x, y, w, 3, learn_ends=False)
    self.assertEqual(learned_ends, fixed_ends)
def __init__(
    self,
    feature_name: str,
    control_points: Sequence[Tuple[float, float]],
    fx: Callable[[Sequence[float]], Sequence[float]] = transform.identity):
    """Creates a model backed by a single PWLCurve.

    Args:
      feature_name: Name stored on the model as `_feature_name`.
      control_points: (x, y) pairs passed to pwlcurve.PWLCurve.
      fx: Transform passed to pwlcurve.PWLCurve together with the control
        points. Defaults to transform.identity.
    """
    self._feature_name = feature_name
    curve = pwlcurve.PWLCurve(control_points, fx)
    self._curve = curve
    # Base-class initialisation runs last, after both attributes are set.
    super(PWLCurveModel, self).__init__()
def test_reports_correct_error(self):
    # The error returned by solve() must equal the error of the curve built
    # from the returned knot ys, both without and with a min_slope bound.
    np.random.seed(58440)
    num_points = 100
    x = np.sort(np.random.uniform(size=num_points))
    y = x**2 + np.random.normal(scale=.2, size=num_points)
    w = np.random.uniform(size=num_points)
    knots = [.2, .5, .8, .9]

    # First the unrestricted solver, then the one with min_slope=0.
    for solver_kwargs in ({}, {'min_slope': 0}):
        solver = fitter._WeightedLeastSquaresPWLSolver(
            x, y, w, **solver_kwargs)
        knot_ys, reported_error = solver.solve(knots)
        solved_curve = pwlcurve.PWLCurve(list(zip(knots, knot_ys)))
        true_error = _curve_error_on_points(x, y, w, solved_curve)
        self.assertAlmostEqual(true_error, reported_error)
def test_non_mono_three_segment_pwl(self):
    # Data from a non-monotone three-segment curve; unrestricted fits with
    # three or four segments are both expected to reproduce it.
    x = np.arange(51, dtype=float)
    true_curve = pwlcurve.PWLCurve([(0, 0), (10, 25), (25, 10), (50, 60)])
    y = true_curve.eval(x)
    w = np.ones_like(x)

    for num_segments in (3, 4):
        predicted = pwl_predict(
            x, y, w, num_segments, mono=False, fx=transform.identity)
        self.assert_allclose(y, predicted)
def test_one_segment_pwl_with_flat_ends(self):
    # The data is flat up to x=10, rises linearly to x=40, then is flat
    # again. A single segment fits it perfectly only when its knots are
    # (10, 0) and (40, 50), so fit_pwl has to learn those end knots.
    x = np.arange(51, dtype=float)
    true_curve = pwlcurve.PWLCurve([(0, 0), (10, 0), (40, 50), (50, 50)])
    y = true_curve.eval(x)
    w = np.ones_like(x)

    fitted = fitter.fit_pwl(x, y, w, 1)

    ideal_points = [(10, 0), (40, 50)]
    self.assert_allclose(ideal_points, fitted.points)
    self.assert_allclose(y, fitted.eval(x))
    # The fitted curve is expected to carry the identity transform.
    self.assertEqual(transform.identity, fitted.fx)
def __init__(
    self,
    feature_name: str,
    control_points: Sequence[Tuple[float, float]],
    fx: Union[Callable[[Sequence[float]], Sequence[float]],
              str] = transform.identity):
    """Creates a model backed by a single PWLCurve.

    Args:
      feature_name: Name stored on the model as `_feature_name`.
      control_points: (x, y) pairs passed to pwlcurve.PWLCurve.
      fx: Either a transform callable or a string key. A string is looked up
        in pwlcurve.PWLCurve.STR_TO_FX to obtain the transform. Defaults to
        transform.identity.
    """
    self._feature_name = feature_name
    # Resolve a string key to its transform; anything else is used as given.
    resolved_fx = (
        pwlcurve.PWLCurve.STR_TO_FX[fx] if isinstance(fx, str) else fx)
    self._curve = pwlcurve.PWLCurve(control_points, resolved_fx)
    # Base-class initialisation runs last, after both attributes are set.
    super(PWLCurveModel, self).__init__()
def test_non_mono_increasing_two_segment_pwl_with_flat_ends(self):
    # The data is flat, rises to a peak at x=25, falls back, then is flat
    # again. Two segments fit it perfectly only with knots (10, 0), (25, 15)
    # and (40, 0), so fit_pwl has to learn those knots.
    x = np.arange(51, dtype=float)
    true_curve = pwlcurve.PWLCurve(
        [(0, 0), (10, 0), (25, 15), (40, 0), (50, 0)])
    y = true_curve.eval(x)
    w = np.ones_like(x)

    fitted = fitter.fit_pwl(x, y, w, 2, mono=False, fx=transform.identity)

    ideal_points = [(10, 0), (25, 15), (40, 0)]
    self.assert_allclose(ideal_points, fitted.points)
    self.assert_allclose(y, fitted.eval(x))
def test_one_segment_pwl_with_flat_ends_but_no_learning_ends(self):
    # A single segment fits this data perfectly only with knots (10, 0) and
    # (40, 50). With learn_ends disabled the curve is expected to use 0 and
    # 50 as its xs, so it cannot reach that ideal fit.
    x = np.arange(51, dtype=float)
    true_curve = pwlcurve.PWLCurve([(0, 0), (10, 0), (40, 50), (50, 50)])
    y = true_curve.eval(x)
    w = np.ones_like(x)

    fitted = fitter.fit_pwl(x, y, w, 1, learn_ends=False)

    self.assertEqual([0, 50], fitted.xs)
    self.assert_notallclose(y, fitted.eval(x))
def test_eval_with_exp_transform(self):
    orig_points = [(1., 5.), (5., 13.), (10., 15.)]
    xs = [0, 1, 2, 5, 7.5, 10, 20]

    # Move the knot xs into exponential space and leave the ys untouched.
    knot_xs = np.exp([point_x for point_x, _ in orig_points])
    knot_ys = [point_y for _, point_y in orig_points]

    # With fx=np.log the curve interpolates in log-x space, which undoes the
    # exponential shift applied to both the knots and the inputs.
    curve = pwlcurve.PWLCurve(list(zip(knot_xs, knot_ys)), np.log)

    expected_ys = [5, 5, 7, 13, 14, 15, 15]
    self.assert_allclose(expected_ys, curve.eval(np.exp(xs)))
def test_non_mono_two_segment_log(self):
    # y is a two-segment tent in log(x) space, so fitting the raw xs with
    # fx=np.log should recover it.
    exp_x = 1 + np.arange(51, dtype=float)
    x = np.log(exp_x)
    tent = pwlcurve.PWLCurve([(x[0], 0), (x[25], 25), (x[50], 0)])
    y = tent.eval(x)
    w = np.ones_like(x)

    # Two or more non-monotone segments are enough for a close fit.
    for num_segments in (2, 3, 4):
        predicted = pwl_predict(
            exp_x, y, w, num_segments, mono=False, fx=np.log)
        self.assert_allclose(y, predicted)

    # A monotone curve cannot follow the rise and the fall.
    mono_predicted = pwl_predict(exp_x, y, w, 2, mono=True, fx=np.log)
    self.assert_notallclose(y, mono_predicted)
def test_fit_pwl_points_required_x_knots(self):
    x = np.arange(51, dtype=float)
    y = pwlcurve.PWLCurve([(0, 0), (25, 25), (50, 0)]).eval(x)
    w = np.ones_like(x)

    # Requiring knots the ideal fit already uses must leave the fit unchanged.
    with_optimal_knots = fitter.fit_pwl_points(
        x, x, y, w, 2, required_x_knots=[0, 25])
    unconstrained = fitter.fit_pwl_points(x, x, y, w, 2)
    self.assertEqual(with_optimal_knots, unconstrained)

    # Requiring a knot that the ideal fit does not use must change the fit.
    with_suboptimal_knot = fitter.fit_pwl_points(
        x, x, y, w, 2, required_x_knots=[5])
    unconstrained = fitter.fit_pwl_points(x, x, y, w, 2)
    self.assert_notallclose(with_suboptimal_knot, unconstrained)
def test_fit_pwl_with_weights(self):
    # Three points that are not collinear, with the last one weighted double.
    x = np.array([0., 25., 50])
    y = pwlcurve.PWLCurve([(0, 0), (25, 30), (50, 50)]).eval(x)
    w = np.array([1., 1., 2.])

    # With one segment, the weighted fit is expected to sit higher at the
    # low end and lower at the high end than the uniformly weighted fit.
    uniform_fit = pwl_predict(x, y, np.ones_like(x), 1)
    weighted_fit = pwl_predict(x, y, w, 1)
    self.assertLess(uniform_fit[0], weighted_fit[0])
    self.assertGreater(uniform_fit[-1], weighted_fit[-1])

    # Two segments fit the three points perfectly, so weights cannot matter.
    for weights in (np.ones_like(x), w):
        self.assert_allclose(y, pwl_predict(x, y, weights, 2))
def test_fit_pwl_unimodal_on_non_unimodal_data(self):
    # The data has two peaks, which a bitonic fit cannot follow.
    x = np.arange(51, dtype=float) / 10
    zigzag = pwlcurve.PWLCurve([(0, 0), (1, 2), (2, 0), (3, 2), (4, 0)])
    y = zigzag.eval(x)

    fitted = fitter.fit_pwl(
        x,
        y,
        num_segments=4,
        fx=transform.identity,
        mono=fitter.MonoType.bitonic)
    fitted_ys = fitted.ys

    # An unrestricted fit would change direction three times; the bitonic
    # fit is expected to change direction exactly once.
    self.assertEqual(1, count_slope_inversions(fitted_ys))

    # The fit should rise at the start and fall at the end.
    self.assertLess(fitted_ys[0], fitted_ys[1])
    self.assertLess(fitted_ys[-1], fitted_ys[-2])
def test_non_mono_two_segment_pwl(self):
    x = np.arange(51, dtype=float)
    y = pwlcurve.PWLCurve([(0, 0), (25, 25), (50, 0)]).eval(x)
    w = np.ones_like(x)

    # The transform is pinned to identity because the fitter would otherwise
    # learn a log1p transform for this problem.
    for num_segments in (2, 3, 4):
        predicted = pwl_predict(
            x, y, w, num_segments, mono=False, fx=transform.identity)
        self.assert_allclose(y, predicted)

    # A monotone curve cannot follow the rise and the fall.
    mono_predicted = pwl_predict(
        x, y, w, 2, mono=True, fx=transform.identity)
    self.assert_notallclose(y, mono_predicted)
def fit_pwl(x: Sequence[float],
            y: Sequence[float],
            w: Optional[Sequence[float]] = None,
            num_segments: int = 3,
            num_samples: int = 100,
            mono: Union[MonoType, bool] = MonoType.mono,
            min_slope: Optional[float] = None,
            max_slope: Optional[float] = None,
            fx: Optional[Callable[[np.ndarray], np.ndarray]] = None,
            learn_ends: bool = True) -> pwlcurve.PWLCurve:
    """Fits a PWLCurve from x to y, minimizing weighted MSE.

    Searches for the piecewise-linear curve closest to y in the weighted
    least-squares sense.

    Time complexity is ~O(len(x) + qlog(q) + (num_samples^2)(num_segments^3)),
    where q is ~min(10**6, len(x)). The len(x) term is the cost of
    downsampling to q points and the qlog(q) term is the cost of sorting them.
    The last term is the cost of fit_pwl_points, which greedily searches
    combinations of knots and solves a constrained linear least-squares
    problem for each one.

    Args:
      x: (Sequence of floats) independent variable.
      y: (Sequence of floats) dependent variable.
      w: (None or Sequence of floats) weights on the data points.
      num_segments: (positive int) Number of linear segments. More segments
        improve fit quality at the cost of complexity.
      num_samples: (positive int) Number of candidate knot locations to try.
        Must exceed num_segments. More samples improve fit quality but slow
        fitting: at 100 samples fit_pwl runs in 1-2 seconds, at 1000 samples
        in under a minute, and at 10,000 samples expect an hour.
      mono: (MonoType enum) Shape restriction to apply, monotone by default.
        See MonoType for all options.
      min_slope: (None or float) Minimum slope between each adjacent pair of
        knots. Set to 0 for a monotone increasing solution.
      max_slope: (None or float) Maximum slope between each adjacent pair of
        knots. Set to 0 for a monotone decreasing solution.
      fx: (None or a strictly increasing 1D function) Transform applied to x
        before the piecewise-linear fit. If None, one is chosen with
        transform.find_best_transform. Pass transform.identity to fit with no
        transform.
      learn_ends: (boolean) Whether to learn the x-values of the curve's
        endpoints. Learning them allows better fits with the same number of
        segments. If False, the first and last candidate knots are required,
        which constrains the solution space.

    Returns:
      The fit curve.
    """
    utils.expect(num_segments > 0, 'Cannot fit %d segment PWL' % num_segments)
    utils.expect(num_samples > num_segments,
                 'num_samples must be at least num_segments + 1')

    sorted_x, sorted_y, sorted_w = sort_and_sample(x, y, w)
    if fx is None:
        fx = transform.find_best_transform(sorted_x, sorted_y, sorted_w)

    transformed_x = fx(sorted_x)
    # Only the two extreme transformed values are checked for finiteness.
    ends_are_finite = np.isfinite(transformed_x[[0, -1]]).all()
    utils.expect(ends_are_finite, 'Transform must be defined on x.')

    # Choose candidate knots in the transformed space and condense the data
    # around them. Everything below works on the condensed points.
    condensed = linear_condense.sample_condense_points(
        transformed_x, sorted_y, sorted_w, num_samples)
    candidate_knots, cond_x, cond_y, cond_w = condensed

    if mono == MonoType.mono:
        min_slope, max_slope = _get_mono_slope_bounds(
            cond_y, cond_w, min_slope, max_slope)
    bitonic_peak, bitonic_concave_down = _bitonic_peak_and_direction(
        cond_x, cond_y, cond_w, mono)

    # When the ends are not learned, force the first and last candidates.
    if learn_ends:
        required_knots = None
    else:
        required_knots = candidate_knots[[0, -1]]

    # Fit the piecewise-linear curve in the transformed space.
    knot_xs, knot_ys = fit_pwl_points(
        candidate_knots, cond_x, cond_y, cond_w, num_segments, min_slope,
        max_slope, bitonic_peak, bitonic_concave_down, required_knots)

    # Map the fitted knot xs back to the pre-transform xs by position.
    knot_xs = sorted_x[transformed_x.searchsorted(knot_xs)]

    if np.all(knot_ys == knot_ys[0]):
        # The curve is constant: emit two points one unit apart at that y.
        anchor_x, flat_y = knot_xs[0], knot_ys[0]
        curve_points = [(anchor_x - 1, flat_y), (anchor_x, flat_y)]
    else:
        curve_points = list(zip(knot_xs, knot_ys))
    return pwlcurve.PWLCurve(curve_points, fx)
def test_round_large_values(self):
    # Rounding to two significant figures must work on values well above 1,
    # e.g. 1234 -> 1200 and 56789 -> 57000.
    original = pwlcurve.PWLCurve([(1234, 54321), (56789, 14321)])
    rounded = original.round_to_sig_figs(2)

    expected_points = [(1200, 54000), (57000, 14000)]
    self.assertEqual(expected_points, rounded.points)
def test_mono_decreasing_log1p_line(self):
    # The data is a single decreasing segment under a log1p transform. No
    # transform is passed to the fit, so the fitter has to choose one.
    x = np.arange(51, dtype=float)
    log1p_line = pwlcurve.PWLCurve([(x[0], 75), (x[-1], -1)], np.log1p)
    y = log1p_line.eval(x)
    w = np.ones_like(x)

    for num_segments in (1, 2):
        self.assert_allclose(y, pwl_predict(x, y, w, num_segments))
def test_mono_increasing_line(self):
    # The data lies on one increasing line; fits with one or two segments
    # are both expected to reproduce it.
    x = np.arange(51, dtype=float)
    line = pwlcurve.PWLCurve([(0, -1), (50, 75)])
    y = line.eval(x)
    w = np.ones_like(x)

    for num_segments in (1, 2):
        self.assert_allclose(y, pwl_predict(x, y, w, num_segments))
def test_round_no_change(self):
    # Every coordinate already has at most two significant figures, so
    # rounding to two figures must leave the points unchanged.
    original = pwlcurve.PWLCurve([(1., 5.), (5., 13.), (10., 15.)])
    rounded = original.round_to_sig_figs(2)
    self.assertEqual(original.points, rounded.points)
def test_round_increases_figs_for_close_xs(self):
    # The first two xs are identical at two significant figures. The expected
    # result keeps extra digits on the xs (1.2346 vs 1.2347) so they stay
    # distinct, while the ys are rounded to two figures.
    original = pwlcurve.PWLCurve(
        [(1.23456, 5.4321), (1.23467, 6.5432), (5.6789, 14.321)])
    rounded = original.round_to_sig_figs(2)

    expected_points = [(1.2346, 5.4), (1.2347, 6.5), (5.6789, 14)]
    self.assertEqual(expected_points, rounded.points)