def check_classifier_ratio(clf, method, cv):
    """Check a calibrated ``ClassifierRatio`` on two nearby normals.

    The ratio is fitted twice -- once from the distributions themselves and
    once from an explicit ``(X, y)`` sample -- and each fit has to pass the
    same score and log-consistency assertions.

    Parameters
    ----------
    clf : object
        Base estimator forwarded to ``CalibratedClassifierCV``.
    method : object
        ``method`` argument forwarded to ``CalibratedClassifierCV``.
    cv : object
        ``cv`` argument forwarded to ``CalibratedClassifierCV``.
    """
    p0 = Normal(mu=0.0)
    p1 = Normal(mu=0.1)

    # Passing distributions directly
    ratio = ClassifierRatio(
        CalibratedClassifierCV(base_estimator=clf, method=method, cv=cv))
    ratio.fit(numerator=p0, denominator=p1, n_samples=10000)

    reals = np.linspace(-1, 1, num=100).reshape(-1, 1)
    assert ratio.score(reals, p0.pdf(reals) / p1.pdf(reals)) > -0.1
    # predict(log=True) must agree with the log of the plain prediction
    assert np.mean(
        np.abs(np.log(ratio.predict(reals)) -
               ratio.predict(reals, log=True))) < 0.01

    # Passing X, y only
    X = np.vstack((p0.rvs(5000), p1.rvs(5000)))
    # Builtin ``int`` rather than the ``np.int`` alias, which recent NumPy
    # releases no longer provide.
    y = np.zeros(10000, dtype=int)
    y[5000:] = 1  # second half of X was drawn from p1

    ratio = ClassifierRatio(
        CalibratedClassifierCV(base_estimator=clf, method=method, cv=cv))
    ratio.fit(X=X, y=y)

    reals = np.linspace(-1, 1, num=100).reshape(-1, 1)
    assert ratio.score(reals, p0.pdf(reals) / p1.pdf(reals)) > -0.1
    assert np.mean(
        np.abs(np.log(ratio.predict(reals)) -
               ratio.predict(reals, log=True))) < 0.01
def test_fit_with_constraints():
    """Fit a ``Normal`` under inequality constraints and check they hold."""
    dist = Normal()
    samples = st.norm(loc=0.05, scale=1.0).rvs(5000, random_state=0)
    samples = samples.reshape(-1, 1)

    # The assertions below check that every constraint function is
    # non-negative once the fit has finished.
    constraints = [
        {"param": dist.mu, "type": "ineq", "fun": lambda mu: mu},
        {"param": dist.mu, "type": "ineq", "fun": lambda mu: 0.1 - mu},
        {"param": dist.sigma, "type": "ineq", "fun": lambda sigma: sigma},
        {"param": (dist.mu, dist.sigma), "type": "ineq",
         "fun": lambda mu, sigma: mu * sigma},
    ]
    dist.fit(samples, constraints=constraints)

    assert dist.mu.get_value() >= 0.0
    assert dist.mu.get_value() <= 0.1
    assert dist.sigma.get_value() >= 0.0
    assert dist.mu.get_value() * dist.sigma.get_value() >= 0.0
def test_mixin_composition():
    """Check how composed expressions are split into parameters/observeds."""
    # Check composed expressions as parameters
    first = theano.shared(0.0)
    second = theano.shared(-1.0)
    p = Normal(mu=first + second - 1.0, sigma=T.abs_(first * second))
    for shared in (first, second):
        assert shared in p.parameters_

    # Compose parameters with observed variables
    slope = theano.shared(1.0)
    offset = theano.shared(0.0)
    observed = T.dmatrix(name="y")
    p = Normal(mu=slope * observed + offset)

    assert len(p.parameters_) == 3
    assert slope in p.parameters_
    assert offset in p.parameters_
    assert p.sigma in p.parameters_
    # mu itself is an expression, so it is not listed as a parameter
    assert p.mu not in p.parameters_
    assert len(p.observeds_) == 1
    assert observed in p.observeds_

    # Check signatures: the observed variable is passed by its name, ``y``
    X_values = np.random.rand(10, 1)
    y_values = np.random.rand(10, 1)
    p.pdf(X=X_values, y=y_values)
    p.cdf(X=X_values, y=y_values)
    p.rvs(10, y=y_values)

    # Check error
    slope = theano.shared(1.0)
    offset = theano.shared(0.0)
    unnamed = T.dmatrix()  # y must be named
    assert_raises(ValueError, Normal, mu=slope * unnamed + offset)
def test_mixture_api():
    """Exercise the basic ``Mixture`` API: members, weights and errors."""
    # Check basic API
    first = Normal(mu=0.0, sigma=T.constant(1.0))
    second = Normal(mu=1.0, sigma=2.0)
    mixture = Mixture(components=[first, second], weights=[0.25])

    # One explicit weight is given for two components; two are reported
    assert len(mixture.components) == 2
    assert len(mixture.weights) == 2
    assert len(mixture.parameters_) == 4
    assert len(mixture.constants_) == 1
    assert len(mixture.observeds_) == 0

    assert first.mu in mixture.parameters_
    assert first.sigma in mixture.constants_
    assert second.mu in mixture.parameters_
    assert second.sigma in mixture.parameters_

    assert mixture.X == first.X
    assert mixture.X == second.X
    assert mixture.ndim == first.ndim
    assert mixture.ndim == second.ndim

    # Without explicit weights the two components are weighted equally
    mixture = Mixture(components=[first, second])
    computed = mixture.compute_weights()
    assert_array_equal(computed, [0.5, 0.5])

    # A weight expressed through a named variable is reported as observed
    y = T.dscalar(name="y")
    constant_weight = T.constant(0.25)
    observed_weight = y * 2
    mixture = Mixture(components=[first, second],
                      weights=[constant_weight, observed_weight])
    assert y in mixture.observeds_

    # Check errors
    assert_raises(ValueError, Mixture,
                  components=[first, first, first], weights=[1.0])
def test_likelihood_free_mixture():
    """Compare a mixture of fitted histograms with the analytic mixture."""
    p1 = Normal(random_state=1)
    p2 = Normal(mu=2.0, random_state=1)
    h1 = Histogram(bins=50).fit(p1.rvs(10000))
    h2 = Histogram(bins=50).fit(p2.rvs(10000))
    analytic = Mixture(components=[p1, p2])
    empirical = Mixture(components=[h1, h2])

    # Check whether pdf, nnlf and cdf have been overridden
    compiled_function = theano.compile.function_module.Function
    for method_name in ("pdf", "nnlf", "cdf"):
        assert isinstance(getattr(analytic, method_name), compiled_function)
    for method_name in ("pdf", "nnlf", "cdf"):
        assert isinstance(getattr(empirical, method_name), types.MethodType)

    # Compare pdfs
    rng = check_random_state(1)
    X = rng.rand(100, 1) * 10 - 5
    mean_gap = np.mean(np.abs(analytic.pdf(X) - empirical.pdf(X)))
    assert mean_gap < 0.05

    # Test sampling
    X = empirical.rvs(10)
    assert X.shape == (10, 1)

    # Check errors
    assert_raises(NotImplementedError, empirical.fit, X)
def check_fit(mu, sigma):
    """Fit a default ``Normal`` to N(mu, sigma) draws and check recovery.

    Parameters
    ----------
    mu : float
        Location of the scipy normal the samples are drawn from.
    sigma : float
        Scale of the scipy normal the samples are drawn from.
    """
    dist = Normal()
    draws = st.norm(loc=mu, scale=sigma).rvs(5000, random_state=0)
    X = draws.reshape(-1, 1)

    score_before = dist.score(X)
    dist.fit(X)

    # Both fitted parameters land within 0.1 of the generating values
    assert np.abs(dist.mu.get_value() - mu) <= 0.1
    assert np.abs(dist.sigma.get_value() - sigma) <= 0.1
    # Fitting must not lower the score on the training data
    assert dist.score(X) >= score_before
def test_calibrated_classifier_ratio_identity():
    """A ratio fitted with identical numerator and denominator must be 1."""
    p = Normal(mu=0.0)
    ratio = CalibratedClassifierRatio(base_estimator=ElasticNetCV())
    ratio.fit(numerator=p, denominator=p, n_samples=10000)

    reals = np.linspace(-0.5, 1.0, num=100).reshape(-1, 1)
    n_points = len(reals)

    true_ratio = p.pdf(reals) / p.pdf(reals)
    assert ratio.score(reals, true_ratio) == 0.0
    assert_array_almost_equal(ratio.predict(reals), np.ones(n_points))
    # In log space the identity ratio is zero everywhere
    assert_array_almost_equal(ratio.predict(reals, log=True),
                              np.zeros(n_points))
def test_linear_transform_1d():
    """Check a 1-d ``LinearTransform`` with A = [[0.5]] against its base."""
    base = Normal()
    scaled = LinearTransform(base, A=np.array([[0.5]]))

    # Same seed on both sides so the draws are directly comparable
    X_base = base.rvs(10, random_state=0)
    X_scaled = scaled.rvs(10, random_state=0)

    assert X_base.shape == X_scaled.shape
    assert_array_equal(X_base * 0.5, X_scaled)
    assert_array_equal(base.pdf(X_base), scaled.pdf(X_scaled))
    assert_array_equal(base.nll(X_base), scaled.nll(X_scaled))
def test_join():
    """Check a ``Join`` of three normals with means 0, 1 and 2."""
    joined = Join(components=[Normal(mu=0), Normal(mu=1), Normal(mu=2)])
    assert joined.ndim == 3
    assert len(joined.parameters_) == 6

    X = joined.rvs(10000, random_state=1)
    assert X.shape == (10000, 3)

    # Column i carries the samples of the i-th component
    for column, expected_mean in enumerate((0., 1., 2.)):
        assert np.abs(np.mean(X[:, column]) - expected_mean) < 0.05

    assert_array_almost_equal(-np.log(joined.pdf(X)), joined.nll(X))
def test_classifier_ratio_identity():
    """``ClassifierRatio`` of a distribution against itself must be 1."""
    p = Normal(mu=0.0)
    calibrated = CalibratedClassifierCV(base_estimator=ElasticNetCV())
    ratio = ClassifierRatio(calibrated)
    ratio.fit(numerator=p, denominator=p, n_samples=10000)

    reals = np.linspace(-0.5, 1.0, num=100).reshape(-1, 1)
    n_points = len(reals)

    true_ratio = p.pdf(reals) / p.pdf(reals)
    assert ratio.score(reals, true_ratio) == 0.0
    assert_array_almost_equal(ratio.predict(reals), np.ones(n_points))
    # In log space the identity ratio is zero everywhere
    assert_array_almost_equal(ratio.predict(reals, log=True),
                              np.zeros(n_points))
def test_known_density():
    """Check ``KnownDensityRatio`` built from two mixtures of normals."""
    components = [Normal(mu=0.0), Normal(mu=0.25), Normal(mu=0.5)]
    numerator = Mixture(components=components, weights=[0.45, 0.1, 0.45])
    # The denominator keeps only the first and the last component
    denominator = Mixture(components=[components[0]] + [components[2]])
    ratio = KnownDensityRatio(numerator=numerator, denominator=denominator)

    reals = np.linspace(-0.5, 1.0, num=100).reshape(-1, 1)

    true_ratio = numerator.pdf(reals) / denominator.pdf(reals)
    assert ratio.score(reals, true_ratio) > -0.01

    # predict(log=True) must agree with the log of the plain prediction
    log_gap = np.abs(np.log(ratio.predict(reals)) -
                     ratio.predict(reals, log=True))
    assert np.mean(log_gap) < 0.01

    # nllr equals the negated sum of the log ratios
    assert ratio.nllr(reals) == -ratio.predict(reals, log=True).sum()
def check_normal(mu, sigma):
    """Compare ``Normal`` pdf/cdf/nll with ``scipy.stats.norm``.

    Parameters
    ----------
    mu : float
        Mean used for both the carl and the scipy distribution.
    sigma : float
        Standard deviation used for both distributions.
    """
    rng = check_random_state(1)
    carl_dist = Normal(mu=mu, sigma=sigma)
    scipy_dist = st.norm(loc=mu, scale=sigma)

    X = rng.rand(50, 1)
    # scipy is evaluated on the flattened points
    flat = X.ravel()

    assert_array_almost_equal(carl_dist.pdf(X), scipy_dist.pdf(flat))
    assert_array_almost_equal(carl_dist.cdf(X), scipy_dist.cdf(flat))
    assert_array_almost_equal(-np.log(carl_dist.pdf(X)), carl_dist.nll(X))
def test_decomposed_ratio():
    """Fit a ``DecomposedRatio`` between two mixtures of normals."""
    components = [Normal(mu=0.0), Normal(mu=0.25), Normal(mu=0.5)]
    numerator = Mixture(components=components, weights=[0.45, 0.1, 0.45])
    # The denominator keeps only the first and the last component
    denominator = Mixture(components=[components[0]] + [components[2]])

    base_ratio = ClassifierRatio(
        CalibratedClassifierCV(base_estimator=ElasticNetCV()))
    ratio = DecomposedRatio(base_ratio)
    ratio.fit(numerator=numerator, denominator=denominator, n_samples=10000)

    reals = np.linspace(-0.5, 1.0, num=100).reshape(-1, 1)

    true_ratio = numerator.pdf(reals) / denominator.pdf(reals)
    assert ratio.score(reals, true_ratio) > -0.1

    # predict(log=True) must agree with the log of the plain prediction
    log_gap = np.abs(np.log(ratio.predict(reals)) -
                     ratio.predict(reals, log=True))
    assert np.mean(log_gap) < 0.01
def test_parameterized_regressor():
    """Fit a ``ParameterizedRegressor`` on the pdf of a ``Normal``."""
    mu = theano.shared(0)
    dist = Normal(mu=mu)

    X = dist.rvs(100)
    target = dist.pdf(X).astype(np.float32)

    stacker = ParameterStacker(params=[mu])
    regressor = ParameterizedRegressor(DecisionTreeRegressor(), params=[mu])
    regressor.fit(stacker.transform(X), target)

    assert regressor.n_features_ == 1
    # Predictions on the training points must match the targets
    assert_array_almost_equal(
        target, regressor.predict(stacker.transform(X)), decimal=3)
def test_decomposed_ratio_identity():
    """``DecomposedRatio`` of a mixture against itself must be 1."""
    components = [Normal(mu=0.0), Normal(mu=0.25), Normal(mu=0.5)]
    p = Mixture(components=components, weights=[0.45, 0.1, 0.45])

    base_ratio = ClassifierRatio(
        CalibratedClassifierCV(base_estimator=ElasticNetCV()))
    ratio = DecomposedRatio(base_ratio)
    ratio.fit(numerator=p, denominator=p, n_samples=10000)

    reals = np.linspace(-0.5, 1.0, num=100).reshape(-1, 1)
    n_points = len(reals)

    true_ratio = p.pdf(reals) / p.pdf(reals)
    assert ratio.score(reals, true_ratio) == 0.0
    assert_array_almost_equal(ratio.predict(reals), np.ones(n_points))
    # In log space the identity ratio is zero everywhere
    assert_array_almost_equal(ratio.predict(reals, log=True),
                              np.zeros(n_points))
def test_fit_with_constraints():
    """Check that ``Normal.fit`` honours a list of inequality constraints."""
    p = Normal()
    X = st.norm(loc=0.05, scale=1.0).rvs(5000, random_state=0).reshape(-1, 1)

    def ineq(param, fun):
        # Build one constraint entry of type "ineq" for ``param``
        return {"param": param, "type": "ineq", "fun": fun}

    p.fit(X, constraints=[
        ineq(p.mu, lambda mu: mu),
        ineq(p.mu, lambda mu: 0.1 - mu),
        ineq(p.sigma, lambda sigma: sigma),
        ineq((p.mu, p.sigma), lambda mu, sigma: mu * sigma),
    ])

    # Each assertion mirrors one constraint, in the same order
    assert p.mu.get_value() >= 0.0
    assert p.mu.get_value() <= 0.1
    assert p.sigma.get_value() >= 0.0
    assert p.mu.get_value() * p.sigma.get_value() >= 0.0
def check_classifier_ratio(clf, method, cv):
    """Check a calibrated ``ClassifierRatio`` fitted in two different ways.

    The ratio between ``Normal(mu=0.0)`` and ``Normal(mu=0.1)`` is fitted
    first from the distributions and then from an explicit ``(X, y)``
    sample; the same assertions are applied after each fit.

    Parameters
    ----------
    clf : object
        Base estimator forwarded to ``CalibratedClassifierCV``.
    method : object
        ``method`` argument forwarded to ``CalibratedClassifierCV``.
    cv : object
        ``cv`` argument forwarded to ``CalibratedClassifierCV``.
    """
    p0 = Normal(mu=0.0)
    p1 = Normal(mu=0.1)

    def make_ratio():
        # A fresh, unfitted ratio for each of the two fitting modes
        return ClassifierRatio(
            CalibratedClassifierCV(base_estimator=clf, method=method, cv=cv))

    def assert_ratio_is_sane(ratio):
        # Score against the exact ratio, and log/non-log consistency
        reals = np.linspace(-1, 1, num=100).reshape(-1, 1)
        assert ratio.score(reals, p0.pdf(reals) / p1.pdf(reals)) > -0.1
        assert np.mean(np.abs(np.log(ratio.predict(reals)) -
                              ratio.predict(reals, log=True))) < 0.01

    # Passing distributions directly
    ratio = make_ratio()
    ratio.fit(numerator=p0, denominator=p1, n_samples=10000)
    assert_ratio_is_sane(ratio)

    # Passing X, y only
    X = np.vstack((p0.rvs(5000), p1.rvs(5000)))
    # Builtin ``int`` rather than the ``np.int`` alias, which recent NumPy
    # releases no longer provide.
    y = np.zeros(10000, dtype=int)
    y[5000:] = 1  # second half of X was drawn from p1

    ratio = make_ratio()
    ratio.fit(X=X, y=y)
    assert_ratio_is_sane(ratio)
def test_kde():
    """Fit a ``KernelDensity`` on normal samples and compare with the source."""
    # Test API
    source = Normal(random_state=1)
    training = source.rvs(10000)
    kde = KernelDensity()
    kde.fit(training)

    reals = np.linspace(-3, 3).reshape(-1, 1)
    pdf_gap = np.abs(source.pdf(reals) - kde.pdf(reals))
    assert np.mean(pdf_gap) < 0.05
    nnlf_gap = np.abs(source.nnlf(reals) - kde.nnlf(reals))
    assert np.mean(nnlf_gap) < 0.05

    # Test sampling
    resampled = kde.rvs(10000)
    assert np.abs(np.mean(resampled)) < 0.05
def test_parameter_stacker():
    """Check that ``ParameterStacker`` appends current parameter values."""
    mu = theano.shared(0)
    sigma = theano.shared(1)
    dist = Normal(mu=mu, sigma=sigma)
    X = dist.rvs(10)

    stacker = ParameterStacker(params=[mu, sigma])
    stacked = stacker.transform(X)

    # One data column followed by one column per stacked parameter
    assert stacked.shape == (10, 1+2)
    assert_array_almost_equal(stacked[:, 1], np.zeros(10))
    assert_array_almost_equal(stacked[:, 2], np.ones(10))

    # A later change of the shared value shows up in the next transform
    mu.set_value(1)
    stacked = stacker.transform(X)
    assert_array_almost_equal(stacked[:, 1], np.ones(10))
def check_mixture_pdf(w0, w1, mu1, sigma1, mu2, sigma2):
    """Compare a two-normal ``Mixture`` pdf with the scipy weighted sum.

    Parameters
    ----------
    w0 : float
        Weight of the first component.
    w1 : float or None
        Weight of the second component; when ``None`` the reference value
        uses ``1 - w0`` instead.
    mu1, sigma1 : float
        Mean and standard deviation of the first component.
    mu2, sigma2 : float
        Mean and standard deviation of the second component.
    """
    rng = check_random_state(1)

    mixture = Mixture(
        components=[Normal(mu=mu1, sigma=sigma1),
                    Normal(mu=mu2, sigma=sigma2)],
        weights=[w0, w1])
    ref1 = st.norm(loc=mu1, scale=sigma1)
    ref2 = st.norm(loc=mu2, scale=sigma2)

    X = rng.rand(50, 1)
    actual = mixture.pdf(X).ravel()

    if w1 is not None:
        second_weight = w1
    else:
        second_weight = 1 - w0
    expected = (w0 * ref1.pdf(X).ravel() +
                second_weight * ref2.pdf(X).ravel())

    assert_array_almost_equal(actual, expected)
def test_fit():
    """Fit the shared weight variable of a three-component mixture."""
    # All component parameters are constants; the weights are expressions
    # of the single shared variable ``g``.
    components = [
        Normal(mu=T.constant(0.0), sigma=T.constant(2.0)),
        Normal(mu=T.constant(3.0), sigma=T.constant(2.0)),
        Exponential(inverse_scale=T.constant(0.5)),
    ]
    g = theano.shared(0.5)
    mixture = Mixture(components=components, weights=[g, g*g])

    draws = [
        st.norm(loc=0.0, scale=2.0).rvs(300, random_state=0),
        st.norm(loc=3.0, scale=2.0).rvs(100, random_state=1),
        st.expon(scale=1. / 0.5).rvs(500, random_state=2),
    ]
    X = np.concatenate(draws)
    X = X.reshape(-1, 1)

    score_before = mixture.score(X)
    mixture.fit(X)

    assert np.abs(g.eval() - 1. / 3.) < 0.05
    # Fitting must not lower the score on the training data
    assert mixture.score(X) >= score_before
def test_mixin_external():
    """Externally created shared variables are used as-is by ``Normal``."""
    # Check external parameters
    external_mu = theano.shared(0.0)
    external_sigma = theano.shared(1.0)
    dist = Normal(mu=external_mu, sigma=external_sigma)

    assert external_mu == dist.mu
    assert external_sigma == dist.sigma
def test_mixin_constants():
    """Constant inputs are reported as constants, not as parameters."""
    # Check with constants
    const_mu = T.constant(0.0)
    const_sigma = T.constant(1.0)
    dist = Normal(mu=const_mu, sigma=const_sigma)

    assert len(dist.parameters_) == 0
    assert len(dist.constants_) == 2
    for constant in (const_mu, const_sigma):
        assert constant in dist.constants_
def test_mv_mixture():
    """Check a ``Mixture`` of two 2-d multivariate normals."""
    first = MultivariateNormal(mu=np.array([0.0, 0.0]), sigma=np.eye(2))
    second = MultivariateNormal(mu=np.array([2.0, 2.0]),
                                sigma=0.5 * np.eye(2))
    mixture = Mixture(components=[first, second])
    assert mixture.ndim == 2

    samples = mixture.rvs(100)
    assert samples.shape == (100, 2)

    # Combining the 2-d component with a default Normal is rejected
    assert_raises(ValueError, Mixture, components=[first, Normal()])
def test_join_non_theano():
    """Check a ``Join`` built from fitted ``Histogram`` components."""
    histograms = [Histogram(interpolation="linear", bins=30)
                  for _ in range(3)]
    # Histogram i is fitted on Normal(mu=i) samples drawn with seed i
    for index, histogram in enumerate(histograms):
        histogram.fit(Normal(mu=index).rvs(10000, random_state=index))

    joined = Join(components=histograms)
    assert joined.ndim == 3
    assert len(joined.parameters_) == 0

    X = joined.rvs(10000, random_state=1)
    assert X.shape == (10000, 3)

    # Column i carries the samples of the i-th component
    for column, expected_mean in enumerate((0., 1., 2.)):
        assert np.abs(np.mean(X[:, column]) - expected_mean) < 0.05

    assert_array_almost_equal(-np.log(joined.pdf(X)), joined.nll(X))
def test_mixin_sklearn_params():
    """Check ``get_params`` / ``set_params`` on a ``Normal``."""
    # get_params
    dist = Normal(mu=0.0, sigma=1.0)
    params = dist.get_params()
    assert len(params) == 2
    for key in ("mu", "sigma"):
        assert key in params

    # for parameters, set_params should change the value contained
    # (the same variable object is returned before and after)
    mu_before = dist.get_params()["mu"]
    dist.set_params(mu=42.0)
    mu_after = dist.get_params()["mu"]
    assert mu_before is mu_after
    assert mu_after.get_value() == 42.0

    # check errors: a mu given as a constant cannot be set
    frozen = Normal(mu=T.constant(0.0), sigma=1.0)
    assert_raises(ValueError, frozen.set_params, mu=1.0)
def test_mixin_base():
    """Check how raw float arguments are exposed by ``Normal``."""
    # Check raw parameters
    dist = Normal(mu=0.0, sigma=1.0)
    assert isinstance(dist, DistributionMixin)

    assert len(dist.parameters_) == 2
    assert dist.mu in dist.parameters_
    assert dist.sigma in dist.parameters_

    # Plain floats are exposed as shared variables holding those values
    assert isinstance(dist.mu, SharedVariable)
    assert isinstance(dist.sigma, SharedVariable)
    assert dist.mu.get_value() == 0.0
    assert dist.sigma.get_value() == 1.0

    assert len(dist.observeds_) == 0
    assert isinstance(dist.X, TensorVariable)
def generate_samples_for_blow_up_demo(n_samples=50000):
    """
    Generate 3 independent Gaussian variables and apply a linear
    transformation to them.

    The Gaussians have different means for the original and the target
    distribution (the sigmas used below are the same on both sides). This is
    an example of samples that have regions with many target samples and no
    original samples. In this case the exact reweighting rule blows up, and
    the same happens for the algorithms.

    :param int n_samples: number of generated samples for original/target
        distributions. For test samples 2*n_samples will be generated
    :return: train original, train target, exact weights for train original,
        test original, test target, exact weights for test original
    """
    p0 = Join(components=[
        Normal(mu=1, sigma=0.7),
        Normal(mu=-1, sigma=0.7),
        Normal(mu=1, sigma=1.5)
    ])
    p1 = Join(components=[
        Normal(mu=0, sigma=0.7),
        Normal(mu=0, sigma=0.7),
        Normal(mu=0, sigma=1.5)
    ])

    # The same mixing matrix is applied to both distributions
    R = make_sparse_spd_matrix(3, alpha=0.5, random_state=7)
    p0 = LinearTransform(p0, R)
    p1 = LinearTransform(p1, R)

    X0 = p0.rvs(n_samples, random_state=777)
    X1 = p1.rvs(n_samples, random_state=777)
    # exp(nll0 - nll1) evaluated on the original samples
    exact_weights = numpy.exp(p0.nll(X0) - p1.nll(X0))
    exact_weights[numpy.isinf(exact_weights)] = 1.

    # generate samples to test reweighting rule (to avoid overfitting)
    X0_roc = p0.rvs(2 * n_samples, random_state=777 * 2)
    X1_roc = p1.rvs(2 * n_samples, random_state=777 * 2)
    # Weighted with true ratios
    exact_weights_roc = numpy.exp(p0.nll(X0_roc) - p1.nll(X0_roc))
    exact_weights_roc[numpy.isinf(exact_weights_roc)] = 1.

    # First plot uses unit weights, second plot uses the exact weights
    draw_distributions(X0, X1, numpy.ones(len(X0)))
    # Parenthesised single-argument form prints the same text on Python 2
    # and is valid syntax on Python 3.
    print("Exact weights are used (inf weights are set to 1)")
    draw_distributions(X0, X1, exact_weights)

    return X0, X1, exact_weights, X0_roc, X1_roc, exact_weights_roc
def test_rvs():
    """Check that mixture samples are centred on the weighted mean."""
    p1 = Normal(mu=0.0, sigma=T.constant(1.0), random_state=0)
    p2 = Normal(mu=2.0, sigma=2.0, random_state=0)
    m = Mixture(components=[p1, p2], weights=[0.25], random_state=0)
    X = m.rvs(2000)

    # Reference mean: 0.25 on the first component, 0.75 on the second
    expected_mean = 0.25 * p1.mu.eval() + 0.75 * p2.mu.eval()

    # Compare the absolute deviation: the signed difference alone would
    # also accept a sample mean arbitrarily far *below* the expected one.
    assert np.abs(np.mean(X) - expected_mean) < 0.1
def check_rvs(mu, sigma, random_state):
    """Check the empirical mean and std of ``Normal.rvs`` draws.

    Parameters
    ----------
    mu : float
        Mean of the distribution to sample from.
    sigma : float
        Standard deviation of the distribution to sample from.
    random_state : object
        Forwarded unchanged to ``Normal.rvs``.
    """
    dist = Normal(mu=mu, sigma=sigma)
    draws = dist.rvs(10000, random_state=random_state)

    mean_error = np.abs(np.mean(draws) - mu)
    assert mean_error <= 0.05
    std_error = np.abs(np.std(draws) - sigma)
    assert std_error <= 0.05
def test_fit_with_bounds():
    """Fit a ``Normal`` with a lower bound of 0 on sigma."""
    dist = Normal()
    draws = st.norm(loc=0.05, scale=1.0).rvs(5000, random_state=0)
    X = draws.reshape(-1, 1)

    # (0, None): lower bound 0, no upper bound given
    sigma_bounds = {"param": dist.sigma, "bounds": (0, None)}
    dist.fit(X, bounds=[sigma_bounds])

    assert dist.sigma.get_value() >= 0.0
def generate_samples(with_linear_transformation=False, add_variation=False,
                     n_samples=50000, verbose=True):
    """
    Generate 5 independent variables: two Gaussians, a mixture of Gaussians
    and two exponentials.

    The two Gaussians have different means for the original and target
    distributions.

    If with_linear_transformation is True, a linear transformation is applied
    to the 5 generated variables.

    If add_variation is True, random offsets are added to the sigmas of the
    two Gaussians, so that the original and target samples differ not only
    in mean but also in variance.

    :param bool with_linear_transformation: apply or not linear
        transformation for samples features
    :param bool add_variation: make or not different variance for Gaussian
        distribution for original and target samples.
    :param int n_samples: number of generated samples for original/target
        distributions. For test samples 2*n_samples will be generated
    :param bool verbose: print and plot additional info during generation.
    :return: train original, train target, exact weights for train original,
        test original, test target, exact weights for test original
    """
    # define linear transformation matrix
    R = make_sparse_spd_matrix(5, alpha=0.5, random_state=7)

    variation_origin, variation_target = (0, 0)
    if add_variation:
        # Fixed seed, so the two sigma offsets are the same on every call
        r = check_random_state(42)
        variation_origin, variation_target = r.uniform() / 3., r.uniform() / 3.

    p0 = Join(components=[
        Normal(mu=.5, sigma=1 + variation_origin),
        Normal(mu=-.5, sigma=3 + variation_origin),
        Mixture(components=[Normal(mu=-2, sigma=1), Normal(mu=2, sigma=0.5)]),
        Exponential(inverse_scale=3.0),
        Exponential(inverse_scale=0.5)
    ])
    p1 = Join(components=[
        Normal(mu=0, sigma=1 + variation_target),
        Normal(mu=0, sigma=3 + variation_target),
        Mixture(components=[Normal(mu=-2, sigma=1), Normal(mu=2, sigma=0.5)]),
        Exponential(inverse_scale=3.0),
        Exponential(inverse_scale=0.5)
    ])

    if with_linear_transformation:
        p0 = LinearTransform(p0, R)
        p1 = LinearTransform(p1, R)

    X0 = p0.rvs(n_samples, random_state=777)
    X1 = p1.rvs(n_samples, random_state=777)
    # exp(nll0 - nll1) evaluated on the original samples
    exact_weights = numpy.exp(p0.nll(X0) - p1.nll(X0))
    exact_weights[numpy.isinf(exact_weights)] = 0.

    # generate samples to test reweighting rule (to avoid overfitting)
    X0_roc = p0.rvs(2 * n_samples, random_state=777 * 2)
    X1_roc = p1.rvs(2 * n_samples, random_state=777 * 2)
    # Weighted with true ratios
    exact_weights_roc = numpy.exp(p0.nll(X0_roc) - p1.nll(X0_roc))
    exact_weights_roc[numpy.isinf(exact_weights_roc)] = 0.

    if verbose:
        # Parenthesised single-argument prints give the same output on
        # Python 2 and are valid syntax on Python 3.
        print("Original distribution")
        corner.corner(X0, bins=20, smooth=0.85,
                      labels=["X0", "X1", "X2", "X3", "X4"])
        plt.show()

        print("Target distribution")
        corner.corner(X1, bins=20, smooth=0.85,
                      labels=["X0", "X1", "X2", "X3", "X4"])
        plt.show()

        print("Exact reweighting")
        # In this example, we know p0(x) and p1(x) exactly, so we can
        # compare the approximate reweighting approaches with the exact
        # weights.
        draw_distributions(X0, X1, exact_weights)

    return X0, X1, exact_weights, X0_roc, X1_roc, exact_weights_roc
# Parameter values used to generate the observed data
true_theta = np.array([1.0, -1.0])
make_plots = True

# Simulator
# The means of the first two components are shared variables, so that
# ``simulator`` can move them before drawing samples.
A = theano.shared(true_theta[0], name="A")
B = theano.shared(true_theta[1], name="B")

# NOTE(review): R[0][3] (-0.3249938) and R[3][0] (0.3249938) differ in sign,
# so this matrix is not symmetric -- confirm that this is intended.
R = np.array([
    [1.31229955, 0.10499961, 0.48310515, -0.3249938, -0.26387927],
    [0.10499961, 1.15833058, -0.55865473, 0.25275522, -0.39790775],
    [0.48310515, -0.55865473, 2.25874579, -0.52087938, -0.39271231],
    [0.3249938, 0.25275522, -0.52087938, 1.4034925, -0.63521059],
    [-0.26387927, -0.39790775, -0.39271231, -0.63521059, 1.]])

p0 = LinearTransform(
    Join(components=[
        Normal(mu=A, sigma=1),
        Normal(mu=B, sigma=3),
        Mixture(components=[Normal(mu=-2, sigma=1),
                            Normal(mu=2, sigma=0.5)]),
        Exponential(inverse_scale=3.0),
        Exponential(inverse_scale=0.5)
    ]),
    R)


def simulator(theta, n_samples, random_state=None):
    """Set the shared means A and B from ``theta`` and sample from ``p0``.

    :param theta: indexable; ``theta[0]`` is written to A, ``theta[1]`` to B
    :param n_samples: number of samples requested from ``p0.rvs``
    :param random_state: forwarded unchanged to ``p0.rvs``
    :return: the result of ``p0.rvs``
    """
    # The new values stay in the module-level shared variables afterwards
    A.set_value(theta[0])
    B.set_value(theta[1])
    return p0.rvs(n_samples, random_state=random_state)


# NOTE(review): ``rng`` is not defined in the visible part of the file --
# presumably created earlier; confirm.
X_obs = simulator(true_theta, 20000, random_state=rng)