Code example #1
    def perform_test(self, dat, return_simulated_stats=False):
        """
        dat: an instance of Data
        """
        with util.ContextTimer() as t:
            alpha = self.alpha
            null_sim = self.null_sim
            n_simulate = null_sim.n_simulate
            X = dat.data()
            n = X.shape[0]
            J = self.V.shape[0]

            nfssd, fea_tensor = self.compute_stat(dat,
                                                  return_feature_tensor=True)
            sim_results = null_sim.simulate(self, dat, fea_tensor)
            arr_nfssd = sim_results['sim_stats']

            # approximate the p-value using the simulated null statistics
            pvalue = np.mean(arr_nfssd > nfssd)

        results = {
            'alpha': self.alpha,
            'pvalue': pvalue,
            'test_stat': nfssd,
            'h0_rejected': pvalue < alpha,
            'n_simulate': n_simulate,
            'time_secs': t.secs,
        }
        if return_simulated_stats:
            results['sim_stats'] = arr_nfssd
        return results
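
These snippets appear to come from the kgof package (kernel goodness-of-fit tests); ccheng2021/kernel looks like a copy of it. Below is a minimal, hedged sketch of how this FSSD.perform_test might be driven, assuming the standard kgof API; the names IsotropicNormal, DSIsotropicNormal, KGauss, fit_gaussian_draw, and FSSDH0SimCovObs do not appear in the snippet and are assumptions:

    import numpy as np
    import kgof.data as data
    import kgof.density as density
    import kgof.goftest as gof
    import kgof.kernel as kernel
    import kgof.util as util

    # Model p: a standard isotropic normal in 2 dimensions.
    p = density.IsotropicNormal(np.zeros(2), 1.0)
    # Sample from a mean-shifted normal so that H0: data ~ p is false.
    ds = data.DSIsotropicNormal(np.zeros(2) + 0.5, 1.0)
    dat = ds.sample(500, seed=8)

    # Gaussian kernel with a median-heuristic bandwidth and J=5 test locations.
    X = dat.data()
    k = kernel.KGauss(util.meddistance(X, subsample=1000) ** 2)
    V = util.fit_gaussian_draw(X, J=5, seed=9)

    null_sim = gof.FSSDH0SimCovObs(n_simulate=2000, seed=10)
    fssd = gof.FSSD(p, k, V, null_sim=null_sim, alpha=0.05)
    results = fssd.perform_test(dat)
    print(results['pvalue'], results['h0_rejected'])
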
Code example #2
File: mmd.py Project: ccheng2021/kernel
    def perform_test(self,
                     dat,
                     candidate_kernels=None,
                     return_mmdtest=False,
                     tr_proportion=0.2,
                     reg=1e-3):
        """
        dat: an instance of Data
        candidate_kernels: a list of Kernel's to choose from
        tr_proportion: proportion of the sample to be used for choosing the best
            kernel
        reg: regularization parameter for the test power criterion
        """
        with util.ContextTimer() as t:
            seed = self.seed
            p = self.p
            ds = p.get_datasource()
            p_sample = ds.sample(dat.sample_size(), seed=seed + 77)
            xtr, xte = p_sample.split_tr_te(tr_proportion=tr_proportion,
                                            seed=seed + 18)
            # ytr, yte are of type data.Data
            ytr, yte = dat.split_tr_te(tr_proportion=tr_proportion,
                                       seed=seed + 12)

            # training and test data
            tr_tst_data = fdata.TSTData(xtr.data(), ytr.data())
            te_tst_data = fdata.TSTData(xte.data(), yte.data())

            if candidate_kernels is None:
                # Assume a Gaussian kernel. Construct a list of
                # kernels to try based on multiples of the median heuristic
                med = util.meddistance(tr_tst_data.stack_xy(), 1000)
                list_gwidth = np.hstack(
                    ((med**2) * (2.0**np.linspace(-4, 4, 10))))
                list_gwidth.sort()
                candidate_kernels = [kernel.KGauss(gw2) for gw2 in list_gwidth]

            alpha = self.alpha

            # grid search to choose the best Gaussian width
            besti, powers = tst.QuadMMDTest.grid_search_kernel(
                tr_tst_data, candidate_kernels, alpha, reg=reg)
            # perform test
            best_ker = candidate_kernels[besti]
            mmdtest = tst.QuadMMDTest(best_ker, self.n_permute, alpha=alpha)
            results = mmdtest.perform_test(te_tst_data)
            if return_mmdtest:
                results['mmdtest'] = mmdtest

        results['time_secs'] = t.secs
        return results
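
A hedged usage sketch for the kernel-selecting MMD test above. The class name QuadMMDGofOpt and its constructor arguments are inferred from the attributes the method reads (self.p, self.alpha, self.n_permute, self.seed), so treat them as assumptions:

    import numpy as np
    import kgof.data as data
    import kgof.density as density
    import kgof.mmd as mmd

    p = density.IsotropicNormal(np.zeros(2), 1.0)
    dat = data.DSIsotropicNormal(np.zeros(2) + 0.5, 1.0).sample(600, seed=3)

    # Assumed constructor, inferred from the attributes used by the method.
    mmd_opt = mmd.QuadMMDGofOpt(p, n_permute=300, alpha=0.05, seed=4)
    # candidate_kernels=None triggers the Gaussian median-heuristic grid above.
    results = mmd_opt.perform_test(dat, candidate_kernels=None,
                                   tr_proportion=0.2, reg=1e-3)
    print(results['pvalue'], results['h0_rejected'])
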
Code example #3
    def perform_test(self,
                     dat,
                     return_simulated_stats=False,
                     return_ustat_gram=False):
        """
        dat: an instance of Data
        """
        with util.ContextTimer() as t:
            alpha = self.alpha
            n_simulate = self.n_simulate
            X = dat.data()
            n = X.shape[0]

            _, H = self.compute_stat(dat, return_ustat_gram=True)
            test_stat = n * np.mean(H)
            # bootstrapping
            sim_stats = np.zeros(n_simulate)
            with util.NumpySeedContext(seed=self.seed):
                for i in range(n_simulate):
                    W = self.bootstrapper(n)
                    # n * [ (1/n^2) * \sum_i \sum_j h(x_i, x_j) w_i w_j ]
                    boot_stat = W.dot(H.dot(old_div(W, float(n))))
                    # This is a bootstrap version of n*V_n
                    sim_stats[i] = boot_stat

            # approximate the p-value using the bootstrapped statistics
            pvalue = np.mean(sim_stats > test_stat)

        results = {
            'alpha': self.alpha,
            'pvalue': pvalue,
            'test_stat': test_stat,
            'h0_rejected': pvalue < alpha,
            'n_simulate': n_simulate,
            'time_secs': t.secs,
        }
        if return_simulated_stats:
            results['sim_stats'] = sim_stats
        if return_ustat_gram:
            results['H'] = H

        return results
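
The loop above simulates the null distribution by reweighting the U-statistic gram matrix H with multiplier vectors W (a multiplier/weighted bootstrap). A sketch, assuming this is kgof's KernelSteinTest with a Rademacher bootstrapper:

    import numpy as np
    import kgof.data as data
    import kgof.density as density
    import kgof.goftest as gof
    import kgof.kernel as kernel
    import kgof.util as util

    p = density.IsotropicNormal(np.zeros(2), 1.0)
    dat = data.DSIsotropicNormal(np.zeros(2) + 0.5, 1.0).sample(400, seed=20)

    k = kernel.KGauss(util.meddistance(dat.data(), subsample=1000) ** 2)
    # bootstrapper_rademacher supplies the multiplier vector W used in the loop.
    kstein = gof.KernelSteinTest(p, k, bootstrapper=gof.bootstrapper_rademacher,
                                 alpha=0.05, n_simulate=500, seed=21)
    results = kstein.perform_test(dat, return_ustat_gram=True)
    print(results['pvalue'], results['H'].shape)  # H is the n x n gram matrix
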
Code example #4
File: mmd.py Project: ccheng2021/kernel
    def perform_test(self, dat):
        """
        dat: an instance of Data
        """
        with util.ContextTimer() as t:
            seed = self.seed
            mmdtest = self.mmdtest
            p = self.p

            # Draw a sample from p; the number of points is the same as in dat.
            ds = p.get_datasource()
            p_sample = ds.sample(dat.sample_size(), seed=seed + 12)

            # Run the two-sample test on p_sample and dat
            # Make a two-sample test data
            tst_data = fdata.TSTData(p_sample.data(), dat.data())
            # Test
            results = mmdtest.perform_test(tst_data)

        results['time_secs'] = t.secs
        return results
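
This variant reduces goodness-of-fit to a two-sample problem: draw a sample from p of the same size as dat, then run a quadratic-time MMD permutation test on the pair. A sketch, with the constructor QuadMMDGof(p, k, n_permute, alpha, seed) inferred from the attributes used and therefore an assumption:

    import numpy as np
    import kgof.data as data
    import kgof.density as density
    import kgof.kernel as kernel
    import kgof.mmd as mmd
    import kgof.util as util

    p = density.IsotropicNormal(np.zeros(2), 1.0)
    dat = data.DSIsotropicNormal(np.zeros(2) + 0.5, 1.0).sample(400, seed=30)

    # Assumed constructor: the method only needs self.p, self.mmdtest, self.seed.
    k = kernel.KGauss(util.meddistance(dat.data(), subsample=1000) ** 2)
    mmd_gof = mmd.QuadMMDGof(p, k, n_permute=300, alpha=0.05, seed=31)
    print(mmd_gof.perform_test(dat)['pvalue'])
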
Code example #5
    def perform_test(self, dat):
        """
        dat: an instance of Data
        """
        with util.ContextTimer() as t:
            alpha = self.alpha
            X = dat.data()
            n = X.shape[0]

            # H: length-n vector
            _, H = self.compute_stat(dat, return_pointwise_stats=True)
            test_stat = np.sqrt(old_div(n, 2)) * np.mean(H)
            stat_var = np.mean(H**2)
            pvalue = stats.norm.sf(test_stat, loc=0, scale=np.sqrt(stat_var))

        results = {
            'alpha': self.alpha,
            'pvalue': pvalue,
            'test_stat': test_stat,
            'h0_rejected': pvalue < alpha,
            'time_secs': t.secs,
        }
        return results
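
No simulation is needed here: the linear-time statistic is asymptotically normal under H0, so the p-value is a one-sided normal tail (stats.norm.sf). A sketch, assuming the snippet is kgof's LinearKernelSteinTest with constructor (p, k, alpha, seed):

    import numpy as np
    import kgof.data as data
    import kgof.density as density
    import kgof.goftest as gof
    import kgof.kernel as kernel
    import kgof.util as util

    p = density.IsotropicNormal(np.zeros(2), 1.0)
    # The linear-time statistic trades power for O(n) cost, so use more samples.
    dat = data.DSIsotropicNormal(np.zeros(2) + 0.5, 1.0).sample(2000, seed=40)

    k = kernel.KGauss(util.meddistance(dat.data(), subsample=1000) ** 2)
    lin_stein = gof.LinearKernelSteinTest(p, k, alpha=0.05, seed=41)
    print(lin_stein.perform_test(dat))
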
Code example #6
    def optimize_locs_params(
        p,
        dat,
        b0,
        c0,
        test_locs0,
        reg=1e-2,
        max_iter=100,
        tol_fun=1e-5,
        disp=False,
        locs_bounds_frac=100,
        b_lb=-20.0,
        b_ub=-1e-4,
        c_lb=1e-6,
        c_ub=1e3,
    ):
        """
        Optimize the test locations and the two parameters (b and c) of the
        IMQ kernel by maximizing the test power criterion.
             k(x,y) = (c^2 + ||x-y||^2)^b
            where c > 0 and b < 0.
        dat should not be the same data as used in the actual test (i.e., it
        should be a held-out set). This function is deterministic.

        - p: UnnormalizedDensity specifying the problem.
        - b0: initial parameter value for b (in the kernel)
        - c0: initial parameter value for c (in the kernel)
        - dat: a Data object (training set)
        - test_locs0: Jxd numpy array. Initial V.
        - reg: reg to add to the mean/sqrt(variance) criterion to become
            mean/sqrt(variance + reg)
        - max_iter: maximum number of gradient descent iterations
        - tol_fun: termination tolerance of the objective value
        - disp: True to print convergence messages
        - locs_bounds_frac: When making box bounds for the test_locs, extend
            the box defined by coordinate-wise min-max by std of each coordinate
            multiplied by this number.
        - b_lb: absolute lower bound on b. b is always < 0.
        - b_ub: absolute upper bound on b
        - c_lb: absolute lower bound on c. c is always > 0.
        - c_ub: absolute upper bound on c

        Return (optimized test locations V, b, c, optimization info log)
        """
        """
        In the optimization, we parameterize b by sqrt(-b): squaring and
        negating the optimized value recovers b and enforces b < 0. c needs no
        special parameterization since it enters the kernel through c^2, and
        the box bounds keep it positive.
        """
        J = test_locs0.shape[0]
        X = dat.data()
        n, d = X.shape

        def obj(sqrt_neg_b, c, V):
            b = -sqrt_neg_b**2
            return -IMQFSSD.power_criterion(p, dat, b, c, V, reg=reg)

        flatten = lambda sqrt_neg_b, c, V: np.hstack(
            (sqrt_neg_b, c, V.reshape(-1)))

        def unflatten(x):
            sqrt_neg_b = x[0]
            c = x[1]
            V = np.reshape(x[2:], (J, d))
            return sqrt_neg_b, c, V

        def flat_obj(x):
            sqrt_neg_b, c, V = unflatten(x)
            return obj(sqrt_neg_b, c, V)

        # gradient
        #grad_obj = autograd.elementwise_grad(flat_obj)
        # Initial point
        b02 = np.sqrt(-b0)
        x0 = flatten(b02, c0, test_locs0)

        # Make a box to bound test locations
        X_std = np.std(X, axis=0)
        # X_min: length-d array
        X_min = np.min(X, axis=0)
        X_max = np.max(X, axis=0)

        # V_lb: J x d
        V_lb = np.tile(X_min - locs_bounds_frac * X_std, (J, 1))
        V_ub = np.tile(X_max + locs_bounds_frac * X_std, (J, 1))

        # (J*d+2) x 2. Make sure to bound the reparameterized values (not the originals).
        """
        For b, b2 := sqrt(-b)
            lb <= b <= ub < 0 means 

            sqrt(-ub) <= b2 <= sqrt(-lb)
            Note the positions of ub, lb.
        """
        x0_lb = np.hstack((np.sqrt(-b_ub), c_lb, np.reshape(V_lb, -1)))
        x0_ub = np.hstack((np.sqrt(-b_lb), c_ub, np.reshape(V_ub, -1)))
        x0_bounds = list(zip(x0_lb, x0_ub))

        # optimize. Time the optimization as well.
        # https://docs.scipy.org/doc/scipy/reference/optimize.minimize-lbfgsb.html
        grad_obj = autograd.elementwise_grad(flat_obj)
        with util.ContextTimer() as timer:
            opt_result = scipy.optimize.minimize(
                flat_obj,
                x0,
                method='L-BFGS-B',
                bounds=x0_bounds,
                tol=tol_fun,
                options={
                    'maxiter': max_iter,
                    'ftol': tol_fun,
                    'disp': disp,
                    'gtol': 1.0e-06,
                },
                jac=grad_obj,
            )

        opt_result = dict(opt_result)
        opt_result['time_secs'] = timer.secs
        x_opt = opt_result['x']
        sqrt_neg_b, c, V_opt = unflatten(x_opt)
        b = -sqrt_neg_b**2
        assert util.is_real_num(b), 'b is not real. Was {}'.format(b)
        assert b < 0
        assert util.is_real_num(c), 'c is not real. Was {}'.format(c)
        assert c > 0

        return V_opt, b, c, opt_result
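
A sketch of the intended workflow: tune b, c, and the locations on a held-out training split, then test with the tuned IMQ kernel on the remaining data. The constructor IMQFSSD(p, b, c, V, alpha) is an assumption:

    import numpy as np
    import kgof.data as data
    import kgof.density as density
    import kgof.goftest as gof
    import kgof.util as util

    p = density.IsotropicNormal(np.zeros(2), 1.0)
    dat = data.DSIsotropicNormal(np.zeros(2) + 0.5, 1.0).sample(800, seed=50)
    # Optimize on a training split; never tune and test on the same data.
    tr, te = dat.split_tr_te(tr_proportion=0.2, seed=51)

    V0 = util.fit_gaussian_draw(tr.data(), J=5, seed=52)
    V_opt, b, c, info = gof.IMQFSSD.optimize_locs_params(
        p, tr, b0=-0.5, c0=1.0, test_locs0=V0, reg=1e-2, max_iter=50)

    # Assumed constructor: IMQFSSD(p, b, c, V, alpha).
    imq_fssd = gof.IMQFSSD(p, b, c, V_opt, alpha=0.05)
    print(imq_fssd.perform_test(te)['pvalue'])
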
Code example #7
    def optimize_locs(p,
                      dat,
                      b,
                      c,
                      test_locs0,
                      reg=1e-5,
                      max_iter=100,
                      tol_fun=1e-5,
                      disp=False,
                      locs_bounds_frac=100):
        """
        Optimize just the test locations by maximizing a test power criterion,
        keeping the kernel parameters b, c fixed to the specified values. dat
        should not be the same data as used in the actual test (i.e., it should
        be a held-out set). This function is deterministic.

        - p: an UnnormalizedDensity specifying the problem
        - dat: a Data object
        - b, c: kernel parameters of the IMQ kernel. Not optimized.
        - test_locs0: Jxd numpy array. Initial V.
        - reg: reg to add to the mean/sqrt(variance) criterion to become
            mean/sqrt(variance + reg)
        - max_iter: maximum number of gradient descent iterations
        - tol_fun: termination tolerance of the objective value
        - disp: True to print convergence messages
        - locs_bounds_frac: When making box bounds for the test_locs, extend
            the box defined by coordinate-wise min-max by std of each coordinate
            multiplied by this number.

        Return (optimized test locations V, optimization info log)
        """
        J = test_locs0.shape[0]
        X = dat.data()
        n, d = X.shape

        def obj(V):
            return -IMQFSSD.power_criterion(p, dat, b, c, V, reg=reg)

        flatten = lambda V: np.reshape(V, -1)

        def unflatten(x):
            V = np.reshape(x, (J, d))
            return V

        def flat_obj(x):
            V = unflatten(x)
            return obj(V)

        # Initial point
        x0 = flatten(test_locs0)

        # Make a box to bound test locations
        X_std = np.std(X, axis=0)
        # X_min: length-d array
        X_min = np.min(X, axis=0)
        X_max = np.max(X, axis=0)
        # V_lb: J x d
        V_lb = np.tile(X_min - locs_bounds_frac * X_std, (J, 1))
        V_ub = np.tile(X_max + locs_bounds_frac * X_std, (J, 1))
        # (J*d) x 2.
        x0_bounds = list(
            zip(
                V_lb.reshape(-1)[:, np.newaxis],
                V_ub.reshape(-1)[:, np.newaxis]))

        # optimize. Time the optimization as well.
        # https://docs.scipy.org/doc/scipy/reference/optimize.minimize-lbfgsb.html
        grad_obj = autograd.elementwise_grad(flat_obj)
        with util.ContextTimer() as timer:
            opt_result = scipy.optimize.minimize(
                flat_obj,
                x0,
                method='L-BFGS-B',
                bounds=x0_bounds,
                tol=tol_fun,
                options={
                    'maxiter': max_iter,
                    'ftol': tol_fun,
                    'disp': disp,
                    'gtol': 1.0e-06,
                },
                jac=grad_obj,
            )

        opt_result = dict(opt_result)
        opt_result['time_secs'] = timer.secs
        x_opt = opt_result['x']
        V_opt = unflatten(x_opt)
        return V_opt, opt_result
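
When b and c are already fixed (for example, to reasonable defaults), only the J x d locations are tuned. A minimal sketch under the same assumptions as the previous one:

    import numpy as np
    import kgof.data as data
    import kgof.density as density
    import kgof.goftest as gof
    import kgof.util as util

    p = density.IsotropicNormal(np.zeros(2), 1.0)
    tr = data.DSIsotropicNormal(np.zeros(2) + 0.5, 1.0).sample(200, seed=60)
    V0 = util.fit_gaussian_draw(tr.data(), J=5, seed=61)

    # b < 0 and c > 0 stay fixed; only the J x d locations are optimized.
    V_opt, info = gof.IMQFSSD.optimize_locs(p, tr, b=-0.5, c=1.0,
                                            test_locs0=V0, max_iter=50)
    print(V_opt.shape, info['fun'])
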
Code example #8
    def optimize_locs_widths(
        p,
        dat,
        gwidth0,
        test_locs0,
        reg=1e-2,
        max_iter=100,
        tol_fun=1e-5,
        disp=False,
        locs_bounds_frac=100,
        gwidth_lb=None,
        gwidth_ub=None,
        use_2terms=False,
    ):
        """
        Optimize the test locations and the Gaussian kernel width by
        maximizing a test power criterion. dat should not be the same data as
        used in the actual test (i.e., it should be a held-out set).
        This function is deterministic.

        - p: an UnnormalizedDensity specifying the problem
        - dat: a Data object
        - test_locs0: Jxd numpy array. Initial V.
        - reg: reg to add to the mean/sqrt(variance) criterion to become
            mean/sqrt(variance + reg)
        - gwidth0: initial value of the Gaussian width^2
        - max_iter: maximum number of gradient descent iterations
        - tol_fun: termination tolerance of the objective value
        - disp: True to print convergence messages
        - locs_bounds_frac: When making box bounds for the test_locs, extend
            the box defined by coordinate-wise min-max by std of each coordinate
            multiplied by this number.
        - gwidth_lb: absolute lower bound on the Gaussian width^2. If None, a
            multiple of the squared median heuristic is used automatically.
        - gwidth_ub: absolute upper bound on the Gaussian width^2. If None, a
            multiple of the squared median heuristic is used automatically.
        - use_2terms: If True, then besides the signal-to-noise ratio
          criterion, the objective function will also include the first term
          that is dropped.

        Return (optimized test locations V, Gaussian width, optimization info log)
        """
        J = test_locs0.shape[0]
        X = dat.data()
        n, d = X.shape

        # Parameterize the Gaussian width with its square root (then square later)
        # to automatically enforce the positivity.
        def obj(sqrt_gwidth, V):
            return -GaussFSSD.power_criterion(
                p, dat, sqrt_gwidth**2, V, reg=reg, use_2terms=use_2terms)

        flatten = lambda gwidth, V: np.hstack((gwidth, V.reshape(-1)))

        def unflatten(x):
            sqrt_gwidth = x[0]
            V = np.reshape(x[1:], (J, d))
            return sqrt_gwidth, V

        def flat_obj(x):
            sqrt_gwidth, V = unflatten(x)
            return obj(sqrt_gwidth, V)

        # gradient
        #grad_obj = autograd.elementwise_grad(flat_obj)
        # Initial point
        x0 = flatten(np.sqrt(gwidth0), test_locs0)

        # Make sure that the optimized gwidth is not too small or too large.
        fac_min = 1e-2
        fac_max = 1e2
        med2 = util.meddistance(X, subsample=1000)**2
        if gwidth_lb is None:
            gwidth_lb = max(fac_min * med2, 1e-3)
        if gwidth_ub is None:
            gwidth_ub = min(fac_max * med2, 1e5)

        # Make a box to bound test locations
        X_std = np.std(X, axis=0)
        # X_min: length-d array
        X_min = np.min(X, axis=0)
        X_max = np.max(X, axis=0)
        # V_lb: J x d
        V_lb = np.tile(X_min - locs_bounds_frac * X_std, (J, 1))
        V_ub = np.tile(X_max + locs_bounds_frac * X_std, (J, 1))
        # (J*d+1) x 2. Take square root because we parameterize with the square
        # root
        x0_lb = np.hstack((np.sqrt(gwidth_lb), np.reshape(V_lb, -1)))
        x0_ub = np.hstack((np.sqrt(gwidth_ub), np.reshape(V_ub, -1)))
        x0_bounds = list(zip(x0_lb, x0_ub))

        # optimize. Time the optimization as well.
        # https://docs.scipy.org/doc/scipy/reference/optimize.minimize-lbfgsb.html
        grad_obj = autograd.elementwise_grad(flat_obj)
        with util.ContextTimer() as timer:
            opt_result = scipy.optimize.minimize(
                flat_obj,
                x0,
                method='L-BFGS-B',
                bounds=x0_bounds,
                tol=tol_fun,
                options={
                    'maxiter': max_iter,
                    'ftol': tol_fun,
                    'disp': disp,
                    'gtol': 1.0e-07,
                },
                jac=grad_obj,
            )

        opt_result = dict(opt_result)
        opt_result['time_secs'] = timer.secs
        x_opt = opt_result['x']
        sq_gw_opt, V_opt = unflatten(x_opt)
        gw_opt = sq_gw_opt**2

        assert util.is_real_num(
            gw_opt), 'gw_opt is not real. Was %s' % str(gw_opt)

        return V_opt, gw_opt, opt_result
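
An end-to-end sketch for the Gaussian case: tune the width and locations on a training split, then test on held-out data. The constructor GaussFSSD(p, gwidth, V, alpha) is assumed:

    import numpy as np
    import kgof.data as data
    import kgof.density as density
    import kgof.goftest as gof
    import kgof.util as util

    p = density.IsotropicNormal(np.zeros(2), 1.0)
    dat = data.DSIsotropicNormal(np.zeros(2) + 0.5, 1.0).sample(800, seed=70)
    tr, te = dat.split_tr_te(tr_proportion=0.2, seed=71)

    # Median-heuristic initialization for the squared Gaussian width.
    gwidth0 = util.meddistance(tr.data(), subsample=1000) ** 2
    V0 = util.fit_gaussian_draw(tr.data(), J=5, seed=72)
    V_opt, gw_opt, info = gof.GaussFSSD.optimize_locs_widths(
        p, tr, gwidth0, V0, reg=1e-2, max_iter=50, use_2terms=False)

    # Assumed constructor: GaussFSSD(p, gwidth, V, alpha).
    gauss_fssd = gof.GaussFSSD(p, gw_opt, V_opt, alpha=0.05)
    print(gauss_fssd.perform_test(te))
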