def optimize_auto_init(p, dat, J, **ops):
    """
    Optimize parameters by calling optimize_locs_widths(). Automatically
    initialize the test locations and the Gaussian width.

    Return optimized locations, Gaussian width, optimization info
    """
    assert J > 0
    # Use grid search to initialize the gwidth
    X = dat.data()
    n_gwidth_cand = 5
    gwidth_factors = 2.0**np.linspace(-3, 3, n_gwidth_cand)
    med2 = util.meddistance(X, 1000)**2

    k = kernel.KGauss(med2 * 2)
    # fit a Gaussian to the data and draw to initialize V0
    V0 = util.fit_gaussian_draw(X, J, seed=829, reg=1e-6)
    list_gwidth = np.hstack(((med2) * gwidth_factors))
    besti, objs = GaussFSSD.grid_search_gwidth(p, dat, V0, list_gwidth)
    gwidth = list_gwidth[besti]
    assert util.is_real_num(gwidth), 'gwidth not real. Was %s' % str(gwidth)
    assert gwidth > 0, 'gwidth not positive. Was %.3g' % gwidth
    logging.info('After grid search, gwidth=%.3g' % gwidth)

    V_opt, gwidth_opt, info = GaussFSSD.optimize_locs_widths(
        p, dat, gwidth, V0, **ops)

    # set the width bounds
    #fac_min = 5e-2
    #fac_max = 5e3
    #gwidth_lb = fac_min*med2
    #gwidth_ub = fac_max*med2
    #gwidth_opt = max(gwidth_lb, min(gwidth_opt, gwidth_ub))
    return V_opt, gwidth_opt, info
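
# --- Usage sketch (added for illustration, not part of the original code) ---
# A minimal example of how optimize_auto_init() might be called, assuming the
# kgof-style API used above. The import layout (kgof.density, kgof.data) and
# the GaussFSSD(p, gwidth, V, alpha) constructor are assumptions; adjust to
# the actual package.
def example_optimize_auto_init():
    import numpy as np
    from kgof import density, data  # assumed package layout

    # Model p and data drawn from a slightly shifted distribution.
    p = density.IsotropicNormal(np.zeros(2), 1.0)
    ds = data.DSIsotropicNormal(np.zeros(2) + 0.2, 1.0)
    dat = ds.sample(800, seed=5)

    # Optimize the test locations and Gaussian width on a held-out split,
    # then run the FSSD test on the remaining split.
    tr, te = dat.split_tr_te(tr_proportion=0.5, seed=2)
    opts = {'reg': 1e-2, 'max_iter': 50, 'tol_fun': 1e-4}
    V_opt, gw_opt, opt_info = GaussFSSD.optimize_auto_init(p, tr, J=5, **opts)
    fssd = GaussFSSD(p, gw_opt, V_opt, alpha=0.01)
    return fssd.perform_test(te)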

def perform_test(self, dat, candidate_kernels=None, return_mmdtest=False,
                 tr_proportion=0.2, reg=1e-3):
    """
    dat: an instance of Data
    candidate_kernels: a list of Kernel's to choose from
    return_mmdtest: if True, also include the fitted QuadMMDTest object in
        the returned results
    tr_proportion: proportion of the sample used for choosing the best kernel
    reg: regularization parameter for the test power criterion
    """
    with util.ContextTimer() as t:
        seed = self.seed
        p = self.p
        ds = p.get_datasource()
        p_sample = ds.sample(dat.sample_size(), seed=seed + 77)
        xtr, xte = p_sample.split_tr_te(tr_proportion=tr_proportion,
                                        seed=seed + 18)
        # ytr, yte are of type data.Data
        ytr, yte = dat.split_tr_te(tr_proportion=tr_proportion, seed=seed + 12)

        # training and test data
        tr_tst_data = fdata.TSTData(xtr.data(), ytr.data())
        te_tst_data = fdata.TSTData(xte.data(), yte.data())

        if candidate_kernels is None:
            # Assume a Gaussian kernel. Construct a list of
            # kernels to try based on multiples of the median heuristic
            med = util.meddistance(tr_tst_data.stack_xy(), 1000)
            list_gwidth = np.hstack(((med**2) * (2.0**np.linspace(-4, 4, 10))))
            list_gwidth.sort()
            candidate_kernels = [kernel.KGauss(gw2) for gw2 in list_gwidth]

        alpha = self.alpha

        # grid search to choose the best Gaussian width
        besti, powers = tst.QuadMMDTest.grid_search_kernel(
            tr_tst_data, candidate_kernels, alpha, reg=reg)
        # perform test
        best_ker = candidate_kernels[besti]
        mmdtest = tst.QuadMMDTest(best_ker, self.n_permute, alpha=alpha)
        results = mmdtest.perform_test(te_tst_data)
        if return_mmdtest:
            results['mmdtest'] = mmdtest

    results['time_secs'] = t.secs
    return results
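
# --- Usage sketch (added for illustration) ---
# perform_test() above belongs to an MMD-based goodness-of-fit test class that
# samples from the model p and compares against the observed data with a
# quadratic-time MMD two-sample test (freqopttest's QuadMMDTest). The class
# name and constructor are not shown in this excerpt, so the test class is
# passed in as an argument here; the assumed constructor arguments mirror the
# attributes the method reads (self.p, self.seed, self.alpha, self.n_permute).
def example_mmd_gof_test(mmd_gof_class):
    import numpy as np
    from kgof import density, data  # assumed package layout

    p = density.IsotropicNormal(np.zeros(2), 1.0)
    dat = data.DSIsotropicNormal(np.zeros(2) + 0.3, 1.0).sample(500, seed=8)

    # Hypothetical constructor signature for illustration only.
    mmd_test = mmd_gof_class(p, n_permute=300, alpha=0.05, seed=8)
    results = mmd_test.perform_test(dat, candidate_kernels=None,
                                    tr_proportion=0.2, reg=1e-3)
    return results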

def optimize_locs_widths(
        p,
        dat,
        gwidth0,
        test_locs0,
        reg=1e-2,
        max_iter=100,
        tol_fun=1e-5,
        disp=False,
        locs_bounds_frac=100,
        gwidth_lb=None,
        gwidth_ub=None,
        use_2terms=False,
):
    """
    Optimize the test locations and the Gaussian kernel width by maximizing
    a test power criterion. dat should not be the same data as used in the
    actual test (i.e., it should be a held-out set). This function is
    deterministic.

    - dat: a Data object
    - test_locs0: Jxd numpy array. Initial V.
    - reg: regularization added to the mean/sqrt(variance) criterion so that
        it becomes mean/sqrt(variance + reg)
    - gwidth0: initial value of the Gaussian width^2
    - max_iter: maximum number of gradient descent iterations
    - tol_fun: termination tolerance of the objective value
    - disp: True to print convergence messages
    - locs_bounds_frac: When making box bounds for the test_locs, extend
        the box defined by coordinate-wise min-max by the std of each
        coordinate multiplied by this number.
    - gwidth_lb: absolute lower bound on the Gaussian width^2
    - gwidth_ub: absolute upper bound on the Gaussian width^2
    - use_2terms: If True, then besides the signal-to-noise ratio criterion,
        the objective function will also include the first term that is
        dropped.

    If gwidth_lb or gwidth_ub is None, a fraction of the median heuristic is
    used to automatically set the corresponding bound.

    Return (optimized test_locs V, optimized Gaussian width^2,
    optimization info)
    """
    J = test_locs0.shape[0]
    X = dat.data()
    n, d = X.shape

    # Parameterize the Gaussian width with its square root (then square later)
    # to automatically enforce the positivity.
    def obj(sqrt_gwidth, V):
        return -GaussFSSD.power_criterion(
            p, dat, sqrt_gwidth**2, V, reg=reg, use_2terms=use_2terms)

    flatten = lambda gwidth, V: np.hstack((gwidth, V.reshape(-1)))

    def unflatten(x):
        sqrt_gwidth = x[0]
        V = np.reshape(x[1:], (J, d))
        return sqrt_gwidth, V

    def flat_obj(x):
        sqrt_gwidth, V = unflatten(x)
        return obj(sqrt_gwidth, V)

    # Initial point
    x0 = flatten(np.sqrt(gwidth0), test_locs0)

    # make sure that the optimized gwidth is not too small or too large.
    fac_min = 1e-2
    fac_max = 1e2
    med2 = util.meddistance(X, subsample=1000)**2
    if gwidth_lb is None:
        gwidth_lb = max(fac_min * med2, 1e-3)
    if gwidth_ub is None:
        gwidth_ub = min(fac_max * med2, 1e5)

    # Make a box to bound test locations
    X_std = np.std(X, axis=0)
    # X_min: length-d array
    X_min = np.min(X, axis=0)
    X_max = np.max(X, axis=0)
    # V_lb: J x d
    V_lb = np.tile(X_min - locs_bounds_frac * X_std, (J, 1))
    V_ub = np.tile(X_max + locs_bounds_frac * X_std, (J, 1))
    # (J*d+1) x 2. Take square root because we parameterize with the square
    # root
    x0_lb = np.hstack((np.sqrt(gwidth_lb), np.reshape(V_lb, -1)))
    x0_ub = np.hstack((np.sqrt(gwidth_ub), np.reshape(V_ub, -1)))
    x0_bounds = list(zip(x0_lb, x0_ub))

    # gradient of the flattened objective
    grad_obj = autograd.elementwise_grad(flat_obj)

    # optimize. Time the optimization as well.
    # https://docs.scipy.org/doc/scipy/reference/optimize.minimize-lbfgsb.html
    with util.ContextTimer() as timer:
        opt_result = scipy.optimize.minimize(
            flat_obj,
            x0,
            method='L-BFGS-B',
            bounds=x0_bounds,
            tol=tol_fun,
            options={
                'maxiter': max_iter,
                'ftol': tol_fun,
                'disp': disp,
                'gtol': 1.0e-07,
            },
            jac=grad_obj,
        )

    opt_result = dict(opt_result)
    opt_result['time_secs'] = timer.secs
    x_opt = opt_result['x']
    sq_gw_opt, V_opt = unflatten(x_opt)
    gw_opt = sq_gw_opt**2

    assert util.is_real_num(gw_opt), 'gw_opt is not real. Was %s' % str(gw_opt)

    return V_opt, gw_opt, opt_result
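
# --- Parameterization sketch (added for illustration) ---
# The optimizer above works on a single flat vector x = [sqrt(gwidth), vec(V)]
# of length J*d + 1, with box bounds of the same shape. This self-contained
# numpy snippet mirrors that packing/unpacking so the square-root
# parameterization is easy to verify in isolation.
def example_flatten_roundtrip():
    import numpy as np

    J, d = 3, 2
    rng = np.random.default_rng(0)
    V0 = rng.standard_normal((J, d))   # initial test locations
    gwidth0 = 1.7                      # initial Gaussian width^2

    # pack: first entry is sqrt(gwidth), the rest is V flattened row-wise
    x0 = np.hstack((np.sqrt(gwidth0), V0.reshape(-1)))
    assert x0.shape == (J * d + 1,)

    # unpack: square the first entry back, reshape the rest to J x d
    gwidth_rec = x0[0]**2
    V_rec = np.reshape(x0[1:], (J, d))
    assert np.isclose(gwidth_rec, gwidth0)
    assert np.allclose(V_rec, V0)
    return gwidth_rec, V_rec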

d = 2
# sample
n = 800
mean = np.zeros(d)
variance = 1.0
qmean = mean.copy()
qmean[0] = 0
qvariance = variance

p = density.IsotropicNormal(mean, variance)
ds = data.DSIsotropicNormal(qmean, qvariance)
# ds = data.DSLaplace(d=d, loc=0, scale=1.0/np.sqrt(2))
dat = ds.sample(n, seed=seed + 1)
X = dat.data()
sig2 = util.meddistance(X, subsample=1000)**2


def simulatepm(N, p_change):
    """
    Simulate a length-N sequence of +1/-1 values whose sign flips with
    probability p_change at each step.

    :param N: length of the sequence
    :param p_change: probability of a sign change at each step
    :return: numpy array of length N with entries in {-1, +1}
    """
    X = np.zeros(N) - 1
    change_sign = np.random.rand(N) < p_change
    for i in range(N):
        if change_sign[i]:
            X[i] = -X[i - 1]
        else:
            X[i] = X[i - 1]
    return X
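
# --- Usage sketch (added for illustration) ---
# simulatepm() above produces a +1/-1 sequence whose sign flips with
# probability p_change at each step. A quick sanity check of the empirical
# flip rate:
def example_simulatepm_check():
    import numpy as np

    np.random.seed(0)
    seq = simulatepm(N=10000, p_change=0.1)
    assert set(np.unique(seq)) <= {-1.0, 1.0}
    # fraction of consecutive entries that differ; should be close to p_change
    flip_rate = np.mean(seq[1:] != seq[:-1])
    print('empirical flip rate ~ %.3f (expected ~0.1)' % flip_rate)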

def main():
    random.seed(0)

    n = 6000  # number of data points, divisible by num_Gaussians
    num_Gaussians = 3
    input_dim = 2
    mean_param = np.zeros((input_dim, num_Gaussians))
    cov_param = np.zeros((input_dim, input_dim, num_Gaussians))

    mean_param[:, 0] = [2, 8]
    mean_param[:, 1] = [-10, -4]
    mean_param[:, 2] = [-1, -7]

    cov_mat = np.empty((2, 2))
    cov_mat[0, 0] = 1
    cov_mat[1, 1] = 4
    cov_mat[0, 1] = -0.25
    cov_mat[1, 0] = -0.25
    cov_param[:, :, 0] = cov_mat
    cov_mat[0, 1] = 0.4
    cov_mat[1, 0] = 0.4
    cov_param[:, :, 1] = cov_mat
    cov_param[:, :, 2] = 2 * np.eye(input_dim)

    data_samps, true_labels = generate_data(mean_param, cov_param, n)

    # test how to use RFF for computing the kernel matrix
    med = util.meddistance(data_samps)
    # sigma2 = med**2
    sigma2 = med  # a smaller length scale than the median heuristic seems to work better here
    print('length scale from median heuristic is', sigma2)

    # random Fourier features
    n_features = 100
    n_classes = num_Gaussians

    """ training a Generator via minimizing MMD """
    mini_batch_size = 1000
    input_size = 10
    hidden_size_1 = 100
    hidden_size_2 = 50
    output_size = input_dim + n_classes

    # model = Generative_Model(input_dim=input_dim, how_many_Gaussians=num_Gaussians)
    model = Generative_Model(input_size=input_size,
                             hidden_size_1=hidden_size_1,
                             hidden_size_2=hidden_size_2,
                             output_size=output_size,
                             n_classes=n_classes)

    # optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    optimizer = optim.Adam(model.parameters(), lr=1e-2)
    # optimizer = optim.SGD(model.parameters(), lr=0.001)
    how_many_epochs = 1000
    how_many_iter = n // mini_batch_size

    training_loss_per_epoch = np.zeros(how_many_epochs)

    draws = n_features // 2
    W_freq = np.random.randn(draws, input_dim) / np.sqrt(sigma2)

    """ computing mean embedding of true data """
    emb1_input_features = RFF_Gauss(n_features, torch.Tensor(data_samps), W_freq)
    emb1_labels = torch.Tensor(true_labels)
    outer_emb1 = torch.einsum('ki,kj->kij', [emb1_input_features, emb1_labels])
    mean_emb1 = torch.mean(outer_emb1, 0)

    print('Starting Training')

    for epoch in range(how_many_epochs):  # loop over the dataset multiple times
        running_loss = 0.0

        for i in range(how_many_iter):
            # zero the parameter gradients
            optimizer.zero_grad()

            outputs = model(torch.randn((mini_batch_size, input_size)))
            samp_input_features = outputs[:, 0:input_dim]
            samp_labels = outputs[:, -n_classes:]

            """ computing mean embedding of generated samples """
            emb2_input_features = RFF_Gauss(n_features, samp_input_features, W_freq)
            emb2_labels = samp_labels
            outer_emb2 = torch.einsum('ki,kj->kij', [emb2_input_features, emb2_labels])
            mean_emb2 = torch.mean(outer_emb2, 0)

            loss = torch.norm(mean_emb1 - mean_emb2, p=2)**2

            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()

        if running_loss <= 1e-4:
            break
        print('epoch # and running loss are ', [epoch, running_loss])
        training_loss_per_epoch[epoch] = running_loss

    plt.figure(1)
    plt.subplot(121)
    true_labl = np.argmax(true_labels, axis=1)
    plt.scatter(data_samps[:, 0], data_samps[:, 1], c=true_labl, label=true_labl)
    plt.title('true data')

    plt.subplot(122)
    model.eval()
    generated_samples = samp_input_features.detach().numpy()
    generated_labels = samp_labels.detach().numpy()
    labl = np.argmax(generated_labels, axis=1)
    plt.scatter(generated_samples[:, 0], generated_samples[:, 1], c=labl, label=labl)
    plt.title('simulated data')

    plt.figure(2)
    plt.plot(training_loss_per_epoch)
    plt.title('MMD as a function of epoch')
    plt.show()
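
# --- RFF sketch (added for illustration) ---
# RFF_Gauss() is called above but not defined in this excerpt. A minimal
# random-Fourier-feature map for a Gaussian kernel, consistent with how it is
# called here (W has shape (n_features // 2, input_dim), so the cos and sin
# parts together give n_features features), might look like the following.
# This is an assumption about the helper, not its original definition.
def RFF_Gauss_sketch(n_features, X, W):
    import numpy as np
    import torch

    W = torch.Tensor(W)
    XWT = torch.mm(X, torch.t(W))  # (batch, n_features // 2)
    Z = torch.cat((torch.cos(XWT), torch.sin(XWT)), dim=1)
    # scale so that Z @ Z.T approximates the Gaussian kernel matrix
    return Z * np.sqrt(2.0 / n_features)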