def load_stack(class_data_loader, classes=None, seed=28, max_class_size=None):
    """
    Load the specified cifar10 classes and stack them into one dataset.
    This function is deterministic given the seed.

    max_class_size: if specified, then randomly select at most
        max_class_size points in each class.
    """
    if classes is None:
        classes = cifar10_classes
    list_arrays = []
    label_arrays = []
    with util.NumpySeedContext(seed=seed):
        for c in classes:
            log.l().info('Loading cifar10 class: {}'.format(c))
            arr = class_data_loader(c)
            nc = arr.shape[0]
            if max_class_size is not None:
                ncmax = min(nc, max_class_size)
            else:
                ncmax = nc
            Ind = util.subsample_ind(nc, ncmax, seed=seed + 3)
            sub_arr = arr[Ind, :]
            class_label = cifar10_class_ind_dict[c]
            Yc = np.ones(ncmax) * class_label
            list_arrays.append(sub_arr)
            label_arrays.append(Yc)
    stack = np.vstack(list_arrays)
    label_stack = np.hstack(label_arrays)
    # Only meaningful when a per-class cap was requested; the unconditional
    # check would raise a TypeError when max_class_size is None.
    if max_class_size is not None:
        assert stack.shape[0] <= len(classes) * max_class_size
    assert stack.shape[0] == len(label_stack)
    return stack, label_stack
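# Example usage (a sketch; `my_loader` is a hypothetical per-class loader --
# any callable mapping a class name to an (n, d) numpy array works):
#
#   def my_loader(class_name):
#       return np.load('features/{}.npy'.format(class_name))
#
#   X, Y = load_stack(my_loader, classes=['cat', 'dog'], seed=28,
#                     max_class_size=500)
#   # X: stacked feature rows; Y: integer class labels aligned with rows of X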
def ume_power_criterion(X, Y, Z, Vp, Vq, k, reg):
    fea_pr = ume_feature_matrix(X, Z, Vp, k)  # n x Jp
    fea_qr = ume_feature_matrix(Y, Z, Vq, k)  # n x Jq
    umehp, var_pr = ume_ustat_h1_mean_variance(fea_pr, return_variance=True,
                                               use_unbiased=True)
    umehq, var_qr = ume_ustat_h1_mean_variance(fea_qr, return_variance=True,
                                               use_unbiased=True)

    if (var_pr <= 0).any():
        log.l().warning('Non-positive var_pr detected. Was {}'.format(var_pr))
    if (var_qr <= 0).any():
        log.l().warning('Non-positive var_qr detected. Was {}'.format(var_qr))

    mean_h1 = umehp - umehq

    # mean features
    mean_pr = torch.mean(fea_pr, dim=0)
    mean_qr = torch.mean(fea_qr, dim=0)
    t1 = 4.0 * torch.mean(
        torch.matmul(fea_pr, mean_pr) * torch.matmul(fea_qr, mean_qr))
    t2 = 4.0 * torch.sum(mean_pr**2) * torch.sum(mean_qr**2)

    # compute the cross-covariance
    var_pqr = t1 - t2
    var_h1 = var_pr - 2.0 * var_pqr + var_qr

    power_criterion = mean_h1 / torch.sqrt(var_h1 + reg)
    return power_criterion
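# A minimal smoke test of the criterion (a sketch; the kernel `k` is an
# assumption -- any kernel accepted by ume_feature_matrix works, with test
# locations Vp (Jp x d) and Vq (Jq x d)):
#
#   n, d = 300, 2
#   X, Y, Z = torch.randn(n, d), torch.randn(n, d), torch.randn(n, d)
#   Vp, Vq = torch.randn(3, d), torch.randn(3, d)
#   crit = ume_power_criterion(X, Y, Z, Vp, Vq, k, reg=1e-5)
#   # Maximizing `crit` over Vp, Vq (and kernel parameters) is how the test
#   # locations are tuned for power.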
def perform_test(self, dat):
    """
    :param dat: an instance of kmod.data.Data
    """
    with util.ContextTimer() as t:
        alpha = self.alpha
        X = dat.data()
        n = X.shape[0]

        # mean and variance are not yet scaled by \sqrt{n}.
        # The variance is the same under both H0 and H1.
        mean_h1, var = self.get_H1_mean_variance(dat)
        stat = (n**0.5) * mean_h1
        null_std = var**0.5
        if null_std <= 1e-6:
            log.l().warning(
                'SD of the null distribution is too small. Was {}. '
                'Will not reject H0.'.format(null_std))
            # An infinite p-value guarantees that H0 is not rejected below.
            pval = np.inf
        else:
            # Assume the mean of the null distribution is 0
            pval = stats.norm.sf(stat, loc=0, scale=null_std)

    results = {
        'alpha': self.alpha,
        'pvalue': pval,
        'test_stat': stat,
        'h0_rejected': pval < alpha,
        'time_secs': t.secs,
    }
    return results
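# Example of consuming the result (a sketch; `test` and `dat` stand for a
# constructed model-comparison test object and a kmod.data.Data sample):
#
#   results = test.perform_test(dat)
#   if results['h0_rejected']:
#       # the statistic sqrt(n)*mean_h1 is significantly positive
#       print('Reject H0 (p-value: {})'.format(results['pvalue']))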
def problems_folder():
    """
    Return the full path to the problems folder.
    """
    import kmod.config as config
    problems_path = config.resource_configs['problems_path']
    log.l().warning(
        'The function problems_folder() is deprecated. '
        'Use prob_model_folder() instead.')
    return problems_path
def get_H1_mean_variance(self, dat, return_variance=True):
    r"""
    Return the mean and variance under H1 of the test statistic
    = \sqrt{n}(UME(P, R)^2 - UME(Q, R)^2).
    The estimator of the mean is unbiased (can be negative). The variance
    is also valid under H0.

    :returns: (mean, variance)

    If return_variance is False,
    :returns: mean
    """
    umep = self.umep
    umeq = self.umeq
    # form a two-sample test dataset between datap and dat (data from R)
    Z = dat.data()
    datapr = tstdata.TSTData(self.datap.data(), Z)
    dataqr = tstdata.TSTData(self.dataq.data(), Z)

    # get the feature matrices (correlated)
    fea_pr = umep.feature_matrix(datapr)  # n x Jp
    fea_qr = umeq.feature_matrix(dataqr)  # n x Jq
    assert fea_pr.shape[1] == self.V.shape[0]
    assert fea_qr.shape[1] == self.W.shape[0]

    # umehp = ume_hat(p, r)
    umehp, var_pr = tst.UMETest.ustat_h1_mean_variance(
        fea_pr, return_variance=True, use_unbiased=True)
    umehq, var_qr = tst.UMETest.ustat_h1_mean_variance(
        fea_qr, return_variance=True, use_unbiased=True)

    if var_pr <= 0:
        log.l().warning('Non-positive var_pr detected. Was {}'.format(var_pr))
    if var_qr <= 0:
        log.l().warning('Non-positive var_qr detected. Was {}'.format(var_qr))

    mean_h1 = umehp - umehq
    if not return_variance:
        return mean_h1

    # mean features
    mean_pr = np.mean(fea_pr, axis=0)
    mean_qr = np.mean(fea_qr, axis=0)
    t1 = 4.0 * np.mean(np.dot(fea_pr, mean_pr) * np.dot(fea_qr, mean_qr))
    t2 = 4.0 * np.sum(mean_pr**2) * np.sum(mean_qr**2)

    # compute the cross-covariance
    var_pqr = t1 - t2
    var_h1 = var_pr - 2.0 * var_pqr + var_qr
    return mean_h1, var_h1
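# The cross term above is the plug-in estimate of 4*Cov(u_p, u_q) where
# u_p = fea_pr.dot(mean_pr) and u_q = fea_qr.dot(mean_qr). A quick numerical
# check with random matrices (a sketch, not part of the test):
#
#   rng = np.random.RandomState(0)
#   fp, fq = rng.randn(100, 3), rng.randn(100, 4)
#   mp, mq = fp.mean(0), fq.mean(0)
#   t1 = 4.0 * np.mean(fp.dot(mp) * fq.dot(mq))
#   t2 = 4.0 * np.sum(mp**2) * np.sum(mq**2)
#   # t1 - t2 equals 4 * np.cov(fp.dot(mp), fq.dot(mq), bias=True)[0, 1],
#   # since np.mean(fp.dot(mp)) == mp.dot(mp) and likewise for fq.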
def perform_test(self, dat):
    """
    Perform the model comparison test and return values computed in a
    dictionary:
    {
        alpha: 0.01,
        pvalue: 0.0002,
        test_stat: 2.3,
        h0_rejected: True,
        time_secs: ...
    }

    :param dat: an instance of kmod.data.Data
    """
    with util.ContextTimer() as t:
        alpha = self.alpha
        X = dat.data()
        n = X.shape[0]

        # mean and variance are not yet scaled by \sqrt{n}.
        # The variance is the same under both H0 and H1.
        mean_h1, var = self.get_H1_mean_variance(dat)
        if not util.is_real_num(var) or var < 0:
            log.l().warning('Invalid H0 variance. Was {}'.format(var))
        stat = (n**0.5) * mean_h1
        # Assume the mean of the null distribution is 0
        pval = stats.norm.sf(stat, loc=0, scale=var**0.5)
        if not util.is_real_num(pval):
            log.l().warning(
                'p-value is not a real number. Was {}'.format(pval))

    results = {
        'alpha': self.alpha,
        'pvalue': pval,
        'test_stat': stat,
        'h0_rejected': pval < alpha,
        'time_secs': t.secs,
    }
    return results
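# The p-value above is one-sided: sf(stat, loc=0, scale=var**0.5) is the
# upper-tail probability of the normal null. A standalone sketch:
#
#   from scipy import stats
#   stats.norm.sf(2.0, loc=0, scale=1.0)  # ~0.0228 = P(N(0, 1) > 2)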
def get_H1_mean_variance(self, dat):
    r"""
    Return the mean and variance under H1 of the test statistic
    = \sqrt{n}(FSSD(p)^2 - FSSD(q)^2).
    The estimator of the mean is unbiased (can be negative). The estimator
    of the variance is biased. The variance is also valid under H0.

    :returns: (mean, variance)
    """
    fssdp = self.fssdp
    fssdq = self.fssdq
    X = dat.data()

    # Feature tensor: n x d x Jp where n = sample size.
    Xip = fssdp.feature_tensor(X)
    n, d, Jp = Xip.shape
    # Feature tensor: n x d x Jq where n = sample size.
    Xiq = fssdq.feature_tensor(X)
    Jq = Xiq.shape[2]
    assert Xiq.shape[0] == n
    assert Xiq.shape[1] == d

    statp, varp = gof.FSSD.ustat_h1_mean_variance(Xip, return_variance=True,
                                                  use_unbiased=True)
    if varp <= 0:
        log.l().warning('varp is not positive. Was {}'.format(varp))
    statq, varq = gof.FSSD.ustat_h1_mean_variance(Xiq, return_variance=True,
                                                  use_unbiased=True)
    if varq <= 0:
        log.l().warning('varq is not positive. Was {}'.format(varq))
    mean_h1 = statp - statq

    # Compute the cross-covariance (i.e., the off-diagonal entry of the 2x2
    # covariance matrix of the asymptotic joint normal).
    # mu: d*J vector
    Taup = np.reshape(Xip, [n, d * Jp])
    Tauq = np.reshape(Xiq, [n, d * Jq])
    # length-d*Jp vector
    mup = np.mean(Taup, 0)
    # length-d*Jq vector
    muq = np.mean(Tauq, 0)
    varpq = 4.0 * np.mean(np.dot(Taup, mup) * np.dot(Tauq, muq)) \
        - 4.0 * np.sum(mup**2) * np.sum(muq**2)
    variance = varp - 2.0 * varpq + varq
    if variance <= 0:
        log.l().warning(
            'variance of the stat is not positive. Was {}'.format(variance))
    return mean_h1, variance
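# Flattening sketch (shapes only, hedged): the (n, d, J) feature tensor is
# treated as n samples of a (d*J)-dimensional vector, so mup/muq are the
# empirical means of those vectors:
#
#   Xip = np.zeros((50, 3, 2))            # n=50, d=3, Jp=2
#   Taup = np.reshape(Xip, [50, 3 * 2])   # (50, 6): one row per sample
#   mup = np.mean(Taup, 0)                # length-6 mean vector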
def _download_data(self, feature_folder):
    for class_name in self.classes:
        filename = '{}.npy'.format(class_name)
        npy_path = self.data_path(feature_folder, filename)
        try:
            if not os.path.exists(npy_path):
                dir_path = self.data_path(feature_folder)
                os.makedirs(dir_path, exist_ok=True)
                relative_path = [
                    'problems', self.dataname, feature_folder, filename
                ]
                # Join with '/' rather than os.path.join: this is a URL,
                # which uses forward slashes on every platform.
                url = '/'.join([self.data_url.rstrip('/')] + relative_path)
                log.l().info('Downloading {}'.format(url))
                util.download_to(url, npy_path)
                log.l().info('Saved to {}'.format(npy_path))
        except urllib.error.HTTPError:
            log.l().warning(
                'File does not exist on the server: {}'.format(url))
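# util.download_to is assumed to behave like this stdlib sketch (a
# hypothetical minimal version, shown for reference only):
#
#   import urllib.request
#   def download_to(url, dest_path):
#       with urllib.request.urlopen(url) as r, open(dest_path, 'wb') as f:
#           f.write(r.read())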
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000,
                        metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=10, metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument(
        '--log-interval', type=int, default=10, metavar='N',
        help='how many batches to wait before logging training status')
    parser.add_argument(
        '--data_dir', type=str, default=glo.data_file('mnist/'),
        help='Full path to the folder containing MNIST training data. '
        'MNIST data will be downloaded if it does not already exist.')
    parser.add_argument(
        '--prob_model_dir', type=str,
        default=glo.prob_model_folder('mnist_cnn'),
        help='Full path to the folder used to save files related to the '
        'mnist-cnn classifier.')
    args = parser.parse_args()

    print('Training options: ')
    pprint.pprint(vars(args), width=5)

    use_cuda = not args.no_cuda and torch.cuda.is_available()
    os.makedirs(args.prob_model_dir, exist_ok=True)
    model_fname = 'mnist_cnn_ep{}_s{}.pt'.format(args.epochs, args.seed)
    model_fpath = os.path.join(args.prob_model_dir, model_fname)
    log.l().info(
        'Will save the trained CNN classifier model to {}'.format(model_fpath))
    log.l().info('Starting training')

    torch.manual_seed(args.seed)
    device = torch.device('cuda' if use_cuda else 'cpu')
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    mnist_folder = args.data_dir
    mnist_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307, ), (0.3081, ))
    ])
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST(mnist_folder, train=True, download=True,
                       transform=mnist_transform),
        batch_size=args.batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST(mnist_folder, train=False, transform=mnist_transform),
        batch_size=args.test_batch_size, shuffle=True, **kwargs)

    model = MnistClassifier().to(device)
    optimizer = optim.SGD(model.parameters(), lr=args.lr,
                          momentum=args.momentum)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        test(args, model, device, test_loader)

    # save the model
    model.save(model_fpath)
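# Example invocation (a sketch; the script filename is an assumption):
#
#   python train_mnist_cnn.py --epochs 10 --lr 0.01 --seed 1
#
# The trained classifier is saved under --prob_model_dir as
# mnist_cnn_ep{epochs}_s{seed}.pt.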
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--n_epochs', type=int, default=30,
                        help='number of epochs of training')
    parser.add_argument('--batch_size', type=int, default=64,
                        help='size of the batches')
    parser.add_argument('--lr', type=float, default=0.0002,
                        help='adam: learning rate')
    parser.add_argument('--b1', type=float, default=0.5,
                        help='adam: decay of first order momentum of gradient')
    parser.add_argument(
        '--b2', type=float, default=0.999,
        help='adam: decay of second order momentum of gradient')
    parser.add_argument(
        '--n_cpu', type=int, default=2,
        help='number of cpu threads to use during batch generation')
    parser.add_argument('--latent_dim', type=int, default=100,
                        help='dimensionality of the latent space')
    parser.add_argument(
        '--sample_interval', type=int, default=400,
        help='Create sample images every this many minibatch updates')
    parser.add_argument(
        '--data_dir', type=str, default=glo.data_file('mnist/'),
        help='Full path to the folder containing MNIST training data. '
        'MNIST data will be downloaded if it does not already exist.')
    parser.add_argument(
        '--prob_model_dir', type=str,
        default=glo.prob_model_folder('mnist_dcgan'),
        help='Full path to the folder used to save mnist-dcgan related '
        'files, e.g., generated images and the model.')
    opt = parser.parse_args()
    opt_dict = vars(opt)
    print('Training options: ')
    pprint.pprint(opt_dict, width=5)

    # train a DCGAN
    dcgan = DCGAN(**opt_dict)
    model_fname = 'mnist_dcgan_ep{}_bs{}.pt'.format(opt.n_epochs,
                                                    opt.batch_size)
    model_fpath = os.path.join(opt.prob_model_dir, model_fname)
    log.l().info('Will save the trained DCGAN model to {}'.format(model_fpath))
    log.l().info('Starting training')
    dcgan.train()

    # save the generator as an object of type kmod.gen.PTNoiseTransformer
    g = dcgan.generator
    f_sample_noise = dcgan.sample_noise

    # get output sizes by sampling one image
    cuda = torch.cuda.is_available()
    Tensor = torch.cuda.FloatTensor if cuda else torch.FloatTensor
    z = Variable(f_sample_noise(1).type(Tensor))
    gen_img = g(z)
    in_out_shapes = (dcgan.latent_dim, gen_img.shape[1:])
    G = gen.PTNoiseTransformerAdapter(module=g, f_sample_noise=f_sample_noise,
                                      in_out_shapes=in_out_shapes,
                                      tensor_type=Tensor)
    # save() is a method from kmod.net.SerializableModule
    G.save(model_fpath)
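# Example invocation (a sketch; the script filename is an assumption):
#
#   python train_mnist_dcgan.py --n_epochs 30 --batch_size 64 --lr 0.0002
#
# The wrapped generator is saved under --prob_model_dir as
# mnist_dcgan_ep{n_epochs}_bs{batch_size}.pt.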