Example #1
def load_stack(class_data_loader, classes=None, seed=28, max_class_size=None):
    """
    This function is deterministic given the seed.
    
    max_class_size: if specified, then randomly select at most max_class_size 
        points in each class.
    """
    if classes is None:
        classes = cifar10_classes
    list_arrays = []
    label_arrays = []
    with util.NumpySeedContext(seed=seed):
        for c in classes:
            log.l().info('Loading cifar10 class: {}'.format(c))
            arr = class_data_loader(c)
            nc = arr.shape[0]
            if max_class_size is not None:
                ncmax = min(nc, max_class_size)
            else:
                ncmax = nc
            Ind = util.subsample_ind(nc, ncmax, seed=seed + 3)
            sub_arr = arr[Ind, :]
            class_label = cifar10_class_ind_dict[c]
            Yc = np.ones(ncmax) * class_label

            list_arrays.append(sub_arr)
            label_arrays.append(Yc)
    stack = np.vstack(list_arrays)
    label_stack = np.hstack(label_arrays)
    if max_class_size is not None:
        assert stack.shape[0] <= len(classes) * max_class_size
    assert stack.shape[0] == len(label_stack)
    return stack, label_stack
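A hypothetical usage sketch (not from the source): my_loader stands in for a real class_data_loader, i.e., a function mapping a CIFAR-10 class name to an (n, d) numpy array. The class names must be keys of cifar10_class_ind_dict.

import numpy as np

def my_loader(class_name):
    # Placeholder loader: in practice this would read precomputed
    # features for the given CIFAR-10 class from disk.
    return np.random.randn(1000, 32 * 32 * 3)

X, Y = load_stack(my_loader, classes=['cat', 'dog'], seed=7, max_class_size=500)
print(X.shape, Y.shape)  # (1000, 3072) (1000,)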
Example #2
def ume_power_criterion(X, Y, Z, Vp, Vq, k, reg):
    fea_pr = ume_feature_matrix(X, Z, Vp, k)  # n x Jp
    fea_qr = ume_feature_matrix(Y, Z, Vq, k)  # n x Jq
    umehp, var_pr = ume_ustat_h1_mean_variance(fea_pr,
                                               return_variance=True,
                                               use_unbiased=True)
    umehq, var_qr = ume_ustat_h1_mean_variance(fea_qr,
                                               return_variance=True,
                                               use_unbiased=True)

    if (var_pr <= 0).any():
        log.l().warning('Non-positive var_pr detected. Was {}'.format(var_pr))
    if (var_qr <= 0).any():
        log.l().warning('Non-positive var_qr detected. Was {}'.format(var_qr))
    mean_h1 = umehp - umehq

    # mean features
    mean_pr = torch.mean(fea_pr, dim=0)
    mean_qr = torch.mean(fea_qr, dim=0)
    t1 = 4.0 * torch.mean(
        torch.matmul(fea_pr, mean_pr) * torch.matmul(fea_qr, mean_qr))
    t2 = 4.0 * torch.sum(mean_pr**2) * torch.sum(mean_qr**2)

    # compute the cross-covariance
    var_pqr = t1 - t2
    var_h1 = var_pr - 2.0 * var_pqr + var_qr

    power_criterion = mean_h1 / torch.sqrt(var_h1 + reg)
    return power_criterion
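The cross-covariance term above has a compact interpretation: writing a_i = fea_pr[i]·mean_pr and b_i = fea_qr[i]·mean_qr, one has mean(a) = sum(mean_pr**2) and mean(b) = sum(mean_qr**2), so t1 - t2 = 4(mean(a*b) - mean(a)*mean(b)), i.e., four times the plug-in covariance of a and b. A small numpy check of this identity, with random stand-in matrices:

import numpy as np

rng = np.random.default_rng(0)
fea_pr = rng.normal(size=(200, 5))   # stand-in for an n x Jp feature matrix
fea_qr = rng.normal(size=(200, 7))   # stand-in for an n x Jq feature matrix
mean_pr, mean_qr = fea_pr.mean(axis=0), fea_qr.mean(axis=0)
a, b = fea_pr @ mean_pr, fea_qr @ mean_qr
t1 = 4.0 * np.mean(a * b)
t2 = 4.0 * np.sum(mean_pr**2) * np.sum(mean_qr**2)
# bias=True gives the plug-in (1/n) covariance, matching the means above
assert np.isclose(t1 - t2, 4.0 * np.cov(a, b, bias=True)[0, 1])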
Example #3
    def perform_test(self, dat):
        """
        :param dat: an instance of kmod.data.Data
        """
        with util.ContextTimer() as t:
            alpha = self.alpha
            X = dat.data()
            n = X.shape[0]
            # mean and variance are not yet scaled by \sqrt{n}
            # The variance is the same for both H0 and H1.
            mean_h1, var = self.get_H1_mean_variance(dat)
            stat = (n**0.5) * mean_h1
            null_std = var**0.5
            if null_std <= 1e-6:
                log.l().warning(
                    'SD of the null distribution is too small. Was {}. Will not reject H0.'
                    .format(null_std))
                # An infinite p-value guarantees pval < alpha is False,
                # so H0 is never rejected in this degenerate case.
                pval = np.inf
            else:
                # Assume the mean of the null distribution is 0
                pval = stats.norm.sf(stat, loc=0, scale=null_std)

        results = {
            'alpha': self.alpha,
            'pvalue': pval,
            'test_stat': stat,
            'h0_rejected': pval < alpha,
            'time_secs': t.secs,
        }
        return results
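For intuition, here is a tiny standalone illustration (with made-up numbers) of the normal-approximation p-value computed above: a one-sided, upper-tail test against N(0, var).

from scipy import stats

n, mean_h1, var = 400, 0.05, 0.9     # made-up values
stat = (n ** 0.5) * mean_h1          # sqrt(n)-scaled statistic = 1.0
pval = stats.norm.sf(stat, loc=0, scale=var ** 0.5)
print(pval)                          # P(N(0, var) > stat), roughly 0.146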
Example #4
def problems_folder():
    """
    Return the full path to the problems folder 
    """
    import kmod.config as config
    problems_path = config.resource_configs['problems_path']
    log.l().warning(
        'The function problems_folder() is deprecated. Use prob_model_folder() instead'
    )
    return problems_path
Example #5
    def get_H1_mean_variance(self, dat, return_variance=True):
        """
        Return the mean and variance under H1 of the 
        test statistic = \sqrt{n}(UME(P, R)^2 - UME(Q, R)^2).
        The estimator of the mean is unbiased (can be negative). The variance
        is also valid under H0.

        :returns: (mean, variance)

        If return_variance is False, 
        :returns: mean
        """
        umep = self.umep
        umeq = self.umeq
        # form a two-sample test dataset between datap and dat (data from R)
        Z = dat.data()
        datapr = tstdata.TSTData(self.datap.data(), Z)
        dataqr = tstdata.TSTData(self.dataq.data(), Z)

        # get the feature matrices (correlated)
        fea_pr = umep.feature_matrix(datapr)  # n x Jp
        fea_qr = umeq.feature_matrix(dataqr)  # n x Jq
        assert fea_pr.shape[1] == self.V.shape[0]
        assert fea_qr.shape[1] == self.W.shape[0]

        # umehp = ume_hat(p, r)
        umehp, var_pr = tst.UMETest.ustat_h1_mean_variance(
            fea_pr, return_variance=True, use_unbiased=True)
        umehq, var_qr = tst.UMETest.ustat_h1_mean_variance(
            fea_qr, return_variance=True, use_unbiased=True)

        if var_pr <= 0:
            log.l().warning(
                'Non-positive var_pr detected. Was {}'.format(var_pr))
        if var_qr <= 0:
            log.l().warning(
                'Non-positive var_qr detected. Was {}'.format(var_qr))
        #assert var_pr > 0, 'var_pr was {}'.format(var_pr)
        #assert var_qr > 0, 'var_qr was {}'.format(var_qr)
        mean_h1 = umehp - umehq

        if not return_variance:
            return mean_h1

        # mean features
        mean_pr = np.mean(fea_pr, axis=0)
        mean_qr = np.mean(fea_qr, axis=0)
        t1 = 4.0 * np.mean(np.dot(fea_pr, mean_pr) * np.dot(fea_qr, mean_qr))
        t2 = 4.0 * np.sum(mean_pr**2) * np.sum(mean_qr**2)

        # compute the cross-covariance
        var_pqr = t1 - t2
        var_h1 = var_pr - 2.0 * var_pqr + var_qr
        return mean_h1, var_h1
Example #6
    def perform_test(self, dat):
        """perform the model comparison test and return values computed in a
        dictionary: 
        {
            alpha: 0.01,
            pvalue: 0.0002,
            test_stat: 2.3,
            h0_rejected: True,
            time_secs: ...
        }

        :param dat: an instance of kmod.data.Data
        """
        with util.ContextTimer() as t:
            alpha = self.alpha
            X = dat.data()
            n = X.shape[0]
            # mean and variance are not yet scaled by \sqrt{n}
            # The variance is the same for both H0 and H1.
            mean_h1, var = self.get_H1_mean_variance(dat)
            if not util.is_real_num(var) or var < 0:
                log.l().warning('Invalid H0 variance. Was {}'.format(var))
            stat = (n**0.5) * mean_h1
            # Assume the mean of the null distribution is 0
            pval = stats.norm.sf(stat, loc=0, scale=var**0.5)
            if not util.is_real_num(pval):
                log.l().warning(
                    'p-value is not a real number. Was {}'.format(pval))

        results = {
            'alpha': self.alpha,
            'pvalue': pval,
            'test_stat': stat,
            'h0_rejected': pval < alpha,
            'time_secs': t.secs,
        }
        return results
Example #7
    def get_H1_mean_variance(self, dat):
        """
        Return the mean and variance under H1 of the 
        test statistic = \sqrt{n}(FSSD(p)^2 - FSSD(q)^2).
        The estimator of the mean is unbiased (can be negative). The estimator
        of the variance is biased. The variance is also valid under H0.

        :returns: (mean, variance)
        """
        fssdp = self.fssdp
        fssdq = self.fssdq
        X = dat.data()

        # Feature tensor: n x d x Jp where n = sample size.
        Xip = fssdp.feature_tensor(X)
        n, d, Jp = Xip.shape
        # Feature tensor: n x d x Jq where n = sample size.
        Xiq = fssdq.feature_tensor(X)
        Jq = Xiq.shape[2]
        assert Xiq.shape[0] == n
        assert Xiq.shape[1] == d

        statp, varp = gof.FSSD.ustat_h1_mean_variance(Xip,
                                                      return_variance=True,
                                                      use_unbiased=True)
        if varp <= 0:
            log.l().warning('varp is not positive. Was {}'.format(varp))
        statq, varq = gof.FSSD.ustat_h1_mean_variance(Xiq,
                                                      return_variance=True,
                                                      use_unbiased=True)
        if varq <= 0:
            log.l().warning('varq is not positive. Was {}'.format(varq))
        mean_h1 = statp - statq

        # compute the cross covariance (i.e., diagonal entries of the
        # covariance of the asymptotic joint normal).
        # mu: d*J vector
        Taup = np.reshape(Xip, [n, d * Jp])
        Tauq = np.reshape(Xiq, [n, d * Jq])
        # length-d*Jp vector
        mup = np.mean(Taup, 0)
        muq = np.mean(Tauq, 0)
        varpq = 4.0 * np.mean(np.dot(Taup, mup) * np.dot(
            Tauq, muq)) - 4.0 * np.sum(mup**2) * np.sum(muq**2)
        variance = varp - 2.0 * varpq + varq
        if variance <= 0:
            log.l().warning(
                'variance of the stat is not positive. Was {}'.format(
                    variance))
        return mean_h1, variance
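A quick illustrative check of the flattening convention used above: np.reshape turns each sample's d x J feature slice into a length-d*J row in row-major order, so the mean vectors mup and muq line up with the flattened features.

import numpy as np

n, d, J = 5, 3, 2
Xi = np.arange(n * d * J).reshape(n, d, J)   # stand-in feature tensor, n x d x J
Tau = np.reshape(Xi, [n, d * J])             # n x (d*J), as in the code above
# each row of Tau is the row-major flattening of the corresponding d x J slice
assert np.all(Tau[0] == Xi[0].ravel())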
Example #8
    def _download_data(self, feature_folder):
        for class_name in self.classes:
            filename = '{}.npy'.format(class_name)
            npy_path = self.data_path(feature_folder, filename)
            try:
                if not os.path.exists(npy_path):
                    dir_path = self.data_path(feature_folder)
                    os.makedirs(dir_path, exist_ok=True)

                    relative_path = [
                        'problems', self.dataname, feature_folder, filename
                    ]
                    url = os.path.join(self.data_url, *relative_path)
                    log.l().info('Downloading {}'.format(url))
                    util.download_to(url, npy_path)
                    log.l().info('Saved to {}'.format(npy_path))
            except urllib.error.HTTPError:
                log.l().warning('File does not exist on the server')
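util.download_to is not shown in these examples; a minimal sketch of what such a helper might look like (an assumption, not the actual kmod implementation; no progress reporting or retries):

import urllib.request

def download_to(url, save_path):
    # Fetch the URL and write the raw bytes to save_path.
    with urllib.request.urlopen(url) as response, open(save_path, 'wb') as f:
        f.write(response.read())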
Example #9
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST')
    parser.add_argument('--batch-size',
                        type=int,
                        default=64,
                        metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size',
                        type=int,
                        default=1000,
                        metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs',
                        type=int,
                        default=10,
                        metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr',
                        type=float,
                        default=0.01,
                        metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum',
                        type=float,
                        default=0.5,
                        metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument(
        '--log-interval',
        type=int,
        default=10,
        metavar='N',
        help='how many batches to wait before logging training status')
    parser.add_argument(
        '--data_dir',
        type=str,
        default=glo.data_file('mnist/'),
        help=
        'Full path to the folder containing MNIST training data. MNIST data will be downloaded if it does not already exist.'
    )
    parser.add_argument(
        '--prob_model_dir',
        type=str,
        default=glo.prob_model_folder('mnist_cnn'),
        help='Full path to the folder used to save files related to the mnist-cnn classifier.')

    args = parser.parse_args()
    print('Training options: ')
    pprint.pprint(vars(args), width=5)
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    os.makedirs(args.prob_model_dir, exist_ok=True)
    model_fname = 'mnist_cnn_ep{}_s{}.pt'.format(args.epochs, args.seed)
    model_fpath = os.path.join(args.prob_model_dir, model_fname)
    log.l().info(
        'Will save the trained CNN classifier model to {}'.format(model_fpath))

    log.l().info('Starting training')

    torch.manual_seed(args.seed)
    device = torch.device("cuda" if use_cuda else "cpu")
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

    mnist_folder = args.data_dir
    train_loader = torch.utils.data.DataLoader(datasets.MNIST(
        mnist_folder,
        train=True,
        download=True,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, ))
        ])),
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               **kwargs)

    test_loader = torch.utils.data.DataLoader(datasets.MNIST(
        mnist_folder,
        train=False,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, ))
        ])),
                                              batch_size=args.test_batch_size,
                                              shuffle=True,
                                              **kwargs)

    model = MnistClassifier().to(device)
    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr,
                          momentum=args.momentum)

    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        test(args, model, device, test_loader)

    # save the model
    model.save(model_fpath)
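The train and test helpers called in the loop are defined elsewhere in the script; a minimal sketch of what train might look like for this setup (a standard negative-log-likelihood loop; an assumption, not necessarily the author's implementation):

import torch.nn.functional as F

def train(args, model, device, train_loader, optimizer, epoch):
    # One epoch of standard supervised training.
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        # assumes the classifier outputs log-probabilities (e.g., log_softmax)
        loss = F.nll_loss(model(data), target)
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            print('Epoch {} [{}/{}]\tloss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                loss.item()))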
Example #10
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--n_epochs',
                        type=int,
                        default=30,
                        help='number of epochs of training')
    parser.add_argument('--batch_size',
                        type=int,
                        default=64,
                        help='size of the batches')
    parser.add_argument('--lr',
                        type=float,
                        default=0.0002,
                        help='adam: learning rate')
    parser.add_argument('--b1',
                        type=float,
                        default=0.5,
                        help='adam: decay of first order momentum of gradient')
    parser.add_argument('--b2',
                        type=float,
                        default=0.999,
                        help='adam: decay of second order momentum of gradient')
    parser.add_argument(
        '--n_cpu',
        type=int,
        default=2,
        help='number of cpu threads to use during batch generation')
    parser.add_argument('--latent_dim',
                        type=int,
                        default=100,
                        help='dimensionality of the latent space')
    parser.add_argument(
        '--sample_interval',
        type=int,
        default=400,
        help='Save sample images every this many minibatch updates'
    )
    parser.add_argument(
        '--data_dir',
        type=str,
        default=glo.data_file('mnist/'),
        help=
        'Full path to the folder containing MNIST training data. MNIST data will be downloaded if it does not already exist.'
    )
    parser.add_argument(
        '--prob_model_dir',
        type=str,
        default=glo.prob_model_folder('mnist_dcgan'),
        help=
        'Full path to the folder used to save mnist-dcgan related files, e.g., generated images and the trained model.'
    )

    opt = parser.parse_args()
    opt_dict = vars(opt)
    print('Training options: ')
    pprint.pprint(opt_dict, width=5)

    # training a DCGAN
    dcgan = DCGAN(**opt_dict)
    model_fname = 'mnist_dcgan_ep{}_bs{}.pt'.format(opt.n_epochs,
                                                    opt.batch_size)
    model_fpath = os.path.join(opt.prob_model_dir, model_fname)
    log.l().info('Will save the trained DCGAN model to {}'.format(model_fpath))

    log.l().info('Starting training')
    dcgan.train()

    # save the generator as an object of type kmod.gen.PTNoiseTransformer
    g = dcgan.generator
    f_sample_noise = dcgan.sample_noise

    # get output sizes by sampling one image
    cuda = torch.cuda.is_available()
    Tensor = torch.cuda.FloatTensor if cuda else torch.FloatTensor
    z = Variable(f_sample_noise(1).type(Tensor))
    gen_img = g(z)
    in_out_shapes = (dcgan.latent_dim, gen_img.shape[1:])
    G = gen.PTNoiseTransformerAdapter(module=g,
                                      f_sample_noise=f_sample_noise,
                                      in_out_shapes=in_out_shapes,
                                      tensor_type=Tensor)
    # save() is a method from kmod.net.SerializableModule
    G.save(model_fpath)
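A hypothetical follow-up, reusing g, f_sample_noise, Tensor, and Variable from the example above, to sample a small batch of images from the trained generator:

# Draw 16 noise vectors and push them through the generator.
z_batch = Variable(f_sample_noise(16).type(Tensor))
gen_imgs = g(z_batch)
print(gen_imgs.shape)  # e.g., (16, 1, 28, 28) for MNIST-sized images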