Beispiel #1
0
    def fit_transform(self, X, y):
        """Fit a linear-discriminant projection on ``X`` and return ``X`` projected.

        The within-class scatter ``S_w`` is the sum over classes of the class
        frequency times the covariance of that class's rows; the between-class
        scatter is taken as ``cov(X) - S_w``.  The projection keeps the
        ``self.n_dims`` eigenvectors of ``S_w^{-1} S_b`` whose eigenvalues have
        the largest real part.

        Args:
            X: 2-D tensor whose shape is unpacked as ``(N, d)``.
            y: tensor of class labels used to select rows of ``X``.

        Returns:
            ``X @ self.w``, the data projected onto the selected eigenvectors.
            The projection matrix is also stored on ``self.w``.
        """
        N, d = X.shape
        classes = y.unique()

        # Within-class scatter.  The accumulator is created with X's dtype and
        # device so the in-place additions below also work for float64 or GPU
        # inputs (a default float32 CPU buffer would raise on ``+=``).
        S_w = torch.zeros(d, d, dtype=X.dtype, device=X.device)

        for c in classes:
            x_c = X[y == c]
            pi_c = x_c.shape[0] / N  # empirical class frequency
            # NOTE(review): ``cov`` is defined elsewhere; the transpose suggests
            # it expects variables along rows -- confirm against the helper.
            S_w += pi_c * cov(x_c.t())

        # Between-class scatter: total covariance minus the within-class part.
        C = cov(X.t())
        S_b = C - S_w

        M = S_w.inverse() @ S_b

        # Eigen-decomposition via the legacy ``Tensor.eig`` API, kept for
        # compatibility with the torch version this code targets.
        eigvals, eigvecs = M.eig(True)

        # Rank by column 0 of the eigenvalue tensor (the real parts) and keep
        # the indices of the ``n_dims`` largest.
        indices = eigvals[:, 0].sort(descending=True)[1][:self.n_dims]

        self.w = eigvecs[:, indices]

        return X @ self.w
Beispiel #2
0
def minvar_nls_loo(sim):
    """Leave-one-out estimate built from per-fold covariance eigenvectors.

    For every row ``k`` of ``sim.X`` the covariance of the remaining rows is
    eigendecomposed, and the held-out row contributes one term to the matrix
    ``P`` and the vector ``q``.  The function then solves ``P z = -q`` and
    returns the element-wise reciprocal ``1 / z``.

    ``sim`` must expose ``shape`` (unpacked as ``T, N``) and ``X``.
    """
    T, N = sim.shape
    X = sim.X

    ones = np.ones(N)
    P = np.zeros((N, N))
    q = np.zeros(N)

    for held_out in range(T):
        # Indices of every row except the held-out one.
        kept_rows = [row for row in range(T) if row != held_out]

        _, U = eig(cov(X[kept_rows, :]))

        x_col = X[held_out].reshape(N, 1)
        C = U.T @ x_col @ x_col.T @ U
        alpha = U.T @ ones
        A = np.diag(alpha)

        P += A @ C.T @ C @ A
        q -= A @ C.T @ alpha

    z = np.linalg.solve(P, -q)
    return 1 / z
Beispiel #3
0
 def hook_fn(self, module, input, output):
     """Forward hook: record a covariance matrix and its eigen-decomposition.

     ``output`` is average-pooled over all dimensions from index 2 onward,
     flattened to ``(batch, -1, 1)``, and the first two batch entries are
     concatenated along dim 1 before being passed to ``cov``.  The result is
     appended to ``self.covariance_matrices`` and its ``torch.symeig`` output
     to ``self.eigenvalues``.  ``module`` and ``input`` are unused.
     """
     # Pooling window equals the full extent of the trailing dims of `output`.
     spatial_extent = output.shape[2:]
     pooled = nn.AvgPool2d(spatial_extent)(output)
     flat = pooled.view(pooled.shape[0], -1, 1)
     # NOTE(review): the guard only checks for a non-empty batch, yet index 1
     # is read below -- a batch of size 1 would raise here; confirm intent.
     if len(flat) > 0:
         first, second = flat[0], flat[1]
         flat = torch.cat((first, second), dim=1)
     covariance = cov(flat)
     self.covariance_matrices.append(covariance)
     self.eigenvalues.append(torch.symeig(covariance))
Beispiel #4
0
    def fit_transform(self, X):
        """Fit a principal-component projection on ``X`` and return ``X`` projected.

        The column means are stored on ``self.mean``, the centred data's
        covariance is eigendecomposed, and the ``self.n_dims`` eigenvectors
        whose eigenvalues have the largest real part are stored on ``self.w``.

        Args:
            X: 2-D tensor of samples.  It is NOT modified; centring is done on
                a copy so the caller's tensor keeps its original values.

        Returns:
            The centred data multiplied by ``self.w``.
        """
        self.mean = X.mean(0, keepdim=True)

        # Centre out of place: ``X -= self.mean`` would silently overwrite the
        # caller's tensor (and fails on leaf tensors that require grad).
        X = X - self.mean

        # NOTE(review): ``cov`` is defined elsewhere; the transpose suggests it
        # expects variables along rows -- confirm against the helper.
        C = cov(X.t())

        # Legacy ``Tensor.eig`` API, kept for compatibility with the torch
        # version this code targets.
        eigvals, eigvecs = C.eig(True)

        # Rank by column 0 of the eigenvalue tensor (the real parts).
        indices = eigvals[:, 0].sort(descending=True)[1][:self.n_dims]

        self.w = eigvecs[:, indices]

        return X @ self.w
Beispiel #5
0
    def __init__(self, n=100, N=1000, T=1.00, a=-0.4):
        """Store the simulation settings and precompute the grid and covariance.

        Args:
            n: number of time steps; the step size ``dt`` is ``1 / n``.
            N: number of paths.
            T: maturity.
            a: alpha (noted in the original as ``H - 0.5``).
        """
        # Plain settings
        self.T = T
        self.n = n
        self.a = a
        self.N = N

        # Discretisation derived from n and T
        self.dt = 1.0 / n
        self.s = int(n * T)  # total number of steps up to maturity
        grid = np.linspace(0, T, self.s + 1)
        self.t = grid.reshape(1, -1)  # time grid as a single row

        # Correlation structure for the hybrid scheme
        self.e = np.array([0, 0])
        self.c = utils.cov(a, n)
    def stepTraining(self, batch_x):
        """Run one optimisation step of the VAE on ``batch_x``.

        The batch is moved to ``self.device``, encoded by ``self.E`` into a
        mean and log-variance, re-parametrised, and decoded by ``self.G``.
        The total loss is the reconstruction loss plus the KL term plus
        ``self.lambda_`` times a covariance-matching penalty on the means.

        Returns:
            dict mapping each loss name to its Python scalar value.
        """
        batch_x = batch_x.to(self.device)
        n_samples = batch_x.size(0)
        self.G.train()
        self.E.train()

        with torch.enable_grad():
            r""" 
            \mathbb{E}_{q_{\phi}(z \mid x )} \log p_\theta(x \mid z) 
            - \mathrm{KL}(q_{\phi}(z \mid x ) \| p(z))
            - \lambda (\sum_{i \neq j} cov( \mu(x) )_{ij}^2 + 10 * \sum_i ( cov( \mu(x) )_{ii} - 1)^2 )
            """

            # Encoder output is split in half along dim 1: mean | log-variance.
            mu, log_var = self.E(batch_x).chunk(2, dim=1)
            # Sample a latent code and decode it.
            out = self.G(self._reparametrize(mu, log_var))

            # Standard VAE terms, both averaged over the batch.
            summed_sq_error = F.mse_loss(out, batch_x, reduction='sum')
            reconstruction_loss = summed_sq_error / n_samples
            disentangled_loss = self._kl_divergence(mu, log_var) / n_samples

            # Moment matching: squared upper-triangular covariances plus a
            # ten-times weighted penalty pulling each variance towards 1.
            off_diag_penalty = utils.cov(mu.t()).triu(diagonal=1).pow(2).sum()
            unit_var_penalty = (mu.var(0) - 1).pow(2).sum()
            cov_matching_loss = off_diag_penalty + 10 * unit_var_penalty

            total_loss = reconstruction_loss + disentangled_loss + self.lambda_ * cov_matching_loss

            self.vae_optimizer.zero_grad()
            total_loss.backward()
            self.vae_optimizer.step()

        named_losses = (
            ('reconstruction_loss', reconstruction_loss),
            ('disentangled_loss', disentangled_loss),
            ('cov_matching_loss', cov_matching_loss),
            ('total_loss', total_loss),
        )
        return {name: value.item() for name, value in named_losses}
# Accumulators for the t=4 and t=5 settings (this excerpt starts mid-script;
# the matching lists for the other settings are presumably defined above).
err_t4 = []
err_t5 = []

# One experiment per sample size in n_l.
for i, n in enumerate(n_l):
    args.n = int(n)
    # Per-sample-size distances to the true covariance, one entry per trial.
    non_pr = []
    covs_t1 = []
    covs_t2 = []
    covs_t3 = []
    covs_t4 = []
    covs_t5 = []
    print(n)
    # NOTE(review): this inner loop reuses `i`, shadowing the outer enumerate
    # index for the remainder of the outer iteration.
    for i in range(100):
        if i % 50 == 0: print(i)
        # Draw args.n samples from the reference Gaussian.
        X = torch.distributions.MultivariateNormal(dist_mean, dist_cov).sample((args.n,))
        # Baseline: distance between utils.cov of the sample and the true covariance.
        non_pr.append(mahalanobis_dist(utils.cov(X.clone()), dist_cov))

        # Each setting below changes args.t / args.rho and re-runs cov_est on a
        # fresh clone of the same sample.
        args.t = 1
        args.rho = Ps1
        covs_t1.append(mahalanobis_dist(cov_est(X.clone(), args), dist_cov))
        
        args.t = 2
        args.rho = Ps2
        covs_t2.append(mahalanobis_dist(cov_est(X.clone(), args), dist_cov))
        
        args.t = 3
        args.rho = Ps3
        covs_t3.append(mahalanobis_dist(cov_est(X.clone(), args), dist_cov))
          
        # (excerpt is cut off after the t=4 settings are assigned)
        args.t = 4
        args.rho = Ps4
Beispiel #8
0
    def cov_est(self):
        """Compute the sample covariance of ``self.X`` and decompose it.

        The covariance is stored on ``self.S``; the two values returned by
        ``eig`` are stored on ``self.lam`` and ``self.U`` respectively.
        """
        sample_cov = cov(self.X)
        self.S = sample_cov

        eigenvalues, eigenvectors = eig(sample_cov)
        self.lam = eigenvalues
        self.U = eigenvectors
Beispiel #9
0
def doc_word_embed_content_noise(content_path,
                                 noise_path,
                                 whiten_path=None,
                                 content_lines=None,
                                 noise_lines=None,
                                 opt=None):
    """Embed the words of a content document and a noise document together.

    Both documents are embedded with ``doc_word_embed_sen``.  The noise
    document is embedded with the content words passed as the second argument
    (presumably a "do not add" set so shared words are not duplicated --
    confirm against ``doc_word_embed_sen``).  The noise rows are stacked after
    the content rows.

    When whitening is enabled (``opt.whiten``, or ``True`` if ``opt`` is
    ``None``) and ``whiten_path`` is given, the stacked embeddings are
    transformed using the covariance of the embeddings of the document at
    ``whiten_path``.

    Args:
        content_path: path of the content (inlier) document.
        noise_path: path of the noise (outlier) document.
        whiten_path: optional path of the document used to estimate the
            whitening covariance.
        content_lines: forwarded to ``doc_word_embed_sen`` for the content.
        noise_lines: forwarded to ``doc_word_embed_sen`` for the noise.
        opt: optional options object; only ``opt.whiten`` is read.

    Returns:
        ``(words_ar, word_embeds, noise_idx)``: the content words followed by
        the noise words, the (optionally whitened) embeddings in the same
        order, and a ``LongTensor`` on ``utils.device`` holding the row
        indices of the noise embeddings.
    """
    no_add_set = set()
    doc_word_embed_f = doc_word_embed_sen
    content_words_ar, content_word_embeds = doc_word_embed_f(
        content_path, no_add_set, content_lines=content_lines)
    noise_words_ar, noise_word_embeds = doc_word_embed_f(
        noise_path, set(content_words_ar), content_lines=noise_lines)
    # The content list is extended in place and doubles as the combined list.
    content_words_ar.extend(noise_words_ar)
    words_ar = content_words_ar
    word_embeds = torch.cat((content_word_embeds, noise_word_embeds), dim=0)

    # Temporary behaviour: normalise by the inlier covariance by default.
    whitening = opt.whiten if opt is not None else True
    if whitening and whiten_path is not None:
        # Use a document from the inlier topic to estimate the whitening map.
        _, whiten_word_embeds = doc_word_embed_f(whiten_path, set())

        whiten_cov = utils.cov(whiten_word_embeds)
        # The NumPy-style ``linalg`` routines need a host array; handing them
        # a CUDA tensor raises, so convert explicitly (as the rest of this
        # function already does before calling ``linalg.svd``).
        if torch.is_tensor(whiten_cov):
            whiten_cov_np = whiten_cov.cpu().numpy()
        else:
            whiten_cov_np = np.asarray(whiten_cov)

        fast_whiten = False  # flip to True to use the truncated-SVD variant
        if not fast_whiten:
            U, D, _ = linalg.svd(whiten_cov_np)

            # Whitening map: pinv(diag(sqrt(D))) @ U^T.
            cov_inv = torch.from_numpy(
                np.matmul(linalg.pinv(np.diag(np.sqrt(D))),
                          U.transpose())).to(utils.device)

            # Applied on the left of the transposed data, then transposed back.
            word_embeds = torch.mm(cov_inv, word_embeds.t()).t()
        else:
            # Faster whitening: rescale only along the top 30 components.
            sv = decom.TruncatedSVD(30)
            sv.fit(whiten_cov_np)
            top_evals, top_evecs = sv.singular_values_, sv.components_
            top_evals = torch.from_numpy(1 / np.sqrt(top_evals)).to(
                utils.device)
            top_evecs = torch.from_numpy(top_evecs).to(utils.device)

            X = word_embeds
            # Projection of X onto the span of the top components.
            projected = torch.mm(top_evecs.t() / (top_evecs**2).sum(-1),
                                 torch.mm(top_evecs, X.t())).t()

            # (d x k) * (k x k) * (k x d) * (d x n): rescale the components
            # along the top directions and add back the untouched residual.
            word_embeds = torch.mm(torch.mm(top_evecs.t(), top_evals.diag()),
                                   torch.mm(top_evecs,
                                            X.t())).t() + (X - projected)

    noise_idx = torch.LongTensor(
        list(range(len(content_word_embeds),
                   len(word_embeds)))).to(utils.device)

    debug_top_dir = False  # flip to True to inspect projections interactively
    if debug_top_dir:
        w1 = content_word_embeds - word_embeds.mean(0)
        w2 = noise_word_embeds - word_embeds.mean(0)
        mean_diff = ((w1.mean(0) - w2.mean(0))**2).sum().sqrt()
        w1_norm = (w1**2).sum(-1).sqrt().mean()
        w2_norm = (w2**2).sum(-1).sqrt().mean()
        X = word_embeds - word_embeds.mean(0)
        cov = torch.mm(X.t(), X) / word_embeds.size(0)
        U, D, V_t = linalg.svd(cov.cpu().numpy())
        U1 = torch.from_numpy(U[1]).to(utils.device)
        mean1_dir = w1.mean(0)
        mean1_proj = (mean1_dir * U1).sum()
        mean2_dir = w2.mean(0)
        mean2_proj = (mean2_dir * U1).sum()
        diff_proj = ((mean1_dir - mean2_dir) * U1).sum()

        # Plot histograms of the projections, then drop into the debugger so
        # the intermediate values above can be inspected.
        proj1 = (w1 * U1).sum(-1)
        proj2 = (w2 * U1).sum(-1)
        utils.hist(proj1, 'inliers')
        utils.hist(proj2, 'outliers')
        pdb.set_trace()
    return words_ar, word_embeds, noise_idx
Beispiel #10
0
def alpha16(df):
    """
    Alpha#16
    (-1 * rank(covariance(rank(high), rank(volume), 5)))

    Reads ``df.high`` and ``df.volume``; ranking and covariance come from
    the ``u`` helper module.
    """
    ranked_high = u.rank(df.high)
    ranked_volume = u.rank(df.volume)
    windowed_cov = u.cov(ranked_high, ranked_volume, 5)
    return -1 * u.rank(windowed_cov)
Beispiel #11
0
def alpha13(df):
    """
    Alpha#13
    (-1 * rank(covariance(rank(close), rank(volume), 5)))

    Reads ``df.close`` and ``df.volume``; ranking and covariance come from
    the ``u`` helper module.
    """
    ranked_close = u.rank(df.close)
    ranked_volume = u.rank(df.volume)
    windowed_cov = u.cov(ranked_close, ranked_volume, 5)
    return -1 * u.rank(windowed_cov)
Beispiel #12
0
    def sample_batch(self, batch_size, target_rng=255.):
        """
        Sample a batch of synthetic images and their labels.

        batch_size: (int) size of batch
        target_rng: converted to a float tensor if it is not already a
                    tensor; not otherwise used in this method

        Returns

        batch:  (tensor) the images, repeated to 3 channels on dim 1 and
                normalized with self.norm_mean / self.norm_std
        labels: (tensor) label_batch.squeeze()

        Object counts, object sizes, dynamic range and location scale are each
        drawn from a distribution built by self.sample_lambda0_r from one
        entry of self.dists.

        Hold object properties constant for now across +/- samples. Fix later.
        """
        if not torch.is_tensor(target_rng):
            target_rng = torch.tensor(target_rng).float()
        # Output buffers: one image (or an image pair when siamese) per sample.
        if self.siamese:
            image_batch = torch.zeros(
                (batch_size, self.img_size, self.img_size, 2),
                requires_grad=self.batch_grad)
        else:
            image_batch = torch.zeros(
                (batch_size, self.img_size, self.img_size),
                requires_grad=self.batch_grad)
        image_batch = image_batch.to(self.device)
        label_batch = torch.zeros((batch_size, 1),
                                  dtype=torch.long,
                                  device=self.device)
        # --- Number of objects (self.dists[0]) ---
        num_object_ps = self.sample_lambda0_r(batch_size=batch_size,
                                              d=self.dists[0])
        num_objects = num_object_ps.rsample([batch_size]).abs()
        if self.dists[0]['family'] == 'categorical':
            num_objects = self.st_op(num_objects)
            obj_cat = torch.arange(1,
                                   num_objects.shape[-1] + 1,
                                   dtype=num_objects.dtype,
                                   requires_grad=True).to(self.device)
            obj_cat = obj_cat.reshape(1, -1, 1, 1)
            obj_cat = obj_cat.repeat(batch_size, 1, 1, 1)
            num_objects = (obj_cat * num_objects.reshape(
                batch_size, self.max_objects, 1, 1)).sum(1, keepdims=True)
            # Clamp to [0, 1] to turn the count into a per-slot mask.
            num_objects = torch.abs(
                torch.clamp(-(obj_cat - self.min_objects - num_objects), 0, 1))
        elif self.dists[0]['family'] == 'relaxed_bernoulli':
            # Re-sampled here without the .abs() applied above.
            num_objects = num_object_ps.rsample([batch_size])
            num_objects = self.st_op(num_objects)
            # The first min_objects slots are always switched on.
            num_objects[:, :self.min_objects] = 1.
        elif ('gaussian' in self.dists[0]['family']
              or 'normal' in self.dists[0]['family']):
            # Forward value is the rounded count; the detached difference
            # leaves the gradient flowing through the unrounded sample.
            num_objects = (num_objects.round() -
                           num_objects).detach() + num_objects
            num_objects = torch.clamp(num_objects.reshape(-1, 1, 1, 1),
                                      self.min_objects, self.max_objects)
            obj_cat = torch.arange(1,
                                   self.max_objects + 1,
                                   dtype=num_objects.dtype,
                                   requires_grad=True).to(self.device)
            obj_cat = obj_cat.reshape(1, -1, 1, 1)
            obj_cat = obj_cat.repeat(batch_size, 1, 1, 1)
            num_objects = torch.abs(
                torch.clamp(-(obj_cat - self.min_objects + 1 - num_objects), 0,
                            1))  # noqa
        else:
            raise NotImplementedError(self.dists[0]['family'])
        # --- Dynamic range (self.dists[2]), squashed with tanh ---
        dynamic_range_ps = self.sample_lambda0_r(
            batch_size=batch_size,
            d=self.dists[2],
            offset=self.min_dynamic_range)  # Dist object... used to have + 2
        dynamic_range = torch.tanh(
            dynamic_range_ps.rsample(
                (batch_size, self.max_objects, self.img_size, self.img_size)))
        # --- Object sizes (self.dists[1]) ---
        object_size_ps = self.sample_lambda0_r(batch_size=batch_size,
                                               d=self.dists[1],
                                               offset=1)
        if self.one_object_size_per_batch:
            object_sizes = object_size_ps.rsample([batch_size]).abs()
            if self.dists[1]['family'] == 'categorical':
                object_sizes = self.argmax(self.st_op(object_sizes))
        else:
            object_sizes = object_size_ps.rsample(
                [batch_size, self.max_objects]).abs()
            if self.dists[1]['family'] == 'categorical':
                object_sizes = self.st_op(object_sizes)
                object_sizes = self.argmax(object_sizes)
            elif ('gaussian' in self.dists[1]['family']
                  or 'normal' in self.dists[1]['family']):
                # Same rounding trick as for the object count above.
                object_sizes = (object_sizes.round() -
                                object_sizes).detach() + object_sizes
            else:
                raise NotImplementedError(self.dists[1]['family'])
        object_sizes = object_sizes + self.min_object_size
        object_radiuses = torch.clamp(object_sizes, self.min_object_size,
                                      self.max_object_size)
        # Pixel coordinate grids, repeated once per object slot.
        y_range = torch.arange(0, self.img_size).to(self.device)  # v1
        x_range = torch.arange(0, self.img_size).to(self.device)  # v1
        yys, xxs = torch.meshgrid(y_range, x_range)  # v1
        yys = yys.unsqueeze(0).repeat(self.max_objects, 1, 1).float()  # v1
        xxs = xxs.unsqueeze(0).repeat(self.max_objects, 1, 1).float()  # v1
        # Distribution for the location scale (self.dists[3]).
        gau = self.sample_lambda0_r(d=self.dists[3], batch_size=batch_size)

        # Object location grids -- See (1) below for explanation
        cyys, cxxs = torch.meshgrid(torch.arange(self.grid_res),
                                    torch.arange(self.grid_res))
        # Upper bound used when clamping object coordinates below.
        adj_ceil = self.img_size - self.max_object_size
        # y_offset = (self.img_size - cyys.max()) / 2
        # x_offset = (self.img_size - cxxs.max()) / 2
        # cyys = cyys + y_offset
        # cxxs = cxxs + x_offset
        loc_grid = torch.stack([cyys.reshape(-1),
                                cxxs.reshape(-1)]).to(self.device)
        for bidx in range(batch_size):
            # Sample size of objects
            object_radius = object_radiuses[bidx]
            # Random binary label for this sample.
            lab = (torch.rand(1) > .5).float()
            if lab == 1 and not self.one_object_size_per_batch:
                object_radius[1] = object_radius[0]  # Copy the sizes

            # (1) Create a grid of locations, where objects will be placed
            # Random uniform per location, then select the self.max_objects top locations
            # Scale the positions of the grid (plus random jitter)
            # Choose the selected object locations in the masking step below
            positions = torch.rand(loc_grid.shape[1],
                                   requires_grad=False,
                                   device=self.device)
            # Indices of the max_objects smallest random draws.
            position_thresh = torch.argsort(positions)[:self.max_objects]

            # Gradient for spatial scale comes from here:
            # coords = loc_grid[position_thresh]
            loc_scale = gau.rsample([2])  # .abs()
            # Forward value is the ceiling; gradient flows through loc_scale.
            loc_scale = (loc_scale.ceil() - loc_scale).detach() + loc_scale
            coords = loc_grid * loc_scale.reshape(-1, 1)
            max_coords = coords.max(1)[0]
            # Offsets that centre the scaled grid inside the image.
            y_offset = ((self.img_size - max_coords[0]) / 2).floor()
            x_offset = ((self.img_size - max_coords[1]) / 2).floor()
            coords = coords[:, position_thresh] + torch.stack(
                (y_offset, x_offset)).reshape(-1, 1)
            coords = torch.clamp(coords, 0, adj_ceil)

            # Draw objects
            by = coords[0].reshape(self.max_objects, 1, 1)
            bx = coords[1].reshape(self.max_objects, 1, 1)
            # Squared distance of every pixel to each object's centre.
            obj_d = torch.pow(yys - by, 2) + torch.pow(xxs - bx, 2)
            if self.one_object_size_per_batch:
                obj_mask = torch.clamp(
                    ((object_radius.reshape(1, 1, 1) + 1) - obj_d), 0, 1)
            else:
                obj_mask = torch.clamp(
                    ((object_radius.reshape(self.max_objects, 1, 1) + 1) -
                     obj_d), 0, 1)
            obj = obj_mask * dynamic_range[bidx]
            if lab == 1:
                # Copy the texture of object 0 onto the pixels of object 1.
                q_idx = torch.nonzero(obj[0])  # Query
                t_idx = torch.nonzero(obj[1])  # Target
                same_tex = dynamic_range[bidx, 0, q_idx[:, 0], q_idx[:, 1]]
                obj[1, t_idx[:, 0], t_idx[:, 1]] = same_tex

            # Mask to only show num_objects locations
            if self.dists[0]['family'] == 'categorical':
                obj = obj * num_objects[bidx]
            else:
                obj = obj * num_objects[bidx].reshape(self.max_objects, 1, 1)

            # Aggregate the batch
            if self.siamese:
                image_batch[bidx, ..., 0] = obj[0]
                image_batch[bidx, ..., 1] = obj[1:].sum(0)
            else:
                image_batch[bidx] = obj.sum(0)

            # Change task to SR if requested
            if self.task == 'sr':
                # The label is recomputed from the orientation of the leading
                # eigenvector of the covariance of the visible coordinates.
                masked_coords = coords.detach() * num_objects[bidx].detach(
                ).squeeze(-1)  # noqa
                masked_coords = masked_coords[torch.nonzero(
                    masked_coords.sum(-1))]  # noqa
                masked_coords = masked_coords.reshape(-1, 2)
                es, vs = torch.eig(utils.cov(masked_coords), eigenvectors=True)
                # theta = torch.atan2(v[1, 0], v[0, 0]) * (180. / math.pi)
                sorted_es = torch.argsort(es[:, 0], dim=0,
                                          descending=True)  # Only real part
                vs = vs[:, sorted_es]  # Column vectors
                theta = torch.atan2(torch.abs(vs[1, 0]),
                                    vs[0, 0]) * (180. / math.pi)
                lab = 0
                # NOTE(review): the first atan2 argument is an absolute value,
                # so theta lies in [0, 180] and the 225-315 clause looks
                # unreachable -- confirm intent.
                if theta >= 45 and theta < 135 or theta >= 225 and theta < 315:
                    lab = 1  # what is the elegant way of doing this ^^
            label_batch[bidx] = lab

        # Hardcode the normalization
        # Repeat the batch 3 times along a new dim 1 (channels), map values
        # with (x + 1) / 2, then subtract the mean and divide by the std.
        image_batch = torch.repeat_interleave(image_batch.unsqueeze(1),
                                              3,
                                              dim=1)
        image_batch = (image_batch + 1.) / 2.
        image_batch = image_batch - self.norm_mean
        image_batch = image_batch / self.norm_std

        # image_batch = utils.normalize_fun(
        #     image_batch,
        #     reshape=self.reshape,
        #     mean=self.norm_mean,
        #     std=self.norm_std)
        # # Convert labels to one-hot
        # y = torch.eye(self.num_classes).to(self.device)
        # label_batch = y[label_batch].squeeze(1).long()
        del yys  # v1
        del xxs  # v1
        del y_range, x_range
        return image_batch, label_batch.squeeze()
Beispiel #13
0
def main():
    """Train an energy-based model (EBM) on 2-D toy data.

    Command-line options select the dataset, the objective (``--mode lsd``
    trains the EBM adversarially against a critic network; ``--mode sm``
    uses a Hutchinson-estimated score-matching loss) and the logging,
    checkpoint and visualisation frequencies.

    Side effects: creates ``--save`` if needed, writes a log there, saves
    checkpoints to ``<save>/checkpt.pth`` (every ``--save_freq`` iterations
    and after the final iteration) and figures to ``<save>/figs``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--data',
                        choices=[
                            'swissroll', '8gaussians', 'pinwheel', 'circles',
                            'moons', '2spirals', 'checkerboard', 'rings'
                        ],
                        type=str,
                        default='moons')
    parser.add_argument('--niters', type=int, default=10000)
    parser.add_argument('--batch_size', type=int, default=100)
    parser.add_argument('--test_batch_size', type=int, default=1000)
    parser.add_argument('--lr', type=float, default=1e-3)
    parser.add_argument('--weight_decay', type=float, default=0)
    parser.add_argument('--critic_weight_decay', type=float, default=0)
    parser.add_argument('--save', type=str, default='/tmp/test_lsd')
    parser.add_argument('--mode',
                        type=str,
                        default="lsd",
                        choices=['lsd', 'sm'])
    parser.add_argument('--viz_freq', type=int, default=100)
    parser.add_argument('--save_freq', type=int, default=10000)
    parser.add_argument('--log_freq', type=int, default=100)
    parser.add_argument('--base_dist', action="store_true")
    parser.add_argument('--c_iters', type=int, default=5)
    parser.add_argument('--l2', type=float, default=10.)
    parser.add_argument('--exact_trace', action="store_true")
    parser.add_argument('--n_steps', type=int, default=10)
    args = parser.parse_args()

    # Logger
    utils.makedirs(args.save)
    logger = utils.get_logger(logpath=os.path.join(args.save, 'logs'),
                              filepath=os.path.abspath(__file__))
    logger.info(args)

    # Fit a Gaussian to an initial batch of training data.
    init_size = 1000
    init_batch = sample_data(args, init_size).requires_grad_()
    mu, std = init_batch.mean(0), init_batch.std(0)
    base_dist = distributions.Normal(mu, std)

    # Neural networks: the critic maps 2-D inputs to 2-D outputs; ``net`` is
    # wrapped by the EBM.
    critic = networks.SmallMLP(2, n_out=2)
    net = networks.SmallMLP(2)

    ebm = EBM(net, base_dist if args.base_dist else None)
    ebm.to(device)
    critic.to(device)

    # Sampler used for visualisation.  ``sample((n,))`` is the documented
    # replacement for the deprecated ``sample_n(n)`` and draws the same shape.
    init_fn = lambda: base_dist.sample((args.test_batch_size,))
    cov = utils.cov(init_batch)
    sampler = HMCSampler(ebm,
                         .3,
                         5,
                         init_fn,
                         device=device,
                         covariance_matrix=cov)

    logger.info(ebm)
    logger.info(critic)

    # Optimizers
    optimizer = optim.Adam(ebm.parameters(),
                           lr=args.lr,
                           weight_decay=args.weight_decay,
                           betas=(.0, .999))
    critic_optimizer = optim.Adam(critic.parameters(),
                                  lr=args.lr,
                                  betas=(.0, .999),
                                  weight_decay=args.critic_weight_decay)

    time_meter = utils.RunningAverageMeter(0.98)
    loss_meter = utils.RunningAverageMeter(0.98)

    ebm.train()
    end = time.time()
    for itr in range(args.niters):

        optimizer.zero_grad()
        critic_optimizer.zero_grad()

        x = sample_data(args, args.batch_size)
        x.requires_grad_()

        if args.mode == "lsd":
            # Our method

            # Compute dlogp(x)/dx
            logp_u = ebm(x)
            sq = keep_grad(logp_u.sum(), x)
            fx = critic(x)
            # Compute (dlogp(x)/dx)^T * f(x)
            sq_fx = (sq * fx).sum(-1)

            # Compute/estimate Tr(df/dx)
            if args.exact_trace:
                tr_dfdx = exact_jacobian_trace(fx, x)
            else:
                tr_dfdx = approx_jacobian_trace(fx, x)

            stats = (sq_fx + tr_dfdx)
            loss = stats.mean()  # estimate of S(p, q)
            l2_penalty = (
                fx * fx).sum(1).mean() * args.l2  # penalty to enforce f \in F

            # Adversarial schedule: out of every (c_iters + 1) iterations the
            # critic is updated c_iters times (maximising the loss, with the
            # L2 penalty) and the EBM once (minimising it).
            if args.c_iters > 0 and itr % (args.c_iters + 1) != 0:
                (-1. * loss + l2_penalty).backward()
                critic_optimizer.step()
            else:
                loss.backward()
                optimizer.step()

        elif args.mode == "sm":
            # Score matching for reference
            fx = ebm(x)
            dfdx = torch.autograd.grad(fx.sum(),
                                       x,
                                       retain_graph=True,
                                       create_graph=True)[0]
            eps = torch.randn_like(dfdx)  # use Hutchinson here as well
            epsH = torch.autograd.grad(dfdx,
                                       x,
                                       grad_outputs=eps,
                                       create_graph=True,
                                       retain_graph=True)[0]

            trH = (epsH * eps).sum(1)
            norm_s = (dfdx * dfdx).sum(1)

            loss = (trH + .5 * norm_s).mean()
            loss.backward()
            optimizer.step()
        else:
            assert False

        loss_meter.update(loss.item())
        time_meter.update(time.time() - end)

        if itr % args.log_freq == 0:
            log_message = (
                'Iter {:04d} | Time {:.4f}({:.4f}) | Loss {:.4f}({:.4f})'.
                format(itr, time_meter.val, time_meter.avg, loss_meter.val,
                       loss_meter.avg))
            logger.info(log_message)

        # ``itr`` never reaches ``args.niters`` inside ``range(args.niters)``,
        # so the final iteration is ``niters - 1``; comparing against that
        # guarantees the fully trained model is checkpointed.
        is_last_iter = itr == args.niters - 1
        if itr % args.save_freq == 0 or is_last_iter:
            # Save from the CPU, then move the model back to the device.
            ebm.cpu()
            utils.makedirs(args.save)
            torch.save({
                'args': args,
                'state_dict': ebm.state_dict(),
            }, os.path.join(args.save, 'checkpt.pth'))
            ebm.to(device)

        if itr % args.viz_freq == 0:
            # Plot data samples, model samples and the critic embedding.
            plt.clf()
            npts = 100
            p_samples = toy_data.inf_train_gen(args.data, batch_size=npts**2)
            q_samples = sampler.sample(args.n_steps)

            ebm.cpu()

            x_enc = critic(x)
            xes = x_enc.detach().cpu().numpy()
            # Rescale the embedding into the range [-4, 4] for plotting.
            trans = xes.min()
            scale = xes.max() - xes.min()
            xes = (xes - trans) / scale * 8 - 4

            plt.figure(figsize=(4, 4))
            visualize_transform(
                [p_samples, q_samples.detach().cpu().numpy(), xes],
                ["data", "model", "embed"], [ebm], ["model"],
                npts=npts)

            fig_filename = os.path.join(args.save, 'figs',
                                        '{:04d}.png'.format(itr))
            utils.makedirs(os.path.dirname(fig_filename))
            plt.savefig(fig_filename)
            plt.close()

            ebm.to(device)
        end = time.time()

    logger.info('Training has finished, can I get a yeet?')