def psi_init(L, hidden_layer_sizes, nout, Form='euler'):
    nat_ording = True
    model_r = MADE(L, hidden_layer_sizes, nout,
                   num_masks=1, natural_ordering=nat_ording)
    model_i = MADE(L, hidden_layer_sizes, nout,
                   num_masks=1, natural_ordering=nat_ording)
    ppsi = Psi(model_r, model_i, L, form=Form, autoregressive=True)
    return ppsi

def MakeMade(scale,
             cols_to_train,
             seed,
             dataset,
             fixed_ordering=None,
             special_orders=[],
             layers=4,
             residual=False,
             dropout=False,
             per_row_dropout=False,
             prefix_dropout=False,
             fixed_dropout_ratio=False,
             disable_learnable_unk=False,
             input_no_emb_if_leq=True,
             embs_tied=False,
             embed_size=32):
    # TODO: if passed in a single heuristic order, be sure to InvertOrder().
    num_masks = 1
    if len(special_orders):
        num_masks = len(special_orders)

    model = MADE(
        nin=len(cols_to_train),
        hidden_sizes=[scale] * layers if layers > 0 else [512, 256, 512, 128, 1024],
        nout=sum([c.DistributionSize() for c in cols_to_train]),
        input_bins=[c.DistributionSize() for c in cols_to_train],
        input_encoding="embed",
        output_encoding="embed",
        seed=seed,
        do_direct_io_connections=False,
        natural_ordering=False if seed is not None else True,
        residual_connections=residual,
        embed_size=embed_size,
        fixed_ordering=fixed_ordering,
        dropout_p=dropout or per_row_dropout or prefix_dropout,
        fixed_dropout_p=fixed_dropout_ratio,
        num_masks=num_masks,
        per_row_dropout_p=per_row_dropout,
        prefix_dropout=prefix_dropout,
        disable_learnable_unk=disable_learnable_unk,
        input_no_emb_if_leq=input_no_emb_if_leq,
        embs_tied=embs_tied,
    ).to(get_device())

    if len(special_orders):
        print('assigning to model.orderings:')
        print(special_orders)
        model.orderings = special_orders

    return model

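# A hypothetical call sketch, not from the original file: `DemoColumn` and the
# argument values below are made up purely to illustrate the MakeMade interface;
# the only assumption is the DistributionSize() method already used above.
class DemoColumn:
    def __init__(self, distribution_size):
        self._size = distribution_size

    def DistributionSize(self):
        return self._size


# cols = [DemoColumn(10), DemoColumn(100), DemoColumn(7)]
# model = MakeMade(scale=256, cols_to_train=cols, seed=0, dataset='demo')
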
def MakeModel(scale, cols_to_train, seed, fixed_ordering=None):
    if args.inv_order:
        print('Inverting order!!!!!!!!!!')
        fixed_ordering = InvertOrder(fixed_ordering)

    return MADE(
        nin=len(cols_to_train),
        hidden_sizes=[scale] * 4,
        nout=sum([c.DistributionSize() for c in cols_to_train]),
        input_bins=[c.DistributionSize() for c in cols_to_train],
        input_encoding="embed",
        output_encoding="embed",
        embed_size=64,
        # input_no_emb_if_leq=False,
        embs_tied=args.embs_tied,
        input_no_emb_if_leq=True,
        seed=seed,
        natural_ordering=False if seed is not None else True,
        residual_connections=args.residual,
        fixed_ordering=fixed_ordering,
        do_direct_io_connections=args.direct_io,
        dropout_p=args.dropout,
    ).to(DEVICE)

def __init__(self, n_z, n_h, n_made):
    super(InverseAutoregressiveBlock, self).__init__()
    # made: take as inputs: z_{t-1}, h; output: m_t, s_t
    self.made = MADE(num_input=n_z, num_output=n_z * 2,
                     num_hidden=n_made, num_context=n_h)
    self.sigmoid_arg_bias = nn.Parameter(torch.ones(n_z) * 2)

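# A minimal sketch (not part of the original module) of how an inverse
# autoregressive flow step built on this block is typically applied, following
# Kingma et al. (2016). It assumes made(z, h) returns the concatenated
# pre-activations (m_t, s_t) along the last dimension, as suggested by
# num_output=n_z * 2 above; iaf_step and its arguments are illustrative names.
import torch


def iaf_step(made, sigmoid_arg_bias, z, h):
    m, s = torch.chunk(made(z, h), chunks=2, dim=-1)
    gate = torch.sigmoid(s + sigmoid_arg_bias)    # bias of ~2 keeps the gate near 1 at init
    z_new = gate * z + (1 - gate) * m             # gated autoregressive update of z
    log_det = torch.log(gate + 1e-8).sum(dim=-1)  # log |det dz_new/dz| of the transform
    return z_new, log_det
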
def __init__(self, data_dim, mask, hidden_dims, n_hidden, made_rev):
    super(NewCouplingLayer, self).__init__()
    self.mask = mask == 1
    self.made_rev = made_rev
    self.n_1 = np.ceil(data_dim / 2).astype(int)
    self.n_2 = np.floor(data_dim / 2).astype(int)
    self.made = MADE(n_in=self.n_1, hidden_dims=[n_hidden], gaussian=True)
    self.scale = ScaleTranslate(
        self.n_1, self.n_2, n_hidden, hidden_dims, actfun="tanh"
    )
    self.translate = ScaleTranslate(self.n_1, self.n_2, n_hidden, hidden_dims)

def execute_one_round():
    args = {'data_name': config.data_name,
            'train_size': config.train_size,
            'valid_size': config.validation_size,
            'test_size': config.test_size}
    if args['data_name'] == 'grid':
        args['width'] = config.width
        args['height'] = config.height
    elif args['data_name'] == 'Boltzmann':
        args['n'] = config.n_boltzmann
        args['m'] = config.m_boltzmann
    elif args['data_name'] == 'k_sparse':
        args['n'] = config.n_of_k_sparse
        args['sparsity_degree'] = config.sparsity_degree
    elif args['data_name'] == 'BayesNet':
        args['n'] = config.n_of_BayesNet
        args['par_num'] = config.par_num_of_BayesNet
    elif args['data_name'].startswith('mnist'):
        args['digit'] = config.digit

    data = get_data(args)
    print('data loaded')

    # if config.generate_samples:
    #     n = len(data['train_data'])
    #     for i in range(n):
    #         im = Image.fromarray(255 * data['train_data'][i, :].reshape([config.height, config.width]))
    #         im.convert('RGB').save(config.generated_samples_dir + 'train_' + str(i) + '.png')

    model = MADE()
    print('model initiated')
    model.fit(data['train_data'], data['valid_data'])

    pred = model.predict(data['test_data'])
    res = dict()
    res['NLL'], res['KL'] = evaluate(pred, data['test_data_probs'])
    print('KL: ' + str(res['KL']), file=sys.stderr)
    print('NLL: ' + str(res['NLL']), file=sys.stderr)
    sys.stderr.flush()
    res['train_end_epochs'] = model.train_end_epochs
    res['num_of_connections'] = model.num_of_connections()

    if config.generate_samples:
        n = config.num_of_generated_samples_each_execution
        generated_samples = model.generate(n).reshape(n, config.height, config.width)
        for i in range(n):
            im = Image.fromarray(255 * generated_samples[i, :, :])
            im.convert('RGB').save(config.generated_samples_dir + str(i) + '.png')

    return res

def MakeMadeDmv(cols_to_train, seed, fixed_ordering=None):
    if args.inv_order:
        print('Inverting order!!!!!!!!!!')
        fixed_ordering = InvertOrder(fixed_ordering)

    if args.special_dmv_arch:
        return MADE(
            nin=len(cols_to_train),
            hidden_sizes=[256] * 5,
            nout=sum([c.DistributionSize() for c in cols_to_train]),
            input_bins=[c.DistributionSize() for c in cols_to_train],
            input_encoding="embed",
            output_encoding="embed",
            embed_size=128,
            input_no_emb_if_leq=True,
            embs_tied=True,
            seed=seed,
            do_direct_io_connections=True,  # args.direct_io,
            natural_ordering=False if seed is not None else True,
            residual_connections=args.residual,
            fixed_ordering=fixed_ordering,
            dropout_p=args.dropout,
        ).to(DEVICE)

    hiddens = [args.fc_hiddens] * args.layers
    natural_ordering = False
    if args.layers == 0:
        # Default ckpt.
        hiddens = [512, 256, 512, 128, 1024]
        natural_ordering = True

    model = MADE(
        nin=len(cols_to_train),
        hidden_sizes=hiddens,
        residual_connections=args.residual,
        nout=sum([c.DistributionSize() for c in cols_to_train]),
        input_bins=[c.DistributionSize() for c in cols_to_train],
        input_encoding="embed"
        if args.dataset in ["dmv-full", "kdd", "synthetic"] else "binary",
        output_encoding="embed"
        if args.dataset in ["dmv-full", "kdd", "synthetic"] else "one_hot",
        seed=seed,
        do_direct_io_connections=args.direct_io,
        natural_ordering=False if seed is not None else True,
        fixed_ordering=fixed_ordering,
        dropout_p=args.dropout,
        num_masks=max(1, args.special_orders),
    ).to(DEVICE)

    # XXX this is copied from train_many_orderings
    if args.special_orders > 0:
        special_orders = [
            # # MutInfo Max Marg
            # np.array([6, 1, 4, 0, 7, 3, 5, 2, 10, 9, 8]),
            # # CL Max Marg/Dom
            # np.array([6, 1, 4, 0, 5, 7, 3, 2, 10, 9, 8]),
            # # Random
            # np.random.RandomState(0).permutation(np.arange(11)),
        ][:args.special_orders]
        k = len(special_orders)
        for i in range(k, args.special_orders):
            special_orders.append(
                np.random.RandomState(i - k + 1).permutation(
                    np.arange(len(cols_to_train))))
        print('Special orders', np.array(special_orders))

        if args.inv_order:
            for i, order in enumerate(special_orders):
                special_orders[i] = np.asarray(InvertOrder(order))
            print('Inverted special orders:', special_orders)

        model.orderings = special_orders

    if args.use_query_order:
        model.use_query_order = True
    if args.use_best_order:
        model.use_best_order = True
    if args.use_worst_order:
        model.use_worst_order = True

    return model

def BuckyBall():
    start_time = time.time()

    init_out_dir()
    print_args()

    if args.ham == 'buckey':
        ham = buckyball_2(args.beta)
    # elif args.ham == 'sk':
    #     ham = SKModel(args.n, args.beta, args.device, seed=args.seed)
    # elif args.ham == 'full':
    #     ham = FullModel()
    # elif args.ham == 'buckey':
    #     ham = buckyball_2(args.beta)
    else:
        raise ValueError('Unknown ham: {}'.format(args.ham))

    # ham.J.requires_grad = False

    net = MADE(**vars(args))
    net.to(args.device)
    my_log('{}\n'.format(net))

    params = list(net.parameters())
    params = list(filter(lambda p: p.requires_grad, params))
    nparams = int(sum([np.prod(p.shape) for p in params]))
    my_log('Total number of trainable parameters: {}'.format(nparams))

    if args.optimizer == 'sgd':
        optimizer = torch.optim.SGD(params, lr=args.lr)
    elif args.optimizer == 'sgdm':
        optimizer = torch.optim.SGD(params, lr=args.lr, momentum=0.9)
    elif args.optimizer == 'rmsprop':
        optimizer = torch.optim.RMSprop(params, lr=args.lr, alpha=0.99)
    elif args.optimizer == 'adam':
        optimizer = torch.optim.Adam(params, lr=args.lr, betas=(0.9, 0.999))
    elif args.optimizer == 'adam0.5':
        optimizer = torch.optim.Adam(params, lr=args.lr, betas=(0.5, 0.999))
    else:
        raise ValueError('Unknown optimizer: {}'.format(args.optimizer))

    init_time = time.time() - start_time
    my_log('init_time = {:.3f}'.format(init_time))

    my_log('Training...')
    sample_time = 0
    train_time = 0
    start_time = time.time()

    if args.beta_anneal_to < args.beta:
        args.beta_anneal_to = args.beta
    beta = args.beta
    while beta <= args.beta_anneal_to:
        for step in range(args.max_step):
            optimizer.zero_grad()

            sample_start_time = time.time()
            with torch.no_grad():
                sample, x_hat = net.sample(args.batch_size)
            assert not sample.requires_grad
            assert not x_hat.requires_grad
            sample_time += time.time() - sample_start_time

            train_start_time = time.time()

            log_prob = net.log_prob(sample)
            with torch.no_grad():
                energy = ham.energy(sample)
                loss = log_prob + beta * energy
            assert not energy.requires_grad
            assert not loss.requires_grad
            loss_reinforce = torch.mean((loss - loss.mean()) * log_prob)
            loss_reinforce.backward()

            if args.clip_grad > 0:
                # nn.utils.clip_grad_norm_(params, args.clip_grad)
                parameters = list(filter(lambda p: p.grad is not None, params))
                max_norm = float(args.clip_grad)
                norm_type = 2
                total_norm = 0
                for p in parameters:
                    param_norm = p.grad.data.norm(norm_type)
                    total_norm += param_norm.item()**norm_type
                total_norm = total_norm**(1 / norm_type)
                clip_coef = max_norm / (total_norm + args.epsilon)
                for p in parameters:
                    p.grad.data.mul_(clip_coef)

            optimizer.step()

            train_time += time.time() - train_start_time

            if args.print_step and step % args.print_step == 0:
                free_energy_mean = loss.mean() / beta / args.n
                free_energy_std = loss.std() / beta / args.n
                entropy_mean = -log_prob.mean() / args.n
                energy_mean = energy.mean() / args.n
                mag = sample.mean(dim=0)
                mag_mean = mag.mean()
                if step > 0:
                    sample_time /= args.print_step
                    train_time /= args.print_step
                used_time = time.time() - start_time
                my_log(
                    'beta = {:.3g}, # {}, F = {:.8g}, F_std = {:.8g}, S = {:.5g}, E = {:.5g}, M = {:.5g}, sample_time = {:.3f}, train_time = {:.3f}, used_time = {:.3f}'
                    .format(
                        beta,
                        step,
                        free_energy_mean.item(),
                        free_energy_std.item(),
                        entropy_mean.item(),
                        energy_mean.item(),
                        mag_mean.item(),
                        sample_time,
                        train_time,
                        used_time,
                    ))
                sample_time = 0
                train_time = 0

        with open(args.fname, 'a', newline='\n') as f:
            f.write('{} {} {:.3g} {:.8g} {:.8g} {:.8g} {:.8g}\n'.format(
                args.n,
                args.seed,
                beta,
                free_energy_mean.item(),
                free_energy_std.item(),
                energy_mean.item(),
                entropy_mean.item(),
            ))

        if args.ham == 'hop':
            ensure_dir(args.out_filename + '_sample/')
            np.savetxt('{}_sample/sample{:.2f}.txt'.format(
                args.out_filename, beta),
                       sample.cpu().numpy(),
                       delimiter=' ',
                       fmt='%d')
            np.savetxt('{}_sample/log_prob{:.2f}.txt'.format(
                args.out_filename, beta),
                       log_prob.cpu().detach().numpy(),
                       delimiter=' ',
                       fmt='%.5f')

        beta += args.beta_inc

state_dir = 'out'
ham_args, features = get_ham_args_features()
state_filename = '{state_dir}/{ham_args}/{features}/out{args.out_infix}_save/10000.state'.format(
    **locals())

target_layer = 1
num_channel = 1
out_dir = '../support/fig/filters/{ham_args}/{features}/layer{target_layer}'.format(
    **locals())

if __name__ == '__main__':
    ensure_dir(out_dir + '/')

    if args.net == 'made':
        net = MADE(**vars(args))
    elif args.net == 'pixelcnn':
        net = PixelCNN(**vars(args))
    else:
        raise ValueError('Unknown net: {}'.format(args.net))
    net.to(args.device)
    print('{}\n'.format(net))

    print(state_filename)
    state = torch.load(state_filename, map_location=args.device)
    net.load_state_dict(state['net'])

    sample = torch.zeros([num_channel, 1, args.L, args.L], requires_grad=True)
    nn.init.normal_(sample)

    optimizer = torch.optim.Adam([sample], lr=1e-3, weight_decay=1)

def train(train_data, test_data, image_shape):
    """ Trains MADE model on binary image dataset.

    Arguments:
        train_data: A (n_train, H, W, 1) uint8 numpy array of binary images with values in {0, 1}
        test_data: An (n_test, H, W, 1) uint8 numpy array of binary images with values in {0, 1}
        image_shape: (H, W), height and width of the image

    Returns:
        - a (# of training iterations,) numpy array of train_losses evaluated every minibatch
        - a (# of epochs + 1,) numpy array of test_losses evaluated once at initialization and after each epoch
        - a numpy array of size (100, H, W, 1) of samples with values in {0, 1}
    """
    use_cuda = True
    device = torch.device('cuda') if use_cuda else None

    train_data = torch.from_numpy(
        train_data.reshape(
            (train_data.shape[0],
             train_data.shape[1] * train_data.shape[2]))).float().to(device)
    test_data = torch.from_numpy(
        test_data.reshape(
            (test_data.shape[0],
             test_data.shape[1] * test_data.shape[2]))).float().to(device)

    def nll_loss(batch, output):
        return F.binary_cross_entropy(torch.sigmoid(output), batch)

    H, W = image_shape
    input_dim = H * W
    made = MADE(input_dim).to(device)  # move the model to the same device as the data
    epochs = 10
    lr = 0.005
    batch_size = 32

    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=batch_size,
                                               shuffle=True)
    optimizer = torch.optim.Adam(made.parameters(), lr=lr)

    init_test_loss = nll_loss(test_data, made(test_data))
    train_losses = []
    test_losses = [init_test_loss.item()]

    # Training
    for epoch in range(epochs):
        for batch in train_loader:
            optimizer.zero_grad()
            output = made(batch)
            loss = nll_loss(batch, output)
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())
        test_loss = nll_loss(test_data, made(test_data))
        test_losses.append(test_loss.item())
        print(f'{epoch + 1}/{epochs} epochs')

    # Generate samples
    made.eval()
    samples = torch.zeros(size=(100, H * W)).to(device)
    with torch.no_grad():
        for i in range(H * W):
            logits = made(samples)
            probas = torch.sigmoid(logits)
            pixel_i_samples = torch.bernoulli(probas[:, i])
            samples[:, i] = pixel_i_samples

    return np.array(train_losses), np.array(test_losses), samples.reshape(
        (100, H, W, 1)).detach().cpu().numpy()

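# A minimal usage sketch, not part of the original script: it assumes the MADE
# class, the torch/F imports, and the CUDA device hard-coded in train() above
# are available, and exercises train() on random binary data to show the shapes
# of the returned arrays.
if __name__ == "__main__":
    import numpy as np

    rng = np.random.RandomState(0)
    H, W = 28, 28
    train_imgs = (rng.rand(1000, H, W, 1) > 0.5).astype(np.uint8)
    test_imgs = (rng.rand(200, H, W, 1) > 0.5).astype(np.uint8)
    train_losses, test_losses, samples = train(train_imgs, test_imgs, (H, W))
    print(train_losses.shape, test_losses.shape, samples.shape)
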
import torch
import tensorflow as tf
# import edward2 as ed
from org_edward2_made import MADE as ed_MADE_org
import numpy as np
from disc_utils import one_hot, one_hot_argmax, multiplicative_inverse, one_hot_add, one_hot_minus, one_hot_multiply
from made import MADE

vocab_size = 90
input_shape = [10, 4, vocab_size]

tf_made = ed_MADE_org(vocab_size, hidden_dims=[20, 20])
tf_made.build(input_shape)
print(tf_made.built)
# print(tf_made.network.get_weights())

print('making torch_MADE from ed2 conversion')
torch_made = MADE(input_shape, vocab_size, hidden_dims=[20, 20, 20])
print('torch made model', torch_made)

inp = torch.ones(input_shape)
res = torch_made(inp)
print('res shape', res.shape)
print('inputs::::', inp[0, 0, :], inp[0, 1, :], inp[0, 2, :])
print('outputs::::', res[0, 0, :], res[0, 1, :], res[0, 2, :])

inp_tf = tf.ones(input_shape)
res_tf = tf_made(inp_tf)
print('res shape', res_tf.shape)
print('inputs::::', inp_tf[0, 0, :], inp_tf[0, 1, :], inp_tf[0, 2, :])
print('outputs::::', res_tf[0, 0, :], res_tf[0, 1, :], res_tf[0, 2, :])

if args.gpu is not None:
    cuda.get_device(args.gpu).use()

# reproducibility is good
np.random.seed(42)

# load the dataset
print("loading binarized mnist from", args.data_path)
mnist = np.load(args.data_path)
xtr, xte = mnist['train_data'], mnist['valid_data']

# construct model and ship to GPU
hidden_list = list(map(int, args.hiddens.split(',')))
model = MADE(xtr.shape[1], hidden_list, xtr.shape[1],
             num_masks=args.num_masks, gpu=args.gpu)
if args.gpu is not None:
    model.to_gpu(args.gpu)
    xtr = cuda.to_gpu(xtr)
    xte = cuda.to_gpu(xte)

# set up the optimizer
opt = chainer.optimizers.Adam(alpha=1e-3, weight_decay_rate=1e-4)
opt.setup(model)

# start the training
for epoch in range(100):
    print("epoch %d" % (epoch, ))
    run_epoch(

mnist = np.load(args.data_path)
xtr, xva = mnist['train_data'], mnist['valid_data']

# split validation set in validation + test set
num_val = xva.shape[0] // 2
xte = xva[:num_val, :]
xva = xva[num_val:, :]

xtr = torch.from_numpy(xtr).cuda()
xva = torch.from_numpy(xva).cuda()
xte = torch.from_numpy(xte).cuda()
print('training_set: ' + str(xtr.shape))
print('validation_set: ' + str(xva.shape))
print('test_set: ' + str(xte.shape))

# construct model and ship to GPU
hidden_list = list(map(int, args.hiddens.split(',')))
model = MADE(xtr.size(1), hidden_list, xtr.size(1), num_masks=args.num_masks)
print(model)
print("number of model parameters:",
      sum([np.prod(p.size()) for p in model.parameters()]))
model.cuda()

# set up the optimizer
# opt = torch.optim.Adagrad(model.parameters(), lr=1e-2, eps=1e-6)
opt = torch.optim.Adadelta(model.parameters())

epochs_no_improve = 0
best_loss = math.inf
best_epoch = 0

path = './experiments/' + args.data_path + '/'
for n in hidden_list:
    path += '_' + str(n)

n_RV = 374  # number of RVs
scope_list = np.arange(n_RV)
scope_temp = np.delete(scope_list, np.where(scope_list % 34 == 17))
init_scope = list(np.delete(scope_temp, np.where(scope_temp % 34 == 33)))

# modify data to remove 0 (imag) columns
data_train = data_train[:, init_scope]
data_pos = data_pos[:, init_scope]
data_neg = data_neg[:, init_scope]

xtr = torch.from_numpy(data_train).float().cuda()
xte = torch.from_numpy(data_pos).float().cuda()
xod = torch.from_numpy(data_neg).float().cuda()

# construct model and ship to GPU
hidden_list = list(map(int, args.hiddens.split(',')))
model = MADE(xtr.size(1), hidden_list, xtr.size(1) * 2, num_masks=args.num_masks)
print("number of model parameters:",
      sum([np.prod(p.size()) for p in model.parameters()]))
model.cuda()

# set up the optimizer
opt = torch.optim.Adam(model.parameters(), args.learning_rate,
                       weight_decay=args.weight_decay)
scheduler = torch.optim.lr_scheduler.StepLR(opt, step_size=45, gamma=0.1)

# list to store loss
loss_tr = []
loss_te = []
loss_od = []

# start the training
for epoch in range(args.epoch):
    scheduler.step(epoch)
    loss_tr.append(run_epoch('train'))

max_epochs = 1000
# -----------------------------------

# Get dataset.
data = get_data(dataset_name)
train = torch.from_numpy(data.train.x)

# Get data loaders.
train_loader, val_loader, test_loader = get_data_loaders(data, batch_size)

# Get model.
n_in = data.n_dims
if model_name.lower() == "maf":
    model = MAF(n_in, n_mades, hidden_dims)
elif model_name.lower() == "made":
    model = MADE(n_in, hidden_dims, random_order=random_order,
                 seed=seed, gaussian=True)

# Get optimiser.
optimiser = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-6)

# Format name of model save file.
save_name = f"{model_name}_{dataset_name}_{'_'.join(str(d) for d in hidden_dims)}.pt"

# Initialise lists for plotting.
epochs_list = []
train_losses = []
val_losses = []

# Initialise early stopping.
i = 0
max_loss = np.inf

# Training loop.

    if verbose:
        print(f"{split} Average epoch loss: {np.mean(losses)}")
    return losses


if __name__ == "__main__":
    # load the dataset from some path
    mnist = np.load("binarized_mnist.npz")
    x_train, x_test = mnist["train_data"], mnist["valid_data"]
    x_train = torch.as_tensor(x_train).cuda()
    x_test = torch.as_tensor(x_test).cuda()

    hidden_list = [500]
    resample_every = 20

    model = MADE(x_train.size(1), hidden_list, x_train.size(1))
    print(
        f"number of model parameters: {np.sum([np.prod(p.size()) for p in model.parameters()])}"
    )
    model.cuda()

    opt = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.StepLR(opt, step_size=50, gamma=0.1)

    # The training
    for epoch in range(100):
        print(f"Epoch {epoch}")
        scheduler.step()
        # get an estimate of the test loss
        run_one_epoch("test", upto=5)

state_filename = '{state_dir}/{model_args}/{features}/out{args.out_infix}_save/10000.state'.format(
    **locals())


def get_mean_err(count, x_sum, x_sqr_sum):
    x_mean = x_sum / count
    x_sqr_mean = x_sqr_sum / count
    x_std = sqrt(abs(x_sqr_mean - x_mean**2))
    x_err = x_std / sqrt(count)
    x_ufloat = ufloat(x_mean, x_err)
    return x_ufloat


if __name__ == '__main__':
    if args.net == 'made':
        net = MADE(**vars(args))
    elif args.net == 'pixelcnn':
        net = PixelCNN(**vars(args))
    else:
        raise ValueError('Unknown net: {}'.format(args.net))
    net.to(args.device)
    print('{}\n'.format(net))

    print(state_filename)
    state = torch.load(state_filename, map_location=args.device)
    ignore_param(state['net'], net)
    net.load_state_dict(state['net'])

    F_sum = 0
    F_sqr_sum = 0
    S_sum = 0

# reproducibility is good
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)

# load the dataset
print("loading binarized mnist from", args.data_path)
mnist = np.load(args.data_path)
xtr, xte = mnist['train_data'], mnist['valid_data']
xtr = torch.from_numpy(xtr).cuda()
xte = torch.from_numpy(xte).cuda()

# construct model and ship to GPU
hidden_list = list(map(int, args.hiddens.split(',')))
model = MADE(xtr.size(1), hidden_list, xtr.size(1), num_masks=args.num_masks)
print("number of model parameters:",
      sum([np.prod(p.size()) for p in model.parameters()]))
model.cuda()

# set up the optimizer
opt = torch.optim.Adam(model.parameters(), 1e-3, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.StepLR(opt, step_size=45, gamma=0.1)

# start the training
for epoch in range(100):
    print("epoch %d" % (epoch, ))
    scheduler.step(epoch)
    run_epoch(
        'test',

def __init__(self, dim: int, hidden_dims: List[int], reverse: bool):
    super(MAFLayer, self).__init__()
    self.dim = dim
    self.made = MADE(dim, hidden_dims, gaussian=True, seed=None)
    self.reverse = reverse

def main():
    start_time = time.time()

    init_out_dir()
    if args.clear_checkpoint:
        clear_checkpoint()
    last_step = get_last_checkpoint_step()
    if last_step >= 0:
        my_log('\nCheckpoint found: {}\n'.format(last_step))
    else:
        clear_log()
    print_args()

    if args.net == 'made':
        net = MADE(**vars(args))
    elif args.net == 'pixelcnn':
        net = PixelCNN(**vars(args))
    elif args.net == 'bernoulli':
        net = BernoulliMixture(**vars(args))
    else:
        raise ValueError('Unknown net: {}'.format(args.net))
    net.to(args.device)
    my_log('{}\n'.format(net))

    params = list(net.parameters())
    params = list(filter(lambda p: p.requires_grad, params))
    nparams = int(sum([np.prod(p.shape) for p in params]))
    my_log('Total number of trainable parameters: {}'.format(nparams))
    named_params = list(net.named_parameters())

    if args.optimizer == 'sgd':
        optimizer = torch.optim.SGD(params, lr=args.lr)
    elif args.optimizer == 'sgdm':
        optimizer = torch.optim.SGD(params, lr=args.lr, momentum=0.9)
    elif args.optimizer == 'rmsprop':
        optimizer = torch.optim.RMSprop(params, lr=args.lr, alpha=0.99)
    elif args.optimizer == 'adam':
        optimizer = torch.optim.Adam(params, lr=args.lr, betas=(0.9, 0.999))
    elif args.optimizer == 'adam0.5':
        optimizer = torch.optim.Adam(params, lr=args.lr, betas=(0.5, 0.999))
    else:
        raise ValueError('Unknown optimizer: {}'.format(args.optimizer))

    if args.lr_schedule:
        # 0.92**80 ~ 1e-3
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, factor=0.92, patience=100, threshold=1e-4, min_lr=1e-6)

    if last_step >= 0:
        state = torch.load('{}_save/{}.state'.format(args.out_filename,
                                                      last_step))
        ignore_param(state['net'], net)
        net.load_state_dict(state['net'])
        if state.get('optimizer'):
            optimizer.load_state_dict(state['optimizer'])
        if args.lr_schedule and state.get('scheduler'):
            scheduler.load_state_dict(state['scheduler'])

    init_time = time.time() - start_time
    my_log('init_time = {:.3f}'.format(init_time))

    my_log('Training...')
    sample_time = 0
    train_time = 0
    start_time = time.time()
    for step in range(last_step + 1, args.max_step + 1):
        optimizer.zero_grad()

        sample_start_time = time.time()
        with torch.no_grad():
            sample, x_hat = net.sample(args.batch_size)
        assert not sample.requires_grad
        assert not x_hat.requires_grad
        sample_time += time.time() - sample_start_time

        train_start_time = time.time()

        log_prob = net.log_prob(sample)
        # 0.998**9000 ~ 1e-8
        beta = args.beta * (1 - args.beta_anneal**step)
        with torch.no_grad():
            energy = ising.energy(sample, args.ham, args.lattice,
                                  args.boundary)
            loss = log_prob + beta * energy
        assert not energy.requires_grad
        assert not loss.requires_grad
        loss_reinforce = torch.mean((loss - loss.mean()) * log_prob)
        loss_reinforce.backward()

        if args.clip_grad:
            nn.utils.clip_grad_norm_(params, args.clip_grad)
        optimizer.step()

        if args.lr_schedule:
            scheduler.step(loss.mean())
        train_time += time.time() - train_start_time

        if args.print_step and step % args.print_step == 0:
            free_energy_mean = loss.mean() / args.beta / args.L**2
            free_energy_std = loss.std() / args.beta / args.L**2
            entropy_mean = -log_prob.mean() / args.L**2
            energy_mean = energy.mean() / args.L**2
            mag = sample.mean(dim=0)
            mag_mean = mag.mean()
            mag_sqr_mean = (mag**2).mean()
            if step > 0:
                sample_time /= args.print_step
                train_time /= args.print_step
            used_time = time.time() - start_time
            my_log(
                'step = {}, F = {:.8g}, F_std = {:.8g}, S = {:.8g}, E = {:.8g}, M = {:.8g}, Q = {:.8g}, lr = {:.3g}, beta = {:.8g}, sample_time = {:.3f}, train_time = {:.3f}, used_time = {:.3f}'
                .format(
                    step,
                    free_energy_mean.item(),
                    free_energy_std.item(),
                    entropy_mean.item(),
                    energy_mean.item(),
                    mag_mean.item(),
                    mag_sqr_mean.item(),
                    optimizer.param_groups[0]['lr'],
                    beta,
                    sample_time,
                    train_time,
                    used_time,
                ))
            sample_time = 0
            train_time = 0

            if args.save_sample:
                state = {
                    'sample': sample,
                    'x_hat': x_hat,
                    'log_prob': log_prob,
                    'energy': energy,
                    'loss': loss,
                }
                torch.save(state, '{}_save/{}.sample'.format(
                    args.out_filename, step))

        if (args.out_filename and args.save_step
                and step % args.save_step == 0):
            state = {
                'net': net.state_dict(),
                'optimizer': optimizer.state_dict(),
            }
            if args.lr_schedule:
                state['scheduler'] = scheduler.state_dict()
            torch.save(state, '{}_save/{}.state'.format(
                args.out_filename, step))

        if (args.out_filename and args.visual_step
                and step % args.visual_step == 0):
            torchvision.utils.save_image(
                sample,
                '{}_img/{}.png'.format(args.out_filename, step),
                nrow=int(sqrt(sample.shape[0])),
                padding=0,
                normalize=True)

            if args.print_sample:
                x_hat_np = x_hat.view(x_hat.shape[0], -1).cpu().numpy()
                x_hat_std = np.std(x_hat_np, axis=0).reshape([args.L] * 2)

                x_hat_cov = np.cov(x_hat_np.T)
                x_hat_cov_diag = np.diag(x_hat_cov)
                x_hat_corr = x_hat_cov / (
                    sqrt(x_hat_cov_diag[:, None] * x_hat_cov_diag[None, :]) +
                    args.epsilon)
                x_hat_corr = np.tril(x_hat_corr, -1)
                x_hat_corr = np.max(np.abs(x_hat_corr), axis=1)
                x_hat_corr = x_hat_corr.reshape([args.L] * 2)

                energy_np = energy.cpu().numpy()
                energy_count = np.stack(
                    np.unique(energy_np, return_counts=True)).T

                my_log(
                    '\nsample\n{}\nx_hat\n{}\nlog_prob\n{}\nenergy\n{}\nloss\n{}\nx_hat_std\n{}\nx_hat_corr\n{}\nenergy_count\n{}\n'
                    .format(
                        sample[:args.print_sample, 0],
                        x_hat[:args.print_sample, 0],
                        log_prob[:args.print_sample],
                        energy[:args.print_sample],
                        loss[:args.print_sample],
                        x_hat_std,
                        x_hat_corr,
                        energy_count,
                    ))

            if args.print_grad:
                my_log('grad max_abs min_abs mean std')
                for name, param in named_params:
                    if param.grad is not None:
                        grad = param.grad
                        grad_abs = torch.abs(grad)
                        my_log('{} {:.3g} {:.3g} {:.3g} {:.3g}'.format(
                            name,
                            torch.max(grad_abs).item(),
                            torch.min(grad_abs).item(),
                            torch.mean(grad).item(),
                            torch.std(grad).item(),
                        ))
                    else:
                        my_log('{} None'.format(name))
                my_log('')

def run_epoch(split, upto=None):
    torch.set_grad_enabled(split == 'train')
    model.train() if split == 'train' else model.eval()
    nsamples = 1 if split == 'train' else args.samples
    x = xtr if split == 'train' else xte  # pick the split's data
    N, D = x.size()
    B = 128  # batch size
    n_steps = N // B if upto is None else min(N // B, upto)
    losses = []
    for step in range(n_steps):
        # fetch the next batch of data
        xb = Variable(x[step * B:step * B + B])

        # average logits over several mask samples, resampling the masks as we go
        xbhat = torch.zeros_like(xb)
        for s in range(nsamples):
            if step % args.resample_every == 0 or split == 'test':
                model.update_masks()
            xbhat += model(xb)
        xbhat /= nsamples

        # evaluate the binary cross entropy loss
        loss = F.binary_cross_entropy_with_logits(xbhat, xb, size_average=False) / B
        lossf = loss.data.item()
        losses.append(lossf)

        # backward/update
        if split == 'train':
            opt.zero_grad()
            loss.backward()
            opt.step()

    print("%s epoch avg loss: %f" % (split, np.mean(losses)))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--data-path', required=True, type=str,
                        help="Path to binarized_mnist.npz")
    parser.add_argument('-q', '--hiddens', type=str, default='500',
                        help="Comma separated sizes for hidden layers, e.g. 500, or 500,500")
    parser.add_argument('-n', '--num-masks', type=int, default=1,
                        help="Number of orderings for order/connection-agnostic training")
    parser.add_argument('-r', '--resample-every', type=int, default=20,
                        help="For efficiency we can choose to resample orders/masks only once every this many steps")
    parser.add_argument('-s', '--samples', type=int, default=1,
                        help="How many samples of connectivity/masks to average logits over during inference")
    args = parser.parse_args()

    np.random.seed(42)
    torch.manual_seed(42)
    torch.cuda.manual_seed_all(42)

    print("loading binarized mnist from", args.data_path)
    mnist = np.load(args.data_path)
    xtr, xte = mnist['train_data'], mnist['valid_data']
    xtr = torch.from_numpy(xtr).cuda()
    xte = torch.from_numpy(xte).cuda()

    # construct model and ship to GPU
    hidden_list = list(map(int, args.hiddens.split(',')))
    model = MADE(xtr.size(1), hidden_list, xtr.size(1), num_masks=args.num_masks)
    print("number of model parameters:",
          sum([np.prod(p.size()) for p in model.parameters()]))
    model.cuda()

    # set up the optimizer
    opt = torch.optim.Adam(model.parameters(), 1e-3, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.StepLR(opt, step_size=45, gamma=0.1)

    # start the training
    for epoch in range(100):
        print("epoch %d" % (epoch, ))
        scheduler.step(epoch)
        run_epoch('test', upto=5)  # run only a few batches for approximate test accuracy
        run_epoch('train')

    print("optimization done. full test set eval:")
    run_epoch('test')