def test_DIS_samples_z():
    NZ = 25
    FILTER_SIZE = 2  # 2 for testing
    BATCH_SIZE = 10000
    IM_SIZE = 64
    MULTIZ = True
    # sigma = level of corruption
    dae = DAE(nz=NZ, imSize=IM_SIZE, fSize=FILTER_SIZE, sigma=0.1, multimodalZ=MULTIZ)
    if not dae.multimodalZ:
        prior = dae.norm_prior
    else:
        print('multi-prior')
        prior = dae.multi_prior
    dis = DIS_Z(nz=NZ, prior=prior)
    z = dis.prior(BATCH_SIZE).numpy()
    print('z shape:', np.shape(z))
    plt.figure()
    plt.hist2d(z[:, 0], z[:, 1], (100, 100))
    plt.title('2D hist for DAE DIS with multimodalZ')
    plt.savefig('2Dhist_DIS_z.png')
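# The multimodal prior itself is defined elsewhere. As a rough sketch of the
# kind of sampler dae.multi_prior is assumed to be -- a two-mode Gaussian
# mixture; the mode locations and widths here are illustrative, not the real
# values -- one could write:
def multi_prior_sketch(batch_size, nz=25, mu=2.0, sigma=0.5):
    import numpy as np
    import torch
    # pick a mode (-1 or +1) per sample, then add Gaussian noise around it
    modes = np.random.choice([-1.0, 1.0], size=(batch_size, 1))
    z = np.random.randn(batch_size, nz) * sigma + modes * mu
    return torch.from_numpy(z.astype(np.float32))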
def main(train_dataset, expt_set, model_name=None, chrom='chr21', test_run=False,
         weighted_average=False, save_logs=False, eval_freq=1000000, epochs=None,
         seed=211, n_samples=20000000, replace_gaps=False):
    # TODO: automatically infer epochs from BINNED_CHRSZ
    if epochs is None and n_samples is not None:
        epochs = math.ceil(n_samples / BINNED_CHRSZ[chrom])
        print('{} epochs of {} datapoints each, totalling {} samples'.format(
            epochs, BINNED_CHRSZ[chrom], n_samples))
    if model_name is None:
        model_name = '{}_{}'.format(chrom, train_dataset)
    if train_dataset == 'full':
        n_train_obs = 312
    elif train_dataset == 'train':
        n_train_obs = 267
    else:
        raise ValueError('train_dataset must be either full or train')
    np.random.seed(seed)

    # n_drop and obs_counts are decoupled to allow training on a subset of the dropped signals
    model = DAE(obs_counts=[50, 45], n_drop=50, mlp_dropout=0.3, dae_dim=[100, 50],
                n_train_obs=n_train_obs)
    # TODO: fix naming (model.models[50] -> model.models['train']?)
    train_model = model.models[50]
    train_model.compile(loss=cauchy5, optimizer=Adam(lr=0.0003))
    train_gen = TrainDataGeneratorHDF5(n_drop=50, chrom=chrom, batch_size=256,
                                       directory=data_dir, replace_gaps=replace_gaps)
    save_train_config(expt_set, model_name, model, train_gen,
                      weighted_average=weighted_average, eval_freq=eval_freq,
                      train_kwargs={'epochs': epochs, 'loss': 'cauchy5',
                                    'optimizer': 'adam', 'lr': 0.0003, 'seed': seed})

    checkpoint_folder = output_dir + 'weights/{}'.format(expt_set)
    os.makedirs(checkpoint_folder, exist_ok=True)
    callbacks = [EpochTimer()]  # times each epoch
    if train_dataset == 'train':
        # callbacks monitor metrics on the val set (for the training chromosome)
        # as well as saving checkpoints
        val_model = model.models[45]
        val_model.compile(loss='mse', optimizer=Adam())
        val_gen = ValDataGeneratorHDF5(train_dataset=train_dataset, chrom=chrom, batch_size=256)
        callbacks += get_validation_callbacks(val_model, val_gen, checkpoint_folder, model_name,
                                              weighted_average=weighted_average,
                                              eval_freq=eval_freq, test_run=test_run,
                                              verbose=2 if test_run else 1)
    else:
        # callbacks save weights every dataset_size samples (i.e. every 'epoch')
        callbacks += get_checkpoint_callbacks(checkpoint_folder, model_name,
                                              weighted_average=weighted_average)
    if save_logs and not test_run:
        callbacks += [CSVLogger(output_dir + 'logs/{}/{}.csv'.format(expt_set, model_name),
                                append=False),
                      ResumableTensorBoard(0,  # resume offset in samples; 0 assumes training from scratch
                                           log_dir=output_dir + 'logs/{}/{}/'.format(expt_set, model_name),
                                           update_freq=100000)]
    train_model.fit_generator(train_gen, epochs=epochs,
                              verbose=1 if test_run else 2, callbacks=callbacks)
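# EpochTimer is used above but not defined in this file. A minimal sketch of
# what it is assumed to do -- log wall-clock time per epoch; the real callback
# may record more:
import time
from keras.callbacks import Callback

class EpochTimerSketch(Callback):
    def on_epoch_begin(self, epoch, logs=None):
        self._t0 = time.time()

    def on_epoch_end(self, epoch, logs=None):
        print('epoch {} took {:.1f}s'.format(epoch, time.time() - self._t0))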
def test_DAE_samples_z():
    print('testing DAE module')
    NZ = 25
    FILTER_SIZE = 2  # 2 for testing
    BATCH_SIZE = 10000
    IM_SIZE = 64
    dae = DAE(imSize=IM_SIZE, fSize=FILTER_SIZE, nz=NZ, sigma=1.0, multimodalZ=True)
    z_sample = dae.sample_z(BATCH_SIZE)
    print(type(z_sample), z_sample.size())
    z = z_sample.cpu().data.numpy()
    plt.figure()
    plt.hist2d(z[:, 0], z[:, 1], (100, 100))
    plt.title('2D hist for DAE with multimodalZ')
    plt.savefig('2Dhist_z.png')
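# Note: the tests above write figures with plt.savefig. On a headless machine
# the Agg backend has to be selected before pyplot is first imported, e.g. at
# the top of this file:
#     import matplotlib
#     matplotlib.use('Agg')
#     import matplotlib.pyplot as plt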
def main(model_name, expt_set, chrom, checkpoint_code=14, outfmt='npz', dataset='test',
         train_dataset='all', moving_average=False, output_directory=None, data_directory=None):
    if expt_set in ['imp', 'imp1']:
        # these codes just identify the weights file that is loaded
        checkpoint_code = 14 if expt_set == 'imp' else 14.0
    if output_directory is None:
        output_directory = output_dir
    if data_directory is None:
        data_directory = data_dir
    print('Saving preds for {} on {} to {}'.format(model_name, dataset, output_directory))
    assert train_dataset in ['train', 'all'], 'train_dataset must be either train or all'
    if dataset == 'test':
        data_gen = TestDataGeneratorHDF5(train_dataset=train_dataset, n_drop=50,
                                         chrom=chrom, directory=data_directory)
    elif dataset == 'val':
        if train_dataset == 'all':
            raise NotImplementedError()
        data_gen = ValDataGeneratorHDF5(n_drop=50, chrom=chrom, directory=data_directory)
    else:
        raise ValueError('dataset must be either test or val')
    n_predict = len(dataset_expts[dataset])
    model = DAE(obs_counts=[50, n_predict], n_drop=50, mlp_dropout=0.3, dae_dim=[100, 50],
                n_train_obs=len(dataset_expts[train_dataset]))
    pred_model = model.models[n_predict]
    pred_model.compile(loss='mse', optimizer='adam')
    checkpoint = find_checkpoint(model_name, expt_set, checkpoint_code,
                                 moving_avg=moving_average, weights_dir=output_directory)
    print('Loading checkpoint', checkpoint)
    pred_model.load_weights(checkpoint)
    # print('Making predictions')
    preds = pred_model.predict_generator(data_gen, verbose=1, steps=None)
    print('Pred shape', preds.shape)
    preds = np.squeeze(preds)
    print('Squeezed pred shape', preds.shape)
    imp_dir = output_directory + '{}_imputations/{}/{}/'.format(dataset, expt_set, model_name)
    os.makedirs(imp_dir, exist_ok=True)
    print('Saving preds')
    assert n_predict == preds.shape[1], "check names - length of name list doesn't match data"
    for track_name, track_vals in zip(dataset_expts[dataset], preds.T):
        np.savez_compressed(imp_dir + '{}.{}.{}.npz'.format(track_name, chrom, checkpoint_code),
                            track_vals.reshape(-1))
    print('Done')
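# Helper sketch (not in the original code) for reading one saved track back.
# File naming follows the save loop above; np.savez_compressed with a single
# positional array stores it under the key 'arr_0'.
def load_track(imp_dir, track_name, chrom, checkpoint_code):
    import numpy as np
    path = imp_dir + '{}.{}.{}.npz'.format(track_name, chrom, checkpoint_code)
    return np.load(path)['arr_0']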
def test_DAE():
    print('testing DAE module')
    NZ = 100
    FILTER_SIZE = 2  # 2 for testing
    BATCH_SIZE = 5
    IM_SIZE = 64
    x = Variable(torch.randn(BATCH_SIZE, 3, IM_SIZE, IM_SIZE))  # random input
    dae = DAE(imSize=IM_SIZE, fSize=FILTER_SIZE, nz=NZ, sigma=1.0)
    x_corr = dae.corrupt(x)
    z_enc = dae.encode(x)
    z_enc_, x_rec = dae.forward(x)
    z_sample = dae.sample_z(BATCH_SIZE)
    assert x_corr.size() == (BATCH_SIZE, 3, IM_SIZE, IM_SIZE)
    assert z_enc.size() == (BATCH_SIZE, NZ)
    assert z_enc_.size() == (BATCH_SIZE, NZ)
    assert x_rec.size() == (BATCH_SIZE, 3, IM_SIZE, IM_SIZE)
    assert z_sample.size() == (BATCH_SIZE, NZ)
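# Illustrative glue (assumed entry point, not from the original file) to run
# the three tests above directly:
if __name__ == '__main__':
    test_DAE()
    test_DAE_samples_z()
    test_DIS_samples_z()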
trainLoader = torch.utils.data.DataLoader(trainDataset, batch_size=opts.batchSize, shuffle=True)
testDataset = CELEBA(root=opts.root, train=False, transform=transforms.ToTensor())
testLoader = torch.utils.data.DataLoader(testDataset, batch_size=opts.batchSize, shuffle=False)
print('Data loaders ready.')

# Create models
# sigma = level of corruption
dae = DAE(nz=opts.nz, imSize=64, fSize=opts.fSize, sigma=opts.sigma, multimodalZ=opts.multimodalZ)
dis, NZ = build_dis(dae=dae, multimodalZ=opts.multimodalZ)
svm = LINEAR_SVM(nz=NZ, c=opts.c)

if dae.useCUDA:
    torch.cuda.set_device(opts.gpuNo)
    print('using gpu:', torch.cuda.current_device())
    dae.cuda()
    dis.cuda()
    svm.cuda()

if opts.loadDAE:  # the DAE should be loaded when in eval mode
    print('loading DAE...')
    dae.load_params(opts.load_DAE_from)
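# build_dis is defined elsewhere. Judging from test_DIS_samples_z above, it is
# assumed to pick the prior matching the DAE's latent distribution and return
# the discriminator together with the latent size it acts on. A sketch under
# those assumptions (DIS_Z as in the tests above; it also assumes the DAE
# exposes its latent size as dae.nz -- the real function may differ):
def build_dis_sketch(dae, multimodalZ):
    prior = dae.multi_prior if multimodalZ else dae.norm_prior
    return DIS_Z(nz=dae.nz, prior=prior), dae.nz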