Example #1
def test_DIS_samples_z():
    NZ = 25
    FILTER_SIZE = 2  #2 for testing
    BATCH_SIZE = 10000
    IM_SIZE = 64
    MULTIZ = True

    dae = DAE(nz=NZ,
              imSize=IM_SIZE,
              fSize=FILTER_SIZE,
              sigma=0.1,
              multimodalZ=MULTIZ)  #sigma=level of corruption
    if not dae.multimodalZ:
        prior = dae.norm_prior
    else:
        print('multi-prior')
        prior = dae.multi_prior
    dis = DIS_Z(nz=NZ, prior=prior)

    z = dis.prior(BATCH_SIZE).numpy()
    print('z shape:', np.shape(z))
    plt.figure()
    plt.hist2d(z[:, 0], z[:, 1], (100, 100))
    plt.title('2D hist for DAE DIS with multimodalZ')
    plt.savefig('2Dhist_DIS_z.png')
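
# Not part of the repo code: a minimal sketch of what a multimodal prior such
# as dae.multi_prior might sample from (a fixed mixture of Gaussians). The
# real mode count, means and scale are not shown in this snippet; the values
# below are illustrative only.
import torch

torch.manual_seed(0)
MODE_MEANS = 2.0 * torch.randn(4, 25)  # fixed, randomly placed mode centres

def toy_multi_prior(batch_size, scale=0.2):
    # pick one mode per sample, then sample a Gaussian around its centre
    idx = torch.randint(0, MODE_MEANS.size(0), (batch_size,))
    return MODE_MEANS[idx] + scale * torch.randn(batch_size, MODE_MEANS.size(1))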
Example #2
def main(train_dataset, expt_set, model_name=None, chrom='chr21', test_run=False,
         weighted_average=False, save_logs=False, eval_freq=1000000, epochs=None,
         seed=211, n_samples=20000000, replace_gaps=False):
  # TODO AUTOMATICALLY INFER EPOCHS FROM BINNED_CHRSZ
  if epochs is None and n_samples is not None:
    epochs = math.ceil(n_samples / BINNED_CHRSZ[chrom])
    print('{} epochs of {} datapoints each total {} samples'.format(epochs, BINNED_CHRSZ[chrom], n_samples))
  if model_name is None:
    model_name = '{}_{}'.format(chrom, train_dataset)
  if train_dataset == 'full':
    n_train_obs = 312
  elif train_dataset == 'train':
    n_train_obs = 267
  else:
    raise ValueError('Train set must be either full or train')

  np.random.seed(seed)
  # n_drop and obs_counts are decoupled to allow the possibility for training on a subset of the dropped signals
  model = DAE(obs_counts=[50,45], n_drop=50, mlp_dropout=0.3,
              dae_dim=[100,50], n_train_obs=n_train_obs)

  # TODO fix naming (model.models[50] -> model.models['train']?)
  train_model = model.models[50]
  train_model.compile(loss=cauchy5, optimizer=Adam(lr=0.0003))

  train_gen = TrainDataGeneratorHDF5(n_drop=50, chrom=chrom, batch_size=256,
                                     directory=data_dir, replace_gaps=replace_gaps)
  save_train_config(expt_set, model_name, model, train_gen,
                    weighted_average=weighted_average, eval_freq=eval_freq,
                    train_kwargs={'epochs': epochs, 'loss': 'cauchy5', 'optimizer': 'adam',
                                  'lr': 0.0003, 'seed': seed})

  checkpoint_folder = output_dir + 'weights/{}'.format(expt_set)
  os.makedirs(checkpoint_folder, exist_ok=True)
  callbacks = [EpochTimer()] # records wall-clock time per epoch

  if train_dataset == 'train':
    # callbacks monitor metrics on val set (for training chromosome) as well as saving checkpoints
    val_model = model.models[45]
    val_model.compile(loss='mse', optimizer=Adam())
    val_gen = ValDataGeneratorHDF5(train_dataset=train_dataset, chrom=chrom, batch_size=256)
    callbacks += get_validation_callbacks(val_model, val_gen, checkpoint_folder, model_name,
                                          weighted_average=weighted_average, eval_freq=eval_freq,
                                          test_run=test_run, verbose=2 if test_run else 1)
  else:
    # callbacks save weights each dataset_size samples (i.e. each 'epoch')
    callbacks += get_checkpoint_callbacks(checkpoint_folder, model_name, weighted_average=weighted_average)

  if save_logs and not test_run:
    callbacks += [CSVLogger(output_dir+'logs/{}/{}.csv'.format(expt_set, model_name), append=False),
                  # start_epoch and epoch_size are assumed to be defined elsewhere (e.g. when resuming a run)
                  ResumableTensorBoard(start_epoch * epoch_size,
                                       log_dir=output_dir+'logs/{}/{}/'.format(expt_set, model_name),
                                       update_freq=100000)
                  ]

  train_model.fit_generator(train_gen, epochs=epochs, verbose=1 if test_run else 2,
                            callbacks=callbacks)
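
# cauchy5 is defined elsewhere in this repo and is not shown here. Purely as
# an assumption, the name suggests a Cauchy (Lorentzian) robust loss with
# scale 5; a sketch of that reading, written against the old Keras backend
# API this code already uses:
from keras import backend as K

def cauchy5_sketch(y_true, y_pred):
    # log(1 + ((y - y_hat)/gamma)^2) grows roughly logarithmically, so
    # outliers contribute far less than under MSE; gamma=5 is the assumed scale
    gamma = 5.0
    return K.mean(K.log(1.0 + K.square((y_true - y_pred) / gamma)), axis=-1)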
Example #3
def test_DAE_samples_z():
    print('testing DAE module')
    NZ = 25
    FILTER_SIZE = 2  #2 for testing
    BATCH_SIZE = 10000
    IM_SIZE = 64

    x = Variable(torch.randn(BATCH_SIZE, 3, IM_SIZE, IM_SIZE))  #random input
    dae = DAE(imSize=IM_SIZE,
              fSize=FILTER_SIZE,
              nz=NZ,
              sigma=1.0,
              multimodalZ=True)

    z_sample = dae.sample_z(BATCH_SIZE)
    print(type(z_sample), z_sample.size())

    z = z_sample.cpu().data.numpy()
    plt.figure()
    plt.hist2d(z[:, 0], z[:, 1], (100, 100))
    plt.title('2D hist for DAE with multimodalZ')
    plt.savefig('2Dhist_z.png')
Example #4
def main(model_name, expt_set, chrom, checkpoint_code=14, outfmt='npz', dataset='test', 
         train_dataset='all', moving_average=False, output_directory=None, data_directory=None):
  if expt_set in ['imp', 'imp1']:
    checkpoint_code = 14 if expt_set == 'imp' else 14.0 # these are just used to identify the weights file that is loaded
  if output_directory is None:
    output_directory = output_dir
  if data_directory is None:
    data_directory = data_dir

  print('Saving preds for {} on {} to {}'.format(model_name, dataset, output_directory))
  assert train_dataset in ['train', 'all'], 'train dataset must be either train or all'

  if dataset == 'test':
    data_gen = TestDataGeneratorHDF5(train_dataset=train_dataset, n_drop=50,
                                     chrom=chrom, directory=data_directory)
  elif dataset == 'val':
    if train_dataset == 'all':
      raise NotImplementedError()
    data_gen = ValDataGeneratorHDF5(n_drop=50, chrom=chrom, directory=data_directory)

  else:
    raise ValueError('dataset must be either test or val')

  n_predict = len(dataset_expts[dataset])
  model = DAE(obs_counts=[50, n_predict], n_drop=50, mlp_dropout=0.3,
              dae_dim=[100,50], n_train_obs=len(dataset_expts[train_dataset]))
  pred_model = model.models[n_predict]
  pred_model.compile(loss='mse', optimizer='adam')

  checkpoint = find_checkpoint(model_name, expt_set, checkpoint_code, moving_avg=moving_average,
                               weights_dir=output_directory)
  print('Loading checkpoint', checkpoint)
  pred_model.load_weights(checkpoint)

  # print('Making predictions')
  preds = pred_model.predict_generator(data_gen, verbose=1, steps=None)
  print('Pred shape', preds.shape)
  preds = np.squeeze(preds)
  print('Squeezed pred shape', preds.shape)

  imp_dir = output_directory+'{}_imputations/{}/{}/'.format(dataset, expt_set, model_name)
  os.makedirs(imp_dir, exist_ok=True)
  
  print('Saving preds')
  assert n_predict == preds.shape[1], "check names - length of name list doesn't match data"
  for track_name, track_vals in zip(dataset_expts[dataset], preds.T):
    np.savez_compressed(imp_dir + '{}.{}.{}.npz'.format(track_name, chrom, checkpoint_code), track_vals.reshape(-1))

  print('Done')
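
# For reference: reading one of the saved tracks back into memory.
# savez_compressed with a single positional array stores it under the key
# 'arr_0'. The path below just instantiates the naming pattern from the loop
# above; the track name, expt_set, model name, chromosome and checkpoint code
# are hypothetical placeholders.
import numpy as np

path = 'test_imputations/imp/chr21_train/C05M17.chr21.14.npz'  # hypothetical example
with np.load(path) as f:
    track = f['arr_0']
print(track.shape)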
Example #5
def test_DAE():
    print('testing DAE module')
    NZ = 100
    FILTER_SIZE = 2  #2 for testing
    BATCH_SIZE = 5
    IM_SIZE = 64

    x = Variable(torch.randn(BATCH_SIZE, 3, IM_SIZE, IM_SIZE))  #random input
    dae = DAE(imSize=IM_SIZE, fSize=FILTER_SIZE, nz=NZ, sigma=1.0)

    x_corr = dae.corrupt(x)
    z_enc = dae.encode(x)
    z_enc_, x_rec = dae.forward(x)
    z_sample = dae.sample_z(BATCH_SIZE)

    assert x_corr.size() == (BATCH_SIZE, 3, IM_SIZE, IM_SIZE)
    assert z_enc.size() == (BATCH_SIZE, NZ)
    assert z_enc_.size() == (BATCH_SIZE, NZ)
    assert x_rec.size() == (BATCH_SIZE, 3, IM_SIZE, IM_SIZE)
    assert z_sample.size() == (BATCH_SIZE, NZ)
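
# Not part of the repo code: a minimal sketch of the interface test_DAE
# exercises, inferred only from the asserts above (corrupt, encode, forward
# returning (z, x_rec), and sample_z). The single-layer encoder/decoder are
# stand-ins for illustration, not the real architecture.
import torch
import torch.nn as nn

class ToyDAE(nn.Module):
    def __init__(self, imSize=64, fSize=2, nz=100, sigma=1.0):
        super(ToyDAE, self).__init__()
        self.sigma, self.nz, self.imSize = sigma, nz, imSize
        # one conv + linear as stand-ins for the real encoder/decoder stacks
        self.enc = nn.Sequential(nn.Conv2d(3, fSize, 4, stride=2, padding=1),
                                 nn.ReLU())
        self.fc_enc = nn.Linear(fSize * (imSize // 2) ** 2, nz)
        self.fc_dec = nn.Linear(nz, 3 * imSize * imSize)

    def corrupt(self, x):
        # additive Gaussian corruption at level sigma
        return x + self.sigma * torch.randn_like(x)

    def encode(self, x):
        h = self.enc(x)
        return self.fc_enc(h.view(h.size(0), -1))

    def forward(self, x):
        z = self.encode(self.corrupt(x))
        x_rec = self.fc_dec(z).view(-1, 3, self.imSize, self.imSize)
        return z, x_rec

    def sample_z(self, n):
        # unimodal standard-normal prior over z
        return torch.randn(n, self.nz)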
Example #6
    trainLoader = torch.utils.data.DataLoader(trainDataset,
                                              batch_size=opts.batchSize,
                                              shuffle=True)

    testDataset = CELEBA(root=opts.root,
                         train=False,
                         transform=transforms.ToTensor())
    testLoader = torch.utils.data.DataLoader(testDataset,
                                             batch_size=opts.batchSize,
                                             shuffle=False)
    print('Data loaders ready.')

    # Create model
    dae = DAE(nz=opts.nz,
              imSize=64,
              fSize=opts.fSize,
              sigma=opts.sigma,
              multimodalZ=opts.multimodalZ)  #sigma=level of corruption
    dis, NZ = build_dis(dae=dae, multimodalZ=opts.multimodalZ)
    svm = LINEAR_SVM(nz=NZ, c=opts.c)  #model

    if dae.useCUDA:
        torch.cuda.set_device(opts.gpuNo)
        print('using gpu:', torch.cuda.current_device())
        dae.cuda()
        dis.cuda()
        svm.cuda()

    if opts.loadDAE:  #should load DAE if in eval mode
        print('loading DAE...')
        dae.load_params(opts.load_DAE_from)
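
# The snippet above reads options from an `opts` namespace that is not shown.
# A plausible argparse setup covering every attribute the excerpt touches;
# the defaults are illustrative assumptions, not the authors' values.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--root', default='data/celeba', help='CELEBA data root')
parser.add_argument('--batchSize', type=int, default=64)
parser.add_argument('--nz', type=int, default=100, help='latent dimension')
parser.add_argument('--fSize', type=int, default=64, help='base filter count')
parser.add_argument('--sigma', type=float, default=0.1, help='corruption level')
parser.add_argument('--multimodalZ', action='store_true')
parser.add_argument('--c', type=float, default=1.0, help='SVM regularisation weight')
parser.add_argument('--gpuNo', type=int, default=0)
parser.add_argument('--loadDAE', action='store_true')
parser.add_argument('--load_DAE_from', default=None)
opts = parser.parse_args()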