def run_alg(x_pub, y_pub, x_priv, y_priv, params, full_model_id):

  ##################################
  #   representation learning
  #################################
  x = x_pub
  y = y_pub

  # separate validation set if needed
  val_x = None
  #val_y = None
  if validation_split:
    logging.info("Splitting into training and validation sets")
    from sklearn.model_selection import train_test_split
    train_x, val_x, train_y, val_y = train_test_split(x, y, test_size=validation_split, random_state=0)
    x, y = train_x, train_y
    logging.info(" * training set shape: %d x %d" % x.shape)
    logging.info(" * validation set shape: %d x %d" % val_x.shape)
  
  data_dim = x.shape[1]
  logging.info(" * data shape after preprocessing: %d x %d" % x.shape)

  repr_dim = int(round(params.repr_dim))

  logging.info("Learning the representation on public data...")
  logging.info(" * learning a representation of size %d", repr_dim)
  start_time = time.time()
  
  # init the algorithm
  #alg = make_alg(data_dim, repr_dim, num_classes)
  #alg = make_alg(data_dim, repr_dim)
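  # Worked example of the enc_dims expression below: with
  # hidden_layer_size_mul_log10 = 0.5, n_hidden_layers = 2 and repr_dim = 10,
  # int(10 ** 0.5) = 3, so enc_dims = [3 * 10] * 2 = [30, 30].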
  from models.vae_pytorch import VAE
  alg = VAE().init(
    input_dim = data_dim,
    latent_dim = repr_dim,
    #enc_dims = [],
    enc_dims = [int(10 ** params.hidden_layer_size_mul_log10)*repr_dim] * int(params.n_hidden_layers),
    dec_dims = 'same',
    enc_activations = 'relu',
    dec_activations = 'relu',
    prediction_mean_activation = 'sigmoid',
    prediction_var = 'gs',
    prediction_log_var_min = math.log(0.01**2),
    normalize_input_type = 'quantiles',
    normalize_input_quantile = 0.05,
    normalize_input_axis = 'global',
    normalize_input_target = (0, 1),
    normalize_input_clip = True,
    optimizer = 'Adam',
    optimizer_params = {'lr': 10.0 ** params.learning_rate_log10},
    n_epochs = n_epochs,
    early_stopping = True,
    reduce_lr_on_plateau = False,
    batch_size = batch_size)

  # create output dir if does not exist
  #ensure_dir_exists('res')

  # define the progress saving function
  ensure_dir_exists('param_opt/progress')
  progress_filename = 'param_opt/progress/encdec-mse-%s.txt' % (full_model_id)
  progress_file = open(progress_filename, 'w', encoding='utf-8')
  #aux_progress_filename = 'param_opt/progress/aux-ce-%s.txt' % (full_model_id)
  #aux_progress_file = open(aux_progress_filename, 'w', encoding='utf-8')
  if val_x is not None:
    val_progress_filename = 'param_opt/progress/encdec-validation-mse-%s.txt' % (full_model_id)
    val_progress_file = open(val_progress_filename, 'w', encoding='utf-8')
    #aux_val_progress_filename = 'param_opt/progress/aux-validation-ce-%s.txt' % (full_model_id)
    #aux_val_progress_file = open(aux_val_progress_filename, 'w', encoding='utf-8')
  def save_progress():
    x_pred = alg.decode(alg.encode(x))
    rel_mse = relative_mean_squared_error(x, x_pred)
    progress_file.write("%g\n" % rel_mse)
    #aux_pred = alg.predict_secondary(x)
    #aux_rel_ce = relative_cross_entropy(y, aux_pred)
    #aux_progress_file.write("%g\n" % aux_rel_ce)
    if val_x is not None:
      val_x_pred = alg.decode(alg.encode(val_x))
      rel_mse = relative_mean_squared_error(val_x, val_x_pred)
      val_progress_file.write("%g\n" % rel_mse)
      #val_aux_pred = alg.predict_secondary(val_x)
      #aux_rel_ce = relative_cross_entropy(val_y, val_aux_pred)
      #aux_val_progress_file.write("%g\n" % aux_rel_ce)
  
  # fit to the training data
  ensure_dir_exists("param_opt/log/")
  alg.learn(x, validation_data=val_x,
            log_file_prefix=("param_opt/log/%s" % (full_model_id)),
            per_epoch_callback_funs=[save_progress],
            deadline=None, max_duration=repr_max_duration)
  # close the progress files now that training has finished
  progress_file.close()
  if val_x is not None:
    val_progress_file.close()

  # test reconstruction error
  x_pred = alg.decode(alg.encode(x))
  rel_mse = relative_mean_squared_error(x, x_pred)
  if val_x is not None:
    val_x_pred = alg.decode(alg.encode(val_x))
    val_rel_mse = relative_mean_squared_error(val_x, val_x_pred)
  else:
    val_rel_mse = np.nan
  logging.info(" * final error: rel_mse = %g, val_rel_mse = %g",
               rel_mse, val_rel_mse)

  elapsed = time.time() - start_time
  logging.info(" * running time = %s", pretty_duration(elapsed))


  ##################################
  #   representation mapping
  #################################

  x = x_priv
  y = y_priv

  # get the representation
  logging.info("Making the representation of private data...")
  x_repr = alg.encode(x)

  # test to predict the data itself
  x_pred = alg.decode(x_repr)
  rel_mse = relative_mean_squared_error(x, x_pred)
  logging.info(" * reconstruct the data: rel_mse = %g", rel_mse)

  ##################################
  #   prediction
  #################################

  x = x_repr

  # private or non-private logistic regression
  private = True

  # test prediction with cross validation
  logging.info("Prediction with %d-fold cross validation...", pred_cv_folds)
  from sklearn.model_selection import StratifiedKFold
  cv = StratifiedKFold(n_splits=pred_cv_folds, shuffle=True, random_state=0)
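  # stratified folds keep the class proportions of y roughly equal across
  # folds, so rare classes appear in every test split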
  avg_test_acc = 0
  for fold, (train, test) in enumerate(cv.split(x, y)):
    logging.info("Fold %d...", fold)
    x_train, x_test, y_train, y_test = x[train], x[test], y[train], y[test]
  
    # init rng  
    #np.random.seed(seed0)

    logging.info("Bounding the data to 1-sphere...")
    if scale_fun == "norm_max":
      logging.info(" * scale by max norm")
      scale_factor = np.amax(np.linalg.norm(x_train, axis=1))
    elif scale_fun == "dims_max":
      logging.info(" * scale each dimension by max absolute value")
      scale_factor = np.amax(np.abs(x_train), axis=0)
    elif scale_fun == "norm_avg":
      logging.info(" * scale by average norm")
      scale_factor = np.mean(np.linalg.norm(x_train, axis=1))
    elif scale_fun == "dims_std":
      logging.info(" * scale each dimension by standard deviation")
      scale_factor = np.std(x_train, axis=0)
    elif scale_fun == "none":
      scale_factor = 1.0
    else:
      raise ValueError("unknown scale_fun: %s" % scale_fun)

    x_train /= scale_factor * scale_const
    x_test /= scale_factor * scale_const
    if clip == "norm":
      logging.info(" * clip norms to max 1")
      x_train /= np.maximum(np.linalg.norm(x_train, axis=1, keepdims=True) * (1 + bounding_slack), 1)
      x_test /= np.maximum(np.linalg.norm(x_test, axis=1, keepdims=True) * (1 + bounding_slack),1)
    elif clip == "dims":
      assert False, "not implemented"
    elif clip == "none":
      logging.info(" * no clipping -> no bounding")
      assert private == False #or np.isinf(epsilon)
    else:
      assert False
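    # After the scaling/clipping above, every row of x_train and x_test has
    # L2 norm at most 1 (up to bounding_slack); this bounded per-example norm
    # is what the sensitivity analysis of the DP logistic regression below
    # relies on.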

    # fit
    logging.info("Fitting a model...")
    if private:
      logging.info(" * DP logistic regression: epsilon=%g, alpha=%g", epsilon, regularizer_strength)
      from models.logistic_regression import DPLogisticRegression
      model = DPLogisticRegression().init(repr_dim, classes=np.unique(y),
                                          alpha=regularizer_strength, epsilon=epsilon)
    else:
      logging.info(" * logistic regression: alpha=%g", regularizer_strength)
      from sklearn.linear_model import LogisticRegression
      model = LogisticRegression(C=1/regularizer_strength)
    
    model.fit(x_train, y_train)
    #print(model.predict(x_test))

    # compute mean accuracy on test set
    logging.info("Testing the model...")
    #acc = model.score(x_test, y_test)
    from sklearn.metrics import accuracy_score
    train_acc = accuracy_score(y_train, model.predict(x_train))
    test_acc = accuracy_score(y_test, model.predict(x_test))
    logging.info(" * train accuracy = %.6f", train_acc)
    logging.info(" * test accuracy = %.6f", test_acc)
    avg_test_acc += test_acc
  
  avg_test_acc /= pred_cv_folds
  logging.info("Average test accuracy = %.6f", avg_test_acc)
  
  return avg_test_acc
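
A minimal, hypothetical driver for run_alg above. It assumes the module-level knobs the function reads (validation_split, n_epochs, batch_size, repr_max_duration, pred_cv_folds, epsilon, regularizer_strength, scale_fun, scale_const, clip, bounding_slack) are configured elsewhere in the script; the data here is random stand-in data, not the real dataset:

from types import SimpleNamespace
import numpy as np

rng = np.random.RandomState(0)
x_pub, y_pub = rng.rand(200, 50), rng.randint(0, 3, size=200)
x_priv, y_priv = rng.rand(100, 50), rng.randint(0, 3, size=100)

params = SimpleNamespace(
    repr_dim=10,                      # size of the learned representation
    n_hidden_layers=2,                # encoder depth
    hidden_layer_size_mul_log10=0.5,  # hidden width = int(10**0.5) * repr_dim
    learning_rate_log10=-3.0)         # Adam learning rate = 1e-3

avg_acc = run_alg(x_pub, y_pub, x_priv, y_priv, params, "demo-model")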
Example #2
def task(args):
    import pandas
    repr_dim, (alg_id, _, make_alg), seed = args
    logging.info("dataset = %s, algorithm = %s", data_set, alg_id)
    # read the data sets
    logging.info("Reading data...")
    data = pandas.read_hdf("data/%s.h5" % (data_set), data_type)
    logging.info(" * gene expression shape: %d x %d" % data.shape)

    #aux_target = pandas.read_hdf("data/TCGA_cancertype.h5", 'cancer_types')
    #logging.info(" * auxiliary target size: %d" % aux_target.shape)

    #common_samples = data.index.intersection(aux_target.index)
    #data = data.loc[common_samples]
    #aux_target = aux_target.loc[common_samples]
    #logging.info(" * number of common samples: %d" % common_samples.size)

    from common import categorical_to_binary

    x = data.to_numpy()  # .as_matrix() was removed in newer pandas
    #y = categorical_to_binary(aux_target.values)
    #num_classes = y.shape[1]

    #x = x[:,0:2000]

    # normalize the input to _total_ unit variance and per-feature zero mean
    if normalize_data:
        x -= np.mean(x)
        x /= np.std(x)
        x -= np.mean(x, axis=0)

    # FIXME!
    #x = (x - np.amin(x,axis=0)) / (np.amax(x,axis=0) - np.amin(x,axis=0))
    #x = (x - np.amin(x)) / (np.amax(x) - np.amin(x))

    # init rng
    np.random.seed(seed)
    import torch
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)

    # separate validation set if needed
    val_x = None
    #val_y = None
    if validation_split:
        logging.info("Splitting into training and validation sets")
        m = x.shape[0]
        perm = np.random.permutation(m)
        x = x[perm, :]
        #y = y[perm,:]
        split_point = int(validation_split * m)
        (val_x, x) = (x[:split_point, :], x[split_point:, :])
        #(val_y, y) = (y[:split_point,:], y[split_point:,:])
        logging.info(" * training set shape: %d x %d" % x.shape)
        logging.info(" * validation set shape: %d x %d" % val_x.shape)

    data_dim = x.shape[1]
    logging.info(" * data shape after preprocessing: %d x %d" % x.shape)

    logging.info("Running the algorithm...")
    logging.info(" * learning a representation of size %d", repr_dim)
    start_time = time.time()

    # init the algorithm
    #alg = make_alg(data_dim, repr_dim, num_classes)
    alg = make_alg(data_dim, repr_dim)

    # create output dir if does not exist
    ensure_dir_exists('res')

    full_model_id = "%s-%d-%s-s%d%s" % (data_set, repr_dim, alg_id, seed,
                                        id_suffix)

    # define the progress saving function
    progress_filename = 'res/progress-encdec-mse-%s.txt' % (full_model_id)
    progress_file = open(progress_filename, 'w', encoding='utf-8')
    #aux_progress_filename = 'res/progress-aux-ce-%s.txt' % (full_model_id)
    #aux_progress_file = open(aux_progress_filename, 'w', encoding='utf-8')
    if val_x is not None:
        val_progress_filename = 'res/progress-encdec-validation-mse-%s.txt' % (
            full_model_id)
        val_progress_file = open(val_progress_filename, 'w', encoding='utf-8')
        #aux_val_progress_filename = 'res/progress-aux-validation-ce-%s.txt' % (full_model_id)
        #aux_val_progress_file = open(aux_val_progress_filename, 'w', encoding='utf-8')
    def save_progress():
        x_pred = alg.decode(alg.encode(x))
        rel_mse = relative_mean_squared_error(x, x_pred)
        progress_file.write("%g\n" % rel_mse)
        #aux_pred = alg.predict_secondary(x)
        #aux_rel_ce = relative_cross_entropy(y, aux_pred)
        #aux_progress_file.write("%g\n" % aux_rel_ce)
        if val_x is not None:
            val_x_pred = alg.decode(alg.encode(val_x))
            rel_mse = relative_mean_squared_error(val_x, val_x_pred)
            val_progress_file.write("%g\n" % rel_mse)
            #val_aux_pred = alg.predict_secondary(val_x)
            #aux_rel_ce = relative_cross_entropy(val_y, val_aux_pred)
            #aux_val_progress_file.write("%g\n" % aux_rel_ce)

    # fit to the training data
    alg.learn(x,
              validation_data=val_x,
              log_file_prefix=("log/%s" % (full_model_id)),
              per_epoch_callback_funs=[save_progress],
              deadline=deadline,
              max_duration=max_duration)
    # close the progress files now that training has finished
    progress_file.close()
    if val_x is not None:
        val_progress_file.close()

    # test reconstruction error
    x_pred = alg.decode(alg.encode(x))
    rel_mse = relative_mean_squared_error(x, x_pred)
    if val_x is not None:
        val_x_pred = alg.decode(alg.encode(val_x))
        val_rel_mse = relative_mean_squared_error(val_x, val_x_pred)
    else:
        val_rel_mse = np.nan
    logging.info(" * final error: rel_mse = %g, val_rel_mse = %g", rel_mse,
                 val_rel_mse)

    elapsed = time.time() - start_time
    logging.info(" * running time = %s", pretty_duration(elapsed))

    # save model
    logging.info("Saving the learned model...")
    ensure_dir_exists('repr_models')
    alg.save("repr_models/%s" % (full_model_id))
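
The helper relative_mean_squared_error used throughout these examples comes from the project's common module. A plausible minimal sketch consistent with how it is called here (an assumption, not the project's actual implementation):

import numpy as np

def relative_mean_squared_error(x_true, x_pred):
    # MSE of the prediction relative to the MSE of the best constant
    # (per-feature mean) predictor; 1.0 means "no better than the mean"
    mse = np.mean((x_true - x_pred) ** 2)
    baseline = np.mean((x_true - np.mean(x_true, axis=0)) ** 2)
    return mse / baseline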
Example #3
def learn_repr(x, y, params, full_model_id):

    # separate validation set if needed
    val_x = None
    #val_y = None
    if params.repr_learn_validation_split:
        logging.info("Splitting into training and validation sets")
        from sklearn.model_selection import train_test_split
        train_x, val_x, train_y, val_y = train_test_split(
            x, y, test_size=params.repr_learn_validation_split, random_state=0)
        x, y = train_x, train_y
        logging.info(" * training set shape: %d x %d" % x.shape)
        logging.info(" * validation set shape: %d x %d" % val_x.shape)

    data_dim = x.shape[1]
    logging.info(" * data shape after preprocessing: %d x %d" % x.shape)

    repr_dim = int(round(params.repr_dim))

    logging.info("Learning the representation on public data...")
    logging.info(" * learning a representation of size %d", repr_dim)
    start_time = time.time()

    (_, _, _, make_alg, _) = select_repr_alg(params.repr_alg)

    # init the algorithm
    #alg = make_alg(data_dim, repr_dim, num_classes)
    #alg = make_alg(data_dim, repr_dim)
    alg = make_alg(data_dim, repr_dim, params)
    # create output dir if does not exist
    #ensure_dir_exists('res')

    # define the progress saving function
    ensure_dir_exists('param_opt/progress')
    progress_filename = 'param_opt/progress/encdec-mse-%s.txt' % (
        full_model_id)
    progress_file = open(progress_filename, 'w', encoding='utf-8')
    #aux_progress_filename = 'param_opt/progress/aux-ce-%s.txt' % (full_model_id)
    #aux_progress_file = open(aux_progress_filename, 'w', encoding='utf-8')
    if val_x is not None:
        val_progress_filename = 'param_opt/progress/encdec-validation-mse-%s.txt' % (
            full_model_id)
        val_progress_file = open(val_progress_filename, 'w', encoding='utf-8')
        #aux_val_progress_filename = 'param_opt/progress/aux-validation-ce-%s.txt' % (full_model_id)
        #aux_val_progress_file = open(aux_val_progress_filename, 'w', encoding='utf-8')
    def save_progress():
        x_pred = alg.decode(alg.encode(x))
        rel_mse = relative_mean_squared_error(x, x_pred)
        progress_file.write("%g\n" % rel_mse)
        #aux_pred = alg.predict_secondary(x)
        #aux_rel_ce = relative_cross_entropy(y, aux_pred)
        #aux_progress_file.write("%g\n" % aux_rel_ce)
        if val_x is not None:
            val_x_pred = alg.decode(alg.encode(val_x))
            rel_mse = relative_mean_squared_error(val_x, val_x_pred)
            val_progress_file.write("%g\n" % rel_mse)
            #val_aux_pred = alg.predict_secondary(val_x)
            #aux_rel_ce = relative_cross_entropy(val_y, val_aux_pred)
            #aux_val_progress_file.write("%g\n" % aux_rel_ce)

    # fit to the training data
    ensure_dir_exists("param_opt/log/")
    alg.learn(x,
              validation_data=val_x,
              log_file_prefix=("param_opt/log/%s" % (full_model_id)),
              per_epoch_callback_funs=[save_progress],
              deadline=None,
              max_duration=params.repr_learn_max_duration)
    # close the progress files now that training has finished
    progress_file.close()
    if val_x is not None:
        val_progress_file.close()

    # test reconstruction error
    x_pred = alg.decode(alg.encode(x))
    rel_mse = relative_mean_squared_error(x, x_pred)
    if val_x is not None:
        val_x_pred = alg.decode(alg.encode(val_x))
        val_rel_mse = relative_mean_squared_error(val_x, val_x_pred)
    else:
        val_rel_mse = np.nan
    logging.info(" * final error: rel_mse = %g, val_rel_mse = %g", rel_mse,
                 val_rel_mse)

    elapsed = time.time() - start_time
    logging.info(" * running time = %s", pretty_duration(elapsed))

    return alg
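
learn_repr unpacks the factory returned by select_repr_alg, which is defined elsewhere in the project. A hypothetical stub showing the interface learn_repr assumes (a 5-tuple whose fourth element builds a model exposing learn/encode/decode):

def select_repr_alg(name):
    # Hypothetical stub for illustration only; the real lookup table
    # lives elsewhere in the project.
    if name == 'vae':
        from models.vae_pytorch import VAE
        def make_alg(data_dim, repr_dim, params):
            return VAE().init(input_dim=data_dim, latent_dim=repr_dim)
        return (name, None, None, make_alg, None)
    raise ValueError("unknown repr_alg: %s" % name)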
def task(args):
    import pandas
    param_id, priv_cancertypes, seed = args
    logging.info("priv classes = %s, params_id = %s, seed = %d",
                 priv_cancertypes, param_id, seed)
    #repr_dim, (alg_id, _, make_alg), seed = args
    #logging.info("algorithm = %s, seed = %d", alg_id, seed)
    # read the data sets
    alg_id = param_id
    logging.info("Loading parameters...")
    params = np.load("run_parameters/params.npy", allow_pickle=True)
    params = params[param_id]
    logging.info("Reading data...")
    gene_expr = pandas.read_hdf("data/%s.h5" % (data_set), data_type)
    logging.info(" * gene expression shape: %d x %d" % gene_expr.shape)

    logging.info("Filtering out genes with low expressions...")
    low_expr = (np.median(gene_expr, axis=0) < 0.0)
    gene_expr = gene_expr.iloc[:, ~low_expr]
    logging.info(" * %d of %d remaining (%d removed)" %
                 (sum(~low_expr), low_expr.size, sum(low_expr)))

    logging.info("Loading cancer types...")
    cancer_type = pandas.read_hdf("data/%s.h5" % (target_set), target_type)
    assert np.array_equal(gene_expr.index, cancer_type.index)

    # split
    logging.info("Splitting...")
    priv = cancer_type.isin(priv_cancertypes)
    logging.info(" * %d private samples, %d public samples (of %d total)" %
                 (sum(priv), sum(~priv), priv.size))

    from common import categorical_to_binary

    x_pub = gene_expr[~priv].to_numpy()
    y_pub = cancer_type[~priv].cat.codes.to_numpy()
    x_priv = gene_expr[priv].to_numpy()
    y_priv = cancer_type[priv].cat.codes.to_numpy()
    #y = categorical_to_binary(aux_target.values)
    #num_classes = y.shape[1]

    data_name = '-'.join(priv_cancertypes).replace(' ', '_')

    # A hack to have a different seed if the algorithm is run multiple times
    # with the same parameters. Destroys reproducibility...
    import time
    seed0 = int(time.time() * 100) % (2**32)
    # init rng
    np.random.seed(seed0)
    import torch
    torch.manual_seed(seed0)
    if torch.cuda.is_available() and torch.cuda.device_count() > 0:
        torch.cuda.manual_seed(seed0)

    ##################################
    #   representation learning
    #################################
    x = x_pub
    y = y_pub

    # separate validation set if needed
    val_x = None
    #val_y = None
    if validation_split:
        logging.info("Splitting into training and validation sets")
        from sklearn.model_selection import train_test_split
        train_x, val_x, train_y, val_y = train_test_split(
            x, y, test_size=validation_split, random_state=0)
        x, y = train_x, train_y
        #m = x.shape[0]
        #perm = np.random.permutation(m)
        #x = x[perm,:]
        #y = y[perm,:]
        #split_point = int(validation_split * m)
        #(val_x, x) = (x[:split_point,:], x[split_point:,:])
        #(val_y, y) = (y[:split_point,:], y[split_point:,:])
        logging.info(" * training set shape: %d x %d" % x.shape)
        logging.info(" * validation set shape: %d x %d" % val_x.shape)

    data_dim = x.shape[1]
    logging.info(" * data shape after preprocessing: %d x %d" % x.shape)

    logging.info("Learning the representaiton on public data...")
    logging.info(" * learning a representation of size %d", repr_dim)
    start_time = time.time()

    # init the algorithm
    #alg = make_alg(data_dim, repr_dim, num_classes)
    #alg = make_alg(data_dim, repr_dim)
    from models.vae_pytorch import VAE
    alg = VAE().init(
        input_dim=data_dim,
        latent_dim=repr_dim,
        #enc_dims = [],
        enc_dims=[int(10**params.hidden_layer_size_mul_log10) * repr_dim] *
        int(params.n_hidden_layers),
        dec_dims='same',
        enc_activations='relu',
        dec_activations='relu',
        prediction_mean_activation='sigmoid',
        prediction_var='gs',
        prediction_log_var_min=math.log(0.01**2),
        normalize_input_type='quantiles',
        normalize_input_quantile=0.05,
        normalize_input_axis='global',
        normalize_input_target=(0, 1),
        normalize_input_clip=True,
        optimizer='Adam',
        optimizer_params={'lr': 10.0**params.learning_rate_log10},
        n_epochs=n_epochs,
        early_stopping=True,
        reduce_lr_on_plateau=False,
        batch_size=batch_size)

    # create output dir if does not exist
    ensure_dir_exists('res')

    full_model_id = "%s-%d-%s-s%d%s" % (data_name, repr_dim, alg_id, seed,
                                        id_suffix)

    # define the progress saving function
    progress_filename = 'res/progress-encdec-mse-%s.txt' % (full_model_id)
    progress_file = open(progress_filename, 'w', encoding='utf-8')
    #aux_progress_filename = 'res/progress-aux-ce-%s.txt' % (full_model_id)
    #aux_progress_file = open(aux_progress_filename, 'w', encoding='utf-8')
    if val_x is not None:
        val_progress_filename = 'res/progress-encdec-validation-mse-%s.txt' % (
            full_model_id)
        val_progress_file = open(val_progress_filename, 'w', encoding='utf-8')
        #aux_val_progress_filename = 'res/progress-aux-validation-ce-%s.txt' % (full_model_id)
        #aux_val_progress_file = open(aux_val_progress_filename, 'w', encoding='utf-8')
    def save_progress():
        x_pred = alg.decode(alg.encode(x))
        rel_mse = relative_mean_squared_error(x, x_pred)
        progress_file.write("%g\n" % rel_mse)
        #aux_pred = alg.predict_secondary(x)
        #aux_rel_ce = relative_cross_entropy(y, aux_pred)
        #aux_progress_file.write("%g\n" % aux_rel_ce)
        if val_x is not None:
            val_x_pred = alg.decode(alg.encode(val_x))
            rel_mse = relative_mean_squared_error(val_x, val_x_pred)
            val_progress_file.write("%g\n" % rel_mse)
            #val_aux_pred = alg.predict_secondary(val_x)
            #aux_rel_ce = relative_cross_entropy(val_y, val_aux_pred)
            #aux_val_progress_file.write("%g\n" % aux_rel_ce)

    # fit to the training data
    alg.learn(x,
              validation_data=val_x,
              log_file_prefix=("log/%s" % (full_model_id)),
              per_epoch_callback_funs=[save_progress],
              deadline=deadline,
              max_duration=max_duration)
    # close the progress files now that training has finished
    progress_file.close()
    if val_x is not None:
        val_progress_file.close()

    # test reconstruction error
    x_pred = alg.decode(alg.encode(x))
    rel_mse = relative_mean_squared_error(x, x_pred)
    if val_x is not None:
        val_x_pred = alg.decode(alg.encode(val_x))
        val_rel_mse = relative_mean_squared_error(val_x, val_x_pred)
    else:
        val_rel_mse = np.nan
    logging.info(" * final error: rel_mse = %g, val_rel_mse = %g", rel_mse,
                 val_rel_mse)

    elapsed = time.time() - start_time
    logging.info(" * running time = %s", pretty_duration(elapsed))

    # save model
    #logging.info("Saving the learned model...")
    #ensure_dir_exists('repr_models')
    #alg.save("repr_models/%s" % (full_model_id))

    ##################################
    #   representation mapping
    #################################

    x = x_priv
    y = y_priv

    # get the representation
    logging.info("Making the representation of private data...")
    x_repr = alg.encode(x)

    # test to predict the data itself
    x_pred = alg.decode(x_repr)
    rel_mse = relative_mean_squared_error(x, x_pred)
    logging.info(" * reconstruct the data: rel_mse = %g", rel_mse)
    ensure_dir_exists("res")
    with open("res/private-encdec-rel_mse-%d-%s-%s-s%d%s.txt" %
              (repr_dim, data_name, alg_id, seed, id_suffix),
              'w',
              encoding='utf-8') as f:
        f.write("%.6f\n" % rel_mse)

    # save the representation
    #logging.info("Saving the representation...")
    #ensure_dir_exists("data_repr")
    #np.savetxt("data_repr/repr-%s-%d-%s-s%d%s.csv" %
    #           (data_name, repr_dim, alg_id, seed, id_suffix),
    #           x_repr, delimiter=',')

    ##################################
    #   prediction
    #################################

    x = x_repr

    # split train and test sets
    logging.info("Splitting to train and test sets...")
    from sklearn.model_selection import train_test_split
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=pred_test_size, random_state=0)
    logging.info(" * train samples: %d" % x_train.shape[0])
    logging.info(" * test samples: %d" % x_test.shape[0])

    # init rng
    np.random.seed(seed0)

    #print(np.amax(np.linalg.norm(x_train, axis=1)))
    #print(np.mean(np.linalg.norm(x_train, axis=1)))

    logging.info("Bounding the data to 1-sphere...")
    if scale_fun == "norm_max":
        logging.info(" * scale by max norm")
        scale_factor = np.amax(np.linalg.norm(x_train, axis=1))
    elif scale_fun == "dims_max":
        logging.info(" * scale each dimension by max absolute value")
        scale_factor = np.amax(np.abs(x_train), axis=0)
    elif scale_fun == "norm_avg":
        logging.info(" * scale by average norm")
        scale_factor = np.mean(np.linalg.norm(x_train, axis=1))
    elif scale_fun == "dims_std":
        logging.info(" * scale each dimension by standard deviation")
        scale_factor = np.std(x_train, axis=0)
    elif scale_fun == "none":
        scale_factor = 1.0
    else:
        raise ValueError("unknown scale_fun: %s" % scale_fun)

    x_train /= scale_factor * scale_const
    x_test /= scale_factor * scale_const
    # models to fit: only the DP version by default; use [False, True] to
    # also fit a non-private baseline
    private_models = [True]
    if clip == "norm":
        logging.info(" * clip norms to max 1")
        x_train /= np.maximum(
            np.linalg.norm(x_train, axis=1, keepdims=True) *
            (1 + bounding_slack), 1)
        x_test /= np.maximum(
            np.linalg.norm(x_test, axis=1, keepdims=True) *
            (1 + bounding_slack), 1)
    elif clip == "dims":
        assert False, "not implemented"
    elif clip == "none":
        logging.info(" * no clipping -> no bounding")
        assert private == False  #or np.isinf(epsilon)
    else:
        assert False

    for private in private_models:
        # fit
        logging.info("Fitting a model...")
        if private:
            logging.info(" * DP logistic regression: epsilon=%g, alpha=%g",
                         epsilon, regularizer_strength)
            from models.logistic_regression import DPLogisticRegression
            model = DPLogisticRegression().init(repr_dim,
                                                classes=np.unique(y),
                                                alpha=regularizer_strength,
                                                epsilon=epsilon)
        else:
            logging.info(" * logistic regression: alpha=%g",
                         regularizer_strength)
            from sklearn.linear_model import LogisticRegression
            model = LogisticRegression(C=1 / regularizer_strength)

        model.fit(x_train, y_train)
        #print(model.predict(x_test))

        # compute mean accuracy on test set
        logging.info("Testing the model...")
        #acc = model.score(x_test, y_test)
        from sklearn.metrics import accuracy_score
        train_acc = accuracy_score(y_train, model.predict(x_train))
        test_acc = accuracy_score(y_test, model.predict(x_test))
        logging.info(" * train accuracy = %.6f", train_acc)
        logging.info(" * test accuracy = %.6f", test_acc)

        logging.info("Writing results to disk...")
        ensure_dir_exists("res")
        filename = (
            "res/cancertype-pred-accuracy-%d-%s-%s-s%d-%s-%d-%s%s.txt" %
            (repr_dim, data_name, alg_id, seed, scale_fun, scale_const, clip,
             ("-e%g" % (epsilon) if private else "-nonpriv")))
        logging.info(" * filename: %s", filename)
        with open(filename, 'w', encoding='utf-8') as f:
            f.write("%.6f\n" % test_acc)

        filename = "param_opt/opt_result%s-%s.txt" % (id_suffix, full_model_id)
        with open(filename, 'w', encoding='utf-8') as f:
            f.write("%.6f\n" % test_acc)