Esempi in Python per ensure_dir_exists, esempi in Python per common.ensure_dir_exists

Esempio n. 1

0

Mostra file

File: test_pca_explained_variance.py Progetto: jamesdu0504/dp-representation-transfer

def task(args):
    """Fit a PCA to one data set and save its explained-variance ratios.

    ``args`` is accepted but not read; the data set name, the HDF key and
    the normalisation switch come from ``data_set``, ``data_type`` and
    ``normalize_data``, which are defined outside this function.

    The ratios are written with ``np.savetxt`` to
    ``res/pca-explained-variance/<data_set>.txt``.
    """
    import pandas
    logging.info("dataset = %s", data_set)
    # read the data sets
    logging.info("Reading data...")
    data = pandas.read_hdf("data/%s.h5" % (data_set), data_type)
    logging.info(" * gene expression shape: %d x %d" % data.shape)

    # DataFrame.as_matrix() was removed in pandas 1.0; .values exists in all
    # versions. np.array() copies, so the in-place arithmetic below always
    # works on a writable array and never touches the frame itself.
    x = np.array(data.values)

    if normalize_data:
        # these shouldn't affect the results
        x -= np.mean(x)
        x /= np.std(x)
        x -= np.mean(x, axis=0)

    logging.info("Running PCA...")
    pca = sk_PCA()
    pca.fit(x)

    logging.info("Writing results...")
    res_dir = 'res/pca-explained-variance'
    res_filename = "%s/%s.txt" % (res_dir, data_set)
    ensure_dir_exists(res_dir)
    np.savetxt(res_filename, pca.explained_variance_ratio_)

Esempio n. 2

0

Mostra file

File: optimize_repr_learning_params.py Progetto: jamesdu0504/dp-representation-transfer

def run_batch(args, params):
  """Run one batch of parameter settings and return their mean scores.

  Each entry of ``params`` is converted with ``get_params(p, domain)``, the
  converted list is saved to ``run_parameters/params.npy``, and
  ``batch.run_tasks(args)`` is called with ``args.wait`` set to True.

  Afterwards one result file per (parameter id, validation split, seed) is
  read from ``param_opt/`` and deleted. A file that cannot be read or
  removed contributes ``gpyopt_fail_res`` instead.

  Returns:
    Array of shape ``(gpyopt_batch_size, 1)`` holding, per parameter id,
    the average over all validation splits and seeds.
  """
  import os

  ensure_dir_exists("run_parameters")
  params = [get_params(p, domain) for p in params]
  np.save("run_parameters/params.npy", params)
  assert len(params) == gpyopt_batch_size
  args.wait = True
  batch.run_tasks(args)
  # get results
  res = np.zeros((gpyopt_batch_size, 1))
  for param_id in range(gpyopt_batch_size):
    tot_res = 0
    for val_cancertypes in val_cancer_type_splits:
      data_name = '-'.join(val_cancertypes).replace(' ', '_')
      for seed in seeds:
        full_model_id = "%s-%s-%d%s" % (param_id, data_name, seed, id_suffix)
        filename = "param_opt/opt_result-%s.txt" % (full_model_id)
        # Decide on ONE value per run before accumulating: previously a
        # successful load followed by a failing os.remove() added both the
        # loaded value and the failure score to the total.
        try:
          run_res = np.loadtxt(filename)
          os.remove(filename)
        except Exception:
          # `Exception` (not a bare except) so Ctrl-C / SystemExit still
          # abort the batch instead of being recorded as a failed run.
          run_res = gpyopt_fail_res
          logging.info('Warning, could not load "%s"' % filename)
        tot_res += run_res
    res[param_id] = tot_res / (len(val_cancer_type_splits) * len(seeds))
  return res

Esempio n. 3

0

Mostra file

def run_learning_and_mapping(args,
                             fixed_params,
                             task_param,
                             seeds,
                             slurm_args=None):
    """Submit one 'learn_and_map' task per seed through ``batch2.run_tasks``.

    ``task_param`` is shallow-copied once per entry of ``seeds``; each copy
    gets its position as ``param_id`` and the matching seed as ``seed``.
    The shared parameters are the attributes of ``fixed_params`` plus an
    empty ``priv_cancertype_pairs``, the flattened ``cancer_type_pairs`` as
    ``pub_cancertypes`` and ``task_type='learn_and_map'``.

    ``args.wait`` is set to True before the tasks are submitted. Nothing is
    returned.
    """
    logging.info('Running final tests with...')
    import copy

    # One independent (shallow) copy of the template per seed.
    task_params = [copy.copy(task_param) for _ in seeds]
    for index, (per_seed_param, seed) in enumerate(zip(task_params, seeds)):
        per_seed_param.param_id = index
        per_seed_param.seed = seed

    ensure_dir_exists("run_parameters")

    common_params = SimpleNamespace(
        **fixed_params.__dict__,
        priv_cancertype_pairs=[],
        pub_cancertypes=sum(cancer_type_pairs, []),
        task_type='learn_and_map',
    )

    args.wait = True
    batch_file = "run_parameters/batch-%s.pkl" % (fixed_params.test_name)
    batch2.run_tasks(args,
                     common_params,
                     task_params,
                     slurm_args=slurm_args,
                     params_file=batch_file)

Esempio n. 4

0

Mostra file

File: worker_thread.py Progetto: parallel-fs-utils/fs-drift

 def cleanup_files(self):
     """Flush stderr/stdout, then delete and recreate both working trees.

     The network shared path is deleted before the top directory; the top
     directory is recreated before the network shared path.
     """
     for stream in (sys.stderr, sys.stdout):
         stream.flush()

     settings = self.params
     for stale_tree in (settings.network_shared_path,
                        settings.top_directory):
         deltree(stale_tree)
     for fresh_dir in (settings.top_directory,
                       settings.network_shared_path):
         ensure_dir_exists(fresh_dir)

Esempio n. 5

0

Mostra file

def task(args):
  """Train an encoder/decoder on one data split and record its MSE.

  Args:
    args: ``((data_type, repr_dim), seed, (algName, _, makeAlg))``.
      ``makeAlg(data_dim, repr_dim)`` must return an object providing
      ``learn``, ``encode`` and ``decode``.

  Side effects:
    * ``res/progress-encdec-mse-<data_type>-<seed>-<algName>.txt`` receives
      one relative-MSE line per invocation of the learning callback.
    * ``pred/final-encdec-...`` is written with ``np.save`` when the
      externally defined ``save_pred`` is truthy.
    * ``res/final-encdec-mse-...txt`` receives the final mse / rel_mse.
  """
  (data_type, repr_dim), seed, (algName, _, makeAlg) = args
  logging.info("datatype = %s, seed = %d, algorithm = %s", data_type, seed, algName)
  # read the data sets
  logging.info("Reading data...")
  y_train, x_train, y_test, x_test = dataReader.main("%s_%d" % (data_type, seed))
  data_dim = x_train.shape[1]
  logging.info(" * training set: %d x %d" % x_train.shape)
  logging.info(" * testing set: %d x %d" % x_test.shape)
  # init rng
  np.random.seed(seed)

  logging.info("Running and evaluating the algorithm...")
  logging.info(" * using representation with dimension = %d", repr_dim)

  # init the algorithm
  alg = makeAlg(data_dim, repr_dim)

  # create output dir if does not exist
  ensure_dir_exists('res')

  progress_filename = 'res/progress-encdec-mse-%s-%d-%s.txt' % (data_type, seed, algName)
  # The progress file used to be opened and never closed. Keeping it in a
  # `with` around learn() guarantees it is flushed and closed, even when
  # training raises.
  # NOTE(review): assumes the callback is only invoked during learn() — confirm.
  with open(progress_filename, 'w', encoding='utf-8') as progress_file:
    def save_progress():
      # x_test is still the held-out set here; it is only rebound to the
      # training set after learn() has returned.
      x_test_pred = alg.decode(alg.encode(x_test))
      rel_mse = relative_mean_squared_error(x_test, x_test_pred)
      progress_file.write("%g\n" % rel_mse)

    # fit to the training data
    alg.learn(x_train,
              log_file_prefix=("log/%s-%d-%s" % (data_type, seed, algName)),
              callbacks=[save_progress])

  # TODO: remove? (the final evaluation below runs on the TRAINING data)
  x_test = x_train

  # test with the testing data
  x_test_pred = alg.decode(alg.encode(x_test))
  ensure_dir_exists('pred')
  pred_filename = 'pred/final-encdec-%s-%d-%s' % (data_type, seed, algName)
  if save_pred:
    np.save(pred_filename, x_test_pred)
  mse = mean_squared_error(x_test, x_test_pred)
  rel_mse = relative_mean_squared_error(x_test, x_test_pred)

  logging.info("Result: rel_mse = %g", rel_mse)
  logging.info("Writing results to a file...")
  res_filename = 'res/final-encdec-mse-%s-%d-%s.txt' % (data_type, seed, algName)
  with open(res_filename, 'w', encoding='utf-8') as f:
    f.write("data = %-16s seed = %-4d alg = %-10s " % (data_type, seed, algName))
    f.write("mse = %.6f  " % mse)
    f.write("rel_mse = %.6f  " % rel_mse)
    f.write("\n")

Esempio n. 6

0

Mostra file

def task(args):
    """Encode a data set with a loaded model and save the representation.

    Args:
        args: ``(repr_dim, (alg_id, seed, load_model))``.
            ``load_model(repr_dim)`` must return an object providing
            ``encode`` and ``decode``.

    The data set name, HDF key, normalisation switch and file-name parts
    (``data_set``, ``data_type``, ``normalize_data``, ``aux_data_set``,
    ``id_suffix``) are defined outside this function.

    Side effects:
        * ``res/private-encdec-rel_mse-...txt`` gets the relative MSE of
          reconstructing the input from its representation.
        * ``data_repr/repr-...csv`` gets the representation itself.
    """
    repr_dim, (alg_id, seed, load_model) = args
    logging.info("representation size = %d, algorithm = %s, seed = %d",
                 repr_dim, alg_id, seed)

    # read the PADS gene expression data
    logging.info("Reading gene expression data...")
    import pandas
    data = pandas.read_hdf("data/%s.h5" % (data_set), data_type)
    # DataFrame.as_matrix() was removed in pandas 1.0; .values exists in all
    # versions. np.array() copies, so the in-place normalisation below
    # always works on a writable array.
    x = np.array(data.values)
    logging.info(" * data shape: %d x %d" % x.shape)

    #logging.info("Filter and normalize...")
    ## load gene names that appear also in TCGA data
    #tcga_gene_names = np.array(getHDF5data("data/%s_genes.h5" % (aux_data_set),
    #                                       True, False)[0], dtype=str)
    #in_tcga = np.array([(gene_name in tcga_gene_names) for gene_name in gene_names])
    #assert(np.sum(in_tcga) == len(tcga_gene_names))
    ## use only those genes
    #x = x[:,in_tcga]

    # normalize the input to _total_ unit variance and zero mean
    if normalize_data:
        x -= np.mean(x)
        x /= np.std(x)
        x -= np.mean(x, axis=0)

    logging.info(" * data shape after preprocessing: %d x %d" % x.shape)

    # init rng
    np.random.seed(seed)

    # load the model
    logging.info("Loading the model...")
    alg = load_model(repr_dim)

    # get the representation
    logging.info("Computing the representation or size %d..." % (repr_dim))
    x_repr = alg.encode(x)

    # test to predict the data itself
    x_pred = alg.decode(x_repr)
    rel_mse = relative_mean_squared_error(x, x_pred)
    logging.info(" * reconstruct the data: rel_mse = %g", rel_mse)
    ensure_dir_exists("res")
    with open("res/private-encdec-rel_mse-%d-%s-%s-s%d%s.txt" %
              (repr_dim, aux_data_set, alg_id, seed, id_suffix),
              'w',
              encoding='utf-8') as f:
        f.write("%.6f\n" % rel_mse)

    # save the representation
    logging.info("Saving the representation...")
    ensure_dir_exists("data_repr")
    np.savetxt("data_repr/repr-%d-%s-%s-s%d%s.csv" %
               (repr_dim, aux_data_set, alg_id, seed, id_suffix),
               x_repr,
               delimiter=',')

Esempio n. 7

0

Mostra file

File: optimize_test_repr_learning_params2.py Progetto: jamesdu0504/dp-representation-transfer

def run_optimization_batch(args,
                           fixed_params,
                           task_params,
                           seeds,
                           slurm_args=None):
    """Evaluate a batch of parameter settings with fold-wise validation.

    Every entry of ``task_params`` is tagged in place with its index
    (``param_id``) and the matching entry of ``seeds``. The cancer-type
    pairs other than ``fixed_params.priv_cancertype_pair`` are split into
    ``fixed_params.param_opt_folds`` folds by index modulo the fold count.
    For each fold the batch is submitted via ``batch2.run_tasks`` with that
    fold as validation pairs and the rest (flattened) as public types.

    After each fold, ``param_opt/opt_result-<test_name>-<param_id>.txt`` is
    read and deleted. If that fails, the entry is set to
    ``gpyopt_fail_res`` plus Gaussian noise scaled by
    ``gpyopt_fail_res_std``.

    Returns:
        Array of shape ``(len(task_params), 1)``: the mean over folds.
    """
    import os

    ensure_dir_exists("run_parameters")
    param_ids = range(len(task_params))
    for param, seed, param_id in zip(task_params, seeds, param_ids):
        param.param_id = param_id
        param.seed = seed
    assert len(task_params) == gpyopt_batch_size

    nonpriv_cancertype_pairs = [
        ctp for ctp in cancer_type_pairs
        if ctp != fixed_params.priv_cancertype_pair
    ]
    assert len(nonpriv_cancertype_pairs) == len(cancer_type_pairs) - 1

    res = np.zeros((len(task_params), fixed_params.param_opt_folds))
    for fold in range(fixed_params.param_opt_folds):
        val_cancertype_pairs = [
            ctp for (i, ctp) in enumerate(nonpriv_cancertype_pairs)
            if i % fixed_params.param_opt_folds == fold
        ]
        learn_cancertype_pairs = [
            ctp for (i, ctp) in enumerate(nonpriv_cancertype_pairs)
            if i % fixed_params.param_opt_folds != fold
        ]
        assert (len(val_cancertype_pairs) +
                len(learn_cancertype_pairs) == len(nonpriv_cancertype_pairs))

        common_params = SimpleNamespace(
            **fixed_params.__dict__,
            priv_cancertype_pairs=val_cancertype_pairs,
            pub_cancertypes=sum(learn_cancertype_pairs, []),
            task_type='paramopt',
        )
        args.wait = True
        batch2.run_tasks(args,
                         common_params,
                         task_params,
                         slurm_args=slurm_args,
                         params_file=("run_parameters/batch-%s.pkl" %
                                      (fixed_params.test_name)))
        # get results
        for param_id in param_ids:
            full_model_id = "%s-%s" % (fixed_params.test_name, param_id)
            filename = "param_opt/opt_result-%s.txt" % (full_model_id)
            try:
                res[param_id, fold] = np.loadtxt(filename)
                # The same file name is reused by the next fold, so a result
                # that cannot be removed is deliberately treated as a failure
                # rather than risk being read again as stale data.
                os.remove(filename)
            except Exception:
                # `Exception` (not a bare except) so Ctrl-C / SystemExit
                # abort the run instead of being scored as a failed task.
                res[param_id, fold] = gpyopt_fail_res + np.random.randn(
                ) * gpyopt_fail_res_std
                logging.info('Warning, could not load "%s"' % filename)
    return np.mean(res, axis=1, keepdims=True)

Esempio n. 8

0

Mostra file

def rclone_upload(local_path: str,
                  rc_remote_path: str,
                  rc_bwlimit: str = None,
                  rc_logfile: str = None,
                  rc_dry_run: bool = False) -> None:
    """Use rclone to move something.

    Runs ``rclone move <local_path> <rc_remote_path>`` as a subprocess.
    https://rclone.org/docs/

    Args:
        local_path: Source path handed to rclone.
        rc_remote_path: Destination handed to rclone.
        rc_bwlimit: If truthy, passed as the value of ``--bwlimit``.
        rc_logfile: If truthy, rclone logs to this file at DEBUG level.
        rc_dry_run: If truthy, ``--dry-run`` is appended.

    Raises:
        AssertionError: If rclone exits with a nonzero return code.

    The subprocess' stdout and stderr are written to
    ``debug/ia2rc.rclone_upload.stdout.txt`` / ``...stderr.txt``.
    """
    # Must stay the first statement so locals() holds only the arguments.
    logging.debug('rclone_upload() args={0!r}'.format(locals()))  # SUPER DEBUG
    logging.info(
        'Using rclone to move local_path={l!r} to rc_remote_path={r!r}'.format(
            l=local_path, r=rc_remote_path))
    # command - prepare args
    # https://rclone.org/commands/rclone_move/
    cmd = [
        'rclone',
        'move',
        local_path,
        rc_remote_path,
    ]
    if rc_bwlimit:  # Throttle/ratelimit
        cmd.extend(['--bwlimit', '{0}'.format(rc_bwlimit)])
    if rc_logfile:  # Verbose debugging info. https://rclone.org/docs/#log-level-level
        cmd.extend(
            ['--log-file={0}'.format(rc_logfile), '--log-level', 'DEBUG'])
    if rc_dry_run:  # https://rclone.org/docs/#n-dry-run
        cmd.append('--dry-run')
    logging.debug('cmd={0!r}'.format(cmd))
    # command - execute
    common.ensure_dir_exists(dir_path=os.path.join(
        'debug'))  # Protect against missing dir for stdout/stderr temp files.
    stdout_path = os.path.join('debug', 'ia2rc.rclone_upload.stdout.txt')
    stderr_path = os.path.join('debug', 'ia2rc.rclone_upload.stderr.txt')
    # File objects required to capture stdout and stderr.
    with open(stdout_path, 'w') as f_stdout, \
            open(stderr_path, 'w') as f_stderr:
        cmd_res = subprocess.run(
            args=cmd,
            encoding='utf8',
            stdout=f_stdout,
            stderr=f_stderr,
        )
    # command - capture and tolerate result
    logging.debug(
        'cmd={0!r}'.format(cmd))  # Extra-detailed logging for dev only
    logging.debug('cmd_res={0!r}'.format(cmd_res))
    logging.debug('cmd_res.returncode={0!r}'.format(cmd_res.returncode))
    if cmd_res.returncode != 0:
        # Nonzero means problems occurred. Raised explicitly instead of via
        # `assert` so the check is not stripped under `python -O`; the
        # exception type stays AssertionError for existing callers.
        # TODO: Check return code better -2020-07-21.
        raise AssertionError(
            'rclone move failed: returncode={0!r}, stderr captured in {1!r}'.
            format(cmd_res.returncode, stderr_path))
    logging.info(
        'Finished rclone move local_path={l!r} to rc_remote_path={r!r}'.format(
            l=local_path, r=rc_remote_path))

Esempio n. 9

0

Mostra file

File: test_vae.py Progetto: jamesdu0504/dp-representation-transfer

def task(args):
    """Train and evaluate an encoder/decoder on synthetic 2-D data.

    Args:
        args: ``(seed, (algName, _, makeAlg))``.
            ``makeAlg(data_dim, repr_dim)`` must return an object providing
            ``learn``, ``encode`` and ``decode``.

    1000 standard-normal 2-D points are drawn after seeding NumPy with
    ``seed`` and multiplied by a fixed 2x2 matrix. The same array serves as
    training and test data. ``repr_dim`` and ``save_pred`` are defined
    outside this function.

    When ``save_pred`` is truthy the data and its reconstruction are saved
    with ``np.save`` under ``data/generated/`` and ``pred/``. The relative
    MSE is only logged.
    """
    seed, (algName, _, makeAlg) = args
    data_type = "vae_test"
    logging.info("datatype = %s, seed = %d, algorithm = %s", data_type, seed,
                 algName)
    # init rng, then generate the correlated Gaussian test data
    np.random.seed(seed)
    x = np.random.normal(0.0, 1.0, (1000, 2))
    x = np.dot(x, np.array([[5.0, 3.0], [0.3, -0.5]]))
    data_dim = x.shape[1]
    logging.info(" * training set: %d x %d" % x.shape)
    logging.info(" * testing set: %d x %d" % x.shape)

    logging.info("Running and evaluating the algorithm...")
    logging.info(" * using representation with dimension = %d", repr_dim)

    # init the algorithm
    alg = makeAlg(data_dim, repr_dim)

    # fit to the training data
    alg.learn(x, log_file_prefix=("log/%s-%d-%s" % (data_type, seed, algName)))

    x_test = x

    # test with the testing data
    x_test_pred = alg.decode(alg.encode(x_test))
    ensure_dir_exists('pred')
    data_filename = 'data/generated/%s-%d' % (data_type, seed)
    pred_filename = 'pred/final-encdec-%s-%d-%s' % (data_type, seed, algName)
    if save_pred:
        # Only 'pred' used to be created, so saving the generated data
        # failed on a fresh checkout without a data/generated directory.
        ensure_dir_exists('data/generated')
        np.save(data_filename, x_test)
        np.save(pred_filename, x_test_pred)
    # mse is computed but not reported here; only rel_mse is logged.
    mse = mean_squared_error(x_test, x_test_pred)
    rel_mse = relative_mean_squared_error(x_test, x_test_pred)

    logging.info("Result: rel_mse = %g", rel_mse)

Esempio n. 10

0

Mostra file

def create_top_dirs(prm):
    """Recreate ``prm.network_shared_path`` as a fresh directory.

    An existing directory at that path is removed first. When
    ``prm.host_set`` is not an empty list, sleeps are inserted after the
    removal and after the re-creation so that other hosts can observe the
    change.
    """
    multi_host = (prm.host_set != [])
    shared_dir = prm.network_shared_path

    if os.path.exists(shared_dir):
        shutil.rmtree(shared_dir)
        if multi_host:
            # give every remote client time to notice the directory vanished
            # before it is recreated
            time.sleep(2.1)

    common.ensure_dir_exists(shared_dir)

    if not multi_host:
        return

    # listing the directory is a workaround that forces cross-host sync
    os.listdir(shared_dir)
    # wait long enough for the NFS mount option actimeo=1 to take effect
    time.sleep(1.1)

Esempio n. 11

0

Mostra file

File: clippingomega.py Progetto: jamesdu0504/dp-representation-transfer

def task(args):
  """Compute the two clipping values for (n, d, e) and write them to disk.

  ``args`` unpacks to ``(n, d, e)``. When ``n`` is zero or ``e`` is
  infinite both values are 0.0; otherwise they come from
  ``dp.omega(n, d, e, mcmc)``. Each value is written to its own text file
  under ``drugsens_params/clipping``.
  """
  import diffpri as dp
  n, d, e = args
  logging.info("n = %d, d = %d, e = %s", n, d, e)

  # no pv data -> no clipping
  skip_clipping = n == 0 or np.isinf(e)
  if skip_clipping:
    wx, wy = 0.0, 0.0
  else:
    wx, wy = dp.omega(n,d,e,mcmc)

  ensure_dir_exists("drugsens_params/clipping")
  outputs = (
    ("drugsens_params/clipping/wx_n%d_d%d_e%s.txt", wx),
    ("drugsens_params/clipping/wy_n%d_d%d_e%s.txt", wy),
  )
  for name_pattern, value in outputs:
    with open(name_pattern % (n, d, e), 'w') as out:
      out.write("%s" % value)

Esempio n. 12

0

Mostra file

File: optimize_repr_learning_params.py Progetto: jamesdu0504/dp-representation-transfer

def run_optimization(args, domain, constraints, batch_size, max_iter, max_duration=None, deadline=None):
  """Batch Bayesian optimisation of the parameters with GPyOpt.

  An initial random design of ``batch_size`` points is evaluated with
  ``run_batch``. Then up to ``max_iter`` further batches are suggested by
  ``GPyOpt.methods.BayesianOptimization`` and evaluated. Results are
  negated before being handed to GPyOpt (with ``maximize=False``), so
  larger results are preferred. After every iteration all parameters and
  results are saved under ``param_opt/``.

  Args:
    args: Passed through to ``run_batch``.
    domain: GPyOpt domain description.
    constraints: GPyOpt constraints for the initial design space.
    batch_size: Number of parameter settings per batch.
    max_iter: Maximum number of optimisation iterations.
    max_duration: Optional ``timedelta``; ``now + max_duration`` becomes
      the deadline if it is earlier than ``deadline`` (or none was given).
    deadline: Optional ``datetime`` after which no new iteration starts.

  Returns:
    The row of evaluated parameters with the largest result.
  """
  logging.info('Starting parameter optimization...')
  import GPyOpt
  ensure_dir_exists("param_opt")

  if max_duration is not None:
    new_dl = datetime.datetime.now() + max_duration
    if deadline is None or new_dl < deadline:
      deadline = new_dl

  initial_design_type = 'random'
  initial_design_numdata = batch_size
  logging.info('Selecting initial parameters...')
  space = GPyOpt.core.task.space.Design_space(domain, constraints)
  params = GPyOpt.experiment_design.initial_design(initial_design_type, space, initial_design_numdata)
  logging.info('Running...')
  results = run_batch(args, params)
  all_params = params
  all_results = results
  for i in range(max_iter):
    print(np.hstack((all_params, all_results)), flush=True)
    logging.info('Selecting a new set of parameters...')
    bo = GPyOpt.methods.BayesianOptimization(f=None,
                                              domain = domain,
                                              X = all_params,
                                              Y = -all_results,
                                              acquisition_type = 'EI',
                                              normalize_Y = True,
                                              evaluator_type = 'local_penalization',
                                              batch_size = batch_size,
                                              acquisition_jitter = 0,
                                              maximize = False)

    params = bo.suggest_next_locations()
    logging.info('Running...')
    results = run_batch(args, params)
    all_params = np.vstack((all_params, params))
    all_results = np.vstack((all_results, results))
    np.save("param_opt/opt_params%s.npy" % id_suffix, all_params)
    np.save("param_opt/opt_results%s.npy" % id_suffix, all_results)

    # With neither max_duration nor deadline given there is no time limit;
    # comparing a datetime against None would raise TypeError.
    if deadline is not None and datetime.datetime.now() >= deadline:
      logging.info('Gpyopt iteration %d: Time based stopping' % (i))
      break

  # Was `np.results`, which does not exist and raised AttributeError after
  # the whole optimisation had already run.
  return all_params[np.argmax(all_results)]

Esempio n. 13

0

Mostra file

File: optimize_test_repr_learning_params2.py Progetto: jamesdu0504/dp-representation-transfer

def run_test(args, fixed_params, task_param, seeds, slurm_args=None):
    """Run the final test once per seed and save the collected results.

    ``task_param`` is shallow-copied once per entry of ``seeds``; each copy
    gets its index as ``param_id`` and its seed. The tasks are submitted
    via ``batch2.run_tasks`` with ``fixed_params.priv_cancertype_pair`` as
    the only private pair and all other pairs (flattened) as public types.

    Each ``param_opt/opt_result-<test_name>-<param_id>.txt`` is then read
    and deleted; an entry that cannot be read becomes NaN. The results are
    written to ``res/test_results-<test_name>.txt``. Nothing is returned.
    """
    logging.info('Running final tests with...')
    import copy
    import os
    task_params = [copy.copy(task_param) for _ in seeds]
    param_ids = range(len(task_params))
    for param, seed, param_id in zip(task_params, seeds, param_ids):
        param.param_id = param_id
        param.seed = seed
    ensure_dir_exists("run_parameters")

    nonpriv_cancertype_pairs = [
        ctp for ctp in cancer_type_pairs
        if ctp != fixed_params.priv_cancertype_pair
    ]
    assert len(nonpriv_cancertype_pairs) == len(cancer_type_pairs) - 1

    common_params = SimpleNamespace(
        **fixed_params.__dict__,
        priv_cancertype_pairs=[fixed_params.priv_cancertype_pair],
        pub_cancertypes=sum(nonpriv_cancertype_pairs, []),
        task_type='test',
    )
    args.wait = True
    batch2.run_tasks(args,
                     common_params,
                     task_params,
                     slurm_args=slurm_args,
                     params_file=("run_parameters/batch-%s.pkl" %
                                  (fixed_params.test_name)))
    # collect the per-seed results
    res = np.zeros((len(task_params), 1))
    for param_id in param_ids:
        full_model_id = "%s-%s" % (fixed_params.test_name, param_id)
        filename = "param_opt/opt_result-%s.txt" % (full_model_id)
        try:
            res[param_id] = np.loadtxt(filename)
            os.remove(filename)
        except Exception:
            # `Exception` (not a bare except) so Ctrl-C / SystemExit still
            # abort the run instead of being recorded as a missing result.
            res[param_id] = np.nan  #gpyopt_fail_res
            logging.info('Warning, could not load "%s"' % filename)
    # The output directory was never created in this function, so the
    # final savetxt could fail after all tasks had already run.
    ensure_dir_exists("res")
    filename = "res/test_results-%s.txt" % (fixed_params.test_name)
    logging.info("Writing final results to '%s'" % filename)
    np.savetxt(filename, res)

Esempio n. 14

0

Mostra file

def run_optimization_batch(args,
                           fixed_params,
                           task_params,
                           seeds,
                           slurm_args=None):
    """Evaluate one batch of parameter settings and return their scores.

    Every entry of ``task_params`` is tagged in place with its index
    (``param_id``) and the matching entry of ``seeds``. The batch is
    submitted via ``batch2.run_tasks`` with the attributes of
    ``fixed_params`` plus ``pred_cancertypes`` (from ``val_cancertypes``),
    ``skip_cancertypes`` (from ``priv_cancertypes``) and
    ``task_type='paramopt'``.

    Each ``param_opt/opt_result-<test_name>-<param_id>.txt`` is then read
    and deleted. If that fails, the entry is set to ``gpyopt_fail_res``
    plus Gaussian noise scaled by ``gpyopt_fail_res_std``.

    Returns:
        Array of shape ``(len(task_params), 1)``.
    """
    import os

    ensure_dir_exists("run_parameters")
    param_ids = range(len(task_params))
    for param, seed, param_id in zip(task_params, seeds, param_ids):
        param.param_id = param_id
        param.seed = seed
    assert len(task_params) == gpyopt_batch_size
    common_params = SimpleNamespace(
        **fixed_params.__dict__,
        pred_cancertypes=fixed_params.val_cancertypes,
        skip_cancertypes=fixed_params.priv_cancertypes,
        task_type='paramopt',
    )
    args.wait = True
    batch2.run_tasks(args,
                     common_params,
                     task_params,
                     slurm_args=slurm_args,
                     params_file=("run_parameters/batch-%s.pkl" %
                                  (fixed_params.test_name)))
    # get results
    res = np.zeros((len(task_params), 1))
    for param_id in param_ids:
        full_model_id = "%s-%s" % (fixed_params.test_name, param_id)
        filename = "param_opt/opt_result-%s.txt" % (full_model_id)
        try:
            res[param_id] = np.loadtxt(filename)
            os.remove(filename)
        except Exception:
            # `Exception` (not a bare except) so Ctrl-C / SystemExit abort
            # the run instead of being scored as a failed task.
            res[param_id] = gpyopt_fail_res + np.random.randn(
            ) * gpyopt_fail_res_std
            logging.info('Warning, could not load "%s"' % filename)
    return res

Esempio n. 15

0

Mostra file

def run_test(args, fixed_params, task_param, seeds, slurm_args=None):
    """Run the test for every (cancer-type pair, seed) and save the results.

    One ``SimpleNamespace`` task is built per combination of an entry of
    ``cancer_type_pairs`` and an entry of ``seeds`` (pair-major order) and
    tagged with its index as ``param_id``. ``task_param`` is not read.

    After ``batch2.run_tasks`` each
    ``param_opt/opt_result-<test_name>-<param_id>.txt`` is read and
    deleted; an entry that cannot be read becomes NaN. The results are
    reshaped to ``(len(cancer_type_pairs), len(seeds))`` and written to
    ``res/test_results-<test_name>.txt``. Nothing is returned.
    """
    import os

    logging.info('Running tests...')

    task_params = [[
        SimpleNamespace(
            pred_cancertypes=priv_cancertypes,
            seed=seed,
        ) for seed in seeds
    ] for priv_cancertypes in cancer_type_pairs]
    task_params = sum(task_params, [])  # flatten pair-major
    param_ids = range(len(task_params))

    for param, param_id in zip(task_params, param_ids):
        param.param_id = param_id

    ensure_dir_exists("run_parameters")
    common_params = SimpleNamespace(**fixed_params.__dict__, )
    args.wait = True
    batch2.run_tasks(args,
                     common_params,
                     task_params,
                     slurm_args=slurm_args,
                     params_file=("run_parameters/batch-%s.pkl" %
                                  (fixed_params.test_name)))
    # collect the per-task results
    res = np.zeros((len(task_params), 1))
    for param_id in param_ids:
        full_model_id = "%s-%s" % (fixed_params.test_name, param_id)
        filename = "param_opt/opt_result-%s.txt" % (full_model_id)
        try:
            res[param_id] = np.loadtxt(filename)
            os.remove(filename)
        except Exception:
            # `Exception` (not a bare except) so Ctrl-C / SystemExit still
            # abort the run instead of being recorded as a missing result.
            res[param_id] = np.nan
            logging.info('Warning, could not load "%s"' % filename)
    res = np.reshape(res, (len(cancer_type_pairs), len(seeds)))
    # The output directory was never created in this function, so the
    # final savetxt could fail after all tasks had already run.
    ensure_dir_exists("res")
    filename = "res/test_results-%s.txt" % (fixed_params.test_name)
    logging.info("Writing final results to '%s'" % filename)
    np.savetxt(filename, res)

Esempio n. 16

0

Mostra file

def task(args):
  """Train an encoder to reproduce a PCA projection and log its progress.

  Args:
    args: ``(data_type, seed, (algName, _, makeAlg))``.
      ``makeAlg(data_dim, repr_dim)`` must return an object providing
      ``learn`` and ``encode``. ``repr_dim`` is defined outside this
      function.

  The test inputs are replaced by the training inputs. The targets read
  from disk are replaced by the ``repr_dim``-component scikit-learn PCA
  projection of those inputs. One relative-MSE line per callback
  invocation is written to
  ``res/progress-enc-mse-<data_type>-<seed>-<algName>.txt``.
  """
  data_type, seed, (algName, _, makeAlg) = args
  logging.info("datatype = %s, seed = %d, algorithm = %s", data_type, seed, algName)
  # read the data sets
  logging.info("Reading data...")
  y_train, x_train, y_test, x_test = dataReader.main("%s_%d" % (data_type, seed))
  data_dim = x_train.shape[1]
  logging.info(" * training set: %d x %d" % x_train.shape)
  logging.info(" * testing set: %d x %d" % x_test.shape)
  # init rng
  np.random.seed(seed)

  # evaluate on the training inputs
  x_test = x_train

  logging.info("Running and evaluating the algorithm...")

  # init the algorithm
  alg = makeAlg(data_dim, repr_dim)

  # create output dir if does not exist
  ensure_dir_exists('res')

  # the learning targets are the PCA projections of the inputs
  from sklearn.decomposition import PCA as sk_PCA
  pca = sk_PCA(n_components=repr_dim)
  pca.fit(x_train)
  y_train = pca.transform(x_train)
  y_test = pca.transform(x_test)

  progress_filename = 'res/progress-enc-mse-%s-%d-%s.txt' % (data_type, seed, algName)
  # The progress file used to be opened and never closed. Keeping it in a
  # `with` around learn() guarantees it is flushed and closed, even when
  # training raises.
  # NOTE(review): assumes the callback is only invoked during learn() — confirm.
  with open(progress_filename, 'w', encoding='utf-8') as progress_file:
    def save_progress():
      y_test_pred = alg.encode(x_test)
      rel_mse = relative_mean_squared_error(y_test, y_test_pred)
      progress_file.write("%g\n" % rel_mse)

    # fit
    alg.learn(x_train, y_train,
              log_file_prefix=("log/%s-%d-%s" % (data_type, seed, algName)),
              callbacks=[save_progress])

Esempio n. 17

0

Mostra file

File: learn_and_predict_tcga_split_cancertypes_optim.py Progetto: jamesdu0504/dp-representation-transfer

def run_optimization(args, domain, constraints, batch_size, max_iter):
    """Batch Bayesian optimisation of the parameters with GPyOpt.

    A random initial design of ``batch_size`` points is evaluated with
    ``run_batch``. Then ``max_iter`` further batches are suggested by
    ``GPyOpt.methods.BayesianOptimization`` and evaluated. After every
    iteration the accumulated parameters and results are saved under
    ``param_opt/``. Nothing is returned.
    """
    logging.info('Starting parameter optimization...')
    import GPyOpt
    ensure_dir_exists("param_opt")

    logging.info('Selecting initial parameters...')
    design_space = GPyOpt.core.task.space.Design_space(domain, constraints)
    batch_params = GPyOpt.experiment_design.initial_design(
        'random', design_space, batch_size)

    logging.info('Running...')
    batch_results = run_batch(args, batch_params)
    all_params, all_results = batch_params, batch_results

    for _ in range(max_iter):
        for history in (all_params, all_results):
            print(history, flush=True)

        logging.info('Selecting a new set of parameters...')
        optimizer_options = dict(
            f=None,
            domain=domain,
            X=all_params,
            Y=all_results,
            acquisition_type='EI',
            normalize_Y=True,
            evaluator_type='local_penalization',
            batch_size=batch_size,
            acquisition_jitter=0,
        )
        bo = GPyOpt.methods.BayesianOptimization(**optimizer_options)
        batch_params = bo.suggest_next_locations()

        logging.info('Running...')
        batch_results = run_batch(args, batch_params)

        # append the new batch and checkpoint the full history
        all_params = np.vstack((all_params, batch_params))
        all_results = np.vstack((all_results, batch_results))
        np.save("param_opt/opt_params%s.npy" % id_suffix, all_params)
        np.save("param_opt/opt_results%s.npy" % id_suffix, all_results)

Esempio n. 18

0

Mostra file

def task(args):
    """Plot how each representation component projects back to gene space.

    Args:
        args: ``(repr_dim, (alg_id, load_model))``.
            ``load_model(repr_dim)`` must return an object providing
            ``encode`` and ``decode``.

    For every component ``i`` all other components are fixed to their
    mean, the result is decoded, and the per-column standard deviation of
    the decoded data is drawn as a bar chart in the i-th subplot. The
    figure is saved to ``figs/tcga_repr_projections/d<repr_dim>_<alg_id>.png``.

    ``data_set`` and ``normalize_data`` are defined outside this function.
    """
    repr_dim, (alg_id, load_model) = args
    logging.info("representation size = %d, algorithm = %s", repr_dim, alg_id)

    # read the GDSC gene expression data
    logging.info("Reading gene expression data...")
    import pandas
    data = pandas.read_hdf("data/%s.h5" % (data_set),
                           'redistributed_gene_expressions')
    # DataFrame.as_matrix() was removed in pandas 1.0; .values exists in all
    # versions. np.array() copies, so the in-place normalisation below
    # always works on a writable array.
    x = np.array(data.values)
    logging.info(" * data shape: %d x %d" % x.shape)

    # normalize the input to _total_ unit variance and zero mean
    if normalize_data:
        x -= np.mean(x)
        x /= np.std(x)

    # init rng
    np.random.seed(0)

    # load the model
    logging.info("Loading the model...")
    alg = load_model(repr_dim)

    # get the representation
    logging.info("Computing the representation or size %d..." % (repr_dim))
    x_repr = alg.encode(x)

    repr_avg = np.mean(x_repr, axis=0)

    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt

    # squeeze=False always yields a 2-D array of Axes. With the default,
    # repr_dim == 1 returns a bare Axes object and axes[i] fails.
    fig, axes = plt.subplots(nrows=repr_dim,
                             ncols=1,
                             figsize=(16, 10),
                             sharex=True,
                             sharey=True,
                             squeeze=False)

    logging.info("Computing and plotting projections...")
    x_repr_onedim = np.empty(x_repr.shape)
    for i in range(repr_dim):
        logging.info("  * component %d/%d" % (i + 1, repr_dim))
        # keep only component i varying; pin every other one to its mean
        x_repr_onedim[:, :] = repr_avg
        x_repr_onedim[:, i] = x_repr[:, i]
        repr_proj = alg.decode(x_repr_onedim)
        proj_std = np.std(repr_proj, axis=0)
        axes[i, 0].bar(np.arange(x.shape[1]),
                       proj_std,
                       color='b',
                       edgecolor='none')
        # NOTE(review): plt.ylabel/xlabel act on the *current* axes, not on
        # axes[i] — kept as in the original; confirm this is intended.
        plt.ylabel("projection std")
        plt.xlabel("gene")

    ensure_dir_exists("figs/tcga_repr_projections")
    figname = "figs/tcga_repr_projections/d%d_%s.png" % (repr_dim, alg_id)
    plt.savefig(figname, format='png', dpi=200)
    plt.close(fig)

Esempio n. 19

0

Mostra file

File: optimize_repr_learning_params.py Progetto: jamesdu0504/dp-representation-transfer

def run_alg(x_pub, y_pub, x_priv, y_priv, params, full_model_id):
  """Learn a representation on public data and score it on private data.

  A VAE is trained on ``x_pub``, the private samples are encoded with it and
  a classifier is evaluated on the encoded private samples with stratified
  cross validation.

  Besides its arguments the function reads settings defined elsewhere in
  this file: ``validation_split``, ``n_epochs``, ``batch_size``,
  ``repr_max_duration``, ``pred_cv_folds``, ``scale_fun``, ``scale_const``,
  ``clip``, ``bounding_slack``, ``epsilon`` and ``regularizer_strength``.

  Args:
    x_pub: public samples used to learn the representation.
    y_pub: labels of the public samples; only used to split off the
      validation set.
    x_priv: private samples that are encoded and used for prediction.
    y_priv: labels of the private samples.
    params: object providing ``repr_dim``, ``hidden_layer_size_mul_log10``,
      ``n_hidden_layers`` and ``learning_rate_log10``.
    full_model_id: string embedded in the progress and log file names.

  Returns:
    The test accuracy averaged over the cross-validation folds.
  """
  from contextlib import ExitStack

  ##################################
  #   representation learning
  #################################
  x = x_pub
  y = y_pub

  # separate validation set if needed
  val_x = None
  if validation_split:
    logging.info("Splitting into training and validation sets")
    from sklearn.model_selection import train_test_split
    x, val_x, y, _ = train_test_split(x, y, test_size=validation_split, random_state=0)
    logging.info(" * training set shape: %d x %d" % x.shape)
    logging.info(" * validation set shape: %d x %d" % val_x.shape)

  data_dim = x.shape[1]
  logging.info(" * data shape after preprocessing: %d x %d" % x.shape)

  repr_dim = int(round(params.repr_dim))

  logging.info("Learning the representation on public data...")
  logging.info(" * learning a representation of size %d", repr_dim)
  start_time = time.time()

  # init the algorithm
  from models.vae_pytorch import VAE
  hidden_dim = int(10 ** params.hidden_layer_size_mul_log10) * repr_dim
  alg = VAE().init(
    input_dim = data_dim,
    latent_dim = repr_dim,
    enc_dims = [hidden_dim] * int(params.n_hidden_layers),
    dec_dims = 'same',
    enc_activations = 'relu',
    dec_activations = 'relu',
    prediction_mean_activation = 'sigmoid',
    prediction_var = 'gs',
    prediction_log_var_min = math.log(0.01**2),
    normalize_input_type = 'quantiles',
    normalize_input_quantile = 0.05,
    normalize_input_axis = 'global',
    normalize_input_target = (0, 1),
    normalize_input_clip = True,
    optimizer = 'Adam',
    optimizer_params = {'lr': 10.0 ** params.learning_rate_log10},
    n_epochs = n_epochs,
    early_stopping = True,
    reduce_lr_on_plateau = False,
    batch_size = batch_size)

  # The progress files are only written by the per-epoch callback, so they
  # are closed as soon as training has finished (or failed).
  ensure_dir_exists('param_opt/progress')
  with ExitStack() as open_files:
    progress_filename = 'param_opt/progress/encdec-mse-%s.txt' % (full_model_id)
    progress_file = open_files.enter_context(
        open(progress_filename, 'w', encoding='utf-8'))
    val_progress_file = None
    if val_x is not None:
      val_progress_filename = 'param_opt/progress/encdec-validation-mse-%s.txt' % (full_model_id)
      val_progress_file = open_files.enter_context(
          open(val_progress_filename, 'w', encoding='utf-8'))

    def save_progress():
      """Append the current relative reconstruction errors to the progress files."""
      x_pred = alg.decode(alg.encode(x))
      rel_mse = relative_mean_squared_error(x, x_pred)
      progress_file.write("%g\n" % rel_mse)
      if val_x is not None:
        val_x_pred = alg.decode(alg.encode(val_x))
        rel_mse = relative_mean_squared_error(val_x, val_x_pred)
        val_progress_file.write("%g\n" % rel_mse)

    # fit to the training data
    ensure_dir_exists("param_opt/log/")
    alg.learn(x, validation_data=val_x,
              log_file_prefix=("param_opt/log/%s" % (full_model_id)),
              per_epoch_callback_funs=[save_progress],
              deadline=None, max_duration=repr_max_duration)

  # test reconstruction error
  x_pred = alg.decode(alg.encode(x))
  rel_mse = relative_mean_squared_error(x, x_pred)
  if val_x is not None:
    val_x_pred = alg.decode(alg.encode(val_x))
    val_rel_mse = relative_mean_squared_error(val_x, val_x_pred)
  else:
    # no validation set was split off, so there is nothing to evaluate
    val_rel_mse = np.nan
  logging.info(" * final error: rel_mse = %g, val_rel_mse = %g",
              rel_mse, val_rel_mse)

  elapsed = time.time() - start_time
  logging.info(" * running time = %s", pretty_duration(elapsed))


  ##################################
  #   representation mapping
  #################################

  x = x_priv
  y = y_priv

  # get the representation
  logging.info("Making the representation of private data...")
  x_repr = alg.encode(x)

  # test to predict the data itself
  x_pred = alg.decode(x_repr)
  rel_mse = relative_mean_squared_error(x, x_pred)
  logging.info(" * reconstruct the data: rel_mse = %g", rel_mse)

  ##################################
  #   prediction
  #################################

  x = x_repr

  # private or non-private logistic regression
  private = True

  # test prediction with cross validation
  logging.info("Prediction with %d-fold cross validation...", pred_cv_folds)
  from sklearn.model_selection import StratifiedKFold
  from sklearn.metrics import accuracy_score
  cv = StratifiedKFold(n_splits=pred_cv_folds, shuffle=True, random_state=0)
  avg_test_acc = 0
  for fold, (train, test) in enumerate(cv.split(x, y)):
    logging.info("Fold %d...", fold)
    # index-array selection copies the rows, so the in-place scaling below
    # does not modify x
    x_train, x_test, y_train, y_test = x[train], x[test], y[train], y[test]

    logging.info("Bounding the data to 1-sphere...")
    if scale_fun == "norm_max":
      logging.info(" * scale by max norm")
      scale_factor = np.amax(np.linalg.norm(x_train, axis=1))
    elif scale_fun == "dims_max":
      logging.info(" * scale each dimension by max absolute value")
      scale_factor = np.amax(np.abs(x_train), axis=0)
    elif scale_fun == "norm_avg":
      logging.info(" * scale by average norm")
      scale_factor = np.mean(np.linalg.norm(x_train, axis=1))
    elif scale_fun == "dims_std":
      logging.info(" * scale each dimension by standard deviation")
      scale_factor = np.std(x_train, axis=0)
    elif scale_fun == "none":
      scale_factor = 1.0
    else:
      assert False, "unknown scale_fun"

    # the scale is computed from the training fold only and applied to both
    x_train /= scale_factor * scale_const
    x_test /= scale_factor * scale_const
    if clip == "norm":
      logging.info(" * clip norms to max 1")
      x_train /= np.maximum(np.linalg.norm(x_train, axis=1, keepdims=True) * (1 + bounding_slack), 1)
      x_test /= np.maximum(np.linalg.norm(x_test, axis=1, keepdims=True) * (1 + bounding_slack), 1)
    elif clip == "dims":
      assert False, "not implemented"
    elif clip == "none":
      logging.info(" * no clipping -> no bounding")
      assert not private
    else:
      assert False, "unknown clip"

    # fit
    logging.info("Fitting a model...")
    if private:
      logging.info(" * DP logistic regression: epsilon=%g, alpha=%g", epsilon, regularizer_strength)
      from models.logistic_regression import DPLogisticRegression
      model = DPLogisticRegression().init(repr_dim, classes=np.unique(y),
                                          alpha=regularizer_strength, epsilon=epsilon)
    else:
      logging.info(" * logistic regression: alpha=%g", regularizer_strength)
      from sklearn.linear_model import LogisticRegression
      model = LogisticRegression(C=1/regularizer_strength)

    model.fit(x_train, y_train)

    # compute mean accuracy on test set
    logging.info("Testing the model...")
    train_acc = accuracy_score(y_train, model.predict(x_train))
    test_acc = accuracy_score(y_test, model.predict(x_test))
    logging.info(" * train accuracy = %.6f", train_acc)
    logging.info(" * test accuracy = %.6f", test_acc)
    avg_test_acc += test_acc

  avg_test_acc /= pred_cv_folds
  logging.info("Average test accuracy = %.6f", avg_test_acc)

  return avg_test_acc

Esempio n. 20

0

Mostra file

File: optimize_repr_learning_params.py Progetto: jamesdu0504/dp-representation-transfer

def run_test(args, params):
  """Store the final test parameters on disk and launch the batch tasks.

  ``params`` is written to ``run_parameters/test_params.npy`` and
  ``args.wait`` is switched on before ``batch.run_tasks`` is called.
  """
  logging.info('Running final tests with...')

  # persist the parameters for the tasks
  params_dir = "run_parameters"
  ensure_dir_exists(params_dir)
  np.save("run_parameters/test_params.npy", params)

  # launch
  setattr(args, "wait", True)
  batch.run_tasks(args)

Esempio n. 21

0

Mostra file

def learn_repr(x, y, params, full_model_id):
    """Train a representation learning algorithm on ``x`` and return it.

    The algorithm is built by the factory that ``select_repr_alg`` returns
    for ``params.repr_alg``. While training, the relative reconstruction
    error is appended after every epoch to files under
    ``param_opt/progress``.

    Args:
        x: training samples.
        y: labels of the samples; only used to split off the validation set.
        params: object providing ``repr_learn_validation_split``,
            ``repr_dim``, ``repr_alg`` and ``repr_learn_max_duration``; it is
            also handed to the algorithm factory.
        full_model_id: string embedded in the progress and log file names.

    Returns:
        The trained algorithm object.
    """
    from contextlib import ExitStack

    # separate validation set if needed
    val_x = None
    if params.repr_learn_validation_split:
        logging.info("Splitting into training and validation sets")
        from sklearn.model_selection import train_test_split
        x, val_x, y, _ = train_test_split(
            x, y, test_size=params.repr_learn_validation_split, random_state=0)
        logging.info(" * training set shape: %d x %d" % x.shape)
        logging.info(" * validation set shape: %d x %d" % val_x.shape)

    data_dim = x.shape[1]
    logging.info(" * data shape after preprocessing: %d x %d" % x.shape)

    repr_dim = int(round(params.repr_dim))

    logging.info("Learning the representation on public data...")
    logging.info(" * learning a representation of size %d", repr_dim)
    start_time = time.time()

    (_, _, _, make_alg, _) = select_repr_alg(params.repr_alg)

    # init the algorithm
    alg = make_alg(data_dim, repr_dim, params)

    # The progress files are only written by the per-epoch callback, so they
    # are closed as soon as training has finished (or failed).
    ensure_dir_exists('param_opt/progress')
    with ExitStack() as open_files:
        progress_filename = 'param_opt/progress/encdec-mse-%s.txt' % (
            full_model_id)
        progress_file = open_files.enter_context(
            open(progress_filename, 'w', encoding='utf-8'))
        val_progress_file = None
        if val_x is not None:
            val_progress_filename = 'param_opt/progress/encdec-validation-mse-%s.txt' % (
                full_model_id)
            val_progress_file = open_files.enter_context(
                open(val_progress_filename, 'w', encoding='utf-8'))

        def save_progress():
            """Append the current relative reconstruction errors."""
            x_pred = alg.decode(alg.encode(x))
            rel_mse = relative_mean_squared_error(x, x_pred)
            progress_file.write("%g\n" % rel_mse)
            if val_x is not None:
                val_x_pred = alg.decode(alg.encode(val_x))
                rel_mse = relative_mean_squared_error(val_x, val_x_pred)
                val_progress_file.write("%g\n" % rel_mse)

        # fit to the training data
        ensure_dir_exists("param_opt/log/")
        alg.learn(x,
                  validation_data=val_x,
                  log_file_prefix=("param_opt/log/%s" % (full_model_id)),
                  per_epoch_callback_funs=[save_progress],
                  deadline=None,
                  max_duration=params.repr_learn_max_duration)

    # test reconstruction error
    x_pred = alg.decode(alg.encode(x))
    rel_mse = relative_mean_squared_error(x, x_pred)
    if val_x is not None:
        val_x_pred = alg.decode(alg.encode(val_x))
        val_rel_mse = relative_mean_squared_error(val_x, val_x_pred)
    else:
        # nothing to evaluate without a validation set
        val_rel_mse = np.nan
    logging.info(" * final error: rel_mse = %g, val_rel_mse = %g", rel_mse,
                 val_rel_mse)

    elapsed = time.time() - start_time
    logging.info(" * running time = %s", pretty_duration(elapsed))

    return alg

Esempio n. 22

0

Mostra file

def task(common_params, task_params):
    # add logging file
    log_file_name = "log/opttest-task-%s-%s-s%d.log" % (
        common_params.test_name, common_params.task_type, task_params.seed)
    log_file_handler = logging.FileHandler(log_file_name, mode='w')
    log_file_handler.setFormatter(log_file_formatter)
    logging.getLogger().addHandler(log_file_handler)

    logging.info("test_name = %s", common_params.test_name)
    logging.info("params_id = %s", task_params.param_id)
    logging.info("Running with params: %s" % task_params)
    params = SimpleNamespace(**common_params.__dict__, **task_params.__dict__)

    (gene_expr, cancer_type) = load_data()

    # split
    logging.info("Splitting...")

    logging.info(" * private cancertype pairs: %s" %
                 params.priv_cancertype_pairs)
    logging.info(" * public cancertypes: %s" % params.pub_cancertypes)

    priv_cancertypes = sum(params.priv_cancertype_pairs, [])
    priv = cancer_type.isin(priv_cancertypes)
    pub = cancer_type.isin(params.pub_cancertypes)

    logging.info(" * %d private samples, %d public samples (of %d total)" %
                 (sum(priv), sum(pub), priv.size))

    from common import categorical_to_binary

    x_pub = gene_expr[pub].as_matrix()
    y_pub = cancer_type[pub].cat.codes.as_matrix()

    seed = int(params.seed)
    # init rng
    np.random.seed(seed)
    import torch
    torch.manual_seed(seed)
    if torch.cuda.is_available() and torch.cuda.device_count() > 0:
        torch.cuda.manual_seed(seed)

    full_model_id = "%s-%s" % (common_params.test_name, task_params.param_id)

    logging.info("Representation learning...")
    repr_alg = learn_repr(x_pub, y_pub, params, full_model_id)
    x_pub_repr = map_repr(x_pub, repr_alg, params, full_model_id)

    if params.task_type == 'paramopt':
        acc = np.zeros(len(params.priv_cancertype_pairs))
        for p, priv_cancertype_pair in enumerate(params.priv_cancertype_pairs):
            logging.info("Prediction with private cancertypes %s..." %
                         priv_cancertype_pair)
            priv = cancer_type.isin(priv_cancertype_pair)
            x_priv = gene_expr[priv].as_matrix()
            y_priv = cancer_type[priv].cat.codes.as_matrix()
            x_priv_repr = map_repr(x_priv, repr_alg, params, full_model_id)
            acc[p] = predict(x_priv_repr, y_priv, x_pub_repr, params,
                             full_model_id)

        avg_acc = np.mean(acc)
        logging.info("Total average prediction accuracy: %.6f" % avg_acc)

        logging.info("Writing results to disk...")
        filename = "param_opt/opt_result-%s.txt" % (full_model_id)
        logging.info(" * filename: %s", filename)
        with open(filename, 'w', encoding='utf-8') as f:
            f.write("%.6f\n" % avg_acc)

    elif params.task_type == 'learn_and_map':
        gdsc_gene_expr = load_gdsc_data()
        x_gdsc = gdsc_gene_expr.as_matrix()
        x_gdsc_repr = map_repr(x_gdsc, repr_alg, params, full_model_id)

        logging.info("Saving the representation...")
        ensure_dir_exists("data_repr")
        np.savetxt("data_repr/%s-%s.csv" % (gdsc_data_set, full_model_id),
                   x_gdsc_repr,
                   delimiter=',')
    else:
        assert False, "invalid task type"

Esempio n. 23

0

Mostra file

def run_optimization(args,
                     fixed_params,
                     domain,
                     constraints,
                     batch_size,
                     max_iter,
                     max_duration=None,
                     deadline=None,
                     slurm_args=None):
    """Search for the best parameters with batched Bayesian optimization.

    Starts from earlier results (when ``fixed_params.param_opt_continue`` is
    set) or from a random initial design, then repeatedly asks GPyOpt for a
    new batch of parameters and evaluates it with
    ``run_optimization_batch``. Results are maximized; they are negated when
    handed to GPyOpt, which is run as a minimizer here.

    Args:
        args: passed through to ``run_optimization_batch``.
        fixed_params: object providing ``param_opt_continue`` and
            ``test_name``; also passed to ``run_optimization_batch``.
        domain: GPyOpt domain description of the optimized parameters.
        constraints: GPyOpt constraints for the initial design space.
        batch_size: number of parameter sets evaluated per iteration (and in
            the initial design).
        max_iter: maximum number of optimization iterations.
        max_duration: optional ``datetime.timedelta``; turned into a deadline
            counted from the moment of the call.
        deadline: optional ``datetime.datetime`` after which no new iteration
            is started. The earlier of this and the ``max_duration`` deadline
            applies; with neither given only ``max_iter`` limits the run.
        slurm_args: passed through to ``run_optimization_batch``.

    Returns:
        The parameter set (as returned by ``get_params``) with the highest
        result.
    """
    logging.info('Starting parameter optimization...')
    import GPyOpt
    ensure_dir_exists("param_opt")

    if max_duration is not None:
        new_dl = datetime.datetime.now() + max_duration
        if deadline is None or new_dl < deadline:
            deadline = new_dl

    # initial parameters and values
    if fixed_params.param_opt_continue:
        logging.info('Loading earlier params and results...')
        all_params = np.load("param_opt/opt_params-%s.npy" %
                             (fixed_params.test_name))
        all_results = np.load("param_opt/opt_results-%s.npy" %
                              (fixed_params.test_name))
        opt_seeds = range(len(all_params))
    else:
        logging.info('Selecting initial parameters...')
        initial_design_type = 'random'
        initial_design_numdata = batch_size
        space = GPyOpt.core.task.space.Design_space(domain, constraints)
        opt_params = GPyOpt.experiment_design.initial_design(
            initial_design_type, space, initial_design_numdata)
        logging.info('Running...')
        opt_seeds = range(len(opt_params))
        task_params = [get_params(p, domain) for p in opt_params]
        results = run_optimization_batch(args, fixed_params, task_params,
                                         opt_seeds, slurm_args)
        all_params = opt_params
        all_results = results

    for i in range(max_iter):
        logging.info('Best result this far: %g', np.amax(all_results))
        logging.info('Selecting a new set of parameters...')
        bo = GPyOpt.methods.BayesianOptimization(
            f=None,
            domain=domain,
            X=all_params,
            Y=-all_results,
            acquisition_type='EI',
            normalize_Y=True,
            evaluator_type='local_penalization',
            batch_size=batch_size,
            acquisition_jitter=0,
            maximize=False)

        opt_params = bo.suggest_next_locations()
        # continue the seed numbering after the seeds used so far
        next_seed = max(opt_seeds) + 1
        opt_seeds = range(next_seed, next_seed + len(opt_params))
        logging.info('Running...')
        task_params = [get_params(p, domain) for p in opt_params]
        results = run_optimization_batch(args, fixed_params, task_params,
                                         opt_seeds, slurm_args)
        all_params = np.vstack((all_params, opt_params))
        all_results = np.vstack((all_results, results))
        # checkpoint after every iteration so the run can be continued
        np.save("param_opt/opt_params-%s.npy" % (fixed_params.test_name),
                all_params)
        np.save("param_opt/opt_results-%s.npy" % (fixed_params.test_name),
                all_results)

        # without any deadline only max_iter limits the run
        if deadline is not None and datetime.datetime.now() >= deadline:
            logging.info('Gpyopt iteration %d: Time based stopping' % (i))
            break

    all_params = [get_params(p, domain) for p in all_params]
    all_results = list(all_results)

    filename = "param_opt/paramopt-%s.txt" % (fixed_params.test_name)
    logging.info("Writing params and result to '%s'" % filename)
    # NOTE: despite the .txt suffix this file holds two pickled objects
    with open(filename, 'wb') as f:
        pickle.dump(all_params, f)
        pickle.dump(all_results, f)

    best_params_id = np.argmax(all_results)
    best_params = all_params[best_params_id]
    best_result = all_results[best_params_id]
    logging.info('Final best result: %g', best_result)
    logging.info(' * obtained with: %s', best_params)

    ensure_dir_exists("res")
    filename = "res/paramopt_best_result-%s.txt" % (fixed_params.test_name)
    logging.info("Writing best result to '%s'" % filename)
    # savetxt rejects 0-d input, so a scalar result is promoted to 1-d
    np.savetxt(filename, np.atleast_1d(best_result))

    filename = "param_opt/paramopt_best_params-%s.txt" % (
        fixed_params.test_name)
    logging.info("Writing best params to '%s'" % filename)
    with open(filename, 'wb') as f:
        pickle.dump(best_params, f)

    return best_params

Esempio n. 24

0

Mostra file

File: plot_mnist_predictions.py Progetto: jamesdu0504/dp-representation-transfer

  plt.imshow(x_test[sample,:].reshape((28,28)), cmap='gray')
  if s == 0:
    plt.title("original")

# Only the predictions made with the first seed are plotted.
# NOTE: `s` is reused below as the sample index of the inner loop.
s = 0
seed = seeds[s]

# For every algorithm, draw two image columns (prediction and "rand"
# prediction) next to each other; every sample occupies one row of the grid.
for a, alg_id in enumerate(algorithms):
  print("  alg = %s ..." % alg_id)
  #pred_filename = 'pred/final-encdec-%s-%d-%s.npy' % (data_type, seed, alg_id)
  #pred_rand_filename = 'pred/final-encdec-rand-%s-%d-%s.npy' % (data_type, seed, alg_id)
  pred_filename = 'pred/final-encdec-%s-r%d-s%d-%s.npy' % (data_type, repr_dim, seed, alg_id)
  pred_rand_filename = 'pred/final-encdec-rand-%s-r%d-s%d-%s.npy' % (data_type, repr_dim, seed, alg_id)
  x_test_pred = np.load(pred_filename)
  x_test_pred_rand = np.load(pred_rand_filename)
  for s, sample in enumerate(samples):
    # grid of tiled[1] rows x tiled[0] columns; subplot indices are 1-based,
    # so algorithm `a` uses columns 2*a + 2 and 2*a + 3 of row `s`
    plt.subplot(tiled[1], tiled[0], s * tiled[0] + 2*a + 2)
    plt.axis('off')
    # clip to [0, 1] before showing the row as a 28x28 grayscale image
    plt.imshow(x_test_pred[sample,:].clip(0,1).reshape((28,28)), cmap='gray')
    # only the first row carries the algorithm name as column title
    if s == 0:
      plt.title(alg_id)
    plt.subplot(tiled[1], tiled[0], s * tiled[0] + 2*a + 3)
    plt.axis('off')
    plt.imshow(x_test_pred_rand[sample,:].clip(0,1).reshape((28,28)), cmap='gray')

# Save the figure; no file extension is given in `figname`, so the output
# format is left to matplotlib's savefig default.
ensure_dir_exists("figs/predictions")
#figname = "figs/predictions/%s" % (data_type)
figname = "figs/predictions/%s-r%d" % (data_type, repr_dim)
plt.savefig(figname)
plt.close()

Esempio n. 25

0

Mostra file

File: learn_and_predict_tcga_split_cancertypes_optim.py Progetto: jamesdu0504/dp-representation-transfer

def task(args):
    """Run one representation-transfer experiment and write its results.

    The gene expression data are split into private samples (the cancer
    types in ``priv_cancertypes``) and public samples (all others). A VAE is
    trained on the public samples, the private samples are encoded with it
    and a classifier is trained and tested on the encoded private samples.
    Accuracies and reconstruction errors are written under ``res/`` and
    ``param_opt/``.

    Besides ``args`` the function reads settings defined elsewhere in this
    file: ``data_set``, ``data_type``, ``target_set``, ``target_type``,
    ``validation_split``, ``repr_dim``, ``n_epochs``, ``batch_size``,
    ``id_suffix``, ``deadline``, ``max_duration``, ``pred_test_size``,
    ``scale_fun``, ``scale_const``, ``clip``, ``bounding_slack``,
    ``epsilon`` and ``regularizer_strength``.

    Args:
        args: a ``(param_id, priv_cancertypes, seed)`` tuple. ``param_id``
            indexes the parameter sets stored in
            ``run_parameters/params.npy``; ``seed`` is only used in log
            messages and file names (the random generators are seeded from
            the clock, see below).
    """
    import pandas
    from contextlib import ExitStack
    from sklearn.model_selection import train_test_split

    param_id, priv_cancertypes, seed = args
    logging.info("priv classes = %s, params_id = %s, seed = %d",
                 priv_cancertypes, param_id, seed)
    # read the data sets
    alg_id = param_id
    logging.info("Loading parameters...")
    # Attributes are read from the selected entry below, i.e. the file holds
    # pickled Python objects, which recent NumPy versions only load on
    # request.
    # NOTE(review): unpickling is only safe because the file is written by
    # this project itself; never point this at untrusted data.
    params = np.load("run_parameters/params.npy", allow_pickle=True)
    params = params[param_id]
    logging.info("Reading data...")
    gene_expr = pandas.read_hdf("data/%s.h5" % (data_set), data_type)
    logging.info(" * gene expression shape: %d x %d" % gene_expr.shape)

    logging.info("Filtering out genes with low expressions...")
    low_expr = (np.median(gene_expr, axis=0) < 0.0)
    gene_expr = gene_expr.iloc[:, ~low_expr]
    logging.info(" * %d of %d remaining (%d removed)" %
                 (sum(~low_expr), low_expr.size, sum(low_expr)))

    logging.info("Loading cancer types...")
    cancer_type = pandas.read_hdf("data/%s.h5" % (target_set), target_type)
    assert np.array_equal(gene_expr.index, cancer_type.index)

    # split
    logging.info("Splitting...")
    priv = cancer_type.isin(priv_cancertypes)
    logging.info(" * %d private samples, %d public samples (of %d total)" %
                 (priv.sum(), (~priv).sum(), priv.size))

    # `.values` replaces `as_matrix()`, which newer pandas no longer provides
    x_pub = gene_expr[~priv].values
    y_pub = cancer_type[~priv].cat.codes.values
    x_priv = gene_expr[priv].values
    y_priv = cancer_type[priv].cat.codes.values

    data_name = '-'.join(priv_cancertypes).replace(' ', '_')

    # A hack to have a different seed if the algorithm is run multiple times
    # with the same parameters. Destroys reproducibility...
    import time
    seed0 = int(time.time() * 100) % (2**32)
    # init rng
    np.random.seed(seed0)
    import torch
    torch.manual_seed(seed0)
    if torch.cuda.is_available() and torch.cuda.device_count() > 0:
        torch.cuda.manual_seed(seed0)

    ##################################
    #   representation learning
    #################################
    x = x_pub
    y = y_pub

    # separate validation set if needed
    val_x = None
    if validation_split:
        logging.info("Splitting into training and validation sets")
        x, val_x, y, _ = train_test_split(
            x, y, test_size=validation_split, random_state=0)
        logging.info(" * training set shape: %d x %d" % x.shape)
        logging.info(" * validation set shape: %d x %d" % val_x.shape)

    data_dim = x.shape[1]
    logging.info(" * data shape after preprocessing: %d x %d" % x.shape)

    logging.info("Learning the representaiton on public data...")
    logging.info(" * learning a representation of size %d", repr_dim)
    start_time = time.time()

    # init the algorithm
    from models.vae_pytorch import VAE
    hidden_dim = int(10**params.hidden_layer_size_mul_log10) * repr_dim
    alg = VAE().init(
        input_dim=data_dim,
        latent_dim=repr_dim,
        enc_dims=[hidden_dim] * int(params.n_hidden_layers),
        dec_dims='same',
        enc_activations='relu',
        dec_activations='relu',
        prediction_mean_activation='sigmoid',
        prediction_var='gs',
        prediction_log_var_min=math.log(0.01**2),
        normalize_input_type='quantiles',
        normalize_input_quantile=0.05,
        normalize_input_axis='global',
        normalize_input_target=(0, 1),
        normalize_input_clip=True,
        optimizer='Adam',
        optimizer_params={'lr': 10.0**params.learning_rate_log10},
        n_epochs=n_epochs,
        early_stopping=True,
        reduce_lr_on_plateau=False,
        batch_size=batch_size)

    # create output dirs if they do not exist
    ensure_dir_exists('res')
    ensure_dir_exists('log')

    full_model_id = "%s-%d-%s-s%d%s" % (data_name, repr_dim, alg_id, seed,
                                        id_suffix)

    # The progress files are only written by the per-epoch callback, so they
    # are closed as soon as training has finished (or failed).
    with ExitStack() as open_files:
        progress_filename = 'res/progress-encdec-mse-%s.txt' % (full_model_id)
        progress_file = open_files.enter_context(
            open(progress_filename, 'w', encoding='utf-8'))
        val_progress_file = None
        if val_x is not None:
            val_progress_filename = 'res/progress-encdec-validation-mse-%s.txt' % (
                full_model_id)
            val_progress_file = open_files.enter_context(
                open(val_progress_filename, 'w', encoding='utf-8'))

        def save_progress():
            """Append the current relative reconstruction errors."""
            x_pred = alg.decode(alg.encode(x))
            rel_mse = relative_mean_squared_error(x, x_pred)
            progress_file.write("%g\n" % rel_mse)
            if val_x is not None:
                val_x_pred = alg.decode(alg.encode(val_x))
                rel_mse = relative_mean_squared_error(val_x, val_x_pred)
                val_progress_file.write("%g\n" % rel_mse)

        # fit to the training data
        alg.learn(x,
                  validation_data=val_x,
                  log_file_prefix=("log/%s" % (full_model_id)),
                  per_epoch_callback_funs=[save_progress],
                  deadline=deadline,
                  max_duration=max_duration)

    # test reconstruction error
    x_pred = alg.decode(alg.encode(x))
    rel_mse = relative_mean_squared_error(x, x_pred)
    if val_x is not None:
        val_x_pred = alg.decode(alg.encode(val_x))
        val_rel_mse = relative_mean_squared_error(val_x, val_x_pred)
    else:
        # nothing to evaluate without a validation set
        val_rel_mse = np.nan
    logging.info(" * final error: rel_mse = %g, val_rel_mse = %g", rel_mse,
                 val_rel_mse)

    elapsed = time.time() - start_time
    logging.info(" * running time = %s", pretty_duration(elapsed))

    ##################################
    #   representation mapping
    #################################

    x = x_priv
    y = y_priv

    # get the representation
    logging.info("Making the representation of private data...")
    x_repr = alg.encode(x)

    # test to predict the data itself
    x_pred = alg.decode(x_repr)
    rel_mse = relative_mean_squared_error(x, x_pred)
    logging.info(" * reconstruct the data: rel_mse = %g", rel_mse)
    ensure_dir_exists("res")
    with open("res/private-encdec-rel_mse-%d-%s-%s-s%d%s.txt" %
              (repr_dim, data_name, alg_id, seed, id_suffix),
              'w',
              encoding='utf-8') as f:
        f.write("%.6f\n" % rel_mse)

    ##################################
    #   prediction
    #################################

    x = x_repr

    # Which classifier variants to run (True = private). Defined before the
    # bounding step because that step has to know whether a private model
    # will be fitted.
    #privacy_modes = [False, True]
    privacy_modes = [True]

    # split train and test sets
    logging.info("Splitting to train and test sets...")
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=pred_test_size, random_state=0)
    logging.info(" * train samples: %d" % x_train.shape[0])
    logging.info(" * test samples: %d" % x_test.shape[0])

    # init rng
    np.random.seed(seed0)

    logging.info("Bounding the data to 1-sphere...")
    if scale_fun == "norm_max":
        logging.info(" * scale by max norm")
        scale_factor = np.amax(np.linalg.norm(x_train, axis=1))
    elif scale_fun == "dims_max":
        logging.info(" * scale each dimension by max absolute value")
        scale_factor = np.amax(np.abs(x_train), axis=0)
    elif scale_fun == "norm_avg":
        logging.info(" * scale by average norm")
        scale_factor = np.mean(np.linalg.norm(x_train, axis=1))
    elif scale_fun == "dims_std":
        logging.info(" * scale each dimension by standard deviation")
        scale_factor = np.std(x_train, axis=0)
    elif scale_fun == "none":
        scale_factor = 1.0
    else:
        assert False, "unknown scale_fun"

    # the scale is computed from the training set only and applied to both
    x_train /= scale_factor * scale_const
    x_test /= scale_factor * scale_const
    if clip == "norm":
        logging.info(" * clip norms to max 1")
        x_train /= np.maximum(
            np.linalg.norm(x_train, axis=1, keepdims=True) *
            (1 + bounding_slack), 1)
        x_test /= np.maximum(
            np.linalg.norm(x_test, axis=1, keepdims=True) *
            (1 + bounding_slack), 1)
    elif clip == "dims":
        assert False, "not implemented"
    elif clip == "none":
        logging.info(" * no clipping -> no bounding")
        # unbounded data are only acceptable when no private model is fitted
        assert not any(privacy_modes)
    else:
        assert False, "unknown clip"

    from sklearn.metrics import accuracy_score
    for private in privacy_modes:
        # fit
        logging.info("Fitting a model...")
        if private:
            logging.info(" * DP logistic regression: epsilon=%g, alpha=%g",
                         epsilon, regularizer_strength)
            from models.logistic_regression import DPLogisticRegression
            model = DPLogisticRegression().init(repr_dim,
                                                classes=np.unique(y),
                                                alpha=regularizer_strength,
                                                epsilon=epsilon)
        else:
            logging.info(" * logistic regression: alpha=%g",
                         regularizer_strength)
            from sklearn.linear_model import LogisticRegression
            model = LogisticRegression(C=1 / regularizer_strength)

        model.fit(x_train, y_train)

        # compute mean accuracy on test set
        logging.info("Testing the model...")
        train_acc = accuracy_score(y_train, model.predict(x_train))
        test_acc = accuracy_score(y_test, model.predict(x_test))
        logging.info(" * train accuracy = %.6f", train_acc)
        logging.info(" * test accuracy = %.6f", test_acc)

        logging.info("Writing results to disk...")
        ensure_dir_exists("res")
        filename = (
            "res/cancertype-pred-accuracy-%d-%s-%s-s%d-%s-%d-%s%s.txt" %
            (repr_dim, data_name, alg_id, seed, scale_fun, scale_const, clip,
             ("-e%g" % (epsilon) if private else "-nonpriv")))
        logging.info(" * filename: %s", filename)
        with open(filename, 'w', encoding='utf-8') as f:
            f.write("%.6f\n" % test_acc)

        ensure_dir_exists("param_opt")
        filename = "param_opt/opt_result%s-%s.txt" % (id_suffix, full_model_id)
        with open(filename, 'w', encoding='utf-8') as f:
            f.write("%.6f\n" % test_acc)

Esempio n. 26

0

Mostra file

File: plot_pred_acc_article.py Progetto: jamesdu0504/dp-representation-transfer

# Final axis cosmetics: one blank tick label per entry of x0, no x tick
# marks, fixed axis ranges, legend and axis labels.
plt.gca().set_xticklabels([" " for a in x0])
plt.gca().tick_params(axis='x', which='both',length=0)
#plt.gca().set_xticks([])
plt.gca().set_xlim(-0.5, 0.5)
plt.gca().set_ylim(0.08, 0.38)
plt.legend()
plt.gca().set_ylabel("prediction accuracy")
plt.gca().set_xlabel("  ")


# Report missing input files once, naming only the last one encountered.
if n_files_not_found > 0:
  print("Warning: '%s' and %d other files not found." %
      (last_not_found, n_files_not_found-1))

#plt.show()

ensure_dir_exists(figpath)

# Extend the base figure name with one suffix per active option; the last
# suffix is always present ("-mcmc" or "-fixed").
figname = "%s%s%s%s" % (figname,
        ("-ica" if ica else ""),
        ("-cliponly" if clipping_only else ""),
        ("-mcmc" if mcmc else "-fixed"),
)

plt.tight_layout()

# Write the same figure as PNG and as PDF.
#plt.savefig(figname, format='png', dpi=300, bbox_inches='tight')
plt.savefig(figname + ".png", format='png', dpi=300)
plt.savefig(figname + ".pdf", format='pdf', dpi=300)

Esempio n. 27

0

Mostra file

File: test_mnist_vae_torch.py Progetto: jamesdu0504/dp-representation-transfer

def task(args):
    """Fit one encoder/decoder algorithm on MNIST and log its test error.

    Args:
        args: a pair ``(seed, (algName, _, makeAlg))``. ``seed`` seeds the
            NumPy RNG, ``algName`` is used in log and output file names, the
            middle element is ignored, and ``makeAlg`` is called as
            ``makeAlg(data_dim, repr_dim)`` to build the algorithm.

    The module-level settings ``repr_dim`` and ``save_pred`` are read here.
    The algorithm is fitted on the flattened training images with the test
    images passed as validation data. When ``save_pred`` is true the test
    images, their reconstructions and the ``decode_generate`` output are
    written with ``np.save``. The value of ``relative_mean_squared_error``
    for the reconstruction is logged; nothing is returned.
    """
    import os

    seed, (algName, _, makeAlg) = args
    data_type = "mnist"
    logging.info("datatype = %s, seed = %d, algorithm = %s", data_type, seed,
                 algName)

    # init rng
    np.random.seed(seed)

    # load mnist; the labels are not needed for reconstruction
    from keras.datasets import mnist
    (x_train, _), (x_test, _) = mnist.load_data()
    x_train = x_train.astype('float32') / 255.
    x_test = x_test.astype('float32') / 255.
    # flatten every image into a single row
    x_train = x_train.reshape((len(x_train), np.prod(x_train.shape[1:])))
    x_test = x_test.reshape((len(x_test), np.prod(x_test.shape[1:])))

    data_dim = x_train.shape[1]
    logging.info(" * training set: %d x %d", *x_train.shape)
    logging.info(" * testing set: %d x %d", *x_test.shape)

    logging.info("Running and evaluating the algorithm...")
    logging.info(" * using representation with dimension = %d", repr_dim)

    # init the algorithm
    alg = makeAlg(data_dim, repr_dim)

    # fit to the training data
    alg.learn(x_train,
              validation_data=x_test,
              log_file_prefix=("log/%s-%d-%s" % (data_type, seed, algName)),
              verbose='print_epochs')

    # test with the testing data
    x_test_pred = alg.decode(alg.encode(x_test))
    x_test_pred_rand = alg.decode_generate(alg.encode(x_test))
    ensure_dir_exists('pred')
    data_filename = 'data/generated/%s' % (data_type)
    pred_filename = 'pred/final-encdec-%s-r%d-s%d-%s' % (data_type, repr_dim,
                                                         seed, algName)
    pred_rand_filename = 'pred/final-encdec-rand-%s-r%d-s%d-%s' % (
        data_type, repr_dim, seed, algName)
    if save_pred:
        # np.save does not create missing directories, and only 'pred' is
        # ensured above, so create the data directory explicitly.
        os.makedirs(os.path.dirname(data_filename), exist_ok=True)
        np.save(data_filename, x_test)
        np.save(pred_filename, x_test_pred)
        np.save(pred_rand_filename, x_test_pred_rand)

    # NOTE(review): mse is computed but not used below; kept so the call
    # sequence matches the previous behaviour.
    mse = mean_squared_error(x_test, x_test_pred)
    rel_mse = relative_mean_squared_error(x_test, x_test_pred)

    logging.info("Result: rel_mse = %g", rel_mse)

Esempio n. 28

0

Mostra file

def task(args):
    """Fit a cancer-type classifier on a stored representation and save its accuracy.

    Args:
        args: a triple ``(repr_dim, alg_id, seed)`` identifying which
            representation file to load and which seed to use for NumPy's RNG.

    All other settings are read from module-level names: ``data_set``,
    ``aux_data_set``, ``target_set``, ``id_suffix``, ``test_size``,
    ``scale_fun``, ``scale_const``, ``clip``, ``bounding_slack``, ``private``,
    ``epsilon`` and ``regularizer_strength``.

    The function loads the representation CSV and the ``cancer_types`` entry
    of the target HDF file, splits them into train and test parts, rescales
    (and optionally norm-clips) the features, fits either
    ``DPLogisticRegression`` or scikit-learn's ``LogisticRegression``, logs
    train and test accuracy and writes the test accuracy to a text file
    under ``res/``. Nothing is returned.

    Raises:
        ValueError: if ``scale_fun`` or ``clip`` has an unknown value, or if
            ``clip == "none"`` is combined with ``private``.
        NotImplementedError: if ``clip == "dims"``.
    """
    repr_dim, alg_id, seed = args
    logging.info("representation size = %d, algorithm = %s, seed = %d",
                 repr_dim, alg_id, seed)

    # read the PADS gene expression data
    logging.info("Reading reduced gene expression data...")
    filename = ("data_repr/repr-%s-%d-%s-%s-s%d%s.csv" %
                (data_set, repr_dim, aux_data_set, alg_id, seed, id_suffix))
    logging.info(" * filename: %s", filename)
    x = np.loadtxt(filename, delimiter=',')
    if x.ndim < 2:
        # loadtxt returned a 1-D array; turn it into a single column
        x = x[:, np.newaxis]
    logging.info(" * data shape: %d x %d", *x.shape)

    logging.info("Reading cancer types...")
    filename = "data/%s.h5" % (target_set)
    logging.info(" * filename: %s", filename)
    import pandas
    target = pandas.read_hdf(filename, 'cancer_types')
    logging.info(" * target size: %d", *target.shape)
    # .values replaces the deprecated (and since removed) .as_matrix()
    y = target.cat.codes.values

    # split train and test sets
    logging.info("Splitting to train and test sets...")
    from sklearn.model_selection import train_test_split
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=test_size,
                                                        random_state=0)
    logging.info(" * train samples: %d", x_train.shape[0])
    logging.info(" * test samples: %d", x_test.shape[0])

    # init rng
    np.random.seed(seed)

    logging.info("Bounding the data to 1-sphere...")
    # The scale is always derived from the training part only and then
    # applied to both parts.
    if scale_fun == "norm_max":
        logging.info(" * scale by max norm")
        scale_factor = np.amax(np.linalg.norm(x_train, axis=1))
    elif scale_fun == "dims_max":
        logging.info(" * scale each dimension by max absolute value")
        scale_factor = np.amax(np.abs(x_train), axis=0)
    elif scale_fun == "norm_avg":
        logging.info(" * scale by average norm")
        scale_factor = np.mean(np.linalg.norm(x_train, axis=1))
    elif scale_fun == "dims_std":
        logging.info(" * scale each dimension by standard deviation")
        scale_factor = np.std(x_train, axis=0)
    elif scale_fun == "none":
        scale_factor = 1.0
    else:
        # An explicit raise (not assert) so the check survives `python -O`.
        raise ValueError("unknown scale_fun: %r" % (scale_fun,))

    x_train /= scale_factor * scale_const
    x_test /= scale_factor * scale_const

    if clip == "norm":
        logging.info(" * clip norms to max 1")
        # Rows whose slack-inflated norm exceeds 1 are divided by that value;
        # all other rows are divided by 1 and stay unchanged.
        x_train /= np.maximum(
            np.linalg.norm(x_train, axis=1, keepdims=True) *
            (1 + bounding_slack), 1)
        x_test /= np.maximum(
            np.linalg.norm(x_test, axis=1, keepdims=True) *
            (1 + bounding_slack), 1)
    elif clip == "dims":
        raise NotImplementedError("clip == 'dims' is not implemented")
    elif clip == "none":
        logging.info(" * no clipping -> no bounding")
        if private:
            # Private fitting on unbounded data must never be silently
            # allowed, hence a real exception rather than an assert.
            raise ValueError("clip == 'none' cannot be used with private")
    else:
        raise ValueError("unknown clip: %r" % (clip,))

    # fit
    logging.info("Fitting a model...")
    if private:
        logging.info(" * DP logistic regression: epsilon=%g, alpha=%g",
                     epsilon, regularizer_strength)
        from models.logistic_regression import DPLogisticRegression
        model = DPLogisticRegression().init(repr_dim,
                                            classes=np.unique(y),
                                            alpha=regularizer_strength,
                                            epsilon=epsilon)
    else:
        logging.info(" * logistic regression: alpha=%g", regularizer_strength)
        from sklearn.linear_model import LogisticRegression
        model = LogisticRegression(C=1 / regularizer_strength)

    model.fit(x_train, y_train)

    # compute mean accuracy on train and test sets
    logging.info("Testing the model...")
    from sklearn.metrics import accuracy_score
    train_acc = accuracy_score(y_train, model.predict(x_train))
    test_acc = accuracy_score(y_test, model.predict(x_test))
    logging.info(" * train accuracy = %.6f", train_acc)
    logging.info(" * test accuracy = %.6f", test_acc)

    logging.info("Writing results to disk...")
    ensure_dir_exists("res")
    filename = (
        "res/cancertype-pred-accuracy-%d-%s-%s-s%d-%s-%d-%s%s.txt" %
        (repr_dim, aux_data_set, alg_id, seed, scale_fun, scale_const, clip,
         ("-e%g" % (epsilon) if private else "-nonpriv")))
    logging.info(" * filename: %s", filename)
    with open(filename, 'w', encoding='utf-8') as f:
        f.write("%.6f\n" % test_acc)

Esempio n. 29

0

Mostra file

            #color=line.get_color(), label=alg_id+" (val)")
    #plt.plot((max_epochs-1) * np.array([1, 1.02, 1.04]), )
    #plt.gca().annotate('foo', xy=(0.2, 0.0), xytext=(-2.0, 0.3), bbox=dict(boxstyle="round", fc="w"))
  #    offset = transforms.ScaledTranslation(dx, dy,
  #  fig.dpi_scale_trans)
  #    y = ax.transData.inverted().transform(last_rel_mse)
  #    y = y + 
  #shadow_transform = ax.transData.inverted().transform()
    #plt.plot((max_epochs-1) * np.array([1, 1.05]), [last_rel_mse])
  plt.yscale('log')
  #plt.yscale('symlog', linthreshy=1e-1)
  plt.xlabel("epoch")
  if relative_to is None:
    #plt.ylim([0, 1e1])
    plt.ylim([1e-1, 2e0])
    plt.ylabel("relative mse")
  else:
    plt.ylabel("relative mse diff from " + relative_to)
  ensure_dir_exists("figs")
  plt.legend()
  if not tiled:
    figname = "figs/%s-progress-mse-%s-%s-%d%s" % (task, data_set, input_dim, repr_dim, fig_name_suffix)
    plt.savefig(figname)
    plt.close()

# In tiled mode, save the current figure once under a name built from `task`
# and `fig_name_suffix`, then close it.
if tiled:
  #figname = "figs/progress-mse-tcga-%s%s" % (input_dim, fig_name_suffix)
  figname = "figs/%s-progress-mse-tcga%s" % (task, fig_name_suffix)
  plt.savefig(figname)
  plt.close()

Esempio n. 30

0

Mostra file

def task(args):
    """Learn a representation of a data set, track its progress and save the model.

    Args:
        args: a triple ``(repr_dim, (alg_id, _, make_alg), seed)``.
            ``repr_dim`` is the representation size, ``alg_id`` names the
            algorithm in logs and file names, the middle tuple element is
            ignored, ``make_alg`` is called as ``make_alg(data_dim, repr_dim)``
            and ``seed`` seeds NumPy and torch.

    The module-level settings ``data_set``, ``data_type``, ``normalize_data``,
    ``validation_split``, ``id_suffix``, ``deadline`` and ``max_duration`` are
    read here.

    After every epoch the value of ``relative_mean_squared_error`` on the
    training data (and on the validation data, if a validation split is used)
    is appended to a progress file under ``res/``. The fitted model is saved
    under ``repr_models/``. Nothing is returned.
    """
    import pandas
    from contextlib import ExitStack

    repr_dim, (alg_id, _, make_alg), seed = args
    logging.info("dataset = %s, algorithm = %s", data_set, alg_id)
    # read the data sets
    logging.info("Reading data...")
    data = pandas.read_hdf("data/%s.h5" % (data_set), data_type)
    logging.info(" * gene expression shape: %d x %d", *data.shape)

    # .values replaces the deprecated (and since removed) .as_matrix()
    x = data.values

    # normalize the input to _total_ unit variance and per-feature zero mean
    if normalize_data:
        x -= np.mean(x)
        x /= np.std(x)
        x -= np.mean(x, axis=0)

    # FIXME (from the original author): an alternative min-max scaling of x
    # was left disabled at this point.

    # init rng
    np.random.seed(seed)
    import torch
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # separate validation set if needed
    val_x = None
    if validation_split:
        logging.info("Splitting into training and validation sets")
        m = x.shape[0]
        perm = np.random.permutation(m)
        x = x[perm, :]
        # the first `validation_split` fraction of the shuffled rows is held out
        split_point = int(validation_split * m)
        (val_x, x) = (x[:split_point, :], x[split_point:, :])
        logging.info(" * training set shape: %d x %d", *x.shape)
        logging.info(" * validation set shape: %d x %d", *val_x.shape)

    data_dim = x.shape[1]
    logging.info(" * data shape after preprocessing: %d x %d", *x.shape)

    logging.info("Running the algorithm...")
    logging.info(" * learning a representation of size %d", repr_dim)
    start_time = time.time()

    # init the algorithm
    alg = make_alg(data_dim, repr_dim)

    # create output dir if does not exist
    ensure_dir_exists('res')

    full_model_id = "%s-%d-%s-s%d%s" % (data_set, repr_dim, alg_id, seed,
                                        id_suffix)

    progress_filename = 'res/progress-encdec-mse-%s.txt' % (full_model_id)

    # The ExitStack closes (and thereby flushes) every progress file once
    # training is over, even if alg.learn raises.
    with ExitStack() as open_files:
        progress_file = open_files.enter_context(
            open(progress_filename, 'w', encoding='utf-8'))
        val_progress_file = None
        if val_x is not None:
            val_progress_filename = (
                'res/progress-encdec-validation-mse-%s.txt' % (full_model_id))
            val_progress_file = open_files.enter_context(
                open(val_progress_filename, 'w', encoding='utf-8'))

        # define the progress saving function
        def save_progress():
            x_pred = alg.decode(alg.encode(x))
            rel_mse = relative_mean_squared_error(x, x_pred)
            progress_file.write("%g\n" % rel_mse)
            if val_x is not None:
                val_x_pred = alg.decode(alg.encode(val_x))
                rel_mse = relative_mean_squared_error(val_x, val_x_pred)
                val_progress_file.write("%g\n" % rel_mse)

        # fit to the training data
        alg.learn(x,
                  validation_data=val_x,
                  log_file_prefix=("log/%s" % (full_model_id)),
                  per_epoch_callback_funs=[save_progress],
                  deadline=deadline,
                  max_duration=max_duration)

    # test reconstruction error
    x_pred = alg.decode(alg.encode(x))
    rel_mse = relative_mean_squared_error(x, x_pred)
    if val_x is not None:
        val_x_pred = alg.decode(alg.encode(val_x))
        val_rel_mse = relative_mean_squared_error(val_x, val_x_pred)
    else:
        # no validation split: there is nothing to evaluate, report NaN
        val_rel_mse = float('nan')
    logging.info(" * final error: rel_mse = %g, val_rel_mse = %g", rel_mse,
                 val_rel_mse)

    elapsed = time.time() - start_time
    logging.info(" * running time = %s", pretty_duration(elapsed))

    # save model
    logging.info("Saving the learned model...")
    ensure_dir_exists('repr_models')
    alg.save("repr_models/%s" % (full_model_id))