コード例 #1
0
def main(argv):
  """Normalizes raw embeddings and writes both transformation stages to HDF5."""
  del argv  # Unused.

  # Load the raw embedding dataframe from disk.
  raw_df = io_utils.read_dataframe_from_hdf5(FLAGS.original_df)

  # Percentile-normalize the embeddings.
  normalized_df = normalize_ljosa(raw_df)

  # Seed the RNG so the factor-analysis transform is reproducible.
  np.random.seed(0)
  fa_df = transform.factor_analysis(normalized_df, 0.15, 50)

  # Persist the post-factor-analysis and post-normalization embeddings.
  # NOTE(review): the flag name "post_normliazation_path" looks misspelled;
  # confirm against the flag definition before renaming it anywhere.
  io_utils.write_dataframe_to_hdf5(fa_df, FLAGS.post_fa_path)
  io_utils.write_dataframe_to_hdf5(normalized_df,
                                   FLAGS.post_normliazation_path)
コード例 #2
0
def main(argv):
  """Runs leave-one-out cross-validated training/evaluation over compounds.

  Reads a cleaned embedding dataframe and a transformation file, builds
  leave-one-out compound splits, cross-validates with a variable stopping
  time, evaluates bootstrap metrics, and pickles all results to
  FLAGS.output_file.
  """
  del argv  # Unused.

  emb_df_clean = io_utils.read_dataframe_from_hdf5(FLAGS.input_df)
  if "treatment_group" not in emb_df_clean.index.names:
    raise ValueError("Must have treatment_group in embeddings index names.")
  contents = load_contents(FLAGS.transformation_file)

  ## dictionary to save things
  save_dict = {}

  ## Get steps over training ("params" is a metadata key, not a step).
  steps = list(contents.keys())
  steps.remove("params")
  steps = np.sort(steps)

  ## Truncate list of steps
  steps = steps[:FLAGS.num_steps]

  ## embeddings without unevaluated compound
  emb_df_valid = transform.drop_unevaluated_comp(emb_df_clean)
  if "treatment_group" not in emb_df_valid.index.names:
    raise ValueError("Must have treatment_group in embeddings index names.")

  ## list of compounds and number of compounds
  comp_list = emb_df_valid.index.get_level_values(
      level=metadata.COMPOUND).unique()
  n_comp = len(comp_list)

  ## Set up data structure for leave-one-out cross validation:
  ## "b" is the single held-out compound, "a" is every other compound.
  list_of_comp_set = []
  for i in range(n_comp):
    comp_set = {}
    comp_set["b"] = comp_list[i]
    comp_set["a"] = list(set(comp_list).difference([comp_list[i]]))
    list_of_comp_set.append(comp_set)

  ## Cross validation training with leave-one-out and variable stopping time.
  (steps_max, cross_validated_scores) = cross_val_train(
      emb_df_clean, contents, steps, list_of_comp_set, n_comp,
      percent_norm=FLAGS.percentile_normalize,
      factor_analys=FLAGS.factor_analysis)

  ## Find first and last timesteps used, to use for bootstraps
  boot_steps = [steps[i] for i, v in enumerate(steps) if
                np.max(steps_max) >= v >= np.min(steps_max)]

  metrics_dict = evaluate_metrics(contents, emb_df_clean, steps_max,
                                  boot_steps, list_of_comp_set,
                                  num_bootstrap=FLAGS.num_bootstrap,
                                  percent_norm=FLAGS.percentile_normalize,
                                  factor_analys=FLAGS.factor_analysis)
  save_dict["metrics_dict"] = metrics_dict

  ## time steps where max cross validation results were found
  save_dict["list_of_time_step_max"] = steps_max

  ## accuracy for not same compound or batch, obtained at time_step_max
  ## for each individual compound.
  save_dict["metrics_dict"]["wdn"]["cross_val_scores"] = cross_validated_scores

  ## BUGFIX: pickle.dumps returns bytes, so the file must be opened in
  ## binary mode; the original mode="w" (text) raises TypeError on write
  ## under Python 3.
  with gfile.GFile(FLAGS.output_file, mode="wb") as f:
    f.write(pickle.dumps(save_dict))
コード例 #3
0
 def testReadHDF5(self):
     """Reads the dataframe back from HDF5 and checks it matches FOLD_DATA."""
     expected = pandas.DataFrame(FOLD_DATA)
     loaded = io_utils.read_dataframe_from_hdf5(self.base_path)
     self.assertTrue(loaded.equals(expected))