コード例 #1
0
ファイル: dataset.py プロジェクト: evanfeinberg/deepchem
def _df_to_numpy(df, feature_types):
  """Convert a featurized DataFrame into (ids, X, y, w).

  Args:
    df: Featurized DataFrame. It must contain a "mol_id" column, one column
      per entry of feature_types, and the task columns reported by
      FeaturizedSamples.get_sorted_task_names(df).
    feature_types: Names of the feature columns to merge into X.

  Returns:
    A tuple (ids, X, y, w) where ids is the "mol_id" column of df (returned
    as-is, not converted to numpy), X stacks one feature vector per row, and
    y / w have shape (n_samples, n_tasks). X, y and w are cast to float.

  Raises:
    ValueError: If any requested feature type is not a key of df.
  """
  unsupported = set(feature_types).difference(df.keys())
  if unsupported:
    raise ValueError(
        "Featurized data does not support requested feature_types.")

  # Labels: one column per task, in the order given by the sorted task names.
  num_rows = df.shape[0]
  task_columns = FeaturizedSamples.get_sorted_task_names(df)
  num_tasks = len(task_columns)
  labels = np.reshape(df[task_columns].values, (num_rows, num_tasks))
  weights = np.ones((num_rows, num_tasks))

  # Features: for every row, join the requested feature columns into a single
  # squeezed array, then stack all rows into one array.
  per_row_features = [
      np.squeeze(np.concatenate([row[name] for name in feature_types]))
      for _, row in df.iterrows()
  ]
  x = np.stack(per_row_features)
  ids = df["mol_id"]

  # An empty-string label marks a missing measurement: zero out both the
  # label and its weight. The comparison is done on an object view so it is
  # evaluated elementwise whatever the dtype of the labels is.
  is_blank = labels.astype(object) == ""
  labels[is_blank] = 0.
  weights[is_blank] = 0.

  return ids, x.astype(float), labels.astype(float), weights.astype(float)
コード例 #2
0
ファイル: dataset.py プロジェクト: evanfeinberg/deepchem
def write_dataset_single(val, data_dir, feature_types):
  """Save the ids/X/y/w arrays of one featurized shard under data_dir.

  Args:
    val: Pair (df_file, df): the path the shard came from and its featurized
      DataFrame.
    data_dir: Directory in which the "<basename>-*.joblib" files are placed,
      where <basename> is the file name of df_file without its extension.
    feature_types: Feature column names forwarded to _df_to_numpy.

  Returns:
    None when df is empty. Otherwise a list holding, in order: df_file, the
    sorted task names, the ids/X/X-transformed/y/y-transformed/w output
    paths, then the (sums, sum_squares, n) statistics for X followed by the
    same three statistics for y.
  """
  df_file, df = val
  # TODO(rbharath): This is a hack. clean up.
  if len(df) == 0:
    return None

  task_names = FeaturizedSamples.get_sorted_task_names(df)
  ids, X, y, w = _df_to_numpy(df, feature_types)
  # Statistics for y are computed with the weights w passed alongside.
  X_sums, X_sum_squares, X_n = compute_sums_and_nb_sample(X)
  y_sums, y_sum_squares, y_n = compute_sums_and_nb_sample(y, w)

  # Every output path shares the shard's base name (extension stripped).
  stem, _ = os.path.splitext(os.path.basename(df_file))
  (out_X, out_X_transformed, out_y,
   out_y_transformed, out_w, out_ids) = (
       os.path.join(data_dir, template % stem)
       for template in ("%s-X.joblib",
                        "%s-X-transformed.joblib",
                        "%s-y.joblib",
                        "%s-y-transformed.joblib",
                        "%s-w.joblib",
                        "%s-ids.joblib"))

  # Only the untransformed arrays are written here; the "-transformed" paths
  # are returned but nothing is saved to them in this function.
  for payload, destination in ((X, out_X), (y, out_y),
                               (w, out_w), (ids, out_ids)):
    save_to_disk(payload, destination)

  # TODO(rbharath): Should X be saved to out_X_transformed as well? Since
  # itershards expects to loop over X-transformed? (Ditto for y/w)
  return [
      df_file, task_names, out_ids, out_X, out_X_transformed, out_y,
      out_y_transformed, out_w,
      X_sums, X_sum_squares, X_n,
      y_sums, y_sum_squares, y_n,
  ]