def train_test_split(input_transforms, output_transforms, feature_types, splittype, data_dir):
    """Saves transformed model."""

    samples_dir = os.path.join(data_dir, "samples")
    samples = FeaturizedSamples(samples_dir, reload_data=True)

    print("Split data into train/test")
    train_samples_dir = os.path.join(data_dir, "train-samples")
    test_samples_dir = os.path.join(data_dir, "test-samples")
    train_samples, test_samples = samples.train_test_split(splittype, train_samples_dir, test_samples_dir)

    train_data_dir = os.path.join(data_dir, "train-data")
    test_data_dir = os.path.join(data_dir, "test-data")

    print("Generating train dataset.")
    train_dataset = Dataset(train_data_dir, train_samples, feature_types)

    print("Generating test dataset.")
    test_dataset = Dataset(test_data_dir, test_samples, feature_types)

    print("Transforming train data.")
    train_dataset.transform(input_transforms, output_transforms)

    print("Transforming test data.")
    test_dataset.transform(input_transforms, output_transforms)
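
# A minimal usage sketch (not part of the original source): it assumes
# `data_dir` was already populated by a featurization step, so that
# FeaturizedSamples can reload from data_dir/samples. The path below is
# illustrative only; the splittype/feature_types values mirror the test code.
#
#   train_test_split(input_transforms=[],
#                    output_transforms=[],
#                    feature_types=["ECFP"],
#                    splittype="scaffold",
#                    data_dir="/tmp/featurized-data")
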
  def test_API(self):
    """Straightforward test of multitask deepchem classification API."""
    splittype = "scaffold"
    feature_types = ["ECFP"]
    output_transforms = []
    input_transforms = []
    task_type = "classification"
    # TODO(rbharath): There should be some automatic check to ensure that all
    # required model_params are specified.
    model_params = {"nb_hidden": 10, "activation": "relu",
                    "dropout": .5, "learning_rate": .01,
                    "momentum": .9, "nesterov": False,
                    "decay": 1e-4, "batch_size": 5,
                    "nb_epoch": 2}
    model_name = "multitask_deep_classifier"

    # Featurize input
    featurizer = DataFeaturizer(tasks=self.tasks,
                                smiles_field=self.smiles_field,
                                verbose=True)
    feature_files = featurizer.featurize(self.input_file, feature_types, self.feature_dir)

    # Transform data into arrays for ML
    samples = FeaturizedSamples(self.samplesdir, feature_files,
                                reload_data=False)

    # Split into train/test
    train_samples, test_samples = samples.train_test_split(
        splittype, self.train_dir, self.test_dir)
    train_dataset = Dataset(self.train_dir, train_samples, feature_types)
    test_dataset = Dataset(self.test_dir, test_samples, feature_types)

    # Transform train/test data
    train_dataset.transform(input_transforms, output_transforms)
    test_dataset.transform(input_transforms, output_transforms)

    # Fit model
    task_types = {task: task_type for task in self.tasks}
    model_params["data_shape"] = train_dataset.get_data_shape()
    model = Model.model_builder(model_name, task_types, model_params)
    model.fit(train_dataset)
    model.save(self.model_dir)

    # Eval model on test
    evaluator = Evaluator(model, test_dataset, verbose=True)
    with tempfile.NamedTemporaryFile() as test_csv_out:
      with tempfile.NamedTemporaryFile() as test_stats_out:
        evaluator.compute_model_performance(test_csv_out, test_stats_out)
def _df_to_numpy(df, feature_types):
  """Transforms a featurized dataset df into standard set of numpy arrays"""
  if not set(feature_types).issubset(df.keys()):
    raise ValueError(
        "Featurized data does not support requested feature_types.")
  # Pull labels for every task out in a consistent, sorted column order.
  n_samples = df.shape[0]
  sorted_tasks = FeaturizedSamples.get_sorted_task_names(df)
  n_tasks = len(sorted_tasks)
  y = df[sorted_tasks].values
  y = np.reshape(y, (n_samples, n_tasks))
  w = np.ones((n_samples, n_tasks))
  tensors = []
  for _, datapoint in df.iterrows():
    feature_list = []
    for feature_type in feature_types:
      feature_list.append(datapoint[feature_type])
    features = np.squeeze(np.concatenate(feature_list))
    tensors.append(features)
  x = np.stack(tensors)
  sorted_ids = df["mol_id"]

  # Set missing data to have weight zero
  missing = (y.astype(object) == "")

  y[missing] = 0.
  w[missing] = 0.

  return sorted_ids, x.astype(float), y.astype(float), w.astype(float)
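
# A hedged sketch of what _df_to_numpy consumes and returns, assuming a toy
# DataFrame shaped like the featurizer output: one column per feature type
# holding a per-molecule array, one column per task, plus "mol_id". The
# column names are illustrative; empty strings mark missing labels.
#
#   toy_df = pd.DataFrame({
#       "mol_id": ["mol-0", "mol-1"],
#       "task-0": [1.0, ""],
#       "ECFP": [np.zeros(1024), np.ones(1024)],
#   })
#   ids, X, y, w = _df_to_numpy(toy_df, ["ECFP"])
#   # X has shape (2, 1024); y and w have shape (2, 1), and w is zeroed
#   # wherever the label was the empty string.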
def write_dataset_single(val, data_dir, feature_types):
  """Writes files for single row (X, y, w, X-transformed, ...) to disk."""
  (df_file, df) = val
  # TODO(rbharath): This is a hack. clean up.
  if not len(df):
    return None
  task_names = FeaturizedSamples.get_sorted_task_names(df)
  ids, X, y, w = _df_to_numpy(df, feature_types)
  X_sums, X_sum_squares, X_n = compute_sums_and_nb_sample(X)
  y_sums, y_sum_squares, y_n = compute_sums_and_nb_sample(y, w)

  basename = os.path.splitext(os.path.basename(df_file))[0]
  out_X = os.path.join(data_dir, "%s-X.joblib" % basename)
  out_X_transformed = os.path.join(data_dir, "%s-X-transformed.joblib" % basename)
  out_y = os.path.join(data_dir, "%s-y.joblib" % basename)
  out_y_transformed = os.path.join(data_dir, "%s-y-transformed.joblib" % basename)
  out_w = os.path.join(data_dir, "%s-w.joblib" % basename)
  out_ids = os.path.join(data_dir, "%s-ids.joblib" % basename)

  save_to_disk(X, out_X)
  save_to_disk(y, out_y)
  save_to_disk(w, out_w)
  save_to_disk(ids, out_ids)
  # TODO(rbharath): Should X be saved to out_X_transformed as well? Since
  # itershards expects to loop over X-transformed? (Ditto for y/w)
  return [df_file, task_names, out_ids, out_X, out_X_transformed, out_y,
          out_y_transformed, out_w,
          X_sums, X_sum_squares, X_n,
          y_sums, y_sum_squares, y_n]
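
# A hedged usage sketch for write_dataset_single (the shard name and
# data_dir below are illustrative; `df` is assumed to be one featurized
# shard already loaded from disk):
#
#   metadata_row = write_dataset_single(
#       ("shard-0.joblib", df), "/tmp/dataset", ["ECFP"])
#   # metadata_row records the on-disk filenames plus the running sums
#   # needed later to compute dataset-wide means and variances.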
  def _create_model(self, splittype, feature_types, input_transforms,
                    output_transforms, task_type, model_params, model_name,
                    input_file, tasks, protein_pdb_field=None, ligand_pdb_field=None):
    """Helper method to create model for test."""
    # Featurize input
    input_file = os.path.join(self.current_dir, input_file)
    featurizer = DataFeaturizer(tasks=tasks,
                                smiles_field=self.smiles_field,
                                protein_pdb_field=protein_pdb_field,
                                ligand_pdb_field=ligand_pdb_field,
                                verbose=True)
    feature_files = featurizer.featurize(input_file, feature_types, self.feature_dir)

    # Transform data into arrays for ML
    samples = FeaturizedSamples(self.samplesdir, feature_files,
                                reload_data=False)

    # Split into train/test
    train_samples, test_samples = samples.train_test_split(
        splittype, self.train_dir, self.test_dir)
    train_dataset = Dataset(self.train_dir, train_samples, feature_types)
    test_dataset = Dataset(self.test_dir, test_samples, feature_types)

    # Transform train/test data
    train_dataset.transform(input_transforms, output_transforms)
    test_dataset.transform(input_transforms, output_transforms)

    # Fit model
    task_types = {task: task_type for task in tasks}
    model_params["data_shape"] = train_dataset.get_data_shape()
    model = Model.model_builder(model_name, task_types, model_params)
    model.fit(train_dataset)
    model.save(self.model_dir)

    # Eval model on test
    evaluator = Evaluator(model, test_dataset, verbose=True)
    with tempfile.NamedTemporaryFile() as test_csv_out:
      with tempfile.NamedTemporaryFile() as test_stats_out:
        _, _ = evaluator.compute_model_performance(
            test_csv_out, test_stats_out)
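
  # A hedged sketch of how a test might call this helper, mirroring the
  # parameters used in test_API above; the input file name is illustrative.
  #
  #   self._create_model(splittype="scaffold",
  #                      feature_types=["ECFP"],
  #                      input_transforms=[],
  #                      output_transforms=[],
  #                      task_type="classification",
  #                      model_params=model_params,
  #                      model_name="multitask_deep_classifier",
  #                      input_file="example.csv",
  #                      tasks=self.tasks)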