Example #1
import numpy as np

from deepchem.data import NumpyDataset
from deepchem.models import Model


def in_silico_mutagenesis(model: Model,
                          encoded_sequences: np.ndarray) -> np.ndarray:
  """Computes in silico mutagenesis (ISM) scores.

  Parameters
  ----------
  model: Model
    This can be any model that accepts inputs of the required shape and produces
    an output of shape `(N_sequences, N_tasks)`.
  encoded_sequences: np.ndarray
    A numpy array of shape `(N_sequences, N_letters, sequence_length, 1)`

  Returns
  -------
  np.ndarray
    A numpy array of ISM scores of shape
    `(N_tasks, N_sequences, N_letters, sequence_length, 1)`.
  """
  # Shape (N_sequences, num_tasks)
  wild_type_predictions = model.predict(NumpyDataset(encoded_sequences))
  # check whether wild_type_predictions is np.ndarray or not
  assert isinstance(wild_type_predictions, np.ndarray)
  num_tasks = wild_type_predictions.shape[1]
  # Shape (N_sequences, N_letters, sequence_length, 1, num_tasks)
  mutagenesis_scores = np.empty(
      encoded_sequences.shape + (num_tasks,), dtype=np.float32)
  # Shape (N_sequences, 1, 1, 1, num_tasks)
  wild_type_predictions = wild_type_predictions[:, np.newaxis, np.newaxis,
                                                np.newaxis]
  for sequence_index, (sequence, wild_type_prediction) in enumerate(
      zip(encoded_sequences, wild_type_predictions)):

    # Mutates every position of the sequence to every letter
    # Shape (N_letters * sequence_length, N_letters, sequence_length, 1)
    # Breakdown:
    # Shape of sequence[np.newaxis] (1, N_letters, sequence_length, 1)
    mutated_sequences = np.repeat(
        sequence[np.newaxis], np.prod(sequence.shape), axis=0)

    # remove wild-type
    # len(arange) = N_letters * sequence_length
    arange = np.arange(len(mutated_sequences))
    # len(horizontal_cycle) = N_letters * sequence_length
    horizontal_cycle = np.tile(np.arange(sequence.shape[1]), sequence.shape[0])
    mutated_sequences[arange, :, horizontal_cycle, :] = 0

    # add mutant
    vertical_repeat = np.repeat(np.arange(sequence.shape[0]), sequence.shape[1])
    mutated_sequences[arange, vertical_repeat, horizontal_cycle, :] = 1
    # make mutant predictions
    mutated_predictions = model.predict(NumpyDataset(mutated_sequences))
    # check whether mutated_predictions is np.ndarray or not
    assert isinstance(mutated_predictions, np.ndarray)
    mutated_predictions = mutated_predictions.reshape(sequence.shape +
                                                      (num_tasks,))
    mutagenesis_scores[
        sequence_index] = wild_type_prediction - mutated_predictions
  rolled_scores = np.rollaxis(mutagenesis_scores, -1)
  return rolled_scores
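A hedged usage sketch for the function above. StubModel is a hypothetical stand-in exposing only the predict(dataset) -> (N_sequences, N_tasks) contract this function relies on; any trained DeepChem model with that output shape would work in its place.

import numpy as np

class StubModel:
  # Hypothetical stand-in: predicts zeros for two tasks.
  def predict(self, dataset):
    return np.zeros((dataset.X.shape[0], 2), dtype=np.float32)

# One-hot encode 8 random DNA sequences of length 100.
N_sequences, N_letters, sequence_length = 8, 4, 100
encoded = np.zeros((N_sequences, N_letters, sequence_length, 1), dtype=np.float32)
hot = np.random.randint(N_letters, size=(N_sequences, sequence_length))
for i in range(N_sequences):
  encoded[i, hot[i], np.arange(sequence_length), 0] = 1.0

scores = in_silico_mutagenesis(StubModel(), encoded)
print(scores.shape)  # (2, 8, 4, 100, 1): (N_tasks, N_sequences, N_letters, sequence_length, 1)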
Example #2
def fit_model(model_name, model_params, model_dir, data_dir):
    """Builds model from featurized data."""
    task_type = Model.get_task_type(model_name)
    train_dir = os.path.join(data_dir, "train-data")
    train = Dataset(train_dir)

    task_types = {task: task_type for task in train.get_task_names()}
    model_params["data_shape"] = train.get_data_shape()

    model = Model.model_builder(model_name, task_types, model_params)
    model.fit(train)
    model.save(model_dir)
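A minimal call sketch; the paths below are hypothetical, the parameter keys are borrowed from the test in Example #8, and data_dir must already contain a featurized "train-data" subdirectory:

model_params = {"nb_hidden": 10, "batch_size": 32, "nb_epoch": 10}
fit_model(model_name="multitask_deep_classifier",
          model_params=model_params,
          model_dir="/tmp/model",
          data_dir="/tmp/featurized")  # reads /tmp/featurized/train-data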
Example #3
def create_and_eval_model(train_dataset, test_dataset, task_type,
                          model_params, model_name, model_dir, tasks):
  """Helper method to create model for test."""
  # Fit model
  task_types = {task: task_type for task in tasks}
  model_params["data_shape"] = train_dataset.get_data_shape()
  print("Creating Model object.")
  # Side-effect import (assumption): registers the deep model classes with Model.
  import deepchem.models.deep
  model = Model.model_builder(model_name, task_types, model_params)
  print("About to fit model")
  model.fit(train_dataset)
  print("Done fitting, about to save...")
  model.save(model_dir)

  # Eval model on train
  evaluator = Evaluator(model, train_dataset, verbose=True)
  with tempfile.NamedTemporaryFile() as train_csv_out:
    with tempfile.NamedTemporaryFile() as train_stats_out:
      _, performance_df = evaluator.compute_model_performance(
          train_csv_out, train_stats_out)
  print("train_performance_df")
  print(performance_df)

  # Eval model on test
  evaluator = Evaluator(model, test_dataset, verbose=True)
  with tempfile.NamedTemporaryFile() as test_csv_out:
    with tempfile.NamedTemporaryFile() as test_stats_out:
      _, performance_df = evaluator.compute_model_performance(
          test_csv_out, test_stats_out)
  print("test_performance_df")
  print(performance_df)

  return next(performance_df.iterrows())[1]["r2_score"]
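compute_model_performance returns a per-task statistics DataFrame alongside the CSV output; the final line pulls r2_score out of its first row. The generator's .next() method is Python 2 only, hence next(...) above. Under the same assumption that the DataFrame has an r2_score column, an equivalent and arguably clearer spelling is:

return performance_df["r2_score"].iloc[0]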
Example #4
def eval_trained_model(model_type, model_dir, data_dir, csv_out, stats_out):
    """Evaluates a trained model on specified data."""
    model = Model.load(model_type, model_dir)
    data = Dataset(data_dir)

    evaluator = Evaluator(model, data, verbose=True)
    _, perf_df = evaluator.compute_model_performance(csv_out, stats_out)
    print("Model Performance.")
    print(perf_df)
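A hedged call sketch; compute_model_performance writes into the supplied file objects, so plain open handles (or tempfiles, as in the other examples) both work. All names below are hypothetical:

with open("preds.csv", "w") as csv_out, open("stats.txt", "w") as stats_out:
    eval_trained_model(model_type="multitask_deep_classifier",
                       model_dir="/tmp/model",
                       data_dir="/tmp/featurized/test-data",
                       csv_out=csv_out,
                       stats_out=stats_out)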
Example #5
  def predict(self, dataset, transformers=[], batch_size=None,
              pad_batches=False):
    """
    Uses self to make predictions on provided Dataset object.

    This is overridden to make sure the batch size is always valid for TensorFlow.

    Returns:
      y_pred: numpy ndarray of shape (n_samples,)
    """
    return Model.predict(self, dataset, transformers,
                         self.model_instance.batch_size, True)
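Forcing pad_batches=True matters because a TensorFlow graph built with a fixed batch dimension cannot accept a smaller final batch. A minimal numpy sketch of the pad-then-trim idea, not the library's actual implementation:

import numpy as np

def pad_then_trim_predict(predict_fn, X, batch_size):
  """Pad X up to a multiple of batch_size, predict, then drop the padding."""
  n = X.shape[0]
  remainder = n % batch_size
  if remainder:
    filler = np.repeat(X[:1], batch_size - remainder, axis=0)  # reuse row 0 as filler
    X = np.concatenate([X, filler], axis=0)
  return predict_fn(X)[:n]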
Example #6
  def load(self, model_dir):
    """
    Load keras multitask DNN from disk.
    """
    filename = Model.get_model_filename(model_dir)
    filename, _ = os.path.splitext(filename)

    json_filename = "%s.%s" % (filename, "json")
    h5_filename = "%s.%s" % (filename, "h5")

    with open(json_filename) as file_obj:
      model = model_from_json(file_obj.read())
    model.load_weights(h5_filename)
    self.raw_model = model
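This load is the mirror image of the save shown in Examples #10 and #11: Keras stores the architecture and the weights separately. A standalone round-trip sketch, assuming keras_model is an existing Keras model:

from keras.models import model_from_json

# Save: architecture as JSON, weights as HDF5 (two separate files).
with open("model.json", "w") as f:
  f.write(keras_model.to_json())
keras_model.save_weights("model.h5", overwrite=True)

# Load: rebuild the architecture, then restore the weights into it.
with open("model.json") as f:
  restored = model_from_json(f.read())
restored.load_weights("model.h5")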
Example #7
    def reload(self):
        """
        Load keras multitask DNN from disk.
        """
        filename = Model.get_model_filename(self.model_dir)
        filename, _ = os.path.splitext(filename)

        json_filename = "%s.%s" % (filename, "json")
        h5_filename = "%s.%s" % (filename, "h5")

        with open(json_filename) as file_obj:
            model = model_from_json(file_obj.read())
        model.load_weights(h5_filename)
        self.raw_model = model
Example #8
  def reload(self, custom_objects={}):
    """
    Load keras multitask DNN from disk.
    """
    filename = Model.get_model_filename(self.model_dir)
    filename, _ = os.path.splitext(filename)

    json_filename = "%s.%s" % (filename, "json")
    h5_filename = "%s.%s" % (filename, "h5")

    with open(json_filename) as file_obj:
      model = model_from_json(file_obj.read(), custom_objects=custom_objects)
    model.load_weights(h5_filename)
    self.model_instance = model
  def test_API(self):
    """Straightforward test of multitask deepchem classification API."""
    splittype = "scaffold"
    feature_types = ["ECFP"]
    output_transforms = []
    input_transforms = []
    task_type = "classification"
    # TODO(rbharath): There should be some automatic check to ensure that all
    # required model_params are specified.
    model_params = {"nb_hidden": 10, "activation": "relu",
                    "dropout": .5, "learning_rate": .01,
                    "momentum": .9, "nesterov": False,
                    "decay": 1e-4, "batch_size": 5,
                    "nb_epoch": 2}
    model_name = "multitask_deep_classifier"

    # Featurize input
    featurizer = DataFeaturizer(tasks=self.tasks,
                                smiles_field=self.smiles_field,
                                verbose=True)
    feature_files = featurizer.featurize(self.input_file, feature_types, self.feature_dir)

    # Transform data into arrays for ML
    samples = FeaturizedSamples(self.samplesdir, feature_files,
                                reload_data=False)

    # Split into train/test
    train_samples, test_samples = samples.train_test_split(
        splittype, self.train_dir, self.test_dir)
    train_dataset = Dataset(self.train_dir, train_samples, feature_types)
    test_dataset = Dataset(self.test_dir, test_samples, feature_types)

    # Transforming train/test data
    train_dataset.transform(input_transforms, output_transforms)
    test_dataset.transform(input_transforms, output_transforms)

    # Fit model
    task_types = {task: task_type for task in self.tasks}
    model_params["data_shape"] = train_dataset.get_data_shape()
    model = Model.model_builder(model_name, task_types, model_params)
    model.fit(train_dataset)
    model.save(self.model_dir)

    # Eval model on test
    evaluator = Evaluator(model, test_dataset, verbose=True)
    with tempfile.NamedTemporaryFile() as test_csv_out:
      with tempfile.NamedTemporaryFile() as test_stats_out:
        evaluator.compute_model_performance(test_csv_out, test_stats_out)
Example #10
    def save(self, out_dir):
        """
        Saves underlying keras model to disk.
        """
        super(KerasModel, self).save(out_dir)
        model = self.get_raw_model()
        filename, _ = os.path.splitext(Model.get_model_filename(out_dir))

        # Note that keras requires the model architecture and weights to be stored
        # separately. A json file is generated that specifies the model architecture.
        # The weights will be stored in an h5 file. The pkl.gz file will store the
        # target name.
        json_filename = "%s.%s" % (filename, "json")
        h5_filename = "%s.%s" % (filename, "h5")
        # Save architecture
        json_string = model.to_json()
        with open(json_filename, "wb") as file_obj:
            file_obj.write(json_string)
        model.save_weights(h5_filename, overwrite=True)
Example #11
  def save(self, out_dir):
    """
    Saves underlying keras model to disk.
    """
    super(KerasModel, self).save(out_dir)
    model = self.get_raw_model()
    filename, _ = os.path.splitext(Model.get_model_filename(out_dir))

    # Note that keras requires the model architecture and weights to be stored
    # separately. A json file is generated that specifies the model architecture.
    # The weights will be stored in an h5 file. The pkl.gz file will store the
    # target name.
    json_filename = "%s.%s" % (filename, "json")
    h5_filename = "%s.%s" % (filename, "h5")
    # Save architecture
    json_string = model.to_json()
    with open(json_filename, "wb") as file_obj:
      file_obj.write(json_string)
    model.save_weights(h5_filename, overwrite=True)
  def _create_model(self, splittype, feature_types, input_transforms,
                    output_transforms, task_type, model_params, model_name,
                    input_file, tasks, protein_pdb_field=None, ligand_pdb_field=None):
    """Helper method to create model for test."""
    # Featurize input
    input_file = os.path.join(self.current_dir, input_file)
    featurizer = DataFeaturizer(tasks=tasks,
                                smiles_field=self.smiles_field,
                                protein_pdb_field=protein_pdb_field,
                                ligand_pdb_field=ligand_pdb_field,
                                verbose=True)
    feature_files = featurizer.featurize(input_file, feature_types, self.feature_dir)

    # Transform data into arrays for ML
    samples = FeaturizedSamples(self.samplesdir, feature_files,
                                reload_data=False)

    # Split into train/test
    train_samples, test_samples = samples.train_test_split(
        splittype, self.train_dir, self.test_dir)
    train_dataset = Dataset(self.train_dir, train_samples, feature_types)
    test_dataset = Dataset(self.test_dir, test_samples, feature_types)

    # Transforming train/test data
    train_dataset.transform(input_transforms, output_transforms)
    test_dataset.transform(input_transforms, output_transforms)

    # Fit model
    task_types = {task: task_type for task in tasks}
    model_params["data_shape"] = train_dataset.get_data_shape()
    model = Model.model_builder(model_name, task_types, model_params)
    model.fit(train_dataset)
    model.save(self.model_dir)

    # Eval model on test
    evaluator = Evaluator(model, test_dataset, verbose=True)
    with tempfile.NamedTemporaryFile() as test_csv_out:
      with tempfile.NamedTemporaryFile() as test_stats_out:
        _, _ = evaluator.compute_model_performance(
            test_csv_out, test_stats_out)
Example #13
  def reload(self):
    """Loads sklearn model from joblib file on disk."""
    self.model_instance = load_from_disk(
        Model.get_model_filename(self.model_dir))
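load_from_disk (and its counterpart save_to_disk) are DeepChem utility wrappers around serialization. A minimal joblib-backed sketch of what such helpers typically look like; this is an assumption, not the library's exact code:

import joblib

def save_to_disk(obj, filename):
  # Serialize any picklable object (e.g. a fitted sklearn model) to disk.
  joblib.dump(obj, filename)

def load_from_disk(filename):
  # Restore the object saved by save_to_disk.
  return joblib.load(filename)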
Example #14
    nb_tasks = len(sorted_tasks)
    y_pred = np.zeros((nb_samples, nb_tasks))
    for ind, task in enumerate(sorted_tasks):
      task_type = self.task_types[task]
      taskname = "task%d" % ind
      if task_type == "classification":
        # Class probabilities are predicted for classification outputs. Instead,
        # output the most likely class.
        y_pred_task = np.squeeze(np.argmax(y_pred_dict[taskname], axis=1))
      else:
        y_pred_task = np.squeeze(y_pred_dict[taskname])
      y_pred[:, ind] = y_pred_task
    y_pred = np.squeeze(y_pred)
    return y_pred

Model.register_model_type("multitask_deep_regressor", MultiTaskDNN)
Model.register_model_type("multitask_deep_classifier", MultiTaskDNN)

class SingleTaskDNN(MultiTaskDNN):
  """
  Single-task DNN: a MultiTaskDNN specialized to a single task.
  """
  def __init__(self, model_type, task_types, model_params, initialize_raw_model=True):
    super(SingleTaskDNN, self).__init__(model_type, task_types, model_params,
                                        initialize_raw_model)

Model.register_model_type("singletask_deep_regressor", SingleTaskDNN)
Model.register_model_type("singletask_deep_classifier", SingleTaskDNN)

def to_one_hot(y):
  """Transforms label vector into one-hot encoding.
Example #15
      model.add(Activation('relu'))
      model.add(MaxPooling3D(pool_size=(nb_pool[2], nb_pool[2], nb_pool[2])))
      model.add(Flatten())
      # TODO(rbharath): If we change away from axis-size 32, this code will break.
      # Eventually figure out a more general rule that works for all axis sizes.
      model.add(Dense(16, init='normal'))
      model.add(Activation('relu'))
      model.add(Dropout(0.5))
      model.add(Dense(1, init='normal'))

      # NB: despite the variable name this is RMSprop; momentum/nesterov are
      # SGD-style arguments rather than RMSprop parameters.
      sgd = RMSprop(lr=learning_rate, decay=1e-6, momentum=0.9, nesterov=True)
      print("About to compile model")
      model.compile(loss=loss_function, optimizer=sgd)
      self.raw_model = model

  def fit_on_batch(self, X, y, w):
    X = shuffle_data(X)
    loss = self.raw_model.train_on_batch(X, y)
    print("Loss: %f" % loss)

  def predict_on_batch(self, X):
    if len(np.shape(X)) != 5:
      raise ValueError(
          "Tensorial datatype must be of shape (n_samples, N, N, N, n_channels).")
    X = shuffle_data(X)
    y_pred = self.raw_model.predict_on_batch(X)
    y_pred = np.squeeze(y_pred)
    return y_pred

Model.register_model_type("convolutional_3D_regressor", DockingDNN)
Example #16
        for ind, task in enumerate(sorted_tasks):
            task_type = self.task_types[task]
            taskname = "task%d" % ind
            if task_type == "classification":
                # Class probabilities are predicted for classification outputs. Instead,
                # output the most likely class.
                y_pred_task = np.squeeze(
                    np.argmax(y_pred_dict[taskname], axis=1))
            else:
                y_pred_task = np.squeeze(y_pred_dict[taskname])
            y_pred[:, ind] = y_pred_task
        y_pred = np.squeeze(y_pred)
        return y_pred


Model.register_model_type(MultiTaskDNN)


class SingleTaskDNN(MultiTaskDNN):
    """
    Single-task DNN: a MultiTaskDNN specialized to a single task.
    """
    def __init__(self, task_types, model_params, initialize_raw_model=True):
        super(SingleTaskDNN,
              self).__init__(task_types,
                             model_params,
                             initialize_raw_model=initialize_raw_model)


Model.register_model_type(SingleTaskDNN)
Example #17
    """
    Makes predictions on dataset.
    """
    # Sets batch_size which the default impl in Model expects
    #TODO(enf/rbharath): This is kludgy. Fix later.
    if "batch_size" not in self.model_params.keys():
      self.model_params["batch_size"] = 32
    return super(SklearnModel, self).predict(X)

  def save(self, out_dir):
    """Saves sklearn model to disk using joblib."""
    super(SklearnModel, self).save(out_dir)
    save_to_disk(self.raw_model, self.get_model_filename(out_dir))

  def load(self, model_dir):
    """Loads sklearn model from joblib file on disk."""
    self.raw_model = load_from_disk(Model.get_model_filename(model_dir))

Model.register_model_type(SklearnModel)

#TODO(enf/rbharath): deprecate the following if __init__.py functions as planned.
'''
Model.register_model_type("logistic", SklearnModel)
Model.register_model_type("rf_classifier", SklearnModel)
Model.register_model_type("rf_regressor", SklearnModel)
Model.register_model_type("linear", SklearnModel)
Model.register_model_type("ridge", SklearnModel)
Model.register_model_type("lasso", SklearnModel)
Model.register_model_type("lasso_lars", SklearnModel)
Model.register_model_type("elastic_net", SklearnModel)
'''
Example #18
    for (X, y, _, _) in numpy_dataset.itershards():
      Xs.append(X)
      ys.append(y)
    X = np.concatenate(Xs)
    y = np.concatenate(ys).ravel()
    self.raw_model.fit(X, y)

  def predict_on_batch(self, X):
    """
    Makes predictions on given batch of new data.
    """
    return self.raw_model.predict(X)

  def save(self, out_dir):
    """Saves sklearn model to disk using joblib."""
    super(SklearnModel, self).save(out_dir)
    save_to_disk(self.raw_model, self.get_model_filename(out_dir))

  def load(self, model_dir):
    """Loads sklearn model from joblib file on disk."""
    self.raw_model = load_from_disk(Model.get_model_filename(model_dir))

Model.register_model_type("logistic", SklearnModel)
Model.register_model_type("rf_classifier", SklearnModel)
Model.register_model_type("rf_regressor", SklearnModel)
Model.register_model_type("linear", SklearnModel)
Model.register_model_type("ridge", SklearnModel)
Model.register_model_type("lasso", SklearnModel)
Model.register_model_type("lasso_lars", SklearnModel)
Model.register_model_type("elastic_net", SklearnModel)
Example #19
  def reload(self):
    """Loads sklearn model from joblib file on disk."""
    self.model_instance = load_from_disk(Model.get_model_filename(self.model_dir))
Example #20
  def load(self, model_dir):
    """Loads sklearn model from joblib file on disk."""
    self.raw_model = load_from_disk(Model.get_model_filename(model_dir))
Example #21
    Makes predictions on dataset.
    """
        # Sets batch_size which the default impl in Model expects
        #TODO(enf/rbharath): This is kludgy. Fix later.
        if "batch_size" not in self.model_params.keys():
            self.model_params["batch_size"] = 32
        return super(SklearnModel, self).predict(X)

    def save(self, out_dir):
        """Saves sklearn model to disk using joblib."""
        super(SklearnModel, self).save(out_dir)
        save_to_disk(self.raw_model, self.get_model_filename(out_dir))

    def load(self, model_dir):
        """Loads sklearn model from joblib file on disk."""
        self.raw_model = load_from_disk(Model.get_model_filename(model_dir))


Model.register_model_type(SklearnModel)

#TODO(enf/rbharath): deprecate the following if __init__.py functions as planned.
'''
Model.register_model_type("logistic", SklearnModel)
Model.register_model_type("rf_classifier", SklearnModel)
Model.register_model_type("rf_regressor", SklearnModel)
Model.register_model_type("linear", SklearnModel)
Model.register_model_type("ridge", SklearnModel)
Model.register_model_type("lasso", SklearnModel)
Model.register_model_type("lasso_lars", SklearnModel)
Model.register_model_type("elastic_net", SklearnModel)
'''
Example #22
  def load(self, model_dir):
    """Loads sklearn model from joblib file on disk."""
    self.raw_model = load_from_disk(Model.get_model_filename(model_dir))
Example #23
    nb_tasks = len(sorted_tasks)
    y_pred = np.zeros((nb_samples, nb_tasks))
    for ind, task in enumerate(sorted_tasks):
      task_type = self.task_types[task]
      taskname = "task%d" % ind
      if task_type == "classification":
        # Class probabilities are predicted for classification outputs. Instead,
        # output the most likely class.
        y_pred_task = np.squeeze(np.argmax(y_pred_dict[taskname], axis=1))
      else:
        y_pred_task = np.squeeze(y_pred_dict[taskname])
      y_pred[:, ind] = y_pred_task
    y_pred = np.squeeze(y_pred)
    return y_pred

Model.register_model_type(MultiTaskDNN)

class SingleTaskDNN(MultiTaskDNN):
  """
  Single-task DNN: a MultiTaskDNN specialized to a single task.
  """
  def __init__(self, task_types, model_params, initialize_raw_model=True):
    super(SingleTaskDNN, self).__init__(task_types, model_params,
                                        initialize_raw_model=initialize_raw_model)

Model.register_model_type(SingleTaskDNN)

def to_one_hot(y):
  """Transforms label vector into one-hot encoding.

  Turns y into vector of shape [n_samples, 2] (assuming binary labels).