Exemple #1
0
  def split(self, dataset, frac_split, split_dirs=None):
    """Split `dataset` in two by partitioning per-task weights at a split index.

    Both halves are built in memory as `NumpyDataset` objects. For every task,
    rows before that task's split index keep their weight in the first half
    and rows from the split index onward keep their weight in the second half;
    all other weights are zero. Rows whose weights are zero for every task in
    a half are dropped from that half.

    Parameters
    ----------
    dataset: object exposing `X`, `y`, `w` and `ids`
      Dataset to be split.
    frac_split: float
      Forwarded to `self.get_task_split_indices`. A value of exactly 1 returns
      all of the (unshuffled) data as the first dataset and `None` as the
      second.
    split_dirs: sequence of length 2, optional
      Only checked for length. The results are in-memory datasets, so nothing
      is read from or written to these directories.

    Returns
    -------
    tuple
      `(dataset_1, dataset_2)`; `dataset_2` is `None` when `frac_split == 1`.
    """
    # `split_dirs` is never read below, so no temporary directories are
    # created when it is omitted (doing so would leak two empty directories
    # on every call).
    if split_dirs is not None:
      assert len(split_dirs) == 2

    # Handle edge case where frac_split is 1
    if frac_split == 1:
      dataset_1 = NumpyDataset(dataset.X, dataset.y, dataset.w, dataset.ids)
      return dataset_1, None

    # presumably shuffles the four arrays in unison -- confirm against
    # randomize_arrays
    X, y, w, ids = randomize_arrays(
        (dataset.X, dataset.y, dataset.w, dataset.ids))
    # One row index per task (used below as the boundary between the halves).
    split_indices = self.get_task_split_indices(y, w, frac_split)

    # Create weight matrices for the two halves.
    w_1, w_2 = np.zeros_like(w), np.zeros_like(w)
    for task, split_index in enumerate(split_indices):
      # Rows before the split index contribute to the first half, the
      # remaining rows to the second half.
      w_1[:split_index, task] = w[:split_index, task]
      w_2[split_index:, task] = w[split_index:, task]

    # Keep only the rows that carry a nonzero weight for at least one task.
    rows_1 = w_1.any(axis=1)
    dataset_1 = NumpyDataset(X[rows_1], y[rows_1], w_1[rows_1], ids[rows_1])

    rows_2 = w_2.any(axis=1)
    dataset_2 = NumpyDataset(X[rows_2], y[rows_2], w_2[rows_2], ids[rows_2])

    return dataset_1, dataset_2
Exemple #2
0
    def train_valid_test_split(self,
                               dataset,
                               frac_train=.8,
                               frac_valid=.1,
                               frac_test=.1):
        """Split the tasks (label/weight columns) of a dataset three ways.

        Every returned dataset shares the same `X` and `ids`; only the columns
        of `y` and `w` differ. Whatever is left after the train and valid
        columns have been taken goes to test.

        Parameters
        ----------
        dataset: dc.data.Dataset
          Dataset to be split
        frac_train: float, optional
          Proportion of tasks to be put into train. Rounded to nearest int.
        frac_valid: float, optional
          Proportion of tasks to be put into valid. Rounded to nearest int.
        frac_test: float, optional
          Proportion of tasks to be put into test. Only used to check that
          the three fractions sum to 1.

        Returns
        -------
        tuple
          `(train_dataset, valid_dataset, test_dataset)` as `NumpyDataset`s.
        """
        np.testing.assert_almost_equal(frac_train + frac_valid + frac_test, 1)
        n_tasks = len(dataset.get_task_names())
        train_end = int(np.round(frac_train * n_tasks))
        valid_end = train_end + int(np.round(frac_valid * n_tasks))

        X, y, w, ids = dataset.X, dataset.y, dataset.w, dataset.ids

        # Consecutive column ranges: train, then valid, then the remainder.
        column_ranges = (
            slice(None, train_end),
            slice(train_end, valid_end),
            slice(valid_end, None),
        )
        return tuple(
            NumpyDataset(X, y[:, columns], w[:, columns], ids)
            for columns in column_ranges)
Exemple #3
0
def in_silico_mutagenesis(model: Model,
                          encoded_sequences: np.ndarray) -> np.ndarray:
  """Score every single-letter mutation of every sequence against the wild type.

  For each sequence, every (letter, position) pair is turned into a mutant in
  which that position is cleared and the given letter is set. The score is
  the wild-type prediction minus the mutant prediction.

  Parameters
  ----------
  model: Model
    This can be any model that accepts inputs of the required shape and produces
    an output of shape `(N_sequences, N_tasks)`.
  encoded_sequences: np.ndarray
    A numpy array of shape `(N_sequences, N_letters, sequence_length, 1)`

  Returns
  -------
  np.ndarray
    A numpy array of ISM scores. The shape is `(num_task, N_sequences, N_letters, sequence_length, 1)`.
  """
  # Shape (N_sequences, num_tasks)
  baseline = model.predict(NumpyDataset(encoded_sequences))
  # narrow the prediction type: a single ndarray is required below
  assert isinstance(baseline, np.ndarray)
  num_tasks = baseline.shape[1]
  # Shape (N_sequences, N_letters, sequence_length, 1, num_tasks)
  scores = np.empty(encoded_sequences.shape + (num_tasks,), dtype=np.float32)
  # Shape (N_sequences, 1, 1, 1, num_tasks): the singleton axes let each
  # per-sequence baseline broadcast against the reshaped mutant predictions.
  baseline = baseline[:, None, None, None]

  for seq_idx, (sequence, seq_baseline) in enumerate(
      zip(encoded_sequences, baseline)):
    n_letters, seq_length = sequence.shape[0], sequence.shape[1]

    # One copy of the sequence per element of `sequence`.
    # Shape (N_letters * sequence_length, N_letters, sequence_length, 1)
    mutants = np.repeat(sequence[None], np.prod(sequence.shape), axis=0)

    # Mutant i targets letter i // sequence_length at position
    # i % sequence_length.
    mutant_ids = np.arange(len(mutants))
    positions = np.tile(np.arange(seq_length), n_letters)
    letters = np.repeat(np.arange(n_letters), seq_length)

    # remove wild-type: clear every letter at the targeted position
    mutants[mutant_ids, :, positions, :] = 0
    # add mutant: set the targeted letter at that position
    mutants[mutant_ids, letters, positions, :] = 1

    # make mutant predictions
    mutant_predictions = model.predict(NumpyDataset(mutants))
    # narrow the prediction type: a single ndarray is required below
    assert isinstance(mutant_predictions, np.ndarray)
    scores[seq_idx] = seq_baseline - mutant_predictions.reshape(
        sequence.shape + (num_tasks,))

  # Bring the task axis to the front.
  return np.moveaxis(scores, -1, 0)
Exemple #4
0
    def test_compute_model_performance_multitask_regressor(self):
        """Fit two constant-target regression heads fed through a Databag and
        check the per-task mean absolute error."""
        random_seed = 42
        n_data_points = 20
        n_features = 2
        np.random.seed(seed=random_seed)

        inputs = NumpyDataset(np.random.rand(n_data_points, n_features))
        # One constant target column per task: +0.5 and -0.5.
        targets = [
            NumpyDataset(np.full((n_data_points, 1), constant))
            for constant in (0.5, -0.5)
        ]

        databag = Databag()

        features = Feature(shape=(None, n_features))
        databag.add_dataset(features, inputs)

        outputs, losses, labels = [], [], []
        for target in targets:
            label = Label(shape=(None, 1))
            dense = Dense(out_channels=1, in_layers=[features])
            loss = ReduceSquareDifference(in_layers=[dense, label])

            outputs.append(dense)
            losses.append(loss)
            labels.append(label)
            databag.add_dataset(label, target)

        total_loss = ReduceMean(in_layers=losses)

        tg = dc.models.TensorGraph(mode="regression",
                                   batch_size=20,
                                   random_seed=random_seed,
                                   learning_rate=0.1)
        for head in outputs:
            tg.add_output(head)
        tg.set_loss(total_loss)

        training_batches = databag.iterbatches(epochs=1000,
                                               batch_size=tg.batch_size,
                                               pad_batches=True)
        tg.fit_generator(training_batches)

        mae = dc.metrics.Metric(dc.metrics.mean_absolute_error,
                                np.mean,
                                mode="regression")
        results = tg.evaluate_generator(databag.iterbatches(), [mae],
                                        labels=labels,
                                        per_task_metrics=True)
        # Element 1 of the result holds the per-task scores.
        per_task_scores = list(results[1].values())
        assert_true(np.all(np.isclose(per_task_scores, [0.0, 0.0], atol=1.0)))
Exemple #5
0
    def test_compute_model_performance_multitask_classifier(self):
        """Fit two softmax heads with mirrored one-hot labels fed through a
        Databag and check the per-task ROC AUC."""
        n_data_points = 20
        n_features = 2
        half = n_data_points // 2

        # First half of the inputs is all -1, second half all +1.
        inputs = NumpyDataset(
            np.concatenate((-np.ones(shape=(half, n_features)),
                            np.ones(shape=(half, n_features)))))
        class_1 = np.tile([0.0, 1.0], (half, 1))
        class_0 = np.tile([1.0, 0.0], (half, 1))
        # The second task uses the labels of the first one, swapped.
        targets = [
            NumpyDataset(np.concatenate((class_0, class_1))),
            NumpyDataset(np.concatenate((class_1, class_0))),
        ]

        databag = Databag()

        features = Feature(shape=(None, n_features))
        databag.add_dataset(features, inputs)

        outputs, entropies, labels = [], [], []
        for target in targets:
            label = Label(shape=(None, 2))
            labels.append(label)
            dense = Dense(out_channels=2, in_layers=[features])
            probabilities = SoftMax(in_layers=[dense])
            cross_entropy = SoftMaxCrossEntropy(in_layers=[label, dense])

            entropies.append(cross_entropy)
            outputs.append(probabilities)
            databag.add_dataset(label, target)

        total_loss = ReduceMean(in_layers=entropies)

        tg = dc.models.TensorGraph(learning_rate=0.1)
        for head in outputs:
            tg.add_output(head)
        tg.set_loss(total_loss)

        training_batches = databag.iterbatches(epochs=1000,
                                               batch_size=tg.batch_size,
                                               pad_batches=True)
        tg.fit_generator(training_batches)

        auc = dc.metrics.Metric(dc.metrics.roc_auc_score,
                                np.mean,
                                mode="classification")
        results = tg.evaluate_generator(databag.iterbatches(), [auc],
                                        labels=labels,
                                        per_task_metrics=True)
        per_task_scores = list(results[1].values())
        # The tolerance is deliberately loose: this test has failed
        # sporadically with tighter bounds.
        assert_true(np.all(np.isclose(per_task_scores, [1.0, 1.0],
                                      atol=0.20)))
Exemple #6
0
def in_silico_mutagenesis(model, X):
    """Computes in-silico-mutagenesis scores.

    For each sequence, every (letter, position) pair is turned into a mutant
    in which that position is cleared and the given letter is set; the score
    is the wild-type prediction minus the mutant prediction.

    Parameters
    ----------
    model: TensorGraph
      Currently only SequenceDNN will work, but other models may be added.
      Must expose `num_tasks` and `predict`.
    X: ndarray
      Shape (N_sequences, N_letters, sequence_length, 1)

    Returns
    -------
    (num_task, N_sequences, N_letters, sequence_length, 1) ISM score array.
    """
    # Shape (N_sequences, N_letters, sequence_length, 1, num_tasks)
    mutagenesis_scores = np.empty(X.shape + (model.num_tasks, ),
                                  dtype=np.float32)
    # Shape (N_sequences, num_tasks)
    wild_type_predictions = model.predict(NumpyDataset(X))
    # Shape (N_sequences, 1, 1, 1, num_tasks): the singleton axes let each
    # per-sequence prediction broadcast against the reshaped mutant
    # predictions below.
    wild_type_predictions = wild_type_predictions[:, np.newaxis, np.newaxis,
                                                  np.newaxis]
    for sequence_index, (sequence, wild_type_prediction) in enumerate(
            zip(X, wild_type_predictions)):

        # Mutates every position of the sequence to every letter
        # Shape (N_letters * sequence_length, N_letters, sequence_length, 1)
        # Breakdown:
        #  Shape of sequence[np.newaxis] (1, N_letters, sequence_length, 1)
        mutated_sequences = np.repeat(sequence[np.newaxis],
                                      np.prod(sequence.shape),
                                      axis=0)

        # remove wild-type: clear every letter at the targeted position
        # len(arange) = N_letters * sequence_length
        arange = np.arange(len(mutated_sequences))
        # len(horizontal cycle) = N_letters * sequence_length
        # (position index targeted by each mutant: 0..L-1, repeated per letter)
        horizontal_cycle = np.tile(np.arange(sequence.shape[1]),
                                   sequence.shape[0])
        mutated_sequences[arange, :, horizontal_cycle, :] = 0

        # add mutant: set the targeted letter at the targeted position
        # (letter index targeted by each mutant: each letter repeated L times)
        vertical_repeat = np.repeat(np.arange(sequence.shape[0]),
                                    sequence.shape[1])
        mutated_sequences[arange, vertical_repeat, horizontal_cycle, :] = 1
        # make mutant predictions
        mutated_predictions = model.predict(NumpyDataset(mutated_sequences))
        # Back to (N_letters, sequence_length, 1, num_tasks) so that entry
        # [letter, position] is the prediction for that mutant.
        mutated_predictions = mutated_predictions.reshape(sequence.shape +
                                                          (model.num_tasks, ))
        mutagenesis_scores[
            sequence_index] = wild_type_prediction - mutated_predictions
    # Move the task axis to the front.
    rolled_scores = np.rollaxis(mutagenesis_scores, -1)
    return rolled_scores
  def sample(self, n_graphs: int = 100) -> NumpyDataset:
    """Samples random graphs whose features are shifted by their labels.

    For a `'graph'` task every graph gets one integer label in
    `[0, n_classes - 1]` that is added to its node, edge and extra features
    and returned as the dataset's `y`. For a `'node'` task every node gets its
    own label in the same range, which is added to that node's features only
    and stored on the graph as the `y` keyword of `GraphData`.

    Parameters
    ----------
    n_graphs: int, default 100
      Number of graphs to generate

    Returns
    -------
    graphs: NumpyDataset
      Generated Graphs

    Raises
    ------
    ValueError
      If `self.task` is neither `'graph'` nor `'node'`.
    """
    if self.task not in ('graph', 'node'):
      raise ValueError(
          f"Unsupported task {self.task!r}; expected 'graph' or 'node'")

    graphs, labels = [], []
    for _ in range(n_graphs):
      n_nodes = random.randint(self.min_nodes, self.max_nodes)
      edge_index = generate_edge_index(n_nodes, self.avg_degree)
      n_edges = edge_index.shape[1]

      if self.task == 'graph':
        # random.randint includes both endpoints.
        graph_label = random.randint(0, self.n_classes - 1)
        node_features = np.random.rand(n_nodes,
                                       self.n_node_features) + graph_label
        edge_features = np.random.rand(n_edges,
                                       self.n_edge_features) + graph_label
        kwargs = {}
        for feature_name, feature_shape in self.kwargs.items():
          kwargs[feature_name] = np.random.rand(1, feature_shape) + graph_label
        labels.append(graph_label)

      else:  # self.task == 'node'
        # np.random.randint excludes the upper bound, so `n_classes` (not
        # `n_classes - 1`) is needed to cover the same label range as the
        # graph task above.
        node_label = np.random.randint(0, self.n_classes,
                                       n_nodes).astype(np.float64)
        node_features = np.random.rand(
            n_nodes, self.n_node_features) + node_label.reshape(-1, 1)
        # For a node-prediction task, label is not added to edge features and other global features
        # because label here is a node-level attribute and not a graph-level attribute
        edge_features = np.random.rand(n_edges, self.n_edge_features)
        kwargs = {}
        for feature_name, feature_shape in self.kwargs.items():
          kwargs[feature_name] = np.random.rand(1, feature_shape)
        kwargs['y'] = node_label

      graphs.append(
          GraphData(node_features, edge_index, edge_features, **kwargs))

    # Build the dataset once, after all graphs exist, instead of rebuilding it
    # on every iteration (which also left it undefined for n_graphs == 0).
    if self.task == 'graph':
      return NumpyDataset(X=np.array(graphs), y=np.array(labels))
    # In this case, the 'y' attribute of GraphData will contain the
    # node-level labels.
    return NumpyDataset(X=np.array(graphs))
    def test_compute_model_performance_singletask_regressor_ordering(self):
        """Fit y = x + 1 on 1000 ordered points through a Databag and
        evaluate one sample per batch."""
        n_data_points = 1000
        n_features = 1

        # Column vector 0..n_data_points-1 and its successor as the target.
        raw_inputs = np.expand_dims(np.array(range(n_data_points)), axis=-1)
        raw_targets = raw_inputs + 1
        inputs = NumpyDataset(raw_inputs)
        target = NumpyDataset(raw_targets)

        databag = Databag()

        features = Feature(shape=(None, n_features))
        databag.add_dataset(features, inputs)

        # Single regression head.
        label = Label(shape=(None, 1))
        dense = Dense(out_channels=1, in_layers=[features])
        loss = ReduceSquareDifference(in_layers=[dense, label])
        databag.add_dataset(label, target)

        total_loss = ReduceMean(in_layers=[loss])

        tg = dc.models.TensorGraph(mode="regression", learning_rate=0.1)
        tg.add_output(dense)
        tg.set_loss(total_loss)

        training_batches = databag.iterbatches(epochs=1000,
                                               batch_size=tg.batch_size,
                                               pad_batches=True)
        tg.fit_generator(training_batches)

        metrics = [
            dc.metrics.Metric(dc.metrics.mean_absolute_error,
                              np.mean,
                              mode="regression"),
            dc.metrics.Metric(dc.metrics.pearson_r2_score, mode="regression"),
        ]
        results = tg.evaluate_generator(databag.iterbatches(batch_size=1),
                                        metrics,
                                        labels=[label],
                                        per_task_metrics=True)
        print(results)
        per_task_scores = list(results[1].values())
        assert_true(np.all(np.isclose(per_task_scores, [0.0], atol=0.5)))
    def test_compute_model_performance_multitask_regressor(self):
        """Fit a two-output dense layer to constant targets and check the
        per-task mean absolute error."""
        random_seed = 42
        n_data_points = 20
        n_features = 2
        n_tasks = 2
        np.random.seed(seed=random_seed)

        X = np.random.rand(n_data_points, n_features)
        # Two constant target columns: +0.5 for task 0, -0.5 for task 1.
        y = np.tile([0.5, -0.5], (n_data_points, 1))
        dataset = NumpyDataset(X, y)

        features = Feature(shape=(None, n_features))
        label = Label(shape=(None, n_tasks))
        dense = Dense(out_channels=n_tasks, in_layers=[features])
        loss = ReduceSquareDifference(in_layers=[dense, label])

        tg = dc.models.TensorGraph(random_seed=random_seed, learning_rate=0.1)
        tg.add_output(dense)
        tg.set_loss(loss)
        tg.fit(dataset, nb_epoch=1000)

        mae = dc.metrics.Metric(dc.metrics.mean_absolute_error,
                                np.mean,
                                mode="regression")
        results = tg.evaluate_generator(tg.default_generator(dataset), [mae],
                                        labels=[label],
                                        per_task_metrics=True)
        per_task_scores = list(results[1].values())
        assert_true(np.all(np.isclose(per_task_scores, [0.0, 0.0], atol=1.0)))
    def test_compute_model_performance_singletask_classifier(self):
        """Fit a softmax classifier on two trivially separable groups with
        one-hot labels and check the ROC AUC."""
        n_data_points = 20
        n_features = 10
        half = int(n_data_points / 2)

        # First half of the inputs is all -1 (class 0), second half all +1
        # (class 1).
        X = np.concatenate((np.ones(shape=(half, n_features)) * -1,
                            np.ones(shape=(half, n_features))))
        class_0 = np.tile([1.0, 0.0], (half, 1))
        class_1 = np.tile([0.0, 1.0], (half, 1))
        dataset = NumpyDataset(X, np.concatenate((class_0, class_1)))

        features = Feature(shape=(None, n_features))
        label = Label(shape=(None, 2))
        dense = Dense(out_channels=2, in_layers=[features])
        probabilities = SoftMax(in_layers=[dense])
        cross_entropy = SoftMaxCrossEntropy(in_layers=[label, dense])
        total_loss = ReduceMean(in_layers=cross_entropy)

        tg = dc.models.TensorGraph(learning_rate=0.1)
        tg.add_output(probabilities)
        tg.set_loss(total_loss)
        tg.fit(dataset, nb_epoch=1000)

        auc = dc.metrics.Metric(dc.metrics.roc_auc_score,
                                np.mean,
                                mode="classification")
        results = tg.evaluate_generator(tg.default_generator(dataset), [auc],
                                        labels=[label],
                                        per_task_metrics=True)
        per_task_scores = list(results[1].values())
        assert_true(np.isclose(per_task_scores, [1.0], atol=0.05))
Exemple #11
0
    def test_neighbor_list_simple(self):
        """Test that neighbor lists can be constructed.

        Only builds the graph (`tg.build()`); nothing is fitted or evaluated,
        so the test passes as long as construction does not raise.
        """
        N_atoms = 10
        start = 0
        stop = 12
        nbr_cutoff = 3
        ndim = 3
        M = 6
        X = np.random.rand(N_atoms, ndim)
        y = np.random.rand(N_atoms, 1)
        # NOTE(review): `dataset` is never used below.
        dataset = NumpyDataset(X, y)

        features = Feature(shape=(N_atoms, ndim))
        # NOTE(review): `labels` is not wired into any layer below.
        labels = Label(shape=(N_atoms, ))
        nbr_list = NeighborList(N_atoms,
                                M,
                                ndim,
                                nbr_cutoff,
                                start,
                                stop,
                                in_layers=[features])
        nbr_list = ToFloat(in_layers=[nbr_list])
        # This isn't a meaningful loss, but just for test
        loss = ReduceSum(in_layers=[nbr_list])
        tg = dc.models.TensorGraph(use_queue=False)
        tg.add_output(nbr_list)
        tg.set_loss(loss)

        tg.build()
Exemple #12
0
    def get_task_dataset_minus_support(self, support, task):
        """Gets data for specified task, minus support points.

        Useful for evaluating model performance once trained (so that
        test compounds can be ensured distinct from support.)

        Parameters
        ----------
        support: dc.data.Dataset
            The support dataset; every sample whose id appears in
            `support.ids` is excluded from the result.
        task: int
            Task number of task to select (used to index `self`).

        Returns
        -------
        NumpyDataset
            The task's samples that are not part of `support`.
        """
        dataset = self[task]
        support_ids = set(support.ids)
        ids = dataset.ids

        # Positions of the samples whose id is not in the support set.
        kept = [
            position for position in range(len(dataset))
            if ids[position] not in support_ids
        ]

        return NumpyDataset(dataset.X[kept], dataset.y[kept, :],
                            dataset.w[kept, :], ids[kept])
Exemple #13
0
    def k_fold_split(self, dataset, K):
        """Performs a K-fold split of the tasks for dataset.

        Every fold shares the same `X` and `ids`; folds differ only in which
        columns of `y` and `w` they keep. All folds but the last take
        `round(n_tasks / K)` consecutive task columns; the last fold takes
        whatever columns remain.

        Parameters
        ----------
        dataset: dc.data.Dataset
          Dataset to be split
        K: int
          Number of splits to be made

        Returns
        -------
        list
          `K` `NumpyDataset` objects, one per fold.
        """
        n_tasks = len(dataset.get_task_names())
        # NOTE(review): the fold size is rounded to nearest, so it can round
        # up; e.g. 9 tasks with K=6 gives a fold size of 2 and column ranges
        # that run past column 8 -- confirm whether floor division was
        # intended.
        n_per_fold = int(np.round(n_tasks / float(K)))
        if n_per_fold * K != n_tasks:
            print("Assigning extra tasks to last fold due to uneven split")

        X, y, w, ids = dataset.X, dataset.y, dataset.w, dataset.ids

        last_fold = K - 1
        fold_datasets = []
        for fold in range(K):
            first_task = fold * n_per_fold
            if fold == last_fold:
                stop_task = n_tasks
            else:
                stop_task = first_task + n_per_fold
            task_columns = range(first_task, stop_task)
            fold_datasets.append(
                NumpyDataset(X, y[:, task_columns], w[:, task_columns], ids))
        return fold_datasets
 def test_tensorboard(self):
     """Train with tensorboard logging enabled and check that a non-empty
     events file shows up in the model directory."""
     n_data_points = 20
     n_features = 2
     X = np.random.rand(n_data_points, n_features)
     y = [[0, 1] for _ in range(n_data_points)]
     dataset = NumpyDataset(X, y)

     features = Feature(shape=(None, n_features))
     dense = Dense(out_channels=2, in_layers=[features])
     output = SoftMax(in_layers=[dense])
     label = Label(shape=(None, 2))
     smce = SoftMaxCrossEntropy(in_layers=[label, dense])
     loss = ReduceMean(in_layers=[smce])

     # NOTE(review): the model directory is a fixed path, so event files left
     # over from an earlier run would also satisfy the checks below.
     tg = dc.models.TensorGraph(tensorboard=True,
                                tensorboard_log_frequency=1,
                                learning_rate=0.01,
                                model_dir='/tmp/tensorgraph')
     tg.add_output(output)
     tg.set_loss(loss)
     tg.fit(dataset, nb_epoch=1000)

     event_files = [
         name for name in os.listdir(tg.model_dir)
         if name.startswith("events")
     ]
     assert_true(len(event_files) > 0)
     first_event_path = os.path.join(tg.model_dir, event_files[0])
     assert_true(os.stat(first_event_path).st_size > 0)
Exemple #15
0
  def test_neighbor_list_vina(self):
    """Test under conditions closer to Vina usage.

    Builds a small graph on top of a NeighborList layer and runs one epoch of
    fitting; the test passes as long as nothing raises.
    """
    N_atoms = 5
    M_nbrs = 2
    ndim = 3
    start = 0
    stop = 4
    nbr_cutoff = 1

    # Random coordinates drawn uniformly from [start, stop).
    X = NumpyDataset(start + np.random.rand(N_atoms, ndim) * (stop - start))

    coords = Feature(shape=(N_atoms, ndim))

    # Now an (N, M) shape
    nbr_list = NeighborList(
        N_atoms, M_nbrs, ndim, nbr_cutoff, start, stop, in_layers=[coords])

    nbr_list = ToFloat(in_layers=[nbr_list])
    flattened = Flatten(in_layers=[nbr_list])
    dense = Dense(out_channels=1, in_layers=[flattened])
    # Not a meaningful objective; it only gives the graph a loss to minimise.
    output = ReduceSum(in_layers=[dense])

    tg = dc.models.TensorGraph(learning_rate=0.1, use_queue=False)
    tg.set_loss(output)

    databag = Databag({coords: X})
    tg.fit_generator(databag.iterbatches(epochs=1))
Exemple #16
0
    def test_save_load(self):
        """Check that a saved model, moved to another directory and reloaded,
        reproduces the original predictions.

        The graph also contains a submodel, so saving and loading is
        exercised with one present. The relocated model directory is removed
        once the test is done, whether it passes or fails.
        """
        n_data_points = 20
        n_features = 2
        X = np.random.rand(n_data_points, n_features)
        y = [[0, 1] for x in range(n_data_points)]
        dataset = NumpyDataset(X, y)
        features = Feature(shape=(None, n_features))
        dense = Dense(out_channels=2, in_layers=[features])
        output = SoftMax(in_layers=[dense])
        label = Label(shape=(None, 2))
        smce = SoftMaxCrossEntropy(in_layers=[label, dense])
        loss = ReduceMean(in_layers=[smce])
        tg = dc.models.TensorGraph(learning_rate=0.01)
        tg.add_output(output)
        tg.set_loss(loss)
        submodel_loss = ReduceSum(in_layers=smce)
        submodel_opt = Adam(learning_rate=0.002)
        # The returned submodel is not used directly; it only has to exist in
        # the graph when the model is saved.
        tg.create_submodel(layers=[dense],
                           loss=submodel_loss,
                           optimizer=submodel_opt)
        tg.fit(dataset, nb_epoch=1)
        prediction = np.squeeze(tg.predict_on_batch(X))
        tg.save()

        # mkdtemp only reserves a unique path: the directory itself is removed
        # so that the model directory is moved to exactly that path instead of
        # being nested inside it.
        dirpath = tempfile.mkdtemp()
        shutil.rmtree(dirpath)
        try:
            shutil.move(tg.model_dir, dirpath)

            tg1 = TensorGraph.load_from_dir(dirpath)
            prediction2 = np.squeeze(tg1.predict_on_batch(X))
            assert_true(
                np.all(np.isclose(prediction, prediction2, atol=0.01)))
        finally:
            # Do not leave the relocated model behind in the temp location.
            shutil.rmtree(dirpath, ignore_errors=True)
    def test_set_optimizer(self):
        """Train with a custom optimizer using a decaying learning rate, then
        check that a reloaded model reproduces the predictions."""
        n_data_points = 20
        n_features = 2
        X = np.random.rand(n_data_points, n_features)
        y = [[0, 1] for _ in range(n_data_points)]
        dataset = NumpyDataset(X, y)

        features = Feature(shape=(None, n_features))
        dense = Dense(out_channels=2, in_layers=[features])
        output = SoftMax(in_layers=[dense])
        label = Label(shape=(None, 2))
        smce = SoftMaxCrossEntropy(in_layers=[label, dense])
        loss = ReduceMean(in_layers=[smce])

        tg = dc.models.TensorGraph(learning_rate=0.01, use_queue=False)
        tg.add_output(output)
        tg.set_loss(loss)
        # The return value is not needed; the call is kept because it happens
        # before the optimizer is replaced.
        tg.get_global_step()
        decaying_rate = ExponentialDecay(initial_rate=0.1,
                                         decay_rate=0.96,
                                         decay_steps=100000)
        tg.set_optimizer(GradientDescent(learning_rate=decaying_rate))
        tg.fit(dataset, nb_epoch=1000)
        expected = np.squeeze(tg.predict_on_batch(X))
        tg.save()

        reloaded = TensorGraph.load_from_dir(tg.model_dir)
        actual = np.squeeze(reloaded.predict_on_batch(X))
        assert_true(np.all(np.isclose(expected, actual, atol=0.01)))
def test_compute_model_performance_singletask_classifier():
    """Computes model performance on singletask dataset with one-hot label encoding."""
    n_data_points = 20
    n_features = 10
    half = int(n_data_points / 2)

    # First half of the inputs is all -1 (class 0), second half all +1
    # (class 1).
    X = np.concatenate((np.ones(shape=(half, n_features)) * -1,
                        np.ones(shape=(half, n_features))))
    class_0 = np.tile([1.0, 0.0], (half, 1))
    class_1 = np.tile([0.0, 1.0], (half, 1))
    dataset = NumpyDataset(X, np.concatenate((class_0, class_1)))

    features = layers.Input(shape=(n_features, ))
    logits = layers.Dense(2)(features)
    probabilities = layers.Softmax()(logits)
    keras_model = tf.keras.Model(inputs=features, outputs=[probabilities])
    model = dc.models.KerasModel(keras_model,
                                 dc.models.losses.SoftmaxCrossEntropy(),
                                 learning_rate=0.1)
    model.fit(dataset, nb_epoch=1000)

    auc = dc.metrics.Metric(dc.metrics.roc_auc_score,
                            np.mean,
                            mode="classification",
                            n_tasks=1)
    results = model.evaluate_generator(model.default_generator(dataset),
                                       [auc],
                                       per_task_metrics=True)
    per_task_scores = list(results[1].values())
    assert np.isclose(per_task_scores, [1.0], atol=0.05)
def test_compute_model_performance_multitask_regressor():
    """Fit a two-output Keras dense layer to constant targets and check the
    per-task mean absolute error."""
    random_seed = 42
    n_data_points = 20
    n_features = 2
    n_tasks = 2
    np.random.seed(seed=random_seed)

    X = np.random.rand(n_data_points, n_features)
    # Two constant target columns: +0.5 for task 0, -0.5 for task 1.
    y = np.tile([0.5, -0.5], (n_data_points, 1))
    dataset = NumpyDataset(X, y)

    features = layers.Input(shape=(n_features, ))
    predictions = layers.Dense(n_tasks)(features)
    keras_model = tf.keras.Model(inputs=features, outputs=[predictions])
    model = dc.models.KerasModel(keras_model,
                                 dc.models.losses.L2Loss(),
                                 learning_rate=0.1)
    model.fit(dataset, nb_epoch=1000)

    mae = dc.metrics.Metric(dc.metrics.mean_absolute_error,
                            np.mean,
                            mode="regression")
    results = model.evaluate_generator(model.default_generator(dataset),
                                       [mae],
                                       per_task_metrics=True)
    per_task_scores = list(results[1].values())
    assert np.all(np.isclose(per_task_scores, [0.0, 0.0], atol=1.0))
Exemple #20
0
  def predict_on_batch(
      self,
      X: ArrayLike,
      transformers: List[Transformer] = [],
      outputs: Optional[OneOrMany[tf.Tensor]] = None) -> OneOrMany[np.ndarray]:
    """Generates predictions for a single batch of input samples.

    The batch is wrapped in a label-free `NumpyDataset` and handed to
    `self.predict`.

    Parameters
    ----------
    X: ndarray
      the input data, as a Numpy array.
    transformers: list of dc.trans.Transformers
      Transformers that the input data has been transformed by.  The output
      is passed through these transformers to undo the transformations.
    outputs: Tensor or list of Tensors
      The outputs to return.  If this is None, the model's standard prediction
      outputs will be returned.  Alternatively one or more Tensors within the
      model may be specified, in which case the output of those Tensors will be
      returned.

    Returns
    -------
    a NumPy array if the model produces a single output, or a list of arrays
    if it produces multiple outputs
    """
    return self.predict(NumpyDataset(X=X, y=None), transformers, outputs)
Exemple #21
0
  def predict_uncertainty_on_batch(
      self, X: Sequence,
      masks: int = 50) -> OneOrMany[Tuple[np.ndarray, np.ndarray]]:
    """
    Predict the model's outputs for one batch, with an uncertainty for each.

    The batch is wrapped in a label-free `NumpyDataset` and handed to
    `self.predict_uncertainty`.

    The uncertainty is computed as described in https://arxiv.org/abs/1703.04977.
    It involves repeating the prediction many times with different dropout masks.
    The prediction is computed as the average over all the predictions.  The
    uncertainty includes both the variation among the predicted values (epistemic
    uncertainty) and the model's own estimates for how well it fits the data
    (aleatoric uncertainty).  Not all models support uncertainty prediction.

    Parameters
    ----------
    X: ndarray
      the input data, as a Numpy array.
    masks: int
      the number of dropout masks to average over

    Returns
    -------
    for each output, a tuple (y_pred, y_std) where y_pred is the predicted
    value of the output, and each element of y_std estimates the standard
    deviation of the corresponding element of y_pred
    """
    return self.predict_uncertainty(NumpyDataset(X=X, y=None), masks)
Exemple #22
0
    def fit_on_batch(self, X, y, w, variables=None, loss=None, callbacks=[]):
        """Perform a single step of training.

        The model is built first if that has not happened yet; the batch is
        then wrapped in a `NumpyDataset` and fitted for one epoch.

        Parameters
        ----------
        X: ndarray
          the inputs for the batch
        y: ndarray
          the labels for the batch
        w: ndarray
          the weights for the batch
        variables: list of tf.Variable
          the variables to train.  If None (the default), all trainable variables in
          the model are used.
        loss: function
          a function of the form f(outputs, labels, weights) that computes the loss
          for each batch.  If None (the default), the model's standard loss function
          is used.
        callbacks: function or list of functions
          one or more functions of the form f(model, step) that will be invoked after
          every step.  This can be used to perform validation, logging, etc.

        Returns
        -------
        Whatever `self.fit` returns for the one-epoch fit.
        """
        if not self.built:
            self.build()
        batch = NumpyDataset(X, y, w)
        fit_options = dict(nb_epoch=1,
                           variables=variables,
                           loss=loss,
                           callbacks=callbacks)
        return self.fit(batch, **fit_options)
Exemple #23
0
def get_dataset(mode='classification', featurizer='GraphConv', num_tasks=2):
    """Assemble a 20-sample dataset with random labels and a matching metric.

    Features and ids are the first 20 rows of the loaded training split;
    labels are drawn at random and all weights are one.

    Parameters
    ----------
    mode: str
      'classification' loads BACE classification and uses a mean ROC-AUC
      metric; any other value loads Delaney and uses mean absolute error.
    featurizer: str
      Forwarded unchanged to the dataset loader.
    num_tasks: int
      "random_task" is appended ``num_tasks - 1`` times to the task list
      returned by the loader.

    Returns
    -------
    A tuple ``(tasks, dataset, transformers, metric)``.
    """
    n_samples = 20
    is_classification = mode == 'classification'

    loader = load_bace_classification if is_classification else load_delaney
    tasks, all_dataset, transformers = loader(featurizer, reload=False)

    # Only the training split is used, but unpacking still requires exactly
    # three splits.
    train, _valid, _test = all_dataset

    # Extend the loader's task list in place with the extra dummy tasks.
    tasks.extend(["random_task"] * (num_tasks - 1))
    label_shape = (n_samples, len(tasks))
    w = np.ones(shape=label_shape)

    if is_classification:
        y = np.random.randint(0, 2, size=label_shape)
        metric = dc.metrics.Metric(dc.metrics.roc_auc_score,
                                   np.mean,
                                   mode="classification")
    else:
        y = np.random.normal(size=label_shape)
        metric = dc.metrics.Metric(dc.metrics.mean_absolute_error,
                                   mode="regression")

    small_dataset = NumpyDataset(train.X[:n_samples], y, w,
                                 train.ids[:n_samples])
    return tasks, small_dataset, transformers, metric
Exemple #24
0
 def score(self, protein_file, ligand_file):
     """Featurize one protein/ligand pair and return the model's prediction.

     Note that the featurizer receives the ligand list first and the
     protein list second.
     """
     complex_features = self.featurizer.featurize_complexes([ligand_file],
                                                            [protein_file])
     # No labels, weights or ids are supplied for prediction.
     return self.model.predict(
         NumpyDataset(X=complex_features, y=None, w=None, ids=None))
Exemple #25
0
def get_task_dataset_minus_support(dataset, support, task):
  """Gets data for specified task, minus support points.

  Useful for evaluating model performance once trained (so that
  test compounds can be ensured distinct from support.)

  Rows whose id appears in `support` are dropped first; of the remaining
  rows, only those with a nonzero weight for `task` are kept.

  Parameters
  ----------
  dataset: dc.data.Dataset
    Source dataset.
  support: dc.data.Dataset
    The support dataset
  task: int
    Task number of task to select.

  Returns
  -------
  NumpyDataset
    Built from the surviving rows of X and ids, with y and w restricted to
    the column for `task`.
  """
  support_ids = set(support.ids)

  # Read `ids` once instead of once per row: on some Dataset implementations
  # the attribute may be a computed property, which would make the filtering
  # loop needlessly expensive.
  all_ids = dataset.ids
  non_support_inds = [ind for ind in range(len(dataset))
                      if all_ids[ind] not in support_ids]

  # Remove support indices
  X = dataset.X[non_support_inds]
  y = dataset.y[non_support_inds]
  w = dataset.w[non_support_inds]
  ids = all_ids[non_support_inds]

  # Rows that actually carry a measurement for this task (nonzero weight).
  # The mask is computed once and reused for every array.
  has_task = w[:, task] != 0

  return NumpyDataset(X[has_task], y[has_task, task], w[has_task, task],
                      ids[has_task])
Exemple #26
0
  def test_atomic_conv_variable(self):
    """Smoke test: build an AtomicConvModel and fit it for one epoch on dummy input."""
    # The two fragments have different atom counts; the complex is sized as
    # the sum of both.
    n_frag1 = 1000
    n_frag2 = 1200
    n_complex = n_frag1 + n_frag2
    batch_size = 1
    model = atomic_conv.AtomicConvModel(
        batch_size=batch_size,
        frag1_num_atoms=n_frag1,
        frag2_num_atoms=n_frag2,
        complex_num_atoms=n_complex)

    def dummy_system(num_atoms):
      # Random coordinates, an empty neighbor list for every atom, and random
      # integers below 10 for the third entry -- drawn in that order.
      coords = np.random.rand(num_atoms, 3)
      nbr_list = {atom: [] for atom in range(num_atoms)}
      z = np.random.randint(10, size=num_atoms)
      return coords, nbr_list, z

    # One sample made of (coords, neighbor list, z) for fragment 1,
    # fragment 2 and the full complex, flattened into a single 9-tuple.
    sample = (dummy_system(n_frag1) + dummy_system(n_frag2) +
              dummy_system(n_complex))
    features = np.asarray([sample])
    labels = np.zeros(batch_size)
    model.fit(NumpyDataset(features, labels), nb_epoch=1)
def test_normalizing_flow():
    """Check a one-layer RealNVP flow: log-probs are negative and fit returns a positive value."""
    # A single RealNVP bijector is the whole flow.
    real_nvp = tfb.RealNVP(
        num_masked=1,
        shift_and_log_scale_fn=tfb.real_nvp_default_template(
            hidden_layers=[8, 8]))

    # Two-dimensional diagonal Gaussian as the base distribution.
    base = tfd.MultivariateNormalDiag(loc=[0., 0.])
    nf = NormalizingFlow(base_distribution=base, flow_layers=[real_nvp])
    nfm = NormalizingFlowModel(nf)

    # Training data: 96 samples from a Gaussian shifted along the first axis.
    # NOTE(review): the original comment says the data must be float32 for
    # RealNVP -- confirm the sampled dtype if this test starts failing.
    target = tfd.MultivariateNormalDiag(loc=[1., 0.])
    dataset = NumpyDataset(X=target.sample(96))

    # Probe points: a draw from the flow, the origin, and the first training
    # sample.
    probes = (nfm.flow.sample(), tf.zeros([2]), dataset.X[0])

    # log likelihoods should be negative
    for point in probes:
        assert nfm.flow.log_prob(point).numpy() < 0

    # Fit the model and check the value returned by fit.
    final = nfm.fit(dataset, nb_epoch=5)
    print(final)
    assert final > 0
Exemple #28
0
 def predict_mols(self, mols):
   """Predict for molecules from circular fingerprints.

   Each fingerprint gets a new axis 1 and is duplicated along it before
   being passed to ``self.predict``. Only column 0 of the first prediction
   output is returned.
   """
   fingerprints = CircularFingerprint(
       size=self.n_features, radius=2, chiral=True).featurize(mols)
   single = np.expand_dims(fingerprints, axis=1)
   # Stack the same fingerprint twice along the new axis.
   doubled = np.concatenate([single, single], axis=1)
   predictions = self.predict(NumpyDataset(doubled, None, None, None))
   return predictions[0][:, 0]
Exemple #29
0
 def predict_on_batch(self, X, transformers=[], outputs=None):
     """Predict on one batch of inputs and return ``10 ** -prediction``.

     X is wrapped in a label-free NumpyDataset, turned into an unpadded
     prediction generator, and passed to ``self.predict_on_generator``
     together with ``transformers`` and ``outputs``.
     """
     batch_generator = self.default_generator(NumpyDataset(X, y=None),
                                              predict=True,
                                              pad_batches=False)
     neg_log_values = self.predict_on_generator(batch_generator, transformers,
                                                outputs)
     # Per the original note, training is done on -log10(IC50), so the raw
     # output is mapped back by raising 10 to its negation.
     return 10**-neg_log_values
Exemple #30
0
  def test_training(self):
    """
    Train MolGAN on a small CSV of molecules and require one successful run.

    Training is unstable, so up to ``self.training_attempts`` fresh models
    are tried; the test passes as soon as one of them generates at least one
    valid molecule. Per the original note it typically fails 1-3 times out
    of 10, which should be addressed in future releases.
    """

    csv_path = os.path.join(self.current_dir, "molgan_example.csv")
    molecules = list(pd.read_csv(csv_path)['Molecule'])
    feat = MolGanFeaturizer()
    graphs = feat.featurize(molecules)
    # X holds the adjacency matrices, y the node features.
    dataset = NumpyDataset([graph.adjacency_matrix for graph in graphs],
                           [graph.node_features for graph in graphs])

    # Set to True on the first attempt that yields a valid molecule.
    success = False

    for _ in range(self.training_attempts):
      # Start every attempt from a clean backend state and a new model.
      keras_clear_session()
      gan = MolGAN(learning_rate=ExponentialDecay(0.001, 0.9, 5000))

      # Bound to a short local name to avoid a flake8 E125/yapf
      # incompatibility in the generator below.
      s = gan.batch_size

      def iterbatches(epochs):
        # Yield one-hot encoded feed dicts, one per padded batch, for the
        # requested number of passes over the dataset.
        for __ in range(epochs):
          for batch in dataset.iterbatches(batch_size=s, pad_batches=True):
            adjacency_tensor = one_hot(batch[0], gan.edges)
            node_tensor = one_hot(batch[1], gan.nodes)
            yield {
                gan.data_inputs[0]: adjacency_tensor,
                gan.data_inputs[1]: node_tensor
            }

      gan.fit_gan(iterbatches(1000), generator_steps=0.2, checkpoint_interval=0)

      # Sample from the generator and count how many entries defeaturize to
      # something other than None.
      generated = feat.defeaturize(gan.predict_gan_generator(1000))
      valid_molecules_count = sum(1 for mol in generated if mol is not None)
      print(valid_molecules_count)
      if valid_molecules_count:
        success = True
        break

    # At least one attempt must have produced a valid molecule; this check
    # should become stricter as the model improves.
    assert success