Exemple #1
0
    def construct_feed_dict(self, X_b, y_b=None, w_b=None, ids_b=None):
        """Build the feed dictionary for a single minibatch.

        Missing labels or weights are replaced by dummy values sized by
        ``self.batch_size`` so that every per-task entry is always present.

        TODO(rbharath): ids_b is not used here. Can we remove it?

        Args:
          X_b: np.ndarray of shape (batch_size, n_features)
          y_b: np.ndarray of shape (batch_size, n_tasks), or None
          w_b: np.ndarray of shape (batch_size, n_tasks), or None
          ids_b: List of length (batch_size) with datapoint identifiers.

        Returns:
          The result of ``TensorflowGraph.get_feed_dict`` applied to the
          name-to-value mapping built here.
        """
        feed = {"mol_features": X_b}
        has_labels = y_b is not None
        has_weights = w_b is not None
        for task in range(self.n_tasks):
            label_key = "labels_%d" % task
            weight_key = "weights_%d" % task
            if has_labels:
                feed[label_key] = to_one_hot(y_b[:, task])
            else:
                # Dummy placeholder: one-hot encoding of an all-zero label vector.
                zero_labels = np.zeros((self.batch_size, ))
                feed[label_key] = np.squeeze(to_one_hot(zero_labels))
            if has_weights:
                feed[weight_key] = w_b[:, task]
            else:
                # Dummy placeholder: unit weight for every sample.
                feed[weight_key] = np.ones((self.batch_size, ))
        return TensorflowGraph.get_feed_dict(feed)
Exemple #2
0
  def construct_feed_dict(self, X_b, y_b=None, w_b=None, ids_b=None):
    """Assemble the per-task feed dictionary for one minibatch.

    For each task index a ``labels_<task>`` and a ``weights_<task>`` entry is
    produced. When ``y_b`` or ``w_b`` is None, dummy values of length
    ``self.batch_size`` are substituted.

    TODO(rbharath): ids_b is not used here. Can we remove it?

    Args:
      X_b: np.ndarray of shape (batch_size, n_features)
      y_b: np.ndarray of shape (batch_size, n_tasks), or None
      w_b: np.ndarray of shape (batch_size, n_tasks), or None
      ids_b: List of length (batch_size) with datapoint identifiers.

    Returns:
      Whatever ``TensorflowGraph.get_feed_dict`` returns for the mapping.
    """
    entries = {"mol_features": X_b}
    for task in range(self.n_tasks):
      # Real one-hot labels when available, otherwise a dummy placeholder.
      entries["labels_%d" % task] = (
          to_one_hot(y_b[:, task]) if y_b is not None else
          np.squeeze(to_one_hot(np.zeros((self.batch_size,)))))
      # Real per-task weights when available, otherwise all ones.
      entries["weights_%d" % task] = (
          w_b[:, task] if w_b is not None else np.ones((self.batch_size,)))
    return TensorflowGraph.get_feed_dict(entries)
    def default_generator(self,
                          dataset,
                          epochs=1,
                          mode='fit',
                          deterministic=True,
                          pad_batches=True):
        """Yield ``([X_b], [y_b], [w_b])`` minibatches for ``epochs`` passes.

        In ``"predict"`` mode, or when ``self.augment`` is falsy, batches are
        taken directly from ``dataset.iterbatches``. Otherwise they are drawn
        from a Keras ``ImageDataGenerator`` built with ``rotation_range=180``.

        When ``self.mode`` is ``'classification'`` the labels are one-hot
        encoded and reshaped to ``(-1, self.n_tasks, self.n_classes)``.

        Args:
          dataset: object exposing ``iterbatches`` and ``X``/``y``/``w``.
          epochs: number of passes over the data.
          mode: generator mode; ``"predict"`` disables augmentation.
          deterministic: if False, batches are shuffled.
          pad_batches: if True, batches are padded to ``self.batch_size``.

        Yields:
          Tuples ``([X_b], [y_b], [w_b])``.
        """
        for epoch in range(epochs):
            if mode == "predict" or (not self.augment):
                for (X_b, y_b, w_b,
                     ids_b) in dataset.iterbatches(batch_size=self.batch_size,
                                                   deterministic=deterministic,
                                                   pad_batches=pad_batches):
                    if self.mode == 'classification':
                        y_b = to_one_hot(y_b.flatten(),
                                         self.n_classes).reshape(
                                             -1, self.n_tasks, self.n_classes)
                    yield ([X_b], [y_b], [w_b])

            else:
                n_samples = dataset.X.shape[0]
                # ImageDataGenerator.flow loops forever, so one epoch is
                # bounded explicitly at ceil(n_samples / batch_size) batches.
                # The earlier bound yielded a wrapped-around extra batch when
                # n_samples was a multiple of batch_size (with padding) and
                # dropped the final partial batch (without padding).
                batches_per_epoch = -(-n_samples // self.batch_size)
                image_data_generator = tf.keras.preprocessing.image.ImageDataGenerator(
                    rotation_range=180)
                flow = image_data_generator.flow(
                    dataset.X,
                    dataset.y,
                    sample_weight=dataset.w,
                    shuffle=not deterministic,
                    batch_size=self.batch_size)
                # zip() stops on the exhausted range before asking the
                # infinite generator for another (unused) augmented batch.
                for _, (X_b, y_b, w_b) in zip(range(batches_per_epoch), flow):
                    if pad_batches:
                        ids_b = np.arange(X_b.shape[0])
                        X_b, y_b, w_b, _ = pad_batch(self.batch_size, X_b, y_b,
                                                     w_b, ids_b)
                    if self.mode == "classification":
                        y_b = to_one_hot(y_b.flatten(),
                                         self.n_classes).reshape(
                                             -1, self.n_tasks, self.n_classes)
                    yield ([X_b], [y_b], [w_b])
Exemple #4
0
 def default_generator(self,
                       dataset,
                       epochs=1,
                       predict=False,
                       deterministic=True,
                       pad_batches=True):
   """Yield one feed dictionary per minibatch for a graph-convolution model.

   Each dictionary maps the model's label, weight and graph-topology
   placeholders to the values for one batch. Molecules in the batch are
   merged with ``ConvMol.agglomerate_mols``.

   Args:
     dataset: object exposing ``iterbatches``.
     epochs: number of passes over the data.
     predict: when False, a progress line is printed at each epoch start.
     deterministic: forwarded to ``dataset.iterbatches``.
     pad_batches: accepted for interface compatibility only; batches are
       always requested with ``pad_batches=True``.

   Yields:
     dict keyed by the model's placeholders.
   """
   for epoch in range(epochs):
     if not predict:
       print('Starting epoch %i' % epoch)
     for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(
         self.batch_size, pad_batches=True, deterministic=deterministic):
       d = {}
       for index, label in enumerate(self.my_labels):
         if self.mode == 'classification':
           d[label] = to_one_hot(y_b[:, index])
         elif self.mode == 'regression':
           # Keep a trailing axis so each task label is a column vector.
           d[label] = np.expand_dims(y_b[:, index], -1)
       d[self.my_task_weights] = w_b
       multiConvMol = ConvMol.agglomerate_mols(X_b)
       d[self.atom_features] = multiConvMol.get_atom_features()
       d[self.degree_slice] = multiConvMol.deg_slice
       d[self.membership] = multiConvMol.membership
       # Fetch the adjacency lists once instead of on every iteration
       # (assumes the getter is side-effect free). Entry 0 is skipped, so
       # list i feeds placeholder i - 1.
       deg_adj_lists = multiConvMol.get_deg_adjacency_lists()
       for i in range(1, len(deg_adj_lists)):
         d[self.deg_adjs[i - 1]] = deg_adj_lists[i]
       yield d
Exemple #5
0
  def default_generator(self,
                        dataset,
                        epochs=1,
                        predict=False,
                        deterministic=True,
                        pad_batches=True):
    """Yield weave-style feed dictionaries, one per minibatch.

    TensorGraph style implementation, similar to
    deepchem.models.tf_new_models.graph_topology.AlternateWeaveTopology.batch_to_feed_dict

    All molecules of a batch are flattened into shared atom and pair arrays;
    atom indices are shifted by a running offset so they stay unique across
    the batch.

    Args:
      dataset: object exposing ``iterbatches``.
      epochs: number of passes over the data.
      predict: when True, labels are not fed and no progress is printed.
      deterministic: forwarded to ``dataset.iterbatches``.
      pad_batches: forwarded to ``dataset.iterbatches``.

    Yields:
      dict keyed by the model's placeholders.
    """
    for epoch in range(epochs):
      if not predict:
        print('Starting epoch %i' % epoch)
      batches = dataset.iterbatches(
          batch_size=self.batch_size,
          deterministic=deterministic,
          pad_batches=pad_batches)
      for (X_b, y_b, w_b, ids_b) in batches:
        feed_dict = {}
        if y_b is not None and not predict:
          for index, label in enumerate(self.labels_fd):
            if self.mode == "classification":
              feed_dict[label] = to_one_hot(y_b[:, index])
            elif self.mode == "regression":
              # Slice (not index) to keep the task column two-dimensional.
              feed_dict[label] = y_b[:, index:index + 1]
        if w_b is not None:
          feed_dict[self.weights] = w_b

        atom_blocks = []
        pair_blocks = []
        index_blocks = []
        atom_owner = []
        pair_owner = []
        offset = 0
        for mol_idx, mol in enumerate(X_b):
          n_atoms = mol.get_num_atoms()
          # Molecule index of every atom.
          atom_owner.extend([mol_idx] * n_atoms)
          # All ordered atom pairs (i, j) of this molecule, batch-global.
          C0, C1 = np.meshgrid(np.arange(n_atoms), np.arange(n_atoms))
          first_atom = C1.flatten() + offset
          second_atom = C0.flatten() + offset
          index_blocks.append(np.transpose(np.array([first_atom,
                                                     second_atom])))
          # First atom of every pair.
          pair_owner.extend(first_atom)
          offset += n_atoms

          atom_blocks.append(mol.get_atom_features())
          pair_shape = (n_atoms * n_atoms, self.n_pair_feat)
          pair_blocks.append(np.reshape(mol.get_pair_features(), pair_shape))

        feed_dict[self.atom_features] = np.concatenate(atom_blocks, axis=0)
        feed_dict[self.pair_features] = np.concatenate(pair_blocks, axis=0)
        feed_dict[self.pair_split] = np.array(pair_owner)
        feed_dict[self.atom_split] = np.array(atom_owner)
        feed_dict[self.atom_to_pair] = np.concatenate(index_blocks, axis=0)
        yield feed_dict
Exemple #6
0
 def default_generator(self,
                       dataset,
                       epochs=1,
                       predict=False,
                       deterministic=True,
                       pad_batches=True):
   """Yield one feed dictionary per minibatch for a graph-convolution model.

   Labels are fed through ``self.labels[0]``: one-hot encoded and reshaped to
   ``(-1, self.n_tasks, self.n_classes)`` in classification mode, passed
   through unchanged otherwise. Molecules are merged with
   ``ConvMol.agglomerate_mols``.

   Args:
     dataset: object exposing ``iterbatches``.
     epochs: number of passes over the data.
     predict: unused here; kept for interface compatibility.
     deterministic: forwarded to ``dataset.iterbatches``.
     pad_batches: forwarded to ``dataset.iterbatches``.

   Yields:
     dict keyed by the model's placeholders.
   """
   for epoch in range(epochs):
     for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(
         self.batch_size,
         pad_batches=pad_batches,
         deterministic=deterministic):
       d = {}
       if self.mode == 'classification':
         d[self.labels[0]] = to_one_hot(y_b.flatten(), self.n_classes).reshape(
             -1, self.n_tasks, self.n_classes)
       else:
         d[self.labels[0]] = y_b
       d[self.task_weights[0]] = w_b
       multiConvMol = ConvMol.agglomerate_mols(X_b)
       d[self.atom_features] = multiConvMol.get_atom_features()
       d[self.degree_slice] = multiConvMol.deg_slice
       d[self.membership] = multiConvMol.membership
       # Fetch the adjacency lists once rather than on every iteration
       # (assumes the getter is side-effect free). Entry 0 is skipped, so
       # list i feeds placeholder i - 1.
       deg_adj_lists = multiConvMol.get_deg_adjacency_lists()
       for i in range(1, len(deg_adj_lists)):
         d[self.deg_adjs[i - 1]] = deg_adj_lists[i]
       yield d
Exemple #7
0
    def default_generator(self,
                          dataset,
                          epochs=1,
                          predict=False,
                          deterministic=True,
                          pad_batches=True):
        """Yield feed dictionaries whose inputs are integer-encoded SMILES.

        The batch identifiers ``ids_b`` are converted with
        ``self.smiles_to_seq_batch`` (presumably they are SMILES strings --
        confirm against the dataset). Labels and weights are fed only when
        ``predict`` is False.

        Args:
          dataset: object exposing ``iterbatches``.
          epochs: number of passes over the data.
          predict: when True, labels and weights are omitted.
          deterministic: forwarded to ``dataset.iterbatches``.
          pad_batches: forwarded to ``dataset.iterbatches``.

        Yields:
          dict keyed by the model's placeholders.
        """
        training = not predict
        for epoch in range(epochs):
            batches = dataset.iterbatches(batch_size=self.batch_size,
                                          deterministic=deterministic,
                                          pad_batches=pad_batches)
            for (X_b, y_b, w_b, ids_b) in batches:
                feed_dict = {}
                if training and y_b is not None:
                    if self.mode == "classification":
                        # Binary one-hot labels, one (n_tasks, 2) block per sample.
                        encoded = to_one_hot(y_b.flatten(), 2)
                        labels = encoded.reshape(-1, self.n_tasks, 2)
                    else:
                        labels = y_b
                    feed_dict[self.labels[0]] = labels
                if training and w_b is not None:
                    feed_dict[self.task_weights[0]] = w_b

                # Transform SMILES sequence to integers
                feed_dict[self.smiles_seqs] = self.smiles_to_seq_batch(ids_b)
                yield feed_dict
Exemple #8
0
 def default_generator(self,
                       dataset,
                       epochs=1,
                       predict=False,
                       deterministic=True,
                       pad_batches=True):
   """Yield one feed dictionary per minibatch of adjacency-matrix graphs.

   Each element of ``X_b`` is indexed as ``x[0]`` (adjacency matrix),
   ``x[1]`` (vertex features) and ``x[2]`` (number of real atoms). An atom
   mask of shape ``(batch_size, max_atoms, 1)`` is built with ones for the
   first ``x[2]`` atoms of every molecule.

   Args:
     dataset: object exposing ``iterbatches``.
     epochs: number of passes over the data.
     predict: when False, a progress line is printed at each epoch start.
     deterministic: forwarded to ``dataset.iterbatches``.
     pad_batches: accepted for interface compatibility only; batches are
       always requested with ``pad_batches=True`` because the mask loop
       indexes exactly ``self.batch_size`` molecules.

   Yields:
     dict keyed by the model's placeholders.
   """
   for epoch in range(epochs):
     if not predict:
       print('Starting epoch %i' % epoch)
     for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(
         self.batch_size, pad_batches=True, deterministic=deterministic):
       d = {}
       for index, label in enumerate(self.my_labels):
         if self.mode == 'classification':
           d[label] = to_one_hot(y_b[:, index])
         elif self.mode == 'regression':
           # Keep a trailing axis so each task label is a column vector.
           d[label] = np.expand_dims(y_b[:, index], -1)
       d[self.my_task_weights] = w_b
       d[self.adj_matrix] = np.expand_dims(np.array([x[0] for x in X_b]), -2)
       d[self.vertex_features] = np.array([x[1] for x in X_b])
       mask = np.zeros(shape=(self.batch_size, self.max_atoms, 1))
       for i in range(self.batch_size):
         mask_size = X_b[i][2]
         # Mark every real atom. The former chained indexing
         # ``mask[i][:mask_size][0] = 1`` only set the first atom's entry
         # and raised IndexError for an empty molecule.
         mask[i, :mask_size, 0] = 1
       d[self.mask] = mask
       yield d
Exemple #9
0
  def default_generator(self,
                        dataset,
                        epochs=1,
                        predict=False,
                        deterministic=True,
                        pad_batches=True):
    """Yield feed dictionaries with SMILES converted to integer vectors.

    Every identifier in ``ids_b`` is passed through ``self.smiles_to_seq``
    and the results are stacked along a new leading axis. Labels are fed per
    task through ``self.labels_fd`` unless ``predict`` is True.

    Args:
      dataset: object exposing ``iterbatches``.
      epochs: number of passes over the data.
      predict: when True, labels are omitted and no progress is printed.
      deterministic: forwarded to ``dataset.iterbatches``.
      pad_batches: forwarded to ``dataset.iterbatches``.

    Yields:
      dict keyed by the model's placeholders.
    """
    for epoch in range(epochs):
      if not predict:
        print('Starting epoch %i' % epoch)
      batches = dataset.iterbatches(
          batch_size=self.batch_size,
          deterministic=deterministic,
          pad_batches=pad_batches)
      for (X_b, y_b, w_b, ids_b) in batches:
        feed_dict = {}
        if y_b is not None and not predict:
          for index, label in enumerate(self.labels_fd):
            if self.mode == "classification":
              feed_dict[label] = to_one_hot(y_b[:, index])
            elif self.mode == "regression":
              # Slice (not index) to keep the task column two-dimensional.
              feed_dict[label] = y_b[:, index:index + 1]
        if w_b is not None:
          feed_dict[self.weights] = w_b
        # Transform SMILES string to integer vectors
        feed_dict[self.smiles_seqs] = np.stack(
            [self.smiles_to_seq(smiles) for smiles in ids_b], axis=0)
        yield feed_dict
    def default_generator(self,
                          dataset,
                          epochs=1,
                          predict=False,
                          deterministic=True,
                          pad_batches=True):
        """Yield DAG-model feed dictionaries, one per minibatch.

        TensorGraph style implementation, similar to
        deepchem.models.tf_new_models.graph_topology.DAGGraphTopology.batch_to_feed_dict

        Atoms of all molecules in a batch are concatenated; per-molecule
        calculation orders are shifted by the molecule's starting atom index
        so they address the concatenated atom array.

        Args:
          dataset: object exposing ``iterbatches``.
          epochs: number of passes over the data.
          predict: when True, labels are omitted and no progress is printed.
          deterministic: forwarded to ``dataset.iterbatches``.
          pad_batches: forwarded to ``dataset.iterbatches``.

        Yields:
          dict keyed by the model's placeholders.
        """
        for epoch in range(epochs):
            if not predict:
                print('Starting epoch %i' % epoch)
            batches = dataset.iterbatches(batch_size=self.batch_size,
                                          deterministic=deterministic,
                                          pad_batches=pad_batches)
            for (X_b, y_b, w_b, ids_b) in batches:
                feed_dict = {}
                if y_b is not None and not predict:
                    for index, label in enumerate(self.labels_fd):
                        if self.mode == "classification":
                            feed_dict[label] = to_one_hot(y_b[:, index])
                        elif self.mode == "regression":
                            feed_dict[label] = y_b[:, index:index + 1]
                if w_b is not None:
                    feed_dict[self.weights] = w_b

                atoms_per_mol = [mol.get_num_atoms() for mol in X_b]
                n_atoms = sum(atoms_per_mol)
                # Index of each molecule's first atom in the concatenated batch.
                start_index = [0] + list(np.cumsum(atoms_per_mol)[:-1])

                atoms_all = []
                parents_all = []
                calculation_orders = []
                calculation_masks = []
                membership = []
                per_mol = zip(X_b, start_index, atoms_per_mol)
                for idm, (mol, first_atom, atom_count) in enumerate(per_mol):
                    atoms_all.append(mol.get_atom_features())
                    parents_all.extend(mol.parents)
                    # Column 0 of every parents matrix is the calculation order.
                    calculation_index = np.array(mol.parents)[:, :, 0]
                    # Entries equal to max_atoms become False, the rest True.
                    calculation_masks.append(
                        (calculation_index - self.max_atoms).astype(bool))
                    calculation_orders.append(calculation_index + first_atom)
                    membership.extend([idm] * atom_count)

                feed_dict[self.atom_features] = np.concatenate(atoms_all,
                                                               axis=0)
                feed_dict[self.parents] = np.stack(parents_all, axis=0)
                feed_dict[self.calculation_orders] = np.concatenate(
                    calculation_orders, axis=0)
                feed_dict[self.calculation_masks] = np.concatenate(
                    calculation_masks, axis=0)
                feed_dict[self.membership] = np.array(membership)
                feed_dict[self.n_atoms] = n_atoms
                yield feed_dict
Exemple #11
0
 def default_generator(self,
                       dataset,
                       epochs=1,
                       mode='fit',
                       deterministic=True,
                       pad_batches=True):
     """Yield ``(inputs, [y_b], [w_b])`` minibatches for a graph-conv model.

     ``inputs`` holds, in order: atom features, degree slice, membership,
     the batch's sample count, a dropout switch, and then the degree
     adjacency lists from index 1 onward.

     Args:
       dataset: object exposing ``iterbatches``.
       epochs: number of passes over the data.
       mode: generator mode; ``'predict'`` sets the dropout switch to 0.0,
         any other value sets it to 1.0.
       deterministic: forwarded to ``dataset.iterbatches``.
       pad_batches: forwarded to ``dataset.iterbatches``.

     Yields:
       Tuples ``(inputs, [y_b], [w_b])``.
     """
     for epoch in range(epochs):
         for (X_b, y_b, w_b,
              ids_b) in dataset.iterbatches(batch_size=self.batch_size,
                                            deterministic=deterministic,
                                            pad_batches=pad_batches):
             if self.mode == 'classification':
                 y_b = to_one_hot(y_b.flatten(), self.n_classes).reshape(
                     -1, self.n_tasks, self.n_classes)
             multiConvMol = ConvMol.agglomerate_mols(X_b)
             n_samples = np.array(X_b.shape[0])
             dropout = np.array(0.0 if mode == 'predict' else 1.0)
             inputs = [
                 multiConvMol.get_atom_features(), multiConvMol.deg_slice,
                 np.array(multiConvMol.membership), n_samples, dropout
             ]
             # Fetch the adjacency lists once instead of on every iteration
             # (assumes the getter is side-effect free); entry 0 is skipped.
             deg_adj_lists = multiConvMol.get_deg_adjacency_lists()
             for i in range(1, len(deg_adj_lists)):
                 inputs.append(deg_adj_lists[i])
             yield (inputs, [y_b], [w_b])
Exemple #12
0
 def default_generator(self,
                       dataset,
                       epochs=1,
                       predict=False,
                       deterministic=True,
                       pad_batches=True):
     """Yield one feed dictionary per minibatch for a graph-convolution model.

     In classification mode the labels are one-hot encoded and reshaped to
     ``(-1, self.n_tasks, self.n_classes)``; otherwise ``y_b`` is fed as is.
     Molecules of a batch are merged with ``ConvMol.agglomerate_mols``.

     Args:
       dataset: object exposing ``iterbatches``.
       epochs: number of passes over the data.
       predict: unused here; kept for interface compatibility.
       deterministic: forwarded to ``dataset.iterbatches``.
       pad_batches: forwarded to ``dataset.iterbatches``.

     Yields:
       dict keyed by the model's placeholders.
     """
     for epoch in range(epochs):
         for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(
                 self.batch_size,
                 pad_batches=pad_batches,
                 deterministic=deterministic):
             d = {}
             if self.mode == 'classification':
                 d[self.labels[0]] = to_one_hot(y_b.flatten(),
                                                self.n_classes).reshape(
                                                    -1, self.n_tasks,
                                                    self.n_classes)
             else:
                 d[self.labels[0]] = y_b
             d[self.task_weights[0]] = w_b
             multiConvMol = ConvMol.agglomerate_mols(X_b)
             d[self.atom_features] = multiConvMol.get_atom_features()
             d[self.degree_slice] = multiConvMol.deg_slice
             d[self.membership] = multiConvMol.membership
             # One getter call per batch instead of two per adjacency list
             # (assumes the getter is side-effect free). Entry 0 is skipped,
             # so list i feeds placeholder i - 1.
             deg_adj_lists = multiConvMol.get_deg_adjacency_lists()
             for i in range(1, len(deg_adj_lists)):
                 d[self.deg_adjs[i - 1]] = deg_adj_lists[i]
             yield d
Exemple #13
0
 def default_generator(self,
                       dataset,
                       epochs=1,
                       predict=False,
                       deterministic=True,
                       pad_batches=True):
   """Yield one feed dictionary per minibatch of plain feature arrays.

   Features are always fed when present; labels and weights are fed only
   when ``predict`` is False. In any mode other than ``'regression'`` the
   labels are one-hot encoded and reshaped to
   ``(-1, self.n_tasks, self.n_classes)``.

   Args:
     dataset: object exposing ``iterbatches``.
     epochs: number of passes over the data.
     predict: when True, labels and weights are omitted.
     deterministic: forwarded to ``dataset.iterbatches``.
     pad_batches: forwarded to ``dataset.iterbatches``.

   Yields:
     dict keyed by the model's placeholders.
   """
   training = not predict
   for epoch in range(epochs):
     batches = dataset.iterbatches(
         batch_size=self.batch_size,
         deterministic=deterministic,
         pad_batches=pad_batches)
     for (X_b, y_b, w_b, ids_b) in batches:
       feed_dict = {}
       if training and y_b is not None:
         if self.mode == 'regression':
           labels = y_b
         else:
           encoded = to_one_hot(y_b.flatten(), self.n_classes)
           labels = encoded.reshape(-1, self.n_tasks, self.n_classes)
         feed_dict[self.labels[0]] = labels
       if X_b is not None:
         feed_dict[self.features[0]] = X_b
       if training and w_b is not None:
         feed_dict[self.task_weights[0]] = w_b
       yield feed_dict
Exemple #14
0
  def default_generator(self,
                        dataset,
                        epochs=1,
                        predict=False,
                        deterministic=True,
                        pad_batches=True):
    """Yield feed dictionaries with SMILES encoded as integer sequences.

    ``ids_b`` is handed to ``self.smiles_to_seq_batch`` to obtain the model
    input. Labels and weights are only included when ``predict`` is False;
    classification labels are one-hot encoded with two classes.

    Args:
      dataset: object exposing ``iterbatches``.
      epochs: number of passes over the data.
      predict: when True, labels and weights are omitted.
      deterministic: forwarded to ``dataset.iterbatches``.
      pad_batches: forwarded to ``dataset.iterbatches``.

    Yields:
      dict keyed by the model's placeholders.
    """
    include_targets = not predict
    for epoch in range(epochs):
      batches = dataset.iterbatches(
          batch_size=self.batch_size,
          deterministic=deterministic,
          pad_batches=pad_batches)
      for (X_b, y_b, w_b, ids_b) in batches:
        feed_dict = {}
        if include_targets and y_b is not None:
          if self.mode == "classification":
            one_hot = to_one_hot(y_b.flatten(), 2)
            targets = one_hot.reshape(-1, self.n_tasks, 2)
          else:
            targets = y_b
          feed_dict[self.labels[0]] = targets
        if include_targets and w_b is not None:
          feed_dict[self.task_weights[0]] = w_b

        # Transform SMILES sequence to integers
        feed_dict[self.smiles_seqs] = self.smiles_to_seq_batch(ids_b)
        yield feed_dict
Exemple #15
0
 def test_one_hot(self):
     """Check that binary labels survive a one-hot encode/decode round trip."""
     labels = np.array([0, 0, 1, 0, 1, 1, 0])
     encoded = metrics.to_one_hot(labels)
     # Class 0 maps to [1, 0] and class 1 to [0, 1].
     expected = np.array([
         [1, 0],
         [1, 0],
         [0, 1],
         [1, 0],
         [0, 1],
         [0, 1],
         [1, 0],
     ])
     decoded = metrics.from_one_hot(encoded)
     assert np.array_equal(expected, encoded)
     assert np.array_equal(labels, decoded)
Exemple #16
0
 def test_one_hot(self):
   """Verify one-hot encoding of binary labels and its inverse."""
   raw = np.array([0, 0, 1, 0, 1, 1, 0])
   one_hot = metrics.to_one_hot(raw)
   zero_row, one_row = [1, 0], [0, 1]
   expected = np.array(
       [zero_row, zero_row, one_row, zero_row, one_row, one_row, zero_row])
   recovered = metrics.from_one_hot(one_hot)
   assert np.array_equal(expected, one_hot)
   assert np.array_equal(raw, recovered)
Exemple #17
0
def test_one_hot():
  """Test that one-hot encoding round-trips a binary label vector."""
  labels = np.array([0, 0, 1, 0, 1, 1, 0])
  encoded = to_one_hot(labels)
  expected = np.array([
      [1, 0],
      [1, 0],
      [0, 1],
      [1, 0],
      [0, 1],
      [0, 1],
      [1, 0],
  ])
  decoded = from_one_hot(encoded)
  assert np.array_equal(expected, encoded)
  assert np.array_equal(labels, decoded)
Exemple #18
0
def test_normalize_1d_classification_multiclass_explicit_nclasses():
  """Test 1d classification normalization with an explicit class count.

  Labels are drawn from 5 classes but ``n_classes=10`` is requested, so the
  normalized output must have 10 class columns.
  """
  n_classes = 10
  labels = np.random.randint(5, size=(10,))
  # Insert the single-task axis between samples and classes.
  expected = np.expand_dims(to_one_hot(labels, n_classes=n_classes), 1)
  normalized = normalize_prediction_shape(
      labels, mode="classification", n_classes=n_classes, n_tasks=1)
  assert normalized.shape == (10, 1, 10)
  assert np.array_equal(expected, normalized)
Exemple #19
0
  def default_generator(self,
                        dataset,
                        epochs=1,
                        predict=False,
                        deterministic=True,
                        pad_batches=True):
    """Same generator as Weave models.

    Batches are always requested unpadded from the dataset and then passed
    through ``pad_features(self.batch_size, X_b)`` (helper not visible here;
    presumably it pads the features up to the batch size). All molecules of
    a batch are flattened into shared atom and pair arrays, with atom
    indices shifted by a running offset.

    Args:
      dataset: object exposing ``iterbatches``.
      epochs: number of passes over the data.
      predict: unused here; kept for interface compatibility.
      deterministic: forwarded to ``dataset.iterbatches``.
      pad_batches: accepted for interface compatibility only; the dataset is
        always iterated with ``pad_batches=False``.

    Yields:
      dict keyed by the model's placeholders.
    """
    for epoch in range(epochs):
      for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(
          batch_size=self.batch_size,
          deterministic=deterministic,
          pad_batches=False):

        X_b = pad_features(self.batch_size, X_b)
        feed_dict = dict()
        if y_b is not None:
          if self.mode == 'classification':
            feed_dict[self.labels[0]] = to_one_hot(y_b.flatten(),
                                                   self.n_classes).reshape(
                                                       -1, self.n_tasks,
                                                       self.n_classes)
          else:
            feed_dict[self.labels[0]] = y_b
        if w_b is not None:
          feed_dict[self.task_weights[0]] = w_b

        atom_feat = []
        pair_feat = []
        atom_split = []
        atom_to_pair = []
        start = 0
        for im, mol in enumerate(X_b):
          n_atoms = mol.get_num_atoms()
          # molecule index of every atom
          atom_split.extend([im] * n_atoms)
          # batch-global indices of all ordered atom pairs
          C0, C1 = np.meshgrid(np.arange(n_atoms), np.arange(n_atoms))
          atom_to_pair.append(
              np.transpose(
                  np.array([C1.flatten() + start,
                            C0.flatten() + start])))
          # (The former pair_split list was built here but never fed or
          # returned, so it has been dropped.)
          start = start + n_atoms

          # atom features
          atom_feat.append(mol.get_atom_features())
          # pair features
          pair_feat.append(
              np.reshape(mol.get_pair_features(),
                         (n_atoms * n_atoms, self.n_pair_feat)))

        feed_dict[self.atom_features] = np.concatenate(atom_feat, axis=0)
        feed_dict[self.pair_features] = np.concatenate(pair_feat, axis=0)
        feed_dict[self.atom_split] = np.array(atom_split)
        feed_dict[self.atom_to_pair] = np.concatenate(atom_to_pair, axis=0)
        yield feed_dict
Exemple #20
0
    def default_generator(self,
                          dataset,
                          epochs=1,
                          predict=False,
                          deterministic=True,
                          pad_batches=True):
        """TensorGraph style implementation.

        Yields one DAG-model feed dictionary per minibatch. Atoms of all
        molecules are concatenated and each molecule's calculation orders
        are shifted by its starting atom index in the batch.

        Args:
          dataset: object exposing ``iterbatches``.
          epochs: number of passes over the data.
          predict: unused here; kept for interface compatibility.
          deterministic: forwarded to ``dataset.iterbatches``.
          pad_batches: forwarded to ``dataset.iterbatches``.

        Yields:
          dict keyed by the model's placeholders.
        """
        for epoch in range(epochs):
            batches = dataset.iterbatches(batch_size=self.batch_size,
                                          deterministic=deterministic,
                                          pad_batches=pad_batches)
            for (X_b, y_b, w_b, ids_b) in batches:
                feed_dict = {}
                if y_b is not None:
                    if self.mode == 'classification':
                        one_hot = to_one_hot(y_b.flatten(), self.n_classes)
                        targets = one_hot.reshape(-1, self.n_tasks,
                                                  self.n_classes)
                    else:
                        targets = y_b
                    feed_dict[self.labels[0]] = targets
                if w_b is not None:
                    feed_dict[self.task_weights[0]] = w_b

                atoms_per_mol = [mol.get_num_atoms() for mol in X_b]
                n_atoms = sum(atoms_per_mol)
                # Index of each molecule's first atom in the concatenated batch.
                start_index = [0] + list(np.cumsum(atoms_per_mol)[:-1])

                atoms_all = []
                parents_all = []
                calculation_orders = []
                calculation_masks = []
                membership = []
                per_mol = zip(X_b, start_index, atoms_per_mol)
                for idm, (mol, first_atom, atom_count) in enumerate(per_mol):
                    atoms_all.append(mol.get_atom_features())
                    parents_all.extend(mol.parents)
                    # Column 0 of every parents matrix is the calculation order.
                    calculation_index = np.array(mol.parents)[:, :, 0]
                    # Entries equal to max_atoms become False, the rest True.
                    calculation_masks.append(
                        (calculation_index - self.max_atoms).astype(bool))
                    calculation_orders.append(calculation_index + first_atom)
                    membership.extend([idm] * atom_count)

                feed_dict[self.atom_features] = np.concatenate(atoms_all,
                                                               axis=0)
                feed_dict[self.parents] = np.stack(parents_all, axis=0)
                feed_dict[self.calculation_orders] = np.concatenate(
                    calculation_orders, axis=0)
                feed_dict[self.calculation_masks] = np.concatenate(
                    calculation_masks, axis=0)
                feed_dict[self.membership] = np.array(membership)
                feed_dict[self.n_atoms] = n_atoms
                yield feed_dict
Exemple #21
0
 def _construct_feed_dict(self, X_b, y_b, w_b, ids_b):
     """Map the model's output tensors to the values of one minibatch.

     Labels are one-hot encoded per task column of ``y_b``. Weights and
     features are only fed when the corresponding model attribute is set.
     ``ids_b`` is accepted but not used.

     Returns:
       dict from ``out_tensor`` objects to batch values.
     """
     feed_dict = {}
     if y_b is not None:
         feed_dict.update((label.out_tensor, to_one_hot(y_b[:, index]))
                          for index, label in enumerate(self.labels))
     weights_layer = self.task_weights
     if weights_layer is not None and w_b is not None:
         feed_dict[weights_layer.out_tensor] = w_b
     if self.features is not None:
         # Only the first feature layer receives the batch features.
         feed_dict[self.features[0].out_tensor] = X_b
     return feed_dict
Exemple #22
0
 def _construct_feed_dict(self, X_b, y_b, w_b, ids_b):
   """Build the feed dictionary for one minibatch.

   Each label layer receives the one-hot encoding of its task column of
   ``y_b``; the weight layer and the first feature layer receive ``w_b`` and
   ``X_b`` when they are configured. ``ids_b`` is accepted but not used.

   Returns:
     dict from ``out_tensor`` objects to batch values.
   """
   feed_dict = {}
   have_labels = y_b is not None
   if have_labels:
     for task, label_layer in enumerate(self.labels):
       task_column = y_b[:, task]
       feed_dict[label_layer.out_tensor] = to_one_hot(task_column)
   feed_weights = self.task_weights is not None and w_b is not None
   if feed_weights:
     feed_dict[self.task_weights.out_tensor] = w_b
   if self.features is not None:
     first_feature_layer = self.features[0]
     feed_dict[first_feature_layer.out_tensor] = X_b
   return feed_dict
Exemple #23
0
  def default_generator(self,
                        dataset,
                        epochs=1,
                        predict=False,
                        deterministic=True,
                        pad_batches=True):
    """TensorGraph style implementation.

    Produces one DAG-model feed dictionary per minibatch: concatenated atom
    features, stacked parent matrices, calculation orders offset into the
    concatenated atom array, their masks, and per-atom molecule membership.

    Args:
      dataset: object exposing ``iterbatches``.
      epochs: number of passes over the data.
      predict: unused here; kept for interface compatibility.
      deterministic: forwarded to ``dataset.iterbatches``.
      pad_batches: forwarded to ``dataset.iterbatches``.

    Yields:
      dict keyed by the model's placeholders.
    """
    for epoch in range(epochs):
      batches = dataset.iterbatches(
          batch_size=self.batch_size,
          deterministic=deterministic,
          pad_batches=pad_batches)
      for (X_b, y_b, w_b, ids_b) in batches:
        feed_dict = {}
        if y_b is not None:
          if self.mode == 'classification':
            one_hot = to_one_hot(y_b.flatten(), self.n_classes)
            targets = one_hot.reshape(-1, self.n_tasks, self.n_classes)
          else:
            targets = y_b
          feed_dict[self.labels[0]] = targets
        if w_b is not None:
          feed_dict[self.task_weights[0]] = w_b

        atoms_per_mol = [mol.get_num_atoms() for mol in X_b]
        n_atoms = sum(atoms_per_mol)
        # Index of each molecule's first atom in the concatenated batch.
        start_index = [0] + list(np.cumsum(atoms_per_mol)[:-1])

        atoms_all = []
        parents_all = []
        calculation_orders = []
        calculation_masks = []
        membership = []
        for idm, mol in enumerate(X_b):
          mol_parents = mol.parents
          atoms_all.append(mol.get_atom_features())
          parents_all.extend(mol_parents)
          # Column 0 of every parents matrix is the calculation order.
          order = np.array(mol_parents)[:, :, 0]
          # Entries equal to max_atoms become False, the rest True.
          calculation_masks.append((order - self.max_atoms).astype(bool))
          calculation_orders.append(order + start_index[idm])
          membership.extend([idm] * atoms_per_mol[idm])

        feed_dict[self.atom_features] = np.concatenate(atoms_all, axis=0)
        feed_dict[self.parents] = np.stack(parents_all, axis=0)
        feed_dict[self.calculation_orders] = np.concatenate(
            calculation_orders, axis=0)
        feed_dict[self.calculation_masks] = np.concatenate(
            calculation_masks, axis=0)
        feed_dict[self.membership] = np.array(membership)
        feed_dict[self.n_atoms] = n_atoms
        yield feed_dict
Exemple #24
0
    def default_generator(self,
                          dataset,
                          epochs=1,
                          mode='fit',
                          deterministic=True,
                          pad_batches=True):
        """Yield ``(inputs, [y_b], [w_b])`` minibatches for a weave model.

        ``inputs`` holds, in order: concatenated atom features, concatenated
        pair features, the molecule index of every atom, the batch-global
        atom index pairs, and the sample count measured before
        ``pad_features`` is applied (helper not visible here; presumably it
        pads the features up to ``self.batch_size``).

        Args:
          dataset: object exposing ``iterbatches``.
          epochs: number of passes over the data.
          mode: unused here; kept for interface compatibility.
          deterministic: forwarded to ``dataset.iterbatches``.
          pad_batches: forwarded to ``dataset.iterbatches``.

        Yields:
          Tuples ``(inputs, [y_b], [w_b])``.
        """
        for epoch in range(epochs):
            for (X_b, y_b, w_b,
                 ids_b) in dataset.iterbatches(batch_size=self.batch_size,
                                               deterministic=deterministic,
                                               pad_batches=pad_batches):

                # Record the size before padding changes X_b.
                n_samples = np.array(X_b.shape[0])
                X_b = pad_features(self.batch_size, X_b)
                if y_b is not None and self.mode == 'classification':
                    y_b = to_one_hot(y_b.flatten(), self.n_classes).reshape(
                        -1, self.n_tasks, self.n_classes)

                atom_feat = []
                pair_feat = []
                atom_split = []
                atom_to_pair = []
                start = 0
                for im, mol in enumerate(X_b):
                    n_atoms = mol.get_num_atoms()
                    # molecule index of every atom
                    atom_split.extend([im] * n_atoms)
                    # batch-global indices of all ordered atom pairs
                    C0, C1 = np.meshgrid(np.arange(n_atoms),
                                         np.arange(n_atoms))
                    atom_to_pair.append(
                        np.transpose(
                            np.array(
                                [C1.flatten() + start,
                                 C0.flatten() + start])))
                    # (The former pair_split list was built here but never
                    # used, so it has been dropped.)
                    start = start + n_atoms

                    # atom features
                    atom_feat.append(mol.get_atom_features())
                    # pair features
                    pair_feat.append(
                        np.reshape(mol.get_pair_features(),
                                   (n_atoms * n_atoms, self.n_pair_feat)))

                inputs = [
                    np.concatenate(atom_feat, axis=0),
                    np.concatenate(pair_feat, axis=0),
                    np.array(atom_split),
                    np.concatenate(atom_to_pair, axis=0), n_samples
                ]
                yield (inputs, [y_b], [w_b])
Exemple #25
0
def test_handle_classification_mode_threshold_one_hot():
  """Check "threshold-one-hot" mode on random two-class probabilities.

  The expected output is the one-hot encoding of the arg-max class of each
  row, with a singleton task axis inserted at position 1.
  """
  raw = np.random.rand(10, 2)
  # Normalise every row so that its two entries sum to one.
  row_totals = np.sum(raw, axis=1)[:, np.newaxis]
  y = np.expand_dims(raw / row_totals, 1)

  winners = np.argmax(np.squeeze(y), axis=1)
  y_expected = np.expand_dims(to_one_hot(winners, n_classes=2), 1)

  y_out = handle_classification_mode(
      y, "threshold-one-hot", threshold_value=0.5)

  assert y_out.shape == (10, 1, 2)
  assert np.array_equal(y_out, y_expected)
Exemple #26
0
 def get_data_dict(self, X, y=None):
     """Build the name -> data mapping fed to the graph model.

     ``X`` is stored under ``"input"``. When ``y`` is given, column ``ind`` of
     ``y`` is stored under ``"task<ind>"``: passed through ``to_one_hot`` for
     "classification" tasks and unchanged for "regression" tasks. Tasks of any
     other type add no entry.
     """
     data = {"input": X}
     for ind, task in enumerate(self.tasks):
         task_type = self.task_types[task]
         if y is None:
             # No labels supplied: only the features are wrapped.
             continue
         taskname = "task%d" % ind
         if task_type == "regression":
             data[taskname] = y[:, ind]
         elif task_type == "classification":
             data[taskname] = to_one_hot(y[:, ind])
     return data
Exemple #27
0
 def get_data_dict(self, X, y=None):
     """Package features (and optional labels) into a dict for the graph model.

     The result always holds ``X`` under ``"input"``. If ``y`` is provided,
     each of the ``self.n_tasks`` columns is added under ``"task<i>"``:
     encoded with ``to_one_hot`` when ``self.task_type`` is "classification",
     as-is when it is "regression", and omitted for any other task type.
     """
     data = {"input": X}
     for task in range(self.n_tasks):
         if y is None:
             continue
         kind = self.task_type
         if kind == "classification":
             column = to_one_hot(y[:, task])
         elif kind == "regression":
             column = y[:, task]
         else:
             # Unrecognised task type: nothing is stored for this task.
             continue
         data["task%d" % task] = column
     return data
 def predict_proba_on_batch(self, support, test_batch):
     """Predict one-hot class assignments for a single test batch.

     The batch is padded to ``self.test_batch_size`` before it is fed to the
     session; the rows introduced by padding are then dropped so the result
     has one row per datapoint of ``test_batch``.

     Args:
       support: Support set, forwarded to ``self.construct_feed_dict``.
       test_batch: Batch exposing ``X``, ``y``, ``w``, ``ids`` and ``len()``.

     Returns:
       The ``to_one_hot`` encoding of the rounded predictions, restricted to
       the first ``len(test_batch)`` rows.
     """
     n_samples = len(test_batch)
     padded_test_batch = NumpyDataset(
         *pad_batch(self.test_batch_size, test_batch.X, test_batch.y,
                    test_batch.w, test_batch.ids))
     feed_dict = self.construct_feed_dict(padded_test_batch, support)
     # scores_op is still fetched together with pred_op, as before; only the
     # predictions are used below.
     pred, _ = self.sess.run([self.pred_op, self.scores_op],
                             feed_dict=feed_dict)
     y_pred_batch = to_one_hot(np.round(pred))
     # Remove padded elements. Previously n_samples was computed but never
     # applied, so padding rows leaked into the result.
     # NOTE(review): assumes pad_batch keeps the real rows first - confirm.
     return y_pred_batch[:n_samples]
Exemple #29
0
 def get_data_dict(self, X, y=None):
   """Return the dict of named arrays consumed by the graph model.

   ``"input"`` maps to ``X``. With labels present, ``"task<ind>"`` maps to
   column ``ind`` of ``y`` -- run through ``to_one_hot`` for "classification"
   tasks, untouched for "regression" tasks. Other task types are skipped.
   """
   data = {}
   data["input"] = X
   has_labels = y is not None
   for ind, task in enumerate(self.tasks):
     task_type = self.task_types[task]
     if has_labels and task_type == "classification":
       data["task%d" % ind] = to_one_hot(y[:, ind])
     elif has_labels and task_type == "regression":
       data["task%d" % ind] = y[:, ind]
   return data
Exemple #30
0
 def get_data_dict(self, X, y=None):
   """Collect ``X`` and, if given, the per-task label columns into one dict.

   ``X`` goes under ``"input"``. For each of the ``self.n_tasks`` tasks the
   matching column of ``y`` goes under ``"task<i>"``: one-hot encoded via
   ``to_one_hot`` when ``self.task_type`` is "classification", raw when it is
   "regression". Nothing is added when ``y`` is None or the type is neither.
   """
   data = {"input": X}
   for task in range(self.n_tasks):
     if y is not None and self.task_type == "classification":
       data["task%d" % task] = to_one_hot(y[:, task])
     elif y is not None and self.task_type == "regression":
       data["task%d" % task] = y[:, task]
   return data
    def data_generator(self, dataset, batch_size:int, epochs=1):
        """Yield one feed dict per padded, deterministic batch of ``dataset``.

        For each of ``epochs`` passes, every batch is turned into a dict that
        maps ``self.label`` to the one-hot encoding of the first label column,
        ``self.weight`` to the batch weights, and the atom-feature, indexing,
        membership and degree-adjacency placeholders to the values read from
        the agglomerated ``ConvMol``.
        """
        for _ in range(epochs):
            batches = dataset.iterbatches(
                batch_size, pad_batches=True, deterministic=True)
            for X, y, w, _idx in batches:
                # data for feed
                feed_dict = {self.label: to_one_hot(y[:, 0]), self.weight: w}
                mols = ConvMol.agglomerate_mols(X)
                feed_dict[self.atom_features] = mols.get_atom_features()
                feed_dict[self.indexing] = mols.deg_slice
                feed_dict[self.membership] = mols.membership
                adjacency = mols.get_deg_adjacency_lists()
                # Entry 0 of the adjacency lists is not fed; entry k + 1 goes
                # to placeholder k.
                for slot in range(len(adjacency) - 1):
                    feed_dict[self.deg_adj_list[slot]] = adjacency[slot + 1]

                yield feed_dict
Exemple #32
0
  def default_generator(self,
                        dataset,
                        epochs=1,
                        mode='fit',
                        deterministic=True,
                        pad_batches=True):
    """Yield ``(inputs, [labels], [weights])`` for every batch of ``dataset``.

    Labels are one-hot encoded and reshaped to
    ``(-1, self.n_tasks, self.n_classes)`` when present and ``self.mode`` is
    'classification'. ``inputs`` is a list of seven arrays, in order: the
    concatenated atom features, the stacked ``parents`` entries, the
    calculation orders shifted by each molecule's atom offset, the boolean
    calculation masks, the molecule index of every atom, the total atom
    count, and a dropout switch (0.0 when ``mode == 'predict'``, else 1.0).
    """
    for _ in range(epochs):
      batches = dataset.iterbatches(
          batch_size=self.batch_size,
          deterministic=deterministic,
          pad_batches=pad_batches)
      for X_b, y_b, w_b, _ids_b in batches:

        if y_b is not None and self.mode == 'classification':
          one_hot = to_one_hot(y_b.flatten(), self.n_classes)
          y_b = one_hot.reshape(-1, self.n_tasks, self.n_classes)

        counts = [mol.get_num_atoms() for mol in X_b]
        total_atoms = sum(counts)
        # Index of each molecule's first atom within the concatenated batch.
        offsets = [0] + list(np.cumsum(counts)[:-1])

        features = []
        parent_rows = []
        orders = []
        masks = []
        owners = []
        for mol_idx, mol in enumerate(X_b):
          features.append(mol.get_atom_features())
          mol_parents = mol.parents
          parent_rows.extend(mol_parents)
          # Calculation order: first entry along the last axis of parents.
          order = np.array(mol_parents)[:, :, 0]
          # Mask is False exactly where the index equals self.max_atoms.
          mask = np.array(order - self.max_atoms, dtype=bool)
          orders.append(order + offsets[mol_idx])
          masks.append(mask)
          owners.extend([mol_idx] * counts[mol_idx])

        dropout = np.array(0.0) if mode == 'predict' else np.array(1.0)

        inputs = [
            np.concatenate(features, axis=0),
            np.stack(parent_rows, axis=0),
            np.concatenate(orders, axis=0),
            np.concatenate(masks, axis=0),
            np.array(owners),
            np.array(total_atoms), dropout
        ]
        yield (inputs, [y_b], [w_b])
Exemple #33
0
 def default_generator(self,
                       dataset,
                       epochs=1,
                       mode='fit',
                       deterministic=True,
                       pad_batches=True):
     """Yield ``([X], [y], [w])`` for every batch, over ``epochs`` passes.

     Batches come from ``dataset.iterbatches`` with ``self.batch_size``. When
     labels are present they are flattened, one-hot encoded with
     ``self.n_classes`` classes and reshaped to
     ``(-1, self.n_tasks, self.n_classes)``. ``mode`` is accepted but unused.
     """
     for _ in range(epochs):
         batches = dataset.iterbatches(batch_size=self.batch_size,
                                       deterministic=deterministic,
                                       pad_batches=pad_batches)
         for X_b, y_b, w_b, _ids_b in batches:
             if y_b is not None:
                 one_hot = to_one_hot(y_b.flatten(), self.n_classes)
                 y_b = one_hot.reshape(-1, self.n_tasks, self.n_classes)
             yield ([X_b], [y_b], [w_b])
Exemple #34
0
 def feed_dict_generator(dataset, batch_size, epochs=1):
   """Yield one feed dict per padded batch of ``dataset``, for ``epochs`` passes.

   Each dict maps placeholders taken from the surrounding scope (``labels``,
   ``task_weights``, ``atom_features``, ``degree_slice``, ``membership`` and
   ``deg_adjs``) to the values for that batch: one-hot label columns, the
   batch weights, and the fields of the agglomerated ``ConvMol``.

   Args:
     dataset: Object providing ``iterbatches(batch_size, pad_batches=True)``.
     batch_size: Batch size forwarded to ``iterbatches``.
     epochs: Number of passes over ``dataset``.
   """
   for _ in range(epochs):
     batches = dataset.iterbatches(batch_size, pad_batches=True)
     for X_b, y_b, w_b, _ids_b in batches:
       d = {}
       for index, label in enumerate(labels):
         d[label] = to_one_hot(y_b[:, index])
       d[task_weights] = w_b
       multiConvMol = ConvMol.agglomerate_mols(X_b)
       d[atom_features] = multiConvMol.get_atom_features()
       d[degree_slice] = multiConvMol.deg_slice
       d[membership] = multiConvMol.membership
       # Fetch the adjacency lists once per batch rather than twice per
       # loop iteration.
       deg_adj_lists = multiConvMol.get_deg_adjacency_lists()
       # Entry 0 is not fed; entry i goes to placeholder deg_adjs[i - 1].
       for i in range(1, len(deg_adj_lists)):
         d[deg_adjs[i - 1]] = deg_adj_lists[i]
       yield d
Exemple #35
0
 def _construct_feed_dict(self, X_b, y_b, w_b, ids_b):
   """Map the model's input tensors to the values of one batch.

   Args:
     X_b: Batch of molecules, passed to ``ConvMol.agglomerate_mols``.
     y_b: Label array or None; column ``i`` is one-hot encoded for
       ``self.labels[i]``.
     w_b: Weights or None; fed to ``self.task_weights[0]`` when both exist.
     ids_b: Unused.

   Returns:
     Dict keyed by ``out_tensor`` objects. Sections whose source is None
     (labels, weights, features) are left out.
   """
   feed_dict = dict()
   if y_b is not None:
     for index, label in enumerate(self.labels):
       feed_dict[label.out_tensor] = to_one_hot(y_b[:, index])
   if self.task_weights is not None and w_b is not None:
     feed_dict[self.task_weights[0].out_tensor] = w_b
   if self.features is not None:
     multiConvMol = ConvMol.agglomerate_mols(X_b)
     feed_dict[self.features[0].out_tensor] = multiConvMol.get_atom_features()
     feed_dict[self.features[1].out_tensor] = multiConvMol.deg_slice
     feed_dict[self.features[2].out_tensor] = multiConvMol.membership
     # Fetch the adjacency lists once instead of once per degree.
     deg_adj_lists = multiConvMol.get_deg_adjacency_lists()
     # features[3:] receive adjacency entries 1..max_degree; entry 0 is not fed.
     for i in range(self.max_degree):
       feed_dict[self.features[i + 3].out_tensor] = deg_adj_lists[i + 1]
   return feed_dict
Exemple #36
0
 def _construct_feed_dict(self, X_b, y_b, w_b, ids_b):
   """Build the tensor -> value feed dict for a single batch.

   Labels (one-hot encoded per column of ``y_b``), task weights and the
   graph-convolution features derived from ``X_b`` are each added only when
   their source is not None. ``ids_b`` is not used.

   Returns:
     Dict keyed by the ``out_tensor`` of the corresponding layer.
   """
   feed_dict = {}
   if y_b is not None:
     for index, label in enumerate(self.labels):
       feed_dict[label.out_tensor] = to_one_hot(y_b[:, index])
   if self.task_weights is not None and w_b is not None:
     feed_dict[self.task_weights[0].out_tensor] = w_b
   features = self.features
   if features is not None:
     mols = ConvMol.agglomerate_mols(X_b)
     feed_dict[features[0].out_tensor] = mols.get_atom_features()
     feed_dict[features[1].out_tensor] = mols.deg_slice
     feed_dict[features[2].out_tensor] = mols.membership
     # Look the adjacency lists up once; the original re-fetched them on
     # every iteration of the degree loop.
     adjacency = mols.get_deg_adjacency_lists()
     # Adjacency entry d (1..max_degree) feeds features[d + 2]; entry 0 is
     # not fed.
     for degree in range(1, self.max_degree + 1):
       feed_dict[features[degree + 2].out_tensor] = adjacency[degree]
   return feed_dict
Exemple #37
0
 def default_generator(
         self,
         dataset: dc.data.Dataset,
         epochs: int = 1,
         mode: str = 'fit',
         deterministic: bool = True,
         pad_batches: bool = True) -> Iterable[Tuple[List, List, List]]:
     """Iterate ``dataset`` in batches and yield ``([X], [y], [w])`` triples.

     The dataset is traversed ``epochs`` times with ``self.batch_size``.
     Labels that are not None are flattened, one-hot encoded into
     ``self.n_classes`` classes and reshaped to
     ``(-1, self.n_tasks, self.n_classes)``. ``mode`` is accepted but unused.
     """
     for _epoch in range(epochs):
         for batch in dataset.iterbatches(batch_size=self.batch_size,
                                          deterministic=deterministic,
                                          pad_batches=pad_batches):
             X_b, y_b, w_b, _ids_b = batch
             labels = y_b
             if labels is not None:
                 labels = to_one_hot(labels.flatten(), self.n_classes).reshape(
                     -1, self.n_tasks, self.n_classes)
             yield ([X_b], [labels], [w_b])
Exemple #38
0
  def construct_feed_dict(self, X_b, y_b=None, w_b=None, ids_b=None):
    """Construct a feed dictionary from minibatch data.

    ``X_b`` is stored under "mol_features". For every task a "labels_<t>"
    and a "weights_<t>" entry is added, with dummy values when ``y_b`` or
    ``w_b`` is None. The mapping is then converted by
    ``TensorflowGraph.get_feed_dict``. ``ids_b`` is not used.
    """
    orig_dict = {"mol_features": X_b}
    for task in range(self.n_tasks):
      labels_key = "labels_%d" % task
      weights_key = "weights_%d" % task
      if y_b is None:
        # Dummy placeholders
        orig_dict[labels_key] = np.zeros((self.batch_size, 1))
      else:
        # Keep only column 1 of the one-hot labels, as shape [?, 1].
        orig_dict[labels_key] = to_one_hot(y_b[:, task])[:, 1:2]
      if w_b is None:
        # Dummy placeholders
        orig_dict[weights_key] = np.ones((self.batch_size,))
      else:
        orig_dict[weights_key] = w_b[:, task]
    return TensorflowGraph.get_feed_dict(orig_dict)
Exemple #39
0
    def construct_feed_dict(self, X_b, y_b=None, w_b=None, ids_b=None):
        """Assemble the per-task feed dictionary for one minibatch.

        Features go under "mol_features". Each task gets "labels_<t>"
        (column 1 of the one-hot labels kept as a [?, 1] slice, or zeros when
        ``y_b`` is None) and "weights_<t>" (the task's weight column, or ones
        when ``w_b`` is None). The result is passed through
        ``TensorflowGraph.get_feed_dict``. ``ids_b`` is not used.
        """
        orig_dict = {}
        orig_dict["mol_features"] = X_b
        have_labels = y_b is not None
        have_weights = w_b is not None
        for task in range(self.n_tasks):
            # Dummy placeholders are substituted for missing labels/weights.
            orig_dict["labels_%d" % task] = (
                to_one_hot(y_b[:, task])[:, 1:2]
                if have_labels else np.zeros((self.batch_size, 1)))
            orig_dict["weights_%d" % task] = (
                w_b[:, task]
                if have_weights else np.ones((self.batch_size, )))
        return TensorflowGraph.get_feed_dict(orig_dict)
Exemple #40
0
def compute_loss_on_valid(valid, model, tasks, mode, verbose=True):
    """Evaluate the model's own loss function on a validation set.

    Predictions are obtained with ``model.predict(valid, transformers=[])``.
    In "classification" mode the labels are one-hot encoded with two classes
    and reshaped to ``(-1, len(tasks), 2)``; otherwise ``valid.y`` is used
    directly. The loss tensor is read with ``.numpy()`` under eager execution
    and through ``model.session.run`` otherwise.

    Returns:
      The computed loss value; it is also logged when ``verbose`` is true.
    """
    loss_fn = model._loss_fn
    outputs = model.predict(valid, transformers=[])

    labels = valid.y
    if mode == "classification":
        labels = to_one_hot(labels.flatten(), 2).reshape(-1, len(tasks), 2)

    loss_tensor = loss_fn([outputs], [labels], weights=[valid.w])
    loss = (loss_tensor.numpy()
            if tf.executing_eagerly() else model.session.run(loss_tensor))

    if verbose:
        logger.info("Computed loss on validation set: {}".format(loss))
    return loss
Exemple #41
0
 def default_generator(self,
                       dataset,
                       epochs=1,
                       mode='fit',
                       deterministic=True,
                       pad_batches=True):
   """Yield batches whose features come from ``self.smiles_to_seq_batch``.

   For every batch the features are replaced by
   ``self.smiles_to_seq_batch(ids_b)``. Labels that are present are one-hot
   encoded with two classes and reshaped to ``(-1, self.n_tasks, 2)`` when
   ``self.mode`` is 'classification'. The ``mode`` argument is unused.
   """
   for _ in range(epochs):
     batches = dataset.iterbatches(
         batch_size=self.batch_size,
         deterministic=deterministic,
         pad_batches=pad_batches)
     for _X_b, y_b, w_b, ids_b in batches:
       if y_b is not None and self.mode == 'classification':
         y_b = to_one_hot(y_b.flatten(), 2).reshape(-1, self.n_tasks, 2)
       # Transform SMILES sequence to integers
       sequences = self.smiles_to_seq_batch(ids_b)
       yield ([sequences], [y_b], [w_b])
Exemple #42
0
def data_generator(dataset, epochs=1, predict=False, pad_batches=True):
    """Yield one feed dict per deterministic batch of ``dataset``.

    Relies on names from the surrounding scope: ``batch_size``, ``labels``,
    ``weights``, ``atom_features``, ``degree_slice``, ``membership`` and
    ``deg_adjs``.

    Args:
      dataset: Object providing ``iterbatches``.
      epochs: Number of passes over ``dataset``.
      predict: When false, a 'Starting epoch' line is printed per epoch.
      pad_batches: Forwarded to ``dataset.iterbatches``.

    Yields:
      Dict mapping each placeholder to its value for the current batch.
    """
    for epoch in range(epochs):
        if not predict:
            print('Starting epoch %i' % epoch)
        batches = dataset.iterbatches(batch_size,
                                      pad_batches=pad_batches,
                                      deterministic=True)
        for X_b, y_b, w_b, _ids_b in batches:
            d = {}
            for index, label in enumerate(labels):
                d[label] = to_one_hot(y_b[:, index])
            d[weights] = w_b
            multiConvMol = ConvMol.agglomerate_mols(X_b)
            d[atom_features] = multiConvMol.get_atom_features()
            d[degree_slice] = multiConvMol.deg_slice
            d[membership] = multiConvMol.membership
            # Fetch the adjacency lists once per batch rather than twice per
            # loop iteration.
            deg_adj_lists = multiConvMol.get_deg_adjacency_lists()
            # Entry 0 is not fed; entry i goes to placeholder deg_adjs[i - 1].
            for i in range(1, len(deg_adj_lists)):
                d[deg_adjs[i - 1]] = deg_adj_lists[i]
            yield d