def construct_feed_dict(self, X_b, y_b=None, w_b=None, ids_b=None):
    """Build the feed dictionary for one minibatch.

    TODO(rbharath): ids_b is not used here. Can we remove it?

    Args:
      X_b: np.ndarray of shape (batch_size, n_features)
      y_b: np.ndarray of shape (batch_size, n_tasks)
      w_b: np.ndarray of shape (batch_size, n_tasks)
      ids_b: List of length (batch_size) with datapoint identifiers.

    Returns:
      The result of ``TensorflowGraph.get_feed_dict`` applied to the
      name -> array mapping assembled here.
    """
    named_inputs = {"mol_features": X_b}
    have_labels = y_b is not None
    have_weights = w_b is not None
    for task in range(self.n_tasks):
        label_key = "labels_%d" % task
        weight_key = "weights_%d" % task
        if have_labels:
            named_inputs[label_key] = to_one_hot(y_b[:, task])
        else:
            # No labels supplied: feed a dummy one-hot batch of zeros.
            named_inputs[label_key] = np.squeeze(
                to_one_hot(np.zeros((self.batch_size,))))
        if have_weights:
            named_inputs[weight_key] = w_b[:, task]
        else:
            # No weights supplied: every sample gets weight one.
            named_inputs[weight_key] = np.ones((self.batch_size,))
    return TensorflowGraph.get_feed_dict(named_inputs)
def construct_feed_dict(self, X_b, y_b=None, w_b=None, ids_b=None):
    """Construct a feed dictionary from minibatch data.

    TODO(rbharath): ids_b is not used here. Can we remove it?

    Args:
      X_b: np.ndarray of shape (batch_size, n_features)
      y_b: np.ndarray of shape (batch_size, n_tasks)
      w_b: np.ndarray of shape (batch_size, n_tasks)
      ids_b: List of length (batch_size) with datapoint identifiers.

    Returns:
      Whatever ``TensorflowGraph.get_feed_dict`` returns for the mapping.
    """
    feed = {}
    feed["mol_features"] = X_b
    for task_idx in range(self.n_tasks):
        # Missing labels/weights are replaced by dummy placeholders sized
        # from self.batch_size.
        feed["labels_%d" % task_idx] = (
            np.squeeze(to_one_hot(np.zeros((self.batch_size,))))
            if y_b is None else to_one_hot(y_b[:, task_idx]))
        feed["weights_%d" % task_idx] = (
            np.ones((self.batch_size,))
            if w_b is None else w_b[:, task_idx])
    return TensorflowGraph.get_feed_dict(feed)
def default_generator(self,
                      dataset,
                      epochs=1,
                      mode='fit',
                      deterministic=True,
                      pad_batches=True):
    """Yield ``([X_b], [y_b], [w_b])`` batches, optionally rotation-augmented.

    When ``mode == "predict"`` or ``self.augment`` is falsy, batches come
    straight from ``dataset.iterbatches``. Otherwise ``dataset.X`` is passed
    through a Keras ``ImageDataGenerator`` with ``rotation_range=180``.

    Args:
      dataset: object exposing ``iterbatches`` and ``X``/``y``/``w``.
      epochs: number of passes over the dataset.
      mode: ``"predict"`` disables augmentation; other values allow it.
      deterministic: if False, batches are shuffled.
      pad_batches: if True, short batches are padded to ``self.batch_size``.

    Yields:
      Tuples ``([X_b], [y_b], [w_b])``. In classification mode ``y_b`` is
      one-hot encoded and reshaped to ``(-1, n_tasks, n_classes)``.
    """
    for _ in range(epochs):
        if mode == "predict" or (not self.augment):
            for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(
                    batch_size=self.batch_size,
                    deterministic=deterministic,
                    pad_batches=pad_batches):
                if y_b is not None and self.mode == 'classification':
                    y_b = to_one_hot(y_b.flatten(), self.n_classes).reshape(
                        -1, self.n_tasks, self.n_classes)
                yield ([X_b], [y_b], [w_b])
        else:
            # One pass over the data is ceil(N / batch_size) batches, with
            # or without padding. Using the ceiling avoids pulling an extra
            # batch when N is an exact multiple of batch_size, and keeps the
            # trailing partial batch when padding is off.
            n_total = dataset.X.shape[0]
            batches_per_epoch = -(-n_total // self.batch_size)
            image_data_generator = tf.keras.preprocessing.image.ImageDataGenerator(
                rotation_range=180)
            flow = image_data_generator.flow(
                dataset.X,
                dataset.y,
                sample_weight=dataset.w,
                shuffle=not deterministic,
                batch_size=self.batch_size)
            for n_batches, (X_b, y_b, w_b) in enumerate(flow, start=1):
                if n_batches > batches_per_epoch:
                    # This is needed because ImageDataGenerator does
                    # infinite looping
                    break
                if pad_batches:
                    ids_b = np.arange(X_b.shape[0])
                    X_b, y_b, w_b, _ = pad_batch(self.batch_size, X_b, y_b,
                                                 w_b, ids_b)
                if self.mode == "classification":
                    y_b = to_one_hot(y_b.flatten(), self.n_classes).reshape(
                        -1, self.n_tasks, self.n_classes)
                yield ([X_b], [y_b], [w_b])
def default_generator(self,
                      dataset,
                      epochs=1,
                      predict=False,
                      deterministic=True,
                      pad_batches=True):
    """Yield one feed dict per minibatch of ConvMol-featurized molecules.

    Args:
      dataset: object exposing ``iterbatches``.
      epochs: number of passes over the dataset.
      predict: when False, an epoch banner is printed.
      deterministic: forwarded to ``dataset.iterbatches``.
      pad_batches: accepted for interface compatibility; batches are always
        requested with ``pad_batches=True``.

    Yields:
      dict mapping this model's label, weight and graph-input keys to the
      arrays for one batch.
    """
    for epoch in range(epochs):
        if not predict:
            print('Starting epoch %i' % epoch)
        for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(
                self.batch_size, pad_batches=True,
                deterministic=deterministic):
            d = {}
            for index, label in enumerate(self.my_labels):
                if self.mode == 'classification':
                    d[label] = to_one_hot(y_b[:, index])
                elif self.mode == 'regression':
                    d[label] = np.expand_dims(y_b[:, index], -1)
            d[self.my_task_weights] = w_b
            multiConvMol = ConvMol.agglomerate_mols(X_b)
            d[self.atom_features] = multiConvMol.get_atom_features()
            d[self.degree_slice] = multiConvMol.deg_slice
            d[self.membership] = multiConvMol.membership
            # Fetch the adjacency lists once instead of on every iteration.
            deg_adj_lists = multiConvMol.get_deg_adjacency_lists()
            # Entry 0 is skipped; entry i feeds placeholder i - 1.
            for i in range(1, len(deg_adj_lists)):
                d[self.deg_adjs[i - 1]] = deg_adj_lists[i]
            yield d
def default_generator(self,
                      dataset,
                      epochs=1,
                      predict=False,
                      deterministic=True,
                      pad_batches=True):
    """Yield one feed dict per minibatch of Weave-featurized molecules.

    TensorGraph style implementation similar to
    deepchem.models.tf_new_models.graph_topology.AlternateWeaveTopology.batch_to_feed_dict
    """
    for epoch in range(epochs):
        if not predict:
            print('Starting epoch %i' % epoch)
        batches = dataset.iterbatches(
            batch_size=self.batch_size,
            deterministic=deterministic,
            pad_batches=pad_batches)
        for (X_b, y_b, w_b, ids_b) in batches:
            feed_dict = {}
            if not predict and y_b is not None:
                for index, label in enumerate(self.labels_fd):
                    if self.mode == "classification":
                        feed_dict[label] = to_one_hot(y_b[:, index])
                    elif self.mode == "regression":
                        feed_dict[label] = y_b[:, index:index + 1]
            if w_b is not None:
                feed_dict[self.weights] = w_b

            atom_blocks, pair_blocks = [], []
            mol_of_atom, pair_index, pair_owner = [], [], []
            # Running count of atoms seen so far in this batch.
            offset = 0
            for mol_idx, mol in enumerate(X_b):
                n_atoms = mol.get_num_atoms()
                # Each atom is tagged with the molecule it belongs to.
                mol_of_atom.extend([mol_idx] * n_atoms)
                # All ordered (row, col) atom pairs within the molecule,
                # shifted by the batch offset.
                C0, C1 = np.meshgrid(np.arange(n_atoms), np.arange(n_atoms))
                rows = C1.flatten() + offset
                cols = C0.flatten() + offset
                pair_index.append(np.array([rows, cols]).T)
                # Each pair is tagged with its (offset) row atom.
                pair_owner.extend(rows)
                offset += n_atoms
                atom_blocks.append(mol.get_atom_features())
                pair_blocks.append(
                    np.reshape(mol.get_pair_features(),
                               (n_atoms * n_atoms, self.n_pair_feat)))

            feed_dict[self.atom_features] = np.concatenate(atom_blocks, axis=0)
            feed_dict[self.pair_features] = np.concatenate(pair_blocks, axis=0)
            feed_dict[self.pair_split] = np.array(pair_owner)
            feed_dict[self.atom_split] = np.array(mol_of_atom)
            feed_dict[self.atom_to_pair] = np.concatenate(pair_index, axis=0)
            yield feed_dict
def default_generator(self,
                      dataset,
                      epochs=1,
                      predict=False,
                      deterministic=True,
                      pad_batches=True):
    """Yield one feed dict per minibatch of ConvMol-featurized molecules.

    Args:
      dataset: object exposing ``iterbatches``.
      epochs: number of passes over the dataset.
      predict: accepted for interface compatibility; not read here.
      deterministic: forwarded to ``dataset.iterbatches``.
      pad_batches: forwarded to ``dataset.iterbatches``.

    Yields:
      dict keyed by ``self.labels[0]``, ``self.task_weights[0]`` and the
      graph-input keys. In classification mode labels are one-hot encoded
      and reshaped to ``(-1, n_tasks, n_classes)``.
    """
    for _ in range(epochs):
        for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(
                self.batch_size,
                pad_batches=pad_batches,
                deterministic=deterministic):
            d = {}
            if self.mode == 'classification':
                d[self.labels[0]] = to_one_hot(
                    y_b.flatten(), self.n_classes).reshape(
                        -1, self.n_tasks, self.n_classes)
            else:
                d[self.labels[0]] = y_b
            d[self.task_weights[0]] = w_b
            multiConvMol = ConvMol.agglomerate_mols(X_b)
            d[self.atom_features] = multiConvMol.get_atom_features()
            d[self.degree_slice] = multiConvMol.deg_slice
            d[self.membership] = multiConvMol.membership
            # Fetch the adjacency lists once instead of on every iteration.
            deg_adj_lists = multiConvMol.get_deg_adjacency_lists()
            # Entry 0 is skipped; entry i feeds placeholder i - 1.
            for i in range(1, len(deg_adj_lists)):
                d[self.deg_adjs[i - 1]] = deg_adj_lists[i]
            yield d
def default_generator(self,
                      dataset,
                      epochs=1,
                      predict=False,
                      deterministic=True,
                      pad_batches=True):
    """Transfer smiles strings to fixed length integer vectors.

    Yields one feed dict per minibatch. Labels and weights are only
    included when ``predict`` is False and the batch provides them.
    """
    for _ in range(epochs):
        batches = dataset.iterbatches(
            batch_size=self.batch_size,
            deterministic=deterministic,
            pad_batches=pad_batches)
        for (X_b, y_b, w_b, ids_b) in batches:
            training = not predict
            feed_dict = {}
            if training and y_b is not None:
                if self.mode != "classification":
                    batch_labels = y_b
                else:
                    batch_labels = to_one_hot(y_b.flatten(), 2).reshape(
                        -1, self.n_tasks, 2)
                feed_dict[self.labels[0]] = batch_labels
            if training and w_b is not None:
                feed_dict[self.task_weights[0]] = w_b
            # Transform SMILES sequence to integers; the batch ids are what
            # gets passed to smiles_to_seq_batch.
            feed_dict[self.smiles_seqs] = self.smiles_to_seq_batch(ids_b)
            yield feed_dict
def default_generator(self,
                      dataset,
                      epochs=1,
                      predict=False,
                      deterministic=True,
                      pad_batches=True):
    """Yield feed dicts holding adjacency, vertex-feature and mask tensors.

    Each element of ``X_b`` is indexed as ``x[0]`` (adjacency), ``x[1]``
    (vertex features) and ``x[2]`` (mask size).

    Args:
      dataset: object exposing ``iterbatches``.
      epochs: number of passes over the dataset.
      predict: when False, an epoch banner is printed.
      deterministic: forwarded to ``dataset.iterbatches``.
      pad_batches: accepted for interface compatibility; batches are always
        requested with ``pad_batches=True`` because the mask is sized from
        ``self.batch_size``.

    Yields:
      dict with label, weight, adjacency, vertex-feature and mask entries.
      The mask has shape ``(batch_size, max_atoms, 1)`` with the first
      ``x[2]`` rows of each sample set to 1.
    """
    for epoch in range(epochs):
        if not predict:
            print('Starting epoch %i' % epoch)
        for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(
                self.batch_size, pad_batches=True,
                deterministic=deterministic):
            d = {}
            for index, label in enumerate(self.my_labels):
                if self.mode == 'classification':
                    d[label] = to_one_hot(y_b[:, index])
                elif self.mode == 'regression':
                    d[label] = np.expand_dims(y_b[:, index], -1)
            d[self.my_task_weights] = w_b
            d[self.adj_matrix] = np.expand_dims(
                np.array([x[0] for x in X_b]), -2)
            d[self.vertex_features] = np.array([x[1] for x in X_b])
            mask = np.zeros(shape=(self.batch_size, self.max_atoms, 1))
            for i in range(self.batch_size):
                mask_size = X_b[i][2]
                # Mark every one of the first mask_size rows. Chained
                # indexing (mask[i][:mask_size][0]) would touch only row 0
                # and fail when mask_size is 0.
                mask[i, :mask_size, 0] = 1
            d[self.mask] = mask
            yield d
def default_generator(self,
                      dataset,
                      epochs=1,
                      predict=False,
                      deterministic=True,
                      pad_batches=True):
    """Transfer smiles strings to fixed length integer vectors.

    Yields one feed dict per minibatch; an epoch banner is printed unless
    ``predict`` is True.
    """
    for epoch in range(epochs):
        if not predict:
            print('Starting epoch %i' % epoch)
        batches = dataset.iterbatches(
            batch_size=self.batch_size,
            deterministic=deterministic,
            pad_batches=pad_batches)
        for (X_b, y_b, w_b, ids_b) in batches:
            feed_dict = {}
            if not predict and y_b is not None:
                for index, label in enumerate(self.labels_fd):
                    if self.mode == "classification":
                        feed_dict[label] = to_one_hot(y_b[:, index])
                    elif self.mode == "regression":
                        # Slice (not index) so the column axis is kept.
                        feed_dict[label] = y_b[:, index:index + 1]
            if w_b is not None:
                feed_dict[self.weights] = w_b
            # Transform SMILES string to integer vectors; each batch id is
            # handed to smiles_to_seq.
            encoded = [self.smiles_to_seq(entry) for entry in ids_b]
            feed_dict[self.smiles_seqs] = np.stack(encoded, axis=0)
            yield feed_dict
def default_generator(self,
                      dataset,
                      epochs=1,
                      predict=False,
                      deterministic=True,
                      pad_batches=True):
    """Yield one feed dict per minibatch of DAG-featurized molecules.

    TensorGraph style implementation similar to
    deepchem.models.tf_new_models.graph_topology.DAGGraphTopology.batch_to_feed_dict
    """
    for epoch in range(epochs):
        if not predict:
            print('Starting epoch %i' % epoch)
        batches = dataset.iterbatches(
            batch_size=self.batch_size,
            deterministic=deterministic,
            pad_batches=pad_batches)
        for (X_b, y_b, w_b, ids_b) in batches:
            feed_dict = {}
            if not predict and y_b is not None:
                for index, label in enumerate(self.labels_fd):
                    if self.mode == "classification":
                        feed_dict[label] = to_one_hot(y_b[:, index])
                    elif self.mode == "regression":
                        feed_dict[label] = y_b[:, index:index + 1]
            if w_b is not None:
                feed_dict[self.weights] = w_b

            atom_counts = [mol.get_num_atoms() for mol in X_b]
            total_atoms = sum(atom_counts)
            # Position of each molecule's first atom in the batch.
            offsets = [0] + list(np.cumsum(atom_counts)[:-1])

            atom_blocks = []
            parent_rows = []
            order_blocks = []
            mask_blocks = []
            owners = []
            for mol_idx, mol in enumerate(X_b):
                atom_blocks.append(mol.get_atom_features())
                mol_parents = mol.parents
                parent_rows.extend(mol_parents)
                order = np.array(mol_parents)[:, :, 0]
                # True wherever the index differs from self.max_atoms.
                mol_mask = np.array(order - self.max_atoms, dtype=bool)
                order_blocks.append(order + offsets[mol_idx])
                mask_blocks.append(mol_mask)
                owners.extend([mol_idx] * atom_counts[mol_idx])

            feed_dict[self.atom_features] = np.concatenate(atom_blocks, axis=0)
            feed_dict[self.parents] = np.stack(parent_rows, axis=0)
            feed_dict[self.calculation_orders] = np.concatenate(
                order_blocks, axis=0)
            feed_dict[self.calculation_masks] = np.concatenate(
                mask_blocks, axis=0)
            feed_dict[self.membership] = np.array(owners)
            feed_dict[self.n_atoms] = total_atoms
            yield feed_dict
def default_generator(self,
                      dataset,
                      epochs=1,
                      mode='fit',
                      deterministic=True,
                      pad_batches=True):
    """Yield ``(inputs, [y_b], [w_b])`` tuples for ConvMol-featurized batches.

    Args:
      dataset: object exposing ``iterbatches``.
      epochs: number of passes over the dataset.
      mode: ``'predict'`` sets the dropout input to 0.0; any other value
        sets it to 1.0.
      deterministic: forwarded to ``dataset.iterbatches``.
      pad_batches: forwarded to ``dataset.iterbatches``.

    Yields:
      ``(inputs, [y_b], [w_b])`` where ``inputs`` is the list
      ``[atom_features, deg_slice, membership, n_samples, dropout,
      *deg_adjacency_lists[1:]]``. In classification mode non-None labels
      are one-hot encoded and reshaped to ``(-1, n_tasks, n_classes)``.
    """
    for _ in range(epochs):
        for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(
                batch_size=self.batch_size,
                deterministic=deterministic,
                pad_batches=pad_batches):
            # Guard against label-free batches, as the sibling generators do.
            if y_b is not None and self.mode == 'classification':
                y_b = to_one_hot(y_b.flatten(), self.n_classes).reshape(
                    -1, self.n_tasks, self.n_classes)
            multiConvMol = ConvMol.agglomerate_mols(X_b)
            n_samples = np.array(X_b.shape[0])
            if mode == 'predict':
                dropout = np.array(0.0)
            else:
                dropout = np.array(1.0)
            inputs = [
                multiConvMol.get_atom_features(),
                multiConvMol.deg_slice,
                np.array(multiConvMol.membership),
                n_samples,
                dropout,
            ]
            # Fetch the adjacency lists once; entry 0 is skipped.
            deg_adj_lists = multiConvMol.get_deg_adjacency_lists()
            for i in range(1, len(deg_adj_lists)):
                inputs.append(deg_adj_lists[i])
            yield (inputs, [y_b], [w_b])
def default_generator(self,
                      dataset,
                      epochs=1,
                      predict=False,
                      deterministic=True,
                      pad_batches=True):
    """Generate feed dicts for ConvMol-featurized minibatches.

    Args:
      dataset: object exposing ``iterbatches``.
      epochs: number of passes over the dataset.
      predict: accepted for interface compatibility; not read here.
      deterministic: forwarded to ``dataset.iterbatches``.
      pad_batches: forwarded to ``dataset.iterbatches``.

    Yields:
      dict with labels (one-hot, reshaped to ``(-1, n_tasks, n_classes)``
      in classification mode), task weights and the graph inputs.
    """
    for _ in range(epochs):
        batch_iter = dataset.iterbatches(
            self.batch_size,
            pad_batches=pad_batches,
            deterministic=deterministic)
        for (X_b, y_b, w_b, ids_b) in batch_iter:
            d = {}
            if self.mode == 'classification':
                d[self.labels[0]] = to_one_hot(
                    y_b.flatten(), self.n_classes).reshape(
                        -1, self.n_tasks, self.n_classes)
            else:
                d[self.labels[0]] = y_b
            d[self.task_weights[0]] = w_b
            multiConvMol = ConvMol.agglomerate_mols(X_b)
            d[self.atom_features] = multiConvMol.get_atom_features()
            d[self.degree_slice] = multiConvMol.deg_slice
            d[self.membership] = multiConvMol.membership
            # Loop-invariant: fetch the adjacency lists a single time.
            adjacency = multiConvMol.get_deg_adjacency_lists()
            # Entry 0 is skipped; entry i feeds placeholder i - 1.
            for i in range(1, len(adjacency)):
                d[self.deg_adjs[i - 1]] = adjacency[i]
            yield d
def default_generator(self,
                      dataset,
                      epochs=1,
                      predict=False,
                      deterministic=True,
                      pad_batches=True):
    """Yield one feed dict per minibatch with features, labels and weights.

    Labels and weights are only added when ``predict`` is False and the
    batch provides them; features are added whenever ``X_b`` is not None.
    """
    for _ in range(epochs):
        batches = dataset.iterbatches(
            batch_size=self.batch_size,
            deterministic=deterministic,
            pad_batches=pad_batches)
        for (X_b, y_b, w_b, ids_b) in batches:
            training = not predict
            feed_dict = {}
            if training and y_b is not None:
                if self.mode != 'regression':
                    # Any non-regression mode gets one-hot labels.
                    batch_labels = to_one_hot(
                        y_b.flatten(), self.n_classes).reshape(
                            -1, self.n_tasks, self.n_classes)
                else:
                    batch_labels = y_b
                feed_dict[self.labels[0]] = batch_labels
            if X_b is not None:
                feed_dict[self.features[0]] = X_b
            if training and w_b is not None:
                feed_dict[self.task_weights[0]] = w_b
            yield feed_dict
def default_generator(self,
                      dataset,
                      epochs=1,
                      predict=False,
                      deterministic=True,
                      pad_batches=True):
    """Transfer smiles strings to fixed length integer vectors.

    One feed dict is yielded per minibatch. In predict mode neither labels
    nor weights are included.
    """
    for _ in range(epochs):
        for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(
                batch_size=self.batch_size,
                deterministic=deterministic,
                pad_batches=pad_batches):
            feed_dict = {}
            if not predict:
                if y_b is not None:
                    if self.mode == "classification":
                        # Two classes are hard-coded here.
                        one_hot = to_one_hot(y_b.flatten(), 2)
                        feed_dict[self.labels[0]] = one_hot.reshape(
                            -1, self.n_tasks, 2)
                    else:
                        feed_dict[self.labels[0]] = y_b
                if w_b is not None:
                    feed_dict[self.task_weights[0]] = w_b
            # Transform SMILES sequence to integers
            seq_batch = self.smiles_to_seq_batch(ids_b)
            feed_dict[self.smiles_seqs] = seq_batch
            yield feed_dict
def test_one_hot(self):
    """Round-trip binary labels through metrics.to_one_hot/from_one_hot."""
    labels = np.array([0, 0, 1, 0, 1, 1, 0])
    encoded = metrics.to_one_hot(labels)
    # Label 0 -> [1, 0], label 1 -> [0, 1].
    expected = np.array([
        [1, 0],
        [1, 0],
        [0, 1],
        [1, 0],
        [0, 1],
        [0, 1],
        [1, 0],
    ])
    decoded = metrics.from_one_hot(encoded)
    assert np.array_equal(expected, encoded)
    assert np.array_equal(labels, decoded)
def test_one_hot():
    """Test the one hot encoding."""
    y = np.array([0, 0, 1, 0, 1, 1, 0])
    one_hot = to_one_hot(y)
    # Expected encoding, one row per label.
    rows = [[1, 0], [1, 0], [0, 1], [1, 0], [0, 1], [0, 1], [1, 0]]
    expected = np.array(rows)
    recovered = from_one_hot(one_hot)
    assert np.array_equal(expected, one_hot)
    # Decoding must give back the original labels.
    assert np.array_equal(y, recovered)
def test_normalize_1d_classification_multiclass_explicit_nclasses():
    """Tests 1d classification normalization."""
    # Labels are drawn from [0, 5) but n_classes is passed explicitly as 10.
    y = np.random.randint(5, size=(10,))
    one_hot = to_one_hot(y, n_classes=10)
    y_expected = np.expand_dims(one_hot, 1)
    y_out = normalize_prediction_shape(
        y, mode="classification", n_classes=10, n_tasks=1)
    assert y_out.shape == (10, 1, 10)
    assert np.array_equal(y_expected, y_out)
def default_generator(self,
                      dataset,
                      epochs=1,
                      predict=False,
                      deterministic=True,
                      pad_batches=True):
    """Yield one feed dict per minibatch of atom/pair featurized molecules.

    Same generator as Weave models

    Args:
      dataset: object exposing ``iterbatches``.
      epochs: number of passes over the dataset.
      predict: accepted for interface compatibility; not read here.
      deterministic: forwarded to ``dataset.iterbatches``.
      pad_batches: accepted for interface compatibility; batches are
        requested with ``pad_batches=False`` and the features are then
        passed through ``pad_features``.

    Yields:
      dict with labels, task weights, atom features, pair features,
      atom-to-molecule assignment and atom-pair index.
    """
    for _ in range(epochs):
        for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(
                batch_size=self.batch_size,
                deterministic=deterministic,
                pad_batches=False):
            X_b = pad_features(self.batch_size, X_b)
            feed_dict = dict()
            if y_b is not None:
                if self.mode == 'classification':
                    feed_dict[self.labels[0]] = to_one_hot(
                        y_b.flatten(), self.n_classes).reshape(
                            -1, self.n_tasks, self.n_classes)
                else:
                    feed_dict[self.labels[0]] = y_b
            if w_b is not None:
                feed_dict[self.task_weights[0]] = w_b

            atom_feat = []
            pair_feat = []
            atom_split = []
            atom_to_pair = []
            # Running count of atoms seen so far in this batch.
            start = 0
            for im, mol in enumerate(X_b):
                n_atoms = mol.get_num_atoms()
                # Each atom is tagged with the molecule it belongs to.
                atom_split.extend([im] * n_atoms)
                # index of pair features
                C0, C1 = np.meshgrid(np.arange(n_atoms), np.arange(n_atoms))
                atom_to_pair.append(
                    np.transpose(
                        np.array([C1.flatten() + start,
                                  C0.flatten() + start])))
                start = start + n_atoms
                # atom features
                atom_feat.append(mol.get_atom_features())
                # pair features
                pair_feat.append(
                    np.reshape(mol.get_pair_features(),
                               (n_atoms * n_atoms, self.n_pair_feat)))

            feed_dict[self.atom_features] = np.concatenate(atom_feat, axis=0)
            feed_dict[self.pair_features] = np.concatenate(pair_feat, axis=0)
            feed_dict[self.atom_split] = np.array(atom_split)
            feed_dict[self.atom_to_pair] = np.concatenate(atom_to_pair, axis=0)
            yield feed_dict
def default_generator(self,
                      dataset,
                      epochs=1,
                      predict=False,
                      deterministic=True,
                      pad_batches=True):
    """TensorGraph style implementation.

    Yields one feed dict per minibatch of DAG-featurized molecules.
    """
    for _ in range(epochs):
        batches = dataset.iterbatches(
            batch_size=self.batch_size,
            deterministic=deterministic,
            pad_batches=pad_batches)
        for (X_b, y_b, w_b, ids_b) in batches:
            feed_dict = {}
            if y_b is not None:
                if self.mode != 'classification':
                    feed_dict[self.labels[0]] = y_b
                else:
                    feed_dict[self.labels[0]] = to_one_hot(
                        y_b.flatten(), self.n_classes).reshape(
                            -1, self.n_tasks, self.n_classes)
            if w_b is not None:
                feed_dict[self.task_weights[0]] = w_b

            sizes = [mol.get_num_atoms() for mol in X_b]
            atom_total = sum(sizes)
            # Position of each molecule's first atom in the batch.
            first_atom = [0] + list(np.cumsum(sizes)[:-1])

            features, parent_rows = [], []
            orders, masks, owner_ids = [], [], []
            for idm, mol in enumerate(X_b):
                features.append(mol.get_atom_features())
                parents = mol.parents
                parent_rows.extend(parents)
                calc_index = np.array(parents)[:, :, 0]
                # True wherever the index differs from self.max_atoms.
                calc_mask = np.array(calc_index - self.max_atoms, dtype=bool)
                orders.append(calc_index + first_atom[idm])
                masks.append(calc_mask)
                owner_ids.extend([idm] * sizes[idm])

            feed_dict[self.atom_features] = np.concatenate(features, axis=0)
            feed_dict[self.parents] = np.stack(parent_rows, axis=0)
            feed_dict[self.calculation_orders] = np.concatenate(orders, axis=0)
            feed_dict[self.calculation_masks] = np.concatenate(masks, axis=0)
            feed_dict[self.membership] = np.array(owner_ids)
            feed_dict[self.n_atoms] = atom_total
            yield feed_dict
def _construct_feed_dict(self, X_b, y_b, w_b, ids_b):
    """Map this model's layer output tensors to one batch of data.

    Labels are one-hot encoded column by column; weights and features are
    fed as given. ``ids_b`` is not read.
    """
    mapping = {}
    if y_b is not None:
        for column, label_layer in enumerate(self.labels):
            mapping[label_layer.out_tensor] = to_one_hot(y_b[:, column])
    has_weights = self.task_weights is not None and w_b is not None
    if has_weights:
        mapping[self.task_weights.out_tensor] = w_b
    if self.features is not None:
        mapping[self.features[0].out_tensor] = X_b
    return mapping
def default_generator(self,
                      dataset,
                      epochs=1,
                      predict=False,
                      deterministic=True,
                      pad_batches=True):
    """TensorGraph style implementation.

    Produces a feed dict for every minibatch of DAG-featurized molecules.
    """
    for _ in range(epochs):
        for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(
                batch_size=self.batch_size,
                deterministic=deterministic,
                pad_batches=pad_batches):
            feed_dict = {}
            if y_b is not None:
                if self.mode == 'classification':
                    encoded = to_one_hot(y_b.flatten(), self.n_classes)
                    feed_dict[self.labels[0]] = encoded.reshape(
                        -1, self.n_tasks, self.n_classes)
                else:
                    feed_dict[self.labels[0]] = y_b
            if w_b is not None:
                feed_dict[self.task_weights[0]] = w_b

            per_mol_atoms = [mol.get_num_atoms() for mol in X_b]
            n_atoms_total = sum(per_mol_atoms)
            # Index of each molecule's first atom in the batch.
            base_index = [0] + list(np.cumsum(per_mol_atoms)[:-1])

            all_atoms = []
            all_parents = []
            all_orders = []
            all_masks = []
            atom_owner = []
            for mol_no, mol in enumerate(X_b):
                all_atoms.append(mol.get_atom_features())
                mol_parents = mol.parents
                all_parents.extend(mol_parents)
                local_order = np.array(mol_parents)[:, :, 0]
                # True wherever the index differs from self.max_atoms.
                local_mask = np.array(
                    local_order - self.max_atoms, dtype=bool)
                # Shift this molecule's indices by its batch offset.
                all_orders.append(local_order + base_index[mol_no])
                all_masks.append(local_mask)
                atom_owner.extend([mol_no] * per_mol_atoms[mol_no])

            feed_dict[self.atom_features] = np.concatenate(all_atoms, axis=0)
            feed_dict[self.parents] = np.stack(all_parents, axis=0)
            feed_dict[self.calculation_orders] = np.concatenate(
                all_orders, axis=0)
            feed_dict[self.calculation_masks] = np.concatenate(
                all_masks, axis=0)
            feed_dict[self.membership] = np.array(atom_owner)
            feed_dict[self.n_atoms] = n_atoms_total
            yield feed_dict
def default_generator(self,
                      dataset,
                      epochs=1,
                      mode='fit',
                      deterministic=True,
                      pad_batches=True):
    """Yield ``(inputs, [y_b], [w_b])`` for atom/pair featurized batches.

    Args:
      dataset: object exposing ``iterbatches``.
      epochs: number of passes over the dataset.
      mode: accepted for interface compatibility; not read here.
      deterministic: forwarded to ``dataset.iterbatches``.
      pad_batches: forwarded to ``dataset.iterbatches``.

    Yields:
      ``(inputs, [y_b], [w_b])`` where ``inputs`` is ``[atom_features,
      pair_features, atom_split, atom_to_pair, n_samples]``. ``n_samples``
      is taken from ``X_b.shape[0]`` before ``pad_features`` is applied.
    """
    for _ in range(epochs):
        for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(
                batch_size=self.batch_size,
                deterministic=deterministic,
                pad_batches=pad_batches):
            n_samples = np.array(X_b.shape[0])
            X_b = pad_features(self.batch_size, X_b)
            if y_b is not None and self.mode == 'classification':
                y_b = to_one_hot(y_b.flatten(), self.n_classes).reshape(
                    -1, self.n_tasks, self.n_classes)

            atom_feat = []
            pair_feat = []
            atom_split = []
            atom_to_pair = []
            # Running count of atoms seen so far in this batch.
            start = 0
            for im, mol in enumerate(X_b):
                n_atoms = mol.get_num_atoms()
                # Each atom is tagged with the molecule it belongs to.
                atom_split.extend([im] * n_atoms)
                # index of pair features
                C0, C1 = np.meshgrid(np.arange(n_atoms), np.arange(n_atoms))
                atom_to_pair.append(
                    np.transpose(
                        np.array([C1.flatten() + start,
                                  C0.flatten() + start])))
                start = start + n_atoms
                # atom features
                atom_feat.append(mol.get_atom_features())
                # pair features
                pair_feat.append(
                    np.reshape(mol.get_pair_features(),
                               (n_atoms * n_atoms, self.n_pair_feat)))

            inputs = [
                np.concatenate(atom_feat, axis=0),
                np.concatenate(pair_feat, axis=0),
                np.array(atom_split),
                np.concatenate(atom_to_pair, axis=0),
                n_samples,
            ]
            yield (inputs, [y_b], [w_b])
def test_handle_classification_mode_threshold_one_hot():
    """Test proper thresholding."""
    # Random rows normalized to sum to one, then given a size-1 middle axis.
    raw = np.random.rand(10, 2)
    probs = raw / np.sum(raw, axis=1)[:, np.newaxis]
    y = np.expand_dims(probs, 1)
    winners = np.argmax(np.squeeze(y), axis=1)
    y_expected = np.expand_dims(to_one_hot(winners, n_classes=2), 1)
    y_out = handle_classification_mode(
        y, "threshold-one-hot", threshold_value=0.5)
    assert y_out.shape == (10, 1, 2)
    assert np.array_equal(y_out, y_expected)
def get_data_dict(self, X, y=None):
    """Wrap data X in dict for graph computations (Keras graph only for now).

    Classification columns of ``y`` are one-hot encoded, regression columns
    are passed through; tasks of any other type get no entry.
    """
    data = {"input": X}
    for ind, task in enumerate(self.tasks):
        task_type = self.task_types[task]
        taskname = "task%d" % ind
        if y is None:
            continue
        if task_type == "regression":
            data[taskname] = y[:, ind]
        elif task_type == "classification":
            data[taskname] = to_one_hot(y[:, ind])
    return data
def get_data_dict(self, X, y=None):
    """Wrap data X in dict for graph computations (Keras graph only for now).

    One ``"task<i>"`` entry is added per task when ``y`` is given and
    ``self.task_type`` is ``"classification"`` or ``"regression"``.
    """
    data = {"input": X}
    for task in range(self.n_tasks):
        if y is None:
            continue
        taskname = "task%d" % task
        column = y[:, task]
        if self.task_type == "classification":
            data[taskname] = to_one_hot(column)
        elif self.task_type == "regression":
            data[taskname] = column
    return data
def predict_proba_on_batch(self, support, test_batch):
    """Make predictions on batch of data.

    The test batch is padded to ``self.test_batch_size``, run through the
    session, and the rounded predictions are returned one-hot encoded.
    """
    # NOTE(review): n_samples is computed but never used, so the returned
    # array still covers the padded batch -- confirm callers expect that.
    n_samples = len(test_batch)
    padded_fields = pad_batch(self.test_batch_size, test_batch.X,
                              test_batch.y, test_batch.w, test_batch.ids)
    padded_test_batch = NumpyDataset(*padded_fields)
    feed_dict = self.construct_feed_dict(padded_test_batch, support)
    # Get scores (fetched together with the predictions, but only the
    # predictions are used below).
    fetches = [self.pred_op, self.scores_op]
    pred, _scores = self.sess.run(fetches, feed_dict=feed_dict)
    return to_one_hot(np.round(pred))
def data_generator(self, dataset, batch_size: int, epochs=1):
    """Yield one feed dict per padded, deterministic minibatch.

    Only the first label column (``y[:, 0]``) is used; it is one-hot
    encoded before being fed.
    """
    for _ in range(epochs):
        batches = dataset.iterbatches(
            batch_size, pad_batches=True, deterministic=True)
        for X, y, w, idx in batches:
            feed_dict = {}
            feed_dict[self.label] = to_one_hot(y[:, 0])
            feed_dict[self.weight] = w
            # data for feed
            merged = ConvMol.agglomerate_mols(X)
            feed_dict[self.atom_features] = merged.get_atom_features()
            feed_dict[self.indexing] = merged.deg_slice
            feed_dict[self.membership] = merged.membership
            adjacency = merged.get_deg_adjacency_lists()
            # Entry 0 is skipped; entry k feeds placeholder k - 1.
            for k in range(1, len(adjacency)):
                feed_dict[self.deg_adj_list[k - 1]] = adjacency[k]
            yield feed_dict
def default_generator(self,
                      dataset,
                      epochs=1,
                      mode='fit',
                      deterministic=True,
                      pad_batches=True):
    """Convert a dataset into the tensors needed for learning.

    Yields ``(inputs, [y_b], [w_b])`` where ``inputs`` is ``[atom_features,
    parents, calculation_orders, calculation_masks, membership, n_atoms,
    dropout]``. The dropout input is 0.0 in ``'predict'`` mode, else 1.0.
    """
    for _ in range(epochs):
        batches = dataset.iterbatches(
            batch_size=self.batch_size,
            deterministic=deterministic,
            pad_batches=pad_batches)
        for (X_b, y_b, w_b, ids_b) in batches:
            if y_b is not None and self.mode == 'classification':
                y_b = to_one_hot(y_b.flatten(), self.n_classes).reshape(
                    -1, self.n_tasks, self.n_classes)

            atom_counts = [mol.get_num_atoms() for mol in X_b]
            total_atoms = sum(atom_counts)
            # Position of each molecule's first atom in the batch.
            offsets = [0] + list(np.cumsum(atom_counts)[:-1])

            atom_blocks, parent_rows = [], []
            order_blocks, mask_blocks, owners = [], [], []
            for idm, mol in enumerate(X_b):
                atom_blocks.append(mol.get_atom_features())
                parents = mol.parents
                parent_rows.extend(parents)
                order = np.array(parents)[:, :, 0]
                # True wherever the index differs from self.max_atoms.
                order_mask = np.array(order - self.max_atoms, dtype=bool)
                order_blocks.append(order + offsets[idm])
                mask_blocks.append(order_mask)
                owners.extend([idm] * atom_counts[idm])

            dropout = np.array(0.0) if mode == 'predict' else np.array(1.0)
            inputs = [
                np.concatenate(atom_blocks, axis=0),
                np.stack(parent_rows, axis=0),
                np.concatenate(order_blocks, axis=0),
                np.concatenate(mask_blocks, axis=0),
                np.array(owners),
                np.array(total_atoms),
                dropout,
            ]
            yield (inputs, [y_b], [w_b])
def default_generator(self,
                      dataset,
                      epochs=1,
                      mode='fit',
                      deterministic=True,
                      pad_batches=True):
    """Yield ``([X_b], [y_b], [w_b])`` with one-hot encoded labels.

    Non-None labels are flattened, one-hot encoded over ``self.n_classes``
    and reshaped to ``(-1, n_tasks, n_classes)``. ``mode`` is not read.
    """
    for _ in range(epochs):
        batches = dataset.iterbatches(
            batch_size=self.batch_size,
            deterministic=deterministic,
            pad_batches=pad_batches)
        for (X_b, y_b, w_b, ids_b) in batches:
            if y_b is None:
                # Nothing to encode; pass the missing labels through.
                yield ([X_b], [y_b], [w_b])
                continue
            encoded = to_one_hot(y_b.flatten(), self.n_classes)
            y_b = encoded.reshape(-1, self.n_tasks, self.n_classes)
            yield ([X_b], [y_b], [w_b])
def feed_dict_generator(dataset, batch_size, epochs=1):
    """Yield one feed dict per padded minibatch of ConvMol molecules.

    Reads ``labels``, ``task_weights``, ``atom_features``, ``degree_slice``,
    ``membership`` and ``deg_adjs`` from the enclosing scope.

    Args:
      dataset: object exposing ``iterbatches``.
      batch_size: batch size passed to ``dataset.iterbatches``.
      epochs: number of passes over the dataset.

    Yields:
      dict mapping each label key to the one-hot encoding of its label
      column, plus task weights and the graph inputs.
    """
    for _ in range(epochs):
        for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(
                batch_size, pad_batches=True):
            d = {}
            for index, label in enumerate(labels):
                d[label] = to_one_hot(y_b[:, index])
            d[task_weights] = w_b
            multiConvMol = ConvMol.agglomerate_mols(X_b)
            d[atom_features] = multiConvMol.get_atom_features()
            d[degree_slice] = multiConvMol.deg_slice
            d[membership] = multiConvMol.membership
            # Fetch the adjacency lists once instead of on every iteration.
            deg_adj_lists = multiConvMol.get_deg_adjacency_lists()
            # Entry 0 is skipped; entry i feeds placeholder i - 1.
            for i in range(1, len(deg_adj_lists)):
                d[deg_adjs[i - 1]] = deg_adj_lists[i]
            yield d
def _construct_feed_dict(self, X_b, y_b, w_b, ids_b):
    """Map this model's layer output tensors to one ConvMol batch.

    Args:
      X_b: batch of molecules accepted by ``ConvMol.agglomerate_mols``.
      y_b: labels indexed as ``y_b[:, task]``, or None.
      w_b: task weights, or None.
      ids_b: not read here.

    Returns:
      dict from ``out_tensor`` objects to arrays. Feature slots 0-2 receive
      atom features, ``deg_slice`` and ``membership``; slot ``i + 3``
      receives adjacency list ``i + 1`` for ``i`` in
      ``range(self.max_degree)``.
    """
    feed_dict = dict()
    if y_b is not None:
        for index, label in enumerate(self.labels):
            feed_dict[label.out_tensor] = to_one_hot(y_b[:, index])
    if self.task_weights is not None and w_b is not None:
        feed_dict[self.task_weights[0].out_tensor] = w_b
    if self.features is not None:
        multiConvMol = ConvMol.agglomerate_mols(X_b)
        feed_dict[self.features[0].out_tensor] = multiConvMol.get_atom_features()
        feed_dict[self.features[1].out_tensor] = multiConvMol.deg_slice
        feed_dict[self.features[2].out_tensor] = multiConvMol.membership
        # Fetch the adjacency lists once instead of once per degree.
        deg_adj_lists = multiConvMol.get_deg_adjacency_lists()
        for i in range(self.max_degree):
            feed_dict[self.features[i + 3].out_tensor] = deg_adj_lists[i + 1]
    return feed_dict
def default_generator(
        self,
        dataset: dc.data.Dataset,
        epochs: int = 1,
        mode: str = 'fit',
        deterministic: bool = True,
        pad_batches: bool = True) -> Iterable[Tuple[List, List, List]]:
    """Yield ``([X_b], [y_b], [w_b])`` with one-hot encoded labels.

    Labels that are not None are flattened, one-hot encoded over
    ``self.n_classes`` and reshaped to ``(-1, n_tasks, n_classes)``.
    ``mode`` is not read here.
    """
    for _ in range(epochs):
        for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(
                batch_size=self.batch_size,
                deterministic=deterministic,
                pad_batches=pad_batches):
            labels = (None if y_b is None else to_one_hot(
                y_b.flatten(), self.n_classes).reshape(
                    -1, self.n_tasks, self.n_classes))
            yield ([X_b], [labels], [w_b])
def construct_feed_dict(self, X_b, y_b=None, w_b=None, ids_b=None):
    """Build a feed dictionary whose labels are single-column arrays.

    Each task's labels are one-hot encoded and only column 1 is kept.
    Missing labels or weights are replaced by dummy zeros/ones sized from
    ``self.batch_size``. ``ids_b`` is not read.
    """
    named_inputs = {"mol_features": X_b}
    for task in range(self.n_tasks):
        label_key = "labels_%d" % task
        weight_key = "weights_%d" % task
        if y_b is None:
            # Dummy placeholders
            named_inputs[label_key] = np.zeros((self.batch_size, 1))
        else:
            two_col = to_one_hot(y_b[:, task])
            # fix the size to be [?,1]
            named_inputs[label_key] = two_col[:, 1:2]
        if w_b is None:
            # Dummy placeholders
            named_inputs[weight_key] = np.ones((self.batch_size,))
        else:
            named_inputs[weight_key] = w_b[:, task]
    return TensorflowGraph.get_feed_dict(named_inputs)
def construct_feed_dict(self, X_b, y_b=None, w_b=None, ids_b=None):
    """Construct a feed dictionary with one label column per task.

    Labels are one-hot encoded and sliced to their second column; absent
    labels/weights fall back to zero/one placeholders. ``ids_b`` is unused.
    """
    feed = {}
    feed["mol_features"] = X_b
    have_labels = y_b is not None
    have_weights = w_b is not None
    for task_idx in range(self.n_tasks):
        if have_labels:
            # fix the size to be [?,1]: slicing 1:2 keeps the column axis.
            labels = to_one_hot(y_b[:, task_idx])[:, 1:2]
        else:
            # Dummy placeholders
            labels = np.zeros((self.batch_size, 1))
        feed["labels_%d" % task_idx] = labels
        if have_weights:
            weights = w_b[:, task_idx]
        else:
            # Dummy placeholders
            weights = np.ones((self.batch_size,))
        feed["weights_%d" % task_idx] = weights
    return TensorflowGraph.get_feed_dict(feed)
def compute_loss_on_valid(valid, model, tasks, mode, verbose=True):
    """Evaluate the model's loss function on a validation dataset.

    Args:
      valid: dataset exposing ``y`` and ``w``; passed to ``model.predict``.
      model: object exposing ``_loss_fn``, ``predict`` and, outside eager
        mode, ``session``.
      tasks: sequence whose length gives the number of tasks.
      mode: ``"classification"`` one-hot encodes the labels over two
        classes; any other value uses ``valid.y`` unchanged.
      verbose: if True, the loss is logged at INFO level.

    Returns:
      The evaluated loss value.
    """
    loss_fn = model._loss_fn
    outputs = model.predict(valid, transformers=[])
    if mode != "classification":
        labels = valid.y
    else:
        one_hot = to_one_hot(valid.y.flatten(), 2)
        labels = one_hot.reshape(-1, len(tasks), 2)
    loss_tensor = loss_fn([outputs], [labels], weights=[valid.w])
    # Eager tensors carry their value; otherwise run the session.
    loss = (loss_tensor.numpy()
            if tf.executing_eagerly() else model.session.run(loss_tensor))
    if verbose:
        logger.info("Computed loss on validation set: {}".format(loss))
    return loss
def default_generator(self,
                      dataset,
                      epochs=1,
                      mode='fit',
                      deterministic=True,
                      pad_batches=True):
    """Transfer smiles strings to fixed length integer vectors.

    Yields ``([seqs], [y_b], [w_b])`` where ``seqs`` is the result of
    ``self.smiles_to_seq_batch(ids_b)``. The batch's own ``X_b`` is
    discarded. ``mode`` is not read here.
    """
    for _ in range(epochs):
        batches = dataset.iterbatches(
            batch_size=self.batch_size,
            deterministic=deterministic,
            pad_batches=pad_batches)
        for (X_b, y_b, w_b, ids_b) in batches:
            if y_b is not None and self.mode == 'classification':
                # Two classes are hard-coded here.
                y_b = to_one_hot(y_b.flatten(), 2).reshape(
                    -1, self.n_tasks, 2)
            # Transform SMILES sequence to integers
            seqs = self.smiles_to_seq_batch(ids_b)
            yield ([seqs], [y_b], [w_b])
def data_generator(dataset, epochs=1, predict=False, pad_batches=True):
    """Yield one feed dict per deterministic minibatch of ConvMol molecules.

    Reads ``batch_size``, ``labels``, ``weights``, ``atom_features``,
    ``degree_slice``, ``membership`` and ``deg_adjs`` from the enclosing
    scope.

    Args:
      dataset: object exposing ``iterbatches``.
      epochs: number of passes over the dataset.
      predict: when False, an epoch banner is printed.
      pad_batches: forwarded to ``dataset.iterbatches``.

    Yields:
      dict mapping each label key to the one-hot encoding of its label
      column, plus weights and the graph inputs.
    """
    for epoch in range(epochs):
        if not predict:
            print('Starting epoch %i' % epoch)
        for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(
                batch_size, pad_batches=pad_batches, deterministic=True):
            d = {}
            for index, label in enumerate(labels):
                d[label] = to_one_hot(y_b[:, index])
            d[weights] = w_b
            multiConvMol = ConvMol.agglomerate_mols(X_b)
            d[atom_features] = multiConvMol.get_atom_features()
            d[degree_slice] = multiConvMol.deg_slice
            d[membership] = multiConvMol.membership
            # Fetch the adjacency lists once instead of on every iteration.
            deg_adj_lists = multiConvMol.get_deg_adjacency_lists()
            # Entry 0 is skipped; entry i feeds placeholder i - 1.
            for i in range(1, len(deg_adj_lists)):
                d[deg_adjs[i - 1]] = deg_adj_lists[i]
            yield d