def split(self, dataset, frac_split, split_dirs=None):
    """Split a dataset into two halves, task by task.

    The arrays are first passed through ``randomize_arrays`` (presumably a
    consistent row shuffle -- confirm against that helper). For every task a
    cut index is then obtained from ``self.get_task_split_indices``; weights
    before the cut are copied to the first half and weights from the cut
    onwards are copied to the second half. Rows whose weights end up all
    zero in a half are dropped from that half.

    Parameters
    ----------
    dataset: Dataset
        Source dataset exposing ``X``, ``y``, ``w`` and ``ids``.
    frac_split: float
        Fraction forwarded to ``self.get_task_split_indices``. A value of
        exactly 1 short-circuits and returns the full (unshuffled) data.
    split_dirs: sequence, optional
        If given, it must contain exactly two entries. The entries are not
        otherwise used by this method.

    Returns
    -------
    tuple
        ``(dataset_1, dataset_2)``. ``dataset_2`` is ``None`` when
        ``frac_split == 1``.
    """
    # Only validate caller-supplied directories. Nothing below writes to
    # disk, so no temporary directories are created when none are given;
    # creating them would leave two empty directories behind on every call.
    if split_dirs is not None:
        assert len(split_dirs) == 2

    # Handle edge case where frac_split is 1
    if frac_split == 1:
        dataset_1 = NumpyDataset(dataset.X, dataset.y, dataset.w, dataset.ids)
        return dataset_1, None

    X, y, w, ids = randomize_arrays(
        (dataset.X, dataset.y, dataset.w, dataset.ids))
    split_indices = self.get_task_split_indices(y, w, frac_split)

    # Create weight matrices for the two halves.
    w_1, w_2 = np.zeros_like(w), np.zeros_like(w)
    for task, split_index in enumerate(split_indices):
        # Rows before the cut keep their weight in the first half, rows
        # from the cut onwards keep their weight in the second half.
        w_1[:split_index, task] = w[:split_index, task]
        w_2[split_index:, task] = w[split_index:, task]

    # Keep only rows that have at least one non-zero weight in each half.
    rows_1 = w_1.any(axis=1)
    dataset_1 = NumpyDataset(X[rows_1], y[rows_1], w_1[rows_1], ids[rows_1])

    rows_2 = w_2.any(axis=1)
    dataset_2 = NumpyDataset(X[rows_2], y[rows_2], w_2[rows_2], ids[rows_2])

    return dataset_1, dataset_2
def train_valid_test_split(self, dataset, frac_train=.8, frac_valid=.1, frac_test=.1):
    """Split the tasks (label columns) of a dataset into train/valid/test.

    Every returned dataset shares the same ``X`` and ``ids``; only the
    columns of ``y`` and ``w`` differ. Whatever is left after the train and
    valid columns goes to test.

    Parameters
    ----------
    dataset: dc.data.Dataset
        Dataset to be split
    frac_train: float, optional
        Proportion of tasks to be put into train. Rounded to nearest int.
    frac_valid: float, optional
        Proportion of tasks to be put into valid. Rounded to nearest int.
    frac_test: float, optional
        Proportion of tasks to be put into test. Only used to check that
        the three fractions sum to 1.

    Returns
    -------
    tuple
        ``(train_dataset, valid_dataset, test_dataset)``
    """
    np.testing.assert_almost_equal(frac_train + frac_valid + frac_test, 1)

    n_tasks = len(dataset.get_task_names())
    train_end = int(np.round(frac_train * n_tasks))
    valid_end = train_end + int(np.round(frac_valid * n_tasks))

    X, y, w, ids = dataset.X, dataset.y, dataset.w, dataset.ids

    # Column ranges for train, valid and test, in that order.
    task_slices = (
        slice(None, train_end),
        slice(train_end, valid_end),
        slice(valid_end, None),
    )
    train_dataset, valid_dataset, test_dataset = (
        NumpyDataset(X, y[:, cols], w[:, cols], ids) for cols in task_slices)
    return train_dataset, valid_dataset, test_dataset
def in_silico_mutagenesis(model: Model, encoded_sequences: np.ndarray) -> np.ndarray:
    """Computes in-silico-mutagenesis scores

    For every sequence, each position is set in turn to each letter, the
    model is re-run on all of these mutants, and the score is the wild-type
    prediction minus the mutant prediction.

    Parameters
    ----------
    model: Model
        This can be any model that accepts inputs of the required shape and
        produces an output of shape `(N_sequences, N_tasks)`.
    encoded_sequences: np.ndarray
        A numpy array of shape `(N_sequences, N_letters, sequence_length, 1)`

    Returns
    -------
    np.ndarray
        A numpy array of ISM scores. The shape is
        `(num_task, N_sequences, N_letters, sequence_length, 1)`.
    """
    # Shape (N_sequences, num_tasks)
    baseline = model.predict(NumpyDataset(encoded_sequences))
    # Type-narrowing check: the code below relies on ndarray attributes.
    assert isinstance(baseline, np.ndarray)
    num_tasks = baseline.shape[1]

    # Shape (N_sequences, N_letters, sequence_length, 1, num_tasks)
    scores = np.empty(encoded_sequences.shape + (num_tasks,), dtype=np.float32)

    # Shape (N_sequences, 1, 1, 1, num_tasks): the inserted axes let each
    # per-sequence prediction broadcast against the reshaped mutant
    # predictions below.
    baseline = baseline[:, np.newaxis, np.newaxis, np.newaxis]

    for seq_idx, (seq, baseline_pred) in enumerate(
            zip(encoded_sequences, baseline)):
        n_letters, seq_len = seq.shape[0], seq.shape[1]

        # One copy of the sequence per (letter, position) mutant.
        # Shape (N_letters * sequence_length, N_letters, sequence_length, 1)
        mutants = np.repeat(seq[np.newaxis], np.prod(seq.shape), axis=0)

        # Index of each mutant copy; length N_letters * sequence_length.
        mutant_ids = np.arange(len(mutants))
        # Position mutated in each copy: 0..sequence_length-1, repeated
        # once per letter.
        positions = np.tile(np.arange(seq_len), n_letters)
        # Remove the wild-type letter at the mutated position.
        mutants[mutant_ids, :, positions, :] = 0

        # Letter written in each copy: each letter repeated
        # sequence_length times.
        letters = np.repeat(np.arange(n_letters), seq_len)
        mutants[mutant_ids, letters, positions, :] = 1

        # Predict on all mutants of this sequence at once.
        mutant_preds = model.predict(NumpyDataset(mutants))
        # Same type-narrowing check, this time for the mutant predictions.
        assert isinstance(mutant_preds, np.ndarray)
        mutant_preds = mutant_preds.reshape(seq.shape + (num_tasks,))

        scores[seq_idx] = baseline_pred - mutant_preds

    # Move the task axis to the front.
    return np.rollaxis(scores, -1)
def test_compute_model_performance_multitask_regressor(self):
    """Fit two regression heads on constant targets and check the MAE.

    Each head has its own label fed through a Databag; the per-task mean
    absolute errors must be within 1.0 of zero.
    """
    random_seed = 42
    n_data_points = 20
    n_features = 2
    np.random.seed(seed=random_seed)

    X = np.random.rand(n_data_points, n_features)
    # Constant targets of shape (n_data_points, 1): +0.5 and -0.5.
    y1 = np.expand_dims(np.array([0.5] * n_data_points), axis=-1)
    y2 = np.expand_dims(np.array([-0.5] * n_data_points), axis=-1)
    X = NumpyDataset(X)
    ys = [NumpyDataset(y1), NumpyDataset(y2)]

    databag = Databag()
    features = Feature(shape=(None, n_features))
    databag.add_dataset(features, X)

    outputs, losses, labels = [], [], []
    for task_targets in ys:
        label = Label(shape=(None, 1))
        dense = Dense(out_channels=1, in_layers=[features])
        loss = ReduceSquareDifference(in_layers=[dense, label])

        outputs.append(dense)
        losses.append(loss)
        labels.append(label)
        databag.add_dataset(label, task_targets)

    total_loss = ReduceMean(in_layers=losses)

    tg = dc.models.TensorGraph(
        mode="regression",
        batch_size=20,
        random_seed=random_seed,
        learning_rate=0.1)
    for output in outputs:
        tg.add_output(output)
    tg.set_loss(total_loss)

    train_batches = databag.iterbatches(
        epochs=1000, batch_size=tg.batch_size, pad_batches=True)
    tg.fit_generator(train_batches)

    metric = [
        dc.metrics.Metric(
            dc.metrics.mean_absolute_error, np.mean, mode="regression"),
    ]
    scores = tg.evaluate_generator(
        databag.iterbatches(), metric, labels=labels, per_task_metrics=True)
    # The second element of the result holds the per-task values.
    scores = list(scores[1].values())
    assert_true(np.all(np.isclose(scores, [0.0, 0.0], atol=1.0)))
def test_compute_model_performance_multitask_classifier(self):
    """Fit two softmax heads on separable data and check the ROC AUC.

    The first half of the inputs is all -1 and the second half all +1; the
    two tasks use opposite one-hot labelings of those halves.
    """
    n_data_points = 20
    n_features = 2
    half = n_data_points // 2

    negatives = np.ones(shape=(half, n_features)) * -1
    positives = np.ones(shape=(half, n_features))
    X = np.concatenate((negatives, positives))

    class_1 = np.array([[0.0, 1.0]] * half)
    class_0 = np.array([[1.0, 0.0]] * half)
    y1 = np.concatenate((class_0, class_1))
    y2 = np.concatenate((class_1, class_0))
    X = NumpyDataset(X)
    ys = [NumpyDataset(y1), NumpyDataset(y2)]

    databag = Databag()
    features = Feature(shape=(None, n_features))
    databag.add_dataset(features, X)

    outputs, entropies, labels = [], [], []
    for task_targets in ys:
        label = Label(shape=(None, 2))
        labels.append(label)
        dense = Dense(out_channels=2, in_layers=[features])
        output = SoftMax(in_layers=[dense])
        smce = SoftMaxCrossEntropy(in_layers=[label, dense])

        entropies.append(smce)
        outputs.append(output)
        databag.add_dataset(label, task_targets)

    total_loss = ReduceMean(in_layers=entropies)

    tg = dc.models.TensorGraph(learning_rate=0.1)
    for output in outputs:
        tg.add_output(output)
    tg.set_loss(total_loss)

    train_batches = databag.iterbatches(
        epochs=1000, batch_size=tg.batch_size, pad_batches=True)
    tg.fit_generator(train_batches)

    metric = dc.metrics.Metric(
        dc.metrics.roc_auc_score, np.mean, mode="classification")
    scores = tg.evaluate_generator(
        databag.iterbatches(), [metric], labels=labels, per_task_metrics=True)
    scores = list(scores[1].values())
    # Loosening atol to see if tests stop failing sporadically
    assert_true(np.all(np.isclose(scores, [1.0, 1.0], atol=0.20)))
def in_silico_mutagenesis(model, X):
    """Computes in-silico-mutagenesis scores

    For every sequence, each position is set in turn to each letter, the
    model is re-run on all of these mutants, and the score is the wild-type
    prediction minus the mutant prediction.

    Parameters
    ----------
    model: TensorGraph
        Currently only SequenceDNN will work, but other models may be added.
        Must expose ``num_tasks`` and ``predict``.
    X: ndarray
        Shape (N_sequences, N_letters, sequence_length, 1)

    Returns
    -------
    (num_task, N_sequences, N_letters, sequence_length, 1) ISM score array.
    """
    n_tasks = model.num_tasks
    # Shape (N_sequences, N_letters, sequence_length, 1, num_tasks)
    ism_scores = np.empty(X.shape + (n_tasks,), dtype=np.float32)

    # Shape (N_sequences, num_tasks)
    reference = model.predict(NumpyDataset(X))
    # Shape (N_sequences, 1, 1, 1, num_tasks) so that each per-sequence
    # prediction broadcasts against the reshaped mutant predictions.
    reference = reference[:, np.newaxis, np.newaxis, np.newaxis]

    for index, (sequence, reference_pred) in enumerate(zip(X, reference)):
        letter_count = sequence.shape[0]
        length = sequence.shape[1]

        # One copy of the sequence per (letter, position) pair.
        # Shape (N_letters * sequence_length, N_letters, sequence_length, 1)
        variants = np.repeat(
            sequence[np.newaxis], np.prod(sequence.shape), axis=0)

        # len(rows) = N_letters * sequence_length
        rows = np.arange(len(variants))
        # Position touched in each copy: 0..length-1 repeated per letter.
        columns = np.tile(np.arange(length), letter_count)
        # Remove the wild-type letter at the touched position.
        variants[rows, :, columns, :] = 0

        # Letter written in each copy: each letter repeated `length` times.
        chosen_letters = np.repeat(np.arange(letter_count), length)
        variants[rows, chosen_letters, columns, :] = 1

        # Predict on all mutants of this sequence at once.
        variant_preds = model.predict(NumpyDataset(variants))
        variant_preds = variant_preds.reshape(sequence.shape + (n_tasks,))

        ism_scores[index] = reference_pred - variant_preds

    # Move the task axis to the front.
    return np.rollaxis(ism_scores, -1)
def sample(self, n_graphs: int = 100) -> NumpyDataset:
    """Samples graphs

    For a ``'graph'`` task one integer label is drawn per graph and added
    to the node, edge and extra features. For a ``'node'`` task one label is
    drawn per node, added to the node features only, and stored on the
    graph as the ``y`` keyword.

    Parameters
    ----------
    n_graphs: int, default 100
        Number of graphs to generate

    Returns
    -------
    graphs: NumpyDataset
        Generated Graphs. For the ``'graph'`` task the dataset also carries
        the graph labels as ``y``.
    """
    graphs, labels = [], []
    for _ in range(n_graphs):
        n_nodes = random.randint(self.min_nodes, self.max_nodes)
        edge_index = generate_edge_index(n_nodes, self.avg_degree)
        n_edges = edge_index.shape[1]

        if self.task == 'graph':
            # random.randint includes both end points.
            graph_label = random.randint(0, self.n_classes - 1)
            node_features = np.random.rand(
                n_nodes, self.n_node_features) + graph_label
            edge_features = np.random.rand(
                n_edges, self.n_edge_features) + graph_label
            kwargs = {}
            for feature_name, feature_shape in self.kwargs.items():
                kwargs[feature_name] = np.random.rand(
                    1, feature_shape) + graph_label
            labels.append(graph_label)

        elif self.task == 'node':
            # np.random.randint excludes the upper bound, so pass
            # n_classes itself to draw labels from 0..n_classes-1, the same
            # range the graph task uses.
            node_label = np.random.randint(
                0, self.n_classes, n_nodes).astype(np.float64)
            node_features = np.random.rand(
                n_nodes, self.n_node_features) + node_label.reshape(-1, 1)
            # For a node-prediction task, label is not added to edge
            # features and other global features because label here is a
            # node-level attribute and not a graph-level attribute
            edge_features = np.random.rand(n_edges, self.n_edge_features)
            kwargs = {}
            for feature_name, feature_shape in self.kwargs.items():
                kwargs[feature_name] = np.random.rand(1, feature_shape)
            kwargs['y'] = node_label

        graph = GraphData(node_features, edge_index, edge_features, **kwargs)
        graphs.append(graph)

    if self.task == 'graph':
        graph_dataset = NumpyDataset(X=np.array(graphs), y=np.array(labels))
    elif self.task == 'node':
        # In this case, the 'y' attribute of GraphData will contain the
        # node-level labels.
        graph_dataset = NumpyDataset(X=np.array(graphs))
    return graph_dataset
def test_compute_model_performance_singletask_regressor_ordering(self):
    """Fit y = x + 1 and evaluate with a batch size of one.

    Training uses the model's batch size while evaluation iterates one
    sample at a time; the reported scores must be within 0.5 of zero.
    """
    n_data_points = 1000
    n_features = 1

    # Inputs 0..n-1 as a column, targets shifted by one.
    X = np.expand_dims(np.array(range(n_data_points)), axis=-1)
    y1 = X + 1
    X = NumpyDataset(X)
    ys = [NumpyDataset(y1)]

    databag = Databag()
    features = Feature(shape=(None, n_features))
    databag.add_dataset(features, X)

    outputs, losses, labels = [], [], []
    for task_targets in ys:
        label = Label(shape=(None, 1))
        dense = Dense(out_channels=1, in_layers=[features])
        loss = ReduceSquareDifference(in_layers=[dense, label])

        outputs.append(dense)
        losses.append(loss)
        labels.append(label)
        databag.add_dataset(label, task_targets)

    total_loss = ReduceMean(in_layers=losses)

    tg = dc.models.TensorGraph(mode="regression", learning_rate=0.1)
    for output in outputs:
        tg.add_output(output)
    tg.set_loss(total_loss)

    train_batches = databag.iterbatches(
        epochs=1000, batch_size=tg.batch_size, pad_batches=True)
    tg.fit_generator(train_batches)

    metric = [
        dc.metrics.Metric(
            dc.metrics.mean_absolute_error, np.mean, mode="regression"),
        dc.metrics.Metric(dc.metrics.pearson_r2_score, mode="regression"),
    ]
    scores = tg.evaluate_generator(
        databag.iterbatches(batch_size=1),
        metric,
        labels=labels,
        per_task_metrics=True)
    print(scores)
    scores = list(scores[1].values())
    assert_true(np.all(np.isclose(scores, [0.0], atol=0.5)))
def test_compute_model_performance_multitask_regressor(self):
    """Fit one two-output dense layer on constant targets and check the MAE.

    The two target columns are +0.5 and -0.5; the per-task mean absolute
    errors must be within 1.0 of zero.
    """
    random_seed = 42
    n_data_points = 20
    n_features = 2
    n_tasks = 2
    np.random.seed(seed=random_seed)

    X = np.random.rand(n_data_points, n_features)
    task_columns = [
        np.array([0.5] * n_data_points),
        np.array([-0.5] * n_data_points),
    ]
    y = np.stack(task_columns, axis=1)
    dataset = NumpyDataset(X, y)

    features = Feature(shape=(None, n_features))
    label = Label(shape=(None, n_tasks))
    dense = Dense(out_channels=n_tasks, in_layers=[features])
    loss = ReduceSquareDifference(in_layers=[dense, label])

    tg = dc.models.TensorGraph(random_seed=random_seed, learning_rate=0.1)
    tg.add_output(dense)
    tg.set_loss(loss)
    tg.fit(dataset, nb_epoch=1000)

    metric = [
        dc.metrics.Metric(
            dc.metrics.mean_absolute_error, np.mean, mode="regression"),
    ]
    scores = tg.evaluate_generator(
        tg.default_generator(dataset),
        metric,
        labels=[label],
        per_task_metrics=True)
    scores = list(scores[1].values())
    assert_true(np.all(np.isclose(scores, [0.0, 0.0], atol=1.0)))
def test_compute_model_performance_singletask_classifier(self):
    """Fit a softmax classifier on separable data and check the ROC AUC.

    The first half of the inputs is all -1 (class 0) and the second half
    all +1 (class 1); labels are one-hot encoded.
    """
    n_data_points = 20
    n_features = 10
    half = int(n_data_points / 2)

    negatives = np.ones(shape=(half, n_features)) * -1
    positives = np.ones(shape=(half, n_features))
    X = np.concatenate((negatives, positives))

    class_1 = np.array([[0.0, 1.0]] * half)
    class_0 = np.array([[1.0, 0.0]] * half)
    y = np.concatenate((class_0, class_1))
    dataset = NumpyDataset(X, y)

    features = Feature(shape=(None, n_features))
    label = Label(shape=(None, 2))
    dense = Dense(out_channels=2, in_layers=[features])
    output = SoftMax(in_layers=[dense])
    smce = SoftMaxCrossEntropy(in_layers=[label, dense])
    total_loss = ReduceMean(in_layers=smce)

    tg = dc.models.TensorGraph(learning_rate=0.1)
    tg.add_output(output)
    tg.set_loss(total_loss)
    tg.fit(dataset, nb_epoch=1000)

    metric = dc.metrics.Metric(
        dc.metrics.roc_auc_score, np.mean, mode="classification")
    scores = tg.evaluate_generator(
        tg.default_generator(dataset),
        [metric],
        labels=[label],
        per_task_metrics=True)
    scores = list(scores[1].values())
    assert_true(np.isclose(scores, [1.0], atol=0.05))
def test_neighbor_list_simple(self):
    """Test that neighbor lists can be constructed."""
    N_atoms = 10
    start, stop = 0, 12
    nbr_cutoff = 3
    ndim = 3
    M = 6

    X = np.random.rand(N_atoms, ndim)
    y = np.random.rand(N_atoms, 1)
    # The dataset and label layer are built but not fed to the graph here.
    dataset = NumpyDataset(X, y)

    features = Feature(shape=(N_atoms, ndim))
    labels = Label(shape=(N_atoms, ))
    neighbors = NeighborList(
        N_atoms, M, ndim, nbr_cutoff, start, stop, in_layers=[features])
    neighbors = ToFloat(in_layers=[neighbors])

    # This isn't a meaningful loss, but just for test
    loss = ReduceSum(in_layers=[neighbors])

    tg = dc.models.TensorGraph(use_queue=False)
    tg.add_output(neighbors)
    tg.set_loss(loss)
    tg.build()
def get_task_dataset_minus_support(self, support, task):
    """Gets data for specified task, minus support points.

    Useful for evaluating model performance once trained (so that
    test compounds can be ensured distinct from support.)

    Parameters
    ----------
    support: dc.data.Dataset
        The support dataset
    task: int
        Task number of task to select.

    Returns
    -------
    NumpyDataset
        The rows of the task's dataset whose ids do not occur in
        ``support.ids``.
    """
    dataset = self.__getitem__(task)
    excluded_ids = set(support.ids)

    # Collect the positions of every row that is not a support point.
    keep = []
    for position in range(len(dataset)):
        if dataset.ids[position] not in excluded_ids:
            keep.append(position)

    return NumpyDataset(
        dataset.X[keep],
        dataset.y[keep, :],
        dataset.w[keep, :],
        dataset.ids[keep])
def k_fold_split(self, dataset, K):
    """Performs a K-fold split of the tasks for dataset.

    Every fold shares the same ``X`` and ``ids`` and receives a contiguous
    range of task columns. The per-fold size is ``n_tasks / K`` rounded
    with ``np.round``; the last fold takes every column that remains.

    Parameters
    ----------
    dataset: dc.data.Dataset
        Dataset to be split
    K: int
        Number of splits to be made

    Returns
    -------
    list
        ``K`` NumpyDataset objects, one per fold.
    """
    n_tasks = len(dataset.get_task_names())
    n_per_fold = int(np.round(n_tasks / float(K)))
    if K * n_per_fold != n_tasks:
        print("Assigning extra tasks to last fold due to uneven split")

    X, y, w, ids = dataset.X, dataset.y, dataset.w, dataset.ids

    last_fold = K - 1
    fold_datasets = []
    for fold in range(K):
        first_task = fold * n_per_fold
        # The final fold runs to the end of the task axis.
        end_task = n_tasks if fold == last_fold else first_task + n_per_fold
        fold_tasks = range(first_task, end_task)
        fold_datasets.append(
            NumpyDataset(X, y[:, fold_tasks], w[:, fold_tasks], ids))
    return fold_datasets
def test_tensorboard(self):
    """Train with tensorboard logging on and check an event file appears.

    The model directory must contain at least one file whose name starts
    with "events", and the first such file must be non-empty.
    """
    n_data_points = 20
    n_features = 2

    X = np.random.rand(n_data_points, n_features)
    y = [[0, 1] for _ in range(n_data_points)]
    dataset = NumpyDataset(X, y)

    features = Feature(shape=(None, n_features))
    dense = Dense(out_channels=2, in_layers=[features])
    output = SoftMax(in_layers=[dense])
    label = Label(shape=(None, 2))
    smce = SoftMaxCrossEntropy(in_layers=[label, dense])
    loss = ReduceMean(in_layers=[smce])

    tg = dc.models.TensorGraph(
        tensorboard=True,
        tensorboard_log_frequency=1,
        learning_rate=0.01,
        model_dir='/tmp/tensorgraph')
    tg.add_output(output)
    tg.set_loss(loss)
    tg.fit(dataset, nb_epoch=1000)

    event_files = [
        name for name in os.listdir(tg.model_dir)
        if name.startswith("events")
    ]
    assert_true(len(event_files) > 0)

    first_event_path = os.path.join(tg.model_dir, event_files[0])
    file_size = os.stat(first_event_path).st_size
    assert_true(file_size > 0)
def test_neighbor_list_vina(self):
    """Test under conditions closer to Vina usage."""
    N_atoms = 5
    M_nbrs = 2
    ndim = 3
    start, stop = 0, 4
    nbr_cutoff = 1

    # Random coordinates scaled into [start, stop).
    coordinates = start + np.random.rand(N_atoms, ndim) * (stop - start)
    X = NumpyDataset(coordinates)

    coords = Feature(shape=(N_atoms, ndim))

    # Now an (N, M) shape
    neighbors = NeighborList(
        N_atoms, M_nbrs, ndim, nbr_cutoff, start, stop, in_layers=[coords])
    neighbors = ToFloat(in_layers=[neighbors])
    flattened = Flatten(in_layers=[neighbors])
    dense = Dense(out_channels=1, in_layers=[flattened])
    output = ReduceSum(in_layers=[dense])

    tg = dc.models.TensorGraph(learning_rate=0.1, use_queue=False)
    tg.set_loss(output)

    databag = Databag({coords: X})
    tg.fit_generator(databag.iterbatches(epochs=1))
def test_save_load(self):
    """Save a fitted model, move its directory, reload it and compare.

    Predictions from the reloaded model must match the original ones to
    within 0.01.
    """
    n_data_points = 20
    n_features = 2

    X = np.random.rand(n_data_points, n_features)
    y = [[0, 1] for _ in range(n_data_points)]
    dataset = NumpyDataset(X, y)

    features = Feature(shape=(None, n_features))
    dense = Dense(out_channels=2, in_layers=[features])
    output = SoftMax(in_layers=[dense])
    label = Label(shape=(None, 2))
    smce = SoftMaxCrossEntropy(in_layers=[label, dense])
    loss = ReduceMean(in_layers=[smce])

    tg = dc.models.TensorGraph(learning_rate=0.01)
    tg.add_output(output)
    tg.set_loss(loss)

    # A submodel is registered before fitting; it is not used directly.
    submodel_loss = ReduceSum(in_layers=smce)
    submodel_opt = Adam(learning_rate=0.002)
    submodel = tg.create_submodel(
        layers=[dense], loss=submodel_loss, optimizer=submodel_opt)

    tg.fit(dataset, nb_epoch=1)
    expected = np.squeeze(tg.predict_on_batch(X))
    tg.save()

    # Reserve a fresh path, delete the placeholder directory, then move the
    # saved model directory to that path.
    dirpath = tempfile.mkdtemp()
    shutil.rmtree(dirpath)
    shutil.move(tg.model_dir, dirpath)

    reloaded = TensorGraph.load_from_dir(dirpath)
    actual = np.squeeze(reloaded.predict_on_batch(X))
    assert_true(np.all(np.isclose(expected, actual, atol=0.01)))
def test_set_optimizer(self):
    """Fit with a custom optimizer, save, reload and compare predictions.

    The optimizer is gradient descent with an exponentially decaying
    learning rate; predictions before and after reloading must match to
    within 0.01.
    """
    n_data_points = 20
    n_features = 2

    X = np.random.rand(n_data_points, n_features)
    y = [[0, 1] for _ in range(n_data_points)]
    dataset = NumpyDataset(X, y)

    features = Feature(shape=(None, n_features))
    dense = Dense(out_channels=2, in_layers=[features])
    output = SoftMax(in_layers=[dense])
    label = Label(shape=(None, 2))
    smce = SoftMaxCrossEntropy(in_layers=[label, dense])
    loss = ReduceMean(in_layers=[smce])

    tg = dc.models.TensorGraph(learning_rate=0.01, use_queue=False)
    tg.add_output(output)
    tg.set_loss(loss)

    # The returned value is not used below; the call itself is kept.
    global_step = tg.get_global_step()
    decaying_rate = ExponentialDecay(
        initial_rate=0.1, decay_rate=0.96, decay_steps=100000)
    tg.set_optimizer(GradientDescent(learning_rate=decaying_rate))

    tg.fit(dataset, nb_epoch=1000)
    expected = np.squeeze(tg.predict_on_batch(X))
    tg.save()

    reloaded = TensorGraph.load_from_dir(tg.model_dir)
    actual = np.squeeze(reloaded.predict_on_batch(X))
    assert_true(np.all(np.isclose(expected, actual, atol=0.01)))
def test_compute_model_performance_singletask_classifier():
    """Computes model performance on singletask dataset with one-hot label encoding."""
    n_data_points = 20
    n_features = 10
    half = int(n_data_points / 2)

    # First half of the inputs is all -1 (class 0), second half all +1
    # (class 1).
    negatives = np.ones(shape=(half, n_features)) * -1
    positives = np.ones(shape=(half, n_features))
    X = np.concatenate((negatives, positives))

    class_1 = np.array([[0.0, 1.0]] * half)
    class_0 = np.array([[1.0, 0.0]] * half)
    y = np.concatenate((class_0, class_1))
    dataset = NumpyDataset(X, y)

    features = layers.Input(shape=(n_features, ))
    dense = layers.Dense(2)(features)
    output = layers.Softmax()(dense)
    keras_model = tf.keras.Model(inputs=features, outputs=[output])
    model = dc.models.KerasModel(
        keras_model, dc.models.losses.SoftmaxCrossEntropy(), learning_rate=0.1)

    model.fit(dataset, nb_epoch=1000)

    metric = dc.metrics.Metric(
        dc.metrics.roc_auc_score, np.mean, mode="classification", n_tasks=1)
    scores = model.evaluate_generator(
        model.default_generator(dataset), [metric], per_task_metrics=True)
    scores = list(scores[1].values())
    assert np.isclose(scores, [1.0], atol=0.05)
def test_compute_model_performance_multitask_regressor():
    """Fit a two-output Keras dense layer on constant targets and check MAE.

    The two target columns are +0.5 and -0.5; the per-task mean absolute
    errors must be within 1.0 of zero.
    """
    random_seed = 42
    n_data_points = 20
    n_features = 2
    n_tasks = 2
    np.random.seed(seed=random_seed)

    X = np.random.rand(n_data_points, n_features)
    task_columns = [
        np.array([0.5] * n_data_points),
        np.array([-0.5] * n_data_points),
    ]
    y = np.stack(task_columns, axis=1)
    dataset = NumpyDataset(X, y)

    features = layers.Input(shape=(n_features, ))
    dense = layers.Dense(n_tasks)(features)
    keras_model = tf.keras.Model(inputs=features, outputs=[dense])
    model = dc.models.KerasModel(
        keras_model, dc.models.losses.L2Loss(), learning_rate=0.1)

    model.fit(dataset, nb_epoch=1000)

    metric = [
        dc.metrics.Metric(
            dc.metrics.mean_absolute_error, np.mean, mode="regression"),
    ]
    scores = model.evaluate_generator(
        model.default_generator(dataset), metric, per_task_metrics=True)
    scores = list(scores[1].values())
    assert np.all(np.isclose(scores, [0.0, 0.0], atol=1.0))
def predict_on_batch(
        self,
        X: ArrayLike,
        transformers: List[Transformer] = [],
        outputs: Optional[OneOrMany[tf.Tensor]] = None) -> OneOrMany[np.ndarray]:
    """Generates predictions for input samples, processing samples in a batch.

    The inputs are wrapped in a label-free NumpyDataset and handed to
    ``self.predict``.

    Parameters
    ----------
    X: ndarray
        the input data, as a Numpy array.
    transformers: list of dc.trans.Transformers
        Transformers that the input data has been transformed by. The output
        is passed through these transformers to undo the transformations.
    outputs: Tensor or list of Tensors
        The outputs to return. If this is None, the model's standard
        prediction outputs will be returned. Alternatively one or more
        Tensors within the model may be specified, in which case the output
        of those Tensors will be returned.

    Returns
    -------
    a NumPy array if the model produces a single output, or a list of arrays
    if it produces multiple outputs
    """
    return self.predict(NumpyDataset(X=X, y=None), transformers, outputs)
def predict_uncertainty_on_batch(self, X: Sequence, masks: int = 50
                                 ) -> OneOrMany[Tuple[np.ndarray, np.ndarray]]:
    """Predict the model's outputs, along with the uncertainty in each one.

    The uncertainty is computed as described in
    https://arxiv.org/abs/1703.04977. It involves repeating the prediction
    many times with different dropout masks. The prediction is computed as
    the average over all the predictions. The uncertainty includes both the
    variation among the predicted values (epistemic uncertainty) and the
    model's own estimates for how well it fits the data (aleatoric
    uncertainty). Not all models support uncertainty prediction.

    This method wraps ``X`` in a label-free NumpyDataset and delegates to
    ``self.predict_uncertainty``.

    Parameters
    ----------
    X: ndarray
        the input data, as a Numpy array.
    masks: int
        the number of dropout masks to average over

    Returns
    -------
    for each output, a tuple (y_pred, y_std) where y_pred is the predicted
    value of the output, and each element of y_std estimates the standard
    deviation of the corresponding element of y_pred
    """
    return self.predict_uncertainty(NumpyDataset(X=X, y=None), masks)
def fit_on_batch(self, X, y, w, variables=None, loss=None, callbacks=[]):
    """Perform a single step of training.

    The model is built first if it has not been built yet; the batch is
    then wrapped in a NumpyDataset and fitted for one epoch.

    Parameters
    ----------
    X: ndarray
        the inputs for the batch
    y: ndarray
        the labels for the batch
    w: ndarray
        the weights for the batch
    variables: list of tf.Variable
        the variables to train. If None (the default), all trainable
        variables in the model are used.
    loss: function
        a function of the form f(outputs, labels, weights) that computes the
        loss for each batch. If None (the default), the model's standard
        loss function is used.
    callbacks: function or list of functions
        one or more functions of the form f(model, step) that will be
        invoked after every step. This can be used to perform validation,
        logging, etc.

    Returns
    -------
    Whatever ``self.fit`` returns for the single-epoch run.
    """
    if not self.built:
        self.build()

    fit_options = dict(
        nb_epoch=1, variables=variables, loss=loss, callbacks=callbacks)
    return self.fit(NumpyDataset(X, y, w), **fit_options)
def get_dataset(mode='classification', featurizer='GraphConv', num_tasks=2):
    """Build a small dataset with random labels for model tests.

    The first 20 featurized training samples from the loader are paired
    with random labels and unit weights. Extra "random_task" entries are
    appended to the loader's task list until ``num_tasks - 1`` have been
    added.

    Parameters
    ----------
    mode: str
        'classification' loads the BACE classification data and draws
        integer labels from {0, 1}; any other value loads Delaney and draws
        normally distributed labels.
    featurizer: str
        Featurizer name forwarded to the loader.
    num_tasks: int
        ``num_tasks - 1`` extra task names are appended.

    Returns
    -------
    tuple
        ``(tasks, dataset, transformers, metric)``
    """
    data_points = 20
    is_classification = mode == 'classification'

    loader = load_bace_classification if is_classification else load_delaney
    tasks, all_dataset, transformers = loader(featurizer, reload=False)
    train, _valid, _test = all_dataset

    for _ in range(1, num_tasks):
        tasks.append("random_task")

    label_shape = (data_points, len(tasks))
    w = np.ones(shape=label_shape)

    if is_classification:
        y = np.random.randint(0, 2, size=label_shape)
        metric = dc.metrics.Metric(
            dc.metrics.roc_auc_score, np.mean, mode="classification")
    else:
        y = np.random.normal(size=label_shape)
        metric = dc.metrics.Metric(
            dc.metrics.mean_absolute_error, mode="regression")

    ds = NumpyDataset(train.X[:data_points], y, w, train.ids[:data_points])
    return tasks, ds, transformers, metric
def score(self, protein_file, ligand_file):
    """Returns a score for a protein/ligand pair.

    The pair is featurized with ``self.featurizer`` (ligand first, protein
    second) and the features are run through ``self.model.predict``.
    """
    features = self.featurizer.featurize_complexes([ligand_file],
                                                   [protein_file])
    return self.model.predict(
        NumpyDataset(X=features, y=None, w=None, ids=None))
def get_task_dataset_minus_support(dataset, support, task):
    """Gets data for specified task, minus support points.

    Useful for evaluating model performance once trained (so that
    test compounds can be ensured distinct from support.)

    Parameters
    ----------
    dataset: dc.data.Dataset
        Source dataset.
    support: dc.data.Dataset
        The support dataset
    task: int
        Task number of task to select.

    Returns
    -------
    NumpyDataset
        Rows that are not in the support set and have a non-zero weight for
        ``task``, with ``y`` and ``w`` restricted to that task's column.
    """
    excluded_ids = set(support.ids)

    # Positions of all rows that are not support points.
    keep = []
    for position in range(len(dataset)):
        if dataset.ids[position] not in excluded_ids:
            keep.append(position)

    # Remove support indices
    X = dataset.X[keep]
    y = dataset.y[keep]
    w = dataset.w[keep]
    ids = dataset.ids[keep]

    # Get task specific entries: rows with a non-zero weight for this task.
    has_weight = w[:, task] != 0
    return NumpyDataset(
        X[has_weight], y[has_weight, task], w[has_weight, task],
        ids[has_weight])
def test_atomic_conv_variable(self):
    """A simple test that initializes and fits an AtomicConvModel on variable input size."""
    # The two fragments deliberately have different numbers of atoms.
    frag1_num_atoms = 1000
    frag2_num_atoms = 1200
    complex_num_atoms = frag1_num_atoms + frag2_num_atoms
    batch_size = 1
    atomic_convnet = atomic_conv.AtomicConvModel(
        batch_size=batch_size,
        frag1_num_atoms=frag1_num_atoms,
        frag2_num_atoms=frag2_num_atoms,
        complex_num_atoms=complex_num_atoms)

    # Creates a set of dummy features that contain the coordinate and
    # neighbor-list features required by the AtomicConvModel.
    features = []
    frag1_coords = np.random.rand(frag1_num_atoms, 3)
    frag1_nbr_list = {i: [] for i in range(frag1_num_atoms)}
    frag1_z = np.random.randint(10, size=(frag1_num_atoms))
    frag2_coords = np.random.rand(frag2_num_atoms, 3)
    frag2_nbr_list = {i: [] for i in range(frag2_num_atoms)}
    frag2_z = np.random.randint(10, size=(frag2_num_atoms))
    system_coords = np.random.rand(complex_num_atoms, 3)
    system_nbr_list = {i: [] for i in range(complex_num_atoms)}
    system_z = np.random.randint(10, size=(complex_num_atoms))

    features.append(
        (frag1_coords, frag1_nbr_list, frag1_z, frag2_coords, frag2_nbr_list,
         frag2_z, system_coords, system_nbr_list, system_z))
    # The tuple mixes arrays of different shapes with dicts, so the array
    # must be created with dtype=object; recent NumPy versions refuse to
    # build such ragged arrays implicitly.
    features = np.asarray(features, dtype=object)
    labels = np.zeros(batch_size)
    train = NumpyDataset(features, labels)
    atomic_convnet.fit(train, nb_epoch=1)
def test_normalizing_flow():
    """Check log-likelihoods and fitting of a one-layer RealNVP flow model."""
    # A single RealNVP layer with a default two-hidden-layer template.
    real_nvp = tfb.RealNVP(
        num_masked=1,
        shift_and_log_scale_fn=tfb.real_nvp_default_template(
            hidden_layers=[8, 8]))
    flow_layers = [real_nvp]

    # 2D Multivariate Gaussian base distribution
    base = tfd.MultivariateNormalDiag(loc=[0., 0.])
    nf = NormalizingFlow(base_distribution=base, flow_layers=flow_layers)
    nfm = NormalizingFlowModel(nf)

    # Must be float32 for RealNVP
    target_distribution = tfd.MultivariateNormalDiag(loc=[1., 0.])
    dataset = NumpyDataset(X=target_distribution.sample(96))

    # Tests a simple flow of one RealNVP layer.
    X = nfm.flow.sample()
    x1 = tf.zeros([2])
    x2 = dataset.X[0]

    # log likelihoods should be negative
    for point in (X, x1, x2):
        assert nfm.flow.log_prob(point).numpy() < 0

    # Fit model
    final = nfm.fit(dataset, nb_epoch=5)
    print(final)
    assert final > 0
def predict_mols(self, mols):
    """Predict on molecules from their circular fingerprints.

    Each fingerprint is duplicated along a new axis 1 before prediction;
    the first column of the first prediction output is returned.
    """
    featurizer = CircularFingerprint(
        size=self.n_features, radius=2, chiral=True)
    fingerprints = featurizer.featurize(mols)

    # Insert axis 1 and stack the fingerprint with itself along it.
    single = np.expand_dims(fingerprints, axis=1)
    paired = np.concatenate([single, single], axis=1)

    predictions = self.predict(NumpyDataset(paired, None, None, None))
    return predictions[0][:, 0]
def predict_on_batch(self, X, transformers=[], outputs=None):
    """Predict on a batch and convert the result back from -log10 units.

    The inputs are wrapped in a label-free NumpyDataset, fed through the
    default generator without batch padding, and the raw predictions ``p``
    are returned as ``10 ** -p``.
    """
    generator = self.default_generator(
        NumpyDataset(X, y=None), predict=True, pad_batches=False)
    log_preds = self.predict_on_generator(generator, transformers, outputs)
    # The model is trained on -log10(IC50), so invert that transform.
    return 10**-log_preds
def test_training(self):
    """Check training of the basicMolGANmodel on small number of compounds.

    Due to training instability try a few times and see if it worked at
    least once. Typically it fails between 1-3 times of 10. This is
    something that needs to be addressed in future releases.
    """
    input_file = os.path.join(self.current_dir, "molgan_example.csv")
    data = pd.read_csv(input_file)
    molecules = list(data['Molecule'])

    feat = MolGanFeaturizer()
    featurized = feat.featurize(molecules)
    adjacency_matrices = [x.adjacency_matrix for x in featurized]
    node_feature_sets = [x.node_features for x in featurized]
    dataset = NumpyDataset(adjacency_matrices, node_feature_sets)

    # Set to True on the first successful training attempt.
    success = False

    for _ in range(self.training_attempts):
        # force clear tensor flow backend
        keras_clear_session()
        # create new model
        gan = MolGAN(learning_rate=ExponentialDecay(0.001, 0.9, 5000))

        # to avoid flake8 E125/yapf incompatibility
        s = gan.batch_size

        # generate input
        def iterbatches(epochs):
            for __ in range(epochs):
                for batch in dataset.iterbatches(
                        batch_size=s, pad_batches=True):
                    adjacency_tensor = one_hot(batch[0], gan.edges)
                    node_tensor = one_hot(batch[1], gan.nodes)
                    yield {
                        gan.data_inputs[0]: adjacency_tensor,
                        gan.data_inputs[1]: node_tensor
                    }

        # train model
        gan.fit_gan(
            iterbatches(1000), generator_steps=0.2, checkpoint_interval=0)

        # generate sample
        g = gan.predict_gan_generator(1000)

        # check how many valid molecules were created
        generated_molecules = feat.defeaturize(g)
        valid_molecules_count = sum(
            1 for mol in generated_molecules if mol is not None)
        print(valid_molecules_count)

        if valid_molecules_count:
            success = True
            break

    # finally test if there was at least one valid training session
    # as the model structure improves this should become more and more strict
    assert success