def in_silico_mutagenesis(model: Model,
                          encoded_sequences: np.ndarray) -> np.ndarray:
    """Computes in-silico-mutagenesis (ISM) scores.

    For every input sequence, one mutant is built per (letter, position)
    pair: all letters at that position are zeroed and the chosen letter is
    set to 1. The score of a mutant is the wild-type prediction minus the
    prediction for the mutant.

    Parameters
    ----------
    model: Model
        This can be any model that accepts inputs of the required shape and
        produces an output of shape `(N_sequences, N_tasks)`.
    encoded_sequences: np.ndarray
        A numpy array of shape `(N_sequences, N_letters, sequence_length, 1)`

    Returns
    -------
    np.ndarray
        A numpy array of ISM scores. The shape is
        `(num_task, N_sequences, N_letters, sequence_length, 1)`.
    """
    # Shape (N_sequences, num_tasks)
    wild_type_predictions = model.predict(NumpyDataset(encoded_sequences))
    # The array operations below require the predictions to be an np.ndarray
    assert isinstance(wild_type_predictions, np.ndarray)
    num_tasks = wild_type_predictions.shape[1]

    # Shape (N_sequences, N_letters, sequence_length, 1, num_tasks)
    mutagenesis_scores = np.empty(
        encoded_sequences.shape + (num_tasks,), dtype=np.float32)

    # Shape (N_sequences, 1, 1, 1, num_tasks): the task axis stays last so
    # each row broadcasts against the reshaped mutant predictions.
    wild_type_predictions = wild_type_predictions[:, np.newaxis, np.newaxis,
                                                  np.newaxis]

    # The index arrays depend only on the per-sequence shape
    # (N_letters, sequence_length, 1), so they are built once, not per
    # sequence.
    sequence_shape = encoded_sequences.shape[1:]
    # N_letters * sequence_length mutants per sequence
    num_mutants = np.prod(sequence_shape)
    mutant_index = np.arange(num_mutants)
    # Position mutated by each mutant: 0..sequence_length-1, cycled once per
    # letter
    position_index = np.tile(np.arange(sequence_shape[1]), sequence_shape[0])
    # Letter introduced by each mutant: every letter repeated
    # sequence_length times
    letter_index = np.repeat(np.arange(sequence_shape[0]), sequence_shape[1])

    for sequence_index, (sequence, wild_type_prediction) in enumerate(
            zip(encoded_sequences, wild_type_predictions)):
        # One copy of the sequence per mutant.
        # Shape (N_letters * sequence_length, N_letters, sequence_length, 1)
        mutated_sequences = np.repeat(
            sequence[np.newaxis], num_mutants, axis=0)

        # remove wild-type: clear every letter at the mutated position
        mutated_sequences[mutant_index, :, position_index, :] = 0
        # add mutant: set the substituted letter at that position
        mutated_sequences[mutant_index, letter_index, position_index, :] = 1

        # make mutant predictions
        mutated_predictions = model.predict(NumpyDataset(mutated_sequences))
        # The reshape below requires the predictions to be an np.ndarray
        assert isinstance(mutated_predictions, np.ndarray)
        # Mutants are ordered letter-major, so this reshape gives
        # (N_letters, sequence_length, 1, num_tasks)
        mutated_predictions = mutated_predictions.reshape(
            sequence_shape + (num_tasks,))

        mutagenesis_scores[
            sequence_index] = wild_type_prediction - mutated_predictions

    # Move the task axis to the front:
    # (num_tasks, N_sequences, N_letters, sequence_length, 1)
    return np.moveaxis(mutagenesis_scores, -1, 0)
def predict(self, dataset, transformers=None, batch_size=None,
            pad_batches=False):
    """Uses self to make predictions on provided Dataset object.

    This is overridden to make sure the batch size is always valid for
    Tensorflow. The caller-supplied `batch_size` and `pad_batches` are
    ignored: `self.model_instance.batch_size` and `True` are passed to
    `Model.predict` in their place.

    Parameters:
      dataset: Dataset object to make predictions on.
      transformers: list of transformers forwarded to `Model.predict`.
        Defaults to an empty list.
      batch_size: ignored (kept for interface compatibility).
      pad_batches: ignored (kept for interface compatibility).

    Returns:
      y_pred: numpy ndarray of shape (n_samples,)
    """
    # Use a None sentinel instead of a mutable default so one list object is
    # never shared between calls.
    if transformers is None:
        transformers = []
    return Model.predict(self, dataset, transformers,
                         self.model_instance.batch_size, True)