def _encode_window(chrom, start, input_window):
    """Encode chrom[start:start + input_window] as a single tile.

    A window that begins before position 0 cannot be sliced directly (a
    negative start would wrap around to the end of the chromosome), so only
    the part that exists is encoded and the tile is zero-padded up to
    input_window rows.
    """
    end = start + input_window
    # NOTE(review): np.fromstring's binary mode is deprecated in NumPy
    # (np.frombuffer is the documented replacement). It is kept here because
    # the type returned by `genome` is not visible from this function.
    if start >= 0:
        seq = chrom[start:end].lower()
        return ctcf_strength_gen.encode(np.fromstring(seq, dtype=np.uint8))

    tile = np.zeros((input_window, 4))
    if end > 0:
        seq = chrom[0:end].lower()
        # assumes encode() returns one row of 4 values per base -- TODO confirm
        encoded = np.asarray(ctcf_strength_gen.encode(np.fromstring(seq, dtype=np.uint8)))
        # NOTE(review): the available bases are written at the start of the
        # tile (padding on the right), matching the original slice assignment.
        tile[:len(encoded)] = encoded
    return tile


def localize(row, model, genome, input_window=256, batch_size=32, get_idx=False, verb=False):
    """Find the input_window bp window responsible for a ml prediction in a bed file row.

    The region is covered with two interleaved sets of tiles: one aligned to
    row['start'] and one shifted left by half a window, so that a signal
    straddling a tile boundary is still seen whole by one tile.

    Arguments:
        row -- pd dataframe row with 'chr', 'start' and 'end' entries.
        model -- keras model to make predictions on.
        genome -- genome to pull sequences from (indexed by chromosome name).
    Keywords:
        input_window -- size of window to localize on.
        batch_size -- batch_size accepted by the model.
        get_idx -- also return the genomic start of the winning tile?
        verb -- print output?
    Returns:
        max_tile -- input_window sized sequence that gave maximum prediction from the model.
        max_pred -- prediction value for max_tile.
        max_start -- (only if get_idx) genomic coordinate where max_tile starts.
    Raises:
        ValueError -- if the region is empty, so there are no tiles to score.
    """
    region_start = row['start']
    region_len = row['end'] - region_start
    # number of windows needed to cover the region (rounded up)
    num_tiles = int(region_len / input_window) + (region_len % input_window > 0)
    if verb:
        print(num_tiles)
    if num_tiles <= 0:
        raise ValueError('region is empty; there is nothing to localize')

    chrom = genome[row['chr']]
    half = input_window // 2

    # break the sequence into overlapping tiles, remembering where each starts
    tile_seqs = list()
    tile_starts = list()
    for idx in range(num_tiles):
        aligned_start = region_start + idx * input_window
        for start in (aligned_start - half, aligned_start):
            tile_seqs.append(_encode_window(chrom, start, input_window))
            # a tile clipped at the chromosome start holds bases from position 0
            tile_starts.append(max(start, 0))
    tile_seqs = np.asarray(tile_seqs)

    # get a batch generator
    batches = ctcf_strength_gen.filled_batch(iter(tile_seqs), batch_size=batch_size)

    # figure out where the max prediction is coming from
    preds = list()
    batch_list = list()
    for batch in batches:
        preds.append(model.predict_on_batch(batch))
        batch_list.append(batch)
    # the batches may hold more entries than there are tiles (filled_batch
    # appears to pad the last one), so trim back to the real tiles
    num_real = tile_seqs.shape[0]
    preds = np.asarray(preds).reshape((-1))[:num_real]
    batch_list = np.asarray(batch_list).reshape((-1, input_window, 4))[:num_real]

    # pick the tile with the strongest prediction
    max_idx = np.argmax(preds)
    max_pred = np.max(preds)
    max_tile = batch_list[max_idx]
    if verb:
        print(max_idx)
        print(max_pred)
        print(preds)

    if get_idx:
        return max_tile, max_pred, tile_starts[max_idx]
    return max_tile, max_pred
def varience_plot(self, layer_name, show=False):
    """Plot correlations of neuron activations with themselves and with the peak scores.

    For every sample peak the maximum activation of each neuron in the chosen
    layer is computed; these are then correlated (np.corrcoef) with each other
    and with the 'ml' score, the 'pwm' score and their difference. The heatmap
    is saved to <out_dir>/<layer_name>_corrcoef.png.

    Arguments:
        layer_name -- Layer to get neuron activations from (in layer_dict)
    Keywords:
        show -- show the plot?
    Raises:
        ValueError -- if the batch generator yields no batches.
    """
    # build a function to get activations
    get_activations = K.function(
        [self.model.input, K.learning_phase()],
        [self.layer_dict[layer_name].output, self.model.output])

    # put the sequences into batches.
    g = ctcf_strength_gen.filled_batch(iter(self.sample_peaks['signal_seq']),
                                       batch_size=self.batch_size)

    # get activations for every batch (not a fixed number of batches), then
    # trim to the real number of peaks, the same way get_tsne does.
    num_peaks = self.sample_peaks.shape[0]
    per_batch = list()
    for input_seqs in g:
        # second input is the Keras learning phase flag (0 is presumably test mode)
        a, _ = get_activations([input_seqs, 0])
        # the layer output holds two halves of batch_size rows each; sum them
        # NOTE(review): presumably forward / reverse-complement -- confirm
        both = a[:self.batch_size] + a[self.batch_size:]
        # collapse axis 1 to the per-neuron maximum
        per_batch.append(np.max(both, axis=1))
    if not per_batch:
        raise ValueError('no batches were produced from sample_peaks')
    activations = np.concatenate(per_batch, axis=0)[:num_peaks]

    # each score is repeated into two columns so it is visible in the heatmap
    ml = np.repeat(np.expand_dims(self.sample_peaks['ml'], axis=1), 2, axis=1)
    pwm = np.repeat(np.expand_dims(self.sample_peaks['pwm'], axis=1), 2, axis=1)
    diff = np.repeat(np.expand_dims(self.sample_peaks['pwm'] - self.sample_peaks['ml'], axis=1), 2, axis=1)
    variables = np.append(activations, ml, axis=1)
    variables = np.append(variables, pwm, axis=1)
    variables = np.append(variables, diff, axis=1)

    # columns are the variables, rows are the observations
    corr = np.corrcoef(variables, rowvar=False)

    # plot activation correlations
    plt.title('Correlations for ' + str(layer_name))
    num_neurons = activations.shape[1]
    plt.yticks([num_neurons + 1, num_neurons + 3, num_neurons + 5], ('ml', 'pwm', 'diffs'))
    plt.xticks([])
    plt.imshow(corr, cmap='plasma')
    plt.savefig(os.path.join(self.out_dir, layer_name + '_corrcoef.png'), bbox_inches='tight')
    if show:
        plt.show()
def get_tsne(self, layer_name, show=False):
    """Get a t-SNE visualization of the neuron activations.

    Computes the maximum activation of every neuron in the chosen layer for
    each sample peak, embeds those vectors in 2D with t-SNE, and saves a
    scatter plot (coloured by 'ml', sized by 'ctcf') to
    <out_dir>/<layer_name>_tsne.png.

    Arguments:
        layer_name -- layer to pull activations for (key in layer dict).
    Keywords:
        show -- show the final graph?
    Returns:
        divisions -- the 2D t-SNE coordinates, one row per sample peak.
    """
    # build a function to get neuron activations.
    get_activations = K.function(
        [self.model.input, K.learning_phase()],
        [self.layer_dict[layer_name].output, self.model.output])

    # put the sequences into batches
    g = ctcf_strength_gen.filled_batch(iter(self.sample_peaks['signal_seq']),
                                       batch_size=self.batch_size)

    # get the layer activation for each sequence.
    base_activations = list()
    for input_batch in g:
        # second input is the Keras learning phase flag (0 is presumably test mode)
        activations, _ = get_activations([input_batch, 0])
        # the layer output holds two halves of batch_size rows each; lay them
        # side by side along axis 1. Uses the configured batch size rather
        # than a fixed 32 so other batch sizes split correctly.
        base_activations.append(np.append(activations[:self.batch_size],
                                          activations[self.batch_size:], axis=1))

    # stack the batches and drop the rows beyond the real number of peaks
    base_activations = np.concatenate(base_activations, axis=0)[:self.sample_peaks.shape[0]]
    # take the maximum of each neuron to collapse the feature space to something more reasonable.
    max_activations = np.amax(base_activations, axis=1)

    # build a t-sne model
    tsnemodel = TSNE(n_components=2, random_state=0)
    divisions = tsnemodel.fit_transform(max_activations)

    heatmap = plt.scatter(divisions[:, 0], divisions[:, 1],
                          c=self.sample_peaks['ml'],
                          s=10 + self.sample_peaks['ctcf'] * 10,
                          cmap='plasma')
    plt.colorbar(heatmap)
    plt.title('T-SNE for ' + layer_name + ' Kernel Maximum Activations')
    plt.savefig(os.path.join(self.out_dir, layer_name + '_tsne.png'), bbox_inches='tight')
    if show:
        plt.show()
    return divisions
def from_long(seq, model, input_window=256, batch_size=32, verb=False):
    """Returns an input window size piece of the sequence with the maximum prediction.

    The sequence is covered by tiles starting at every multiple of
    input_window, a second set shifted by half a window, plus two extra tiles
    anchored at the end of the sequence so that the tail is always covered.

    Arguments:
        seq -- one-hot encoded sequence, indexable as (length, 4).
        model -- model to localize the sequence with.
    Keywords:
        input_window -- length of the sequence to return.
        batch_size -- batch size accepted by the model.
        verb -- verbosity?
    Returns:
        max_tile -- piece of sequence with maximum response.
        max_pred -- prediction from max_tile.
        index -- index within the sequence where the tile starts. For a
                 sequence no longer than input_window, this is instead the
                 offset of the sequence inside the zero-padded tile.
    """
    seq_len = len(seq)
    half = input_window // 2

    # check for a short sequence: centre it in a zero-padded tile.
    if seq_len <= input_window:
        print('The sequence you passed to long_seq is not long.')
        seq = np.asarray(seq)
        idx = (input_window - seq_len) // 2
        max_tile = np.zeros((input_window, 4), dtype=seq.dtype)
        max_tile[idx:idx + seq_len] = seq
        # predict on a batch of one tile; a single score per tile is assumed,
        # as in the long-sequence path below
        pred = np.max(model.predict(max_tile[np.newaxis]))
        return max_tile, pred, idx

    # get the indexes of the tiles.
    idxs = list(range(0, seq_len - input_window, input_window))
    idxs += list(range(half, seq_len - input_window, input_window))
    idxs.append(seq_len - input_window)
    idxs.append(max(seq_len - (half + input_window), 0))

    # make tiles straight from the indexes so the two can never disagree
    # (every start satisfies start + input_window <= seq_len).
    tile_seqs = np.asarray([seq[start:start + input_window] for start in idxs])
    if verb:
        print(tile_seqs.shape)

    # get a batch generator
    batches = ctcf_strength_gen.filled_batch(iter(tile_seqs), batch_size=batch_size)

    # figure out where the max prediction is coming from
    preds = list()
    batch_list = list()
    for batch in batches:
        preds.append(model.predict_on_batch(batch))
        batch_list.append(batch)
    # the batches may hold more entries than there are tiles (filled_batch
    # appears to pad the last one), so trim back to the real tiles
    num_real = tile_seqs.shape[0]
    preds = np.asarray(preds).reshape((-1))[:num_real]
    batch_list = np.asarray(batch_list).reshape((-1, input_window, 4))[:num_real]

    # get the tile that produced that prediction.
    max_idx = np.argmax(preds)
    max_pred = np.max(preds)
    max_tile = batch_list[max_idx]
    return max_tile, max_pred, idxs[max_idx]