Esempio n. 1
0
def _encode_tile(seq, input_window):
    """Encode one nucleotide string and zero-pad it to input_window rows.

    Arguments:
        seq -- nucleotide sequence (str or bytes) for a single tile.
        input_window -- number of rows every returned tile must have.

    Returns:
        The output of ctcf_strength_gen.encode for seq.  When it has fewer
        than input_window rows (a window clipped at a chromosome boundary)
        all-zero rows are appended so that every tile has the same shape.
    """
    raw = seq if isinstance(seq, (bytes, bytearray)) else str(seq).encode('ascii')
    # np.frombuffer replaces the deprecated binary mode of np.fromstring; the
    # copy keeps the array writable, as the np.fromstring result was.
    codes = np.frombuffer(raw, dtype=np.uint8).copy()
    # NOTE(review): assumes encode returns an array with one row per base -- confirm.
    encoded = np.asarray(ctcf_strength_gen.encode(codes))
    if encoded.shape[0] >= input_window:
        return encoded
    padded = np.zeros((input_window,) + encoded.shape[1:], dtype=encoded.dtype)
    padded[:encoded.shape[0]] = encoded
    return padded


def localize(row, model, genome, input_window=256, batch_size=32, get_idx=False, verb=False):
    """Find the input_window bp window responsible for a ml prediction in a bed file row.

    The region row['start']..row['end'] is covered by two interleaved sets of
    tiles: windows aligned to row['start'], and the same windows shifted left
    by half a window.  A shifted window that would begin before position 0 is
    clipped to start at 0 and zero-padded at its end.

    Arguments:
        row -- pd dataframe row with 'chr', 'start' and 'end' entries.
        model -- keras model to make predictions on.
        genome -- genome to pull sequences from (indexed by chromosome, then sliced).

    Keywords:
        input_window -- size of window to localize on.
        batch_size -- batch_size accepted by the model.
        get_idx -- also return the start coordinate of the winning tile?
        verb -- print output?

    Returns:
        max_tile -- input_window sized sequence that gave maximum prediction from the model.
        max_pred -- prediction value for max_tile.
        tile_start -- (only when get_idx is True) coordinate on the chromosome
                      at which max_tile starts.

    Raises:
        ValueError -- if row['end'] <= row['start'], so there is nothing to tile.
    """
    start = row['start']
    chrom = genome[row['chr']]
    half_window = input_window // 2

    # number of windows needed to cover the region (ceiling division)
    full, remainder = divmod(row['end'] - start, input_window)
    num_tiles = int(full + (remainder > 0))
    if verb:
        print(num_tiles)
    if num_tiles <= 0:
        raise ValueError('row spans no bases: end must be greater than start')

    # break the sequence into overlapping tiles, remembering where each starts
    tile_seqs = list()
    tile_starts = list()
    for idx in range(num_tiles):
        aligned_start = start + idx * input_window
        shifted_end = aligned_start - half_window + input_window
        # clip the half-shifted window at the beginning of the chromosome
        shifted_start = max(aligned_start - half_window, 0)
        windows = ((shifted_start, shifted_end),
                   (aligned_start, aligned_start + input_window))
        for tile_start, tile_end in windows:
            seq = chrom[tile_start:tile_end].lower()
            tile_seqs.append(_encode_tile(seq, input_window))
            tile_starts.append(tile_start)

    tile_seqs = np.asarray(tile_seqs)
    tile_iter = iter(tile_seqs)

    # get a batch generator
    batches = ctcf_strength_gen.filled_batch(tile_iter, batch_size=batch_size)

    # figure out where the max prediction is coming from
    preds = list()
    batch_list = list()
    for batch in batches:
        preds.append(model.predict_on_batch(batch))
        batch_list.append(batch)
    # drop whatever filled_batch added to complete the final batch
    preds = np.asarray(preds).reshape((-1))[:tile_seqs.shape[0]]
    batch_list = np.asarray(batch_list).reshape((-1, input_window, 4))[:tile_seqs.shape[0]]

    # get the tile that produced the maximum prediction
    max_idx = np.argmax(preds)
    max_pred = np.max(preds)
    max_tile = batch_list[max_idx]
    if verb:
        print(max_idx)
        print(max_pred)
        print(preds)

    if get_idx:
        # tiles alternate between shifted and aligned windows, so the start
        # must be looked up rather than computed as max_idx * input_window
        return max_tile, max_pred, tile_starts[max_idx]
    return max_tile, max_pred
Esempio n. 2
0
    def varience_plot(self, layer_name, show=False):
        """ Plot correlations of neuron activations with themselves and scores.

        For every sample peak the maximum activation of each neuron in the
        chosen layer is collected.  The 'ml' and 'pwm' scores and their
        difference (pwm - ml) are appended as extra columns (each one twice),
        and the correlation-coefficient matrix of all columns is drawn and
        saved as '<layer_name>_corrcoef.png' in self.out_dir.

        Arguments:
            layer_name -- Layer to get neuron activations from (in layer_dict)

        Keywords:
            show -- show the plot?
        """
        # build a function to get activations
        seqs = self.model.input
        get_activations = K.function(
            [seqs, K.learning_phase()],
            [self.layer_dict[layer_name].output, self.model.output])

        # put the sequences into batches.
        g = ctcf_strength_gen.filled_batch(iter(self.sample_peaks['signal_seq']), batch_size=self.batch_size)

        # get activations for every batch (not a fixed number of batches).
        activations = list()
        for input_seqs in g:
            # second input is the learning phase flag; 0 selects test mode
            layer_out, _ = get_activations([input_seqs, 0])
            # NOTE(review): the layer output is expected to hold 2*batch_size
            # rows whose halves are summed element-wise -- confirm what the
            # second half represents.
            both = layer_out[:self.batch_size] + layer_out[self.batch_size:]
            activations.append(np.max(both, axis=1))

        # one row per peak; drop rows that only exist to fill the last batch
        n_peaks = len(self.sample_peaks['ml'])
        activations = np.concatenate(activations, axis=0)[:n_peaks]

        # plot activation correlations
        plt.title('Correlations for ' + str(layer_name))

        ml = np.repeat(np.expand_dims(self.sample_peaks['ml'], axis=1), 2, axis=1)
        pwm = np.repeat(np.expand_dims(self.sample_peaks['pwm'], axis=1), 2, axis=1)
        diff = np.repeat(np.expand_dims(self.sample_peaks['pwm'] - self.sample_peaks['ml'], axis=1), 2, axis=1)
        variables = np.append(activations, ml, axis=1)
        variables = np.append(variables, pwm, axis=1)
        variables = np.append(variables, diff, axis=1)

        # columns are the variables, rows the observations
        cov = np.corrcoef(variables, rowvar=False)
        n_neurons = activations.shape[1]
        plt.yticks([n_neurons + 1, n_neurons + 3, n_neurons + 5], ('ml', 'pwm', 'diffs'))
        plt.xticks([])

        plt.imshow(cov, cmap='plasma')
        plt.savefig(os.path.join(self.out_dir, str(layer_name) + '_corrcoef.png'), bbox_inches='tight')
        if show:
            plt.show()
Esempio n. 3
0
    def get_tsne(self, layer_name, show=False):
        """ Get a tsne visualization of the neuron activations.

        The maximum activation of every neuron in the chosen layer is computed
        for each sample peak and embedded in two dimensions with t-SNE.  The
        scatter plot is coloured by the 'ml' column, sized by the 'ctcf'
        column, and saved as '<layer_name>_tsne.png' in self.out_dir.

        Arguments:
            layer_name -- layer to pull activations for (key in layer dict).

        Keywords:
            show -- show the final graph?

        Returns:
            divisions -- the 2-d t-SNE coordinates, one row per sample peak.
        """
        # build a function to get neuron activations.
        seqs = self.model.input
        get_activations = K.function(
            [seqs, K.learning_phase()],
            [self.layer_dict[layer_name].output, self.model.output])

        # put the sequences into batches
        g = ctcf_strength_gen.filled_batch(iter(self.sample_peaks['signal_seq']), batch_size=self.batch_size)

        # get the layer activation for each sequence.
        base_activations = list()
        for input_batch in g:
            # second input is the learning phase flag; 0 selects test mode
            activations, _ = get_activations([input_batch, 0])
            # NOTE(review): the layer output is expected to hold 2*batch_size
            # rows; the two halves are joined along axis 1 -- confirm what the
            # second half represents.  Split at self.batch_size rather than a
            # fixed 32 so other batch sizes work too.
            halves = (activations[:self.batch_size], activations[self.batch_size:])
            base_activations.append(np.concatenate(halves, axis=1))

        # stack the batches, drop the rows that only fill the last batch, and
        # take the maximum of each neuron to collapse the feature space to
        # something more reasonable.
        base_activations = np.concatenate(base_activations, axis=0)[:self.sample_peaks.shape[0]]
        max_activations = np.amax(base_activations, axis=1)

        # build a t-sne model
        tsnemodel = TSNE(n_components=2, random_state=0)
        divisions = tsnemodel.fit_transform(max_activations)
        heatmap = plt.scatter(divisions[:, 0], divisions[:, 1], c=self.sample_peaks['ml'],
                              s=10 + self.sample_peaks['ctcf'] * 10, cmap='plasma')
        plt.colorbar(heatmap)
        plt.title('T-SNE for ' + layer_name + ' Kernel Maximum Activations')
        plt.savefig(os.path.join(self.out_dir, layer_name + '_tsne.png'), bbox_inches='tight')
        if show:
            plt.show()
        return divisions
Esempio n. 4
0
def from_long(seq, model, input_window=256, batch_size=32, verb=False):
    """ Returns an input window size piece of the sequence with the maximum prediction.

    The sequence is cut into tiles of input_window rows: tiles aligned to the
    start of the sequence, tiles offset by half a window, the final
    input_window rows, and the window ending half a window before the end.
    Every tile is scored with the model and the best one is returned.

    A sequence of at most input_window rows is instead centred in a single
    zero-padded tile, which is scored on its own.

    Arguments:
         seq -- one-hot encoded sequence (rows are positions, 4 columns).
         model -- model to localize the sequence with.

    Keywords:
         input_window -- length of the sequence to return.
         batch_size -- batch size accepted by the model.
         verb -- verbosity?

    Returns:
         max_tile -- piece of sequence with maximum response.
         max_pred -- prediction from max_tile.
         index -- index within the sequence where the tile starts.  For a
                  short (padded) sequence this is the offset inside the tile
                  at which the sequence was placed.
    """
    seq = np.asarray(seq)
    seq_len = len(seq)
    half_window = input_window // 2

    if seq_len <= input_window:
        # check for a short sequence: centre it in one zero-padded tile.
        print('The sequence you passed to long_seq is not long.')
        offset = (input_window - seq_len) // 2
        padded = np.zeros((input_window, 4), dtype=seq.dtype)
        if seq_len:
            padded[offset:offset + seq_len] = seq
        idxs = [offset]
        tile_seqs = np.asarray([padded])
    else:
        # get the indexes of the tiles.
        last_start = seq_len - input_window
        idxs = list(range(0, last_start, input_window))
        idxs += list(range(half_window, last_start, input_window))
        idxs.append(last_start)
        idxs.append(max(last_start - half_window, 0))

        # make tiles.  Slicing from idxs keeps tiles and indexes in step and
        # guarantees every tile has exactly input_window rows.
        tile_seqs = np.asarray([seq[i:i + input_window] for i in idxs])

    if verb:
        print(tile_seqs.shape)

    # get a batch generator
    batches = ctcf_strength_gen.filled_batch(iter(tile_seqs), batch_size=batch_size)

    # figure out where the max prediction is coming from
    preds = list()
    batch_list = list()
    for batch in batches:
        preds.append(model.predict_on_batch(batch))
        batch_list.append(batch)
    # drop whatever filled_batch added to complete the final batch
    preds = np.asarray(preds).reshape((-1))[:tile_seqs.shape[0]]
    batch_list = np.asarray(batch_list).reshape((-1, input_window, 4))[:tile_seqs.shape[0]]

    # get the tile that produced that prediction.
    max_idx = np.argmax(preds)
    max_pred = np.max(preds)
    max_tile = batch_list[max_idx]
    return max_tile, max_pred, idxs[max_idx]