# Example 1
def main_interpret(sequences=None,
                   arch_file=None,
                   weights_file=None,
                   pos_threshold=None,
                   peak_width=10,
                   prefix=None):
    """Extract the highest-scoring subsequence per input for each task.

    Loads a trained SequenceDNN, scores the encoded FASTA sequences with
    DeepLIFT, and for every sequence whose prediction for a task exceeds
    ``pos_threshold`` writes the ``2 * peak_width``-wide window centered on
    the top-scoring base to ``<prefix>.task_<i>.important_sequences.txt``.
    Sequences below the threshold are recorded with position ``-1`` and an
    all-zero window.

    Parameters
    ----------
    sequences : str
        Path to a FASTA file of input sequences.
    arch_file : str
        Path to the serialized model architecture.
    weights_file : str
        Path to the trained model weights.
    pos_threshold : float
        Minimum prediction value for a sequence to count as positive.
    peak_width : int
        Half-width of the window extracted around each peak.
    prefix : str
        Prefix for the per-task output filenames.
    """
    # encode fasta
    print("loading sequence data...")
    X = encode_fasta_sequences(sequences)
    # load model
    print("loading model...")
    model = SequenceDNN.load(arch_file, weights_file)
    # predict
    print("getting predictions...")
    predictions = model.predict(X)
    # deeplift
    print("getting deeplift scores...")
    deeplift_scores = model.deeplift(X)
    # get important sequences and write to file
    print("extracting important sequences and writing to file...")
    # assumes X is (num_sequences, 1, 4, seq_len) -- TODO confirm upstream
    seq_len = X.shape[-1]
    for task_index, task_scores in enumerate(deeplift_scores):
        peak_positions = []
        peak_sequences = []
        for sequence_index, sequence_scores in enumerate(task_scores):
            if predictions[sequence_index, task_index] > pos_threshold:
                # collapse the (channel, base) axes to per-position scores
                basewise_sequence_scores = sequence_scores.max(axis=(0, 1))
                peak_position = basewise_sequence_scores.argmax()
                peak_positions.append(peak_position)
                # BUG FIX: a peak within peak_width of either edge previously
                # produced a negative slice start (interpreted from the END of
                # the axis) or a truncated slice, so the windows no longer all
                # had width 2 * peak_width and np.concatenate below failed.
                # Clamp the window center so every window is full-width.
                center = min(max(int(peak_position), peak_width),
                             seq_len - peak_width)
                peak_sequences.append(X[sequence_index : sequence_index + 1,
                                        :,
                                        :,
                                        center - peak_width :
                                        center + peak_width])
            else:
                # -1 marks "no peak" for sub-threshold sequences
                peak_positions.append(-1)
                peak_sequences.append(np.zeros((1, 1, 4, 2 * peak_width)))
        peak_sequences = np.concatenate(peak_sequences)
        peak_sequence_strings = get_sequence_strings(peak_sequences)
        # write important sequences to file
        ofname = "%s.task_%i.important_sequences.txt" % (prefix, task_index)
        with open(ofname, "w") as wf:
            for i, peak_position in enumerate(peak_positions):
                wf.write("> sequence_%i\n" % (i))
                wf.write("%i: %s\n" %(peak_position, peak_sequence_strings[i]))
    print("Done!")
# Example 2
def main_interpret(sequences=None,
                   arch_file=None,
                   weights_file=None,
                   pos_threshold=None,
                   peak_width=10,
                   prefix=None):
    """Write the most important subsequence per positive input, per task.

    Encodes ``sequences`` (FASTA path), loads a SequenceDNN from
    ``arch_file``/``weights_file``, computes DeepLIFT scores, and for each
    task writes one file ``<prefix>.task_<i>.important_sequences.txt``
    containing, per input sequence, the peak position and the
    ``2 * peak_width``-wide window around it (position ``-1`` and a zero
    window for sequences predicted below ``pos_threshold``).
    """
    # encode fasta
    print("loading sequence data...")
    X = encode_fasta_sequences(sequences)
    # load model
    print("loading model...")
    model = SequenceDNN.load(arch_file, weights_file)
    # predict
    print("getting predictions...")
    predictions = model.predict(X)
    # deeplift
    print("getting deeplift scores...")
    deeplift_scores = model.deeplift(X)
    # get important sequences and write to file
    print("extracting important sequences and writing to file...")
    # assumes X is (num_sequences, 1, 4, seq_len) -- TODO confirm upstream
    seq_len = X.shape[-1]
    for task_index, task_scores in enumerate(deeplift_scores):
        peak_positions = []
        peak_sequences = []
        for sequence_index, sequence_scores in enumerate(task_scores):
            if predictions[sequence_index, task_index] > pos_threshold:
                # per-position importance: max over the leading two axes
                basewise_sequence_scores = sequence_scores.max(axis=(0, 1))
                peak_position = basewise_sequence_scores.argmax()
                peak_positions.append(peak_position)
                # BUG FIX: peaks near an edge previously yielded a negative
                # slice start (wrapping to the end of the axis) or a short
                # slice, breaking the fixed 2 * peak_width window width that
                # np.concatenate below requires.  Clamp the center in-bounds.
                center = min(max(int(peak_position), peak_width),
                             seq_len - peak_width)
                peak_sequences.append(
                    X[sequence_index:sequence_index + 1, :, :,
                      center - peak_width:center + peak_width])
            else:
                # sentinel for sub-threshold sequences
                peak_positions.append(-1)
                peak_sequences.append(np.zeros((1, 1, 4, 2 * peak_width)))
        peak_sequences = np.concatenate(peak_sequences)
        peak_sequence_strings = get_sequence_strings(peak_sequences)
        # write important sequences to file
        ofname = "%s.task_%i.important_sequences.txt" % (prefix, task_index)
        with open(ofname, "w") as wf:
            for i, peak_position in enumerate(peak_positions):
                wf.write("> sequence_%i\n" % (i))
                wf.write("%i: %s\n" %
                         (peak_position, peak_sequence_strings[i]))
    print("Done!")
# Example 3
from deeplift.dinuc_shuffle import dinuc_shuffle
from dragonn.utils import get_sequence_strings
import random
import numpy as np

import wget
url = "http://mitra.stanford.edu/kundaje/projects/dragonn/deep_lift_input_classification_spi1.npy"
wget.download(url)
deep_lift_input_classification_spi1 = np.load(
    "deep_lift_input_classification_spi1.npy")
print(deep_lift_input_classification_spi1.shape)
deep_lift_input_classification_spi1_strings = get_sequence_strings(
    deep_lift_input_classification_spi1)

for i in range(len(deep_lift_input_classification_spi1)):
    random.seed(1234)
    shuffled_strings = dinuc_shuffle(
        deep_lift_input_classification_spi1_strings[i])
    random.seed(1234)
    shuffled_array = dinuc_shuffle(
        deep_lift_input_classification_spi1[i].squeeze())
    #decode the array
    shuffled_array = ''.join(
        get_sequence_strings(
            np.expand_dims(np.expand_dims(shuffled_array, axis=1), axis=1)))
    #make sure shuffling the string and numpy array gave same shuffle output
    if (shuffled_strings != shuffled_array):
        print("FAILED!")
print("TEST PASSED!")
# Example 4
from keras.models import load_model
from dragonn.tutorial_utils import deeplift
from dragonn.utils import get_sequence_strings, one_hot_encode
from deeplift import dinuc_shuffle
import numpy as np

import wget
url_data = "http://mitra.stanford.edu/kundaje/projects/dragonn/deep_lift_input_classification_spi1.npy"
url_model = "http://mitra.stanford.edu/kundaje/projects/dragonn/SPI1.classification.model.hdf5"
wget.download(url_data)
wget.download(url_model)

deep_lift_input_classification_spi1 = np.load(
    "deep_lift_input_classification_spi1.npy")
deep_lift_input_classification_spi1_strings = get_sequence_strings(
    deep_lift_input_classification_spi1)

#get scores with GC reference
deep_lift_scores_spi1_gc_ref = deeplift("SPI1.classification.model.hdf5",
                                        deep_lift_input_classification_spi1,
                                        reference="gc_ref")
print(deep_lift_scores_spi1_gc_ref.shape)
print(np.max(deep_lift_scores_spi1_gc_ref))
print(np.min(deep_lift_scores_spi1_gc_ref))

#Get scores with shuffled reference (starting with strings )
deep_lift_scores_spi1_shuffled_ref_strings = deeplift(
    "SPI1.classification.model.hdf5",
    deep_lift_input_classification_spi1_strings,
    one_hot_func=one_hot_encode)
print(deep_lift_scores_spi1_shuffled_ref_strings.shape)