Ejemplo n.º 1
0
def get_simulation_data(simulation_name,
                        simulation_parameters,
                        test_set_size=4000,
                        validation_set_size=3200):
    """Run a named simulation and package one-hot encoded data splits.

    The simulation function is looked up with ``get_simulation_function``
    and called as ``simulation_function(**simulation_parameters)``; it must
    return a ``(sequences, y)`` pair.

    Args:
        simulation_name: name used to look up the simulation function and
            to decide which entries of ``simulation_parameters`` hold the
            motif name(s).
        simulation_parameters: dict of keyword arguments for the simulation.
        test_set_size: ``test_size`` given to ``train_test_split`` for the
            test split.
        validation_set_size: ``test_size`` given to ``train_test_split``
            when the validation set is split off the remaining data.

    Returns:
        A ``Data`` object built from the encoded train/valid/test inputs,
        their labels and the motif names, or ``None`` when the simulation
        call raises any ``Exception``.
    """
    simulate = get_simulation_function(simulation_name)
    try:
        sequences, y = simulate(**simulation_parameters)
    except Exception:
        # Best-effort contract: a failed simulation yields no data.
        return None

    # Which parameter(s) carry the motif name(s) depends on the simulation.
    if simulation_name == "simulate_multi_motif_embedding":
        motif_names = simulation_parameters["motif_names"]
    elif simulation_name == "simulate_heterodimer_grammar":
        motif_names = [
            simulation_parameters[key] for key in ("motif1", "motif2")
        ]
    else:
        motif_names = [simulation_parameters["motif_name"]]

    # First carve out the test set, then the validation set from the rest.
    rest_sequences, test_sequences, rest_y, y_test = train_test_split(
        sequences, y, test_size=test_set_size)
    train_sequences, valid_sequences, y_train, y_valid = train_test_split(
        rest_sequences, rest_y, test_size=validation_set_size)

    X_train, X_valid, X_test = (
        one_hot_encode(split)
        for split in (train_sequences, valid_sequences, test_sequences))

    return Data(X_train, X_valid, X_test, y_train, y_valid, y_test,
                motif_names)
Ejemplo n.º 2
0
def get_simulation_data(simulation_name, simulation_parameters,
                        test_set_size=4000, validation_set_size=3200):
    """Simulate sequences and return them as encoded train/valid/test data.

    Calls the function returned by ``get_simulation_function`` with
    ``simulation_parameters`` as keyword arguments, splits the resulting
    ``(sequences, y)`` pair twice with ``train_test_split`` (test set first,
    then validation set) and one-hot encodes each sequence split.

    Returns a ``Data`` instance, or ``None`` if the simulation call raises
    any ``Exception``.
    """
    params = simulation_parameters
    simulation_function = get_simulation_function(simulation_name)
    try:
        sequences, y = simulation_function(**params)
    except Exception:
        # A failing simulation is reported to the caller as "no data".
        return None

    # Single-motif simulations are the default; the two special cases keep
    # their motif names under different parameter keys.
    if simulation_name not in ("simulate_heterodimer_grammar",
                               "simulate_multi_motif_embedding"):
        motif_names = [params["motif_name"]]
    elif simulation_name == "simulate_heterodimer_grammar":
        motif_names = [params["motif1"], params["motif2"]]
    else:
        motif_names = params["motif_names"]

    test_split = train_test_split(sequences, y, test_size=test_set_size)
    train_sequences, test_sequences, y_train, y_test = test_split
    valid_split = train_test_split(
        train_sequences, y_train, test_size=validation_set_size)
    train_sequences, valid_sequences, y_train, y_valid = valid_split

    encoded_train = one_hot_encode(train_sequences)
    encoded_valid = one_hot_encode(valid_sequences)
    encoded_test = one_hot_encode(test_sequences)

    return Data(encoded_train, encoded_valid, encoded_test,
                y_train, y_valid, y_test, motif_names)
def get_train_valid_test_data(simulation_func, prefix=None, test_size=0.2, valid_size=0.2, **kwargs):
    """Simulate data, split it into train/valid/test sets and one-hot encode it.

    When ``prefix`` is given, results are cached in a compressed ``.npz``
    file whose name is ``prefix`` followed by the sorted ``key``/``value``
    pairs of ``kwargs``. If that file can be loaded, its arrays are returned
    and the simulation is not run.

    Args:
        simulation_func: callable invoked as ``simulation_func(**kwargs)``;
            must return a ``(sequences, y)`` pair.
        prefix: cache file name prefix, or ``None`` to disable caching.
        test_size: ``test_size`` passed to ``train_test_split`` for the
            test split.
        valid_size: ``test_size`` passed to ``train_test_split`` when the
            validation set is split off the remaining training data.
        **kwargs: forwarded to ``simulation_func`` and used to build the
            cache file name.

    Returns:
        Tuple ``(X_train, X_valid, X_test, y_train, y_valid, y_test)``.
    """
    array_names = ('X_train', 'X_valid', 'X_test',
                   'y_train', 'y_valid', 'y_test')

    simulation_fname = None
    if prefix is not None:
        # The file name is only meaningful (and only needed) when caching.
        params_tag = ''.join(
            '{}{}'.format(key, val) for key, val in sorted(kwargs.items()))
        simulation_fname = "{}{}.npz".format(prefix, params_tag)
        logger.debug("Checking for simulation data file {}...".format(simulation_fname))
        try:
            # The context manager closes the archive once the arrays are read.
            with np.load(simulation_fname) as data:
                cached = tuple(data[name] for name in array_names)
        except Exception:
            # Missing, unreadable or incomplete cache: fall back to simulating.
            # KeyboardInterrupt/SystemExit are deliberately not swallowed.
            logger.debug("{} not found. Simulating data..".format(simulation_fname))
        else:
            logger.debug("{} found. Loaded simulation data successfully!".format(simulation_fname))
            return cached

    sequences, y = simulation_func(**kwargs)
    train_sequences, test_sequences, y_train, y_test = train_test_split(
        sequences, y, test_size=test_size)
    train_sequences, valid_sequences, y_train, y_valid = train_test_split(
        train_sequences, y_train, test_size=valid_size)
    X_train = one_hot_encode(train_sequences)
    X_valid = one_hot_encode(valid_sequences)
    X_test = one_hot_encode(test_sequences)

    if simulation_fname is not None:
        logger.debug("Saving simulated data to {}...".format(simulation_fname))
        np.savez_compressed(simulation_fname,
                            X_train=X_train, X_valid=X_valid, X_test=X_test,
                            y_train=y_train, y_valid=y_valid, y_test=y_test)

    return (X_train, X_valid, X_test,
            y_train, y_valid, y_test)
def get_train_valid_test_data(simulation_func,
                              prefix=None,
                              test_size=0.2,
                              valid_size=0.2,
                              **kwargs):
    """Simulate data with embeddings, split it and one-hot encode it.

    When ``prefix`` is given, results are cached in a compressed ``.npz``
    file whose name is ``prefix`` followed by the sorted ``key``/``value``
    pairs of ``kwargs``. If that file can be loaded, its arrays are returned
    and the simulation is not run.

    Args:
        simulation_func: callable invoked as ``simulation_func(**kwargs)``;
            must return a ``(sequences, y, embeddings)`` triple.
        prefix: cache file name prefix, or ``None`` to disable caching.
        test_size: ``test_size`` passed to ``train_test_split`` for the
            test split.
        valid_size: ``test_size`` passed to ``train_test_split`` when the
            validation set is split off the remaining training data.
        **kwargs: forwarded to ``simulation_func`` and used to build the
            cache file name.

    Returns:
        Tuple ``(X_train, X_valid, X_test, y_train, y_valid, y_test)``.
        The split embeddings are written to the cache file but are not
        part of the return value.
    """
    returned_names = ('X_train', 'X_valid', 'X_test',
                      'y_train', 'y_valid', 'y_test')

    simulation_fname = None
    if prefix is not None:
        # The file name is only meaningful (and only needed) when caching.
        params_tag = ''.join('{}{}'.format(key, val)
                             for key, val in sorted(kwargs.items()))
        simulation_fname = "{}{}.npz".format(prefix, params_tag)
        logger.debug("Checking for simulation data file {}...".format(
            simulation_fname))
        try:
            # The context manager closes the archive once the arrays are read.
            with np.load(simulation_fname) as data:
                cached = tuple(data[name] for name in returned_names)
        except Exception:
            # Missing, unreadable or incomplete cache: fall back to simulating.
            # KeyboardInterrupt/SystemExit are deliberately not swallowed.
            logger.debug(
                "{} not found. Simulating data..".format(simulation_fname))
        else:
            logger.debug(
                "{} found. Loaded simulation data successfully!".format(
                    simulation_fname))
            return cached

    sequences, y, embeddings = simulation_func(**kwargs)
    (train_sequences, test_sequences, train_embeddings, test_embeddings,
     y_train, y_test) = train_test_split(sequences,
                                         embeddings,
                                         y,
                                         test_size=test_size)
    (train_sequences, valid_sequences, train_embeddings, valid_embeddings,
     y_train, y_valid) = train_test_split(train_sequences,
                                          train_embeddings,
                                          y_train,
                                          test_size=valid_size)
    X_train = one_hot_encode(train_sequences)
    X_valid = one_hot_encode(valid_sequences)
    X_test = one_hot_encode(test_sequences)

    if simulation_fname is not None:
        logger.debug(
            "Saving simulated data to {}...".format(simulation_fname))
        np.savez_compressed(simulation_fname,
                            X_train=X_train,
                            X_valid=X_valid,
                            X_test=X_test,
                            train_embeddings=train_embeddings,
                            valid_embeddings=valid_embeddings,
                            test_embeddings=test_embeddings,
                            y_train=y_train,
                            y_valid=y_valid,
                            y_test=y_test)

    return (X_train, X_valid, X_test, y_train, y_valid, y_test)
Ejemplo n.º 5
0
def run(use_deep_CNN, use_RNN, label, golden_first_sequence, golden_results):
    """Train a SequenceDNN for one epoch on simulated data and check it.

    Simulates single-motif detection data for 'SPI1_disc1', asserts that the
    first simulated sequence equals ``golden_first_sequence``, trains on the
    training split augmented with reverse complements, and asserts that the
    test metrics are close to ``golden_results``.

    Args:
        use_deep_CNN: if truthy, use the three-layer filter/width settings.
        use_RNN: forwarded as the ``use_RNN`` hyperparameter; if truthy the
            GRU/TDD sizes are added as well.
        label: text prefixed to the assertion message on a metric mismatch.
        golden_first_sequence: expected value of the first simulated sequence.
        golden_results: mapping whose values are the expected metrics.
    """
    seq_length, num_sequences, num_positives = 100, 200, 100
    sequences, labels, _ = simulate_single_motif_detection(
        'SPI1_disc1', seq_length, num_positives,
        num_sequences - num_positives, 0.4)
    assert sequences[0] == golden_first_sequence, 'first sequence = {}, golden = {}'.format(
        sequences[0], golden_first_sequence)

    X_train, X_test, y_train, y_test = train_test_split(
        one_hot_encode(sequences), labels, test_size=0.2)

    # Augment the training data with reverse complements, then shuffle the
    # inputs and labels with one shared index permutation.
    augmented_X = np.concatenate((X_train, reverse_complement(X_train)))
    augmented_y = np.concatenate((y_train, y_train))
    order = np.arange(len(augmented_X))
    np.random.shuffle(order)
    augmented_X, augmented_y = augmented_X[order], augmented_y[order]

    if use_deep_CNN:
        num_filters, conv_width = (45, 50, 50), (10, 8, 5)
    else:
        num_filters, conv_width = (45,), (10,)
    hyperparameters = dict(seq_length=seq_length, use_RNN=use_RNN,
                           num_filters=num_filters, pool_width=25,
                           conv_width=conv_width, L1=0, dropout=0.2,
                           num_epochs=1)
    if use_RNN:
        hyperparameters['GRU_size'] = 35
        hyperparameters['TDD_size'] = 45

    model = SequenceDNN(**hyperparameters)
    # The test split doubles as validation data here.
    model.train(augmented_X, augmented_y, validation_data=(X_test, y_test))
    results = model.test(X_test, y_test).results[0]
    assert np.allclose(tuple(results.values()), tuple(golden_results.values())), \
        '{}: result = {}, golden = {}'.format(label, results, golden_results)
Ejemplo n.º 6
0
def run(use_deep_CNN, use_RNN, label, golden_results):
    """Train a SequenceDNN for one epoch on random data and check metrics.

    Builds 200 random 'ACGT' strings of length 100 with random boolean
    labels, trains on the training split augmented with reverse complements,
    and asserts that the test metrics are close to ``golden_results``.

    Args:
        use_deep_CNN: if truthy, use the three-layer filter/width settings.
        use_RNN: forwarded as the ``use_RNN`` hyperparameter; if truthy the
            GRU/TDD sizes are added as well.
        label: text prefixed to the assertion message on a mismatch.
        golden_results: mapping whose values are the expected metrics.
    """
    seq_length, num_sequences = 100, 200

    random_strings = [
        ''.join(random.choice('ACGT') for _ in range(seq_length))
        for _ in range(num_sequences)
    ]
    sequences = np.array(random_strings)
    # Column vector of random boolean labels, one per sequence.
    labels = np.random.choice((True, False), size=num_sequences)[:, None]

    X_train, X_test, y_train, y_test = train_test_split(
        one_hot_encode(sequences), labels, test_size=0.2)

    # Reverse-complement augmentation followed by a joint shuffle.
    X_train = np.concatenate((X_train, reverse_complement(X_train)))
    y_train = np.concatenate((y_train, y_train))
    order = np.arange(len(X_train))
    np.random.shuffle(order)
    X_train, y_train = X_train[order], y_train[order]

    if use_deep_CNN:
        num_filters, conv_width = (45, 50, 50), (10, 8, 5)
    else:
        num_filters, conv_width = (45,), (10,)
    hyperparameters = dict(seq_length=seq_length, use_RNN=use_RNN,
                           num_filters=num_filters, pool_width=25,
                           conv_width=conv_width, L1=0, dropout=0.2,
                           num_epochs=1)
    if use_RNN:
        hyperparameters.update(GRU_size=35, TDD_size=45)

    model = SequenceDNN(**hyperparameters)
    # The test split doubles as validation data here.
    model.train(X_train, y_train, validation_data=(X_test, y_test))
    results = model.test(X_test, y_test).results[0]
    assert np.allclose(tuple(results.values()), tuple(golden_results.values())), \
        '{}: result = {}, golden = {}'.format(label, results, golden_results)
Ejemplo n.º 7
0
def run(use_deep_CNN, use_RNN, label, golden_results):
    """Train a SequenceDNN for one epoch on seeded simulated data and check it.

    Seeds both ``numpy.random`` and ``random`` with 1, simulates
    single-motif detection data for 'SPI1_disc1', splits it into
    train/validation/test sets, augments the training set with reverse
    complements, trains, and asserts that the test metrics are close to
    ``golden_results``.

    Args:
        use_deep_CNN: if truthy, use the three-layer filter/width settings.
        use_RNN: forwarded as the ``use_RNN`` hyperparameter; if truthy the
            GRU/TDD sizes are added as well.
        label: text prefixed to the assertion message on a mismatch.
        golden_results: mapping whose values are the expected metrics.

    Raises:
        AssertionError: if the test metrics differ from ``golden_results``.
    """
    import random
    # Seed before the remaining imports, exactly as the original ordering did.
    np.random.seed(1)
    random.seed(1)
    from dragonn.models import SequenceDNN
    from simdna.simulations import simulate_single_motif_detection
    from dragonn.utils import one_hot_encode, reverse_complement
    try:
        # sklearn.cross_validation was removed in scikit-learn 0.20.
        from sklearn.model_selection import train_test_split
    except ImportError:
        # Older scikit-learn releases that predate model_selection.
        from sklearn.cross_validation import train_test_split
    seq_length = 50
    num_sequences = 100
    num_positives = 50
    num_negatives = num_sequences - num_positives
    GC_fraction = 0.4
    test_fraction = 0.2
    validation_fraction = 0.2
    num_epochs = 1

    sequences, labels = simulate_single_motif_detection(
        'SPI1_disc1', seq_length, num_positives, num_negatives, GC_fraction)
    encoded_sequences = one_hot_encode(sequences)
    # Split off the test set first, then the validation set from the rest.
    X_train, X_test, y_train, y_test = train_test_split(
        encoded_sequences, labels, test_size=test_fraction)
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_train, y_train, test_size=validation_fraction)
    # Augment training data with reverse complements; labels are duplicated
    # to match, then both are shuffled with one shared permutation.
    X_train = np.concatenate((X_train, reverse_complement(X_train)))
    y_train = np.concatenate((y_train, y_train))
    random_order = np.arange(len(X_train))
    np.random.shuffle(random_order)
    X_train = X_train[random_order]
    y_train = y_train[random_order]
    hyperparameters = {
        'seq_length': seq_length,
        'use_RNN': use_RNN,
        'num_filters': (45, ),
        'pool_width': 25,
        'conv_width': (10, ),
        'L1': 0,
        'dropout': 0.2,
        'num_epochs': num_epochs
    }
    if use_deep_CNN:
        hyperparameters.update({
            'num_filters': (45, 50, 50),
            'conv_width': (10, 8, 5)
        })
    if use_RNN:
        hyperparameters.update({'GRU_size': 35, 'TDD_size': 45})
    model = SequenceDNN(**hyperparameters)
    model.train(X_train, y_train, validation_data=(X_valid, y_valid))
    results = model.test(X_test, y_test).results[0]
    assert np.allclose(tuple(results.values()), tuple(golden_results.values())), \
        '{}: result = {}, golden = {}'.format(label, results, golden_results)
Ejemplo n.º 8
0
def run(use_deep_CNN, use_RNN, label, golden_results):
    """Fit a SequenceDNN on random sequences and compare metrics to goldens.

    Generates 200 random 'ACGT' strings of length 100 with random boolean
    labels, trains for one epoch on the reverse-complement-augmented
    training split, and asserts that the metrics on the test split are
    close to the values of ``golden_results``.

    Args:
        use_deep_CNN: if truthy, use the three-layer filter/width settings.
        use_RNN: forwarded as the ``use_RNN`` hyperparameter; if truthy the
            GRU/TDD sizes are added as well.
        label: text prefixed to the assertion message on a mismatch.
        golden_results: mapping whose values are the expected metrics.
    """
    seq_length = 100
    num_sequences = 200

    def random_sequence():
        # One random DNA string of seq_length bases.
        return ''.join(random.choice('ACGT') for _ in range(seq_length))

    sequences = np.array([random_sequence() for _ in range(num_sequences)])
    labels = np.random.choice((True, False), size=num_sequences)[:, None]

    encoded_sequences = one_hot_encode(sequences)
    split = train_test_split(encoded_sequences, labels, test_size=0.2)
    X_train, X_test, y_train, y_test = split

    # Double the training set with reverse complements (labels repeated),
    # then permute inputs and labels together.
    X_train = np.concatenate((X_train, reverse_complement(X_train)))
    y_train = np.concatenate((y_train, y_train))
    shuffled_index = np.arange(len(X_train))
    np.random.shuffle(shuffled_index)
    X_train = X_train[shuffled_index]
    y_train = y_train[shuffled_index]

    hyperparameters = {
        'seq_length': seq_length,
        'use_RNN': use_RNN,
        'num_filters': (45, 50, 50) if use_deep_CNN else (45, ),
        'pool_width': 25,
        'conv_width': (10, 8, 5) if use_deep_CNN else (10, ),
        'L1': 0,
        'dropout': 0.2,
        'num_epochs': 1,
    }
    if use_RNN:
        hyperparameters['GRU_size'] = 35
        hyperparameters['TDD_size'] = 45

    model = SequenceDNN(**hyperparameters)
    # The test split doubles as validation data here.
    model.train(X_train, y_train, validation_data=(X_test, y_test))
    results = model.test(X_test, y_test).results[0]
    assert np.allclose(tuple(results.values()), tuple(golden_results.values())), \
        '{}: result = {}, golden = {}'.format(label, results, golden_results)
import tflearn as tfl
import tensorflow as tft
from sklearn.cross_validation import train_test_split
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.conv import conv_2d, max_pool_2d
from tflearn.layers.normalization import local_response_normalization
from tflearn.layers.estimator import regression
from dragonn.simulations import *
from dragonn.utils import one_hot_encode, get_motif_scores, reverse_complement
from tflearn.models.dnn import DNN

# DATASET MANIPULATION & CREATION =============================================

# Create dataset: simulate sequences and labels for the 'CTCF_known2' motif
# and one-hot encode the sequences.
# NOTE(review): the positional arguments presumably follow
# (motif, seq_length, num_positives, num_negatives, GC_fraction) -- confirm
# against the simulate_single_motif_detection signature.
X, y = simulate_single_motif_detection('CTCF_known2', 1000, 5000, 5000, 0.4)
X = one_hot_encode(X)
# Shuffle X and y with one shared index permutation so rows stay paired.
random_order = np.arange(len(X))
np.random.shuffle(random_order)
X, y = X[random_order], y[random_order]

# Split dataset into a training and test set.
# validation_fraction is assigned here but not used in the lines below.
test_fraction = validation_fraction = 0.2
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_fraction)

# Concatenate reverse complements to both splits; labels are repeated so
# they stay aligned with the doubled inputs.
X_train = np.concatenate((X_train, reverse_complement(X_train)))
y_train = np.concatenate((y_train, y_train))
X_test = np.concatenate((X_test, reverse_complement(X_test)))
y_test = np.concatenate((y_test, y_test))

# Reshape dataset per Tensorflow convention: [# examples, height, width, channels]
Ejemplo n.º 10
0
# Run configuration. test_fraction and validation_fraction are the test_size
# values passed to train_test_split below; the remaining settings are not
# used in this part of the script.
test_fraction = 0.2
validation_fraction = 0.2
do_hyperparameter_search = False
num_hyperparameter_trials = 50
num_epochs = 100
use_deep_CNN = False
use_RNN = False

print('Generating sequences...')

# seq_length, num_positives, num_negatives and GC_fraction must already be
# defined before this point; they are not assigned in these lines.
sequences, labels = simulate_single_motif_detection(
    'SPI1_disc1', seq_length, num_positives, num_negatives, GC_fraction)

print('One-hot encoding sequences...')

encoded_sequences = one_hot_encode(sequences)

print('Getting motif scores...')

# Scores are computed on the full encoded set, before any splitting.
motif_scores = get_motif_scores(encoded_sequences, motif_names=['SPI1_disc1'])

print('Partitioning data into training, validation and test sets...')

# Split off the test set first, then take the validation set out of the
# remaining training data.
X_train, X_test, y_train, y_test = train_test_split(encoded_sequences, labels, test_size=test_fraction)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=validation_fraction)

print('Adding reverse complements...')

# Only the training split is augmented; labels are repeated to stay aligned.
X_train = np.concatenate((X_train, reverse_complement(X_train)))
y_train = np.concatenate((y_train, y_train))
Ejemplo n.º 11
0
# imports
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import tensorflow as tf
from sklearn.cross_validation import train_test_split
from dragonn.simulations import simulate_single_motif_detection
from dragonn.utils import reverse_complement, one_hot_encode

# Generating data
## Simulate sequences and labels for the 'CTCF_known2' motif.
## NOTE(review): the positional arguments presumably follow
## (motif, seq_length, num_positives, num_negatives, GC_fraction) -- confirm.
fullX, fullY = simulate_single_motif_detection('CTCF_known2', 500, 5000,
                                               5000, 0.4)
encodedX = one_hot_encode(fullX)

## Shuffle inputs and labels with one shared index permutation.
random_order = np.arange(len(encodedX))
np.random.shuffle(random_order)
X, y = encodedX[random_order], fullY[random_order]

## Split off the test set with test_size=0.2, then split the validation set
## off the remaining training data with the same test_size.
split_fraction = 0.2
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=split_fraction)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=split_fraction)

## Tack on reverse complements to all three input splits.
## NOTE(review): y_train, y_valid and y_test are not duplicated in these
## lines; they must be concatenated with themselves before use so they match
## the doubled inputs -- confirm this happens further down.
X_train = np.concatenate((X_train, reverse_complement(X_train)))
X_valid = np.concatenate((X_valid, reverse_complement(X_valid)))
X_test = np.concatenate((X_test, reverse_complement(X_test)))
Ejemplo n.º 12
0
# Run configuration. test_fraction and validation_fraction are the test_size
# values passed to train_test_split below; the remaining settings are not
# used in this part of the script.
test_fraction = 0.2
validation_fraction = 0.2
do_hyperparameter_search = False
num_hyperparameter_trials = 50
num_epochs = 100
use_deep_CNN = False
use_RNN = False

print('Generating sequences...')

# seq_length, num_positives, num_negatives and GC_fraction must already be
# defined before this point; they are not assigned in these lines.
sequences, labels = simulate_single_motif_detection(
    'SPI1_disc1', seq_length, num_positives, num_negatives, GC_fraction)

print('One-hot encoding sequences...')

encoded_sequences = one_hot_encode(sequences)

print('Getting motif scores...')

# Scores are computed on the full encoded set, before any splitting.
motif_scores = get_motif_scores(encoded_sequences, motif_names=['SPI1_disc1'])

print('Partitioning data into training, validation and test sets...')

# Split off the test set first, then take the validation set out of the
# remaining training data.
X_train, X_test, y_train, y_test = train_test_split(encoded_sequences, labels, test_size=test_fraction)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=validation_fraction)

print('Adding reverse complements...')

# Only the training split is augmented; labels are repeated to stay aligned.
X_train = np.concatenate((X_train, reverse_complement(X_train)))
y_train = np.concatenate((y_train, y_train))