def get_simulation_data(simulation_name, simulation_parameters,
                        test_set_size=4000, validation_set_size=3200):
    """Simulate sequences for *simulation_name* and return encoded splits.

    Parameters
    ----------
    simulation_name : str
        Name resolvable by ``get_simulation_function``.
    simulation_parameters : dict
        Keyword arguments forwarded to the simulation function.
    test_set_size : int
        Number of examples held out for the test set.
    validation_set_size : int
        Number of examples (from the remaining training pool) held out for
        the validation set.

    Returns
    -------
    Data or None
        ``Data(X_train, X_valid, X_test, y_train, y_valid, y_test,
        motif_names)`` with one-hot-encoded sequences, or ``None`` if the
        simulation function raised.
    """
    simulation_function = get_simulation_function(simulation_name)
    try:
        sequences, y = simulation_function(**simulation_parameters)
    except Exception as e:
        # Bug fix: the failure used to be swallowed silently, making a bad
        # parameter set indistinguishable from success. Log before bailing.
        logger.debug("Simulation {} failed: {}".format(simulation_name, e))
        return None
    # Heterodimer grammars embed two named motifs; multi-motif embeddings
    # carry an explicit list; every other simulation embeds a single motif.
    if simulation_name == "simulate_heterodimer_grammar":
        motif_names = [simulation_parameters["motif1"],
                       simulation_parameters["motif2"]]
    elif simulation_name == "simulate_multi_motif_embedding":
        motif_names = simulation_parameters["motif_names"]
    else:
        motif_names = [simulation_parameters["motif_name"]]
    # Carve out the test set first, then the validation set from what remains.
    train_sequences, test_sequences, y_train, y_test = train_test_split(
        sequences, y, test_size=test_set_size)
    train_sequences, valid_sequences, y_train, y_valid = train_test_split(
        train_sequences, y_train, test_size=validation_set_size)
    X_train = one_hot_encode(train_sequences)
    X_valid = one_hot_encode(valid_sequences)
    X_test = one_hot_encode(test_sequences)
    return Data(X_train, X_valid, X_test, y_train, y_valid, y_test,
                motif_names)
def get_simulation_data(simulation_name, simulation_parameters,
                        test_set_size=4000, validation_set_size=3200):
    """Run the named simulation and split its output into encoded
    train/validation/test sets.

    Returns a ``Data`` tuple of one-hot-encoded splits plus the motif names
    embedded by the simulation, or ``None`` if the simulation raised.
    """
    simulation_function = get_simulation_function(simulation_name)
    try:
        sequences, y = simulation_function(**simulation_parameters)
    except Exception as e:
        # Bug fix: previously returned None silently on any failure, hiding
        # misconfigured simulation parameters. Record the cause first.
        logger.debug("Simulation {} failed: {}".format(simulation_name, e))
        return None
    # The motif bookkeeping depends on which simulation was requested.
    if simulation_name == "simulate_heterodimer_grammar":
        motif_names = [simulation_parameters["motif1"],
                       simulation_parameters["motif2"]]
    elif simulation_name == "simulate_multi_motif_embedding":
        motif_names = simulation_parameters["motif_names"]
    else:
        motif_names = [simulation_parameters["motif_name"]]
    # Two-stage split: test set first, then validation from the remainder.
    train_sequences, test_sequences, y_train, y_test = train_test_split(
        sequences, y, test_size=test_set_size)
    train_sequences, valid_sequences, y_train, y_valid = train_test_split(
        train_sequences, y_train, test_size=validation_set_size)
    X_train = one_hot_encode(train_sequences)
    X_valid = one_hot_encode(valid_sequences)
    X_test = one_hot_encode(test_sequences)
    return Data(X_train, X_valid, X_test, y_train, y_valid, y_test,
                motif_names)
def get_train_valid_test_data(simulation_func, prefix=None, test_size=0.2,
                              valid_size=0.2, **kwargs):
    """Simulate (or load cached) data and return encoded train/valid/test splits.

    Parameters
    ----------
    simulation_func : callable
        Called as ``simulation_func(**kwargs)``; must return
        ``(sequences, y)``.
    prefix : str or None
        When not None, enables an ``.npz`` cache whose filename encodes the
        sorted kwargs; a cache hit skips simulation entirely.
    test_size, valid_size : float
        Fractions passed to ``train_test_split`` for the two-stage split.

    Returns
    -------
    tuple
        ``(X_train, X_valid, X_test, y_train, y_valid, y_test)`` with
        one-hot-encoded sequences.
    """
    # Deterministic cache filename derived from the sorted kwargs.
    simulation_fname = ''.join('{}{}'.format(key, val)
                               for key, val in sorted(kwargs.items()))
    simulation_fname = "{}{}.npz".format(prefix, simulation_fname)
    if prefix is not None:
        try:
            logger.debug("Checking for simulation data file {}...".format(
                simulation_fname))
            data = np.load(simulation_fname)
            logger.debug("{} found. Loaded simulation data successfully!".format(
                simulation_fname))
            return (data['X_train'], data['X_valid'], data['X_test'],
                    data['y_train'], data['y_valid'], data['y_test'])
        except Exception:
            # Bug fix: was a bare ``except:``, which also caught
            # KeyboardInterrupt/SystemExit. Cache miss -> fall through.
            logger.debug("{} not found. Simulating data..".format(
                simulation_fname))
    sequences, y = simulation_func(**kwargs)
    # Split off the test set first, then a validation set from the remainder.
    train_sequences, test_sequences, y_train, y_test = train_test_split(
        sequences, y, test_size=test_size)
    train_sequences, valid_sequences, y_train, y_valid = train_test_split(
        train_sequences, y_train, test_size=valid_size)
    X_train = one_hot_encode(train_sequences)
    X_valid = one_hot_encode(valid_sequences)
    X_test = one_hot_encode(test_sequences)
    if prefix is not None:
        # Bug fix: the message was missing its ``{}`` placeholder, so it
        # always printed the literal text "simulation_fname".
        logger.debug("Saving simulated data to {}...".format(simulation_fname))
        np.savez_compressed(simulation_fname, X_train=X_train,
                            X_valid=X_valid, X_test=X_test, y_train=y_train,
                            y_valid=y_valid, y_test=y_test)
    return (X_train, X_valid, X_test, y_train, y_valid, y_test)
def get_train_valid_test_data(simulation_func, prefix=None, test_size=0.2,
                              valid_size=0.2, **kwargs):
    """Simulate (or load cached) data with embeddings; return encoded splits.

    ``simulation_func(**kwargs)`` must return ``(sequences, y, embeddings)``.
    The embeddings are split alongside the sequences and saved to the cache,
    but — matching the original behavior — they are NOT part of the return
    value. When *prefix* is not None an ``.npz`` cache keyed on the sorted
    kwargs is consulted first and written after simulation.

    Returns ``(X_train, X_valid, X_test, y_train, y_valid, y_test)``.
    """
    # Deterministic cache filename derived from the sorted kwargs.
    simulation_fname = ''.join('{}{}'.format(key, val)
                               for key, val in sorted(kwargs.items()))
    simulation_fname = "{}{}.npz".format(prefix, simulation_fname)
    if prefix is not None:
        try:
            logger.debug("Checking for simulation data file {}...".format(
                simulation_fname))
            data = np.load(simulation_fname)
            logger.debug(
                "{} found. Loaded simulation data successfully!".format(
                    simulation_fname))
            return (data['X_train'], data['X_valid'], data['X_test'],
                    data['y_train'], data['y_valid'], data['y_test'])
        except Exception:
            # Bug fix: was a bare ``except:``, which also swallowed
            # KeyboardInterrupt/SystemExit. Any load failure means simulate.
            logger.debug(
                "{} not found. Simulating data..".format(simulation_fname))
    sequences, y, embeddings = simulation_func(**kwargs)
    # Two-stage split keeps sequences, embeddings and labels aligned.
    (train_sequences, test_sequences, train_embeddings, test_embeddings,
     y_train, y_test) = train_test_split(sequences, embeddings, y,
                                         test_size=test_size)
    (train_sequences, valid_sequences, train_embeddings, valid_embeddings,
     y_train, y_valid) = train_test_split(train_sequences, train_embeddings,
                                          y_train, test_size=valid_size)
    X_train = one_hot_encode(train_sequences)
    X_valid = one_hot_encode(valid_sequences)
    X_test = one_hot_encode(test_sequences)
    if prefix is not None:
        # Bug fix: the message lacked its ``{}`` placeholder, so the filename
        # was never interpolated into the log line.
        logger.debug("Saving simulated data to {}...".format(simulation_fname))
        np.savez_compressed(simulation_fname, X_train=X_train,
                            X_valid=X_valid, X_test=X_test,
                            train_embeddings=train_embeddings,
                            valid_embeddings=valid_embeddings,
                            test_embeddings=test_embeddings, y_train=y_train,
                            y_valid=y_valid, y_test=y_test)
    return (X_train, X_valid, X_test, y_train, y_valid, y_test)
def run(use_deep_CNN, use_RNN, label, golden_first_sequence, golden_results):
    """Train a small SequenceDNN on simulated SPI1 data and compare both the
    first simulated sequence and the test metrics against golden values."""
    seq_length = 100
    num_sequences = 200
    num_positives = 100
    num_negatives = num_sequences - num_positives
    GC_fraction = 0.4
    test_fraction = 0.2
    num_epochs = 1
    sequences, labels, embeddings = simulate_single_motif_detection(
        'SPI1_disc1', seq_length, num_positives, num_negatives, GC_fraction)
    # Guard against upstream changes in the simulator's output stream.
    assert sequences[0] == golden_first_sequence, \
        'first sequence = {}, golden = {}'.format(
            sequences[0], golden_first_sequence)
    encoded = one_hot_encode(sequences)
    X_train, X_test, y_train, y_test = train_test_split(
        encoded, labels, test_size=test_fraction)
    # Augment the training set with reverse complements, then shuffle.
    X_train = np.concatenate((X_train, reverse_complement(X_train)))
    y_train = np.concatenate((y_train, y_train))
    order = np.arange(len(X_train))
    np.random.shuffle(order)
    X_train, y_train = X_train[order], y_train[order]
    hyperparameters = dict(seq_length=seq_length, use_RNN=use_RNN,
                           num_filters=(45,), pool_width=25, conv_width=(10,),
                           L1=0, dropout=0.2, num_epochs=num_epochs)
    if use_deep_CNN:
        hyperparameters['num_filters'] = (45, 50, 50)
        hyperparameters['conv_width'] = (10, 8, 5)
    if use_RNN:
        hyperparameters['GRU_size'] = 35
        hyperparameters['TDD_size'] = 45
    model = SequenceDNN(**hyperparameters)
    model.train(X_train, y_train, validation_data=(X_test, y_test))
    results = model.test(X_test, y_test).results[0]
    assert np.allclose(tuple(results.values()), tuple(golden_results.values())), \
        '{}: result = {}, golden = {}'.format(label, results, golden_results)
def run(use_deep_CNN, use_RNN, label, golden_results):
    """Smoke-test a SequenceDNN on uniformly random sequences with random
    binary labels, comparing the test metrics against golden values."""
    seq_length = 100
    num_sequences = 200
    test_fraction = 0.2
    num_epochs = 1
    # Random ACGT strings; labels form a (num_sequences, 1) boolean column.
    sequences = np.array([''.join(random.choice('ACGT')
                                  for _ in range(seq_length))
                          for _ in range(num_sequences)])
    labels = np.random.choice((True, False), size=num_sequences)[:, None]
    encoded = one_hot_encode(sequences)
    X_train, X_test, y_train, y_test = train_test_split(
        encoded, labels, test_size=test_fraction)
    # Double the training set with reverse complements, then shuffle.
    X_train = np.concatenate((X_train, reverse_complement(X_train)))
    y_train = np.concatenate((y_train, y_train))
    order = np.arange(len(X_train))
    np.random.shuffle(order)
    X_train, y_train = X_train[order], y_train[order]
    hyperparameters = dict(seq_length=seq_length, use_RNN=use_RNN,
                           num_filters=(45,), pool_width=25, conv_width=(10,),
                           L1=0, dropout=0.2, num_epochs=num_epochs)
    if use_deep_CNN:
        hyperparameters['num_filters'] = (45, 50, 50)
        hyperparameters['conv_width'] = (10, 8, 5)
    if use_RNN:
        hyperparameters['GRU_size'] = 35
        hyperparameters['TDD_size'] = 45
    model = SequenceDNN(**hyperparameters)
    model.train(X_train, y_train, validation_data=(X_test, y_test))
    results = model.test(X_test, y_test).results[0]
    assert np.allclose(tuple(results.values()), tuple(golden_results.values())), \
        '{}: result = {}, golden = {}'.format(label, results, golden_results)
def run(use_deep_CNN, use_RNN, label, golden_results):
    """Deterministically simulate SPI1 data (both RNGs seeded), train a small
    SequenceDNN, and compare its test metrics against golden values."""
    import random
    # Seed both RNGs before any data generation so the run is reproducible.
    np.random.seed(1)
    random.seed(1)
    from dragonn.models import SequenceDNN
    from simdna.simulations import simulate_single_motif_detection
    from dragonn.utils import one_hot_encode, reverse_complement
    from sklearn.cross_validation import train_test_split
    seq_length = 50
    num_sequences = 100
    num_positives = 50
    num_negatives = num_sequences - num_positives
    GC_fraction = 0.4
    test_fraction = 0.2
    validation_fraction = 0.2
    num_epochs = 1
    sequences, labels = simulate_single_motif_detection(
        'SPI1_disc1', seq_length, num_positives, num_negatives, GC_fraction)
    encoded = one_hot_encode(sequences)
    # Two-stage split: test set first, then validation from the remainder.
    X_train, X_test, y_train, y_test = train_test_split(
        encoded, labels, test_size=test_fraction)
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_train, y_train, test_size=validation_fraction)
    # Augment the training set with reverse complements, then shuffle.
    X_train = np.concatenate((X_train, reverse_complement(X_train)))
    y_train = np.concatenate((y_train, y_train))
    order = np.arange(len(X_train))
    np.random.shuffle(order)
    X_train, y_train = X_train[order], y_train[order]
    hyperparameters = dict(seq_length=seq_length, use_RNN=use_RNN,
                           num_filters=(45,), pool_width=25, conv_width=(10,),
                           L1=0, dropout=0.2, num_epochs=num_epochs)
    if use_deep_CNN:
        hyperparameters['num_filters'] = (45, 50, 50)
        hyperparameters['conv_width'] = (10, 8, 5)
    if use_RNN:
        hyperparameters['GRU_size'] = 35
        hyperparameters['TDD_size'] = 45
    model = SequenceDNN(**hyperparameters)
    model.train(X_train, y_train, validation_data=(X_valid, y_valid))
    results = model.test(X_test, y_test).results[0]
    assert np.allclose(tuple(results.values()), tuple(golden_results.values())), \
        '{}: result = {}, golden = {}'.format(label, results, golden_results)
def run(use_deep_CNN, use_RNN, label, golden_results):
    """Train a SequenceDNN on purely random sequences/labels and assert that
    its test metrics match the golden values for this configuration."""
    seq_length = 100
    num_sequences = 200
    test_fraction = 0.2
    num_epochs = 1

    def random_sequence():
        # One uniformly random DNA string of length seq_length.
        return ''.join(random.choice('ACGT') for _ in range(seq_length))

    sequences = np.array([random_sequence() for _ in range(num_sequences)])
    labels = np.random.choice((True, False), size=num_sequences)[:, None]
    encoded_sequences = one_hot_encode(sequences)
    X_train, X_test, y_train, y_test = train_test_split(
        encoded_sequences, labels, test_size=test_fraction)
    # Reverse-complement augmentation of the training split, then shuffle.
    X_train = np.concatenate((X_train, reverse_complement(X_train)))
    y_train = np.concatenate((y_train, y_train))
    shuffle_idx = np.arange(len(X_train))
    np.random.shuffle(shuffle_idx)
    X_train = X_train[shuffle_idx]
    y_train = y_train[shuffle_idx]
    hyperparameters = {'seq_length': seq_length,
                       'use_RNN': use_RNN,
                       'num_filters': (45,),
                       'pool_width': 25,
                       'conv_width': (10,),
                       'L1': 0,
                       'dropout': 0.2,
                       'num_epochs': num_epochs}
    if use_deep_CNN:
        hyperparameters.update(num_filters=(45, 50, 50),
                               conv_width=(10, 8, 5))
    if use_RNN:
        hyperparameters.update(GRU_size=35, TDD_size=45)
    model = SequenceDNN(**hyperparameters)
    model.train(X_train, y_train, validation_data=(X_test, y_test))
    results = model.test(X_test, y_test).results[0]
    assert np.allclose(tuple(results.values()), tuple(golden_results.values())), \
        '{}: result = {}, golden = {}'.format(label, results, golden_results)
import tflearn as tfl
import tensorflow as tft
from sklearn.cross_validation import train_test_split
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.conv import conv_2d, max_pool_2d
from tflearn.layers.normalization import local_response_normalization
from tflearn.layers.estimator import regression
from dragonn.simulations import *
from dragonn.utils import one_hot_encode, get_motif_scores, reverse_complement
from tflearn.models.dnn import DNN

# DATASET MANIPULATION & CREATION =============================================
# Create dataset
# Simulate CTCF_known2 motif-detection data and one-hot encode it.
# NOTE(review): the positional arguments (1000, 5000, 5000, 0.4) follow
# simulate_single_motif_detection's signature — confirm their meaning against
# dragonn.simulations; it is not visible from this chunk.
X, y = simulate_single_motif_detection('CTCF_known2', 1000, 5000, 5000, 0.4)
X = one_hot_encode(X)
# Shuffle X and y with the same permutation so pairs stay aligned.
# NOTE(review): `np` is not imported in this chunk — presumably provided by
# `from dragonn.simulations import *` or earlier in the file; verify.
random_order = np.arange(len(X))
np.random.shuffle(random_order)
X, y = X[random_order], y[random_order]
# Split dataset into a training and test set
test_fraction = validation_fraction = 0.2
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=test_fraction)
# Concatenate reverse complements
# Both splits are doubled; labels are duplicated to stay aligned with the
# appended reverse-complemented examples.
X_train = np.concatenate((X_train, reverse_complement(X_train)))
y_train = np.concatenate((y_train, y_train))
X_test = np.concatenate((X_test, reverse_complement(X_test)))
y_test = np.concatenate((y_test, y_test))
# Reshape dataset per Tensorflow convention: [# examples, height, width, channels]
# Split fractions and run configuration for the training script below.
# NOTE(review): seq_length, num_positives, num_negatives and GC_fraction are
# defined earlier in the file (not visible in this chunk).
test_fraction = 0.2
validation_fraction = 0.2
do_hyperparameter_search = False
num_hyperparameter_trials = 50
num_epochs = 100
use_deep_CNN = False
use_RNN = False
print('Generating sequences...')
sequences, labels = simulate_single_motif_detection(
    'SPI1_disc1', seq_length, num_positives, num_negatives, GC_fraction)
print('One-hot encoding sequences...')
encoded_sequences = one_hot_encode(sequences)
print('Getting motif scores...')
# Scores for the same motif that was embedded, per get_motif_scores.
motif_scores = get_motif_scores(encoded_sequences, motif_names=['SPI1_disc1'])
print('Partitioning data into training, validation and test sets...')
# Two-stage split: test set first, then validation from the remainder.
X_train, X_test, y_train, y_test = train_test_split(encoded_sequences, labels,
                                                    test_size=test_fraction)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=validation_fraction)
print('Adding reverse complements...')
# Training inputs are doubled with reverse complements; labels duplicated to
# keep X_train and y_train aligned.
X_train = np.concatenate((X_train, reverse_complement(X_train)))
y_train = np.concatenate((y_train, y_train))
# imports import matplotlib.pyplot as plt import seaborn as sns import numpy as np import tensorflow as tf from sklearn.cross_validation import train_test_split from dragonn.simulations import simulate_single_motif_detection from dragonn.utils import reverse_complement, one_hot_encode # generating data ## simulation of CTCF data fullX, fullY = simulate_single_motif_detection('CTCF_known2', 500, 5000, 5000, 0.4) encodedX = one_hot_encode(fullX) ## random shuffle data random_order = np.arange(len(encodedX)) np.random.shuffle(random_order) X, y = encodedX[random_order], fullY[random_order] ## split datasets at 20:80 test:train, 20:80 valid:train split_fraction = 0.2 X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=split_fraction) X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=split_fraction) ## Tack on reverse complements X_train = np.concatenate((X_train, reverse_complement(X_train))) X_valid = np.concatenate((X_valid, reverse_complement(X_valid))) X_test = np.concatenate((X_test, reverse_complement(X_test)))