Example #1
def test_init_model():
    # create neural net
    nn = NN.NeuralNetwork(sizes=[3, 2, 1])

    # test that weights are the correct sizes
    assert nn.weights[0].shape == (2, 3)
    assert nn.weights[1].shape == (1, 2)

    # test that biases are the correct sizes
    assert len(nn.bias) == 3

    # create neural net
    nn2 = NN.NeuralNetwork(sizes=[1, 2, 3])

    # test that weights are the correct sizes
    assert nn2.weights[0].shape == (2, 1)
    assert nn2.weights[1].shape == (3, 2)

    # test that biases are the correct sizes
    assert len(nn2.bias) == 3

    # create 4-layer neural net
    nn3 = NN.NeuralNetwork(sizes=[1, 2, 3, 4])

    # test that weights are the correct sizes
    assert nn3.weights[0].shape == (2, 1)
    assert nn3.weights[1].shape == (3, 2)
    assert nn3.weights[2].shape == (4, 3)

    # test that biases are the correct sizes
    assert len(nn3.bias) == 4
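The shape assertions above imply that weights[i] is stored as (sizes[i+1], sizes[i]) and that one bias entry is kept per layer in sizes. A minimal constructor consistent with those checks might look like the sketch below; the real NN.NeuralNetwork internals are not shown in the example, so the names and initialization scheme here are assumptions.

import numpy as np

class NeuralNetwork:
    """Sketch only: one (n_out, n_in) weight matrix per consecutive layer pair."""

    def __init__(self, sizes, seed=0):
        rng = np.random.default_rng(seed)
        self.sizes = sizes
        # weights[i] maps layer i (n_in units) onto layer i+1 (n_out units)
        self.weights = [rng.standard_normal((n_out, n_in))
                        for n_in, n_out in zip(sizes[:-1], sizes[1:])]
        # the tested class appears to keep len(bias) == len(sizes)
        self.bias = [np.zeros((n, 1)) for n in sizes]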
Example #2
def test_others():
	a = NN.oneHotDNA("AACGT", flatten = False)
	assert (a == np.array([[1., 0., 0., 0.],
       				   [1., 0., 0., 0.],
       				   [0., 1., 0., 0.],
       				   [0., 0., 1., 0.],
       				   [0., 0., 0., 1.]])).all(), "Failing to one-hot-encode"
	a = NN.oneHotDNA("AACGT", flatten = True)
	assert (a == np.array([1., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1.])).all(), "Failing to encode + flatten"
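The expected arrays use the column order A, C, G, T, with an optional flattening of the (len(seq), 4) matrix into a 1-D vector. A small stand-in for NN.oneHotDNA, consistent with these assertions, could be sketched as follows (the real implementation is not included in the example):

import numpy as np

BASE_TO_COLUMN = {"A": 0, "C": 1, "G": 2, "T": 3}

def one_hot_dna(seq, flatten=False):
    """Encode a DNA string as one-hot rows of shape (len(seq), 4); optionally flatten to 1-D."""
    encoded = np.zeros((len(seq), 4))
    for i, base in enumerate(seq.upper()):
        encoded[i, BASE_TO_COLUMN[base]] = 1.0
    return encoded.ravel() if flatten else encoded

# one_hot_dna("AACGT", flatten=True) reproduces the 20-element vector asserted above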
Example #3
def test_nn():
    X, labels = make_blobs(n_samples=200, n_features=8, centers=4)
    N = io.norm(X)

    net = nn.NeuralNetwork(layers=[(8, None), (4, nn.Sigmoid),
                                   (8, nn.Sigmoid)],
                           learning_rate=0.8)
    Loss = nn.MSE()

    net.fit(N, N, Loss)
    Y = net.predict(N)

    assert np.isnan(Y).sum() == 0
Example #4
def test_fit():
    # create neural net
    # use an autoencoder here
    nn = NN.NeuralNetwork(sizes=[3, 1, 3], seed=0)

    # test that weights are the correct sizes
    assert (nn.weights[0].shape == (1, 3))
    assert (nn.weights[1].shape == (3, 1))

    # test that biases are the correct sizes
    assert len(nn.bias) == 3

    # create fake dataset
    test_x = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
    test_y = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])

    # store the current weights & bias to compare to after fitting
    w = [a for a in nn.weights[0][0]]
    b = [a for a in nn.bias]

    # fit neural net to test_x, test_y
    nn.fit(test_x, test_y)

    # check that layers are being imported correctly
    assert (nn.layers[0] == test_x).all()
    assert (nn.layers[1].shape == (1, 3))
    assert (nn.layers[-1].shape == (3, 3))

    # check that the weights have been updated
    assert ~(nn.weights[0] == np.array([w])).all()
    assert ~(nn.bias == np.array([b])).all()
Example #5
def test_NN():
	X_train = np.asarray(
           [[2.7810836,2.550537003], [1.465489372,2.362125076], [3.396561688,4.400293529],
           [1.38807019,1.850220317],[3.06407232,3.005305973], [7.627531214,2.759262235],
           [5.332441248,2.088626775], [6.922596716,1.77106367],[8.675418651,-0.242068655],
           [7.673756466,3.508563011]]
        )

	y_train = np.asarray([[0], [0], [0],
                    [0], [0], [1],
                    [1], [1], [1], 
                    [1]])

	X_test = np.asarray([[1.235425, 2.121455],  # should be 0 with the lower scores
                    [6.1234, 2.1234]])  #should be 1 with higher scores
	#test comes from:
	#https://machinelearningmastery.com/implement-backpropagation-algorithm-scratch-python/
            
	amanda_auto  = NN.NeuralNetwork(setup_nodes = (2, 3, 1), 
                             activation = "sigmoid", seed=1)
	amanda_auto.fit(input_data=X_train, 
                expected_output = y_train, #here, training is the same as our hopeful prediction
                epochs = 100, 
                learning_rate = 20,
                verbose=False)
	predict = amanda_auto.predict(X_test, task="round")
	assert (predict == np.asarray([[0],[1]])).all() , "Fail 15"
Example #6
def test_binary_classifier():
    '''
    This function tests the Rap1 Binary Classifier by ensuring that the
    training loss never increases and that the weight matrices change
    during training.
    '''
    # Reading in positive sequences
    positive_seqs = io.read_text_file('./data/rap1-lieb-positives.txt')
    # Reading in upstream yeast sequences
    pot_neg_seqs = io.read_fasta_file('./data/yeast-upstream-1k-negative.fa')
    # Randomly sampling negative sequences
    negative_seqs = preprocess.sample_negative_examples(pot_neg_seqs,
                                                        positive_seqs,
                                                        num_samples=1000,
                                                        seq_length=17)
    # Generating labels
    pos_labels = np.expand_dims(np.array([1] * len(positive_seqs)), axis=0)
    neg_labels = np.expand_dims(np.array([0] * len(negative_seqs)), axis=0)
    labels = np.swapaxes(np.hstack([pos_labels, neg_labels]), 0, 1)
    # Generating Unraveled One Hot Encoded Input Feature --> 4*seq_length
    pos_input = preprocess.encode_seqs(positive_seqs)
    neg_input = preprocess.encode_seqs(negative_seqs)
    # Generating full inputs to network
    inputs = np.vstack([pos_input, neg_input])
    # Defining necessary hyperparameters for the NN model
    nn_architecture = [{
        'input_dim': 68,
        'output_dim': 2,
        'activation': 'relu'
    }, {
        'input_dim': 2,
        'output_dim': 1,
        'activation': 'sigmoid'
    }]
    loss_function = 'binary_crossentropy'
    learning_rate = 2
    seed = 14
    epochs = 1000
    early_stop = [20, -1e-12]
    # Generating NN class instance for binary classifier
    bc_nn = NN.NeuralNetwork(nn_architecture,
                             lr=learning_rate,
                             seed=seed,
                             epochs=epochs,
                             loss_function=loss_function)
    # saving initial weights to make sure they are changing
    init_w1 = bc_nn.param_dict['W1'].copy()
    init_w2 = bc_nn.param_dict['W2'].copy()
    # Doing a single-fold training split
    X_train, X_val, y_train, y_val = preprocess.split_basic_binary(
        inputs, labels, split=[0.8, 0.2], shuffle=True)
    # Training Model
    per_epoch_loss_train, per_epoch_loss_val, per_epoch_acc_train, per_epoch_acc_val = bc_nn.fit(
        X_train, y_train, X_val, y_val, early_stop=early_stop)
    ### ensuring that training loss always decreases
    assert np.sum(np.diff(per_epoch_loss_train) > 0) == 0
    ### ensuring that weight matrices change while training
    final_w1 = bc_nn.param_dict['W1']
    final_w2 = bc_nn.param_dict['W2']
    assert np.sum(init_w1 - final_w1) != 0
    assert np.sum(init_w2 - final_w2) != 0
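The check np.sum(np.diff(per_epoch_loss_train) > 0) == 0 encodes "the training loss never increases between consecutive epochs": np.diff computes the epoch-to-epoch changes, and the sum counts how many of them are positive. A toy illustration:

import numpy as np

non_increasing = np.array([0.9, 0.5, 0.5, 0.1])
bumps_up_once = np.array([0.9, 0.5, 0.6, 0.1])

assert np.sum(np.diff(non_increasing) > 0) == 0  # no epoch where the loss went up
assert np.sum(np.diff(bumps_up_once) > 0) == 1   # exactly one increase between epochs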
Example #7
def simple_network():
    """A very simple neural network that is useful for tests below"""
    setup = [8,3,1]
    alpha = 0.5
    lamba = 0 
    simple_NN = NN.NeuralNetwork(setup,NN.activation,alpha,lamba)
    
    return simple_NN
Example #8
def test_activation_sigmoid():
    a = nn.Sigmoid()
    test = np.random.random((1, 100))
    test_activation = a.activation(test)
    test_derivative = a.derivative(test)

    assert np.isnan(test_activation).sum() == 0
    assert np.isnan(test_derivative).sum() == 0
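The test only checks that Sigmoid.activation and Sigmoid.derivative return finite values on random input. A typical implementation of such a class (shown here as a reference sketch, not taken from the tested module) is:

import numpy as np

class Sigmoid:
    """Logistic activation: s(x) = 1 / (1 + exp(-x))."""

    def activation(self, x):
        return 1.0 / (1.0 + np.exp(-x))

    def derivative(self, x):
        s = self.activation(x)
        return s * (1.0 - s)  # d/dx sigmoid(x) = sigmoid(x) * (1 - sigmoid(x))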
Example #9
def test_encoder():
	amanda_auto  = NN.NeuralNetwork(setup_nodes = (8, 3, 8), activation = "sigmoid", seed=1)
	inputs = np.random.randint(2, size=(1,8))
	amanda_auto.fit(input_data=inputs, 
                expected_output = inputs, #here, training is the same as our hopeful prediction
                epochs = 10, 
                learning_rate = 20,
                verbose=False) # don't want the output to be messy
	assert amanda_auto.final_score < 0.1, "Fail 13"
Example #10
def test_activation_free():
    a = nn.Free()
    test = np.random.random((1, 100))
    test_activation = a.activation(test)
    test_derivative = a.derivative(test)

    assert np.sum(test - test_activation) == 0
    assert np.unique(test_derivative).size == 1
    assert np.unique(test_derivative)[0] == 1
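The assertions require that Free passes its input through unchanged and has a derivative that is 1 everywhere, i.e. a linear/identity activation. A matching sketch (again an assumption, since nn.Free itself is not shown):

import numpy as np

class Free:
    """Identity (linear) activation."""

    def activation(self, x):
        return x  # unchanged, so np.sum(test - activation(test)) == 0

    def derivative(self, x):
        return np.ones_like(x)  # constant 1, so np.unique(...) has the single value 1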
Example #11
def test_CE():
    Loss = nn.CE()
    test_x = np.random.random((3, 100))
    test_y = np.random.random((3, 100))

    test_cost = Loss.loss(test_x, test_y)
    test_derivative = Loss.derivative(test_x, test_y)

    assert np.isnan(test_cost).sum() == 0
    assert np.isnan(test_derivative).sum() == 0
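The test only verifies that the cross-entropy cost and its derivative stay finite for inputs drawn from (0, 1). For reference, a common binary cross-entropy formulation looks like the sketch below; which argument nn.CE treats as the prediction is not shown in the example, so the ordering here is an assumption.

import numpy as np

class CE:
    """Binary cross-entropy between predictions y_hat in (0, 1) and targets y."""

    def loss(self, y_hat, y):
        eps = 1e-12  # guards against log(0)
        return -np.mean(y * np.log(y_hat + eps) + (1 - y) * np.log(1 - y_hat + eps))

    def derivative(self, y_hat, y):
        eps = 1e-12
        # elementwise derivative of the cross-entropy with respect to y_hat
        return (y_hat - y) / ((y_hat + eps) * (1 - y_hat + eps))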
Example #12
def test_auto():
    x = np.identity(8)
    y = np.identity(8)
    NN_auto = NN.Neural_Network(input_layer_size=8,
                                hidden_layer_size=3,
                                output_layer_size=8,
                                Lambda=2e-6)
    NN_auto.train(x, y, 10000, 0.45)
    predict = NN_auto.forward(x)
    assert np.array_equal(x, predict.round())
Example #13
def test_encoder():
    """
	Can my 8 x 3 x 8 autoencoder learn to recreate 8 x 8 identity matrix
	"""
    my_NN = NN.NeuralNetwork(lr=0.1, batch_size=1)

    id_mat = np.identity(8)
    training(my_NN, id_mat, id_mat, 1000, verbose=True)

    predicted = my_NN.predict(id_mat)

    assert (id_mat - predicted).mean() < 0.005
Example #14
def test_predict():
    # create neural net
    nn = NN.NeuralNetwork(sizes=[2, 1, 1], seed=0, alpha=1)

    # create fake dataset of various sample sizes
    test_x = np.array([[1, 0], [0, 1]])
    test_x2 = np.array([[1, 0], [0, 1], [0, 1]])
    test_x3 = np.array([[1, 0], [0, 1], [0, 1], [0, 1]])

    # check that the predictions are the right shape
    assert (nn.predict(test_x).shape == (2, 1))
    assert (nn.predict(test_x2).shape == (3, 1))
    assert (nn.predict(test_x3).shape == (4, 1))
Example #15
def test_read_fasta():

    # read in fasta files
    pos = NN.read_fasta("./data/rap1-lieb-positives.txt")
    neg = NN.read_fasta("./data/yeast-upstream-1k-negative.fa")

    # testing the first and last sequences were read in correctly
    assert pos[0] == 'ACATCCGTGCACCTCCG'
    assert pos[-1] == 'ACACCCATACACCAAAC'

    # testing that the number of negative sequences were read in correctly
    # I would do the same thing as the positive samples but the sequences are too long
    assert len(neg.keys()) == 3147

    # testing that the sequences read in are the correct length
    assert len(list(neg.keys())[0]) == 1000
    assert len(list(neg.keys())[-1]) == 1000

    # testing that the sequences were mapped to the correct fasta header
    assert neg[list(neg.keys(
    ))[0]] == ">YAL003W 5' untranslated region, chrI 141172 - 142171, 1000 bp"
    assert neg[list(
        neg.keys()
    )[-1]] == ">YPR202W 5' untranslated region, chrXVI 942027 - 943026, 1000 bp"
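The assertions show that, for FASTA input, this read_fasta returns a dictionary keyed by the (1000 bp) sequence, with the original header line (including the leading '>') as the value. A sketch of that branch is given below; note that the tested helper also accepts a plain text file of sequences (the positives), which this sketch does not reproduce.

def read_fasta(path):
    """Sketch: map each FASTA sequence to its header line."""
    seq_to_header = {}
    header, chunks = None, []
    with open(path) as handle:
        for line in handle:
            line = line.strip()
            if line.startswith(">"):
                if header is not None:
                    seq_to_header["".join(chunks)] = header
                header, chunks = line, []
            else:
                chunks.append(line)
    if header is not None:  # flush the final record
        seq_to_header["".join(chunks)] = header
    return seq_to_header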
Example #16
def test_autoencoder():
    '''
    This function tests the AutoEncoder by generating and training the
    autoencoder and ensuring that the output dimensions are correct and
    that the loss is always decreasing. This also tests that the NN
    architecture is correct and weight matrices are the right size.
    '''
    # Defining necessary hyperparameters for the autoencoder
    nn_architecture = [{
        'input_dim': 8,
        'output_dim': 3,
        'activation': 'sigmoid'
    }, {
        'input_dim': 3,
        'output_dim': 8,
        'activation': 'sigmoid'
    }]
    loss_function = 'mse'
    learning_rate = 10
    seed = 20
    epochs = 10000
    # Generating autoencoder NN class instance
    identity_ae = NN.NeuralNetwork(nn_architecture,
                                   lr=learning_rate,
                                   seed=seed,
                                   epochs=epochs,
                                   loss_function=loss_function)
    # Defining data for use in the autoencoder
    X = np.eye(8, 8)
    y = X
    # Training the Auto Encoder
    per_epoch_loss_train, per_epoch_loss_val, _, _ = identity_ae.fit(
        X, y, X, y)
    recon = identity_ae.predict(X[:2, :])
    ### ensuring that training loss always decreases
    assert np.sum(np.diff(per_epoch_loss_train) > 0) == 0
    ### ensuring that the output predicted reconstruction has the right shape
    assert recon.shape == (2, 8)
    ### testing nn architecture is correct
    assert identity_ae.arch == nn_architecture
    ### testing weight matrices are the correct shape
    assert identity_ae.param_dict['W1'].shape == (3, 8)
    assert identity_ae.param_dict['W2'].shape == (8, 3)
    ### testing bias matrices are the correct shape
    assert identity_ae.param_dict['b1'].shape == (3, 1)
    assert identity_ae.param_dict['b2'].shape == (8, 1)
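The weight and bias shape assertions imply that each layer's parameters are stored as W{i} with shape (output_dim, input_dim) and b{i} with shape (output_dim, 1). A parameter initializer consistent with that convention could be sketched as follows (the actual constructor of NN.NeuralNetwork is not part of the example):

import numpy as np

def init_param_dict(nn_architecture, seed=20):
    """Build W{i}/b{i} entries shaped (output_dim, input_dim) and (output_dim, 1)."""
    rng = np.random.default_rng(seed)
    param_dict = {}
    for i, layer in enumerate(nn_architecture, start=1):
        param_dict[f"W{i}"] = 0.1 * rng.standard_normal((layer["output_dim"], layer["input_dim"]))
        param_dict[f"b{i}"] = np.zeros((layer["output_dim"], 1))
    return param_dict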
Example #17
def test_backprop():
    """This function test if backprop produces matrices in the correct list orientation and dimensions. Doesn't
    actually check if the values are correct given the complex calculations. Does so in the auto-encoder context."""
    setup = [8,3,8]
    alpha = 0.5
    lamba = 0 
    simple_NN = NN.NeuralNetwork(setup,NN.activation,alpha,lamba)
    simple_NN.get_single_input([1,0,0,0,0,0,0,0])
    simple_NN.feedforward()
    Ws, bs = simple_NN.backprop(simple_NN.edge_matrices,simple_NN.biases,simple_NN.input_layer,simple_NN.layer_z,simple_NN.layer_a)
    
    #test the dimensions of the returned partial W and partial B matrices
    assert len(Ws[0]) == 3
    assert len(Ws[1]) == 8
    assert len(Ws[0][0]) == 8
    assert len(Ws[1][0]) == 3
    assert len(bs[0]) == 3
    assert len(bs[1]) == 8
Example #18
def autoencoder(plot=True):
    """
	Can my 8 x 3 x 8 autoencoder learn to recreate 8 x 8 identity matrix
	"""
    my_NN = NN.NeuralNetwork(lr=0.1, batch_size=1)

    id_mat = np.identity(8)
    training(my_NN, id_mat, id_mat, 100000, verbose=True)

    predicted = my_NN.predict(id_mat)
    print(predicted)

    if plot:
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(22, 10))
        sns.heatmap(id_mat, ax=ax1)
        sns.heatmap(predicted, ax=ax2)
        ax1.set_title("Input")
        ax2.set_title("Reconstructed")
        plt.savefig('id_matrix_reconstruction.png', dpi=200)
Example #19
def test_encoder():

    # test that each letter is being properly encoded
    assert (NN.encode("A") == [1, 0, 0, 0]).all()
    assert (NN.encode("C") == [0, 1, 0, 0]).all()
    assert (NN.encode("G") == [0, 0, 1, 0]).all()
    assert (NN.encode("T") == [0, 0, 0, 1]).all()

    # test that the encoder is giving back the right number of values
    assert (len(NN.encode("acgt")) == 16)

    # test that a sequence returns the right encoded values
    assert ((NN.encode("acgt") == [
        1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1
    ]).all())
Example #20
def test_general():
	amanda_auto  = NN.NeuralNetwork(setup_nodes = (8, 3, 8), activation = "sigmoid", seed=1)

	assert amanda_auto.W1.shape == (8, 3), "Fail 1"
	assert amanda_auto.W2.shape == (3, 8), "Fail 2"
	assert amanda_auto.bias1.shape == (1, 3), "Fail 3"
	assert amanda_auto.bias2.shape == (1, 8), "Fail 4"

	inputs = np.random.randint(2, size=(1,8))
	hidden_z, hidden_activation, output_z, output_activation = amanda_auto.feedforward(inputs, report = True)
	assert hidden_z.shape == (1, 3), "Fail 5"
	assert hidden_activation.shape == (1, 3), "Fail 6"
	assert output_z.shape == (1, 8), "Fail 7"
	assert output_activation.shape == (1, 8), "Fail 8"

	amanda_auto.yhat = output_z
	hidden_weight, output_weight, delta_hidden, delta_out = amanda_auto.backprop(inputs, inputs) #here they're the same thing. 
	assert hidden_weight.shape == (8, 3), "Fail 9"
	assert output_weight.shape == (3, 8), "Fail 10"
	assert delta_hidden.shape == (1, 3), "Fail 11"
	assert delta_out.shape == (1, 8), "Fail 12"
Example #21
def cross_validation(pos, neg, k, reg_term, rounds, learn_rate, num_hidden):
    master = np.append(pos, neg)
    random.shuffle(master)
    plt.figure(figsize=[5, 5])
    size = len(master) // k
    AUCs = []
    MSEs = []
    for i in range(1, k + 1):
        idx1 = (i - 1) * size
        idx2 = i * size
        test = master[idx1:idx2]
        train = np.append(master[:idx1], master[idx2:])
        test = string_to_array(test)
        train = string_to_array(train)

        nn = NN.NeuralNetwork(input=train,
                              test_set=test,
                              output_dim=1,
                              reg_term=reg_term,
                              rounds=rounds,
                              learn_rate=learn_rate,
                              num_hidden=num_hidden)
        err = nn.fit()
        p, mse_test = nn.predict()
        test_e = nn.test_exp

        fpr, tpr, thresh = metrics.roc_curve(test_e, p, pos_label=1)
        AUC = metrics.auc(fpr, tpr)
        plt.plot(fpr, tpr, label=str(i) + ': AUC = ' + str(AUC))
        plt.legend()
        AUCs.append(AUC)
        MSEs.append(mse_test)
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC ~ k-fold Cross Validation")

    return AUCs, MSEs
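For illustration, the helper above could be called as in the sketch below; pos_seqs and neg_seqs stand for sequence collections prepared elsewhere, and every argument value is hypothetical rather than taken from the source.

# Hypothetical usage of cross_validation; all values below are placeholders.
aucs, mses = cross_validation(pos_seqs, neg_seqs,
                              k=5,              # number of folds
                              reg_term=1e-4,    # regularisation strength (assumed)
                              rounds=500,       # training iterations (assumed)
                              learn_rate=0.05,  # assumed
                              num_hidden=10)    # hidden-layer size (assumed)
print("mean AUC:", sum(aucs) / len(aucs))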
Example #22
def main():
    np.random.seed(2)
    #Part 1, Autoencoder Implementation
    Xtrain = np.array([[1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0],
                       [0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0],
                       [0, 0, 0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0],
                       [0, 0, 0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 0, 0, 1]])

    Ytrain = np.array([[1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0],
                       [0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0],
                       [0, 0, 0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0],
                       [0, 0, 0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 0, 0, 1]])

    Test = NN.NeuralNetwork(X=Xtrain,
                            Y=Ytrain,
                            layers=[3, 8],
                            epoch=1000,
                            learningrate=0.1,
                            seed=1)
    Test.runmodel()
    plt.plot(Test.loss_per_epoch)
    plt.title("Training of AutoEncoder on 8x8  Identity Matrix")
    plt.ylabel('Loss(MSE)')
    plt.show()
    exit()

    #Part 2a, first I will load the positive and negative data for training/testing
    neg_seq = io.read_fasta("../data/yeast-upstream-1k-negative.fa")
    pos_seq = io.read_seq_txts("../data/rap1-lieb-positives.txt")

    X, Y = io.get_dataset(pos_seq, neg_seq, 500)

    X_train, X_test, y_train, y_test = train_test_split(X, Y)

    Test = NN.NeuralNetwork(X=X_train,
                            Y=y_train,
                            layers=[17, 4, 1],
                            epoch=1000,
                            learningrate=0.01,
                            seed=80,
                            batch_size=5)
    Test.runmodel()
    prediction = Test.predict(X_test)

    #Part 4a Provide an example of the input and output for one true positive sequence and one true negative sequence
    for i in range(0, len(X_test)):
        print("Input Seq", X_test[i], "actual", y_test[i], "pred",
              prediction[i])

    #Part 4c Stop criterion for convergence in my learned parameters
    Test = NN.NeuralNetwork(X=X_train,
                            Y=y_train,
                            layers=[17, 4, 1],
                            epoch=1000,
                            learningrate=0.01,
                            seed=80,
                            batch_size=5)
    Test.runmodel()
    prediction = Test.predict(X_test)
    fpr, tpr, threshold = roc_curve(y_test, prediction)
    roc_auc = auc(fpr, tpr)
    print(roc_auc)
    plt.plot(Test.loss_per_epoch)
    plt.title(
        "Training for regions of trancription factor binding sites (500 Negative Contols)"
    )
    plt.ylabel('Loss(MSE)')
    plt.show()

    #Part 5, k-fold cross validation
    X, Y = io.get_dataset(pos_seq, neg_seq, 503)
    X, Y = io.get_dataset(pos_seq, neg_seq, 500)
    X_train, X_test, y_train, y_test = train_test_split(X, Y)
    K = [2, 5, 10, 15, 20]
    for k in K:
        print("Running k fold test with a k=", k)
        cv = KFold(n_splits=k, random_state=1, shuffle=True)
        auroc_accuracy = []
        for train, test in cv.split(X):
            # print('train: %s, test: %s' % (X[train].shape, X[test].shape))
            # print('train: %s, test: %s' % (Y[train].shape, Y[test].shape))
            kfold = NN.NeuralNetwork(X=X[train],
                                     Y=Y[train],
                                     layers=[17, 4, 1],
                                     epoch=200,
                                     learningrate=0.01,
                                     seed=420,
                                     batch_size=5)
            kfold.runmodel()
            ypred = kfold.predict(X[test])
            fpr, tpr, threshold = roc_curve(Y[test], ypred)
            roc_auc = auc(fpr, tpr)
            auroc_accuracy.append(roc_auc)
            del kfold, roc_auc, tpr, fpr, ypred
        print("Average AuRoc for k fold trial: ", np.mean(auroc_accuracy))

    X, Y = io.get_dataset(pos_seq, neg_seq, 503)
    k_roc = []

    for i in range(0, 200):
        print(i)
        cv = KFold(n_splits=5, random_state=i, shuffle=True)
        auroc_accuracy = []
        for train, test in cv.split(X):
            kfold = NN.NeuralNetwork(X=X[train],
                                     Y=Y[train],
                                     layers=[17, 4, 1],
                                     epoch=200,
                                     learningrate=0.01,
                                     seed=i,
                                     batch_size=5)
            kfold.runmodel()
            ypred = kfold.predict(X[test])
            fpr, tpr, threshold = roc_curve(Y[test], ypred)
            roc_auc = auc(fpr, tpr)
            auroc_accuracy.append(roc_auc)
        k_roc.append(np.mean(auroc_accuracy))

    print("Mean k=5 fold auroc after 200 iterations", np.mean(k_roc))

    #Part 4 Grid Search Implementation
    X, Y = io.get_dataset(pos_seq, neg_seq, 503)
    learning_rate = [0.01, 0.1, 0.2, 0.5, 1]
    layers = [[50, 17, 1], [10, 4, 1], [17, 4, 1], [10, 1], [4, 1]]
    X_train, X_test, y_train, y_test = train_test_split(X, Y)

    cnt_i = 0
    grid_search = np.zeros((5, 5))
    for i in learning_rate:
        cnt_j = 0
        for j in layers:
            print(cnt_i)
            print(cnt_j)
            kfold = NN.NeuralNetwork(X=X_train,
                                     Y=y_train,
                                     layers=j,
                                     epoch=200,
                                     learningrate=i,
                                     seed=421,
                                     batch_size=5)
            kfold.runmodel()
            ypred = kfold.predict(X_test)
            fpr, tpr, threshold = roc_curve(y_test, ypred)
            roc_auc = auc(fpr, tpr)
            grid_search[cnt_i, cnt_j] = roc_auc
            print(grid_search)
            cnt_j = cnt_j + 1
        cnt_i = cnt_i + 1

    #Testing the updated function
    X, Y = io.get_dataset(pos_seq, neg_seq, 500)

    X_train, X_test, y_train, y_test = train_test_split(X, Y)

    Test = NN.NeuralNetwork(X=X_train,
                            Y=y_train,
                            layers=[17, 4, 1],
                            gradient_descent='stochastic',
                            epoch=10000,
                            learningrate=0.01,
                            seed=10)
    Test.runmodel()

    plt.plot(Test.loss_per_epoch)
    plt.title(
        "Training via stochastic gradient decsent (500 Negative Contols)")
    plt.ylabel('Loss (MSE)')
    plt.show()

    #Test two, training the model with full-batch gradient descent
    X, Y = io.get_dataset(pos_seq, neg_seq, 500)

    X_train, X_test, y_train, y_test = train_test_split(X, Y)

    Test = NN.NeuralNetwork(X=X_train,
                            Y=y_train,
                            layers=[17, 4, 1],
                            gradient_descent='batch',
                            epoch=10000,
                            learningrate=0.01,
                            seed=10)
    Test.runmodel()

    plt.plot(Test.loss_per_epoch)
    plt.title(
        "Training via full batch gradient decsent (500 Negative Contols)")
    plt.ylabel('Loss (MSE)')
    plt.show()
    X, Y = io.get_dataset(pos_seq, neg_seq, 500)

    X_train, X_test, y_train, y_test = train_test_split(X, Y)

    Test = NN.NeuralNetwork(X=X_train,
                            Y=y_train,
                            layers=[17, 4, 1],
                            gradient_descent='mini_batch',
                            batch_size=2,
                            epoch=40000,
                            learningrate=0.01,
                            seed=10)
    Test.runmodel()

    plt.plot(Test.loss_per_epoch)
    plt.title(
        "Training via mini batch (10) gradient decsent (500 Negative Contols)")
    plt.ylabel('Loss (MSE)')
    plt.show()

    #Part 5 Predictions, Show Time!!!
    X, Y = io.get_dataset(pos_seq, neg_seq, 500)

    FinalModel = NN.NeuralNetwork(X=X,
                                  Y=Y,
                                  layers=[17, 4, 1],
                                  gradient_descent='stochastic',
                                  epoch=100,
                                  learningrate=0.01,
                                  seed=12)
    FinalModel.runmodel()

    rap1_seq = io.read_seq_txts("../data/rap1-lieb-test.txt")

    rap1_seq_predictions = []
    for seq in rap1_seq:
        seq_encoded = io.dna_one_hot(seq)
        # print(seq_encoded.onehot)
        prediction = FinalModel.predict(seq_encoded.onehot)
        prediction = np.where(prediction > 0.5, 1, 0)
        row = [seq_encoded.raw_sequence, prediction[0]]
        rap1_seq_predictions.append(row)

        del seq_encoded

    rap1_seq_predictions = np.array(rap1_seq_predictions)

    np.savetxt("../data/rap1_seq_predictions.txt",
               rap1_seq_predictions,
               fmt='%s')
Example #23
def classify_sites(lr,
                   n_epochs,
                   architecture,
                   batch_size,
                   class_members,
                   loss_dict=None,
                   predict_test=False,
                   plot=False):
    """
	Run full classification w cross validation
	"""

    np.random.seed(666)
    layers = "_".join([str(layer) for layer in architecture])
    trial_name = f'lr-{lr}_epochs-{n_epochs}_arch-{layers}_bs-{batch_size}_cm-{class_members}'

    pos = io.read_seqs('data/rap1-lieb-positives.txt')
    neg = io.read_seqs('data/yeast-upstream-1k-negative.fa')

    neg = generate_neg_examples(pos, neg)

    #balance classes - downsample negative and duplicate positive examples until even
    # this is naive, might update

    # downsample negatives: too many to run in any realistic amount of time
    # currently select 5000, then sample both classes down to 2000 so I don't generate all 100k+ representations -> change?
    neg = [neg[i] for i in np.random.choice(len(neg), 5000, replace=False)]

    #generate 1-hot encoding + purine/pyrimidine classification
    pos_examples = np.concatenate([generate_rep(x) for x in pos])
    neg_examples = np.concatenate([generate_rep(x) for x in neg])

    #create full training + truth, 1 if binding sequence else 0
    training_set = np.concatenate((pos_examples, neg_examples))
    training_truth = np.atleast_2d(
        np.concatenate(
            (np.ones(len(pos_examples)), np.zeros(len(neg_examples))))).T

    #train
    #using supplied parameters
    my_NN = NN.NeuralNetwork(architecture=architecture,
                             lr=lr,
                             batch_size=batch_size)

    if predict_test:
        #retrain net on full dataset
        my_NN = NN.NeuralNetwork(architecture=architecture,
                                 lr=lr,
                                 batch_size=batch_size)
        #nn, input, truth, n_epochs, verbose = False, shuffle = True, balance_classes = False, class_members = 2000)
        training(my_NN,
                 training_set,
                 training_truth,
                 n_epochs,
                 verbose=True,
                 balance_classes=True,
                 class_members=2000)

        rank_tests(my_NN)

    else:
        valid_loss = cross_validation(my_NN,
                                      training_set,
                                      training_truth,
                                      n_epochs,
                                      5,
                                      batch_size,
                                      class_members=class_members,
                                      balance_classes=True,
                                      plot=plot,
                                      title=trial_name)

        if loss_dict is None:
            #Train and print out a few example classifications

            my_NN = NN.NeuralNetwork(architecture=architecture,
                                     lr=lr,
                                     batch_size=batch_size)
            #nn, input, truth, n_epochs, verbose = False, shuffle = True, balance_classes = False, class_members = 2000)
            training(my_NN,
                     training_set,
                     training_truth,
                     n_epochs,
                     verbose=True,
                     balance_classes=True,
                     class_members=2000)

            #print out a few example to check what it learned
            test = np.random.choice(pos_examples.shape[0], 5)
            pos_test = pos_examples[test, :]
            neg_test = neg_examples[test, :]
            test_truth = np.atleast_2d(
                np.concatenate((np.ones(5), np.zeros(5)))).T

            predictions = my_NN.predict(np.concatenate((pos_test, neg_test)))

            for x, y in zip(test_truth, predictions):
                print(x, y)

        else:
            #update loss_dict
            loss_dict[trial_name] = valid_loss
        print(trial_name, valid_loss)
Example #24
def test_NN():
    """
	Test various methods in my NN class to make sure that network architecture is
	behaving as expected.
	"""
    #Test overall structure of new network
    nn = NN.NeuralNetwork()
    nn.make_weights(nn.network)
    assert len(nn.network) == 2, 'This should be 2.'
    #Get the hidden layer.
    hidden = nn.network[0]
    assert len(hidden) == 3, 'Length of hidden layer'
    #Test activation
    test_vec = [1, 0, 0, 0, 0, 0, 0, 0]
    #Activate the first neuron in the hidden layer using its weights and the test vector above
    test_act = nn.activate(hidden[0]['weights'], test_vec)
    assert isinstance(test_act, float), 'Activation should be a real scalar.'

    #Testing feedforward
    nn.feedforward(nn.network, test_vec)
    hidden = nn.network[0]
    assert isinstance(hidden, list), 'Should be a list'
    assert len(hidden) == 3

    #Test backprop output
    nn.backprop(nn.network, test_vec)
    output = nn.network[1]
    assert len(output) == 8

    #Testing fit: using a large learning rate, and train/val data from fastas, examine error structures
    train, val = io.return_training_data(), io.return_training_data()
    nucleotide_dict = {
        'A': [1, 0, 0, 0],
        'C': [0, 1, 0, 0],
        'G': [0, 0, 1, 0],
        'T': [0, 0, 0, 1]
    }
    lr = 3
    epochs = 25
    inputs = 68
    hidden = 10
    output = 1
    nn.network = list()
    nn.make_weights(nn.network, inputs, hidden, output)
    errors = nn.fit(nn.network,
                    lr,
                    epochs,
                    autoencode=False,
                    nucleotide_dict=nucleotide_dict,
                    train=train,
                    val=val)
    assert len(errors) <= epochs, 'Single error for each epoch.'
    assert min(errors) >= max(errors) / 1e3, \
        'The maximum and minimum errors should differ by at most a factor of 1,000.'

    #Testing predictions using trained neural net
    training_accuracy = nn.evaluate(train,
                                    nn.network,
                                    lr,
                                    nucleotide_dict,
                                    acc=True)
    assert training_accuracy <= 1, 'Accuracy should not go above 1.'

    #Testing model selection method (which in turn indirectly tests cross validation)
    k_dict = nn.model_selection(train,
                                val,
                                4,
                                inputs,
                                hidden,
                                output,
                                epochs,
                                nucleotide_dict,
                                lr_range=[0, 1])
    k_max = [
        key for (key, value) in k_dict.items() if value == max(k_dict.values())
    ][0]
    assert k_max == 1, 'This is the only learning rate in set [0, 1] that should come close to convergence.'