def test_init_model():
    # create neural net
    nn = NN.NeuralNetwork(sizes=[3, 2, 1])
    # test that the weights are the correct sizes
    assert nn.weights[0].shape == (2, 3)
    assert nn.weights[1].shape == (1, 2)
    # test that the biases are the correct sizes
    assert len(nn.bias) == 3
    # create neural net
    nn2 = NN.NeuralNetwork(sizes=[1, 2, 3])
    # test that the weights are the correct sizes
    assert nn2.weights[0].shape == (2, 1)
    assert nn2.weights[1].shape == (3, 2)
    # test that the biases are the correct sizes
    assert len(nn2.bias) == 3
    # create 4-layer neural net
    nn3 = NN.NeuralNetwork(sizes=[1, 2, 3, 4])
    # test that the weights are the correct sizes
    assert nn3.weights[0].shape == (2, 1)
    assert nn3.weights[1].shape == (3, 2)
    assert nn3.weights[2].shape == (4, 3)
    # test that the biases are the correct sizes
    assert len(nn3.bias) == 4
def test_others():
    # test one-hot encoding without flattening
    a = NN.oneHotDNA("AACGT", flatten=False)
    assert (a == np.array([[1., 0., 0., 0.],
                           [1., 0., 0., 0.],
                           [0., 1., 0., 0.],
                           [0., 0., 1., 0.],
                           [0., 0., 0., 1.]])).all(), "Failing to one-hot-encode"
    # test one-hot encoding with flattening
    a = NN.oneHotDNA("AACGT", flatten=True)
    assert (a == np.array([1., 0., 0., 0.,
                           1., 0., 0., 0.,
                           0., 1., 0., 0.,
                           0., 0., 1., 0.,
                           0., 0., 0., 1.])).all(), "Failing to encode + flatten"
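# The oneHotDNA helper under test is not shown in this file; the sketch below is
# one minimal implementation consistent with the assertions above. The column
# order (A, C, G, T) is inferred from the expected arrays; everything else is an
# illustrative assumption, not the project's actual code.
def _one_hot_dna_sketch(seq, flatten=False):
    mapping = {"A": [1., 0., 0., 0.],
               "C": [0., 1., 0., 0.],
               "G": [0., 0., 1., 0.],
               "T": [0., 0., 0., 1.]}
    encoded = np.array([mapping[base] for base in seq.upper()])
    # flatten row by row, so "AACGT" becomes a single 20-element vector
    return encoded.flatten() if flatten else encoded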
def test_nn():
    X, labels = make_blobs(n_samples=200, n_features=8, centers=4)
    N = io.norm(X)
    net = nn.NeuralNetwork(layers=[(8, None), (4, nn.Sigmoid), (8, nn.Sigmoid)],
                           learning_rate=0.8)
    Loss = nn.MSE()
    net.fit(N, N, Loss)
    Y = net.predict(N)
    assert np.isnan(Y).sum() == 0
def test_fit():
    # create neural net
    # use an autoencoder here
    nn = NN.NeuralNetwork(sizes=[3, 1, 3], seed=0)
    # test that the weights are the correct sizes
    assert (nn.weights[0].shape == (1, 3))
    assert (nn.weights[1].shape == (3, 1))
    # test that the biases are the correct sizes
    assert len(nn.bias) == 3
    # create fake dataset
    test_x = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
    test_y = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
    # store the current weights & biases to compare to after fitting
    w = [a for a in nn.weights[0][0]]
    b = [a for a in nn.bias]
    # fit neural net to test_x, test_y
    nn.fit(test_x, test_y)
    # check that layers are being imported correctly
    assert (nn.layers[0] == test_x).all()
    assert (nn.layers[1].shape == (1, 3))
    assert (nn.layers[-1].shape == (3, 3))
    # check that the weights have been updated
    assert ~(nn.weights[0] == np.array([w])).all()
    assert ~(nn.bias == np.array([b])).all()
def test_NN():
    X_train = np.asarray([[2.7810836, 2.550537003],
                          [1.465489372, 2.362125076],
                          [3.396561688, 4.400293529],
                          [1.38807019, 1.850220317],
                          [3.06407232, 3.005305973],
                          [7.627531214, 2.759262235],
                          [5.332441248, 2.088626775],
                          [6.922596716, 1.77106367],
                          [8.675418651, -0.242068655],
                          [7.673756466, 3.508563011]])
    y_train = np.asarray([[0], [0], [0], [0], [0], [1], [1], [1], [1], [1]])
    X_test = np.asarray([[1.235425, 2.121455],  # should be 0 with the lower scores
                         [6.1234, 2.1234]])  # should be 1 with the higher scores
    # test comes from:
    # https://machinelearningmastery.com/implement-backpropagation-algorithm-scratch-python/
    amanda_auto = NN.NeuralNetwork(setup_nodes=(2, 3, 1), activation="sigmoid", seed=1)
    amanda_auto.fit(input_data=X_train,
                    expected_output=y_train,
                    epochs=100,
                    learning_rate=20,
                    verbose=False)
    predict = amanda_auto.predict(X_test, task="round")
    assert (predict == np.asarray([[0], [1]])).all(), "Fail 15"
def test_binary_classifier():
    '''
    This function tests the Rap1 binary classifier by ensuring that the training
    loss always decreases and that the weight matrices change during training.
    '''
    # Reading in positive sequences
    positive_seqs = io.read_text_file('./data/rap1-lieb-positives.txt')
    # Reading in upstream yeast sequences
    pot_neg_seqs = io.read_fasta_file('./data/yeast-upstream-1k-negative.fa')
    # Randomly sampling negative sequences
    negative_seqs = preprocess.sample_negative_examples(pot_neg_seqs,
                                                        positive_seqs,
                                                        num_samples=1000,
                                                        seq_length=17)
    # Generating labels
    pos_labels = np.expand_dims(np.array([1] * len(positive_seqs)), axis=0)
    neg_labels = np.expand_dims(np.array([0] * len(negative_seqs)), axis=0)
    labels = np.swapaxes(np.hstack([pos_labels, neg_labels]), 0, 1)
    # Generating unraveled one-hot encoded input features --> 4*seq_length
    pos_input = preprocess.encode_seqs(positive_seqs)
    neg_input = preprocess.encode_seqs(negative_seqs)
    # Generating full inputs to the network
    inputs = np.vstack([pos_input, neg_input])
    # Defining necessary hyperparameters for the NN model
    nn_architecture = [{
        'input_dim': 68,
        'output_dim': 2,
        'activation': 'relu'
    }, {
        'input_dim': 2,
        'output_dim': 1,
        'activation': 'sigmoid'
    }]
    loss_function = 'binary_crossentropy'
    learning_rate = 2
    seed = 14
    epochs = 1000
    early_stop = [20, -1e-12]
    # Generating NN class instance for the binary classifier
    bc_nn = NN.NeuralNetwork(nn_architecture,
                             lr=learning_rate,
                             seed=seed,
                             epochs=epochs,
                             loss_function=loss_function)
    # saving initial weights to make sure they are changing
    init_w1 = bc_nn.param_dict['W1'].copy()
    init_w2 = bc_nn.param_dict['W2'].copy()
    # Doing a single-fold training split
    X_train, X_val, y_train, y_val = preprocess.split_basic_binary(
        inputs, labels, split=[0.8, 0.2], shuffle=True)
    # Training model
    per_epoch_loss_train, per_epoch_loss_val, per_epoch_acc_train, per_epoch_acc_val = bc_nn.fit(
        X_train, y_train, X_val, y_val, early_stop=early_stop)
    ### ensuring that training loss always decreases
    assert np.sum(np.diff(per_epoch_loss_train) > 0) == 0
    ### ensuring that weight matrices change while training
    final_w1 = bc_nn.param_dict['W1']
    final_w2 = bc_nn.param_dict['W2']
    assert np.sum(init_w1 - final_w1) != 0
    assert np.sum(init_w2 - final_w2) != 0
def simple_network():
    """A very simple neural network that is useful for tests below"""
    setup = [8, 3, 1]
    alpha = 0.5
    lamba = 0
    simple_NN = NN.NeuralNetwork(setup, NN.activation, alpha, lamba)
    return simple_NN
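# Illustrative usage of the fixture above: the get_single_input / feedforward
# call pattern and the layer_a attribute mirror what test_backprop (further
# down) does with the same class; the 8-element one-hot input is only an example.
def example_simple_network_usage():
    simple_NN = simple_network()
    # feed a single 8-bit one-hot vector through the 8-3-1 network
    simple_NN.get_single_input([1, 0, 0, 0, 0, 0, 0, 0])
    simple_NN.feedforward()
    # layer activations are stored on the instance after the forward pass
    return simple_NN.layer_a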
def test_activation_sigmoid():
    a = nn.Sigmoid()
    test = np.random.random((1, 100))
    test_activation = a.activation(test)
    test_derivative = a.derivative(test)
    assert np.isnan(test_activation).sum() == 0
    assert np.isnan(test_derivative).sum() == 0
def test_encoder():
    amanda_auto = NN.NeuralNetwork(setup_nodes=(8, 3, 8), activation="sigmoid", seed=1)
    inputs = np.random.randint(2, size=(1, 8))
    amanda_auto.fit(input_data=inputs,
                    expected_output=inputs,  # here, training is the same as our hopeful prediction
                    epochs=10,
                    learning_rate=20,
                    verbose=False)  # don't want the output to be messy
    assert amanda_auto.final_score < 0.1, "Fail 13"
def test_activation_free():
    a = nn.Free()
    test = np.random.random((1, 100))
    test_activation = a.activation(test)
    test_derivative = a.derivative(test)
    assert np.sum(test - test_activation) == 0
    assert np.unique(test_derivative).size == 1
    assert np.unique(test_derivative)[0] == 1
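# The activation classes exercised above are assumed to expose activation() and
# derivative() methods; minimal sketches consistent with these two tests might
# look like the following. These are illustrations, not the library's code.
class _SigmoidSketch:
    def activation(self, z):
        return 1.0 / (1.0 + np.exp(-z))

    def derivative(self, z):
        s = self.activation(z)
        return s * (1.0 - s)


class _FreeSketch:
    # identity ("free") activation: passes values through unchanged
    def activation(self, z):
        return z

    def derivative(self, z):
        # derivative of the identity is 1 everywhere
        return np.ones_like(z)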
def test_CE():
    Loss = nn.CE()
    test_x = np.random.random((3, 100))
    test_y = np.random.random((3, 100))
    test_cost = Loss.loss(test_x, test_y)
    test_derivative = Loss.derivative(test_x, test_y)
    assert np.isnan(test_cost).sum() == 0
    assert np.isnan(test_derivative).sum() == 0
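# A minimal cross-entropy sketch with the same loss()/derivative() interface the
# test above assumes; the clipping constant is an illustrative safeguard against
# log(0) and division by zero, not necessarily what the real nn.CE class does.
class _CESketch:
    eps = 1e-12

    def loss(self, y_hat, y):
        y_hat = np.clip(y_hat, self.eps, 1 - self.eps)
        # mean binary cross-entropy over all entries
        return -np.mean(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))

    def derivative(self, y_hat, y):
        y_hat = np.clip(y_hat, self.eps, 1 - self.eps)
        # elementwise derivative of the loss with respect to y_hat
        return (y_hat - y) / (y_hat * (1 - y_hat))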
def test_auto():
    x = np.identity(8)
    y = np.identity(8)
    NN_auto = NN.Neural_Network(input_layer_size=8,
                                hidden_layer_size=3,
                                output_layer_size=8,
                                Lambda=2e-6)
    NN_auto.train(x, y, 10000, 0.45)
    predict = NN_auto.forward(x)
    assert np.array_equal(x, predict.round())
def test_encoder():
    """
    Can my 8 x 3 x 8 autoencoder learn to recreate the 8 x 8 identity matrix?
    """
    my_NN = NN.NeuralNetwork(lr=0.1, batch_size=1)
    id_mat = np.identity(8)
    training(my_NN, id_mat, id_mat, 1000, verbose=True)
    predicted = my_NN.predict(id_mat)
    assert (id_mat - predicted).mean() < 0.005
def test_predict():
    # create neural net
    nn = NN.NeuralNetwork(sizes=[2, 1, 1], seed=0, alpha=1)
    # create fake datasets of various sample sizes
    test_x = np.array([[1, 0], [0, 1]])
    test_x2 = np.array([[1, 0], [0, 1], [0, 1]])
    test_x3 = np.array([[1, 0], [0, 1], [0, 1], [0, 1]])
    # check that the predictions are the right shape
    assert (nn.predict(test_x).shape == (2, 1))
    assert (nn.predict(test_x2).shape == (3, 1))
    assert (nn.predict(test_x3).shape == (4, 1))
def test_read_fasta():
    # read in fasta files
    pos = NN.read_fasta("./data/rap1-lieb-positives.txt")
    neg = NN.read_fasta("./data/yeast-upstream-1k-negative.fa")
    # testing that the first and last sequences were read in correctly
    assert pos[0] == 'ACATCCGTGCACCTCCG'
    assert pos[-1] == 'ACACCCATACACCAAAC'
    # testing that the number of negative sequences was read in correctly
    # I would do the same thing as the positive samples, but the sequences are too long
    assert len(neg.keys()) == 3147
    # testing that the sequences read in are the correct length
    assert len(list(neg.keys())[0]) == 1000
    assert len(list(neg.keys())[-1]) == 1000
    # testing that the sequences were mapped to the correct fasta header
    assert neg[list(neg.keys())[0]] == ">YAL003W 5' untranslated region, chrI 141172 - 142171, 1000 bp"
    assert neg[list(neg.keys())[-1]] == ">YPR202W 5' untranslated region, chrXVI 942027 - 943026, 1000 bp"
def test_autoencoder():
    '''
    This function tests the autoencoder by generating and training it and
    ensuring that the output dimensions are correct and that the loss is always
    decreasing. It also tests that the NN architecture is correct and the weight
    matrices are the right size.
    '''
    # Defining necessary hyperparameters for the autoencoder
    nn_architecture = [{
        'input_dim': 8,
        'output_dim': 3,
        'activation': 'sigmoid'
    }, {
        'input_dim': 3,
        'output_dim': 8,
        'activation': 'sigmoid'
    }]
    loss_function = 'mse'
    learning_rate = 10
    seed = 20
    epochs = 10000
    # Generating autoencoder NN class instance
    identity_ae = NN.NeuralNetwork(nn_architecture,
                                   lr=learning_rate,
                                   seed=seed,
                                   epochs=epochs,
                                   loss_function=loss_function)
    # Defining data for use in the autoencoder
    X = np.eye(8, 8)
    y = X
    # Training the autoencoder
    per_epoch_loss_train, per_epoch_loss_val, _, _ = identity_ae.fit(X, y, X, y)
    recon = identity_ae.predict(X[:2, :])
    ### ensuring that training loss always decreases
    assert np.sum(np.diff(per_epoch_loss_train) > 0) == 0
    ### ensuring that the predicted reconstruction has the right shape
    assert recon.shape == (2, 8)
    ### testing that the nn architecture is correct
    assert identity_ae.arch == nn_architecture
    ### testing that the weight matrices are the correct shape
    assert identity_ae.param_dict['W1'].shape == (3, 8)
    assert identity_ae.param_dict['W2'].shape == (8, 3)
    ### testing that the bias matrices are the correct shape
    assert identity_ae.param_dict['b1'].shape == (3, 1)
    assert identity_ae.param_dict['b2'].shape == (8, 1)
def test_backprop():
    """This function tests whether backprop produces matrices in the correct list
    orientation and dimensions. It doesn't check whether the values themselves are
    correct, given the complexity of the calculations. The test is done in the
    autoencoder context."""
    setup = [8, 3, 8]
    alpha = 0.5
    lamba = 0
    simple_NN = NN.NeuralNetwork(setup, NN.activation, alpha, lamba)
    simple_NN.get_single_input([1, 0, 0, 0, 0, 0, 0, 0])
    simple_NN.feedforward()
    Ws, bs = simple_NN.backprop(simple_NN.edge_matrices, simple_NN.biases,
                                simple_NN.input_layer, simple_NN.layer_z,
                                simple_NN.layer_a)
    # test the dimensions of the returned partial W and partial b matrices
    assert len(Ws[0]) == 3
    assert len(Ws[1]) == 8
    assert len(Ws[0][0]) == 8
    assert len(Ws[1][0]) == 3
    assert len(bs[0]) == 3
    assert len(bs[1]) == 8
def autoencoder(plot=True):
    """
    Can my 8 x 3 x 8 autoencoder learn to recreate 8 x 8 identity matrix
    """
    my_NN = NN.NeuralNetwork(lr=0.1, batch_size=1)
    id_mat = np.identity(8)
    training(my_NN, id_mat, id_mat, 100000, verbose=True)
    predicted = my_NN.predict(id_mat)
    print(predicted)
    if plot:
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(22, 10))
        sns.heatmap(id_mat, ax=ax1)
        sns.heatmap(predicted, ax=ax2)
        ax1.set_title("Input")
        ax2.set_title("Reconstructed")
        plt.savefig('id_matrix_reconstruction.png', dpi=200)
def test_encoder():
    # test that each letter is being properly encoded
    assert (NN.encode("A") == [1, 0, 0, 0]).all()
    assert (NN.encode("C") == [0, 1, 0, 0]).all()
    assert (NN.encode("G") == [0, 0, 1, 0]).all()
    assert (NN.encode("T") == [0, 0, 0, 1]).all()
    # test that the encoder is giving back the right number of values
    assert (len(NN.encode("acgt")) == 16)
    # test that a sequence returns the right encoded values
    assert ((NN.encode("acgt") == [1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1]).all())
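# A possible shape for the encode() helper checked above, inferred only from
# these assertions (case-insensitive, flat concatenation of one-hot columns in
# A/C/G/T order). The actual implementation may differ.
def _encode_sketch(seq):
    one_hot = {"A": [1, 0, 0, 0],
               "C": [0, 1, 0, 0],
               "G": [0, 0, 1, 0],
               "T": [0, 0, 0, 1]}
    # concatenate the per-base one-hot vectors into a single flat array
    return np.array([bit for base in seq.upper() for bit in one_hot[base]])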
def test_general():
    amanda_auto = NN.NeuralNetwork(setup_nodes=(8, 3, 8), activation="sigmoid", seed=1)
    assert amanda_auto.W1.shape == (8, 3), "Fail 1"
    assert amanda_auto.W2.shape == (3, 8), "Fail 2"
    assert amanda_auto.bias1.shape == (1, 3), "Fail 3"
    assert amanda_auto.bias2.shape == (1, 8), "Fail 4"
    inputs = np.random.randint(2, size=(1, 8))
    hidden_z, hidden_activation, output_z, output_activation = amanda_auto.feedforward(inputs, report=True)
    assert hidden_z.shape == (1, 3), "Fail 5"
    assert hidden_activation.shape == (1, 3), "Fail 6"
    assert output_z.shape == (1, 8), "Fail 7"
    assert output_activation.shape == (1, 8), "Fail 8"
    amanda_auto.yhat = output_z
    # here the input and the expected output are the same thing
    hidden_weight, output_weight, delta_hidden, delta_out = amanda_auto.backprop(inputs, inputs)
    assert hidden_weight.shape == (8, 3), "Fail 9"
    assert output_weight.shape == (3, 8), "Fail 10"
    assert delta_hidden.shape == (1, 3), "Fail 11"
    assert delta_out.shape == (1, 8), "Fail 12"
def cross_validation(pos, neg, k, reg_term, rounds, learn_rate, num_hidden):
    master = np.append(pos, neg)
    random.shuffle(master)
    plt.figure(figsize=[5, 5])
    size = len(master) // k
    AUCs = []
    MSEs = []
    for i in range(1, k + 1):
        idx1 = (i - 1) * size
        idx2 = i * size
        test = master[idx1:idx2]
        train = np.append(master[:idx1], master[idx2:])
        test = string_to_array(test)
        train = string_to_array(train)
        nn = NN.NeuralNetwork(input=train,
                              test_set=test,
                              output_dim=1,
                              reg_term=reg_term,
                              rounds=rounds,
                              learn_rate=learn_rate,
                              num_hidden=num_hidden)
        err = nn.fit()
        p, mse_test = nn.predict()
        test_e = nn.test_exp
        fpr, tpr, thresh = metrics.roc_curve(test_e, p, pos_label=1)
        AUC = metrics.auc(fpr, tpr)
        plt.plot(fpr, tpr, label=str(i) + ': AUC = ' + str(AUC))
        plt.legend()
        AUCs.append(AUC)
        MSEs.append(mse_test)
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC ~ k-fold Cross Validation")
    return AUCs, MSEs
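# Illustrative call of the cross-validation routine above; the hyperparameter
# values here are placeholders, not settings used by the project, and the
# positive/negative sequence lists are assumed to come from the caller.
def example_cross_validation_run(pos_seqs, neg_seqs):
    # run 5-fold CV with placeholder hyperparameters and summarize the results
    aucs, mses = cross_validation(pos_seqs, neg_seqs, k=5, reg_term=0.001,
                                  rounds=500, learn_rate=0.1, num_hidden=10)
    return np.mean(aucs), np.mean(mses)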
def main():
    np.random.seed(2)

    # Part 1, Autoencoder Implementation
    Xtrain = np.array([[1, 0, 0, 0, 0, 0, 0, 0],
                       [0, 1, 0, 0, 0, 0, 0, 0],
                       [0, 0, 1, 0, 0, 0, 0, 0],
                       [0, 0, 0, 1, 0, 0, 0, 0],
                       [0, 0, 0, 0, 1, 0, 0, 0],
                       [0, 0, 0, 0, 0, 1, 0, 0],
                       [0, 0, 0, 0, 0, 0, 1, 0],
                       [0, 0, 0, 0, 0, 0, 0, 1]])
    Ytrain = np.array([[1, 0, 0, 0, 0, 0, 0, 0],
                       [0, 1, 0, 0, 0, 0, 0, 0],
                       [0, 0, 1, 0, 0, 0, 0, 0],
                       [0, 0, 0, 1, 0, 0, 0, 0],
                       [0, 0, 0, 0, 1, 0, 0, 0],
                       [0, 0, 0, 0, 0, 1, 0, 0],
                       [0, 0, 0, 0, 0, 0, 1, 0],
                       [0, 0, 0, 0, 0, 0, 0, 1]])
    Test = NN.NeuralNetwork(X=Xtrain, Y=Ytrain, layers=[3, 8], epoch=1000,
                            learningrate=0.1, seed=1)
    Test.runmodel()
    plt.plot(Test.loss_per_epoch)
    plt.title("Training of AutoEncoder on 8x8 Identity Matrix")
    plt.ylabel('Loss (MSE)')
    plt.show()
    # stop here after Part 1; remove this call to run the remaining parts
    exit()

    # Part 2a, first I will load the positive and negative data for training/testing
    neg_seq = io.read_fasta("../data/yeast-upstream-1k-negative.fa")
    pos_seq = io.read_seq_txts("../data/rap1-lieb-positives.txt")
    X, Y = io.get_dataset(pos_seq, neg_seq, 500)
    X_train, X_test, y_train, y_test = train_test_split(X, Y)
    Test = NN.NeuralNetwork(X=X_train, Y=y_train, layers=[17, 4, 1], epoch=1000,
                            learningrate=0.01, seed=80, batch_size=5)
    Test.runmodel()
    prediction = Test.predict(X_test)

    # Part 4a, provide an example of the input and output for one true positive
    # sequence and one true negative sequence
    for i in range(0, len(X_test)):
        print("Input Seq", X_test[i], "actual", y_test[i], "pred", prediction[i])

    # Part 4c, stop criterion for convergence in my learned parameters
    Test = NN.NeuralNetwork(X=X_train, Y=y_train, layers=[17, 4, 1], epoch=1000,
                            learningrate=0.01, seed=80, batch_size=5)
    Test.runmodel()
    prediction = Test.predict(X_test)
    fpr, tpr, threshold = roc_curve(y_test, prediction)
    roc_auc = auc(fpr, tpr)
    print(roc_auc)
    plt.plot(Test.loss_per_epoch)
    plt.title("Training for regions of transcription factor binding sites (500 Negative Controls)")
    plt.ylabel('Loss (MSE)')
    plt.show()

    # Part 5, k-fold cross validation
    X, Y = io.get_dataset(pos_seq, neg_seq, 503)
    X, Y = io.get_dataset(pos_seq, neg_seq, 500)
    X_train, X_test, y_train, y_test = train_test_split(X, Y)
    K = [2, 5, 10, 15, 20]
    for k in K:
        print("Running k fold test with a k=", k)
        cv = KFold(n_splits=k, random_state=1, shuffle=True)
        auroc_accuracy = []
        for train, test in cv.split(X):
            # print('train: %s, test: %s' % (X[train].shape, X[test].shape))
            # print('train: %s, test: %s' % (Y[train].shape, Y[test].shape))
            kfold = NN.NeuralNetwork(X=X[train], Y=Y[train], layers=[17, 4, 1],
                                     epoch=200, learningrate=0.01, seed=420,
                                     batch_size=5)
            kfold.runmodel()
            ypred = kfold.predict(X[test])
            fpr, tpr, threshold = roc_curve(Y[test], ypred)
            roc_auc = auc(fpr, tpr)
            auroc_accuracy.append(roc_auc)
            del kfold, roc_auc, tpr, fpr, ypred
        print("Average AuRoc for k fold trial: ", np.mean(auroc_accuracy))

    X, Y = io.get_dataset(pos_seq, neg_seq, 503)
    k_roc = []
    for i in range(0, 200):
        print(i)
        cv = KFold(n_splits=5, random_state=i, shuffle=True)
        auroc_accuracy = []
        for train, test in cv.split(X):
            kfold = NN.NeuralNetwork(X=X[train], Y=Y[train], layers=[17, 4, 1],
                                     epoch=200, learningrate=0.01, seed=i,
                                     batch_size=5)
            kfold.runmodel()
            ypred = kfold.predict(X[test])
            fpr, tpr, threshold = roc_curve(Y[test], ypred)
            roc_auc = auc(fpr, tpr)
            # accumulate the AUROC for this fold
            auroc_accuracy.append(roc_auc)
        # average AUROC across the 5 folds for this repeat
        k_roc.append(np.mean(auroc_accuracy))
    print("Mean k=5 fold auroc after 200 iterations", np.mean(k_roc))

    # Part 4, Grid Search Implementation
    X, Y = io.get_dataset(pos_seq, neg_seq, 503)
    learning_rate = [0.01, 0.1, 0.2, 0.5, 1]
    layers = [[50, 17, 1], [10, 4, 1], [17, 4, 1], [10, 1], [4, 1]]
    X_train, X_test, y_train, y_test = train_test_split(X, Y)
    cnt_i = 0
    grid_search = np.zeros((5, 5))
    for i in learning_rate:
        cnt_j = 0
        for j in layers:
            print(cnt_i)
            print(cnt_j)
            kfold = NN.NeuralNetwork(X=X_train, Y=y_train, layers=j, epoch=200,
                                     learningrate=i, seed=421, batch_size=5)
            kfold.runmodel()
            ypred = kfold.predict(X_test)
            fpr, tpr, threshold = roc_curve(y_test, ypred)
            roc_auc = auc(fpr, tpr)
            grid_search[cnt_i, cnt_j] = roc_auc
            print(grid_search)
            cnt_j = cnt_j + 1
        cnt_i = cnt_i + 1

    # Testing the updated function
    X, Y = io.get_dataset(pos_seq, neg_seq, 500)
    X_train, X_test, y_train, y_test = train_test_split(X, Y)
    Test = NN.NeuralNetwork(X=X_train, Y=y_train, layers=[17, 4, 1],
                            gradient_descent='stochastic', epoch=10000,
                            learningrate=0.01, seed=10)
    Test.runmodel()
    plt.plot(Test.loss_per_epoch)
    plt.title("Training via stochastic gradient descent (500 Negative Controls)")
    plt.ylabel('Loss (MSE)')
    plt.show()

    # Test two, training the model with full-batch gradient descent
    X, Y = io.get_dataset(pos_seq, neg_seq, 500)
    X_train, X_test, y_train, y_test = train_test_split(X, Y)
    Test = NN.NeuralNetwork(X=X_train, Y=y_train, layers=[17, 4, 1],
                            gradient_descent='batch', epoch=10000,
                            learningrate=0.01, seed=10)
    Test.runmodel()
    plt.plot(Test.loss_per_epoch)
    plt.title("Training via full batch gradient descent (500 Negative Controls)")
    plt.ylabel('Loss (MSE)')
    plt.show()

    X, Y = io.get_dataset(pos_seq, neg_seq, 500)
    X_train, X_test, y_train, y_test = train_test_split(X, Y)
    Test = NN.NeuralNetwork(X=X_train, Y=y_train, layers=[17, 4, 1],
                            gradient_descent='mini_batch', batch_size=2,
                            epoch=40000, learningrate=0.01, seed=10)
    Test.runmodel()
    plt.plot(Test.loss_per_epoch)
    plt.title("Training via mini batch (10) gradient descent (500 Negative Controls)")
    plt.ylabel('Loss (MSE)')
    plt.show()

    # Part 5 Predictions, Show Time!!!
    X, Y = io.get_dataset(pos_seq, neg_seq, 500)
    FinalModel = NN.NeuralNetwork(X=X, Y=Y, layers=[17, 4, 1],
                                  gradient_descent='stochastic', epoch=100,
                                  learningrate=0.01, seed=12)
    FinalModel.runmodel()
    rap1_seq = io.read_seq_txts("../data/rap1-lieb-test.txt")
    rap1_seq_predictions = []
    for seq in rap1_seq:
        seq_encoded = io.dna_one_hot(seq)
        # print(seq_encoded.onehot)
        prediction = FinalModel.predict(seq_encoded.onehot)
        prediction = np.where(prediction > 0.5, 1, 0)
        row = [seq_encoded.raw_sequence, prediction[0]]
        rap1_seq_predictions.append(row)
        del seq_encoded
    rap1_seq_predictions = np.array(rap1_seq_predictions)
    np.savetxt("../data/rap1_seq_predictions.txt", rap1_seq_predictions, fmt='%s')
def classify_sites(lr, n_epochs, architecture, batch_size, class_members,
                   loss_dict=None, predict_test=False, plot=False):
    """
    Run the full classification with cross validation.
    """
    np.random.seed(666)
    layers = "_".join([str(layer) for layer in architecture])
    trial_name = f'lr-{lr}_epochs-{n_epochs}_arch-{layers}_bs-{batch_size}_cm-{class_members}'
    pos = io.read_seqs('data/rap1-lieb-positives.txt')
    neg = io.read_seqs('data/yeast-upstream-1k-negative.fa')
    neg = generate_neg_examples(pos, neg)
    # balance classes - downsample negative and duplicate positive examples until even
    # this is naive, might update
    # downsample negatives, too many to run in any realistic amount of time;
    # currently select 5000, then sample both classes down to 2000 so I don't
    # generate all 100k+ representations -> change?
    neg = [neg[i] for i in np.random.choice(len(neg), 5000, replace=False)]
    # generate 1-hot encoding + purine/pyrimidine classification
    pos_examples = np.concatenate([generate_rep(x) for x in pos])
    neg_examples = np.concatenate([generate_rep(x) for x in neg])
    # create full training set + truth, 1 if binding sequence else 0
    training_set = np.concatenate((pos_examples, neg_examples))
    training_truth = np.atleast_2d(
        np.concatenate((np.ones(len(pos_examples)), np.zeros(len(neg_examples))))).T
    # train using the supplied parameters
    my_NN = NN.NeuralNetwork(architecture=architecture, lr=lr, batch_size=batch_size)
    if predict_test:
        # retrain the net on the full dataset
        my_NN = NN.NeuralNetwork(architecture=architecture, lr=lr, batch_size=batch_size)
        # training(nn, input, truth, n_epochs, verbose=False, shuffle=True, balance_classes=False, class_members=2000)
        training(my_NN, training_set, training_truth, n_epochs, verbose=True,
                 balance_classes=True, class_members=2000)
        rank_tests(my_NN)
    else:
        valid_loss = cross_validation(my_NN, training_set, training_truth,
                                      n_epochs, 5, batch_size,
                                      class_members=class_members,
                                      balance_classes=True, plot=plot,
                                      title=trial_name)
        if loss_dict is None:
            # Train and print out a few example classifications
            my_NN = NN.NeuralNetwork(architecture=architecture, lr=lr, batch_size=batch_size)
            # training(nn, input, truth, n_epochs, verbose=False, shuffle=True, balance_classes=False, class_members=2000)
            training(my_NN, training_set, training_truth, n_epochs, verbose=True,
                     balance_classes=True, class_members=2000)
            # print out a few examples to check what it learned
            test = np.random.choice(pos_examples.shape[0], 5)
            pos_test = pos_examples[test, :]
            neg_test = neg_examples[test, :]
            test_truth = np.atleast_2d(np.concatenate((np.ones(5), np.zeros(5)))).T
            predictions = my_NN.predict(np.concatenate((pos_test, neg_test)))
            for x, y in zip(test_truth, predictions):
                print(x, y)
        else:
            # update loss_dict
            loss_dict[trial_name] = valid_loss
            print(trial_name, valid_loss)
def test_NN():
    """
    Test various methods in my NN class to make sure that the network
    architecture is behaving as expected.
    """
    # Test the overall structure of a new network
    nn = NN.NeuralNetwork()
    nn.make_weights(nn.network)
    assert len(nn.network) == 2, 'This should be 2.'
    # Get the hidden layer.
    hidden = nn.network[0]
    assert len(hidden) == 3, 'Length of hidden layer'
    # Test activation
    test_vec = [1, 0, 0, 0, 0, 0, 0, 0]
    # Get the first neuron in the hidden layer and input its weights with activation by the test vector above
    test_act = nn.activate(hidden[0]['weights'], test_vec)
    assert isinstance(test_act, float), 'Activation should be a real scalar.'
    # Testing feedforward
    nn.feedforward(nn.network, test_vec)
    hidden = nn.network[0]
    assert isinstance(hidden, list), 'Should be a list'
    assert len(hidden) == 3
    # Test backprop output
    nn.backprop(nn.network, test_vec)
    output = nn.network[1]
    assert len(output) == 8
    # Testing fit: using a large learning rate and train/val data from fastas, examine error structures
    train, val = io.return_training_data(), io.return_training_data()
    nucleotide_dict = {
        'A': [1, 0, 0, 0],
        'C': [0, 1, 0, 0],
        'G': [0, 0, 1, 0],
        'T': [0, 0, 0, 1]
    }
    lr = 3
    epochs = 25
    inputs = 68
    hidden = 10
    output = 1
    nn.network = list()
    nn.make_weights(nn.network, inputs, hidden, output)
    errors = nn.fit(nn.network, lr, epochs, autoencode=False,
                    nucleotide_dict=nucleotide_dict, train=train, val=val)
    assert len(errors) <= epochs, 'Single error for each epoch.'
    assert min(errors) >= max(errors) / 1e3, 'The maximum and minimum errors should differ by at most a factor of 1,000.'
    # Testing predictions using the trained neural net
    training_accuracy = nn.evaluate(train, nn.network, lr, nucleotide_dict, acc=True)
    assert training_accuracy <= 1, 'Accuracy should not go above 1.'
    # Testing the model selection method (which in turn indirectly tests cross validation)
    k_dict = nn.model_selection(train, val, 4, inputs, hidden, output, epochs,
                                nucleotide_dict, lr_range=[0, 1])
    k_max = [key for (key, value) in k_dict.items() if value == max(k_dict.values())][0]
    assert k_max == 1, 'This is the only learning rate in the set [0, 1] that should come close to convergence.'