def calculate_accurracy(root, noOfAcids, kMers, train_file, test_file, laplace_alpha, train_end_index=-1):
    """Return the fraction of test samples that the Bayes result gets right.

    The test set is loaded from ``root/test_file`` and passed, together with
    the remaining arguments, to ``bs.result_bayes``.  Row 0 of the returned
    result is treated as per-sample 0/1 predictions (assumption based on how
    it is summed here -- confirm against ``bs.result_bayes``).

    Args:
        root: Directory that contains the dataset files.
        noOfAcids: Forwarded unchanged to ``bs.result_bayes``.
        kMers: Forwarded unchanged to ``bs.result_bayes``.
        train_file: Training file name, forwarded to ``bs.result_bayes``.
        test_file: Test file name, joined with ``root`` and loaded here.
        laplace_alpha: Forwarded unchanged to ``bs.result_bayes``.
        train_end_index: Forwarded unchanged to ``bs.result_bayes``.

    Returns:
        (correct positives + correct negatives) / number of test samples.
    """
    test_x, test_y = bs._load_dataset(os.path.join(root, test_file))
    predictions, _, _ = bs.result_bayes(root, train_file, test_x, kMers,
                                        noOfAcids, laplace_alpha,
                                        train_end_index)

    labels = np.array(test_y)
    # Positions of the samples whose TRUE label is 1 / 0 respectively.
    positive_idx = np.where(labels == 1)
    negative_idx = np.where(labels == 0)

    # Label-1 samples count as correct when predicted 1; label-0 samples count
    # as correct when NOT predicted 1.
    correct_positive = np.sum(predictions[0, positive_idx])
    correct_negative = np.size(negative_idx) - np.sum(predictions[0, negative_idx])

    return (correct_positive + correct_negative) / len(test_x)
def part1(root='./Dataset', trainfile='q2_train_set.txt', testfile='q2_test_set.txt'):
    """Run the Bayes classifier on the test set and print per-class and overall accuracy.

    Args:
        root: Directory that contains the dataset files.
        trainfile: Training file name (handed to ``bs.result_bayes``).
        testfile: Test file name, loaded here to obtain features and labels.

    Prints three lines: accuracy on label-1 samples, accuracy on label-0
    samples, and overall accuracy.  Returns nothing.
    """
    k_mers = 8
    acid_count = 20

    # NOTE(review): the training set is loaded here but its contents are not
    # used in this function; bs.result_bayes receives only root/trainfile.
    _train_x, _train_y = bs._load_dataset(os.path.join(root, trainfile))
    test_x, test_y = bs._load_dataset(os.path.join(root, testfile))

    predictions, _, _ = bs.result_bayes(root, trainfile, test_x, k_mers, acid_count)

    labels = np.array(test_y)
    # Positions of samples whose TRUE label is 1 / 0.
    positive_idx = np.where(labels == 1)
    negative_idx = np.where(labels == 0)

    positive_total = np.size(positive_idx)
    negative_total = np.size(negative_idx)
    # Row 0 of the result is summed as 0/1 predictions.
    positive_hits = np.sum(predictions[0, positive_idx])
    negative_hits = negative_total - np.sum(predictions[0, negative_idx])
    all_hits = positive_hits + negative_hits

    print("Real cleavable number: \t", positive_total,
          "\t Number predicted true cleavable:\t", positive_hits,
          "\t Accuracy:\t", positive_hits / positive_total)
    print("Real nonCleavable number:\t", negative_total,
          "\t Number predicted true nonCleavable:\t", negative_hits,
          "\t Accuracy:\t", negative_hits / negative_total)
    print("Total test size:\t\t", len(test_x),
          "\t Number predicted true in total:\t", all_hits,
          "\t Accuracy:\t", (all_hits / len(test_x)))
def part6(root='./Dataset', trainfile='q2_train_set.txt'):
    """Project the training features onto their first three principal
    components and show them in a rotating 3-D scatter plot.

    The features are standardised column-wise, the (unnormalised) covariance
    matrix ``Z.T @ Z`` is eigen-decomposed, and the data is projected onto the
    eigenvectors sorted by decreasing eigenvalue.  The share of the eigenvalue
    sum carried by the top three components is printed as "PVE".

    Args:
        root: Directory that contains the dataset files.
        trainfile: Training file name, joined with ``root`` and loaded via
            ``bs._load_dataset``.

    Blocks in ``plt.show()`` until the figure is closed.  Returns nothing.
    """
    csv_path = os.path.join(root, trainfile)
    train_x, _ = bs._load_dataset(csv_path)

    # Standardise each column.
    centroid = np.mean(train_x, axis=0)
    std = np.std(train_x, axis=0)
    # A constant column has zero spread; dividing by 1 keeps it at 0 after
    # centring instead of producing NaN that would poison the decomposition.
    std = np.where(std == 0, 1, std)
    Z = (train_x - centroid) / std

    # Covariance matrix up to a constant factor.
    cov_mat_wk = np.matmul(Z.T, Z)

    # The matrix is symmetric, so use eigh: it guarantees real eigenvalues and
    # eigenvectors (eig may return complex output from round-off).
    eig_values, eig_col_vectors = np.linalg.eigh(cov_mat_wk)
    idx = eig_values.argsort()[::-1]  # largest eigenvalue first
    eig_values_sorted = eig_values[idx]
    eig_col_vectors_sorted = eig_col_vectors[:, idx]

    projected = np.matmul(Z, eig_col_vectors_sorted)
    PC1 = projected[:, 0]
    PC2 = projected[:, 1]
    PC3 = projected[:, 2]

    PVE = np.sum(eig_values_sorted[0:3]) / np.sum(eig_values)
    print("PVE: ", PVE)

    # plot
    plt.close('all')
    fig = plt.figure()
    # Figure.gca() no longer accepts keyword arguments in current matplotlib;
    # add_subplot is the supported way to request a 3-D axes.
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(PC1, PC2, PC3, c=PC1, linewidth=0.1)
    ax.set_xlabel('PC 1')
    ax.set_ylabel('PC 2')
    ax.set_zlabel('PC 3')
    ax.view_init(azim=50)

    def rotate(angle):
        ax.view_init(azim=angle)

    # The animation object must stay referenced for as long as the figure is
    # shown; an unreferenced FuncAnimation can be garbage-collected, which
    # stops the rotation.
    rotation = animation.FuncAnimation(fig, rotate,
                                       frames=np.arange(0, 365, 1),
                                       interval=0.1)
    print("Please close figures to continue...")
    plt.show()
    del rotation  # no longer needed once the window is closed
def part5(root='./Dataset', trainfile='q2_train_set.txt', testfile='q2_test_set.txt'):
    """Rank features by mutual information with the class label and plot test
    accuracy against the number of top-ranked features used.

    For every feature column a mutual-information score is computed from the
    2x2 contingency counts (feature 0/1 vs. label 0/1) on the training set.
    Features are then added one at a time in decreasing score order; after
    each addition ``bs._bayes`` is run and the test accuracy recorded.

    Args:
        root: Directory that contains the dataset files.
        trainfile: Training file name.
        testfile: Test file name.

    Prints the best accuracy and the feature count(s) ``k`` that reach it,
    then blocks in ``plt.show()`` until the figure is closed.  Returns nothing.
    """
    noOfAcids = 20
    kMers = 8
    n_features = noOfAcids * kMers

    train_x, train_y = bs._load_dataset(os.path.join(root, trainfile))
    test_x, test_y = bs._load_dataset(os.path.join(root, testfile))

    ## CALCULATE PROBABILITIES ##
    N = len(train_x)
    train_labels = np.asarray(train_y).ravel()

    # Contingency counts per feature column: N<feature><label>.
    cleavable = train_x[train_labels == 1]
    N11 = cleavable.sum(axis=0)
    N01 = len(cleavable) - N11

    notCleavable = train_x[train_labels == 0]
    N10 = notCleavable.sum(axis=0)
    N00 = len(notCleavable) - N10

    N1dot = N10 + N11
    N0dot = N00 + N01
    Ndot1 = len(cleavable)
    Ndot0 = len(notCleavable)

    # Empty contingency cells yield log2(0) or 0/0; the resulting NaNs are
    # handled right below, so the numpy warnings are just noise.
    with np.errstate(divide='ignore', invalid='ignore'):
        sum_term1 = N11 * np.log2((N * N11) / (N1dot * Ndot1))
        sum_term2 = N01 * np.log2((N * N01) / (N0dot * Ndot1))
        sum_term3 = N10 * np.log2((N * N10) / (N1dot * Ndot0))
        sum_term4 = N00 * np.log2((N * N00) / (N0dot * Ndot0))
        I_UC = np.multiply(1 / N, (sum_term1 + sum_term2 + sum_term3 + sum_term4))

    # NOTE(review): features with an empty contingency cell are ranked FIRST
    # (score = +inf).  The usual convention is 0 * log(0) = 0, which would
    # rank them by their remaining terms instead -- confirm this is intended.
    I_UC[np.isnan(I_UC)] = np.inf  # np.Inf alias no longer exists in NumPy 2
    I_UC_sort_indices = np.argsort(I_UC)[::-1]

    ## EVALUATE TOP-k FEATURE SUBSETS ##
    test_labels = np.array(test_y)
    trueIndices = np.where(test_labels == 1)
    falseIndices = np.where(test_labels == 0)
    false_total = np.size(falseIndices)

    learningParams = np.empty(shape=(train_x.shape[0], 0))
    testParams = np.empty(shape=(test_x.shape[0], 0))
    accuracies = []
    # k runs up to and including n_features so the model that uses every
    # feature is evaluated as well.
    for k in range(1, n_features + 1):
        next_col = I_UC_sort_indices[k - 1:k]
        learningParams = np.hstack((learningParams, train_x[:, next_col]))
        testParams = np.hstack((testParams, test_x[:, next_col]))
        res, _, _ = bs._bayes(learningParams, train_y, testParams, kMers, noOfAcids)
        correct = (np.sum(res[0, trueIndices])
                   + (false_total - np.sum(res[0, falseIndices])))
        accuracies.append(correct / len(test_x))

    accuracy_arr = np.array(accuracies)
    best_positions = np.flatnonzero(accuracy_arr == accuracy_arr.max())
    print("Max accuracy:\n", accuracy_arr[best_positions])
    # accuracies[0] belongs to k = 1, so shift the 0-based position by one.
    print("k = ", best_positions + 1)

    # plot
    plt.close('all')
    plt.plot(range(1, len(accuracies) + 1), accuracies, '-k', linewidth=1)
    # NOTE(review): the plotted values are fractions in [0, 1], not percent.
    plt.ylabel('Accuracy (%)')
    plt.xlabel('k')
    plt.grid(True)
    plt.title("k vs. Accuracy")
    print("Please close figures to continue...")
    plt.show()