def _read_last_sequence(fasta_path):
    """Return the upper-cased sequence of the last record in a FASTA file.

    Raises:
        ValueError: if the file contains no FASTA records.
    """
    sequence = None
    # The context manager closes the handle once parsing is finished.
    with open(fasta_path) as handle:
        for record in SeqIO.parse(handle, "fasta"):
            sequence = str(record.seq).upper()
    if sequence is None:
        raise ValueError("no FASTA records found in {}".format(fasta_path))
    return sequence


def _read_sequence_pairs(pairs_path, base_dir):
    """Load the (seq_A, seq_B) strings for every row of a space-delimited pairs file.

    Each row holds two FASTA paths relative to ``base_dir``; blank rows are
    skipped.
    """
    pairs = []
    with open(pairs_path) as tsvfile:
        for row in csv.reader(tsvfile, delimiter=" "):
            if not row:
                continue
            seq_a = _read_last_sequence(os.path.join(base_dir, row[0]))
            seq_b = _read_last_sequence(os.path.join(base_dir, row[1]))
            pairs.append((seq_a, seq_b))
    return pairs


def main():
    """Sweep gap penalties over the negative pairs and print false-positive rates.

    Command-line arguments:
        gap_cost (int), gap_extension (int): parsed for interface
            compatibility but not read here; the sweep below supplies its
            own penalties.
        matrix: file name of the scoring matrix inside the scoring_matrices
            directory.
        --score_only: forwarded to ``traceback`` (default True).

    For every ``i`` in ``range(0, 20, 5)`` and ``j`` in ``range(0, 5, 2)`` the
    negative pairs are aligned with penalties ``-(i + 1)`` and ``-(j + 1)``.
    The fraction of pairs scoring above 311.8 is stored in ``fp_matrix[i, j]``;
    all other cells of the 20x5 array stay zero. The array is printed at the
    end.
    """
    parser = argparse.ArgumentParser(description="pairwise local alignment")
    parser.add_argument("gap_cost", help="affine gap penalty cost", type=int)
    parser.add_argument("gap_extension", help="gap extension cost", type=int)
    parser.add_argument("matrix", help="scoring matrix")
    parser.add_argument(
        "--score_only",
        help="set True if you only want the alignment score, False otherwise",
        type=str2bool,
        nargs="?",
        const=True,
        default=True,
    )
    args = parser.parse_args()

    base_dir = "/Users/matt/OneDrive/UCSF/algorithms/HW3/"
    matrix_path = os.path.join(
        "/Users/matt/OneDrive/UCSF/algorithms/HW3/scoring_matrices/", args.matrix
    )
    score_threshold = 311.8

    # Sequences and the scoring matrix do not depend on the gap penalties, so
    # they are read once instead of once per grid point / per pair.
    # NOTE(review): hoisting assumes matrix() does not mutate M -- confirm.
    pairs = _read_sequence_pairs(
        "/Users/matt/OneDrive/UCSF/algorithms/HW3/metadata/Negpairs.txt", base_dir
    )
    M = scoringMatrixParse(matrix_path)

    fp_matrix = np.zeros((20, 5))
    for i in range(0, 20, 5):
        for j in range(0, 5, 2):
            # Sized from the data rather than a hard-coded 50 so the rate
            # stays correct if the pairs file grows or shrinks.
            scores_array = np.zeros(len(pairs))
            for x, (seq_A, seq_B) in enumerate(pairs):
                H = matrix(seq_A, seq_B, M, -(i + 1), -(j + 1))
                scores_array[x] = traceback(
                    H, args.score_only, b=seq_B, b_="", old_i=0
                )
            if pairs:
                fpr = np.count_nonzero(scores_array > score_threshold) / len(pairs)
            else:
                fpr = 0.0
            fp_matrix[i, j] = fpr
    print(fp_matrix)
def test_matrix():
    """Compare our local-alignment score with an EMBOSS ``water`` reference run.

    The two fragments are aligned with the B50 matrix and penalties -10 / -1;
    the resulting score must equal the 135 reported by the reference below.
    """
    ############-- Online check --############################
    # Program: water
    # Rundate: Mon 25 Feb 2019 00:21:17
    # Commandline: water
    #    -auto
    #    -stdout
    #    -asequence emboss_water-I20190225-002115-0388-17773373-p2m.asequence
    #    -bsequence emboss_water-I20190225-002115-0388-17773373-p2m.bsequence
    #    -datafile EBLOSUM50
    #    -gapopen 10.0
    #    -gapextend 1.0
    #    -aformat3 pair
    #    -sprotein1
    #    -sprotein2
    # Align_format: pair
    # Report_file: stdout
    ########################################
    #=======================================
    #
    # Aligned_sequences: 2
    # 1: HBA_HUMAN
    # 2: HBA_MOUSE
    # Matrix: EBLOSUM50
    # Gap_penalty: 10.0
    # Extend_penalty: 1.0
    #
    # Length: 26
    # Identity:      18/26 (69.2%)
    # Similarity:    21/26 (80.8%)
    # Gaps:           0/26 ( 0.0%)
    # Score: 135.0
    #
    #
    #=======================================
    first_seq = "MVLSPADKTNVKAAWGKVGAHAGEYG"
    second_seq = "MVLSGEDKSNIKAAWGKIGGHGAEYGAE"

    score_grid = matrix(first_seq, second_seq, B50, -10, -1)
    score = traceback(score_grid, True, b=second_seq, b_="", old_i=0)

    # 135 is the score reported by the reference run above.
    assert score == 135
def test_alignment_format():
    """Mixed-case input must still yield an int64-typed matrix from ``matrix``."""
    mixed_case_a = "AtCtggTTcc"
    mixed_case_b = "atcTgccTcT"

    result = matrix(mixed_case_a, mixed_case_b, B50, -9, -3)

    # A matrix is produced despite the lowercase characters in the input.
    produced_dtype = result.dtype
    assert np.issubdtype(produced_dtype, np.int64)
# Exploratory ROC analysis: optimise a scoring matrix on a few pairs, score the
# negative and positive pairs with it, then derive TPR/FPR at every observed
# score threshold.
negative = np.array(list(zip(nega_list, negb_list))[:4])
positive = np.array(list(zip(posa_list, posb_list))[:4])
optimized_matrix = mutate(negative, positive)

# PAM250 is loaded first but immediately superseded by the optimised matrix.
M = scoringMatrixParse(os.path.join("/Users/matt/OneDrive/UCSF/algorithms/HW3/scoring_matrices/", "PAM250"))
M = optimized_matrix

# NOTE(review): only the first negative pair is scored ([:1]) -- confirm this
# is intentional and not a leftover debugging limit.
negative_pairs = list(zip(nega_list, negb_list))[:1]
positive_pairs = list(zip(posa_list, posb_list))

# Size the score arrays from the pairs actually scored; fixed 50-slot arrays
# left unfilled zeros that were then counted as real alignment scores.
s_list = np.zeros(len(negative_pairs))
p_list = np.zeros(len(positive_pairs))
fpr_list = []
roc_list = []

for x, a in enumerate(negative_pairs):
    H = matrix(a[0], a[1], M, -9, -3)
    # Trace back against the second sequence of *this* pair (was the
    # unrelated name seq_B).
    s = traceback(H, True, b=a[1], b_="", old_i=0)
    s_list[x] = s

for x, a in enumerate(positive_pairs):
    H = matrix(a[0], a[1], M, -9, -3)
    s = traceback(H, True, b=a[1], b_="", old_i=0)
    p_list[x] = s

# Every distinct observed score serves as a candidate threshold.
all_values = set(np.append(p_list, s_list).flatten())
tpr_list = []
fpr_list = []
for value in all_values:
    tpr = sum(p_list > value)/len(p_list)
    tpr_list.append(tpr)
    fpr = sum(s_list > value)/len(s_list)