Exemple #1
0
def _read_fasta_sequence(path):
    """Return the upper-cased sequence of the last record in a FASTA file.

    Args:
        path: Location of the FASTA file to read.

    Returns:
        The sequence of the final record as an upper-case string.

    Raises:
        ValueError: If the file contains no FASTA records.
    """
    sequence = None
    # Open the handle ourselves so it is closed as soon as parsing is done.
    with open(path) as handle:
        for record in SeqIO.parse(handle, "fasta"):
            sequence = str(record.seq).upper()
    if sequence is None:
        raise ValueError("no FASTA records found in {}".format(path))
    return sequence


def main():
    """Scan gap penalties and report false-positive rates on the negative pairs.

    Parses the command line, loads every sequence pair listed in
    ``Negpairs.txt`` and, for each sampled (gap opening, gap extension)
    combination, aligns all pairs and stores the fraction of alignment scores
    above 311.8 in a 20x5 matrix. The matrix is printed after each
    gap-opening value has been processed.

    NOTE(review): the ``gap_cost`` and ``gap_extension`` arguments are parsed
    but never used; the penalties come from the grid below.

    Raises:
        ValueError: If the pair list is empty or a FASTA file has no records.
    """
    parser = argparse.ArgumentParser(description = "pairwise local alignment")
    parser.add_argument("gap_cost", help="affine gap penalty cost", type = int)
    parser.add_argument("gap_extension", help = "gap extension cost", type = int)
    parser.add_argument("matrix", help="scoring matrix")
    parser.add_argument("--score_only", help="set True if you only want the alignment score, False otherwise", type=str2bool, nargs="?", const=True, default=True)
    args = parser.parse_args()

    hw_dir = "/Users/matt/OneDrive/UCSF/algorithms/HW3/"
    matrix_path = os.path.join("/Users/matt/OneDrive/UCSF/algorithms/HW3/scoring_matrices/", args.matrix)

    # Read every pair exactly once; the sequences do not depend on the gap
    # penalties, so there is no need to re-parse the FASTA files per grid cell.
    sequence_pairs = []
    with open("/Users/matt/OneDrive/UCSF/algorithms/HW3/metadata/Negpairs.txt") as tsvfile:
        for pair in csv.reader(tsvfile, delimiter = " "):
            seq_A = _read_fasta_sequence(os.path.join(hw_dir, pair[0]))
            seq_B = _read_fasta_sequence(os.path.join(hw_dir, pair[1]))
            sequence_pairs.append((seq_A, seq_B))
            print(seq_A)

    n_pairs = len(sequence_pairs)
    if n_pairs == 0:
        raise ValueError("no sequence pairs found in Negpairs.txt")

    # The scoring matrix is the same for every alignment, so parse it once.
    # NOTE(review): assumes matrix() does not modify M in place — confirm.
    M = scoringMatrixParse(matrix_path)
    score_threshold = 311.8

    # Row index = gap-opening penalty - 1, column index = gap-extension
    # penalty - 1. Only a sampled sub-grid is filled; other cells stay 0.
    fp_matrix = np.zeros((20, 5))
    for i in range(0, 20, 5):
        for j in range(0, 5, 2):
            scores_array = np.zeros(n_pairs)
            for x, (seq_A, seq_B) in enumerate(sequence_pairs):
                H = matrix(seq_A, seq_B, M, -(i+1), -(j+1))
                s = traceback(H, args.score_only, b=seq_B, b_="", old_i=0)
                scores_array[x] = s
            # Fraction of pairs scoring above the threshold.
            fp_matrix[i, j] = np.count_nonzero(scores_array > score_threshold) / n_pairs
        print(fp_matrix)
Exemple #2
0
def test_matrix():
    """Compare the local-alignment score with an EMBOSS ``water`` reference run.

    Reference (online check, run Mon 25 Feb 2019 00:21:17):

        Program:        water (-auto -stdout -aformat3 pair
                               -sprotein1 -sprotein2)
        Sequences:      1: HBA_HUMAN, 2: HBA_MOUSE
        Matrix:         EBLOSUM50 (-datafile EBLOSUM50)
        Gap_penalty:    10.0 (-gapopen 10.0)
        Extend_penalty: 1.0  (-gapextend 1.0)
        Length:         26
        Identity:       18/26 (69.2%)
        Similarity:     21/26 (80.8%)
        Gaps:            0/26 ( 0.0%)
        Score:          135.0
    """
    human_fragment = "MVLSPADKTNVKAAWGKVGAHAGEYG"
    mouse_fragment = "MVLSGEDKSNIKAAWGKIGGHGAEYGAE"
    # Score reported by the reference run above.
    expected_score = 135

    score_matrix = matrix(human_fragment, mouse_fragment, B50, -10, -1)
    observed_score = traceback(
        score_matrix, True, b=mouse_fragment, b_="", old_i=0
    )

    assert observed_score == expected_score
Exemple #3
0
def test_alignment_format():
    """Check that ``matrix`` copes with mixed-case input, then exercise ``mutate``.

    The first part asserts that the matrix built from mixed-case sequences
    has an int64-compatible dtype. The second part runs ``mutate`` on the
    first four negative and positive pairs without asserting on the result.
    """
    mixed_case_a = "AtCtggTTcc"
    mixed_case_b = "atcTgccTcT"
    score_matrix = matrix(mixed_case_a, mixed_case_b, B50, -9, -3)
    # A matrix must still be produced despite the lowercase residues.
    has_int64_dtype = np.issubdtype(score_matrix.dtype, np.dtype('int64'))
    assert has_int64_dtype

    negative_pairs = list(zip(nega_list, negb_list))
    negative = np.array(negative_pairs[:4])
    positive_pairs = list(zip(posa_list, posb_list))
    positive = np.array(positive_pairs[:4])
    optimized_matrix = mutate(negative, positive)

    # NOTE(review): the three expressions below discard their results; they
    # only fail if the call itself raises (e.g. the reshape to 22x22 is
    # impossible). Consider turning them into explicit assertions.
    pd.DataFrame(optimized_matrix)
    optimized_matrix.shape
    np.reshape(optimized_matrix, (22, 22)).shape

# Score sequence pairs with the current scoring matrix and collect the
# distinct scores as candidate thresholds for the TPR/FPR computation below.
s_list = np.zeros((50))  # scores that feed the false-positive rate
p_list = np.zeros((50))  # scores that feed the true-positive rate
roc_list = []
# NOTE(review): the PAM250 matrix loaded here is replaced by optimized_matrix
# on the very next line — confirm which matrix is meant to be used.
M = scoringMatrixParse(os.path.join("/Users/matt/OneDrive/UCSF/algorithms/HW3/scoring_matrices/", "PAM250"))
M = optimized_matrix
# NOTE(review): only the first negative pair is scored ([:1]); the remaining
# entries of s_list stay 0 — confirm this is not a debugging leftover.
for x, a in enumerate(list(zip(nega_list,negb_list))[:1]):
    H = matrix(a[0], a[1], M, -9, -3)
    # Trace back against the sequence that was actually aligned (a[1]).
    s = traceback(H, True, b=a[1], b_="", old_i=0)
    s_list[x] = s
for x, a in enumerate(zip(posa_list,posb_list)):
    H = matrix(a[0], a[1], M, -9, -3)
    s = traceback(H, True, b=a[1], b_="", old_i=0)
    p_list[x] = s

# np.append already returns a flat array when no axis is given.
all_values = set(np.append(p_list, s_list))
tpr_list = []
fpr_list = []
for value in all_values:
    tpr = sum(p_list > value)/len(p_list)
    tpr_list.append(tpr)
    fpr = sum(s_list > value)/len(s_list)