Example #1
0
def compare_matrices(
        pos,
        neg,
        matrices=None,
        gen_roc=True,
        filename="all_matrices_roc.png",
        prescored=None):
    """
    Score pos/neg pairs under several matrices and compare the results.

    For every matrix name, the positive and negative pairs are scored with
    calc_all_scores (the second element of each returned item is taken as
    the score). The collected scores are then either plotted with algs.roc
    or summarised on stdout with calc_fp_rate.

    Args:
        pos: positive pairs, passed straight to calc_all_scores.
        neg: negative pairs, passed straight to calc_all_scores.
        matrices: list of matrix names; when None the five default names
            are used (a fresh list per call, so no state is shared).
        gen_roc: when True, call algs.roc and save the figure to
            ``filename``; otherwise print one calc_fp_rate line per matrix.
        filename: output path handed to algs.roc.
        prescored: optional sequence aligned with ``matrices``. Entry ``i``
            is either None or ``(i, (true_scores, false_scores))``. A
            matching, non-empty entry is used instead of rescoring that
            matrix. Missing, None, mismatched or empty entries fall back
            to calc_all_scores.

    Returns:
        None.
    """
    if matrices is None:
        matrices = ["BLOSUM50", "BLOSUM62", "MATIO", "PAM100", "PAM250"]
    if prescored is None:
        prescored = []

    TSs = []
    FSs = []

    for i, matrix in enumerate(matrices):
        # prescored may be shorter than matrices; treat the gap as "no entry".
        entry = prescored[i] if i < len(prescored) else None
        if entry is not None and entry[0] == i and len(entry[1]) == 2:
            true_scores, false_scores = entry[1]
            TSs.append(true_scores)
            FSs.append(false_scores)
            continue

        # NOTE(review): 11 and 1 are forwarded unchanged to calc_all_scores;
        # presumably gap penalties - confirm against its definition.
        pos_scores = calc_all_scores(pos, matrix, 11, 1)
        TSs.append([x[1] for x in pos_scores])
        neg_scores = calc_all_scores(neg, matrix, 11, 1)
        FSs.append([x[1] for x in neg_scores])

    if gen_roc:
        algs.roc(TSs, FSs, matrices, save=True, filename=filename)
    else:
        for ts, fs, matrix in zip(TSs, FSs, matrices):
            print(matrix, calc_fp_rate(ts, fs, 0.7))
Example #2
0
def test_roc():
    """
    Check that roc returns one value per requested false-positive level and
    that every returned value lies in the closed interval [0, 1].
    """
    false_pos = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    true_pos = roc('BLOSUM50', 10, 2, 'Pospairs.txt', 'Negpairs.txt',
                   false_pos, normalize=False)

    # One output value is expected for each requested level.
    assert len(true_pos) == len(false_pos)
    assert all(0 <= rate <= 1 for rate in true_pos)
Example #3
0
def compare_normalized(pos, neg, matrix):
    """
    Plot raw versus length-normalized ROC curves for a single matrix.

    Each pair set is scored with calc_all_scores. The raw score is the second
    element of every returned item; the normalized score divides it by the
    length of the shorter of the two entries in the pair. algs.roc is then
    called with both series, saving to "<matrix>_normalization.png".
    """

    def raw_and_normalized(scored):
        # Two series per pair set: the scores as returned, and the scores
        # divided by the shorter member length of each pair.
        raw = [entry[1] for entry in scored]
        normalized = [
            value / min(len(members[0]), len(members[1]))
            for members, value in scored
        ]
        return [raw, normalized]

    TSs = raw_and_normalized(calc_all_scores(pos, matrix, 11, 1))
    FSs = raw_and_normalized(calc_all_scores(neg, matrix, 11, 1))

    labels = ["Raw", "Normalized"]
    out_path = matrix + "_normalization.png"
    algs.roc(TSs, FSs, labels, save=True, filename=out_path)
Example #4
0
def test_roc():
    """
    Make sure all ROC values are between 0 and 1, inclusive
    """
    pos_pairs = sw.read_pairs("Pospairs.txt")
    neg_pairs = sw.read_pairs("Negpairs.txt")

    pos_scored = sw.calc_all_scores(pos_pairs, "BLOSUM50", 11, 1)
    neg_scored = sw.calc_all_scores(neg_pairs, "BLOSUM50", 11, 1)

    # The second element of each scored item is used as its score.
    true_vals = [item[1] for item in pos_scored]
    false_vals = [item[1] for item in neg_scored]

    # algs.roc takes one list of scores per curve, hence the extra nesting.
    xs, ys = algs.roc([true_vals], [false_vals], ["BLOSUM50"])

    assert all(all(0 <= x <= 1 for x in curve) for curve in xs)
    assert all(all(0 <= y <= 1 for y in curve) for curve in ys)
Example #5
0
        pickle.dump (dict_roc, f)

#Else just load the previous dict_roc object

elif logic == '-R':
    print('Loading in previously defined objects')
    with open("scoring_matrix_dictionary_for_roc.pkl", "rb") as g:
        dict_roc = pickle.load(g)

    #Call to roc function to handle the logic by each dictionary handle. Then graph all lines together on one graph.
    range_dict = {key: np.linspace(dict_roc[key].values.max(), dict_roc[key].values.min(), 200) for key in dict_roc}
    dict_for_roc = return_roc_df (dict_roc, range_dict)
    print('The dictionary of dataframes looks like', dict_for_roc)

    #Plot each roc curve (5 total) on the same graph
    roc(dict_for_roc)

#So it looks like PAM100 is the best scoring matrix. Let's see what happens if we
#normalize.

#Let's now normalize the scores for just two situations: let's choose the best scoring matrix we had previously and
#generate scores both with and without normalization. Then, plot the ROC curves for these two matrices.

#Calculate for PAM100, the matrix judged best in the comparison noted above.
#NOTE(review): BLOSUM_reader is called here with a PAM matrix name; presumably
#it loads any named scoring matrix despite its name - confirm.
optimal = 'PAM100'
matrix = BLOSUM_reader (optimal)

#Calculate non-normalized and normalized dfs, with evaluation only once

if logic == '-E':
    #We've actually calculated this before, but re-do for consistency's sake
Example #6
0
            #Save the final matrix with name defined previously
            with open(name, "wb") as f:
                pickle.dump (optimal_matrix, f)

            sys.exit('Optimal matrix calculation complete!')

        #If no termination: iterate again. Repopulate the selected individuals using
        #a combination of recombination (of two individuals) and mutation.
        
        #Lose the scores at this point; get a list of score matrices again
        populations = repopulate (populations)
        fitness = []

elif logic == '-R':
    with open(name, 'rb') as g:
        optimal_matrix = pickle.load (g)

    #Plot ROC curves for original and new optimized matrices that I select

    orig_df  = return_score_df (neg_pairs, pos_pairs, unopt_matrix, gap_start, gap_extension)
    optimized_df = return_score_df (neg_pairs, pos_pairs, optimal_matrix, gap_start, gap_extension)
    opt_dict = {'Unoptimized': orig_df, 'Optimized': optimized_df}

    opt_range_dict = {key: np.linspace(opt_dict[key].values.max(), opt_dict[key].values.min(), 200) for key in opt_dict}
    dict_for_opt_roc = return_roc_df(opt_dict, opt_range_dict)

    fig_name = sys.argv[4] + ' optimal_vs_unoptimal.png'
    roc(dict_for_opt_roc, fig_name = fig_name, title = 'Optimized vs unoptimized scoring matrix ROC' +  sys.argv[4])

    sys.exit ('Optimization and ROC graphs complete!')
Example #7
0
def full_optimization_run(starting_pos,
                          starting_neg,
                          starting_matrix,
                          goal=4,
                          max_gen=5000,
                          pos_pairs_file="Pospairs.txt",
                          neg_pairs_file="Negpairs.txt"):
    """
    Optimize a scoring matrix and plot ROC curves for before and after.

    Three score sets are handed to algs.roc:
      1. the existing alignments scored with the starting matrix,
      2. the same alignments scored with the optimized matrix,
      3. alignments recomputed with the optimized matrix, written to
         "<starting_matrix>_optimized_{pos,neg}_aligns.txt", read back and
         scored with the optimized matrix.

    The figure is saved to "<starting_matrix>_optimization.png".

    Args:
        starting_pos: argument for read_existing_aligns (positive set).
        starting_neg: argument for read_existing_aligns (negative set).
        starting_matrix: matrix name; given to optimize_score_matrix and
            algs.get_scoring_matrix, and used as the output-file prefix.
        goal: forwarded unchanged to optimize_score_matrix.
        max_gen: forwarded unchanged to optimize_score_matrix.
        pos_pairs_file: path read with read_pairs for the realignment step.
        neg_pairs_file: path read with read_pairs for the realignment step.

    Returns:
        None.
    """

    def score_aligns(aligns, matrix):
        # Each alignment record is unpacked into positional arguments, with
        # the scoring matrix appended last.
        return [algs.score_existing_align(*align, matrix) for align in aligns]

    # Load the existing alignments that the optimizer works from.
    pos_aligns = read_existing_aligns(starting_pos)
    neg_aligns = read_existing_aligns(starting_neg)

    # The second return value of the optimizer is not needed here.
    new_matrix, _ = optimize_score_matrix(pos_aligns,
                                          neg_aligns,
                                          starting_matrix,
                                          goal=goal,
                                          max_gen=max_gen)

    starting_mat = algs.get_scoring_matrix(starting_matrix)

    # Baseline: existing alignments under the starting matrix.
    true_scores = score_aligns(pos_aligns, starting_mat)
    false_scores = score_aligns(neg_aligns, starting_mat)

    # Same alignments rescored under the optimized matrix.
    precal_pos = score_aligns(pos_aligns, new_matrix)
    precal_neg = score_aligns(neg_aligns, new_matrix)

    # Realign the raw pairs with the optimized matrix and save the results.
    pos = read_pairs(pos_pairs_file)
    neg = read_pairs(neg_pairs_file)

    pos_out = starting_matrix + "_optimized_pos_aligns.txt"
    neg_out = starting_matrix + "_optimized_neg_aligns.txt"

    # NOTE(review): 11 and 1 are forwarded unchanged to calc_all_aligns;
    # presumably gap penalties - confirm against its definition.
    calc_all_aligns(pos, new_matrix, 11, 1, pos_out)
    calc_all_aligns(neg, new_matrix, 11, 1, neg_out)

    # Read the new alignments back so they are scored the same way as the
    # existing ones.
    pos_aligns_2 = read_existing_aligns(pos_out)
    neg_aligns_2 = read_existing_aligns(neg_out)

    precal_pos_2 = score_aligns(pos_aligns_2, new_matrix)
    precal_neg_2 = score_aligns(neg_aligns_2, new_matrix)

    TSs = [true_scores, precal_pos, precal_pos_2]
    FSs = [false_scores, precal_neg, precal_neg_2]

    labels = [
        starting_matrix,
        starting_matrix + "_optimized",
        starting_matrix + "_optimized_realign",
    ]

    algs.roc(TSs,
             FSs,
             labels,
             save=True,
             filename=starting_matrix + "_optimization.png")