def compare_matrices(
    pos,
    neg,
    matrices=None,
    gen_roc=True,
    filename="all_matrices_roc.png",
    prescored=None,
):
    """
    Score pos/neg pairs under several matrices and compare the results.

    For every matrix name in ``matrices`` the positive and negative pairs are
    scored with ``calc_all_scores`` (unless cached scores are supplied through
    ``prescored``). The collected scores are then either plotted with
    ``algs.roc`` or summarised by printing ``calc_fp_rate(ts, fs, 0.7)`` for
    each matrix.

    Args:
        pos: positive pairs, forwarded to ``calc_all_scores``.
        neg: negative pairs, forwarded to ``calc_all_scores``.
        matrices: sequence of matrix names. Defaults to
            ``["BLOSUM50", "BLOSUM62", "MATIO", "PAM100", "PAM250"]``.
        gen_roc: when true, plot via ``algs.roc``; otherwise print one line
            per matrix.
        filename: output file name handed to ``algs.roc``.
        prescored: optional sequence aligned with ``matrices``. Entry ``i`` is
            either ``None`` or ``(i, (true_scores, false_scores))``. An entry
            is used only when its index matches its position and it carries
            exactly two score lists; anything else (``None``, a missing entry,
            an empty payload such as ``(i, ())``) means "compute the scores".

    Returns:
        None.
    """
    # Build the default per call so the list is never shared between calls.
    if matrices is None:
        matrices = ["BLOSUM50", "BLOSUM62", "MATIO", "PAM100", "PAM250"]
    if prescored is None:
        prescored = []

    TSs = []
    FSs = []
    for i, matrix in enumerate(matrices):
        # A short `prescored` is treated as if padded with None.
        entry = prescored[i] if i < len(prescored) else None
        cached = entry[1] if entry is not None and entry[0] == i else ()
        if len(cached) == 2:
            TSs.append(cached[0])
            FSs.append(cached[1])
            continue

        # NOTE(review): 11 and 1 are presumably the gap opening / extension
        # penalties expected by calc_all_scores -- confirm against its
        # definition.
        pos_scores = calc_all_scores(pos, matrix, 11, 1)
        # Element 1 of each result is kept as the score.
        TSs.append([x[1] for x in pos_scores])
        neg_scores = calc_all_scores(neg, matrix, 11, 1)
        FSs.append([x[1] for x in neg_scores])

    if gen_roc:
        algs.roc(TSs, FSs, matrices, save=True, filename=filename)
    else:
        for ts, fs, matrix in zip(TSs, FSs, matrices):
            print(matrix, calc_fp_rate(ts, fs, 0.7))
def test_roc():
    """
    Check the shape and range of the values returned by ``roc``.

    ``roc`` is asked for one value per requested false-positive rate
    (0.1 through 0.9); the result must have the same length as the request
    and every value must lie in the closed interval [0, 1].
    """
    # k / 10 yields exactly the same floats as the literals 0.1 ... 0.9.
    requested_rates = [k / 10 for k in range(1, 10)]
    returned_rates = roc(
        'BLOSUM50',
        10,
        2,
        'Pospairs.txt',
        'Negpairs.txt',
        requested_rates,
        normalize=False,
    )

    assert len(returned_rates) == len(requested_rates)
    for rate in returned_rates:
        assert 0 <= rate <= 1
def compare_normalized(pos, neg, matrix):
    """
    Plot ROC curves of raw versus length-normalized scores for one matrix.

    Both pair sets are scored with ``calc_all_scores``. Each score is used
    once as-is and once divided by the length of the shorter member of its
    pair. The two variants are passed to ``algs.roc`` labelled "Raw" and
    "Normalized", and the figure is saved as ``<matrix>_normalization.png``.
    """

    def raw_and_normalized(pairs):
        # Score one pair set and return (raw scores, normalized scores).
        results = calc_all_scores(pairs, matrix, 11, 1)
        raw = [item[1] for item in results]
        normalized = [
            value / min(len(members[0]), len(members[1]))
            for members, value in results
        ]
        return raw, normalized

    # Positives are scored before negatives, as in the original flow.
    pos_raw, pos_norm = raw_and_normalized(pos)
    neg_raw, neg_norm = raw_and_normalized(neg)

    algs.roc(
        [pos_raw, pos_norm],
        [neg_raw, neg_norm],
        ["Raw", "Normalized"],
        save=True,
        filename=matrix + "_normalization.png",
    )
def test_roc():
    """
    Make sure all ROC values are between 0 and 1, inclusive.

    Scores the pairs from "Pospairs.txt" and "Negpairs.txt" with BLOSUM50,
    feeds them to ``algs.roc`` as a single curve, and checks every returned
    x and y coordinate.
    """
    positives = sw.read_pairs("Pospairs.txt")
    negatives = sw.read_pairs("Negpairs.txt")

    pos_results = sw.calc_all_scores(positives, "BLOSUM50", 11, 1)
    neg_results = sw.calc_all_scores(negatives, "BLOSUM50", 11, 1)

    # Element 1 of each result is the score handed to the ROC routine.
    pos_values = [entry[1] for entry in pos_results]
    neg_values = [entry[1] for entry in neg_results]

    xs, ys = algs.roc([pos_values], [neg_values], ["BLOSUM50"])

    # x coordinates are checked first, then y coordinates.
    for curve in xs:
        for x in curve:
            assert 0 <= x <= 1
    for curve in ys:
        for y in curve:
            assert 0 <= y <= 1
pickle.dump (dict_roc, f) #Else just load the previous dict_roc object elif logic == '-R': print('Loading in previously defined objects') with open("scoring_matrix_dictionary_for_roc.pkl", "rb") as g: dict_roc = pickle.load(g) #Call to roc function to handle the logic by each dictionary handle. Then graph all lines together on one graph. range_dict = {key: np.linspace(dict_roc[key].values.max(), dict_roc[key].values.min(), 200) for key in dict_roc} dict_for_roc = return_roc_df (dict_roc, range_dict) print('The dictionary of dataframes looks like', dict_for_roc) #Plot each roc curve (5 total) on the same graph roc(dict_for_roc) #So it looks like PAM100 is the best scoring matrix. Let's see what happens if we #normalize. #Let's now normalize the scores for just two situations: let's choose the best scoring matrix we had previously and #generate scores both with and without normalization. Then, plot the ROC curves for these two matrices. #Calculate for PAM100 optimal = 'PAM100' matrix = BLOSUM_reader (optimal) #Calculate non-normalized and normalized dfs, with evaluation only once if logic == '-E': #We've actually calculated this before, but re-do for consistency's sake
#Save the final matrix with name defined previously with open(name, "wb") as f: pickle.dump (optimal_matrix, f) sys.exit('Optimal matrix calculation complete!') #If no termination: iterate again. Repopulate the selected individuals using #a combination of recombination (of two individuals) and mutation. #Lose the scores at this point; get a list of score matrices again populations = repopulate (populations) fitness = [] elif logic == '-R': with open(name, 'rb') as g: optimal_matrix = pickle.load (g) #Plot ROC curves for original and new optimized matrices that I select orig_df = return_score_df (neg_pairs, pos_pairs, unopt_matrix, gap_start, gap_extension) optimized_df = return_score_df (neg_pairs, pos_pairs, optimal_matrix, gap_start, gap_extension) opt_dict = {'Unoptimized': orig_df, 'Optimized': optimized_df} opt_range_dict = {key: np.linspace(opt_dict[key].values.max(), opt_dict[key].values.min(), 200) for key in opt_dict} dict_for_opt_roc = return_roc_df(opt_dict, opt_range_dict) fig_name = sys.argv[4] + ' optimal_vs_unoptimal.png' roc(dict_for_opt_roc, fig_name = fig_name, title = 'Optimized vs unoptimized scoring matrix ROC' + sys.argv[4]) sys.exit ('Optimization and ROC graphs complete!')
def full_optimization_run(
    starting_pos,
    starting_neg,
    starting_matrix,
    goal=4,
    max_gen=5000,
    pos_pairs_file="Pospairs.txt",
    neg_pairs_file="Negpairs.txt",
):
    """
    Optimize a scoring matrix and plot ROC curves before and after.

    Steps:
      1. Read the existing alignments from ``starting_pos`` / ``starting_neg``.
      2. Run ``optimize_score_matrix`` on them, starting from
         ``starting_matrix``.
      3. Score the existing alignments with the starting matrix and with the
         optimized matrix.
      4. Re-align the raw pairs with the optimized matrix, write the new
         alignments to ``<starting_matrix>_optimized_{pos,neg}_aligns.txt``,
         read them back and score them.
      5. Plot the three score sets with ``algs.roc`` into
         ``<starting_matrix>_optimization.png``.

    Args:
        starting_pos: source of the existing positive alignments, passed to
            ``read_existing_aligns``.
        starting_neg: source of the existing negative alignments, passed to
            ``read_existing_aligns``.
        starting_matrix: name of the starting matrix; also used as the prefix
            of every output file name.
        goal: forwarded to ``optimize_score_matrix``.
        max_gen: forwarded to ``optimize_score_matrix``.
        pos_pairs_file: file of positive pairs to re-align (read with
            ``read_pairs``).
        neg_pairs_file: file of negative pairs to re-align (read with
            ``read_pairs``).

    Returns:
        None.
    """

    def score_all(aligns, score_matrix):
        # Score each existing alignment under the given matrix.
        return [algs.score_existing_align(*a, score_matrix) for a in aligns]

    # Existing alignments that drive the optimization.
    pos_aligns = read_existing_aligns(starting_pos)
    neg_aligns = read_existing_aligns(starting_neg)

    # The second return value (a file name) is not needed here.
    new_matrix, _ = optimize_score_matrix(
        pos_aligns, neg_aligns, starting_matrix, goal=goal, max_gen=max_gen
    )

    # Baseline: existing alignments scored with the starting matrix.
    starting_mat = algs.get_scoring_matrix(starting_matrix)
    true_scores = score_all(pos_aligns, starting_mat)
    false_scores = score_all(neg_aligns, starting_mat)

    # Same alignments re-scored with the optimized matrix.
    precal_pos = score_all(pos_aligns, new_matrix)
    precal_neg = score_all(neg_aligns, new_matrix)

    # Re-align the raw pairs with the optimized matrix and save the result.
    pos = read_pairs(pos_pairs_file)
    neg = read_pairs(neg_pairs_file)
    new_pos_file = starting_matrix + "_optimized_pos_aligns.txt"
    new_neg_file = starting_matrix + "_optimized_neg_aligns.txt"
    # NOTE(review): 11 and 1 are presumably gap opening / extension
    # penalties -- confirm against calc_all_aligns.
    calc_all_aligns(pos, new_matrix, 11, 1, new_pos_file)
    calc_all_aligns(neg, new_matrix, 11, 1, new_neg_file)

    # Read the new alignments back and score them the same way as the others,
    # so all three curves use one scoring path.
    precal_pos_2 = score_all(read_existing_aligns(new_pos_file), new_matrix)
    precal_neg_2 = score_all(read_existing_aligns(new_neg_file), new_matrix)

    # Plot baseline, optimized, and optimized + re-aligned together.
    algs.roc(
        [true_scores, precal_pos, precal_pos_2],
        [false_scores, precal_neg, precal_neg_2],
        [
            starting_matrix,
            starting_matrix + "_optimized",
            starting_matrix + "_optimized_realign",
        ],
        save=True,
        filename=starting_matrix + "_optimization.png",
    )