def calc_performance_differences_with_selected_pairs(allele_fnames, output_fname):
    """
    Rank position pairs by the average performance change they produce.

    For each allele the plain scoring matrix is evaluated once on the blind
    data. Then, for every position pair (position1 < position2, both taken from
    1..9), a PSSM is built from the pair matrix file for that pair and, if it
    reports at least one pair (``num_pairs > 0``), evaluated on the same blind
    data. The PCC and AUC differences (pair model minus plain matrix) are
    collected per position pair, averaged over the alleles that contributed,
    sorted from largest to smallest and written as
    ``<position pair>\\t<mean difference>`` lines to
    ``../selected_pairs/pcc_<output_fname>`` and
    ``../selected_pairs/auc_<output_fname>``.

    :param allele_fnames: list of tuples, (blind data fname, scoring matrix fname, pair matrix fname)
    :param output_fname: Desired filename to save differences to
    :return: None
    """
    pcc_diff_dict = {}
    auc_diff_dict = {}
    for allele in allele_fnames:
        blind_fname, matrix_fname, pair_fname = allele[0], allele[1], allele[2]

        # Baseline: scoring matrix without pair coefficients.
        matrix_pssm = PSSM(matrix_fname, False)
        matrix_pssm.load_peptides(blind_fname)
        matrix_pssm.predict()
        matrix_performance = get_performance(matrix_pssm.measured_values, matrix_pssm.predicted_values)

        for position1 in range(1, 10):
            for position2 in range(position1 + 1, 10):
                position_pair = (position1, position2)
                # NOTE(review): the third argument presumably restricts the model
                # to this position pair -- confirm against PSSM.
                pair_pssm = PSSM(pair_fname, True, position_pair)
                if not pair_pssm.num_pairs > 0:
                    continue
                pair_pssm.load_peptides(blind_fname)
                pair_pssm.predict()
                pair_performance = get_performance(pair_pssm.measured_values, pair_pssm.predicted_values)

                pcc_diff = pair_performance[0] - matrix_performance[0]
                pcc_diff_dict.setdefault(position_pair, []).append(pcc_diff)

                # The AUC entry may be None; only compare when both are available.
                if pair_performance[1] is not None and matrix_performance[1] is not None:
                    auc_diff = pair_performance[1] - matrix_performance[1]
                    auc_diff_dict.setdefault(position_pair, []).append(auc_diff)

    # .items() (rather than .iteritems()) works on both Python 2 and Python 3.
    pcc_sorted_differences = sorted(
        [(pair, sum(diffs) / float(len(diffs))) for pair, diffs in pcc_diff_dict.items()],
        key=lambda item: item[1],
        reverse=True,
    )
    auc_sorted_differences = sorted(
        [(pair, sum(diffs) / float(len(diffs))) for pair, diffs in auc_diff_dict.items()],
        key=lambda item: item[1],
        reverse=True,
    )

    for prefix, ranked in (('pcc_', pcc_sorted_differences), ('auc_', auc_sorted_differences)):
        text = '\n'.join([str(pair) + '\t' + str(mean) for pair, mean in ranked])
        # The file is opened in binary mode, so hand it bytes: a no-op for the
        # ASCII text on Python 2, and required on Python 3. The with-block closes
        # the handle even if the write fails.
        with open('../selected_pairs/' + prefix + output_fname, 'wb') as out_file:
            out_file.write(text.encode('utf-8'))
def get_matrix_and_pair_performance(blind_data, scoring_matrix, pair_matrix):
    """
    Convenience function used for analysis.

    Runs predictions on the same blind set twice -- once with the scoring
    matrix without pair coefficients and once with the matrix that has them --
    so that the pcc/auc values of both models can be compared directly.

    :param blind_data: The blind split used to evaluate performance
    :param scoring_matrix: Path to a scoring matrix without pair coefficients
    :param pair_matrix: Path to a scoring matrix with pair coefficients
    :return: For both matrices a performance tuple with (PCC, AUC, RMSD),
             ordered as (matrix performance, pair performance)
    """
    def _evaluate(matrix_path, with_pairs):
        # Load the model, score the blind peptides and summarise the fit.
        model = PSSM(matrix_path, with_pairs)
        model.load_peptides(blind_data)
        model.predict()
        return get_performance(model.measured_values, model.predicted_values)

    # Tuple elements are evaluated left to right: plain matrix first, then pairs.
    return _evaluate(scoring_matrix, False), _evaluate(pair_matrix, True)
def calc_pair_improvements():
    """
    Average, per position pair, the PCC/AUC change from adding pair coefficients.

    Every sub-directory of ``output_dir + 'all_alleles_v7/'`` is treated as one
    allele. The allele's plain matrix (from ``all_alleles_v5/matrix/``) is
    evaluated on its blind subset, then each pair matrix file in the allele's
    directory (everything except ``stdout``/``stderr``) is evaluated on the same
    data. The position pair is read from the file name: the last four
    characters are dropped, the rest is split on ``_`` and fields 2 and 3 are
    kept (as strings).

    Pairs whose PCC drops by more than 0.1 are printed and left out of both
    averages. The averaged differences are sorted from largest to smallest and
    written, one ``str((position_pair, average))`` per line, to
    ``pcc_diff_selected_pairs.txt`` and ``auc_diff_selected_pairs.txt`` inside
    ``all_alleles_v7/``.

    Alleles are matched to ``blind_files`` by position and to ``matrix_files``
    by position minus the number of alleles skipped so far.
    NOTE(review): this presumably relies on both lists being sorted the same
    way as the allele directories -- confirm where they are built.

    :return: None
    """
    pairs_root = output_dir + 'all_alleles_v7/'
    # Keep directories only: the two result files written at the end of this
    # function live in the same folder, and calling os.listdir() on them during
    # a later run would raise.
    allele_dirs = sorted(
        entry for entry in os.listdir(pairs_root)
        if os.path.isdir(pairs_root + entry)
    )

    position_pair_pcc_dict = {}
    position_pair_auc_dict = {}
    index_offset = 0
    for index, allele in enumerate(allele_dirs):
        matrix_index = index - index_offset
        # Stop before touching any per-allele data once the matrices run out.
        if matrix_index >= len(matrix_files):
            break
        blind_data = blind_files[index]
        matrix_file = matrix_files[matrix_index]
        if allele not in matrix_file:
            # No matrix for this allele: skip it and keep matrix_files aligned.
            index_offset += 1
            continue
        assert allele in blind_data and allele in matrix_file
        allele_name = get_allele_name_from_path(blind_data)
        blind_path = data_dir + 'blind_subsets/' + blind_data

        # Baseline: scoring matrix without pair coefficients.
        matrix_pssm = PSSM(output_dir + 'all_alleles_v5/matrix/' + matrix_file, False)
        matrix_pssm.load_peptides(blind_path)
        matrix_pssm.predict()
        matrix_performance = get_performance(matrix_pssm.measured_values, matrix_pssm.predicted_values)

        for specified_pair_file in os.listdir(pairs_root + allele):
            if specified_pair_file in ('stdout', 'stderr'):
                continue
            position_pair = tuple(specified_pair_file[:-4].split('_')[2:4])
            pair_pssm = PSSM(pairs_root + allele + '/' + specified_pair_file, True)
            pair_pssm.load_peptides(blind_path)
            pair_pssm.predict()
            pair_performance = get_performance(pair_pssm.measured_values, pair_pssm.predicted_values)

            pcc_diff = pair_performance[0] - matrix_performance[0]
            if pcc_diff < -0.1:
                # Large drops are reported and excluded from both averages.
                print(allele_name, str(position_pair), ' has pcc_diff of ', pcc_diff)
                continue
            position_pair_pcc_dict.setdefault(position_pair, []).append(pcc_diff)

            # The AUC entry may be None; only compare when both are available.
            if pair_performance[1] is not None and matrix_performance[1] is not None:
                auc_diff = pair_performance[1] - matrix_performance[1]
                position_pair_auc_dict.setdefault(position_pair, []).append(auc_diff)

    # .items() (rather than .iteritems()) works on both Python 2 and Python 3.
    position_pair_pcc_list = sorted(
        [(pair, sum(diffs) / float(len(diffs))) for pair, diffs in position_pair_pcc_dict.items()],
        key=lambda item: item[1],
        reverse=True,
    )
    position_pair_auc_list = sorted(
        [(pair, sum(diffs) / float(len(diffs))) for pair, diffs in position_pair_auc_dict.items()],
        key=lambda item: item[1],
        reverse=True,
    )

    outputs = (
        ('pcc_diff_selected_pairs.txt', position_pair_pcc_list),
        ('auc_diff_selected_pairs.txt', position_pair_auc_list),
    )
    for file_name, ranked in outputs:
        text = '\n'.join([str(entry) for entry in ranked])
        # Binary-mode file, so write bytes: a no-op for the ASCII text on
        # Python 2, and required on Python 3. The with-block closes the handle
        # even if the write fails.
        with open(pairs_root + file_name, 'wb') as out_file:
            out_file.write(text.encode('utf-8'))
def create_graphs():
    """
    Plot matrix-only versus pair-coefficient model performance.

    For every blind file that has both a scoring matrix and a pair matrix, both
    models are evaluated on that blind data. The function then:

    * saves measured-vs-predicted scatter plots (one per model) for alleles
      whose name contains A-0101, A-0201, A-0202 or A-0301,
    * passes the position pair values of the alleles listed in
      ``top_ten_allele_names`` to ``calc_avg_pair_rms``,
    * saves one scatter plot comparing PCC and one comparing AUC of the two
      models across all evaluated alleles.

    All figures go to ``output_dir + 'all_alleles_' + version + '/'``.

    :return: None
    """
    def _save_scatter(xs, ys, diagonal_end, limits, x_label, y_label, title, file_name):
        # Draw one scatter plot with a reference diagonal, save it, then reset
        # pyplot so the next figure starts clean.
        plt.scatter(xs, ys)
        plt.plot([0, diagonal_end], [0, diagonal_end], 'k-')
        plt.axis(limits)
        plt.xlabel(x_label)
        plt.ylabel(y_label)
        plt.title(title)
        plt.savefig(output_dir + 'all_alleles_' + version + '/' + file_name)
        plt.clf()
        plt.cla()
        plt.close()

    # (blind data, scoring matrix, scoring matrix w/ pair coeffs)
    fname_tuples = []
    for blind_fname in blind_files:
        # The allele name is the blind file name minus its last 10 characters.
        allele_name = blind_fname[:-10]
        matrix_fname = get_file_by_allele(matrix_files, allele_name)
        pairs_fname = get_file_by_allele(pair_files, allele_name)
        if matrix_fname is not None and pairs_fname is not None:
            fname_tuples.append((blind_fname, matrix_fname, pairs_fname))

    matrix_pcc, pair_pcc = [], []
    matrix_auc, pair_auc = [], []
    position_pair_lists = []
    highlighted_alleles = ('A-0101', 'A-0201', 'A-0202', 'A-0301')

    for blind_fname, matrix_fname, pairs_fname in fname_tuples:
        allele_name = blind_fname[:-10]
        blind_path = allele_data_dir + blind_fname

        matrix_pssm = PSSM(matrix_dir + matrix_fname, False)
        matrix_pssm.load_peptides(blind_path, True)
        matrix_pssm.predict()
        matrix_performance = get_performance(matrix_pssm.measured_values, matrix_pssm.predicted_values)

        pair_pssm = PSSM(pairs_dir + pairs_fname, True)
        pair_pssm.load_peptides(blind_path, True)
        pair_pssm.predict()
        pair_performance = get_performance(pair_pssm.measured_values, pair_pssm.predicted_values)

        matrix_pcc.append(matrix_performance[0])
        pair_pcc.append(pair_performance[0])
        matrix_auc.append(matrix_performance[1])
        pair_auc.append(pair_performance[1])

        if allele_name in top_ten_allele_names:
            position_pair_lists.append(pair_pssm.position_pair_vals)

        if any(tag in allele_name for tag in highlighted_alleles):
            _save_scatter(
                matrix_pssm.measured_values, matrix_pssm.predicted_values,
                8, (0, 8.0, 0, 8.0),
                'Measured log10(IC50)', 'Predicted log10(IC50)',
                allele_name + ' Matrix',
                allele_name + '_matrix_meas_vs_pred.png',
            )
            _save_scatter(
                pair_pssm.measured_values, pair_pssm.predicted_values,
                8, (0, 8.0, 0, 8.0),
                'Measured log10(IC50)', 'Predicted log10(IC50)',
                allele_name + ' with ' + str(pair_pssm.num_pairs) + ' pairs selected',
                allele_name + '_pair_meas_vs_pred.png',
            )

    calc_avg_pair_rms(position_pair_lists)

    # Summary plots: points above the diagonal are alleles where pairs helped.
    _save_scatter(
        matrix_pcc, pair_pcc,
        1, (0.5, 1.0, 0.5, 1.0),
        'Pairs(-)', 'Pairs(+)',
        'Matrix vs Pair PCC',
        'mat_vs_pair_pcc.png',
    )
    _save_scatter(
        matrix_auc, pair_auc,
        1, (0.75, 1.0, 0.75, 1.0),
        'Pairs(-)', 'Pairs(+)',
        'Matrix vs Pair AUC',
        'mat_vs_pair_auc.png',
    )