def calc_performance_differences_with_selected_pairs(allele_fnames, output_fname):
    """
    Measure how much each position pair changes performance relative to the
    plain scoring matrix, averaged over all alleles.

    For every allele the scoring matrix is evaluated once on the blind data as
    a baseline. Then, for every position pair (p1, p2) with 1 <= p1 < p2 <= 9,
    a PSSM is built from the pair matrix file for that pair; if it reports
    ``num_pairs > 0`` it is evaluated on the same blind data and the
    difference (pair performance - baseline performance) is recorded.

    The per-pair differences are averaged across alleles, sorted from largest
    to smallest average, and written as ``<pair>\\t<average>`` lines to
    ``../selected_pairs/pcc_<output_fname>`` and
    ``../selected_pairs/auc_<output_fname>``.

    :param allele_fnames: list of tuples, (blind data fname, scoring matrix fname, pair matrix fname)
    :param output_fname: Desired filename to save differences to
    :return: None
    """
    # NOTE(review): dict.iteritems() and writing str to a 'wb' handle are
    # Python 2 idioms; both need attention if this file is ever ported.
    pcc_diff_dict = {}
    auc_diff_dict = {}

    for allele in allele_fnames:
        blind_fname, matrix_fname, pair_fname = allele[0], allele[1], allele[2]

        # Only referenced by the optional debug filters left commented out below.
        allele_name = get_allele_name_from_path(blind_fname)
        # if '0206' in allele_name:
        #     continue

        # Baseline: the scoring matrix without pair coefficients.
        matrix_pssm = PSSM(matrix_fname, False)
        matrix_pssm.load_peptides(blind_fname)
        matrix_pssm.predict()
        matrix_performance = get_performance(matrix_pssm.measured_values,
                                             matrix_pssm.predicted_values)

        for position1 in range(1, 10):
            for position2 in range(position1 + 1, 10):
                position_pair = (position1, position2)
                pair_pssm = PSSM(pair_fname, True, position_pair)
                if not pair_pssm.num_pairs > 0:
                    # Nothing loaded for this pair, so there is nothing to compare.
                    continue

                pair_pssm.load_peptides(blind_fname)
                pair_pssm.predict()
                pair_performance = get_performance(pair_pssm.measured_values,
                                                   pair_pssm.predicted_values)

                pcc_diff = pair_performance[0] - matrix_performance[0]
                # if pcc_diff < -0.2:
                #     print(allele_name, str(position_pair), ' has pcc_diff of ', pcc_diff)
                pcc_diff_dict.setdefault(position_pair, []).append(pcc_diff)

                # The second performance value may be None; skip the AUC
                # difference unless both sides have one.
                if None not in (pair_performance[1], matrix_performance[1]):
                    auc_diff = pair_performance[1] - matrix_performance[1]
                    auc_diff_dict.setdefault(position_pair, []).append(auc_diff)

    # Every list in the dicts holds at least one value, so the means are safe.
    pcc_sorted_differences = sorted(
        ((pair, sum(diffs) / float(len(diffs)))
         for pair, diffs in pcc_diff_dict.iteritems()),
        key=lambda entry: entry[1], reverse=True)
    auc_sorted_differences = sorted(
        ((pair, sum(diffs) / float(len(diffs)))
         for pair, diffs in auc_diff_dict.iteritems()),
        key=lambda entry: entry[1], reverse=True)

    # Context managers guarantee the handles are closed even if a write fails.
    with open('../selected_pairs/pcc_' + output_fname, 'wb') as pcc_file:
        pcc_file.write('\n'.join([str(pair) + '\t' + str(avg)
                                  for pair, avg in pcc_sorted_differences]))
    with open('../selected_pairs/auc_' + output_fname, 'wb') as auc_file:
        auc_file.write('\n'.join([str(pair) + '\t' + str(avg)
                                  for pair, avg in auc_sorted_differences]))
def zip_result_files(blind_files, matrix_files, pair_files):
    """
    Group every blind data file with the scoring matrix file and the pair
    matrix file found for the same allele.

    The allele name is derived from the blind data path with
    ``get_allele_name_from_path`` and both matrix lists are searched with
    ``get_file_by_allele``. A blind file is dropped when either lookup
    returns None.

    :param blind_files: iterable of blind data filenames
    :param matrix_files: iterable of scoring matrix filenames
    :param pair_files: iterable of pair matrix filenames
    :return: list of tuples, (blind data fname, scoring matrix fname, pair matrix fname)
    """
    # Work on private copies so the lookups never touch the caller's sequences.
    matrix_candidates = list(matrix_files)
    pair_candidates = list(pair_files)

    grouped = []  # (blind data, scoring matrix, scoring matrix w/ pair coeffs)
    for blind_fname in blind_files:
        allele = get_allele_name_from_path(blind_fname)
        matrix_match = get_file_by_allele(matrix_candidates, allele)
        pair_match = get_file_by_allele(pair_candidates, allele)
        if matrix_match is not None and pair_match is not None:
            grouped.append((blind_fname, matrix_match, pair_match))
    return grouped
# Compare pair-matrix performance against plain scoring-matrix performance for
# every allele that has all three files, then plot the per-allele differences
# (PCC in the left subplot, AUC in the right one).
result_tuples = zip_result_files(blind_files, scoring_matrix_files, pair_matrix_files)
pcc_differences = []
auc_differences = []
for result_files in result_tuples:
    blind = result_files[0]
    scoring_matrix = result_files[1]
    pair_matrix = result_files[2]
    matrix_results, pair_results = get_matrix_and_pair_performance(blind, scoring_matrix, pair_matrix)
    pcc_diff = pair_results[0] - matrix_results[0]
    if pcc_diff < -0.2:
        # Outlier: report it and leave it out of both the PCC and AUC lists.
        print(get_allele_name_from_path(blind), ' has a pcc diff of ', pcc_diff)
        continue  # do not add, outlier
    pcc_differences.append(pcc_diff)
    # The second performance value may be None; only record a difference when
    # both sides have one.
    if None not in (pair_results[1], matrix_results[1]):
        auc_differences.append(pair_results[1] - matrix_results[1])

fig = plt.figure()
# Either list can end up empty (every allele an outlier, or no AUC values at
# all). Skip the average in that case instead of dividing by zero, so the
# figure is still shown.
if pcc_differences:
    print('pcc avg diff: ', sum(pcc_differences)/float(len(pcc_differences)))
fig.add_subplot(121)
plt.plot(range(1, len(pcc_differences) + 1), pcc_differences)
fig.add_subplot(122)
if auc_differences:
    print('auc avg diff: ', sum(auc_differences)/float(len(auc_differences)))
plt.plot(range(1, len(auc_differences) + 1), auc_differences)
plt.show()
def calc_pair_improvements():
    """
    Average, per position pair, the performance change that the pair matrices
    under ``all_alleles_v7/`` give over the plain matrices under
    ``all_alleles_v5/matrix/``, and write the ranked averages to disk.

    Relies on the module-level names ``output_dir``, ``data_dir``,
    ``blind_files`` and ``matrix_files``. ``blind_files`` is indexed by the
    position of the allele in the sorted directory listing, while
    ``matrix_files`` is indexed by that position minus the number of alleles
    skipped so far because the matrix at that slot did not contain the allele
    name.

    For each pair file of an allele (entries named ``stdout`` / ``stderr``
    are ignored) the position pair is taken from the filename: the last four
    characters are dropped, the rest is split on ``_`` and fields 2 and 3 are
    kept as a tuple of strings. Pair files whose PCC difference is below -0.1
    are printed and excluded from both the PCC and AUC statistics.

    The results are written, one ``str((position_pair, average))`` per line
    and sorted by descending average, to
    ``all_alleles_v7/pcc_diff_selected_pairs.txt`` and
    ``all_alleles_v7/auc_diff_selected_pairs.txt`` under ``output_dir``.

    :return: None
    """
    # NOTE(review): dict.iteritems() and writing str to a 'wb' handle are
    # Python 2 idioms; both need attention if this file is ever ported.
    pairs_root = output_dir + 'all_alleles_v7/'

    # The two result files are written into this same directory, so on a
    # second run a plain listing would contain them and os.listdir() on a
    # regular file would fail. Keep directories only.
    allele_dirs = sorted(entry for entry in os.listdir(pairs_root)
                         if os.path.isdir(pairs_root + entry))

    position_pair_pcc_dict = {}
    position_pair_auc_dict = {}
    index_offset = 0  # alleles seen so far that had no matching matrix file

    for index, allele in enumerate(allele_dirs):
        pair_matrix_files = os.listdir(pairs_root + allele)
        blind_data = blind_files[index]
        allele_name = get_allele_name_from_path(blind_data)

        if (index - index_offset) >= len(matrix_files):
            break
        matrix_file = matrix_files[index - index_offset]
        if allele not in matrix_file:
            # No matrix for this allele: keep pointing at the same matrix slot
            # for the next allele.
            index_offset += 1
            continue
        assert allele in blind_data and allele in matrix_file

        # trying to see how improvements change when only looking at 10 biggest alleles
        # if allele not in top_ten_allele_names:
        #     continue

        # Baseline: the scoring matrix without pair coefficients.
        matrix_pssm = PSSM(output_dir + 'all_alleles_v5/matrix/' + matrix_file, False)
        matrix_pssm.load_peptides(data_dir + 'blind_subsets/' + blind_data)
        matrix_pssm.predict()
        matrix_performance = get_performance(matrix_pssm.measured_values,
                                             matrix_pssm.predicted_values)

        for specified_pair_file in pair_matrix_files:
            if specified_pair_file in ('stdout', 'stderr'):
                continue

            position_pair = tuple(specified_pair_file[:-4].split('_')[2:4])
            pair_pssm = PSSM(pairs_root + allele + '/' + specified_pair_file, True)
            pair_pssm.load_peptides(data_dir + 'blind_subsets/' + blind_data)
            pair_pssm.predict()
            pair_performance = get_performance(pair_pssm.measured_values,
                                               pair_pssm.predicted_values)

            pcc_diff = pair_performance[0] - matrix_performance[0]
            if pcc_diff < -0.1:
                # Treated as an outlier: reported, then left out of both statistics.
                print(allele_name, str(position_pair), ' has pcc_diff of ', pcc_diff)
                continue
            position_pair_pcc_dict.setdefault(position_pair, []).append(pcc_diff)

            # The second performance value may be None; skip the AUC
            # difference unless both sides have one.
            if pair_performance[1] is not None and matrix_performance[1] is not None:
                auc_diff = pair_performance[1] - matrix_performance[1]
                position_pair_auc_dict.setdefault(position_pair, []).append(auc_diff)

    # Every list in the dicts holds at least one value, so the means are safe.
    position_pair_pcc_list = sorted(
        ((pair, sum(diffs) / float(len(diffs)))
         for pair, diffs in position_pair_pcc_dict.iteritems()),
        key=lambda entry: entry[1], reverse=True)
    position_pair_auc_list = sorted(
        ((pair, sum(diffs) / float(len(diffs)))
         for pair, diffs in position_pair_auc_dict.iteritems()),
        key=lambda entry: entry[1], reverse=True)

    # Context managers guarantee the handles are closed even if a write fails.
    with open(pairs_root + 'pcc_diff_selected_pairs.txt', 'wb') as pcc_diff_file:
        pcc_diff_file.write('\n'.join([str(entry) for entry in position_pair_pcc_list]))
    with open(pairs_root + 'auc_diff_selected_pairs.txt', 'wb') as auc_diff_file:
        auc_diff_file.write('\n'.join([str(entry) for entry in position_pair_auc_list]))