def calc_performance_differences_with_selected_pairs(allele_fnames, output_fname):
    """
    Rank position pairs by their mean performance change relative to the plain scoring matrix.

    For each allele the scoring matrix alone is evaluated on the blind data as
    a baseline. Then, for every position pair (p1, p2) with 1 <= p1 < p2 <= 9,
    a PSSM is built from the pair matrix file for that pair; if it reports at
    least one pair (num_pairs > 0) it is evaluated on the same blind data and
    the difference (pair - baseline) is recorded. Element 0 of the result of
    get_performance is treated as the PCC and element 1 as the AUC; an AUC
    difference is only recorded when neither AUC value is None.

    The differences are averaged per position pair over all alleles, sorted by
    descending mean and written, one tab-separated "pair, mean" line each, to
    ../selected_pairs/pcc_<output_fname> and ../selected_pairs/auc_<output_fname>.

    :param allele_fnames: list of tuples, (blind data fname, scoring matrix fname, pair matrix fname)
    :param output_fname: Desired filename to save differences to
    :return: None
    """
    pcc_diff_dict = {}
    auc_diff_dict = {}
    for allele in allele_fnames:
        blind_fname, matrix_fname, pair_fname = allele[0], allele[1], allele[2]

        # Baseline: the scoring matrix without any pair coefficients.
        matrix_pssm = PSSM(matrix_fname, False)
        matrix_pssm.load_peptides(blind_fname)
        matrix_pssm.predict()
        matrix_performance = get_performance(matrix_pssm.measured_values, matrix_pssm.predicted_values)

        for position1 in range(1, 10):
            for position2 in range(position1 + 1, 10):
                position_pair = (position1, position2)
                pair_pssm = PSSM(pair_fname, True, position_pair)
                if not pair_pssm.num_pairs > 0:
                    # Nothing to evaluate for this position pair.
                    continue

                pair_pssm.load_peptides(blind_fname)
                pair_pssm.predict()
                pair_performance = get_performance(pair_pssm.measured_values, pair_pssm.predicted_values)

                pcc_diff = pair_performance[0] - matrix_performance[0]
                pcc_diff_dict.setdefault(position_pair, []).append(pcc_diff)

                if None not in (pair_performance[1], matrix_performance[1]):
                    auc_diff = pair_performance[1] - matrix_performance[1]
                    auc_diff_dict.setdefault(position_pair, []).append(auc_diff)

    for prefix, diff_dict in (('pcc_', pcc_diff_dict), ('auc_', auc_diff_dict)):
        # float() keeps the mean a true division on Python 2 as well.
        mean_differences = [
            (position_pair, sum(values) / float(len(values)))
            for position_pair, values in diff_dict.items()
        ]
        mean_differences.sort(key=lambda x: x[1], reverse=True)

        # Text mode: the content is str, which a binary-mode file rejects on Python 3.
        with open('../selected_pairs/' + prefix + output_fname, 'w') as out_file:
            out_file.write('\n'.join([str(x[0]) + '\t' + str(x[1]) for x in mean_differences]))


def zip_result_files(blind_files, matrix_files, pair_files):
    """
    Group each blind data file with the matrix and pair matrix file of the same allele.

    The allele name is derived from every blind file path with
    get_allele_name_from_path and looked up in both file collections with
    get_file_by_allele. Blind files for which either lookup returns None are
    left out of the result.

    :param blind_files: iterable of blind data filenames
    :param matrix_files: iterable of scoring matrix filenames
    :param pair_files: iterable of filenames of scoring matrices w/ pair coeffs
    :return: list of tuples, (blind data fname, scoring matrix fname, pair matrix fname),
             in the order of blind_files
    """
    # Materialise once so that one-shot iterables can be searched for every allele.
    m_files = list(matrix_files)
    p_files = list(pair_files)

    fname_tuples = []  # (blind data, scoring matrix, scoring matrix w/ pair coeffs)
    for blind_fname in blind_files:
        allele_name = get_allele_name_from_path(blind_fname)
        matrix_fname = get_file_by_allele(m_files, allele_name)
        pairs_fname = get_file_by_allele(p_files, allele_name)
        if matrix_fname is not None and pairs_fname is not None:
            fname_tuples.append((blind_fname, matrix_fname, pairs_fname))

    return fname_tuples


def calc_pair_improvements():
    """
    Average, per position pair, the performance change of pair matrices over the plain matrix.

    Every sub-directory of output_dir + 'all_alleles_v7/' is treated as one
    allele holding one pair matrix file per position pair. For each allele
    that has a matching file in the module-level matrix_files, the scoring
    matrix (from output_dir + 'all_alleles_v5/matrix/') is evaluated on the
    blind data (from data_dir + 'blind_subsets/') as a baseline, and every
    pair matrix file is evaluated on the same data. Element 0 of the result of
    get_performance is treated as the PCC and element 1 as the AUC.

    A pair whose PCC drops by more than 0.1 is printed and skipped for both
    metrics. The remaining differences (pair - baseline) are averaged per
    position pair, sorted by descending mean and written, one str((pair, mean))
    per line, to pcc_diff_selected_pairs.txt and auc_diff_selected_pairs.txt
    inside output_dir + 'all_alleles_v7/'.

    Relies on the module-level names output_dir, data_dir, blind_files and
    matrix_files.

    :return: None
    """
    pairs_root = output_dir + 'all_alleles_v7/'
    # Only sub-directories are alleles. The result files written at the end of
    # this function live in the same directory; without this filter a second
    # run would call os.listdir() on them and fail.
    allele_dirs = sorted(
        entry for entry in os.listdir(pairs_root) if os.path.isdir(pairs_root + entry)
    )
    position_pair_pcc_dict = {}
    position_pair_auc_dict = {}

    # blind_files is indexed by the position in the sorted allele list, while
    # matrix_files is indexed by that position minus the number of alleles
    # skipped so far for lacking a matching matrix file.
    index_offset = 0
    for index, allele in enumerate(allele_dirs):
        matrix_index = index - index_offset
        if matrix_index >= len(matrix_files):
            break
        matrix_file = matrix_files[matrix_index]

        if allele not in matrix_file:
            index_offset += 1
            continue

        blind_data = blind_files[index]
        assert allele in blind_data and allele in matrix_file
        allele_name = get_allele_name_from_path(blind_data)

        # Baseline: the scoring matrix without any pair coefficients.
        matrix_pssm = PSSM(output_dir + 'all_alleles_v5/matrix/' + matrix_file, False)
        matrix_pssm.load_peptides(data_dir + 'blind_subsets/' + blind_data)
        matrix_pssm.predict()
        matrix_performance = get_performance(matrix_pssm.measured_values, matrix_pssm.predicted_values)

        for specified_pair_file in os.listdir(pairs_root + allele):
            if specified_pair_file == 'stdout' or specified_pair_file == 'stderr':
                continue

            # Drop the last four characters of the file name, then take the
            # 3rd and 4th '_'-separated fields (kept as strings) as the pair.
            position_pair = tuple(specified_pair_file[:-4].split('_')[2:4])
            pair_pssm = PSSM(pairs_root + allele + '/' + specified_pair_file, True)
            pair_pssm.load_peptides(data_dir + 'blind_subsets/' + blind_data)
            pair_pssm.predict()
            pair_performance = get_performance(pair_pssm.measured_values, pair_pssm.predicted_values)

            pcc_diff = pair_performance[0] - matrix_performance[0]
            if pcc_diff < -0.1:
                # Reported and left out of both the PCC and the AUC averages.
                print(allele_name, str(position_pair), ' has pcc_diff of ', pcc_diff)
                continue
            position_pair_pcc_dict.setdefault(position_pair, []).append(pcc_diff)

            if pair_performance[1] is not None and matrix_performance[1] is not None:
                auc_diff = pair_performance[1] - matrix_performance[1]
                position_pair_auc_dict.setdefault(position_pair, []).append(auc_diff)

    outputs = (
        (position_pair_pcc_dict, 'pcc_diff_selected_pairs.txt'),
        (position_pair_auc_dict, 'auc_diff_selected_pairs.txt'),
    )
    for diff_dict, out_name in outputs:
        # float() keeps the mean a true division on Python 2 as well.
        averages = [
            (position_pair, sum(diff_list) / float(len(diff_list)))
            for position_pair, diff_list in diff_dict.items()
        ]
        averages.sort(key=lambda x: x[1], reverse=True)

        # Text mode: the content is str, which a binary-mode file rejects on Python 3.
        with open(pairs_root + out_name, 'w') as diff_file:
            diff_file.write('\n'.join([str(x) for x in averages]))