def _average_and_rank(diffs_by_pair):
    """
    Average the differences collected for each position pair and rank the pairs.

    :param diffs_by_pair: dict mapping a position pair to a non-empty list of differences
    :return: list of (position pair, mean difference) tuples, largest mean first
    """
    # .items() (rather than .iteritems()) behaves the same on Python 2 and also exists on Python 3.
    averaged = [(pair, sum(diffs) / float(len(diffs))) for pair, diffs in diffs_by_pair.items()]
    return sorted(averaged, key=lambda entry: entry[1], reverse=True)


def _write_ranked_differences(path, ranked_differences):
    """
    Write one '<position pair><TAB><mean difference>' line per entry to path.

    :param path: file to create (overwritten if it exists)
    :param ranked_differences: list of (position pair, mean difference) tuples
    :return: None
    """
    text = '\n'.join(str(pair) + '\t' + str(mean_diff) for pair, mean_diff in ranked_differences)
    # The file is opened in binary mode, so hand it bytes explicitly; a plain str is rejected
    # by Python 3. The context manager closes the handle even if the write fails.
    with open(path, 'wb') as out_file:
        out_file.write(text.encode('utf-8'))


def calc_performance_differences_with_selected_pairs(allele_fnames, output_fname):
    """
    Measure, per position pair, how much a pair-coefficient model changes PCC and AUC
    relative to the plain scoring matrix, averaged over all alleles.

    For every allele the plain matrix is evaluated once on the blind data. Then, for every
    position pair (p1, p2) with 1 <= p1 < p2 <= 9, the pair matrix restricted to that pair
    is evaluated on the same data, provided it reports at least one pair coefficient
    (num_pairs > 0). The per-pair differences (pair model minus plain matrix) are averaged
    across alleles and written, sorted from largest to smallest mean, to
    '../selected_pairs/pcc_<output_fname>' and '../selected_pairs/auc_<output_fname>'
    (paths are relative to the current working directory).

    :param allele_fnames: list of tuples, (blind data fname, scoring matrix fname, pair matrix fname)
    :param output_fname: Desired filename to save differences to
    :return: None
    """
    pcc_diff_dict = {}
    auc_diff_dict = {}
    for allele in allele_fnames:
        blind_fname, matrix_fname, pair_fname = allele[0], allele[1], allele[2]

        matrix_pssm = PSSM(matrix_fname, False)
        matrix_pssm.load_peptides(blind_fname)
        matrix_pssm.predict()
        matrix_performance = get_performance(matrix_pssm.measured_values, matrix_pssm.predicted_values)

        for position1 in range(1, 10):
            for position2 in range(position1 + 1, 10):
                position_pair = (position1, position2)
                pair_pssm = PSSM(pair_fname, True, position_pair)
                if not pair_pssm.num_pairs > 0:
                    # Nothing was selected for this pair, so there is no model to compare.
                    continue

                pair_pssm.load_peptides(blind_fname)
                pair_pssm.predict()
                pair_performance = get_performance(pair_pssm.measured_values, pair_pssm.predicted_values)

                # Index 0 of a performance tuple is the PCC, index 1 the AUC.
                pcc_diff = pair_performance[0] - matrix_performance[0]
                pcc_diff_dict.setdefault(position_pair, []).append(pcc_diff)

                # The AUC can be reported as None; only compare when both models have one.
                if pair_performance[1] is not None and matrix_performance[1] is not None:
                    auc_diff = pair_performance[1] - matrix_performance[1]
                    auc_diff_dict.setdefault(position_pair, []).append(auc_diff)

    _write_ranked_differences('../selected_pairs/pcc_' + output_fname, _average_and_rank(pcc_diff_dict))
    _write_ranked_differences('../selected_pairs/auc_' + output_fname, _average_and_rank(auc_diff_dict))
# Example no. 2
def get_matrix_and_pair_performance(blind_data, scoring_matrix, pair_matrix):
    """
    Convenience function used for analysis: evaluate two models on the same blind set and
    hand back both performance results, so callers that only need the PCC/AUC values do not
    have to repeat the load/predict boilerplate.

    :param blind_data: The blind split used to evaluate performance
    :param scoring_matrix: Path to a scoring matrix without pair coefficients
    :param pair_matrix: Path to a scoring matrix with pair coefficients
    :return: (matrix performance, pair performance); each is the tuple produced by
             get_performance, documented as (PCC, AUC, RMSD)
    """
    # Second PSSM argument: False for the plain matrix, True for the pair-coefficient matrix.
    models = ((scoring_matrix, False), (pair_matrix, True))

    results = []
    for model_path, with_pairs in models:
        pssm = PSSM(model_path, with_pairs)
        pssm.load_peptides(blind_data)
        pssm.predict()
        results.append(get_performance(pssm.measured_values, pssm.predicted_values))

    return results[0], results[1]
def calc_pair_improvements():
    """
    Compute, per position pair, the mean PCC and AUC change that a single-pair model brings
    over the plain scoring matrix, across all allele directories under 'all_alleles_v7/'.

    Reads the module-level names output_dir, data_dir, blind_files and matrix_files.
    Results are written, sorted from largest to smallest mean difference, to
    'pcc_diff_selected_pairs.txt' and 'auc_diff_selected_pairs.txt' inside
    output_dir + 'all_alleles_v7/', one str((position pair, mean difference)) per line.

    Pair models whose PCC is more than 0.1 below the plain matrix are reported on stdout
    and left out of both averages.

    :return: None
    """
    pairs_root = output_dir + 'all_alleles_v7/'
    # Keep directories only: the two result files written at the end of this function live in
    # this same directory, and calling os.listdir on them during a re-run would raise.
    allele_dirs = sorted(entry for entry in os.listdir(pairs_root) if os.path.isdir(pairs_root + entry))
    position_pair_pcc_dict = {}
    position_pair_auc_dict = {}

    # matrix_files is walked in step with allele_dirs. When the current allele has no matching
    # matrix file the offset grows by one, so the same matrix file is tried against the next allele.
    index_offset = 0
    for index, allele in enumerate(allele_dirs):
        pair_matrix_files = os.listdir(pairs_root + allele)
        blind_data = blind_files[index]
        allele_name = get_allele_name_from_path(blind_data)
        if (index - index_offset) >= len(matrix_files):
            break
        matrix_file = matrix_files[index - index_offset]

        if allele not in matrix_file:
            index_offset += 1
            continue

        assert allele in blind_data and allele in matrix_file

        blind_path = data_dir + 'blind_subsets/' + blind_data

        matrix_pssm = PSSM(output_dir + 'all_alleles_v5/matrix/' + matrix_file, False)
        matrix_pssm.load_peptides(blind_path)
        matrix_pssm.predict()
        matrix_performance = get_performance(matrix_pssm.measured_values, matrix_pssm.predicted_values)

        for specified_pair_file in pair_matrix_files:
            # Captured process output stored next to the pair matrices, not a model file.
            if specified_pair_file in ('stdout', 'stderr'):
                continue

            # The position pair is taken from the 3rd and 4th '_'-separated fields of the file
            # name after dropping its last four characters (presumably the extension - confirm).
            # The positions stay strings.
            position_pair = tuple(specified_pair_file[:-4].split('_')[2:4])
            pair_pssm = PSSM(pairs_root + allele + '/' + specified_pair_file, True)
            pair_pssm.load_peptides(blind_path)
            pair_pssm.predict()
            pair_performance = get_performance(pair_pssm.measured_values, pair_pssm.predicted_values)

            # Index 0 of a performance tuple is the PCC, index 1 the AUC.
            pcc_diff = pair_performance[0] - matrix_performance[0]
            if pcc_diff < -0.1:
                # Large regressions are reported and excluded from both the PCC and AUC averages.
                print(allele_name, str(position_pair), ' has pcc_diff of ', pcc_diff)
                continue
            position_pair_pcc_dict.setdefault(position_pair, []).append(pcc_diff)

            # The AUC can be reported as None; only compare when both models have one.
            if pair_performance[1] is not None and matrix_performance[1] is not None:
                auc_diff = pair_performance[1] - matrix_performance[1]
                position_pair_auc_dict.setdefault(position_pair, []).append(auc_diff)

    def ranked_averages(diffs_by_pair):
        # .items() (rather than .iteritems()) behaves the same on Python 2 and exists on Python 3.
        averaged = [(pair, sum(diffs) / float(len(diffs))) for pair, diffs in diffs_by_pair.items()]
        return sorted(averaged, key=lambda entry: entry[1], reverse=True)

    results = (
        ('pcc_diff_selected_pairs.txt', ranked_averages(position_pair_pcc_dict)),
        ('auc_diff_selected_pairs.txt', ranked_averages(position_pair_auc_dict)),
    )
    for result_name, ranked in results:
        text = '\n'.join(str(entry) for entry in ranked)
        # Binary mode needs bytes on Python 3; the context manager closes the file even if
        # the write fails.
        with open(pairs_root + result_name, 'wb') as result_file:
            result_file.write(text.encode('utf-8'))
def _save_scatter_with_identity_line(x_values, y_values, line_end, axis_limits, x_label, y_label, title, out_path):
    """
    Draw a scatter plot with a black y = x reference line, save it and reset pyplot.

    :param x_values: x coordinates of the points
    :param y_values: y coordinates of the points
    :param line_end: the reference line is drawn from (0, 0) to (line_end, line_end)
    :param axis_limits: (xmin, xmax, ymin, ymax) passed to plt.axis
    :param x_label: x axis label
    :param y_label: y axis label
    :param title: plot title
    :param out_path: file the figure is saved to
    :return: None
    """
    plt.scatter(x_values, y_values)
    plt.plot([0, line_end], [0, line_end], 'k-')
    plt.axis(axis_limits)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)
    plt.savefig(out_path)
    # Clear and close the figure so the next plot starts from a clean state.
    plt.clf()
    plt.cla()
    plt.close()


def create_graphs():
    """
    Evaluate every allele that has both a plain scoring matrix and a pair-coefficient matrix,
    then save comparison plots under output_dir + 'all_alleles_' + version + '/'.

    Reads the module-level names blind_files, matrix_files, pair_files, matrix_dir, pairs_dir,
    allele_data_dir, output_dir, version and top_ten_allele_names.

    Output:
      * measured-vs-predicted scatter plots (plain matrix and pair model) for alleles whose
        name contains 'A-0101', 'A-0201', 'A-0202' or 'A-0301';
      * 'mat_vs_pair_pcc.png' and 'mat_vs_pair_auc.png', plotting the plain-matrix score
        against the pair-model score for every evaluated allele;
      * the position_pair_vals of alleles listed in top_ten_allele_names are passed to
        calc_avg_pair_rms.

    :return: None
    """
    fname_tuples = []  # (blind data, scoring matrix, scoring matrix w/ pair coeffs)
    for blind_fname in blind_files:
        # The allele name is the blind file name without its last 10 characters
        # (presumably a fixed suffix - confirm against the data directory).
        allele_name = blind_fname[:-10]
        matrix_fname = get_file_by_allele(matrix_files, allele_name)
        pairs_fname = get_file_by_allele(pair_files, allele_name)
        # Alleles missing either model cannot be compared and are skipped.
        if matrix_fname is not None and pairs_fname is not None:
            fname_tuples.append((blind_fname, matrix_fname, pairs_fname))

    figure_dir = output_dir + 'all_alleles_' + version + '/'
    highlighted_alleles = ('A-0101', 'A-0201', 'A-0202', 'A-0301')

    matrix_pcc = []
    pair_pcc = []
    matrix_auc = []
    pair_auc = []

    position_pair_lists = []

    for allele_files in fname_tuples:
        allele_name = allele_files[0][:-10]
        blind_path = allele_data_dir + allele_files[0]

        matrix_pssm = PSSM(matrix_dir + allele_files[1], False)
        matrix_pssm.load_peptides(blind_path, True)
        matrix_pssm.predict()
        matrix_performance = get_performance(matrix_pssm.measured_values, matrix_pssm.predicted_values)

        pair_pssm = PSSM(pairs_dir + allele_files[2], True)
        pair_pssm.load_peptides(blind_path, True)
        pair_pssm.predict()
        pair_performance = get_performance(pair_pssm.measured_values, pair_pssm.predicted_values)

        # Index 0 of a performance tuple is the PCC, index 1 the AUC.
        matrix_pcc.append(matrix_performance[0])
        pair_pcc.append(pair_performance[0])
        matrix_auc.append(matrix_performance[1])
        pair_auc.append(pair_performance[1])

        if allele_name in top_ten_allele_names:
            position_pair_lists.append(pair_pssm.position_pair_vals)

        if any(tag in allele_name for tag in highlighted_alleles):
            _save_scatter_with_identity_line(
                matrix_pssm.measured_values, matrix_pssm.predicted_values,
                8, (0, 8.0, 0, 8.0),
                'Measured log10(IC50)', 'Predicted log10(IC50)',
                allele_name + ' Matrix',
                figure_dir + allele_name + '_matrix_meas_vs_pred.png')

            _save_scatter_with_identity_line(
                pair_pssm.measured_values, pair_pssm.predicted_values,
                8, (0, 8.0, 0, 8.0),
                'Measured log10(IC50)', 'Predicted log10(IC50)',
                allele_name + ' with ' + str(pair_pssm.num_pairs) + ' pairs selected',
                figure_dir + allele_name + '_pair_meas_vs_pred.png')

    calc_avg_pair_rms(position_pair_lists)

    _save_scatter_with_identity_line(
        matrix_pcc, pair_pcc,
        1, (0.5, 1.0, 0.5, 1.0),
        'Pairs(-)', 'Pairs(+)',
        'Matrix vs Pair PCC',
        figure_dir + 'mat_vs_pair_pcc.png')

    # The AUC can be reported as None (other callers of get_performance in this file guard
    # against it); such alleles have no point to draw, so drop them instead of handing None
    # to matplotlib.
    auc_points = [(matrix_value, pair_value)
                  for matrix_value, pair_value in zip(matrix_auc, pair_auc)
                  if matrix_value is not None and pair_value is not None]
    _save_scatter_with_identity_line(
        [point[0] for point in auc_points], [point[1] for point in auc_points],
        1, (0.75, 1.0, 0.75, 1.0),
        'Pairs(-)', 'Pairs(+)',
        'Matrix vs Pair AUC',
        figure_dir + 'mat_vs_pair_auc.png')