def calc_performance_differences_with_selected_pairs(allele_fnames, output_fname):
    """
    Rank position pairs by how much their pair coefficients change blind-set performance.

    For every allele the plain scoring matrix is evaluated once. Then, for each position pair
    (position1 < position2, both in 1..9), a pair matrix restricted to that pair is evaluated on the same
    blind data, provided the restricted matrix contains at least one pair coefficient. The differences
    (pair - matrix) in PCC and AUC are collected per position pair, averaged over all alleles, sorted from
    the largest to the smallest average and written as "<position pair>TAB<average>" lines to
    '../selected_pairs/pcc_<output_fname>' and '../selected_pairs/auc_<output_fname>'.

    An AUC difference is only recorded when both AUC values are available (not None).

    :param allele_fnames: list of tuples, (blind data fname, scoring matrix fname, pair matrix fname)
    :param output_fname: Desired filename to save differences to
    :return: None
    """
    def sorted_averages(diff_dict):
        # Mean difference per position pair, best improvement first.
        averages = [(pair, sum(diffs) / float(len(diffs))) for pair, diffs in diff_dict.items()]
        return sorted(averages, key=lambda item: item[1], reverse=True)

    def write_ranking(path, ranking):
        # Text mode plus a context manager: the handle is closed even if the write fails.
        with open(path, 'w') as handle:
            handle.write('\n'.join([str(pair) + '\t' + str(average) for pair, average in ranking]))

    pcc_diff_dict = {}
    auc_diff_dict = {}
    for allele in allele_fnames:
        blind_fname, matrix_fname, pair_fname = allele[0], allele[1], allele[2]

        # Baseline: scoring matrix without pair coefficients.
        matrix_pssm = PSSM(matrix_fname, False)
        matrix_pssm.load_peptides(blind_fname)
        matrix_pssm.predict()
        matrix_performance = get_performance(matrix_pssm.measured_values, matrix_pssm.predicted_values)

        for position1 in range(1, 10):
            for position2 in range(position1 + 1, 10):
                position_pair = (position1, position2)
                pair_pssm = PSSM(pair_fname, True, position_pair)
                if pair_pssm.num_pairs <= 0:
                    # Nothing was selected for this position pair, so there is nothing to compare.
                    continue

                pair_pssm.load_peptides(blind_fname)
                pair_pssm.predict()
                pair_performance = get_performance(pair_pssm.measured_values, pair_pssm.predicted_values)

                pcc_diff = pair_performance[0] - matrix_performance[0]
                pcc_diff_dict.setdefault(position_pair, []).append(pcc_diff)

                if pair_performance[1] is not None and matrix_performance[1] is not None:
                    auc_diff = pair_performance[1] - matrix_performance[1]
                    auc_diff_dict.setdefault(position_pair, []).append(auc_diff)

    write_ranking('../selected_pairs/pcc_' + output_fname, sorted_averages(pcc_diff_dict))
    write_ranking('../selected_pairs/auc_' + output_fname, sorted_averages(auc_diff_dict))
Beispiel #2
0
def predict_results(tuples_to_predict, method, use_pair_coeffs=False, log_transform_measurements=True):
    """
    Score peptide files with their scoring matrices and print a short report for each.

    For every entry the report contains: a header line "<allele> <method> <mode>", the number and the
    percentage of negative predictions, and the performance values returned by get_performance under
    the column names pcc / auc / rmsd.

    The allele name is the second '/'-separated component of the scoring matrix path; the mode is the
    part of the file name after the last '-' with its last four characters removed (presumably the
    file extension -- verify against the naming scheme of the matrix files).

    :param tuples_to_predict: iterable of (scoring matrix path, peptide file path) tuples
    :param method: label that is only echoed in the header line of each report
    :param use_pair_coeffs: passed to PSSM as its second argument
    :param log_transform_measurements: passed to PSSM.load_peptides as its second argument
    :return: None
    """
    for pair in tuples_to_predict:
        smm_classifier = PSSM(pair[0], use_pair_coeffs)
        smm_classifier.load_peptides(pair[1], log_transform_measurements)
        smm_classifier.predict()

        path_parts = pair[0].split("/")
        allele_name = path_parts[1]
        mode = (path_parts[-1].split("-")[-1])[0:-4]
        print(" ".join((allele_name, method, mode)))

        predictions = smm_classifier.predicted_values
        total = len(predictions)
        neg_ct = sum(1 for pred in predictions if pred < 0)
        print("there were " + str(neg_ct) + " neg predictions out of " + str(total) + " predictions")

        # Scale to a real percentage (the label says "%") and do not divide by zero on an empty set.
        percent_negative = 100.0 * neg_ct / total if total else 0.0
        print("that is " + str(percent_negative) + "% negative")

        performance = get_performance(smm_classifier.measured_values, predictions)
        print("\t".join(["pcc", "auc", "rmsd"]))
        print("\t".join(map(str, performance)))
        print("")
Beispiel #3
0
def get_matrix_and_pair_performance(blind_data, scoring_matrix, pair_matrix):
    """
    Convenience function for analysis: evaluate one blind set with and without pair coefficients.

    Both scoring matrices are loaded, used to predict the peptides of the same blind split, and scored
    with get_performance, so callers that only need the performance numbers do not have to repeat the
    load/predict boilerplate.

    :param blind_data: The blind split used to evaluate performance
    :param scoring_matrix: Path to a scoring matrix without pair coefficients
    :param pair_matrix: Path to a scoring matrix with pair coefficients
    :return: (matrix performance, pair performance); each is a performance tuple with (PCC, AUC, RMSD)
    """
    def evaluate(matrix_path, with_pair_coeffs):
        # Load the model, predict the blind peptides and score the predictions.
        model = PSSM(matrix_path, with_pair_coeffs)
        model.load_peptides(blind_data)
        model.predict()
        return get_performance(model.measured_values, model.predicted_values)

    without_pairs = evaluate(scoring_matrix, False)
    with_pairs = evaluate(pair_matrix, True)
    return without_pairs, with_pairs
def calc_pair_improvements():
    """
    Average, per position pair, the PCC/AUC change obtained by adding that pair's coefficients.

    Relies on the module-level names output_dir, data_dir, blind_files and matrix_files. Every
    sub-directory of output_dir + 'all_alleles_v7/' is treated as one allele holding one pair matrix file
    per position pair (entries named 'stdout' or 'stderr' are ignored). The allele's plain matrix from
    'all_alleles_v5/matrix/' is the baseline; both are evaluated on the allele's blind subset.

    blind_files is indexed by the position of the allele directory. matrix_files is indexed by that
    position minus the number of allele directories seen so far that had no matching matrix file.

    A pair whose PCC drops by more than 0.1 is reported on stdout and left out of both the PCC and the
    AUC averages. AUC differences are only recorded when both AUC values are not None.

    The averaged differences, sorted from largest to smallest, are written as one str((pair, average))
    per line to 'pcc_diff_selected_pairs.txt' and 'auc_diff_selected_pairs.txt' inside 'all_alleles_v7/'.

    :return: None
    """
    def sorted_averages(diff_dict):
        # Mean difference per position pair, best improvement first.
        averages = [(pair, sum(diffs) / float(len(diffs))) for pair, diffs in diff_dict.items()]
        return sorted(averages, key=lambda item: item[1], reverse=True)

    def write_lines(path, entries):
        # Context manager so the handle is closed even if the write fails.
        with open(path, 'w') as handle:
            handle.write('\n'.join([str(entry) for entry in entries]))

    pairs_root = output_dir + 'all_alleles_v7/'
    # Keep directories only: the two result files written below live in pairs_root as well, and on a
    # re-run they would otherwise be taken for alleles and make os.listdir() fail.
    allele_dirs = sorted(name for name in os.listdir(pairs_root) if os.path.isdir(pairs_root + name))
    position_pair_pcc_dict = {}
    position_pair_auc_dict = {}

    index_offset = 0
    for index, allele in enumerate(allele_dirs):
        pair_matrix_files = os.listdir(pairs_root + allele)
        blind_data = blind_files[index]
        allele_name = get_allele_name_from_path(blind_data)
        if (index - index_offset) >= len(matrix_files):
            break
        matrix_file = matrix_files[index - index_offset]

        if allele not in matrix_file:
            # No plain matrix for this allele: skip it and keep matrix_files aligned for the next one.
            index_offset += 1
            continue

        assert allele in blind_data and allele in matrix_file

        matrix_pssm = PSSM(output_dir + 'all_alleles_v5/matrix/' + matrix_file, False)
        matrix_pssm.load_peptides(data_dir + 'blind_subsets/' + blind_data)
        matrix_pssm.predict()
        matrix_performance = get_performance(matrix_pssm.measured_values, matrix_pssm.predicted_values)

        for specified_pair_file in pair_matrix_files:
            if specified_pair_file in ('stdout', 'stderr'):
                continue

            # The position pair is the 3rd and 4th '_'-separated field of the file name, taken after
            # dropping the name's last four characters.
            position_pair = tuple(specified_pair_file[:-4].split('_')[2:4])
            pair_pssm = PSSM(pairs_root + allele + '/' + specified_pair_file, True)
            pair_pssm.load_peptides(data_dir + 'blind_subsets/' + blind_data)
            pair_pssm.predict()
            pair_performance = get_performance(pair_pssm.measured_values, pair_pssm.predicted_values)

            pcc_diff = pair_performance[0] - matrix_performance[0]
            if pcc_diff < -0.1:
                # Large drops are reported and excluded from the PCC and AUC averages.
                print(allele_name, str(position_pair), ' has pcc_diff of ', pcc_diff)
                continue
            position_pair_pcc_dict.setdefault(position_pair, []).append(pcc_diff)

            if pair_performance[1] is not None and matrix_performance[1] is not None:
                auc_diff = pair_performance[1] - matrix_performance[1]
                position_pair_auc_dict.setdefault(position_pair, []).append(auc_diff)

    write_lines(pairs_root + 'pcc_diff_selected_pairs.txt', sorted_averages(position_pair_pcc_dict))
    write_lines(pairs_root + 'auc_diff_selected_pairs.txt', sorted_averages(position_pair_auc_dict))
def create_graphs():
    """
    Plot matrix-only versus pair-coefficient performance for every allele that has both models.

    Relies on the module-level names blind_files, matrix_files, pair_files, matrix_dir, pairs_dir,
    allele_data_dir, output_dir, version, top_ten_allele_names and plt.

    For each blind file that has both a scoring matrix and a pair matrix (looked up with
    get_file_by_allele), both models are evaluated on the blind data. Measured-vs-predicted scatter
    plots are saved for the alleles whose name contains A-0101, A-0201, A-0202 or A-0301. The position
    pair values of the alleles in top_ten_allele_names are passed to calc_avg_pair_rms. Finally two
    summary scatter plots (matrix vs pair PCC, matrix vs pair AUC) are saved. All images go to
    output_dir + 'all_alleles_' + version + '/'.

    :return: None
    """
    def save_scatter(x_values, y_values, diagonal, axis_limits, xlabel, ylabel, title, out_path):
        # Draw one scatter plot with a reference diagonal, save it and reset pyplot's state.
        plt.scatter(x_values, y_values)
        plt.plot(diagonal, diagonal, 'k-')
        plt.axis(axis_limits)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        plt.title(title)
        plt.savefig(out_path)
        plt.clf()
        plt.cla()
        plt.close()

    fname_tuples = []  # (blind data, scoring matrix, scoring matrix w/ pair coeffs)
    for blind_fname in blind_files:
        # The allele name is the blind file name without its last ten characters.
        allele_name = blind_fname[:-10]
        matrix_fname = get_file_by_allele(matrix_files, allele_name)
        pairs_fname = get_file_by_allele(pair_files, allele_name)
        if matrix_fname is not None and pairs_fname is not None:
            fname_tuples.append((blind_fname, matrix_fname, pairs_fname))

    plot_dir = output_dir + 'all_alleles_' + version + '/'
    detail_alleles = ('A-0101', 'A-0201', 'A-0202', 'A-0301')

    matrix_pcc = []
    pair_pcc = []
    matrix_auc = []
    pair_auc = []

    position_pair_lists = []

    for allele_files in fname_tuples:
        allele_name = allele_files[0][:-10]

        matrix_pssm = PSSM(matrix_dir + allele_files[1], False)
        matrix_pssm.load_peptides(allele_data_dir + allele_files[0], True)
        matrix_pssm.predict()
        matrix_performance = get_performance(matrix_pssm.measured_values, matrix_pssm.predicted_values)

        pair_pssm = PSSM(pairs_dir + allele_files[2], True)
        pair_pssm.load_peptides(allele_data_dir + allele_files[0], True)
        pair_pssm.predict()
        pair_performance = get_performance(pair_pssm.measured_values, pair_pssm.predicted_values)

        matrix_pcc.append(matrix_performance[0])
        pair_pcc.append(pair_performance[0])

        # The AUC can be None; such a point cannot be drawn, so keep only complete pairs
        # (the two lists must stay the same length for the scatter plot).
        if matrix_performance[1] is not None and pair_performance[1] is not None:
            matrix_auc.append(matrix_performance[1])
            pair_auc.append(pair_performance[1])

        if allele_name in top_ten_allele_names:
            position_pair_lists.append(pair_pssm.position_pair_vals)

        if any(tag in allele_name for tag in detail_alleles):
            save_scatter(matrix_pssm.measured_values, matrix_pssm.predicted_values,
                         [0, 8], (0, 8.0, 0, 8.0),
                         'Measured log10(IC50)', 'Predicted log10(IC50)',
                         allele_name + ' Matrix',
                         plot_dir + allele_name + '_matrix_meas_vs_pred.png')
            save_scatter(pair_pssm.measured_values, pair_pssm.predicted_values,
                         [0, 8], (0, 8.0, 0, 8.0),
                         'Measured log10(IC50)', 'Predicted log10(IC50)',
                         allele_name + ' with ' + str(pair_pssm.num_pairs) + ' pairs selected',
                         plot_dir + allele_name + '_pair_meas_vs_pred.png')

    calc_avg_pair_rms(position_pair_lists)

    save_scatter(matrix_pcc, pair_pcc, [0, 1], (0.5, 1.0, 0.5, 1.0),
                 'Pairs(-)', 'Pairs(+)', 'Matrix vs Pair PCC',
                 plot_dir + 'mat_vs_pair_pcc.png')
    save_scatter(matrix_auc, pair_auc, [0, 1], (0.75, 1.0, 0.75, 1.0),
                 'Pairs(-)', 'Pairs(+)', 'Matrix vs Pair AUC',
                 plot_dir + 'mat_vs_pair_auc.png')
Beispiel #6
0
def compare_new_and_old_results():
    """
    Print predictions and performance of several HLA-A-0250 scoring matrices on the same peptide file.

    For every scoring matrix listed below the peptides in 'HLA-A-0250-9.txt' are predicted and a report
    is printed: a label line, one prediction per line, the column names pcc / auc / rmsd and the values
    returned by get_performance, followed by an empty line.

    :return: None
    """
    peptides_to_score = 'HLA-A-0250-9.txt'
    header = ['pcc', 'auc', 'rmsd']

    # Each entry: (report label, PSSM constructor arguments, also call test_results()).
    # "sm" in the file names stands for scoring matrix.
    runs = [
        # generated from trunk branch using all four lambda/covariance options
        ('cgroup results', ('HLA-A-0250-9.txt-cgroup.txt', False), True),
        ('lgroup results', ('HLA-A-0250-9.txt-lgroup.txt', False), False),
        ('cone results', ('HLA-A-0250-9.txt-cone.txt', False), False),
        ('lone results', ('HLA-A-0250-9.txt-lone.txt', False), False),
        ('production results', ('HLA-A-0250-9-production.txt', False), False),
        # generated using pcoeff_one; mat was written out before TrainPairs was called
        ('pre pair training results', ('mat-HLA-A-0250-9.txt', False), False),
        # after TrainPairs; constructed with PSSM's default for the second argument
        ('pcoeff_one results', ('pairs-HLA-A-0250-9.txt',), False),
    ]

    for label, pssm_args, run_test_results in runs:
        scoring_matrix = PSSM(*pssm_args)
        scoring_matrix.load_peptides(peptides_to_score)
        scoring_matrix.predict()
        if run_test_results:
            # Only the first matrix (cgroup) gets test_results() called on it.
            scoring_matrix.test_results()

        print(label)
        # Strip the list punctuation from str(predictions) so that each value is on its own line.
        print(str(scoring_matrix.peptide_predictions)
              .replace(' ', '').replace('[', '').replace(']', '').replace(',', '\n'))
        performance = get_performance(scoring_matrix.measured_values, scoring_matrix.peptide_predictions)
        print('\t'.join(header))
        print('\t'.join(map(str, performance)))
        print('')
Beispiel #7
0
    # fname_peptides_to_score = 'HLA-A-0216-9.txt'

    # fname_scoring_matrix = 'pairs-HLA-A-0250-9.txt'
    # fname_scoring_matrix = 'mat-HLA-A-0250-9.txt'
    # fname_peptides_to_score = 'HLA-A-0250-9.txt'

    # fname_scoring_matrix = 'pairs-HLA-A-2602-9.txt'
    # fname_scoring_matrix = 'mat-HLA-A-2602-9.txt'
    # fname_peptides_to_score = 'HLA-A-2602-9.txt'

    pair_coeff_sm = PSSM('data/HLA-A-0201/HLA-A-0201-9-pcoeff_group.txt')
    pair_coeff_sm.load_peptides('data/HLA-A-0201/HLA-A-0201-9.txt')
    pair_coeff_sm.predict()
    print '0201 pcoeff_one results'
    print str(pair_coeff_sm.peptide_predictions).replace(' ','').replace('[','').replace(']','').replace(',','\n')
    pair_coeff_performance = get_performance(pair_coeff_sm.measured_values, pair_coeff_sm.peptide_predictions)
    print '\t'.join(['pcc', 'auc', 'rmsd'])
    print '\t'.join(map(str, pair_coeff_performance))
    print ''

    # fname_predictions = 'pred' + fname_scoring_matrix

    
    # pssm = PSSM(fname_scoring_matrix, False)
    # pssm.load_peptides(fname_peptides_to_score)
    # score_list = pssm.predict()
    # print score_list
    # pssm.test_results()
    # compare_new_and_old_results()
    print 'finished'
    # header = ['peptide', 'predAffinity(log10(IC50))']