Example #1
0
def _load_hbond_map(hbond_filename, ref_len):
    """Read an H-bond file into a symmetric ``ref_len`` x ``ref_len`` matrix.

    Each non-blank line must hold at least three whitespace-separated
    fields: two 1-based residue indices and a numeric value.  The negated
    value is stored at both ``[i, j]`` and ``[j, i]``.
    """
    hbond_map = np.zeros((ref_len, ref_len))
    with open(hbond_filename) as hbond_file:
        for line in hbond_file:
            # split() without argument tolerates tabs / repeated blanks and
            # yields an empty list for blank lines, which are skipped.
            fields = line.split()
            if not fields:
                continue
            i = int(fields[0]) - 1
            j = int(fields[1]) - 1
            val = float(fields[2])
            hbond_map[i, j] = -val
            hbond_map[j, i] = -val
    return hbond_map


def get_ppv_hbond(fasta_filename,
                  c_filename,
                  hbond_filename,
                  factor=1.0,
                  min_score=-1.0,
                  sep=' ',
                  outfilename=''):
    """Score predicted contacts against an H-bond derived reference map.

    Parameters:
        fasta_filename: FASTA file; the first sequence defines the
            reference length.
        c_filename: contact file handed to ``parse_contacts.parse``.
        hbond_filename: text file with ``i j value`` lines (1-based i, j).
        factor: with the default ``min_score``, selection stops once
            ``len(seq) * factor`` contacts were accepted.
        min_score: if not -1.0, selection stops at the first contact whose
            score is below this value (that contact is still examined
            before the loop stops, as in the original code).
        sep: field separator passed to the contact parser.
        outfilename: accepted for interface compatibility; not used.

    Returns:
        ``(hbond_filename, PPV, TP, FP)`` where the last three values come
        from ``get_ppv_helper``.  The same four values are printed.
    """
    ### get sequence
    with open(fasta_filename, 'r') as fasta_file:
        seq = list(parse_fasta.read_fasta(fasta_file).values())[0][0]
    ref_len = len(seq)

    ### get top "factor" * "ref_len" predicted contacts
    with open(c_filename, 'r') as c_file:
        contacts = parse_contacts.parse(c_file, sep)

    contacts_x = []
    contacts_y = []

    for contact in contacts:
        score = contact[0]
        # contact files are 1-based, matrix indices are 0-based
        c_x = contact[1] - 1
        c_y = contact[2] - 1

        # only keep pairs at least 5 positions apart in sequence
        if abs(c_x - c_y) >= 5:
            contacts_x.append(c_x)
            contacts_y.append(c_y)

        if min_score == -1.0 and len(contacts_x) >= ref_len * factor:
            break
        if score < min_score:
            break

    ref_contact_map = _load_hbond_map(hbond_filename, ref_len)

    PPV, TP, FP = get_ppv_helper(contacts_x, contacts_y, ref_contact_map,
                                 ref_len, factor)

    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print('%s %s %s %s' % (hbond_filename, PPV, TP, FP))
    return (hbond_filename, PPV, TP, FP)
def get_scores_from_contacts(c_filename, min_dist, factor_value, min_score, ref_len):
    """Select ranked contacts and return them as three parallel lists.

    The separator is obtained from ``get_separator(c_filename)`` and the
    contacts from ``parse_contacts.parse(c_filename, sep, min_dist)``; each
    parsed entry is read as ``(score, residue_a, residue_b)``.

    Returns ``(contacts_x, contacts_y, scores)`` where the residue numbers
    have been shifted to 0-based indices.
    """
    ranked = parse_contacts.parse(c_filename, get_separator(c_filename),
                                  min_dist)

    xs, ys, kept_scores = [], [], []

    for entry in ranked:
        score = entry[0]
        # Entries carry 1-based residue numbers; convert to 0-based indices.
        res_a = entry[1] - 1
        res_b = entry[2] - 1

        # Keep the pair only when it is at least min_dist apart in sequence.
        if abs(res_a - res_b) >= min_dist:
            xs.append(res_a)
            ys.append(res_b)
            kept_scores.append(score)

        # With the sentinel min_score of -1.0 the selection is limited by
        # count (ref_len * factor_value accepted contacts) ...
        if min_score == -1.0 and len(xs) >= ref_len * factor_value:
            break
        # ... and in every case it stops at the first score below min_score.
        if score < min_score:
            break

    return xs, ys, kept_scores
Example #3
0
# NOTE(review): this is an excerpt of a larger script; md, cg, atmsel, mdl,
# rsr, aln, actions, trcfil, query_id and contact_filename are defined
# outside the visible part.
#
# Run the `md` optimizer on the selection for at most 50 iterations at
# temperature=300. Two actions are attached, both with 10 as first argument:
# one writes a structure file named from query_id, one traces to trcfil.
md.optimize(atmsel,
            temperature=300,
            max_iterations=50,
            actions=[
                actions.write_structure(10, query_id + '.D9998%04d.pdb'),
                actions.trace(10, trcfil)
            ])
# Finish off with some more CG, and write stats every 5 steps
cg.optimize(atmsel, max_iterations=20, actions=[actions.trace(5, trcfil)])

# Energy of the selection after optimisation (not used again in this excerpt).
mpdf = atmsel.energy()

# Write the current model to '<query_id>.D00000001.pdb'.
mdl.write(file=query_id + '.D00000001.pdb')

# Parse the predicted contacts; each entry is unpacked as (score, i, j) below.
# NOTE(review): the file handle opened here is never closed explicitly.
contacts = parse_contacts.parse(open(contact_filename, 'r'))
# count and seq_len are initialised here but not used in the visible part.
count = 0
seq_len = len(aln[query_id])
for (score, i, j) in contacts:
    # Add a Gaussian restraint (mean 10.0, stdev 2) on the distance between
    # the atoms looked up as 'CA:<i>' and 'CA:<j>'.
    rsr.add(
        forms.gaussian(group=physical.xy_distance,
                       feature=features.distance(mdl.atoms['CA:%d' % i],
                                                 mdl.atoms['CA:%d' % j]),
                       mean=10.0,
                       stdev=2))
    # Disabled alternative restraint form, kept from the original:
    #rsr.add(MyFade(group=physical.xy_distance,
    #            feature=features.distance(mdl.atoms['CA:%d' % i],
    #                                      mdl.atoms['CA:%d' % j]),
    #            cutoff_lower=-100, cutoff_upper=100, fade_zone=92, well_depth=-150))
    # Flag residues whose one-letter code is 'G'; i and j are used 1-based
    # here (hence the "- 1"). The flags are not consumed in the visible part.
    is_gly_a = aln[query_id].residues[i - 1].code == 'G'
    is_gly_b = aln[query_id].residues[j - 1].code == 'G'
def _parse_file(parser, path, *args):
    """Open *path* for reading, return ``parser(handle, *args)``, close it."""
    with open(path, 'r') as handle:
        return parser(handle, *args)


def _top_contacts(contacts, max_count, min_sep=5):
    """Return ``(xs, ys, scores)`` for the leading contacts of *contacts*.

    Entries are read as ``(score, res_a, res_b)`` with 1-based residues and
    converted to 0-based indices.  Pairs closer than *min_sep* in sequence
    are dropped; iteration stops once *max_count* pairs were accepted.
    """
    xs, ys, scores = [], [], []
    for contact in contacts:
        c_x = contact[1] - 1
        c_y = contact[2] - 1
        if abs(c_x - c_y) >= min_sep:
            xs.append(c_x)
            ys.append(c_y)
            scores.append(contact[0])
        if len(xs) >= max_count:
            break
    return xs, ys, scores


def plot_map(fasta_filename,
             c_filename,
             factor,
             c2_filename='',
             ss_fname='',
             psipred_horiz_fname='',
             psipred_vert_fname='',
             pdb_filename='',
             is_heavy=False,
             chain='',
             sep=',',
             outfilename=''):
    """Draw a contact map for the predicted contacts in *c_filename*.

    Parameters:
        fasta_filename: FASTA file; its first sequence sets the map size.
        c_filename: contact file; the first ``len(seq) * factor`` contacts
            with sequence separation >= 5 are plotted.
        factor: multiplier of the sequence length, see above.
        c2_filename: optional second contact file, drawn mirrored (x and y
            swapped) relative to the first one.
        ss_fname, psipred_horiz_fname, psipred_vert_fname: optional
            secondary-structure inputs; 'H' and 'E' positions are marked on
            the diagonal.  The horizontal PSIPRED file takes precedence,
            then the vertical one, then *ss_fname*.
        pdb_filename: optional structure; enables the grey reference
            contacts, TP/FP colouring and the PPV title.
        is_heavy: use ``get_heavy_contacts`` with cutoff 5 instead of
            ``get_cb_contacts`` with cutoff 8 for the reference map.
        chain: chain identifier handed to the PDB parsers.
        sep: field separator for both contact files.
        outfilename: output path.  '.pdf' is written through ``PdfPages``,
            '.png'/'.jpg'/'.jpeg' through ``plt.savefig``; any other name
            gets '.pdf' appended; empty means '<c_filename>_ContactMap.pdf'.

    Returns None.  The figure is saved and then shown with ``plt.show()``.
    """
    acc = fasta_filename.split('.')[0][:4]

    # get sequence
    seq = list(
        _parse_file(parse_fasta.read_fasta, fasta_filename).values())[0][0]
    ref_len = len(seq)

    # get top "factor" * "ref_len" predicted contacts
    contacts = _parse_file(parse_contacts.parse, c_filename, sep)
    contacts_x, contacts_y, scores = _top_contacts(contacts,
                                                   ref_len * factor)

    # start plotting
    fig = plt.figure()
    ax = fig.add_subplot(111)

    # plot secondary structure on the diagonal if given
    if psipred_horiz_fname or psipred_vert_fname or ss_fname:
        if psipred_horiz_fname:
            ss = _parse_file(parse_psipred.horizontal, psipred_horiz_fname)
        elif psipred_vert_fname:
            ss = _parse_file(parse_psipred.vertical, psipred_vert_fname)
        else:
            ss = _parse_file(parse_ss.parse, ss_fname)

        assert len(ss) == ref_len

        for pos, state in enumerate(ss):
            if state == 'H':
                plt.plot(pos, pos, 'o', c='#8B0043', mec="#8B0043",
                         markersize=2)
            elif state == 'E':
                plt.plot(pos, pos, 'D', c='#0080AD', mec="#0080AD",
                         markersize=2)

    # plot reference contacts in the background if given
    if pdb_filename:
        res_lst = _parse_file(parse_pdb.get_coordinates, pdb_filename, chain)
        cb_lst = _parse_file(parse_pdb.get_cb_coordinates, pdb_filename,
                             chain)
        atom_seq = _parse_file(parse_pdb.get_atom_seq, pdb_filename, chain)

        align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -0.5, -0.1)

        atom_seq_ali = align[-1][0]
        seq_ali = align[-1][1]

        # Re-index the structure coordinates onto the FASTA sequence: a gap
        # in the structure yields a '-' placeholder, a gap in the FASTA
        # sequence consumes one structure residue without emitting it.
        j = 0
        gapped_res_lst = []
        gapped_cb_lst = []
        for atom_res, seq_res in zip(atom_seq_ali, seq_ali):
            if atom_res == '-':
                gapped_res_lst.append('-')
                gapped_cb_lst.append('-')
            elif seq_res == '-':
                j += 1
            else:
                gapped_res_lst.append(res_lst[j])
                gapped_cb_lst.append(cb_lst[j])
                j += 1

        if is_heavy:
            dist_mat = get_heavy_contacts(gapped_res_lst)
            cutoff = 5
        else:
            dist_mat = get_cb_contacts(gapped_cb_lst)
            cutoff = 8
        ref_contact_map = dist_mat < cutoff
        ref_contacts = np.where(ref_contact_map)

        ref_contacts_x = ref_contacts[0]
        ref_contacts_y = ref_contacts[1]

        PPVs, TPs, FPs = get_ppvs(contacts_x, contacts_y, ref_contact_map,
                                  atom_seq_ali, ref_len, factor)
        tp_colors = get_tp_colors(contacts_x, contacts_y, ref_contact_map,
                                  atom_seq_ali)

        print('%s %s %s %s' % (pdb_filename, PPVs[-1], TPs[-1], FPs[-1]))

        ax.scatter(ref_contacts_x,
                   ref_contacts_y,
                   marker='o',
                   c='#CCCCCC',
                   lw=0,
                   edgecolor='#CCCCCC')

    # Shared marker styles for the scatter calls below.
    tp_style = dict(marker='o', s=6, alpha=0.75, linewidths=0.0)
    plain_style = dict(marker='o', s=4, linewidths=0.5)
    score_style = dict(marker='o', s=4, alpha=0.75, cmap=cm.jet,
                       linewidths=0.1)

    # plot predicted contacts from second contact map if given
    if c2_filename:
        contacts2 = _parse_file(parse_contacts.parse, c2_filename, sep)
        contacts2_x, contacts2_y, _ = _top_contacts(contacts2,
                                                    ref_len * factor)

        # use TP/FP color coding if reference contacts given
        if pdb_filename:
            PPVs2, TPs2, FPs2 = get_ppvs(contacts2_x, contacts2_y,
                                         ref_contact_map, atom_seq_ali,
                                         ref_len, factor)
            tp2_colors = get_tp_colors(contacts2_x, contacts2_y,
                                       ref_contact_map, atom_seq_ali)
            print('%s %s %s %s' %
                  (pdb_filename, PPVs2[-1], TPs2[-1], FPs2[-1]))
            # The first format string has two placeholders; it previously
            # received only the PPV value and raised TypeError.  It now gets
            # the accession as well, like the single-map title below.
            fig.suptitle('%s\nPPV (upper left) = %.2f |' % (acc, PPVs[-1]) +
                         'PPV (lower right) = %.2f' % (PPVs2[-1]))
            ax.scatter(contacts2_y[::-1],
                       contacts2_x[::-1],
                       c=tp2_colors[::-1],
                       **tp_style)
            ax.scatter(contacts_x[::-1],
                       contacts_y[::-1],
                       c=tp_colors[::-1],
                       **tp_style)
        else:
            ax.scatter(contacts2_y[::-1],
                       contacts2_x[::-1],
                       c='#D70909',
                       edgecolor='#D70909',
                       **plain_style)
            ax.scatter(contacts_x[::-1],
                       contacts_y[::-1],
                       c='#004F9D',
                       edgecolor='#004F9D',
                       **plain_style)

    # plot predicted contacts from first contact map on both triangles
    # if no second contact map given
    else:
        if pdb_filename:
            fig.suptitle('%s\nPPV = %.2f' % (acc, PPVs[-1]))
            ax.scatter(contacts_x[::-1],
                       contacts_y[::-1],
                       c=tp_colors[::-1],
                       **tp_style)
            ax.scatter(contacts_y[::-1],
                       contacts_x[::-1],
                       c=tp_colors[::-1],
                       **tp_style)
        else:
            ax.scatter(contacts_x[::-1],
                       contacts_y[::-1],
                       c=scores[::-1],
                       **score_style)
            sc = ax.scatter(contacts_y[::-1],
                            contacts_x[::-1],
                            c=scores[::-1],
                            **score_style)
            plt.colorbar(sc)

    plt.gca().set_xlim([0, ref_len])
    plt.gca().set_ylim([0, ref_len])

    if outfilename.endswith(('.png', '.jpg', '.jpeg')):
        plt.savefig(outfilename)
    else:
        if not outfilename:
            pdf_name = '%s_ContactMap.pdf' % c_filename
        elif outfilename.endswith('.pdf'):
            pdf_name = outfilename
        else:
            pdf_name = '%s.pdf' % outfilename
        pp = PdfPages(pdf_name)
        pp.savefig(fig)
        pp.close()
    plt.show()
Example #5
0
sys.path.append("/home/x_arnel/git/bioinfo-toolbox/")
from parsing import parse_fasta
from parsing import parse_contacts

# Positional command line arguments (no argument-count check is done here):
#   1 sequence file, 2 contact file, 3 target id, 4 server name,
#   5 output path, 6 minimum separation, 7 minimum score.
sfile = sys.argv[1]
cfile = sys.argv[2]
target = sys.argv[3]
server = sys.argv[4]
ofilepath = sys.argv[5]
# NOTE(review): minsep and minscore stay strings and are not used in the
# visible part of the script; convert before any numeric comparison.
minsep = sys.argv[6]
minscore = sys.argv[7]

# First record of the FASTA file: items()[0] is (header, value) and the
# sequence is taken as value[0]. Indexing items() requires Python 2.
seq = parse_fasta.read_fasta(open(sfile)).items()[0][1][0]

# min_dist=0 is passed to the parser (the other callers use 5 or a
# caller-supplied value).
contacts = parse_contacts.parse(open(cfile), min_dist=0)

# Diagnostic output on stdout; contacts[0] raises IndexError when no contact
# was parsed.
print len(contacts)
print contacts[0]
print seq

# NOTE(review): the handle is not closed in the visible part of the script.
ofile = open(ofilepath, 'w')

# Write the RR header block; AUTHOR and METHOD lines depend on the server.
# Any other server name writes no header in the visible part.
if server == "Pcons-net":
    ofile.write(
        "PFRMAT RR\nTARGET %s\nAUTHOR 5450-4562-0389\nMETHOD Pcons-net\nREMARK PconsC3\nMETHOD Improved contact predictions on\nMETHOD small protein families.\nMODEL  1\n"
        % target)
elif server == "PconsC2":
    ofile.write(
        "PFRMAT RR\nTARGET %s\nAUTHOR 4146-6019-9011\nMETHOD PconsC2\nREMARK PconsC2\nMETHOD Improved contact predictions using the\nMETHOD recognition of protein like contact\nMETHOD patterns.\nMODEL  1\n"
        % target)
Example #6
0
def _gap_cb_coordinates(atom_seq_ali, seq_ali, cb_lst):
    """Map CB coordinates onto the aligned FASTA sequence.

    Walks both aligned strings in parallel.  A gap in the structure
    sequence emits the placeholder ``['-']`` and index ``-9999``; a gap in
    the FASTA sequence consumes one structure residue without emitting it;
    otherwise the next structure residue is emitted.

    Returns ``(gapped_cb_lst, ali_lst)``.
    """
    gapped_cb_lst = []
    ali_lst = []
    j = 0  # index of the next unused entry of cb_lst
    for atom_res, seq_res in zip(atom_seq_ali, seq_ali):
        if atom_res == '-':
            gapped_cb_lst.append(['-'])
            ali_lst.append(-9999)
        elif seq_res == '-':
            j += 1
        else:
            ali_lst.append(j)
            gapped_cb_lst.append(cb_lst[j])
            j += 1
    return gapped_cb_lst, ali_lst


def get_ppv(fasta_filename, c_filename, pdb_filename, factor=1.0,
        min_score=-1.0, chain='', sep=' ', outfilename='', name='', noalign=False, min_dist=5, print_dist=False):
    """Compare predicted contacts with the CB contacts of a structure.

    Parameters:
        fasta_filename: FASTA file; the first sequence sets the length.
        c_filename: contact file for ``parse_contacts.parse``.
        pdb_filename: structure file for the ``parse_pdb`` readers.
        factor: with the default ``min_score``, selection stops once
            ``len(seq) * factor`` contacts were accepted.
        min_score: if not -1.0, selection stops at the first contact
            scoring below it.
        chain: chain identifier handed to the PDB parsers.
        sep: field separator of the contact file.
        outfilename: forwarded to ``print_distances`` as ``outfile``.
        name: label for the printed result line; when empty the FASTA and
            contact file names are printed instead.
        noalign: skip the sequence/structure alignment and use the CB
            coordinates as parsed.
        min_dist: minimum sequence separation of accepted contacts.
        print_dist: call ``print_distances`` (aligned mode only).

    Returns:
        ``(pdb_filename, PPV, TP, FP)`` with the last three values taken
        from ``get_ppv_helper``.
    """
    ### get sequence
    with open(fasta_filename, 'r') as fasta_file:
        seq = list(parse_fasta.read_fasta(fasta_file).values())[0][0]
    ref_len = len(seq)

    ### get top ranked predicted contacts
    with open(c_filename, 'r') as c_file:
        contacts = parse_contacts.parse(c_file, sep, min_dist=min_dist)

    contacts_x = []
    contacts_y = []
    scores = []

    for contact in contacts:
        score = contact[0]
        # contact files are 1-based, matrix indices are 0-based
        c_x = contact[1] - 1
        c_y = contact[2] - 1

        if abs(c_x - c_y) >= min_dist:
            contacts_x.append(c_x)
            contacts_y.append(c_y)
            scores.append(score)

        if min_score == -1.0 and len(contacts_x) >= ref_len * factor:
            break
        if score < min_score:
            break

    with open(pdb_filename, 'r') as pdb_file:
        cb_lst = parse_pdb.get_cb_coordinates(pdb_file, chain)

    cb_cutoff = 8

    if noalign:
        ref_contact_map = get_cb_contacts(cb_lst) < cb_cutoff
        PPV, TP, FP = get_ppv_helper(contacts_x, contacts_y, ref_contact_map, ref_len, factor)
    else:
        with open(pdb_filename, 'r') as pdb_file:
            atom_seq = parse_pdb.get_atom_seq(pdb_file, chain)

        align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -0.5, -0.1)
        atom_seq_ali = align[-1][0]
        seq_ali = align[-1][1]

        gapped_cb_lst, ali_lst = _gap_cb_coordinates(atom_seq_ali, seq_ali,
                                                     cb_lst)
        dist_mat = get_cb_contacts(gapped_cb_lst)

        if print_dist:
            # Area and surface distance are only needed for this report, so
            # the structure file is parsed for them only on request.  The
            # handles stay open across the call in case the parsers are lazy.
            with open(pdb_filename, 'r') as area_file, \
                    open(pdb_filename, 'r') as surf_file:
                area = parse_pdb.get_area(area_file, chain)
                surf = parse_pdb.get_dist_to_surface(surf_file, chain)
                print_distances(contacts_x, contacts_y, scores, dist_mat,
                                area, surf, ref_len, ref_len,
                                seq, ali_lst=ali_lst, atom_seq=atom_seq,
                                outfile=outfilename)

        ref_contact_map = dist_mat < cb_cutoff
        PPV, TP, FP = get_ppv_helper(contacts_x, contacts_y, ref_contact_map, ref_len, factor, atom_seq_ali=atom_seq_ali)

    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    if name:
        print('%s %s %s %s' % (name, PPV, TP, FP))
    else:
        print('%s %s %s %s %s' % (fasta_filename, c_filename, PPV, TP, FP))
    return (pdb_filename, PPV, TP, FP)
Example #7
0
def _parse_path(parser, path, *args, **kwargs):
    """Open *path*, return ``parser(handle, *args, **kwargs)``, close it."""
    with open(path, 'r') as handle:
        return parser(handle, *args, **kwargs)


def _gap_cb_list(atom_seq_ali, seq_ali, cb_lst):
    """Map CB coordinates onto an aligned reference sequence.

    A gap in the structure sequence emits the placeholder ``['-']`` and
    index ``-9999``; a gap in the reference sequence consumes one structure
    residue without emitting it; otherwise the next structure residue is
    emitted.  Returns ``(gapped_cb_lst, ali_lst)``.
    """
    gapped_cb_lst = []
    ali_lst = []
    j = 0  # index of the next unused entry of cb_lst
    for atom_res, seq_res in zip(atom_seq_ali, seq_ali):
        if atom_res == '-':
            gapped_cb_lst.append(['-'])
            ali_lst.append(-9999)
        elif seq_res == '-':
            j += 1
        else:
            ali_lst.append(j)
            gapped_cb_lst.append(cb_lst[j])
            j += 1
    return gapped_cb_lst, ali_lst


def get_ppv(fasta_filenameA,
            c_filename,
            pdb_filenameA,
            fasta_filenameB,
            pdb_filenameB,
            factor=1.0,
            min_score=-1.0,
            chainA='',
            chainB='',
            sep=' ',
            outfilename='',
            name='',
            noalign=False,
            min_dist=5,
            interfacelen=10,
            print_dist=False,
            cutoff=0.25):
    """Evaluate predicted contacts against a two-chain structure.

    Prints PPV/TP/FP and Z-score lines for chain A, chain B, both chains
    and the interface, as computed by the sibling helpers.

    Parameters:
        fasta_filenameA, fasta_filenameB: FASTA files of the two chains.
        c_filename: contact file for ``parse_contacts.parse``.
        pdb_filenameA, pdb_filenameB: structure files of the two chains.
        factor: forwarded to the PPV and interface helpers.
        min_score: accepted for interface compatibility; the score cut-off
            was already disabled in the original code.
        chainA, chainB: chain identifiers for the PDB parsers.
        sep: field separator of the contact file.
        outfilename: forwarded to ``print_distances`` as ``outfile``.
        name: label used for the per-chain result lines; when empty the
            FASTA file names plus the contact file name are printed.
        noalign: use the CB coordinates as parsed, without alignment.
        min_dist: minimum sequence separation of accepted contacts.
        interfacelen: passed as the length argument of the per-chain and
            interface PPV helpers.
        print_dist: call ``print_distances``.
        cutoff: forwarded to ``get_ppv_helper_interface``.

    Returns:
        ``(pdb_filenameA, PPV, TP, FP)`` for the combined contact list.
    """
    ### get sequence
    seqA = list(
        _parse_path(parse_fasta.read_fasta, fasta_filenameA).values())[0][0]
    seqB = list(
        _parse_path(parse_fasta.read_fasta, fasta_filenameB).values())[0][0]
    seq = seqA + seqA  # Actually the contact map sequence is just two copies of seqA

    ref_lenA = len(seqA)
    ref_lenB = len(seqB)
    ref_len = len(seq)

    ### get top ranked predicted contacts
    contacts = _parse_path(parse_contacts.parse, c_filename, sep,
                           min_dist=min_dist)

    contacts_x = []
    contacts_y = []
    scores = []
    for contact in contacts:
        # contact files are 1-based, matrix indices are 0-based
        c_x = contact[1] - 1
        c_y = contact[2] - 1
        if abs(c_x - c_y) >= min_dist:
            contacts_x.append(c_x)
            contacts_y.append(c_y)
            scores.append(contact[0])

    # Chain A and chain B are scored with the same predicted contacts.
    # Independent copies are kept because contacts_x/contacts_y/scores are
    # handed to get_interface_contacts and rebound to its result below.
    contactsA_x, contactsA_y, scoresA = (list(contacts_x), list(contacts_y),
                                         list(scores))
    contactsB_x, contactsB_y, scoresB = (list(contacts_x), list(contacts_y),
                                         list(scores))

    cb_lstA = _parse_path(parse_pdb.get_cb_coordinates, pdb_filenameA, chainA)
    cb_lstB = _parse_path(parse_pdb.get_cb_coordinates, pdb_filenameB, chainB)
    cb_lst = cb_lstA + cb_lstB
    bfactor = (_parse_path(parse_pdb.get_area, pdb_filenameA, chainA) +
               _parse_path(parse_pdb.get_area, pdb_filenameB, chainB))
    surf = (_parse_path(parse_pdb.get_dist_to_surface, pdb_filenameA, chainA) +
            _parse_path(parse_pdb.get_dist_to_surface, pdb_filenameB, chainB))

    if noalign:
        dist_mat = get_cb_contacts(cb_lst)
        dist_matA = get_cb_contacts(cb_lstA)
        dist_matB = get_cb_contacts(cb_lstB)
        # Without an alignment there is nothing to pass as atom_seq_ali /
        # ali_lst / atom_seq.  The original referenced these names anyway
        # and always raised NameError in this mode; the keywords are now
        # omitted so the helpers fall back to their defaults.
        # NOTE(review): confirm each helper declares defaults for them.
        ali_kw = {}
        ali_kwA = {}
        ali_kwB = {}
        dist_kw = {}
    else:
        atom_seqA = _parse_path(parse_pdb.get_atom_seq, pdb_filenameA, chainA)
        atom_seqB = _parse_path(parse_pdb.get_atom_seq, pdb_filenameB, chainB)
        atom_seq = atom_seqA + atom_seqB
        align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -0.5, -0.1)
        alignA = pairwise2.align.globalms(atom_seqA, seqA, 2, -1, -0.5, -0.1)
        alignB = pairwise2.align.globalms(atom_seqB, seqA, 2, -1, -0.5,
                                          -0.1)  # Align to seq A
        atom_seq_ali = align[-1][0]
        atom_seq_aliA = alignA[-1][0]
        atom_seq_aliB = alignB[-1][0]

        gapped_cb_lst, ali_lst = _gap_cb_list(atom_seq_ali, align[-1][1],
                                              cb_lst)
        gapped_cb_lstA, _ = _gap_cb_list(atom_seq_aliA, alignA[-1][1],
                                         cb_lstA)
        gapped_cb_lstB, _ = _gap_cb_list(atom_seq_aliB, alignB[-1][1],
                                         cb_lstB)

        dist_mat = get_cb_contacts(gapped_cb_lst)
        dist_matA = get_cb_contacts(gapped_cb_lstA)
        dist_matB = get_cb_contacts(gapped_cb_lstB)

        ali_kw = {'atom_seq_ali': atom_seq_ali}
        ali_kwA = {'atom_seq_ali': atom_seq_aliA}
        ali_kwB = {'atom_seq_ali': atom_seq_aliB}
        dist_kw = {'ali_lst': ali_lst, 'atom_seq': atom_seq}

    cb_cutoff = 8
    # This routine adds all interface and B chain contacts
    # (it is called with a cutoff 4 units wider than the contact cutoff)
    contacts_x, contacts_y, scores = get_interface_contacts(
        contacts_x,
        contacts_y,
        scores,
        dist_mat,
        ref_lenA,
        factor,
        cb_cutoff + 4,
        **ali_kw)
    ref_contact_map = dist_mat < cb_cutoff
    ref_contact_mapA = dist_matA < cb_cutoff
    ref_contact_mapB = dist_matB < cb_cutoff

    if print_dist:
        print_distances(contacts_x,
                        contacts_y,
                        scores,
                        dist_mat,
                        bfactor,
                        surf,
                        ref_lenA,
                        ref_lenB,
                        seq,
                        outfile=outfilename,
                        **dist_kw)

    Zscore = get_Zscore(contacts_x, contacts_y, ref_contact_map, scores,
                        **ali_kw)
    ZscoreA = get_Zscore(contactsA_x, contactsA_y, ref_contact_mapA, scoresA,
                         **ali_kwA)
    ZscoreB = get_Zscore(contactsB_x, contactsB_y, ref_contact_mapB, scoresB,
                         **ali_kwB)
    ZscoreI = get_Zscore_interface(contacts_x, contacts_y, ref_contact_map,
                                   ref_lenA, ref_lenB, scores, **ali_kw)

    PPV, TP, FP = get_ppv_helper(contacts_x, contacts_y, ref_contact_map,
                                 ref_len, factor, **ali_kw)
    PPVa, TPa, FPa = get_ppv_helper(contactsA_x, contactsA_y,
                                    ref_contact_mapA, interfacelen, factor,
                                    **ali_kwA)
    PPVb, TPb, FPb = get_ppv_helper(contactsB_x, contactsB_y,
                                    ref_contact_mapB, interfacelen, factor,
                                    **ali_kwB)
    PPVi, TPi, FPi, PPViE, TPiE, FPiE = get_ppv_helper_interface(
        contacts_x,
        contacts_y,
        ref_contact_map,
        bfactor,
        ref_lenA,
        ref_lenB,
        interfacelen,
        cutoff,
        **ali_kw)

    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    if name:
        print('%s %s %s %s %s' % (name, PPVa, TPa, FPa, ZscoreA))
        print('%s %s %s %s %s' % (name, PPVb, TPb, FPb, ZscoreB))
        print('%s %s %s %s %s' % ("BOTH", PPV, TP, FP, Zscore))
        print('%s %s %s %s %s' % ("Interface", PPVi, TPi, FPi, ZscoreI))
        print('%s %s %s %s' % ("Interface-Exposed", PPViE, TPiE, FPiE))
    else:
        print('%s %s %s %s %s %s' % (fasta_filenameA, c_filename, PPVa, TPa,
                                     FPa, ZscoreA))
        print('%s %s %s %s %s %s' % (fasta_filenameB, c_filename, PPVb, TPb,
                                     FPb, ZscoreB))
        print('%s %s %s %s %s %s' % ("BOTH", c_filename, PPV, TP, FP, Zscore))
        print('%s %s %s %s %s %s' % ("Interface", c_filename, PPVi, TPi, FPi,
                                     ZscoreI))
        print('%s %s %s %s %s' % ("Interface-Exposed", c_filename, PPViE, TPiE,
                                  FPiE))
    print('PPV %s %s %s %s %s %s' % (c_filename, PPV, PPVa, PPVb, PPVi, PPViE))
    print('Zscore %s %s %s %s %s' % (c_filename, Zscore, ZscoreA, ZscoreB,
                                     ZscoreI))
    return (pdb_filenameA, PPV, TP, FP)
Example #8
0
import sys
sys.path.append("/home/mircomic/toolbox")
from parsing import parse_contacts

# command line input: path of the constraint (contact) file
infile_name = sys.argv[1]

# guess the separator of the constraint file from its first line:
# comma first, then a single blank, otherwise tab
with open(infile_name, 'r') as infile:
    test_line = infile.readline()
if len(test_line.split(',')) != 1:
    sep = ','
elif len(test_line.split(' ')) != 1:
    sep = ' '
else:
    sep = '\t'

# parse the constraint file with the guessed separator (it was computed but
# never handed to the parser before) and print one "res_a res_b score" line
# per contact, in the order the parser returns them
with open(infile_name, 'r') as infile:
    c_list = parse_contacts.parse(infile, sep)

    # printing stays inside the with-block in case the parser is lazy
    for c in c_list:
        print ('%s %s %s' % (c[1], c[2], c[0]))
Example #9
0
def _read_with(parser, path, *args, **kwargs):
    """Open *path*, return ``parser(handle, *args, **kwargs)``, close it."""
    with open(path, 'r') as handle:
        return parser(handle, *args, **kwargs)


def _dist_mat_for(dist_type, res_lst, cb_lst, ca_lst):
    """Build the distance matrix selected by *dist_type*.

    'CB' and 'CA' use ``get_dist_mat`` on the respective coordinate list;
    any other value uses ``get_dist_mat_heavy`` on the residue list.
    """
    if dist_type == 'CB':
        return get_dist_mat(cb_lst)
    if dist_type == 'CA':
        return get_dist_mat(ca_lst)
    return get_dist_mat_heavy(res_lst)


def get_dist(fasta_filename,
             c_filename,
             pdb_filename,
             chain='',
             sep='',
             outfilename='',
             noalign=False,
             dist_type='CB'):
    """Look up the structural distance of every predicted contact.

    Parameters:
        fasta_filename: FASTA file; its first sequence is aligned against
            the structure sequence unless *noalign* is set.
        c_filename: contact file, parsed with ``min_dist=5``.
        pdb_filename: structure file for the ``parse_pdb`` readers.
        chain: chain identifier handed to the PDB parsers.
        sep: field separator passed to the contact parser.
        outfilename: if given, one ``x y score distance`` line per contact
            is written there.
        noalign: use the parsed coordinates without alignment.
        dist_type: 'CB', 'CA', or anything else for heavy-atom distances.

    Returns:
        ``(contacts_x, contacts_y, scores, contacts_dist)``; the indices
        are 0-based and the distances come from ``get_dist_helper``.
    """
    ### get sequence
    seq = list(
        _read_with(parse_fasta.read_fasta, fasta_filename).values())[0][0]

    ### get all predicted contacts returned by the parser
    contacts = _read_with(parse_contacts.parse, c_filename, sep, min_dist=5)

    # contact files are 1-based, matrix indices are 0-based
    contacts_x = [contact[1] - 1 for contact in contacts]
    contacts_y = [contact[2] - 1 for contact in contacts]
    scores = [contact[0] for contact in contacts]

    res_lst = _read_with(parse_pdb.get_coordinates, pdb_filename, chain)
    cb_lst = _read_with(parse_pdb.get_cb_coordinates, pdb_filename, chain)
    ca_lst = _read_with(parse_pdb.get_ca_coordinates, pdb_filename, chain)

    if noalign:
        dist_mat = _dist_mat_for(dist_type, res_lst, cb_lst, ca_lst)
        contacts_dist = get_dist_helper(contacts_x, contacts_y, dist_mat)
    else:
        atom_seq = _read_with(parse_pdb.get_atom_seq, pdb_filename, chain)

        align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -0.5, -0.1)
        atom_seq_ali = align[-1][0]
        seq_ali = align[-1][1]

        # Re-index the structure coordinates onto the FASTA sequence: a gap
        # in the structure yields a '-' placeholder, a gap in the FASTA
        # sequence consumes one structure residue without emitting it.
        j = 0
        gapped_res_lst = []
        gapped_cb_lst = []
        gapped_ca_lst = []
        for atom_res, seq_res in zip(atom_seq_ali, seq_ali):
            if atom_res == '-':
                gapped_res_lst.append('-')
                gapped_cb_lst.append('-')
                gapped_ca_lst.append('-')
            elif seq_res == '-':
                j += 1
            else:
                gapped_res_lst.append(res_lst[j])
                gapped_cb_lst.append(cb_lst[j])
                gapped_ca_lst.append(ca_lst[j])
                j += 1

        dist_mat = _dist_mat_for(dist_type, gapped_res_lst, gapped_cb_lst,
                                 gapped_ca_lst)
        contacts_dist = get_dist_helper(contacts_x,
                                        contacts_y,
                                        dist_mat,
                                        atom_seq_ali=atom_seq_ali)

    # Sanity check on the helper: one distance per predicted contact.
    assert (len(contacts_dist) == len(contacts_x) == len(contacts_y) ==
            len(scores))

    if outfilename:
        with open(outfilename, 'w') as outfile:
            for c_x, c_y, score, dist in zip(contacts_x, contacts_y, scores,
                                             contacts_dist):
                outfile.write('%s %s %f %f\n' % (c_x, c_y, score, dist))

    return (contacts_x, contacts_y, scores, contacts_dist)
Example #10
0
from parsing import parse_fasta
from parsing import parse_contacts

# Command-line script: read a sequence and a predicted-contact file and start
# writing a CASP "RR" format file (header followed by the sequence).
#
# Expects exactly four arguments after the script name:
#   <sequence file> <contact file> <CASP target ID> <output filename>
if len(sys.argv) != 5:
    sys.stderr.write("Incorrect number of command line arguments.\n")
    sys.stderr.write("Usage: " + sys.argv[0] + " <sequence file> <contact file> <CASP target ID> <output filename>\n\n")
    # Exit non-zero so that calling shells / pipelines can detect the
    # usage error (the message already goes to stderr).
    sys.exit(1)


sfile = sys.argv[1]
cfile = sys.argv[2]
target = sys.argv[3]

# Close the input files as soon as they are parsed instead of leaking the
# handles.  list(...) keeps the indexing valid whether .items() returns a
# list or a view.
with open(sfile) as seq_handle:
    seq = list(parse_fasta.read_fasta(seq_handle).items())[0][1][0]

with open(cfile) as contact_handle:
    contacts = parse_contacts.parse(contact_handle, min_dist=0)

# Diagnostic output: number of contacts, the first contact, the sequence.
print(len(contacts))
print(contacts[0])
print(seq)

# NOTE(review): tmp_i is not read anywhere in the visible part of the
# script; kept in case a truncated continuation relies on it.
tmp_i = 1

# The context manager guarantees the output is flushed and closed even if a
# write fails.
with open(sys.argv[4], "w") as ofile:
    ofile.write(
        "PFRMAT RR\nTARGET %s\nAUTHOR 6685-2065-9124\nMETHOD Pcons-net\nREMARK PconsC2\nMETHOD Improved contact predictions using the\nMETHOD recognition of protein like contact\nMETHOD patterns.\nMODEL  1\n"
        % target
    )

    # The sequence is written one residue at a time, without separators.
    for aa in seq:
        ofile.write(aa)
Example #11
0
def _parse_path(parser, path, *args):
    """Open *path* for reading, pass the file object to *parser*, close it.

    Any extra positional *args* are forwarded to *parser* after the file
    object.  Assumes the parser has finished reading by the time it
    returns -- TODO confirm for the project parsers; every call site in
    plot_map indexes or measures the result immediately.
    """
    with open(path, 'r') as handle:
        return parser(handle, *args)


def _top_ranked_contacts(contacts, limit, min_sep=5):
    """Walk *contacts* in order and keep entries until *limit* are accepted.

    Each entry is read as (score, pos_x, pos_y); both positions are
    decremented by one (presumably 1-based in the input -- confirm against
    parse_contacts).  Pairs whose positions differ by less than *min_sep*
    are skipped.  The limit is checked after every entry, accepted or not,
    so the walk stops as soon as the number of accepted pairs reaches it.

    Returns a tuple of three parallel lists: (xs, ys, scores).
    """
    xs, ys, kept_scores = [], [], []
    for contact in contacts:
        score = contact[0]
        c_x = contact[1] - 1
        c_y = contact[2] - 1

        if abs(c_x - c_y) >= min_sep:
            xs.append(c_x)
            ys.append(c_y)
            kept_scores.append(score)

        if len(xs) >= limit:
            break
    return xs, ys, kept_scores


def plot_map(fasta_filename, c_filename, factor, c2_filename='', psipred_horiz_fname='', psipred_vert_fname='', pdb_filename='', is_heavy=False, chain='', sep=',', outfilename=''):
    """Plot predicted contacts as a contact map and save the figure.

    Parameters:
        fasta_filename: FASTA file of the target sequence.  The sequence
            length sets the axis limits and, multiplied by *factor*, the
            number of contacts kept.  The first four characters before
            the first '.' of this name are used as the title label.
        c_filename: predicted-contact file, parsed by parse_contacts.parse.
        factor: multiplier on the sequence length giving the number of
            top-ranked contacts to plot.
        c2_filename: optional second contact file.  When given, the first
            map is drawn at (x, y) and the second at (y, x); otherwise the
            first map is drawn on both sides of the diagonal.
        psipred_horiz_fname, psipred_vert_fname: optional PSIPRED output;
            'H' and 'E' states are marked on the diagonal.  The horizontal
            file takes precedence when both are given.
        pdb_filename: optional reference structure.  When given, the
            reference contacts are drawn in grey, predicted contacts are
            coloured via get_tp_colors, and a line with the PPV/TP/FP
            values is printed.
        is_heavy: use get_heavy_contacts with cutoff 5 instead of
            get_cb_contacts with cutoff 8 for the reference map
            (presumably Angstrom -- confirm).
        chain: chain identifier forwarded to the parse_pdb functions.
        sep: column separator forwarded to parse_contacts.parse.
        outfilename: output path.  '.pdf' is written as PDF, '.png',
            '.jpg' and '.jpeg' via savefig, any other non-empty name gets
            '.pdf' appended, and an empty name falls back to
            '<c_filename>_ContactMap.pdf'.

    Returns None.
    """
    acc = fasta_filename.split('.')[0][:4]

    ### get sequence
    records = _parse_path(parse_fasta.read_fasta, fasta_filename)
    seq = list(records.values())[0][0]
    ref_len = len(seq)
    max_contacts = ref_len * factor

    ### get top "factor" * "ref_len" predicted contacts
    contacts = _parse_path(parse_contacts.parse, c_filename, sep)
    contacts_x, contacts_y, scores = _top_ranked_contacts(contacts,
                                                          max_contacts)

    ### start plotting
    fig = plt.figure()
    ax = fig.add_subplot(111)

    ### plot secondary structure on the diagonal if given
    if psipred_horiz_fname or psipred_vert_fname:
        if psipred_horiz_fname:
            ss = _parse_path(parse_psipred.horizontal, psipred_horiz_fname)
        else:
            ss = _parse_path(parse_psipred.vertical, psipred_vert_fname)

        assert len(ss) == ref_len

        # One plot call per state instead of one per residue; any other
        # state (e.g. 'C') is left unmarked.
        for state, marker, colour in (('H', 'o', '#8B0043'),
                                      ('E', 'D', '#0080AD')):
            positions = [i for i, s in enumerate(ss) if s == state]
            if positions:
                ax.plot(positions, positions, marker, c=colour, mec=colour,
                        markersize=2)

    ### plot reference contacts in the background if given
    if pdb_filename:
        res_lst = _parse_path(parse_pdb.get_coordinates, pdb_filename, chain)
        cb_lst = _parse_path(parse_pdb.get_cb_coordinates, pdb_filename,
                             chain)
        atom_seq = _parse_path(parse_pdb.get_atom_seq, pdb_filename, chain)

        align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -0.5, -0.1)

        atom_seq_ali = align[-1][0]
        seq_ali = align[-1][1]

        # Re-index the structure onto the target sequence: a gap in the
        # structure becomes a '-' placeholder, a residue present only in
        # the structure is dropped.
        j = 0
        gapped_res_lst = []
        gapped_cb_lst = []
        for atom_aa, seq_aa in zip(atom_seq_ali, seq_ali):
            if atom_aa == '-':
                gapped_res_lst.append('-')
                gapped_cb_lst.append('-')
                continue
            if seq_aa != '-':
                gapped_res_lst.append(res_lst[j])
                gapped_cb_lst.append(cb_lst[j])
            j += 1

        if is_heavy:
            dist_mat = get_heavy_contacts(gapped_res_lst)
            cutoff = 5
        else:
            dist_mat = get_cb_contacts(gapped_cb_lst)
            cutoff = 8
        ref_contact_map = dist_mat < cutoff
        ref_contacts = np.where(ref_contact_map)

        ref_contacts_x = ref_contacts[0]
        ref_contacts_y = ref_contacts[1]

        PPVs, TPs, FPs = get_ppvs(contacts_x, contacts_y, ref_contact_map, atom_seq_ali, ref_len, factor)
        tp_colors = get_tp_colors(contacts_x, contacts_y, ref_contact_map, atom_seq_ali)

        print('%s %s %s %s' % (pdb_filename, PPVs[-1], TPs[-1], FPs[-1]))

        ax.scatter(ref_contacts_x, ref_contacts_y, marker='o', c='#CCCCCC', lw=0, edgecolor='#CCCCCC')

    ### plot predicted contacts from second contact map if given
    if c2_filename:
        contacts2 = _parse_path(parse_contacts.parse, c2_filename, sep)
        contacts2_x, contacts2_y, _ = _top_ranked_contacts(contacts2,
                                                           max_contacts)

        ### use TP/FP color coding if reference contacts given
        if pdb_filename:
            PPVs2, TPs2, FPs2 = get_ppvs(contacts2_x, contacts2_y, ref_contact_map, atom_seq_ali, ref_len, factor)
            tp2_colors = get_tp_colors(contacts2_x, contacts2_y, ref_contact_map, atom_seq_ali)
            print('%s %s %s %s' % (pdb_filename, PPVs2[-1], TPs2[-1], FPs2[-1]))
            fig.suptitle('%s\nPPV (upper left) = %.2f | PPV (lower right) = %.2f' % (acc, PPVs[-1], PPVs2[-1]))
            sc = ax.scatter(contacts2_y[::-1], contacts2_x[::-1], marker='o', c=tp2_colors[::-1], s=6, alpha=0.75, linewidths=0.0)
            sc = ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o', c=tp_colors[::-1], s=6, alpha=0.75, linewidths=0.0)
        else:
            sc = ax.scatter(contacts2_y[::-1], contacts2_x[::-1], marker='o', c='#D70909', edgecolor='#D70909', s=4, linewidths=0.5)
            sc = ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o', c='#004F9D', edgecolor='#004F9D', s=4, linewidths=0.5)

    ### plot predicted contacts from first contact map on both triangles
    ### if no second contact map given
    else:
        if pdb_filename:
            fig.suptitle('%s\nPPV = %.2f' % (acc, PPVs[-1]))
            sc = ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o', c=tp_colors[::-1], s=6, alpha=0.75, linewidths=0.0)
            sc = ax.scatter(contacts_y[::-1], contacts_x[::-1], marker='o', c=tp_colors[::-1], s=6, alpha=0.75, linewidths=0.0)
        else:
            sc = ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o', c=scores[::-1], s=4, alpha=0.75, cmap=cm.jet, linewidths=0.1)
            sc = ax.scatter(contacts_y[::-1], contacts_x[::-1], marker='o', c=scores[::-1], s=4, alpha=0.75, cmap=cm.jet, linewidths=0.1)
            plt.colorbar(sc)

    ax.set_xlim([0, ref_len])
    ax.set_ylim([0, ref_len])

    ### save the figure
    if outfilename.endswith(('.png', '.jpg', '.jpeg')):
        # Raster formats go straight through savefig on this figure.
        fig.savefig(outfilename)
        return

    if not outfilename:
        pdf_path = '%s_ContactMap.pdf' % c_filename
    elif outfilename.endswith('.pdf'):
        pdf_path = outfilename
    else:
        pdf_path = '%s.pdf' % outfilename

    pp = PdfPages(pdf_path)
    try:
        pp.savefig(fig)
    finally:
        # Always finalise the PDF so a failed savefig does not leave the
        # file handle open.
        pp.close()
def plot_map(fasta_filename, c_filename, factor, c2_filename='', psipred_filename='', pdb_filename='', is_heavy=False, chain='', sep='', sep2='', outfilename=''):  
   
    acc = c_filename.split('.')[0]

    ### get sequence
    #seq = parse_fasta.read_fasta(open(fasta_filename, 'r')).values()[0][0]
    #ref_len = len(seq)

    ### get id
    f = open(fasta_filename,"rU")
    seqs = SeqIO.parse(f,"fasta")
    
    # we assume there is only one record
    for record in seqs: 
        seq = str(record.seq)
        protein_id = record.id 
        ref_len = len(seq)

    # guessing separator of constraint file
    if sep == '':
        line = open(c_filename,'r').readline()
        if len(line.split(',')) != 1:
            sep = ','
        elif len(line.split(' ')) != 1:
            sep = ' '
        else:
            sep = '\t'

    ### get top "factor" * "ref_len" predicted contacts
    contacts = parse_contacts.parse(open(c_filename, 'r'), sep)

    contacts_x = []
    contacts_y = []
    scores = []
    contact_dict = {}

    count = 0
    for i in range(len(contacts)):
        score = contacts[i][0]
        c_x = contacts[i][1] - 1
        c_y = contacts[i][2] - 1

        pos_diff = abs(c_x - c_y)
        too_close = pos_diff < 5

        if not too_close:
            contacts_x.append(c_x)
            contacts_y.append(c_y)
            scores.append(score)
            count += 1
           
        if count >= ref_len * factor:
            break
 

    ### start plotting
    fig = plt.figure()
    plt.title('Contact map for ' + protein_id)
    ax = fig.add_subplot(111)

    ### plot secondary structure on the diagonal if given
    if psipred_filename:
        ss = parse_psipred.horizontal(open(psipred_filename, 'r'))
        for i in range(len(ss)):
            if ss[i] == 'H':
                plt.plot(i, i, 'o', c='#8B0043', mec="#8B0043", markersize=2)
            if ss[i] == 'E':
                plt.plot(i, i, 'D', c='#0080AD', mec="#0080AD", markersize=2)
            if ss[i] == 'C':
                continue

    ### plot reference contacts in the background if given
    if pdb_filename:
        res_lst = parse_pdb.get_coordinates(open(pdb_filename, 'r'), chain)
        cb_lst = parse_pdb.get_cb_coordinates(open(pdb_filename, 'r'), chain)
        atom_seq = parse_pdb.get_atom_seq(open(pdb_filename, 'r'), chain)

        align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -0.5, -0.1)
        
        if (len(res_lst)==0) or (len(cb_lst)==0):
            print "Could not parse the PDB file, res_list or cb_list is empty"
            return

        try:
            atom_seq_ali = align[-1][0]
            seq_ali = align[-1][1] 
        except Exception,ex:
            print "Could not parse the PDB file:", ex
            return
        
        j = 0
        gapped_res_lst = []
        gapped_cb_lst = []

        for i in xrange(len(atom_seq_ali)):
            if atom_seq_ali[i] == '-':
                gapped_res_lst.append('-')
                gapped_cb_lst.append('-')
            elif seq_ali[i] == '-':
                j += 1
                continue
            else:
                gapped_res_lst.append(res_lst[j])
                gapped_cb_lst.append(cb_lst[j])
                j += 1

        if is_heavy:
            dist_mat = get_heavy_contacts(gapped_res_lst)
            heavy_cutoff = 5
            ref_contact_map = dist_mat < heavy_cutoff
            ref_contacts = np.where(dist_mat < heavy_cutoff)
        else:
            dist_mat = get_cb_contacts(gapped_cb_lst)
            cb_cutoff = 8
            ref_contact_map = dist_mat < cb_cutoff
            ref_contacts = np.where(dist_mat < cb_cutoff)
        
        ref_contacts_x = ref_contacts[0]
        ref_contacts_y = ref_contacts[1]
       
        PPVs = get_ppvs(contacts_x, contacts_y, ref_contact_map, atom_seq_ali, ref_len, factor)
        tp_colors = get_tp_colors(contacts_x, contacts_y, ref_contact_map, atom_seq_ali)
   
        print '%s\t%s' % (acc, PPVs[-1])
      
        ax.scatter(ref_contacts_x, ref_contacts_y, marker='o', c='#CCCCCC', lw=0, edgecolor='#CCCCCC')

    ### plot predicted contacts from second contact map if given
    if c2_filename:

        # guessing separator of constraint file
        if sep2 == '':
            line = open(c_filename,'r').readline()
            if len(line.split(',')) != 1:
                sep2 = ','
            elif len(line.split(' ')) != 1:
                sep2 = ' '
            else:
                sep2 = '\t'

        contacts2 = parse_contacts.parse(open(c2_filename, 'r'), sep2)
        contacts2_x = []
        contacts2_y = []
        scores2 = []
        contact_dict2 = {}

        count = 0

        for i in range(len(contacts2)):
            score = contacts2[i][0]
            c_x = contacts2[i][1] - 1
            c_y = contacts2[i][2] - 1

            pos_diff = abs(c_x - c_y)
            too_close = pos_diff < 5