Beispiel #1
0
def realign(fasta_filename, pdb_filename, outfilename='', chain='*'):
    """Renumber the ATOM records of a PDB file after a FASTA sequence.

    The ATOM-derived sequence of ``pdb_filename`` is globally aligned to the
    sequence read from ``fasta_filename``.  Each accepted ATOM record is then
    given the position of its residue in the FASTA sequence (counted from 1)
    as its residue number and handed to ``parse_pdb.write_pdb_atm_record``.
    Residues aligned to a gap in the FASTA sequence are mapped to -9999 and
    are not written.

    Args:
        fasta_filename: Path of the FASTA file; the first element of the
            first value returned by ``parse_fasta.read_fasta`` is used.
        pdb_filename: Path of the PDB file to renumber.
        outfilename: Not used by this function; kept so existing callers
            keep working.
        chain: Chain identifier to process.  ``'*'`` accepts every chain; a
            false value selects the chain returned by ``get_first_chain``.
            Records whose chain field is a single blank are always accepted.

    Returns:
        None.

    Raises:
        KeyError: If the PDB file yields more residues than the alignment
            mapped (the residue counter has no entry in the mapping).
    """
    # Read the target sequence and the ATOM sequence; the handles are closed
    # as soon as the parsers return.
    with open(fasta_filename, 'r') as fasta_file:
        seq = list(parse_fasta.read_fasta(fasta_file).values())[0][0]
    with open(pdb_filename, 'r') as pdb_handle:
        atom_seq = parse_pdb.get_atom_seq(pdb_handle, chain)

    # Scores: match 2, mismatch -1, gap open -0.5, gap extend -0.1.
    align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -0.5, -0.1)
    atom_seq_ali = align[-1][0]
    seq_ali = align[-1][1]

    # resno maps the n-th PDB residue (1-based) to its FASTA position
    # (1-based), or to -9999 when the residue has no FASTA counterpart.
    resno = {}
    atompos = 0
    seqpos = 0
    for atom_char, seq_char in zip(atom_seq_ali, seq_ali):
        if atom_char == "-":
            seqpos += 1
        elif seq_char == "-":
            atompos += 1
            resno[atompos] = -9999
        else:
            atompos += 1
            seqpos += 1
            resno[atompos] = seqpos

    with open(pdb_filename, 'r') as pdbfile:
        if not chain:
            # NOTE(review): get_first_chain is referenced unqualified here;
            # confirm it is imported into this module's namespace.
            chain = get_first_chain(pdbfile)
            pdbfile.seek(0)

        res_i = -9999  # residue number of the previous accepted record
        i = 0          # running count of distinct residues seen so far
        for line in pdbfile:
            if not line.startswith('ATOM'):
                continue
            atm_record = parse_pdb.parse_atm_record(line)
            if (atm_record['chain'] != ' ' and atm_record['chain'] != chain
                    and chain != '*'):
                continue
            # A change in the original residue number marks a new residue.
            if atm_record['res_no'] != res_i:
                i += 1
                res_i = atm_record['res_no']
            atm_record['res_no'] = resno[i]
            if resno[i] > 0:
                parse_pdb.write_pdb_atm_record(atm_record)

    return
def get_global_align_from_pdb(pdb_filename, chain, seq):
    """Globally align the ATOM sequence of one PDB chain against ``seq``.

    ``pdb_filename`` and ``chain`` are forwarded unchanged to
    ``parse_pdb.get_atom_seq``; the sequence it returns is the first
    alignment partner and ``seq`` (the FASTA sequence) the second.

    Returns:
        The list returned by ``pairwise2.align.globalms``.  According to the
        Biopython documentation each entry is a 5-element alignment
        ``(aligned_pdb_seq, aligned_fasta_seq, score, begin, end)``.
    """
    # Scoring scheme for the global alignment.
    match_score = 2
    mismatch_score = -1
    gap_open = -0.5
    gap_extend = -0.1

    # NOTE: for a homo-oligomer both chains would have to be aligned
    # separately (one atom sequence per chain); only one chain is handled.
    pdb_chain_seq = parse_pdb.get_atom_seq(pdb_filename, chain)

    return pairwise2.align.globalms(pdb_chain_seq, seq, match_score,
                                    mismatch_score, gap_open, gap_extend)
def get_co_pdb(pdb_filename, chain, cb_cutoff=8):
    """Compute and print the relative contact order of a PDB chain.

    A contact is a residue pair whose entry in the matrix returned by
    ``get_cb_contacts`` is below ``cb_cutoff``; the diagonal is ignored.
    The result is ``sum(j - i) / (N * L)`` over all contacts with ``i < j``,
    where ``N`` is half the number of non-zero entries of the contact map
    and ``L`` is the length of the ATOM sequence.

    Args:
        pdb_filename: Path of the PDB file.
        chain: Chain identifier forwarded to the ``parse_pdb`` readers.
        cb_cutoff: Distance threshold below which a pair counts as contact.

    Returns:
        The contact order as a float (it is also printed).

    Raises:
        ZeroDivisionError: If there are no contacts or the ATOM sequence is
            empty.
    """
    with open(pdb_filename, 'r') as pdb_file:
        cb_lst = parse_pdb.get_cb_coordinates(pdb_file, chain)
    with open(pdb_filename, 'r') as pdb_file:
        atom_seq = parse_pdb.get_atom_seq(pdb_file, chain)

    dist_mat = get_cb_contacts(cb_lst)
    ref_contact_map = dist_mat < cb_cutoff
    np.fill_diagonal(ref_contact_map, 0)

    L = len(atom_seq)
    # Every contact occupies two cells of the map, so halve the total.  The
    # float divisor keeps the count exact under Python 2 as well, where the
    # previous per-column "/ 2" floored each column separately.
    N = np.count_nonzero(ref_contact_map) / 2.0

    # Sum of sequence separations over the upper triangle (i < j).
    rows, cols = np.nonzero(np.triu(ref_contact_map, k=1))
    separation_sum = int((cols - rows).sum())

    co = float(separation_sum) / float(N * L)
    print(co)

    return co
def plot_map(fasta_filename,
             c_filename,
             factor,
             c2_filename='',
             ss_fname='',
             psipred_horiz_fname='',
             psipred_vert_fname='',
             pdb_filename='',
             is_heavy=False,
             chain='',
             sep=',',
             outfilename=''):
    """Plot predicted residue contacts as a contact map, save and show it.

    Args:
        fasta_filename: FASTA file with the target sequence.  Its length
            sets the axis limits and, with ``factor``, the number of
            contacts kept.  The part of the name before the first dot,
            truncated to four characters, is used in the figure title.
        c_filename: Contact file read with ``parse_contacts.parse``.  Each
            entry is used as ``(score, i, j)``; ``i`` and ``j`` are shifted
            down by one (presumably 1-based residue numbers).
        factor: Contacts are collected until ``factor * len(sequence)`` of
            them have been kept.
        c2_filename: Optional second contact file.  When given, its contacts
            are drawn with x and y swapped and the first map is drawn once;
            otherwise the first map is drawn in both orientations.
        ss_fname: Optional secondary-structure file read with
            ``parse_ss.parse`` (used only if no PSIPRED file is given).
        psipred_horiz_fname: Optional PSIPRED horizontal file (takes
            precedence over the other secondary-structure sources).
        psipred_vert_fname: Optional PSIPRED vertical file.
        pdb_filename: Optional reference structure.  When given, reference
            contacts are drawn in grey, PPV/TP/FP are printed and predicted
            contacts are coloured by ``get_tp_colors``.
        is_heavy: Use ``get_heavy_contacts`` with cutoff 5 instead of
            ``get_cb_contacts`` with cutoff 8 for the reference map.
        chain: Chain identifier forwarded to the ``parse_pdb`` readers.
        sep: Separator forwarded to ``parse_contacts.parse``.
        outfilename: Output file.  ``.pdf`` is written through ``PdfPages``;
            ``.png``/``.jpg``/``.jpeg`` through ``plt.savefig``; any other
            name gets ``.pdf`` appended.  When empty the figure is written
            to ``<c_filename>_ContactMap.pdf``.

    Returns:
        None.

    Raises:
        AssertionError: If the secondary-structure length differs from the
            sequence length.
    """
    acc = fasta_filename.split('.')[0][:4]

    # get sequence
    with open(fasta_filename, 'r') as fasta_file:
        seq = list(parse_fasta.read_fasta(fasta_file).values())[0][0]
    ref_len = len(seq)

    # get top "factor" * "ref_len" predicted contacts
    with open(c_filename, 'r') as c_file:
        contacts = parse_contacts.parse(c_file, sep)

    contacts_x = []
    contacts_y = []
    scores = []

    count = 0
    for contact in contacts:
        score = contact[0]
        c_x = contact[1] - 1
        c_y = contact[2] - 1

        # Pairs closer than 5 positions are dropped here as well, although
        # the parser may already have removed them.
        if abs(c_x - c_y) >= 5:
            contacts_x.append(c_x)
            contacts_y.append(c_y)
            scores.append(score)
            count += 1

        if count >= ref_len * factor:
            break

    # start plotting
    fig = plt.figure()
    ax = fig.add_subplot(111)

    # plot secondary structure on the diagonal if given
    if psipred_horiz_fname or psipred_vert_fname or ss_fname:
        if psipred_horiz_fname:
            with open(psipred_horiz_fname, 'r') as ss_file:
                ss = parse_psipred.horizontal(ss_file)
        elif psipred_vert_fname:
            with open(psipred_vert_fname, 'r') as ss_file:
                ss = parse_psipred.vertical(ss_file)
        else:
            with open(ss_fname, 'r') as ss_file:
                ss = parse_ss.parse(ss_file)

        assert len(ss) == ref_len

        # Only 'H' and 'E' are marked; every other state is left blank.
        for i, state in enumerate(ss):
            if state == 'H':
                plt.plot(i, i, 'o', c='#8B0043', mec="#8B0043", markersize=2)
            elif state == 'E':
                plt.plot(i, i, 'D', c='#0080AD', mec="#0080AD", markersize=2)

    # plot reference contacts in the background if given
    if pdb_filename:
        with open(pdb_filename, 'r') as pdb_file:
            res_lst = parse_pdb.get_coordinates(pdb_file, chain)
        with open(pdb_filename, 'r') as pdb_file:
            cb_lst = parse_pdb.get_cb_coordinates(pdb_file, chain)
        with open(pdb_filename, 'r') as pdb_file:
            atom_seq = parse_pdb.get_atom_seq(pdb_file, chain)

        align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -0.5, -0.1)

        atom_seq_ali = align[-1][0]
        seq_ali = align[-1][1]

        # Re-index the PDB coordinates by FASTA position: positions missing
        # from the structure get a '-' placeholder, structure residues that
        # are absent from the FASTA sequence are skipped.
        j = 0
        gapped_res_lst = []
        gapped_cb_lst = []

        for atom_char, seq_char in zip(atom_seq_ali, seq_ali):
            if atom_char == '-':
                gapped_res_lst.append('-')
                gapped_cb_lst.append('-')
            elif seq_char == '-':
                j += 1
            else:
                gapped_res_lst.append(res_lst[j])
                gapped_cb_lst.append(cb_lst[j])
                j += 1

        if is_heavy:
            dist_mat = get_heavy_contacts(gapped_res_lst)
            heavy_cutoff = 5
            ref_contact_map = dist_mat < heavy_cutoff
            ref_contacts = np.where(dist_mat < heavy_cutoff)
        else:
            dist_mat = get_cb_contacts(gapped_cb_lst)
            cb_cutoff = 8
            ref_contact_map = dist_mat < cb_cutoff
            ref_contacts = np.where(dist_mat < cb_cutoff)

        ref_contacts_x = ref_contacts[0]
        ref_contacts_y = ref_contacts[1]

        PPVs, TPs, FPs = get_ppvs(contacts_x, contacts_y, ref_contact_map,
                                  atom_seq_ali, ref_len, factor)
        tp_colors = get_tp_colors(contacts_x, contacts_y, ref_contact_map,
                                  atom_seq_ali)

        print('%s %s %s %s' % (pdb_filename, PPVs[-1], TPs[-1], FPs[-1]))

        ax.scatter(ref_contacts_x,
                   ref_contacts_y,
                   marker='o',
                   c='#CCCCCC',
                   lw=0,
                   edgecolor='#CCCCCC')

    # plot predicted contacts from second contact map if given
    if c2_filename:
        with open(c2_filename, 'r') as c2_file:
            contacts2 = parse_contacts.parse(c2_file, sep)
        contacts2_x = []
        contacts2_y = []

        count = 0

        for contact in contacts2:
            c_x = contact[1] - 1
            c_y = contact[2] - 1

            if abs(c_x - c_y) >= 5:
                contacts2_x.append(c_x)
                contacts2_y.append(c_y)
                count += 1

            if count >= ref_len * factor:
                break

        # use TP/FP color coding if reference contacts given
        if pdb_filename:
            PPVs2, TPs2, FPs2 = get_ppvs(contacts2_x, contacts2_y,
                                         ref_contact_map, atom_seq_ali,
                                         ref_len, factor)
            tp2_colors = get_tp_colors(contacts2_x, contacts2_y,
                                       ref_contact_map, atom_seq_ali)
            print('%s %s %s %s' %
                  (pdb_filename, PPVs2[-1], TPs2[-1], FPs2[-1]))
            # The title needs the accession as well as both PPV values; the
            # previous code supplied a single value for two placeholders.
            fig.suptitle(
                '%s\nPPV (upper left) = %.2f | PPV (lower right) = %.2f' %
                (acc, PPVs[-1], PPVs2[-1]))
            sc = ax.scatter(contacts2_y[::-1],
                            contacts2_x[::-1],
                            marker='o',
                            c=tp2_colors[::-1],
                            s=6,
                            alpha=0.75,
                            linewidths=0.0)
            sc = ax.scatter(contacts_x[::-1],
                            contacts_y[::-1],
                            marker='o',
                            c=tp_colors[::-1],
                            s=6,
                            alpha=0.75,
                            linewidths=0.0)
        else:
            sc = ax.scatter(contacts2_y[::-1],
                            contacts2_x[::-1],
                            marker='o',
                            c='#D70909',
                            edgecolor='#D70909',
                            s=4,
                            linewidths=0.5)
            sc = ax.scatter(contacts_x[::-1],
                            contacts_y[::-1],
                            marker='o',
                            c='#004F9D',
                            edgecolor='#004F9D',
                            s=4,
                            linewidths=0.5)

    # plot predicted contacts from first contact map on both triangles
    # if no second contact map given
    else:
        if pdb_filename:
            fig.suptitle('%s\nPPV = %.2f' % (acc, PPVs[-1]))
            sc = ax.scatter(contacts_x[::-1],
                            contacts_y[::-1],
                            marker='o',
                            c=tp_colors[::-1],
                            s=6,
                            alpha=0.75,
                            linewidths=0.0)
            sc = ax.scatter(contacts_y[::-1],
                            contacts_x[::-1],
                            marker='o',
                            c=tp_colors[::-1],
                            s=6,
                            alpha=0.75,
                            linewidths=0.0)
        else:
            sc = ax.scatter(contacts_x[::-1],
                            contacts_y[::-1],
                            marker='o',
                            c=scores[::-1],
                            s=4,
                            alpha=0.75,
                            cmap=cm.jet,
                            linewidths=0.1)
            sc = ax.scatter(contacts_y[::-1],
                            contacts_x[::-1],
                            marker='o',
                            c=scores[::-1],
                            s=4,
                            alpha=0.75,
                            cmap=cm.jet,
                            linewidths=0.1)
            plt.colorbar(sc)

    plt.gca().set_xlim([0, ref_len])
    plt.gca().set_ylim([0, ref_len])

    if outfilename:
        if outfilename.endswith('.pdf'):
            pp = PdfPages(outfilename)
            pp.savefig(fig)
            pp.close()
        elif outfilename.endswith(('.png', '.jpg', '.jpeg')):
            plt.savefig(outfilename)
        else:
            pp = PdfPages('%s.pdf' % outfilename)
            pp.savefig(fig)
            pp.close()
    else:
        pp = PdfPages('%s_ContactMap.pdf' % c_filename)
        pp.savefig(fig)
        pp.close()
    plt.show()
Beispiel #5
0
def get_ppv(fasta_filename, c_filename, pdb_filename, factor=1.0,
        min_score=-1.0, chain='', sep=' ', outfilename='', name='', noalign=False, min_dist=5, print_dist=False):
    """Print and return PPV, TP and FP of predicted contacts vs. a structure.

    Args:
        fasta_filename: FASTA file with the target sequence.
        c_filename: Contact file read with ``parse_contacts.parse``; each
            entry is used as ``(score, i, j)`` with ``i``/``j`` shifted down
            by one.
        pdb_filename: Reference structure.
        factor: With the default ``min_score`` of -1.0, collection stops
            once ``factor * len(sequence)`` contacts have been kept.
        min_score: Collection stops after the first entry whose score is
            below this value (that entry itself is still kept if it passes
            the separation filter).
        chain: Chain identifier forwarded to the ``parse_pdb`` readers.
        sep: Separator forwarded to ``parse_contacts.parse``.
        outfilename: Forwarded to ``print_distances`` as ``outfile``.
        name: Label printed instead of the two file names when non-empty.
        noalign: Skip the sequence alignment and index the structure
            directly.
        min_dist: Minimum sequence separation of a kept contact.
        print_dist: Call ``print_distances`` (aligned mode only).

    Returns:
        Tuple ``(pdb_filename, PPV, TP, FP)``.
    """
    cb_cutoff = 8

    ### get sequence
    with open(fasta_filename, 'r') as fasta_file:
        seq = list(parse_fasta.read_fasta(fasta_file).values())[0][0]
    ref_len = len(seq)

    ### get top ranked predicted contacts
    with open(c_filename, 'r') as c_file:
        contacts = parse_contacts.parse(c_file, sep, min_dist=min_dist)

    contacts_x = []
    contacts_y = []
    scores = []

    count = 0
    for contact in contacts:
        score = contact[0]
        c_x = contact[1] - 1
        c_y = contact[2] - 1

        if abs(c_x - c_y) >= min_dist:
            contacts_x.append(c_x)
            contacts_y.append(c_y)
            scores.append(score)
            count += 1

        if min_score == -1.0 and count >= ref_len * factor:
            break
        if score < min_score:
            break

    with open(pdb_filename, 'r') as pdb_file:
        cb_lst = parse_pdb.get_cb_coordinates(pdb_file, chain)

    if noalign:
        dist_mat = get_cb_contacts(cb_lst)
        ref_contact_map = dist_mat < cb_cutoff
        PPV, TP, FP = get_ppv_helper(contacts_x, contacts_y, ref_contact_map, ref_len, factor)
    else:
        with open(pdb_filename, 'r') as pdb_file:
            atom_seq = parse_pdb.get_atom_seq(pdb_file, chain)

        align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -0.5, -0.1)
        atom_seq_ali = align[-1][0]
        seq_ali = align[-1][1]

        # Re-index the C-beta coordinates by FASTA position.  ali_lst holds,
        # per FASTA position, the index of the matching structure residue
        # or -9999 when the structure has a gap there.
        gapped_cb_lst = []
        ali_lst = []
        j = 0
        for atom_char, seq_char in zip(atom_seq_ali, seq_ali):
            if atom_char == '-':
                gapped_cb_lst.append(['-'])
                ali_lst.append(-9999)
            elif seq_char == '-':
                j += 1
            else:
                ali_lst.append(j)
                gapped_cb_lst.append(cb_lst[j])
                j += 1

        dist_mat = get_cb_contacts(gapped_cb_lst)
        if print_dist:
            # Area and surface distance are only needed for the report, so
            # the structure is parsed for them only on request.
            with open(pdb_filename, 'r') as pdb_file:
                area = parse_pdb.get_area(pdb_file, chain)
            with open(pdb_filename, 'r') as pdb_file:
                surf = parse_pdb.get_dist_to_surface(pdb_file, chain)
            print_distances(contacts_x, contacts_y, scores, dist_mat,
                            area, surf, ref_len, ref_len,
                            seq, ali_lst=ali_lst, atom_seq=atom_seq,
                            outfile=outfilename)
        ref_contact_map = dist_mat < cb_cutoff

        PPV, TP, FP = get_ppv_helper(contacts_x, contacts_y, ref_contact_map, ref_len, factor, atom_seq_ali=atom_seq_ali)

    if name:
        print('%s %s %s %s' % (name, PPV, TP, FP))
    else:
        print('%s %s %s %s %s' % (fasta_filename, c_filename, PPV, TP, FP))
    return (pdb_filename, PPV, TP, FP)
Beispiel #6
0
def get_ppv(fasta_filenameA,
            c_filename,
            pdb_filenameA,
            fasta_filenameB,
            pdb_filenameB,
            factor=1.0,
            min_score=-1.0,
            chainA='',
            chainB='',
            sep=' ',
            outfilename='',
            name='',
            noalign=False,
            min_dist=5,
            interfacelen=10,
            print_dist=False,
            cutoff=0.25):
    """Score predicted contacts against a two-chain reference and print stats.

    Contacts from ``c_filename`` are compared with C-beta contact maps of
    chain A, chain B and the concatenation of both.  PPV/TP/FP and Z-scores
    are printed for chain A, chain B, both chains, the interface and the
    exposed interface.

    Args:
        fasta_filenameA: FASTA file of chain A.
        c_filename: Contact file read with ``parse_contacts.parse``.
        pdb_filenameA: Structure file of chain A.
        fasta_filenameB: FASTA file of chain B (only its length is used).
        pdb_filenameB: Structure file of chain B.
        factor: Forwarded to ``get_interface_contacts`` and
            ``get_ppv_helper``.
        min_score: Not used; the code that read it is commented out.
        chainA: Chain identifier used with ``pdb_filenameA``.
        chainB: Chain identifier used with ``pdb_filenameB``.
        sep: Separator forwarded to ``parse_contacts.parse``.
        outfilename: Forwarded to ``print_distances`` as ``outfile``.
        name: Label printed for the per-chain lines when non-empty.
        noalign: Skip the sequence alignments.  NOTE(review): this path
            leaves several names unbound, see the note in the body.
        min_dist: Minimum sequence separation of a kept contact.
        interfacelen: Passed to ``get_ppv_helper`` for the per-chain PPVs
            and to ``get_ppv_helper_interface``.
        print_dist: Call ``print_distances`` when true.
        cutoff: Forwarded to ``get_ppv_helper_interface``.

    Returns:
        Tuple ``(pdb_filenameA, PPV, TP, FP)`` for the combined map.
    """

    ### get sequence
    seqA = parse_fasta.read_fasta(open(fasta_filenameA, 'r')).values()[0][0]
    seqB = parse_fasta.read_fasta(open(fasta_filenameB, 'r')).values()[0][0]
    seq = seqA + seqA  # Actually the contact map sequence is just two copies of seqA

    ref_lenA = len(seqA)
    ref_lenB = len(seqB)
    ref_len = len(seq)

    ### get top ranked predicted contacts
    contacts = parse_contacts.parse(open(c_filename, 'r'),
                                    sep,
                                    min_dist=min_dist)

    contacts_x = []
    contacts_y = []
    scores = []
    contactsA_x = []
    contactsA_y = []
    scoresA = []
    contactsB_x = []
    contactsB_y = []
    scoresB = []
    # The interface lists are never filled in this function; they only
    # take part in the length assertion below.
    contactsI_x = []
    contactsI_y = []
    scoresI = []
    contact_dict = {}

    # None of these counters is updated anywhere below.
    count = 0
    countA = 0
    countB = 0
    countI = 0
    for i in range(len(contacts)):
        score = contacts[i][0]
        # Shift both residue indices down by one.
        c_x = contacts[i][1] - 1
        c_y = contacts[i][2] - 1
        #print i,c_x,c_y,score

        pos_diff = abs(c_x - c_y)
        too_close = pos_diff < min_dist

        if not too_close:
            # The contacts only covers
            # Every kept contact goes into the combined, A and B lists
            # alike, so the three lists end up identical.
            contacts_x.append(c_x)
            contacts_y.append(c_y)
            scores.append(score)
            #contacts_x.append(c_x+ref_lenA)
            #contacts_y.append(c_y+ref_lenA)
            #scores.append(score)
            contactsA_x.append(c_x)
            contactsA_y.append(c_y)
            scoresA.append(score)
            contactsB_x.append(c_x)
            contactsB_y.append(c_y)
            scoresB.append(score)

            #        if min_score == -1.0 and count >= ref_len * factor:
            #            break
            #        if score < min_score:
            #            break

    assert (len(contacts_x) == len(contacts_y) == len(scores))
    assert (len(contactsA_x) == len(contactsA_y) == len(scoresA))
    assert (len(contactsB_x) == len(contactsB_y) == len(scoresB))
    assert (len(contactsI_x) == len(contactsI_y) == len(scoresI))

    # Per-chain data is read separately and concatenated (A first, then B)
    # for the combined map.
    cb_lstA = parse_pdb.get_cb_coordinates(open(pdb_filenameA, 'r'), chainA)
    cb_lstB = parse_pdb.get_cb_coordinates(open(pdb_filenameB, 'r'), chainB)
    cb_lst = cb_lstA + cb_lstB
    # The variables are called bfactor but are filled by parse_pdb.get_area.
    bfactorA = parse_pdb.get_area(open(pdb_filenameA, 'r'), chainA)
    bfactorB = parse_pdb.get_area(open(pdb_filenameB, 'r'), chainB)
    bfactor = bfactorA + bfactorB
    surfA = parse_pdb.get_dist_to_surface(open(pdb_filenameA, 'r'), chainA)
    surfB = parse_pdb.get_dist_to_surface(open(pdb_filenameB, 'r'), chainB)
    surf = surfA + surfB
    #print cb_lst,noalign
    if noalign:
        # NOTE(review): this branch does not bind atom_seq_ali,
        # atom_seq_aliA, atom_seq_aliB, ali_lst or atom_seq, yet all of them
        # are read further down, so noalign=True raises NameError.  The fix
        # depends on the defaults of the helper functions - confirm.
        dist_mat = get_cb_contacts(cb_lst)
        dist_matA = get_cb_contacts(cb_lstA)
        dist_matB = get_cb_contacts(cb_lstB)
        #PPV, TP, FP = get_ppv_helper(contacts_x, contacts_y, ref_contact_map, ref_len, factor)
    else:
        atom_seqA = parse_pdb.get_atom_seq(open(pdb_filenameA, 'r'), chainA)
        atom_seqB = parse_pdb.get_atom_seq(open(pdb_filenameB, 'r'), chainB)
        atom_seq = atom_seqA + atom_seqB
        # Scores: match 2, mismatch -1, gap open -0.5, gap extend -0.1.
        align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -0.5, -0.1)
        alignA = pairwise2.align.globalms(atom_seqA, seqA, 2, -1, -0.5, -0.1)
        alignB = pairwise2.align.globalms(atom_seqB, seqA, 2, -1, -0.5,
                                          -0.1)  # Align to seq A
        atom_seq_ali = align[-1][0]
        seq_ali = align[-1][1]
        atom_seq_aliA = alignA[-1][0]
        seq_aliA = alignA[-1][1]
        atom_seq_aliB = alignB[-1][0]
        seq_aliB = alignB[-1][1]
        gapped_cb_lst = []
        gapped_cb_lstA = []
        gapped_cb_lstB = []
        ali_lst = []
        ali_lstA = []
        ali_lstB = []
        # The three loops below do the same job for the combined, A and B
        # alignments: re-index the C-beta list by FASTA position.  A gap in
        # the structure yields a ['-'] placeholder and -9999 in the index
        # list; a structure residue missing from the FASTA sequence is
        # skipped.  j walks the structure residues; k is counted but unused.
        j = 0
        k = 0
        for i in xrange(len(atom_seq_ali)):
            #print i,j,k,seq_ali[i],atom_seq_ali[i]
            if atom_seq_ali[i] == '-':
                gapped_cb_lst.append(['-'])
                ali_lst.append(-9999)
                k += 1
            elif seq_ali[i] == '-':
                j += 1
                continue
            else:
                ali_lst.append(j)
                gapped_cb_lst.append(cb_lst[j])
                k += 1
                j += 1
        j = 0
        k = 0
        for i in xrange(len(atom_seq_aliA)):
            if atom_seq_aliA[i] == '-':
                gapped_cb_lstA.append(['-'])
                ali_lstA.append(-9999)
                k += 1
            elif seq_aliA[i] == '-':
                j += 1
                continue
            else:
                ali_lstA.append(j)
                gapped_cb_lstA.append(cb_lstA[j])
                k += 1
                j += 1
        j = 0
        k = 0
        for i in xrange(len(atom_seq_aliB)):
            #print "B",i,j,k,seq_aliB[i],atom_seq_aliB[i]
            if atom_seq_aliB[i] == '-':
                gapped_cb_lstB.append(['-'])
                ali_lstB.append(-9999)
                k += 1
            elif seq_aliB[i] == '-':
                j += 1
                continue
            else:
                ali_lstB.append(j)
                gapped_cb_lstB.append(cb_lstB[j])
                k += 1
                j += 1

        #print len(gapped_cb_lst),len(gapped_cb_lstA),len(gapped_cb_lstB)
        dist_mat = get_cb_contacts(gapped_cb_lst)
        dist_matA = get_cb_contacts(gapped_cb_lstA)
        dist_matB = get_cb_contacts(gapped_cb_lstB)
    cb_cutoff = 8
    #ref_contact_map = dist_mat < cb_cutoff
    # This routine adds all interface and B chain contacts
    # It is called with a threshold of cb_cutoff + 4, wider than the
    # cb_cutoff used for the reference maps below.
    contacts_x, contacts_y, scores = get_interface_contacts(
        contacts_x,
        contacts_y,
        scores,
        dist_mat,
        ref_lenA,
        factor,
        cb_cutoff + 4,
        atom_seq_ali=atom_seq_ali)
    ref_contact_map = dist_mat < cb_cutoff
    ref_contact_mapA = dist_matA < cb_cutoff
    ref_contact_mapB = dist_matB < cb_cutoff
    # Here we need to append
    if print_dist:
        print_distances(contacts_x,
                        contacts_y,
                        scores,
                        dist_mat,
                        bfactor,
                        surf,
                        ref_lenA,
                        ref_lenB,
                        seq,
                        ali_lst=ali_lst,
                        atom_seq=atom_seq,
                        outfile=outfilename)

    Zscore = get_Zscore(contacts_x,
                        contacts_y,
                        ref_contact_map,
                        scores,
                        atom_seq_ali=atom_seq_ali)
    ZscoreA = get_Zscore(contactsA_x,
                         contactsA_y,
                         ref_contact_mapA,
                         scoresA,
                         atom_seq_ali=atom_seq_aliA)
    ZscoreB = get_Zscore(contactsB_x,
                         contactsB_y,
                         ref_contact_mapB,
                         scoresB,
                         atom_seq_ali=atom_seq_aliB)
    ZscoreI = get_Zscore_interface(contacts_x,
                                   contacts_y,
                                   ref_contact_map,
                                   ref_lenA,
                                   ref_lenB,
                                   scores,
                                   atom_seq_ali=atom_seq_ali)

    PPV, TP, FP = get_ppv_helper(contacts_x,
                                 contacts_y,
                                 ref_contact_map,
                                 ref_len,
                                 factor,
                                 atom_seq_ali=atom_seq_ali)
    # The per-chain calls pass interfacelen where the combined call passes
    # ref_len.
    PPVa, TPa, FPa = get_ppv_helper(contactsA_x,
                                    contactsA_y,
                                    ref_contact_mapA,
                                    interfacelen,
                                    factor,
                                    atom_seq_ali=atom_seq_aliA)
    PPVb, TPb, FPb = get_ppv_helper(contactsB_x,
                                    contactsB_y,
                                    ref_contact_mapB,
                                    interfacelen,
                                    factor,
                                    atom_seq_ali=atom_seq_aliB)
    PPVi, TPi, FPi, PPViE, TPiE, FPiE = get_ppv_helper_interface(
        contacts_x,
        contacts_y,
        ref_contact_map,
        bfactor,
        ref_lenA,
        ref_lenB,
        interfacelen,
        cutoff,
        atom_seq_ali=atom_seq_ali)
    #for i in range(10):
    #    print "I: ",i,contactsI_x[i],contactsI_y[i],scoresI[i],dist_mat[contactsI_x[i]][contactsI_y[i]],ref_contact_map[contactsI_x[i]][contactsI_y[i]]
    #    print "A: ",i,contactsA_x[i],contactsA_y[i],scoresA[i],dist_mat[contactsA_x[i]][contactsA_y[i]],ref_contact_map[contactsA_x[i]][contactsA_y[i]]
    #    print "B: ",i,contactsB_x[i],contactsB_y[i],scoresB[i],dist_mat[contactsB_x[i]][contactsB_y[i]],ref_contact_map[contactsB_x[i]][contactsB_y[i]]

    # Report: per-chain lines first (labelled with name when given,
    # otherwise with the FASTA and contact file names), then two summary
    # lines that are printed in either case.
    if name:
        print '%s %s %s %s %s' % (name, PPVa, TPa, FPa, ZscoreA)
        print '%s %s %s %s %s' % (name, PPVb, TPb, FPb, ZscoreB)
        print '%s %s %s %s %s' % ("BOTH", PPV, TP, FP, Zscore)
        print '%s %s %s %s %s' % ("Interface", PPVi, TPi, FPi, ZscoreI)
        print '%s %s %s %s' % ("Interface-Exposed", PPViE, TPiE, FPiE)
    else:
        print '%s %s %s %s %s %s' % (fasta_filenameA, c_filename, PPVa, TPa,
                                     FPa, ZscoreA)
        print '%s %s %s %s %s %s' % (fasta_filenameB, c_filename, PPVb, TPb,
                                     FPb, ZscoreB)
        print '%s %s %s %s %s %s' % ("BOTH", c_filename, PPV, TP, FP, Zscore)
        print '%s %s %s %s %s %s' % ("Interface", c_filename, PPVi, TPi, FPi,
                                     ZscoreI)
        print '%s %s %s %s %s' % ("Interface-Exposed", c_filename, PPViE, TPiE,
                                  FPiE)
    print 'PPV %s %s %s %s %s %s' % (c_filename, PPV, PPVa, PPVb, PPVi, PPViE)
    print 'Zscore %s %s %s %s %s' % (c_filename, Zscore, ZscoreA, ZscoreB,
                                     ZscoreI)
    return (pdb_filenameA, PPV, TP, FP)
Beispiel #7
0
def get_dist(fasta_filename,
             c_filename,
             pdb_filename,
             chain='',
             sep='',
             outfilename='',
             noalign=False,
             dist_type='CB'):
    """Look up the structural distance of every predicted contact.

    Args:
        fasta_filename: FASTA file with the target sequence (needed for the
            alignment unless ``noalign`` is set).
        c_filename: Contact file read with ``parse_contacts.parse`` using
            ``min_dist=5``; each entry is used as ``(score, i, j)`` with
            ``i``/``j`` shifted down by one.
        pdb_filename: Reference structure.
        chain: Chain identifier forwarded to the ``parse_pdb`` readers.
        sep: Separator forwarded to ``parse_contacts.parse``.
            NOTE(review): the default is an empty string - confirm the
            parser accepts it.
        outfilename: When non-empty, one line ``x y score distance`` per
            contact is written to this file.
        noalign: Index the structure directly instead of aligning its ATOM
            sequence to the FASTA sequence first.
        dist_type: ``'CB'`` or ``'CA'`` select C-beta / C-alpha coordinates
            with ``get_dist_mat``; any other value uses the coordinates
            from ``parse_pdb.get_coordinates`` with ``get_dist_mat_heavy``.

    Returns:
        Tuple ``(contacts_x, contacts_y, scores, contacts_dist)`` of
        equally long lists.

    Raises:
        AssertionError: If ``get_dist_helper`` returns a different number
            of distances than there are contacts.
    """
    with open(c_filename, 'r') as c_file:
        contacts = parse_contacts.parse(c_file, sep, min_dist=5)

    contacts_x = [contact[1] - 1 for contact in contacts]
    contacts_y = [contact[2] - 1 for contact in contacts]
    scores = [contact[0] for contact in contacts]

    # Only the coordinate set selected by dist_type is read from the file.
    if dist_type == 'CB':
        read_coords = parse_pdb.get_cb_coordinates
        build_dist_mat = get_dist_mat
    elif dist_type == 'CA':
        read_coords = parse_pdb.get_ca_coordinates
        build_dist_mat = get_dist_mat
    else:
        read_coords = parse_pdb.get_coordinates
        build_dist_mat = get_dist_mat_heavy

    with open(pdb_filename, 'r') as pdb_file:
        coords = read_coords(pdb_file, chain)

    if noalign:
        dist_mat = build_dist_mat(coords)
        contacts_dist = get_dist_helper(contacts_x, contacts_y, dist_mat)
    else:
        with open(fasta_filename, 'r') as fasta_file:
            seq = list(parse_fasta.read_fasta(fasta_file).values())[0][0]
        with open(pdb_filename, 'r') as pdb_file:
            atom_seq = parse_pdb.get_atom_seq(pdb_file, chain)

        align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -0.5, -0.1)
        atom_seq_ali = align[-1][0]
        seq_ali = align[-1][1]

        # Re-index the coordinates by FASTA position: '-' where the
        # structure has a gap, structure-only residues are skipped.
        gapped_coords = []
        j = 0
        for atom_char, seq_char in zip(atom_seq_ali, seq_ali):
            if atom_char == '-':
                gapped_coords.append('-')
            elif seq_char == '-':
                j += 1
            else:
                gapped_coords.append(coords[j])
                j += 1

        dist_mat = build_dist_mat(gapped_coords)
        contacts_dist = get_dist_helper(contacts_x,
                                        contacts_y,
                                        dist_mat,
                                        atom_seq_ali=atom_seq_ali)

    assert (len(contacts_dist) == len(contacts_x) == len(contacts_y) ==
            len(scores))

    if outfilename:
        with open(outfilename, 'w') as outfile:
            for c_x, c_y, score, dist in zip(contacts_x, contacts_y, scores,
                                             contacts_dist):
                outfile.write('%s %s %f %f\n' % (c_x, c_y, score, dist))

    return (contacts_x, contacts_y, scores, contacts_dist)
Beispiel #8
0
def plot_map(fasta_filename, c_filename, factor, c2_filename='', psipred_horiz_fname='', psipred_vert_fname='', pdb_filename='', is_heavy=False, chain='', sep=',', outfilename=''):
    """Plot predicted residue contacts as a contact map and save the figure.

    Args:
        fasta_filename: FASTA file with the target sequence.  Its length
            sets the axis limits and, with ``factor``, the number of
            contacts kept.  The part of the name before the first dot,
            truncated to four characters, is used in the figure title.
        c_filename: Contact file read with ``parse_contacts.parse``.  Each
            entry is used as ``(score, i, j)``; ``i`` and ``j`` are shifted
            down by one (presumably 1-based residue numbers).
        factor: Contacts are collected until ``factor * len(sequence)`` of
            them have been kept.
        c2_filename: Optional second contact file.  When given, its contacts
            are drawn with x and y swapped and the first map is drawn once;
            otherwise the first map is drawn in both orientations.
        psipred_horiz_fname: Optional PSIPRED horizontal file (takes
            precedence over the vertical file).
        psipred_vert_fname: Optional PSIPRED vertical file.
        pdb_filename: Optional reference structure.  When given, reference
            contacts are drawn in grey, PPV/TP/FP are printed and predicted
            contacts are coloured by ``get_tp_colors``.
        is_heavy: Use ``get_heavy_contacts`` with cutoff 5 instead of
            ``get_cb_contacts`` with cutoff 8 for the reference map.
        chain: Chain identifier forwarded to the ``parse_pdb`` readers.
        sep: Separator forwarded to ``parse_contacts.parse``.
        outfilename: Output file.  ``.pdf`` is written through ``PdfPages``;
            ``.png``/``.jpg``/``.jpeg`` through ``plt.savefig``; any other
            name gets ``.pdf`` appended.  When empty the figure is written
            to ``<c_filename>_ContactMap.pdf``.

    Returns:
        None.

    Raises:
        AssertionError: If the secondary-structure length differs from the
            sequence length.
    """
    acc = fasta_filename.split('.')[0][:4]

    ### get sequence
    with open(fasta_filename, 'r') as fasta_file:
        seq = list(parse_fasta.read_fasta(fasta_file).values())[0][0]
    ref_len = len(seq)

    ### get top "factor" * "ref_len" predicted contacts
    with open(c_filename, 'r') as c_file:
        contacts = parse_contacts.parse(c_file, sep)

    contacts_x = []
    contacts_y = []
    scores = []

    count = 0
    for contact in contacts:
        score = contact[0]
        c_x = contact[1] - 1
        c_y = contact[2] - 1

        # Pairs closer than 5 positions in sequence are dropped.
        if abs(c_x - c_y) >= 5:
            contacts_x.append(c_x)
            contacts_y.append(c_y)
            scores.append(score)
            count += 1

        if count >= ref_len * factor:
            break

    ### start plotting
    fig = plt.figure()
    ax = fig.add_subplot(111)

    ### plot secondary structure on the diagonal if given
    if psipred_horiz_fname or psipred_vert_fname:
        if psipred_horiz_fname:
            with open(psipred_horiz_fname, 'r') as ss_file:
                ss = parse_psipred.horizontal(ss_file)
        else:
            with open(psipred_vert_fname, 'r') as ss_file:
                ss = parse_psipred.vertical(ss_file)

        assert len(ss) == ref_len

        # Only 'H' and 'E' are marked; every other state is left blank.
        for i, state in enumerate(ss):
            if state == 'H':
                plt.plot(i, i, 'o', c='#8B0043', mec="#8B0043", markersize=2)
            elif state == 'E':
                plt.plot(i, i, 'D', c='#0080AD', mec="#0080AD", markersize=2)

    ### plot reference contacts in the background if given
    if pdb_filename:
        with open(pdb_filename, 'r') as pdb_file:
            res_lst = parse_pdb.get_coordinates(pdb_file, chain)
        with open(pdb_filename, 'r') as pdb_file:
            cb_lst = parse_pdb.get_cb_coordinates(pdb_file, chain)
        with open(pdb_filename, 'r') as pdb_file:
            atom_seq = parse_pdb.get_atom_seq(pdb_file, chain)

        align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -0.5, -0.1)

        atom_seq_ali = align[-1][0]
        seq_ali = align[-1][1]

        # Re-index the PDB coordinates by FASTA position: positions missing
        # from the structure get a '-' placeholder, structure residues that
        # are absent from the FASTA sequence are skipped.
        j = 0
        gapped_res_lst = []
        gapped_cb_lst = []

        for atom_char, seq_char in zip(atom_seq_ali, seq_ali):
            if atom_char == '-':
                gapped_res_lst.append('-')
                gapped_cb_lst.append('-')
            elif seq_char == '-':
                j += 1
            else:
                gapped_res_lst.append(res_lst[j])
                gapped_cb_lst.append(cb_lst[j])
                j += 1

        if is_heavy:
            dist_mat = get_heavy_contacts(gapped_res_lst)
            heavy_cutoff = 5
            ref_contact_map = dist_mat < heavy_cutoff
            ref_contacts = np.where(dist_mat < heavy_cutoff)
        else:
            dist_mat = get_cb_contacts(gapped_cb_lst)
            cb_cutoff = 8
            ref_contact_map = dist_mat < cb_cutoff
            ref_contacts = np.where(dist_mat < cb_cutoff)

        ref_contacts_x = ref_contacts[0]
        ref_contacts_y = ref_contacts[1]

        PPVs, TPs, FPs = get_ppvs(contacts_x, contacts_y, ref_contact_map,
                                  atom_seq_ali, ref_len, factor)
        tp_colors = get_tp_colors(contacts_x, contacts_y, ref_contact_map,
                                  atom_seq_ali)

        print('%s %s %s %s' % (pdb_filename, PPVs[-1], TPs[-1], FPs[-1]))

        ax.scatter(ref_contacts_x, ref_contacts_y, marker='o', c='#CCCCCC',
                   lw=0, edgecolor='#CCCCCC')

    ### plot predicted contacts from second contact map if given
    if c2_filename:
        with open(c2_filename, 'r') as c2_file:
            contacts2 = parse_contacts.parse(c2_file, sep)
        contacts2_x = []
        contacts2_y = []

        count = 0

        for contact in contacts2:
            c_x = contact[1] - 1
            c_y = contact[2] - 1

            if abs(c_x - c_y) >= 5:
                contacts2_x.append(c_x)
                contacts2_y.append(c_y)
                count += 1

            if count >= ref_len * factor:
                break

        ### use TP/FP color coding if reference contacts given
        if pdb_filename:
            PPVs2, TPs2, FPs2 = get_ppvs(contacts2_x, contacts2_y,
                                         ref_contact_map, atom_seq_ali,
                                         ref_len, factor)
            tp2_colors = get_tp_colors(contacts2_x, contacts2_y,
                                       ref_contact_map, atom_seq_ali)
            print('%s %s %s %s' % (pdb_filename, PPVs2[-1], TPs2[-1], FPs2[-1]))
            fig.suptitle('%s\nPPV (upper left) = %.2f | PPV (lower right) = %.2f' % (acc, PPVs[-1], PPVs2[-1]))
            sc = ax.scatter(contacts2_y[::-1], contacts2_x[::-1], marker='o',
                            c=tp2_colors[::-1], s=6, alpha=0.75,
                            linewidths=0.0)
            sc = ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o',
                            c=tp_colors[::-1], s=6, alpha=0.75,
                            linewidths=0.0)
        else:
            sc = ax.scatter(contacts2_y[::-1], contacts2_x[::-1], marker='o',
                            c='#D70909', edgecolor='#D70909', s=4,
                            linewidths=0.5)
            sc = ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o',
                            c='#004F9D', edgecolor='#004F9D', s=4,
                            linewidths=0.5)

    ### plot predicted contacts from first contact map on both triangles
    ### if no second contact map given
    else:
        if pdb_filename:
            fig.suptitle('%s\nPPV = %.2f' % (acc, PPVs[-1]))
            sc = ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o',
                            c=tp_colors[::-1], s=6, alpha=0.75,
                            linewidths=0.0)
            sc = ax.scatter(contacts_y[::-1], contacts_x[::-1], marker='o',
                            c=tp_colors[::-1], s=6, alpha=0.75,
                            linewidths=0.0)
        else:
            sc = ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o',
                            c=scores[::-1], s=4, alpha=0.75, cmap=cm.jet,
                            linewidths=0.1)
            sc = ax.scatter(contacts_y[::-1], contacts_x[::-1], marker='o',
                            c=scores[::-1], s=4, alpha=0.75, cmap=cm.jet,
                            linewidths=0.1)
            plt.colorbar(sc)

    plt.gca().set_xlim([0, ref_len])
    plt.gca().set_ylim([0, ref_len])

    if outfilename:
        if outfilename.endswith('.pdf'):
            pp = PdfPages(outfilename)
            pp.savefig(fig)
            pp.close()
        elif outfilename.endswith(('.png', '.jpg', '.jpeg')):
            plt.savefig(outfilename)
        else:
            pp = PdfPages('%s.pdf' % outfilename)
            pp.savefig(fig)
            pp.close()
    else:
        pp = PdfPages('%s_ContactMap.pdf' % c_filename)
        pp.savefig(fig)
        pp.close()
def fix(pdb1_filename, pdb2_filename, out_filename, chain1='', chain2=''):
    """Renumber the residues of one PDB chain to match another chain.

    The atom sequences of both chains are globally aligned and every
    residue of the second structure receives the 1-based position of the
    first-structure residue it is aligned with.  The renumbered structure
    is written with ``parse_pdb.write``.

    Args:
        pdb1_filename: path of the reference PDB file.
        pdb2_filename: path of the PDB file to renumber.
        out_filename: output path; when empty, the output is written next
            to ``pdb2_filename`` with its extension replaced by
            ``.aligned.pdb``.
        chain1: chain id in the reference file; when empty, the result of
            ``parse_pdb.get_first_chain`` is used.
        chain2: chain id in the file to renumber; same default rule.

    Raises:
        IndexError: if the aligner returns no alignment.
    """
    import os.path

    chain1 = chain1 or _first_chain(pdb1_filename)
    with open(pdb1_filename, 'r') as handle:
        seq1 = parse_pdb.get_atom_seq(handle, chain1)

    chain2 = chain2 or _first_chain(pdb2_filename)
    with open(pdb2_filename, 'r') as handle:
        pdb2 = parse_pdb.read(handle, chain2)
    with open(pdb2_filename, 'r') as handle:
        seq2 = parse_pdb.get_atom_seq(handle, chain2)

    # Scores: match 2, mismatch -1, gap open -0.5, gap extend -0.1.
    # The last alignment in the returned list is the one used.
    align = pairwise2.align.globalms(seq1, seq2, 2, -1, -0.5, -0.1)
    seq1_ali = align[-1][0]
    seq2_ali = align[-1][1]

    pdb2_idx = _aligned_positions(seq1_ali, seq2_ali)

    # Same three-slot layout as the parse_pdb.read result: the first slot
    # is blanked, the residue list is rebuilt, the third slot is reused.
    # (Presumably header / residues / trailer -- confirm in parse_pdb.)
    pdb2_new = ['', _renumber_residues(pdb2[1], pdb2_idx), pdb2[2]]

    if not out_filename:
        # splitext only strips an extension from the final path component,
        # so dots in directory names are left untouched.
        out_filename = os.path.splitext(pdb2_filename)[0] + '.aligned.pdb'
    with open(out_filename, 'w') as pdb2_outfile:
        parse_pdb.write(pdb2_new, pdb2_outfile)


def _first_chain(pdb_filename):
    """Return ``parse_pdb.get_first_chain`` for the file, closing the handle."""
    with open(pdb_filename, 'r') as handle:
        return parse_pdb.get_first_chain(handle)


def _aligned_positions(seq1_ali, seq2_ali):
    """Return, per residue of the second sequence, its first-sequence position.

    Both arguments are gapped alignment strings of equal length.  The
    result has one entry for every non-gap character of ``seq2_ali``: the
    number of first-sequence residues seen up to and including that
    column.  A residue facing a gap in ``seq1_ali`` therefore repeats the
    last position handed out (0 if none yet).
    """
    positions = []
    seq1_pos = 0
    for res1, res2 in zip(seq1_ali, seq2_ali):
        if res1 == '-':
            # Insertion in the second sequence: no new reference position.
            positions.append(seq1_pos)
        else:
            seq1_pos += 1
            if res2 != '-':
                positions.append(seq1_pos)
            # else: deletion in the second sequence, nothing to renumber.
    return positions


def _renumber_residues(residues, positions):
    """Return the residues with the number field of each atom line rewritten.

    ``residues`` is an iterable of residues, each an iterable of atom line
    strings; ``positions`` supplies the new number of each residue in
    order.  Pairing stops at the shorter of the two.  Residues whose
    position is 0 or equal to the last written position are dropped.
    """
    renumbered = []
    prev_idx = -1
    for res, new_idx in zip(residues, positions):
        # NOTE(review): an insertion that directly follows a deletion
        # carries a position that has not been written yet and is
        # therefore kept -- confirm this is intended.
        if new_idx == 0 or new_idx == prev_idx:
            continue
        # Characters 22..25 of the atom line are replaced by the new
        # number, right-justified in four characters.
        resno = str(new_idx).rjust(4)
        renumbered.append([atm[:22] + resno + atm[26:] for atm in res])
        prev_idx = new_idx
    return renumbered
def plot_map(fasta_filename, c_filename, factor, c2_filename='', psipred_filename='', pdb_filename='', is_heavy=False, chain='', sep='', sep2='', outfilename=''):  
   
    acc = c_filename.split('.')[0]

    ### get sequence
    #seq = parse_fasta.read_fasta(open(fasta_filename, 'r')).values()[0][0]
    #ref_len = len(seq)

    ### get id
    f = open(fasta_filename,"rU")
    seqs = SeqIO.parse(f,"fasta")
    
    # we assume there is only one record
    for record in seqs: 
        seq = str(record.seq)
        protein_id = record.id 
        ref_len = len(seq)

    # guessing separator of constraint file
    if sep == '':
        line = open(c_filename,'r').readline()
        if len(line.split(',')) != 1:
            sep = ','
        elif len(line.split(' ')) != 1:
            sep = ' '
        else:
            sep = '\t'

    ### get top "factor" * "ref_len" predicted contacts
    contacts = parse_contacts.parse(open(c_filename, 'r'), sep)

    contacts_x = []
    contacts_y = []
    scores = []
    contact_dict = {}

    count = 0
    for i in range(len(contacts)):
        score = contacts[i][0]
        c_x = contacts[i][1] - 1
        c_y = contacts[i][2] - 1

        pos_diff = abs(c_x - c_y)
        too_close = pos_diff < 5

        if not too_close:
            contacts_x.append(c_x)
            contacts_y.append(c_y)
            scores.append(score)
            count += 1
           
        if count >= ref_len * factor:
            break
 

    ### start plotting
    fig = plt.figure()
    plt.title('Contact map for ' + protein_id)
    ax = fig.add_subplot(111)

    ### plot secondary structure on the diagonal if given
    if psipred_filename:
        ss = parse_psipred.horizontal(open(psipred_filename, 'r'))
        for i in range(len(ss)):
            if ss[i] == 'H':
                plt.plot(i, i, 'o', c='#8B0043', mec="#8B0043", markersize=2)
            if ss[i] == 'E':
                plt.plot(i, i, 'D', c='#0080AD', mec="#0080AD", markersize=2)
            if ss[i] == 'C':
                continue

    ### plot reference contacts in the background if given
    if pdb_filename:
        res_lst = parse_pdb.get_coordinates(open(pdb_filename, 'r'), chain)
        cb_lst = parse_pdb.get_cb_coordinates(open(pdb_filename, 'r'), chain)
        atom_seq = parse_pdb.get_atom_seq(open(pdb_filename, 'r'), chain)

        align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -0.5, -0.1)
        
        if (len(res_lst)==0) or (len(cb_lst)==0):
            print "Could not parse the PDB file, res_list or cb_list is empty"
            return

        try:
            atom_seq_ali = align[-1][0]
            seq_ali = align[-1][1] 
        except Exception,ex:
            print "Could not parse the PDB file:", ex
            return
        
        j = 0
        gapped_res_lst = []
        gapped_cb_lst = []

        for i in xrange(len(atom_seq_ali)):
            if atom_seq_ali[i] == '-':
                gapped_res_lst.append('-')
                gapped_cb_lst.append('-')
            elif seq_ali[i] == '-':
                j += 1
                continue
            else:
                gapped_res_lst.append(res_lst[j])
                gapped_cb_lst.append(cb_lst[j])
                j += 1

        if is_heavy:
            dist_mat = get_heavy_contacts(gapped_res_lst)
            heavy_cutoff = 5
            ref_contact_map = dist_mat < heavy_cutoff
            ref_contacts = np.where(dist_mat < heavy_cutoff)
        else:
            dist_mat = get_cb_contacts(gapped_cb_lst)
            cb_cutoff = 8
            ref_contact_map = dist_mat < cb_cutoff
            ref_contacts = np.where(dist_mat < cb_cutoff)
        
        ref_contacts_x = ref_contacts[0]
        ref_contacts_y = ref_contacts[1]
       
        PPVs = get_ppvs(contacts_x, contacts_y, ref_contact_map, atom_seq_ali, ref_len, factor)
        tp_colors = get_tp_colors(contacts_x, contacts_y, ref_contact_map, atom_seq_ali)
   
        print '%s\t%s' % (acc, PPVs[-1])
      
        ax.scatter(ref_contacts_x, ref_contacts_y, marker='o', c='#CCCCCC', lw=0, edgecolor='#CCCCCC')