Beispiel #1
0
def get_ppv_hbond(fasta_filename,
                  c_filename,
                  hbond_filename,
                  factor=1.0,
                  min_score=-1.0,
                  sep=' ',
                  outfilename=''):
    """Score predicted contacts against a hydrogen-bond reference map.

    Args:
        fasta_filename: FASTA file; the first sequence of the first entry
            defines the reference length.
        c_filename: predicted-contact file, read via ``parse_contacts.parse``.
            Each parsed entry is indexed as ``(score, i, j)`` with 1-based
            residue numbers.
        hbond_filename: whitespace-separated text file with one
            ``i j value`` triple per line (1-based residue numbers).
        factor: when ``min_score`` is left at ``-1.0``, selection stops once
            ``factor * sequence_length`` contacts have been kept.
        min_score: selection stops at the first contact scoring below this
            value (that contact is still kept if it passes the separation
            filter, as in the original implementation).
        sep: field separator handed to ``parse_contacts.parse``.
        outfilename: accepted for interface compatibility; not used here.

    Returns:
        Tuple ``(hbond_filename, PPV, TP, FP)`` where the last three values
        come from ``get_ppv_helper``.
    """
    # Reference length comes from the first sequence of the first FASTA entry.
    with open(fasta_filename, 'r') as fasta_file:
        seq = list(parse_fasta.read_fasta(fasta_file).values())[0][0]
    ref_len = len(seq)

    with open(c_filename, 'r') as contact_file:
        contacts = parse_contacts.parse(contact_file, sep)

    contacts_x = []
    contacts_y = []
    max_contacts = ref_len * factor

    for contact in contacts:
        score = contact[0]
        c_x = contact[1] - 1  # convert to 0-based indices
        c_y = contact[2] - 1

        # Keep only pairs at least five positions apart in sequence.
        if abs(c_x - c_y) >= 5:
            contacts_x.append(c_x)
            contacts_y.append(c_y)

        if min_score == -1.0 and len(contacts_x) >= max_contacts:
            break
        if score < min_score:
            break

    # Symmetric reference map holding the negated value of each listed pair.
    ref_contact_map = np.zeros((ref_len, ref_len))

    with open(hbond_filename) as hbond_file:
        for line in hbond_file:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            i = int(fields[0]) - 1
            j = int(fields[1]) - 1
            val = float(fields[2])
            ref_contact_map[i, j] = -val
            ref_contact_map[j, i] = -val

    PPV, TP, FP = get_ppv_helper(contacts_x, contacts_y, ref_contact_map,
                                 ref_len, factor)

    # Single-argument print() emits the same text on Python 2 and Python 3.
    print('%s %s %s %s' % (hbond_filename, PPV, TP, FP))
    return (hbond_filename, PPV, TP, FP)
Beispiel #2
0
def realign(fasta_filename, pdb_filename, outfilename='', chain='*'):
    """Renumber the ATOM records of a PDB file to match a FASTA sequence.

    The residue sequence extracted from the PDB file is globally aligned to
    the first FASTA sequence. Every PDB residue aligned to a sequence
    position receives that position (1-based) as its new residue number;
    residues aligned to a gap are numbered -9999 and are not written.
    Kept records are emitted through ``parse_pdb.write_pdb_atm_record``.

    Args:
        fasta_filename: FASTA file; its first sequence is the target numbering.
        pdb_filename: PDB file to renumber.
        outfilename: accepted for interface compatibility; not used here.
        chain: chain identifier to process. ``'*'`` accepts every chain; a
            falsy value selects the chain returned by ``get_first_chain``.

    Returns:
        None.
    """
    with open(fasta_filename, 'r') as fasta_file:
        seq = list(parse_fasta.read_fasta(fasta_file).values())[0][0]
    with open(pdb_filename, 'r') as pdb_handle:
        atom_seq = parse_pdb.get_atom_seq(pdb_handle, chain)

    align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -0.5, -0.1)
    atom_seq_ali = align[-1][0]
    seq_ali = align[-1][1]

    # resno maps the running (1-based) PDB residue count to its position in
    # the FASTA sequence, or to -9999 when the residue has no counterpart.
    resno = {}
    atompos = 0
    seqpos = 0
    for atom_char, seq_char in zip(atom_seq_ali, seq_ali):
        if atom_char == "-":
            seqpos += 1
        elif seq_char == "-":
            atompos += 1
            resno[atompos] = -9999
        else:
            atompos += 1
            seqpos += 1
            resno[atompos] = seqpos

    with open(pdb_filename, 'r') as pdbfile:
        if not chain:
            chain = get_first_chain(pdbfile)
            pdbfile.seek(0)

        res_i = -9999  # residue number of the previous ATOM record
        i = 0          # running count of distinct residues seen so far
        for line in pdbfile:
            if not line.startswith('ATOM'):
                continue
            atm_record = parse_pdb.parse_atm_record(line)
            if atm_record['chain'] != ' ' and atm_record[
                    'chain'] != chain and chain != '*':
                continue
            # A change in residue number marks the start of the next residue.
            if atm_record['res_no'] != res_i:
                i += 1
                res_i = atm_record['res_no']
            atm_record['res_no'] = resno[i]
            if resno[i] > 0:
                parse_pdb.write_pdb_atm_record(atm_record)

    return
def plot_map(fasta_filename,
             c_filename,
             factor,
             c2_filename='',
             ss_fname='',
             psipred_horiz_fname='',
             psipred_vert_fname='',
             pdb_filename='',
             is_heavy=False,
             chain='',
             sep=',',
             outfilename=''):
    """Plot a predicted contact map, optionally against references.

    Args:
        fasta_filename: FASTA file; its first sequence defines the map size.
        c_filename: predicted-contact file (entries indexed as
            ``(score, i, j)`` with 1-based residue numbers).
        factor: at most ``factor * sequence_length`` contacts are plotted.
        c2_filename: optional second contact file, drawn in the other
            triangle of the map.
        ss_fname, psipred_horiz_fname, psipred_vert_fname: optional
            secondary-structure sources; the first one given in the order
            horizontal, vertical, ``ss_fname`` is drawn on the diagonal.
        pdb_filename: optional reference structure. When given, reference
            contacts are drawn in grey and predictions are coloured via
            ``get_tp_colors``.
        is_heavy: use heavy-atom distances (cutoff 5) instead of CB
            distances (cutoff 8) for the reference map.
        chain: chain identifier handed to the ``parse_pdb`` readers.
        sep: field separator handed to ``parse_contacts.parse``.
        outfilename: output file. ``.png``/``.jpg``/``.jpeg`` are saved via
            ``plt.savefig``; anything else is written as PDF (``.pdf`` is
            appended when missing). Empty means
            ``'<c_filename>_ContactMap.pdf'``.

    Returns:
        None. The figure is saved and then shown with ``plt.show()``.
    """

    def _read(parser, filename, *args):
        # Parse a file and make sure the handle is closed afterwards.
        with open(filename, 'r') as handle:
            return parser(handle, *args)

    def _top_contacts(contact_list, max_contacts):
        # Keep contacts at least five residues apart until the quota is met.
        xs, ys, vals = [], [], []
        for contact in contact_list:
            c_x = contact[1] - 1  # convert to 0-based indices
            c_y = contact[2] - 1
            if abs(c_x - c_y) >= 5:
                xs.append(c_x)
                ys.append(c_y)
                vals.append(contact[0])
            if len(xs) >= max_contacts:
                break
        return xs, ys, vals

    acc = fasta_filename.split('.')[0][:4]

    # get sequence
    seq = list(_read(parse_fasta.read_fasta, fasta_filename).values())[0][0]
    ref_len = len(seq)
    max_contacts = ref_len * factor

    # get top "factor" * "ref_len" predicted contacts
    contacts_x, contacts_y, scores = _top_contacts(
        _read(parse_contacts.parse, c_filename, sep), max_contacts)

    # start plotting
    fig = plt.figure()
    ax = fig.add_subplot(111)

    # plot secondary structure on the diagonal if given
    if psipred_horiz_fname or psipred_vert_fname or ss_fname:
        if psipred_horiz_fname:
            ss = _read(parse_psipred.horizontal, psipred_horiz_fname)
        elif psipred_vert_fname:
            ss = _read(parse_psipred.vertical, psipred_vert_fname)
        else:
            ss = _read(parse_ss.parse, ss_fname)

        assert len(ss) == ref_len

        # 'H' and 'E' states get a marker; every other state is left blank.
        for pos, state in enumerate(ss):
            if state == 'H':
                plt.plot(pos, pos, 'o', c='#8B0043', mec="#8B0043",
                         markersize=2)
            if state == 'E':
                plt.plot(pos, pos, 'D', c='#0080AD', mec="#0080AD",
                         markersize=2)

    # plot reference contacts in the background if given
    if pdb_filename:
        res_lst = _read(parse_pdb.get_coordinates, pdb_filename, chain)
        cb_lst = _read(parse_pdb.get_cb_coordinates, pdb_filename, chain)
        atom_seq = _read(parse_pdb.get_atom_seq, pdb_filename, chain)

        align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -0.5, -0.1)
        atom_seq_ali = align[-1][0]
        seq_ali = align[-1][1]

        # Re-index the structure onto the FASTA sequence: positions missing
        # from the structure become '-', residues missing from the sequence
        # are skipped.
        j = 0
        gapped_res_lst = []
        gapped_cb_lst = []
        for atom_char, seq_char in zip(atom_seq_ali, seq_ali):
            if atom_char == '-':
                gapped_res_lst.append('-')
                gapped_cb_lst.append('-')
            elif seq_char == '-':
                j += 1
            else:
                gapped_res_lst.append(res_lst[j])
                gapped_cb_lst.append(cb_lst[j])
                j += 1

        if is_heavy:
            dist_mat = get_heavy_contacts(gapped_res_lst)
            ref_contact_map = dist_mat < 5
        else:
            dist_mat = get_cb_contacts(gapped_cb_lst)
            ref_contact_map = dist_mat < 8

        ref_contacts_x, ref_contacts_y = np.where(ref_contact_map)[:2]

        PPVs, TPs, FPs = get_ppvs(contacts_x, contacts_y, ref_contact_map,
                                  atom_seq_ali, ref_len, factor)
        tp_colors = get_tp_colors(contacts_x, contacts_y, ref_contact_map,
                                  atom_seq_ali)

        print('%s %s %s %s' % (pdb_filename, PPVs[-1], TPs[-1], FPs[-1]))

        ax.scatter(ref_contacts_x,
                   ref_contacts_y,
                   marker='o',
                   c='#CCCCCC',
                   lw=0,
                   edgecolor='#CCCCCC')

    # Shared marker styles for the three kinds of prediction scatter plots.
    tp_style = dict(marker='o', s=6, alpha=0.75, linewidths=0.0)
    plain_style = dict(marker='o', s=4, linewidths=0.5)
    score_style = dict(marker='o', s=4, alpha=0.75, cmap=cm.jet,
                       linewidths=0.1)

    # Contacts are drawn in reverse order (last selected first).
    rev_x = contacts_x[::-1]
    rev_y = contacts_y[::-1]

    # plot predicted contacts from second contact map if given
    if c2_filename:
        contacts2_x, contacts2_y, _ = _top_contacts(
            _read(parse_contacts.parse, c2_filename, sep), max_contacts)
        rev2_x = contacts2_x[::-1]
        rev2_y = contacts2_y[::-1]

        # use TP/FP color coding if reference contacts given
        if pdb_filename:
            PPVs2, TPs2, FPs2 = get_ppvs(contacts2_x, contacts2_y,
                                         ref_contact_map, atom_seq_ali,
                                         ref_len, factor)
            tp2_colors = get_tp_colors(contacts2_x, contacts2_y,
                                       ref_contact_map, atom_seq_ali)
            print('%s %s %s %s' %
                  (pdb_filename, PPVs2[-1], TPs2[-1], FPs2[-1]))
            # The first format string has two placeholders, so it needs the
            # accession as well as the PPV value.
            fig.suptitle('%s\nPPV (upper left) = %.2f |' % (acc, PPVs[-1]) +
                         'PPV (lower right) = %.2f' % (PPVs2[-1]))
            ax.scatter(rev2_y, rev2_x, c=tp2_colors[::-1], **tp_style)
            ax.scatter(rev_x, rev_y, c=tp_colors[::-1], **tp_style)
        else:
            ax.scatter(rev2_y, rev2_x, c='#D70909', edgecolor='#D70909',
                       **plain_style)
            ax.scatter(rev_x, rev_y, c='#004F9D', edgecolor='#004F9D',
                       **plain_style)

    # plot predicted contacts from first contact map on both triangles
    # if no second contact map given
    elif pdb_filename:
        fig.suptitle('%s\nPPV = %.2f' % (acc, PPVs[-1]))
        ax.scatter(rev_x, rev_y, c=tp_colors[::-1], **tp_style)
        ax.scatter(rev_y, rev_x, c=tp_colors[::-1], **tp_style)
    else:
        ax.scatter(rev_x, rev_y, c=scores[::-1], **score_style)
        sc = ax.scatter(rev_y, rev_x, c=scores[::-1], **score_style)
        plt.colorbar(sc)

    plt.gca().set_xlim([0, ref_len])
    plt.gca().set_ylim([0, ref_len])

    if outfilename.endswith(('.png', '.jpg', '.jpeg')):
        plt.savefig(outfilename)
    else:
        if not outfilename:
            pdf_name = '%s_ContactMap.pdf' % c_filename
        elif outfilename.endswith('.pdf'):
            pdf_name = outfilename
        else:
            pdf_name = '%s.pdf' % outfilename
        pp = PdfPages(pdf_name)
        pp.savefig(fig)
        pp.close()
    plt.show()
def get_ppv(
    fasta_filename,
    contact_filename,
    pdb_filename,
    factor_value,
    cb_cutoff,
    min_score,
    chain1,
    chain2,
    outfilename,
    name,
    noalign,
    min_dist,
    print_dist
):
    """Evaluate predicted contacts against a PDB structure.

    Three modes are supported:

    * ``noalign``: CB distances of ``chain1`` are used as they are.
    * PPI: ``chain2`` is non-empty and differs from ``chain1``; predictions
      are checked against the chain1-vs-chain2 distance matrix.
    * monomer: everything else; predictions are checked against the
      intra-chain distance matrix of ``chain1``.

    In the last two modes the CB list is first re-indexed onto the FASTA
    sequence by ``get_gapped_cb_lts``.

    Returns:
        Tuple ``(pdb_filename, PPV, TP, FP)``; the last three values are
        whatever ``get_ppv_helper`` returns.
    """
    print("In get_ppv function")

    # read_fasta yields a mapping whose values are lists of sequences;
    # the query is the first sequence of the first entry.
    fasta_records = parse_fasta.read_fasta(fasta_filename)
    seq = list(fasta_records.values())[0][0]
    ref_len = len(seq)

    # Predicted contacts that satisfy the selection arguments.
    contacts_x, contacts_y, scores = get_scores_from_contacts(
        contact_filename, min_dist, factor_value, min_score, ref_len
    )

    # CB coordinates of chain1, one entry per residue.
    # NOTE: Bio.PDB.PDBParser could be used as an alternative source of CB
    # coordinates (iterate residues and read residue["CB"].get_coord()).
    cb_chain1_lst = parse_pdb.get_cb_coordinates(pdb_filename, chain1)

    if noalign:
        # Reference contacts: residue pairs closer than cb_cutoff.
        ref_contact_map = get_cb_contacts(cb_chain1_lst) < cb_cutoff
        PPV, TP, FP = get_ppv_helper(
            contacts_x, contacts_y, ref_contact_map, atom_seq_ali=[]
        )
    elif not (chain2 == "" or chain2 == chain1):
        print("In the PPI branch of get_ppv")
        cb_chain2_lst = parse_pdb.get_cb_coordinates(pdb_filename, chain2)

        gapped_first = get_gapped_cb_lts(pdb_filename, chain1, seq, cb_chain1_lst)
        gapped_second = get_gapped_cb_lts(pdb_filename, chain2, seq, cb_chain2_lst)

        # Inter-chain distances; the intra-chain matrix of chain1 is not
        # needed in this mode.
        inter_chain_dist = get_cb_contacts_PPI(gapped_first, gapped_second)
        ref_contact_map = inter_chain_dist < cb_cutoff

        # Aligned PDB-derived sequence (a string) for each chain.
        ali_first = get_global_align_from_pdb(pdb_filename, chain1, seq)[-1][0]
        ali_second = get_global_align_from_pdb(pdb_filename, chain2, seq)[-1][0]

        # NOTE(review): only the chain1 alignment is handed to
        # get_ppv_helper; whether chain2's (or the better of the two)
        # should be used is an open question.
        print("Getting PPV, TP and FP values...")
        PPV, TP, FP = get_ppv_helper(
            contacts_x, contacts_y, ref_contact_map, ali_first
        )

        if print_dist:
            print("Printing PPI's distance results...")
            print_distances_PPI(
                contacts_x,
                contacts_y,
                scores,
                inter_chain_dist,
                ali_first,
                ali_second,
                outfilename
            )
    else:
        print("In the monomer branch of get_ppv")
        gapped_first = get_gapped_cb_lts(pdb_filename, chain1, seq, cb_chain1_lst)

        intra_chain_dist = get_cb_contacts(gapped_first)
        ref_contact_map = intra_chain_dist < cb_cutoff

        ali_first = get_global_align_from_pdb(pdb_filename, chain1, seq)[-1][0]

        PPV, TP, FP = get_ppv_helper(
            contacts_x, contacts_y, ref_contact_map, ali_first
        )

        if print_dist:
            print_distances(
                contacts_x,
                contacts_y,
                scores,
                intra_chain_dist,
                ali_first,
                outfilename
            )

    # A non-empty name requests a labelled result line.
    if name:
        print("%s\n" % ("----------------------------------"))
        print("%s %s %s %s" % (name, PPV, TP, FP))
    else:
        print("Finished")
    return (pdb_filename, PPV, TP, FP)
Beispiel #5
0
#!/usr/bin/env python
import sys

sys.path.append("/home/x_arnel/git/bioinfo-toolbox/")
from parsing import parse_fasta
from parsing import parse_contacts

# Positional command-line arguments (all of them arrive as strings).
sfile = sys.argv[1]  # FASTA file handed to parse_fasta.read_fasta below
cfile = sys.argv[2]  # contact file handed to parse_contacts.parse below
target = sys.argv[3]  # interpolated into the "TARGET %s" header line
server = sys.argv[4]  # selects which header block is written
ofilepath = sys.argv[5]  # output path, opened for writing below
minsep = sys.argv[6]  # not used in the visible part of the script
minscore = sys.argv[7]  # not used in the visible part of the script

# First sequence stored under the first FASTA entry.
# NOTE(review): indexing .items() directly only works on Python 2.
seq = parse_fasta.read_fasta(open(sfile)).items()[0][1][0]

# presumably min_dist=0 turns off sequence-separation filtering -- TODO confirm
contacts = parse_contacts.parse(open(cfile), min_dist=0)

# Diagnostic output: number of contacts, the first contact, the sequence.
print len(contacts)
print contacts[0]
print seq

# NOTE(review): no close() for this handle is visible in this part of the script.
ofile = open(ofilepath, 'w')

if server == "Pcons-net":
    ofile.write(
        "PFRMAT RR\nTARGET %s\nAUTHOR 5450-4562-0389\nMETHOD Pcons-net\nREMARK PconsC3\nMETHOD Improved contact predictions on\nMETHOD small protein families.\nMODEL  1\n"
        % target)
elif server == "PconsC2":
    ofile.write(
import sys
import os
import shutil
import errno
import stat

from parsing import parse_fasta

if __name__ == '__main__':
    # Split a multi-record FASTA file (first command-line argument) into one
    # '<accession>.fa' file per record, written to the current directory.
    # The accession is the first whitespace-separated token of the header,
    # and only the first sequence stored for a header is written.
    with open(sys.argv[1], 'r') as infile:
        seq_dict = parse_fasta.read_fasta(infile)

    # .items() works on both Python 2 and Python 3 (iteritems() does not).
    for header, seq_lst in seq_dict.items():
        header_fields = header.split()
        if not header_fields:
            # No accession to derive a file name from; report and move on
            # instead of failing with an IndexError.
            sys.stderr.write('Skipping record with empty header\n')
            continue
        acc = header_fields[0]

        with open(acc + '.fa', 'w') as outfile:
            outfile.write('>%s\n%s\n' % (header, seq_lst[0]))
Beispiel #7
0
def get_ppv(fasta_filename, c_filename, pdb_filename, factor=1.0,
        min_score=-1.0, chain='', sep=' ', outfilename='', name='', noalign=False, min_dist=5, print_dist=False):
    """Compute the PPV of predicted contacts against a PDB structure.

    Args:
        fasta_filename: FASTA file; its first sequence is the reference.
        c_filename: predicted-contact file (entries indexed as
            ``(score, i, j)`` with 1-based residue numbers).
        pdb_filename: reference structure.
        factor: when ``min_score`` is left at ``-1.0``, selection stops once
            ``factor * sequence_length`` contacts have been kept.
        min_score: selection stops at the first contact scoring below this
            value (that contact is still kept if it passes the separation
            filter, as in the original implementation).
        chain: chain identifier handed to the ``parse_pdb`` readers.
        sep: field separator handed to ``parse_contacts.parse``.
        outfilename: forwarded to ``print_distances`` as ``outfile``.
        name: label for the result line; when empty the FASTA and contact
            file names are printed instead.
        noalign: use the PDB CB list as is instead of aligning the PDB
            sequence to the FASTA sequence first.
        min_dist: minimum sequence separation of a kept contact.
        print_dist: call ``print_distances`` (alignment mode only).

    Returns:
        Tuple ``(pdb_filename, PPV, TP, FP)`` with the last three values
        coming from ``get_ppv_helper``.
    """

    def _from_pdb(parser):
        # Run one parse_pdb reader on the structure and close the file.
        with open(pdb_filename, 'r') as pdb_file:
            return parser(pdb_file, chain)

    ### get sequence
    with open(fasta_filename, 'r') as fasta_file:
        seq = list(parse_fasta.read_fasta(fasta_file).values())[0][0]
    ref_len = len(seq)

    ### get top ranked predicted contacts
    with open(c_filename, 'r') as contact_file:
        contacts = parse_contacts.parse(contact_file, sep, min_dist=min_dist)

    contacts_x = []
    contacts_y = []
    scores = []
    max_contacts = ref_len * factor

    for contact in contacts:
        score = contact[0]
        c_x = contact[1] - 1  # convert to 0-based indices
        c_y = contact[2] - 1

        if abs(c_x - c_y) >= min_dist:
            contacts_x.append(c_x)
            contacts_y.append(c_y)
            scores.append(score)

        if min_score == -1.0 and len(contacts_x) >= max_contacts:
            break
        if score < min_score:
            break

    cb_lst = _from_pdb(parse_pdb.get_cb_coordinates)
    cb_cutoff = 8

    if noalign:
        dist_mat = get_cb_contacts(cb_lst)
        ref_contact_map = dist_mat < cb_cutoff
        PPV, TP, FP = get_ppv_helper(contacts_x, contacts_y, ref_contact_map, ref_len, factor)
    else:
        atom_seq = _from_pdb(parse_pdb.get_atom_seq)

        align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -0.5, -0.1)
        atom_seq_ali = align[-1][0]
        seq_ali = align[-1][1]

        # Re-index the CB list onto the FASTA sequence. ali_lst records, for
        # each kept alignment column, the index into cb_lst (-9999 where the
        # structure has a gap); residues absent from the FASTA sequence are
        # skipped.
        gapped_cb_lst = []
        ali_lst = []
        j = 0
        for atom_char, seq_char in zip(atom_seq_ali, seq_ali):
            if atom_char == '-':
                gapped_cb_lst.append(['-'])
                ali_lst.append(-9999)
            elif seq_char == '-':
                j += 1
            else:
                ali_lst.append(j)
                gapped_cb_lst.append(cb_lst[j])
                j += 1

        dist_mat = get_cb_contacts(gapped_cb_lst)
        if print_dist:
            # Area and surface distance are only needed for the report, so
            # the structure is parsed for them only on request.
            area = _from_pdb(parse_pdb.get_area)
            surf = _from_pdb(parse_pdb.get_dist_to_surface)
            print_distances(contacts_x, contacts_y, scores, dist_mat,
                            area, surf, ref_len, ref_len,
                            seq, ali_lst=ali_lst, atom_seq=atom_seq,
                            outfile=outfilename)
        ref_contact_map = dist_mat < cb_cutoff

        PPV, TP, FP = get_ppv_helper(contacts_x, contacts_y, ref_contact_map, ref_len, factor, atom_seq_ali=atom_seq_ali)

    # Single-argument print() emits the same text on Python 2 and Python 3.
    if name:
        print('%s %s %s %s' % (name, PPV, TP, FP))
    else:
        print('%s %s %s %s %s' % (fasta_filename, c_filename, PPV, TP, FP))
    return (pdb_filename, PPV, TP, FP)
Beispiel #8
0
def get_ppv(fasta_filenameA,
            c_filename,
            pdb_filenameA,
            fasta_filenameB,
            pdb_filenameB,
            factor=1.0,
            min_score=-1.0,
            chainA='',
            chainB='',
            sep=' ',
            outfilename='',
            name='',
            noalign=False,
            min_dist=5,
            interfacelen=10,
            print_dist=False,
            cutoff=0.25):
    """Evaluate predicted contacts for a two-chain complex.

    PPV and Z-score values are computed for chain A, chain B, the
    concatenated complex and the interface, printed, and the values for the
    complex are returned.

    Args:
        fasta_filenameA, fasta_filenameB: FASTA files of the two chains.
        c_filename: predicted-contact file (entries indexed as
            ``(score, i, j)`` with 1-based residue numbers).
        pdb_filenameA, pdb_filenameB: structures of the two chains.
        factor: forwarded to ``get_interface_contacts``/``get_ppv_helper``.
        min_score: accepted for interface compatibility; not used here.
        chainA, chainB: chain identifiers for the ``parse_pdb`` readers.
        sep: field separator handed to ``parse_contacts.parse``.
        outfilename: forwarded to ``print_distances`` as ``outfile``.
        name: label for the per-chain result lines; when empty the FASTA
            and contact file names are printed instead.
        noalign: use the PDB CB lists as they are instead of aligning the
            PDB sequences to the FASTA sequence first.
        min_dist: minimum sequence separation of a kept contact.
        interfacelen: forwarded to ``get_ppv_helper`` (per-chain calls) and
            ``get_ppv_helper_interface``.
        print_dist: call ``print_distances`` for the complex.
        cutoff: forwarded to ``get_ppv_helper_interface``.

    Returns:
        Tuple ``(pdb_filenameA, PPV, TP, FP)`` for the concatenated complex.
    """

    def _first_sequence(fasta_filename):
        # First sequence of the first FASTA entry; closes the file.
        with open(fasta_filename, 'r') as handle:
            return list(parse_fasta.read_fasta(handle).values())[0][0]

    def _from_pdb(parser, pdb_filename, chain):
        # Run one parse_pdb reader on a structure and close the file.
        with open(pdb_filename, 'r') as handle:
            return parser(handle, chain)

    def _gap_coordinates(atom_ali, fasta_ali, coords):
        # Re-index coords onto the FASTA side of an alignment. Returns the
        # gapped coordinate list and, per kept column, the index into
        # coords (-9999 where the structure has a gap).
        gapped, index_map = [], []
        j = 0
        for atom_char, fasta_char in zip(atom_ali, fasta_ali):
            if atom_char == '-':
                gapped.append(['-'])
                index_map.append(-9999)
            elif fasta_char == '-':
                j += 1
            else:
                index_map.append(j)
                gapped.append(coords[j])
                j += 1
        return gapped, index_map

    ### get sequence
    seqA = _first_sequence(fasta_filenameA)
    seqB = _first_sequence(fasta_filenameB)
    seq = seqA + seqA  # Actually the contact map sequence is just two copies of seqA

    ref_lenA = len(seqA)
    ref_lenB = len(seqB)
    ref_len = len(seq)

    ### get top ranked predicted contacts
    with open(c_filename, 'r') as contact_file:
        contacts = parse_contacts.parse(contact_file, sep, min_dist=min_dist)

    contacts_x = []
    contacts_y = []
    scores = []
    for contact in contacts:
        c_x = contact[1] - 1  # convert to 0-based indices
        c_y = contact[2] - 1
        if abs(c_x - c_y) >= min_dist:
            contacts_x.append(c_x)
            contacts_y.append(c_y)
            scores.append(contact[0])

    # Chain A and chain B are scored on the same predictions. Independent
    # copies are taken before the interface routine below gets hold of the
    # main lists.
    contactsA_x, contactsA_y, scoresA = (list(contacts_x), list(contacts_y),
                                         list(scores))
    contactsB_x, contactsB_y, scoresB = (list(contacts_x), list(contacts_y),
                                         list(scores))

    cb_lstA = _from_pdb(parse_pdb.get_cb_coordinates, pdb_filenameA, chainA)
    cb_lstB = _from_pdb(parse_pdb.get_cb_coordinates, pdb_filenameB, chainB)
    cb_lst = cb_lstA + cb_lstB
    bfactor = (_from_pdb(parse_pdb.get_area, pdb_filenameA, chainA) +
               _from_pdb(parse_pdb.get_area, pdb_filenameB, chainB))

    atom_seqA = _from_pdb(parse_pdb.get_atom_seq, pdb_filenameA, chainA)
    atom_seqB = _from_pdb(parse_pdb.get_atom_seq, pdb_filenameB, chainB)
    atom_seq = atom_seqA + atom_seqB

    if noalign:
        dist_mat = get_cb_contacts(cb_lst)
        dist_matA = get_cb_contacts(cb_lstA)
        dist_matB = get_cb_contacts(cb_lstB)
        # These names were left unbound in this branch, so every call with
        # noalign=True ended in a NameError further down.
        # NOTE(review): an empty atom_seq_ali is assumed to mean "no
        # alignment" for the helpers, and the index map is taken to be the
        # identity -- confirm against the helper implementations.
        atom_seq_ali = []
        atom_seq_aliA = []
        atom_seq_aliB = []
        ali_lst = list(range(len(cb_lst)))
    else:
        align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -0.5, -0.1)
        alignA = pairwise2.align.globalms(atom_seqA, seqA, 2, -1, -0.5, -0.1)
        alignB = pairwise2.align.globalms(atom_seqB, seqA, 2, -1, -0.5,
                                          -0.1)  # Align to seq A
        atom_seq_ali = align[-1][0]
        atom_seq_aliA = alignA[-1][0]
        atom_seq_aliB = alignB[-1][0]

        gapped_cb_lst, ali_lst = _gap_coordinates(atom_seq_ali, align[-1][1],
                                                  cb_lst)
        gapped_cb_lstA, _ = _gap_coordinates(atom_seq_aliA, alignA[-1][1],
                                             cb_lstA)
        gapped_cb_lstB, _ = _gap_coordinates(atom_seq_aliB, alignB[-1][1],
                                             cb_lstB)

        dist_mat = get_cb_contacts(gapped_cb_lst)
        dist_matA = get_cb_contacts(gapped_cb_lstA)
        dist_matB = get_cb_contacts(gapped_cb_lstB)

    cb_cutoff = 8
    # This routine adds all interface and B chain contacts
    contacts_x, contacts_y, scores = get_interface_contacts(
        contacts_x,
        contacts_y,
        scores,
        dist_mat,
        ref_lenA,
        factor,
        cb_cutoff + 4,
        atom_seq_ali=atom_seq_ali)
    ref_contact_map = dist_mat < cb_cutoff
    ref_contact_mapA = dist_matA < cb_cutoff
    ref_contact_mapB = dist_matB < cb_cutoff

    if print_dist:
        # Surface distances are only needed for the report.
        surf = (_from_pdb(parse_pdb.get_dist_to_surface, pdb_filenameA,
                          chainA) +
                _from_pdb(parse_pdb.get_dist_to_surface, pdb_filenameB,
                          chainB))
        print_distances(contacts_x,
                        contacts_y,
                        scores,
                        dist_mat,
                        bfactor,
                        surf,
                        ref_lenA,
                        ref_lenB,
                        seq,
                        ali_lst=ali_lst,
                        atom_seq=atom_seq,
                        outfile=outfilename)

    Zscore = get_Zscore(contacts_x,
                        contacts_y,
                        ref_contact_map,
                        scores,
                        atom_seq_ali=atom_seq_ali)
    ZscoreA = get_Zscore(contactsA_x,
                         contactsA_y,
                         ref_contact_mapA,
                         scoresA,
                         atom_seq_ali=atom_seq_aliA)
    ZscoreB = get_Zscore(contactsB_x,
                         contactsB_y,
                         ref_contact_mapB,
                         scoresB,
                         atom_seq_ali=atom_seq_aliB)
    ZscoreI = get_Zscore_interface(contacts_x,
                                   contacts_y,
                                   ref_contact_map,
                                   ref_lenA,
                                   ref_lenB,
                                   scores,
                                   atom_seq_ali=atom_seq_ali)

    PPV, TP, FP = get_ppv_helper(contacts_x,
                                 contacts_y,
                                 ref_contact_map,
                                 ref_len,
                                 factor,
                                 atom_seq_ali=atom_seq_ali)
    PPVa, TPa, FPa = get_ppv_helper(contactsA_x,
                                    contactsA_y,
                                    ref_contact_mapA,
                                    interfacelen,
                                    factor,
                                    atom_seq_ali=atom_seq_aliA)
    PPVb, TPb, FPb = get_ppv_helper(contactsB_x,
                                    contactsB_y,
                                    ref_contact_mapB,
                                    interfacelen,
                                    factor,
                                    atom_seq_ali=atom_seq_aliB)
    PPVi, TPi, FPi, PPViE, TPiE, FPiE = get_ppv_helper_interface(
        contacts_x,
        contacts_y,
        ref_contact_map,
        bfactor,
        ref_lenA,
        ref_lenB,
        interfacelen,
        cutoff,
        atom_seq_ali=atom_seq_ali)

    # Single-argument print() emits the same text on Python 2 and Python 3.
    if name:
        print('%s %s %s %s %s' % (name, PPVa, TPa, FPa, ZscoreA))
        print('%s %s %s %s %s' % (name, PPVb, TPb, FPb, ZscoreB))
        print('%s %s %s %s %s' % ("BOTH", PPV, TP, FP, Zscore))
        print('%s %s %s %s %s' % ("Interface", PPVi, TPi, FPi, ZscoreI))
        print('%s %s %s %s' % ("Interface-Exposed", PPViE, TPiE, FPiE))
    else:
        print('%s %s %s %s %s %s' % (fasta_filenameA, c_filename, PPVa, TPa,
                                     FPa, ZscoreA))
        print('%s %s %s %s %s %s' % (fasta_filenameB, c_filename, PPVb, TPb,
                                     FPb, ZscoreB))
        print('%s %s %s %s %s %s' % ("BOTH", c_filename, PPV, TP, FP, Zscore))
        print('%s %s %s %s %s %s' % ("Interface", c_filename, PPVi, TPi, FPi,
                                     ZscoreI))
        print('%s %s %s %s %s' % ("Interface-Exposed", c_filename, PPViE, TPiE,
                                  FPiE))
    print('PPV %s %s %s %s %s %s' % (c_filename, PPV, PPVa, PPVb, PPVi, PPViE))
    print('Zscore %s %s %s %s %s' % (c_filename, Zscore, ZscoreA, ZscoreB,
                                     ZscoreI))
    return (pdb_filenameA, PPV, TP, FP)
Beispiel #9
0
import sys
import argparse
import Bio.PDB
from Bio import pairwise2
from os.path import expanduser
home = expanduser("~")
sys.path.append(home + '/git/bioinfo-toolbox')

from parsing import parse_contacts
from parsing import parse_fasta
from parsing import parse_pdb


# Script entry point: locally align the sequences of two FASTA files and
# print the best alignment score divided by the length of the shorter one.
if __name__ == "__main__":
    p = argparse.ArgumentParser(
        description='Get sequence identity from two fasta files.')
    p.add_argument('fasta_fileA')
    p.add_argument('fasta_fileB')
    args = vars(p.parse_args(sys.argv[1:]))
    fasta_filenameA = args['fasta_fileA']
    fasta_filenameB = args['fasta_fileB']

    # Read each input inside a ``with`` block so the handle is closed again.
    # list(...) keeps the positional lookup working where dict.values()
    # returns a view instead of a list.
    with open(fasta_filenameA, 'r') as handle:
        seqA = list(parse_fasta.read_fasta(handle).values())[0][0]
    with open(fasta_filenameB, 'r') as handle:
        seqB = list(parse_fasta.read_fasta(handle).values())[0][0]

    # The score is normalised by the shorter sequence, so an empty sequence
    # would otherwise end in a division by zero.
    minlen = min(len(seqA), len(seqB))
    if minlen == 0:
        sys.stderr.write('Cannot compute identity: empty sequence.\n')
        sys.exit(1)

    # Local alignment: match 1, mismatch -1, gap open -0.5, gap extend -0.1.
    align = pairwise2.align.localms(seqA, seqB, 1, -1, -0.5, -0.1)

    # align[0][2] is the score of the first reported alignment.  An earlier
    # variant normalised by the aligned span (align[0][4] - align[0][3])
    # instead of the shorter sequence length.
    print('%s %s %s %s' % ("Identity: ", fasta_filenameA, fasta_filenameB,
                           float(align[0][2]) / float(minlen)))

Beispiel #10
0
def get_dist(fasta_filename,
             c_filename,
             pdb_filename,
             chain='',
             sep='',
             outfilename='',
             noalign=False,
             dist_type='CB'):
    """Look up the structural distance of every predicted contact.

    All contacts parsed from ``c_filename`` are kept (there is no top-N
    cut-off here); a distance matrix is built from ``pdb_filename`` and
    ``get_dist_helper`` returns one distance per contact.

    Args:
        fasta_filename: FASTA file; the first element of the first value
            returned by ``parse_fasta.read_fasta`` is used as the sequence.
        c_filename: contact file passed to ``parse_contacts.parse`` together
            with ``sep`` and ``min_dist=5``.
        pdb_filename: structure file passed to the ``parse_pdb`` readers.
        chain: forwarded unchanged to the ``parse_pdb`` readers.
        sep: forwarded unchanged to ``parse_contacts.parse``.
        outfilename: when non-empty, one ``x y score distance`` line per
            contact is written to this path.
        noalign: when true the coordinate lists are used as read; otherwise
            the structure sequence is globally aligned to the FASTA sequence
            and the coordinate lists are re-indexed along that alignment.
        dist_type: ``'CB'`` or ``'CA'`` build the matrix with
            ``get_dist_mat`` from CB / CA coordinates; any other value uses
            ``get_dist_mat_heavy`` on the full residue coordinates.

    Returns:
        Tuple ``(contacts_x, contacts_y, scores, contacts_dist)``.  The
        positions are the parsed positions minus one (presumably converting
        1-based file positions to 0-based indices -- confirm against
        ``parse_contacts``).
    """
    ### get sequence
    with open(fasta_filename, 'r') as fasta_file:
        # list(...) keeps the lookup working when values() returns a view
        seq = list(parse_fasta.read_fasta(fasta_file).values())[0][0]

    ### get all predicted contacts
    with open(c_filename, 'r') as c_file:
        contacts = parse_contacts.parse(c_file, sep, min_dist=5)

    # each entry is indexed as (score, position_x, position_y)
    scores = [contact[0] for contact in contacts]
    contacts_x = [contact[1] - 1 for contact in contacts]
    contacts_y = [contact[2] - 1 for contact in contacts]

    ### read coordinates; every reader gets its own freshly opened handle
    with open(pdb_filename, 'r') as pdb_file:
        res_lst = parse_pdb.get_coordinates(pdb_file, chain)
    with open(pdb_filename, 'r') as pdb_file:
        cb_lst = parse_pdb.get_cb_coordinates(pdb_file, chain)
    with open(pdb_filename, 'r') as pdb_file:
        ca_lst = parse_pdb.get_ca_coordinates(pdb_file, chain)

    atom_seq_ali = None
    if not noalign:
        with open(pdb_filename, 'r') as pdb_file:
            atom_seq = parse_pdb.get_atom_seq(pdb_file, chain)

        align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -0.5, -0.1)
        atom_seq_ali = align[-1][0]
        seq_ali = align[-1][1]

        j = 0  # index into the ungapped coordinate lists
        gapped_res_lst = []
        gapped_cb_lst = []
        gapped_ca_lst = []

        for atom_char, seq_char in zip(atom_seq_ali, seq_ali):
            if atom_char == '-':
                # gap in the structure sequence: keep a placeholder
                gapped_res_lst.append('-')
                gapped_cb_lst.append('-')
                gapped_ca_lst.append('-')
            elif seq_char == '-':
                # structure residue aligned to a gap in the FASTA
                # sequence: skip its coordinates
                j += 1
            else:
                gapped_res_lst.append(res_lst[j])
                gapped_cb_lst.append(cb_lst[j])
                gapped_ca_lst.append(ca_lst[j])
                j += 1

        assert (len(gapped_ca_lst) == len(gapped_cb_lst) ==
                len(gapped_res_lst))

        res_lst = gapped_res_lst
        cb_lst = gapped_cb_lst
        ca_lst = gapped_ca_lst

    if dist_type == 'CB':
        dist_mat = get_dist_mat(cb_lst)
    elif dist_type == 'CA':
        dist_mat = get_dist_mat(ca_lst)
    else:
        dist_mat = get_dist_mat_heavy(res_lst)

    if noalign:
        contacts_dist = get_dist_helper(contacts_x, contacts_y, dist_mat)
    else:
        contacts_dist = get_dist_helper(contacts_x,
                                        contacts_y,
                                        dist_mat,
                                        atom_seq_ali=atom_seq_ali)

    assert (len(contacts_dist) == len(contacts_x) == len(contacts_y) ==
            len(scores))

    if outfilename:
        with open(outfilename, 'w') as outfile:
            for c_x, c_y, score, dist in zip(contacts_x, contacts_y, scores,
                                             contacts_dist):
                outfile.write('%s %s %f %f\n' % (c_x, c_y, score, dist))

    return (contacts_x, contacts_y, scores, contacts_dist)
Beispiel #11
0
sys.path.append("/scratch/mirco_local/bioinfo-toolbox")
from parsing import parse_fasta
from parsing import parse_contacts

# Command-line script: reads a sequence file and a contact file, prints a
# few diagnostics and starts an RR-format ("PFRMAT RR") output file.
if len(sys.argv) != 5:
    sys.stderr.write("Incorrect number of command line arguments.\n")
    sys.stderr.write("Usage: " + sys.argv[0] + " <sequence file> <contact file> <CASP target ID> <output filename>\n\n")
    # a usage error must not look like success to the calling shell
    sys.exit(1)


sfile = sys.argv[1]
cfile = sys.argv[2]
target = sys.argv[3]

# list(...) keeps the positional lookup working where dict.items() returns
# a view; the input handles are closed as soon as parsing is done.
with open(sfile) as seq_handle:
    seq = list(parse_fasta.read_fasta(seq_handle).items())[0][1][0]

with open(cfile) as contact_handle:
    contacts = parse_contacts.parse(contact_handle, min_dist=0)

# diagnostics: number of contacts, first contact entry, sequence
print(len(contacts))
print(contacts[0])
print(seq)

# NOTE(review): ofile is left open here on purpose -- the visible part of
# the script only writes the header; confirm that the remainder writes the
# contact lines and closes the file.
ofile = open(sys.argv[4], "w")

ofile.write(
    "PFRMAT RR\nTARGET %s\nAUTHOR 6685-2065-9124\nMETHOD Pcons-net\nREMARK PconsC2\nMETHOD Improved contact predictions using the\nMETHOD recognition of protein like contact\nMETHOD patterns.\nMODEL  1\n"
    % target
)

tmp_i = 1
Beispiel #12
0
def _select_top_contacts(contacts, max_count, min_sep=5):
    """Collect the leading contacts whose positions are >= min_sep apart.

    ``contacts`` entries are indexed as (score, position_x, position_y);
    positions are shifted down by one.  Iteration stops once ``max_count``
    contacts were collected.  The limit is checked after every entry, so at
    least one entry is always examined.

    Returns:
        Tuple of lists ``(xs, ys, scores)``.
    """
    xs, ys, scores = [], [], []
    for contact in contacts:
        c_x = contact[1] - 1
        c_y = contact[2] - 1
        if abs(c_x - c_y) >= min_sep:
            xs.append(c_x)
            ys.append(c_y)
            scores.append(contact[0])
        if len(xs) >= max_count:
            break
    return xs, ys, scores


def _save_as_pdf(fig, path):
    """Write ``fig`` as a PDF to ``path``, closing the PDF even on error."""
    pp = PdfPages(path)
    try:
        pp.savefig(fig)
    finally:
        pp.close()


def plot_map(fasta_filename, c_filename, factor, c2_filename='',
             psipred_horiz_fname='', psipred_vert_fname='', pdb_filename='',
             is_heavy=False, chain='', sep=',', outfilename=''):
    """Draw a contact map for the top predicted contacts and save it.

    Args:
        fasta_filename: FASTA file of the target; the first element of the
            first value returned by ``parse_fasta.read_fasta`` is used.  The
            first four characters of its base name label the plot title.
        c_filename: contact file parsed with ``parse_contacts.parse``.
        factor: contact selection stops after ``len(sequence) * factor``
            contacts with a position difference of at least 5.
        c2_filename: optional second contact file.  When given, its contacts
            are drawn with x/y swapped and the first map is drawn once;
            otherwise the first map is drawn twice, once with x/y swapped.
        psipred_horiz_fname: optional file for ``parse_psipred.horizontal``;
            takes precedence over ``psipred_vert_fname``.
        psipred_vert_fname: optional file for ``parse_psipred.vertical``.
            'H' and 'E' states are marked on the diagonal.
        pdb_filename: optional structure file.  When given, reference
            contacts are drawn in grey, the predicted contacts are coloured
            by ``get_tp_colors`` and the last PPV/TP/FP values from
            ``get_ppvs`` are printed.
        is_heavy: use ``get_heavy_contacts`` with cutoff 5 instead of
            ``get_cb_contacts`` with cutoff 8 for the reference contacts.
        chain: forwarded unchanged to the ``parse_pdb`` readers.
        sep: forwarded unchanged to ``parse_contacts.parse``.
        outfilename: output path.  '.pdf' is written as PDF, '.png' /
            '.jpg' / '.jpeg' via ``plt.savefig``; any other name gets
            '.pdf' appended.  When empty, '<c_filename>_ContactMap.pdf' is
            written.

    Raises:
        AssertionError: if the secondary-structure string and the sequence
            differ in length.
    """
    from os.path import basename

    # Derive the label from the file name only, so that dots or directory
    # names in the leading path cannot end up in the plot title.
    acc = basename(fasta_filename).split('.')[0][:4]

    ### get sequence
    with open(fasta_filename, 'r') as fasta_file:
        # list(...) keeps the lookup working when values() returns a view
        seq = list(parse_fasta.read_fasta(fasta_file).values())[0][0]
    ref_len = len(seq)
    max_contacts = ref_len * factor

    ### get top "factor" * "ref_len" predicted contacts
    with open(c_filename, 'r') as c_file:
        contacts = parse_contacts.parse(c_file, sep)
    contacts_x, contacts_y, scores = _select_top_contacts(contacts,
                                                          max_contacts)

    ### start plotting
    fig = plt.figure()
    ax = fig.add_subplot(111)

    ### plot secondary structure on the diagonal if given
    if psipred_horiz_fname or psipred_vert_fname:
        if psipred_horiz_fname:
            with open(psipred_horiz_fname, 'r') as ss_file:
                ss = parse_psipred.horizontal(ss_file)
        else:
            with open(psipred_vert_fname, 'r') as ss_file:
                ss = parse_psipred.vertical(ss_file)

        assert len(ss) == ref_len

        # only 'H' and 'E' states are marked; everything else is left blank
        for i, state in enumerate(ss):
            if state == 'H':
                plt.plot(i, i, 'o', c='#8B0043', mec="#8B0043", markersize=2)
            elif state == 'E':
                plt.plot(i, i, 'D', c='#0080AD', mec="#0080AD", markersize=2)

    ### plot reference contacts in the background if given
    if pdb_filename:
        # every reader gets its own freshly opened handle
        with open(pdb_filename, 'r') as pdb_file:
            res_lst = parse_pdb.get_coordinates(pdb_file, chain)
        with open(pdb_filename, 'r') as pdb_file:
            cb_lst = parse_pdb.get_cb_coordinates(pdb_file, chain)
        with open(pdb_filename, 'r') as pdb_file:
            atom_seq = parse_pdb.get_atom_seq(pdb_file, chain)

        align = pairwise2.align.globalms(atom_seq, seq, 2, -1, -0.5, -0.1)

        atom_seq_ali = align[-1][0]
        seq_ali = align[-1][1]

        j = 0  # index into the ungapped coordinate lists
        gapped_res_lst = []
        gapped_cb_lst = []

        for atom_char, seq_char in zip(atom_seq_ali, seq_ali):
            if atom_char == '-':
                # gap in the structure sequence: keep a placeholder
                gapped_res_lst.append('-')
                gapped_cb_lst.append('-')
            elif seq_char == '-':
                # structure residue aligned to a gap in the FASTA
                # sequence: skip its coordinates
                j += 1
            else:
                gapped_res_lst.append(res_lst[j])
                gapped_cb_lst.append(cb_lst[j])
                j += 1

        if is_heavy:
            dist_mat = get_heavy_contacts(gapped_res_lst)
            cutoff = 5
        else:
            dist_mat = get_cb_contacts(gapped_cb_lst)
            cutoff = 8
        ref_contact_map = dist_mat < cutoff
        ref_contacts = np.where(ref_contact_map)

        ref_contacts_x = ref_contacts[0]
        ref_contacts_y = ref_contacts[1]

        PPVs, TPs, FPs = get_ppvs(contacts_x, contacts_y, ref_contact_map,
                                  atom_seq_ali, ref_len, factor)
        tp_colors = get_tp_colors(contacts_x, contacts_y, ref_contact_map,
                                  atom_seq_ali)

        print('%s %s %s %s' % (pdb_filename, PPVs[-1], TPs[-1], FPs[-1]))

        ax.scatter(ref_contacts_x, ref_contacts_y, marker='o', c='#CCCCCC',
                   lw=0, edgecolor='#CCCCCC')

    ### plot predicted contacts from second contact map if given
    if c2_filename:
        with open(c2_filename, 'r') as c2_file:
            contacts2 = parse_contacts.parse(c2_file, sep)
        contacts2_x, contacts2_y, _ = _select_top_contacts(contacts2,
                                                           max_contacts)

        ### use TP/FP color coding if reference contacts given
        if pdb_filename:
            PPVs2, TPs2, FPs2 = get_ppvs(contacts2_x, contacts2_y,
                                         ref_contact_map, atom_seq_ali,
                                         ref_len, factor)
            tp2_colors = get_tp_colors(contacts2_x, contacts2_y,
                                       ref_contact_map, atom_seq_ali)
            print('%s %s %s %s' % (pdb_filename, PPVs2[-1], TPs2[-1],
                                   FPs2[-1]))
            fig.suptitle('%s\nPPV (upper left) = %.2f | PPV (lower right) = %.2f' % (acc, PPVs[-1], PPVs2[-1]))
            ax.scatter(contacts2_y[::-1], contacts2_x[::-1], marker='o',
                       c=tp2_colors[::-1], s=6, alpha=0.75, linewidths=0.0)
            ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o',
                       c=tp_colors[::-1], s=6, alpha=0.75, linewidths=0.0)
        else:
            ax.scatter(contacts2_y[::-1], contacts2_x[::-1], marker='o',
                       c='#D70909', edgecolor='#D70909', s=4, linewidths=0.5)
            ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o',
                       c='#004F9D', edgecolor='#004F9D', s=4, linewidths=0.5)

    ### plot predicted contacts from first contact map on both triangles
    ### if no second contact map given
    else:
        if pdb_filename:
            fig.suptitle('%s\nPPV = %.2f' % (acc, PPVs[-1]))
            ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o',
                       c=tp_colors[::-1], s=6, alpha=0.75, linewidths=0.0)
            ax.scatter(contacts_y[::-1], contacts_x[::-1], marker='o',
                       c=tp_colors[::-1], s=6, alpha=0.75, linewidths=0.0)
        else:
            ax.scatter(contacts_x[::-1], contacts_y[::-1], marker='o',
                       c=scores[::-1], s=4, alpha=0.75, cmap=cm.jet,
                       linewidths=0.1)
            sc = ax.scatter(contacts_y[::-1], contacts_x[::-1], marker='o',
                            c=scores[::-1], s=4, alpha=0.75, cmap=cm.jet,
                            linewidths=0.1)
            plt.colorbar(sc)

    plt.gca().set_xlim([0, ref_len])
    plt.gca().set_ylim([0, ref_len])

    ### save the figure; PDF is the default output format
    if not outfilename:
        _save_as_pdf(fig, '%s_ContactMap.pdf' % c_filename)
    elif outfilename.endswith('.pdf'):
        _save_as_pdf(fig, outfilename)
    elif outfilename.endswith(('.png', '.jpg', '.jpeg')):
        plt.savefig(outfilename)
    else:
        _save_as_pdf(fig, '%s.pdf' % outfilename)