Example #1
0
def pdb_to_json(text, name, parser=None):
    '''
    Create a graph-layout description of a pdb file which
    presumably contains some RNA.

    The text is written to a temporary file and parsed. Every protein chain
    becomes one "molecule" entry with a single uid, every RNA chain becomes
    one entry with one uid per sequence position. Contacts between chains
    (as reported by ftup.interchain_contacts) are returned as links between
    those uids. Chains that are neither protein nor RNA are ignored.

    :param text: The text of the pdb file.
    :param name: The name of the pdb file. It prefixes every molecule
                 header ("<name>_<chain id>").
    :param parser: The PDB parser to use (Bio.PDB.PDBParser or
                   Bio.PDB.MMCIFParser). When None, a Bio.PDB.PDBParser
                   is created.
    :return: A dictionary with the keys "molecules" (list of per-chain
             dictionaries) and "extra_links" (list of link dictionaries).
    '''
    if parser is None:
        # The default used to crash with an AttributeError on
        # parser.get_structure; fall back to the plain PDB parser instead.
        from Bio.PDB import PDBParser
        parser = PDBParser()

    with fus.make_temp_directory() as output_dir:
        fname = op.join(output_dir, '{}.pdb'.format(name))

        # dump the pdb text to a temporary file; leaving the with-block
        # closes (and thereby flushes) it before anything reads it back
        with open(fname, 'w') as f:
            f.write(text)

        struct = parser.get_structure('temp', fname)

        molecules = []

        proteins = set()
        rnas = set()

        cgs = dict()

        for chain in struct.get_chains():
            # create a molecule description for each chain in the pdb file
            header = "{}_{}".format(name, chain.id)

            if ftup.is_protein(chain):
                # sys.stderr.write works under Python 2 and 3, unlike the
                # former "print >> sys.stderr" statement
                sys.stderr.write("protein {}\n".format(chain))
                proteins.add(chain.id)
                # a protein is represented by a single node
                molecules.append({
                    "type": "protein",
                    "header": header,
                    "seq": "",
                    "ss": "",
                    "size": len(chain.get_list()),
                    "uids": [uuid.uuid4().hex]
                })
            elif ftup.is_rna(chain):
                sys.stderr.write("rna {}\n".format(chain))
                rnas.add(chain.id)
                # process RNA molecules (hopefully)
                # the positions are derived from the structure loaded with
                # pseudoknots removed ...
                pk_free_cg = ftmc.from_pdb(fname,
                                           chain_id=chain.id,
                                           remove_pseudoknots=True,
                                           parser=parser)
                positions = fasta_to_positions(pk_free_cg.to_fasta_string())
                # ... while sequence and dot-bracket string come from the
                # structure loaded with pseudoknots kept
                cg = ftmc.from_pdb(fname,
                                   chain_id=chain.id,
                                   remove_pseudoknots=False,
                                   parser=parser)

                cgs[chain.id] = cg
                molecules.append({
                    "type": "rna",
                    "header": header,
                    "seq": cg.seq,
                    "ss": cg.to_dotbracket_string(),
                    "size": cg.seq_length,
                    "uids": [uuid.uuid4().hex for _ in range(cg.seq_length)],
                    "positions": positions
                })
            # else: hetatm type chains which are present in MMCIF files
            # are skipped

        # create a lookup table linking "<header>_<1-based index>" to the
        # uid of that nucleotide (or of the protein, which has index 1)
        node_ids = dict()
        for m in molecules:
            for i, uid in enumerate(m['uids']):
                node_ids["{}_{}".format(m['header'], i + 1)] = uid

        def nucleotide_uid(chain_id, atom):
            # get the index of this nucleotide in the secondary structure
            sid = cgs[chain_id].seq_ids.index(atom.parent.id)
            return node_ids["{}_{}_{}".format(name, chain_id, sid + 1)]

        def protein_uid(chain_id):
            return node_ids["{}_{}_{}".format(name, chain_id, 1)]

        links = []
        for (a1, a2) in ftup.interchain_contacts(struct):
            if (a1.parent.id[0] != ' ' or a2.parent.id[0] != ' '):
                # hetatm's will be ignored for now
                continue

            chain1 = a1.parent.parent.id
            chain2 = a2.parent.parent.id

            # links refer to nodes by uid; for protein-RNA contacts the
            # nucleotide is always the source and the protein the target
            if (chain1 in proteins and chain2 in rnas):
                links.append({
                    "source": nucleotide_uid(chain2, a2),
                    "target": protein_uid(chain1),
                    "link_type": "protein_chain",
                    "value": 3
                })
            elif (chain2 in proteins and chain1 in rnas):
                links.append({
                    "source": nucleotide_uid(chain1, a1),
                    "target": protein_uid(chain2),
                    "link_type": "protein_chain",
                    "value": 3
                })
            elif (chain2 in rnas and chain1 in rnas):
                links.append({
                    "source": nucleotide_uid(chain1, a1),
                    "target": nucleotide_uid(chain2, a2),
                    "link_type": "chain_chain",
                    "value": 3
                })

        return {"molecules": molecules, "extra_links": links}
Example #2
0
def plot_pdb(filename, ax=None):
    """
    Plot the RNA chains of a pdb file together with their contacts.

    Every RNA chain is drawn with plot_rna. Contacts between two RNA chains
    are drawn as lines ('k-', alpha 0.5) between the two nucleotides. Each
    protein chain that contacts at least one nucleotide is drawn as a grey
    circle at the centroid of those nucleotides, with a line to each of
    them. Proteins without any recorded RNA contact are not drawn.

    :param filename: The path of the pdb file to plot.
    :param ax: Optional. The axes handed to plot_rna for the first RNA
               chain; every further chain is drawn on the axes returned
               by the previous plot_rna call.
    :return: An Axes object (ax): the one returned by the last plot_rna
             call, or the `ax` argument unchanged if the file contains
             no RNA chain.
    """

    structure = Bio.PDB.PDBParser().get_structure('blah', filename)
    model = list(structure)[0]
    # NOTE: `ax` must not be reset here, otherwise the caller's axes
    # would be silently ignored
    chain_coords = {}
    cgs = {}

    import collections as col

    # store a list of RNA nucleotides that each protein interacts with
    protein_interactions = col.defaultdict(set)
    protein_circles = []

    for chain in model:
        # iterate over RNAs
        if ftup.is_rna(chain):
            # convert to cg and store so that we can convert pdb nucleotide
            # ids to secondary structure indexes later
            cg = ftmc.from_pdb(filename, chain_id=chain.id)
            cgs[chain.id] = cg

            # plot the structure and store the coordinates
            (ax, coords) = plot_rna(cg, offset=True, ax=ax)
            chain_coords[chain.id] = coords

    for (a1, a2) in ftup.interchain_contacts(structure):
        # iterate over all the interactions in order to find out which
        # nucleotides this protein interacts with
        chain1 = a1.parent.parent
        chain2 = a2.parent.parent

        if ftup.is_protein(chain1) and ftup.is_rna(chain2):
            # collect all the RNA nucleotides that a protein interacts with
            sid = cgs[chain2.id].seq_ids.index(a2.parent.id)
            protein_interactions[chain1.id].add((chain2.id, sid))

        if ftup.is_rna(chain1) and ftup.is_rna(chain2):
            sid1 = cgs[chain1.id].seq_ids.index(a1.parent.id)
            sid2 = cgs[chain2.id].seq_ids.index(a2.parent.id)

            coord1 = chain_coords[chain1.id][sid1]
            coord2 = chain_coords[chain2.id][sid2]

            ax.plot([coord1[0], coord2[0]], [coord1[1], coord2[1]],
                    'k-', alpha=0.5)

    for chain in model:
        # draw each protein and the links that it has to other nucleotides
        if not ftup.is_protein(chain):
            continue

        # the protein will be positioned at the centroid of the nucleotides
        # that it interacts with
        interacting_coords = [np.array(chain_coords[chain_id][nuc_num])
                              for (chain_id, nuc_num) in protein_interactions[chain.id]]

        if not interacting_coords:
            # without contacts there is no centroid to place the protein
            # at; np.sum([]) / 0 would yield a scalar nan and the indexing
            # below would fail
            continue

        centroid = np.sum(interacting_coords, axis=0) / len(interacting_coords)

        # the size of the circle representing it grows with the square
        # root of the number of entries in the chain
        radius = 2 * math.sqrt(len(chain.get_list()))
        protein_circles.append([centroid[0], centroid[1], radius])

        # draw all of the interactions as lines
        for coord in interacting_coords:
            ax.plot([coord[0], centroid[0]], [coord[1], centroid[1]], 'k-', alpha=0.5)

    protein_circles = np.array(protein_circles)
    if len(protein_circles) > 0:
        circles(protein_circles[:,0], protein_circles[:,1], protein_circles[:,2], 'grey', alpha=0.5)

    #plt.axis('off')

    return ax
Example #3
0
def pdb_to_json(text, name):
    '''
    Create a graph-layout displaying a pdb file which
    presumably contains some RNA

    The text is the contents of the pdb file. It is written to a temporary
    file and parsed with Bio.PDB's PDBParser. Every protein chain becomes a
    graph json with a single node; every other chain is treated as RNA and
    converted with bg_to_json. Contacts between chains are added as links.

    :param text: The text of the pdb file.
    :param name: The name of the pdb file. It prefixes the struct_name of
                 the protein nodes ("<name>_<chain id>") and is used to
                 look up the nodes of the RNA chains.
    :return: A dictionary with the keys "jsons" (the per-chain graph jsons
             followed by one json holding only the inter-chain links) and
             "extra_links" (the same inter-chain links).
    '''
    with fus.make_temp_directory() as output_dir:
        fname = op.join(output_dir, '{}.pdb'.format(name))

        # dump the pdb text to a temporary file. The file has to be closed
        # (and thereby flushed) before it is parsed: the former bare
        # `f.flush` was never called, so the parser could see an incomplete
        # file.
        with open(fname, 'w') as f:
            f.write(text)

        struct = bpdb.PDBParser().get_structure('temp', fname)

        jsons = []

        proteins = set()
        rnas = set()

        cgs = dict()

        for chain in struct.get_chains():
            # create a graph json for each structure in the pdb file
            if ftup.is_protein(chain):
                proteins.add(chain.id)
                # process protein
                jsons.append({
                    "nodes": [{
                        "group": 2,
                        "struct_name": "{}_{}".format(name, chain.id),
                        "id": 1,
                        "size": len(chain.get_list()),
                        "name": chain.id,
                        "node_type": "protein"
                    }],
                    "links": []
                })
            else:
                rnas.add(chain.id)
                # process RNA molecules (hopefully)
                cg = ftmc.from_pdb(fname, chain_id=chain.id)
                cgs[chain.id] = cg
                jsons.append(bg_to_json(cg))

        # create a lookup table to find out the index of each node in
        # what will eventually become the large list of nodes
        counter = 0
        node_ids = dict()
        for j in jsons:
            for n in j['nodes']:
                node_ids["{}_{}".format(n['struct_name'], n['id'])] = counter
                counter += 1

        def nucleotide_index(chain_id, atom):
            # get the index of this nucleotide in the secondary structure
            sid = cgs[chain_id].seq_ids.index(atom.parent.id)
            return node_ids["{}_{}_{}".format(name, chain_id, sid + 1)]

        def protein_index(chain_id):
            return node_ids["{}_{}_{}".format(name, chain_id, 1)]

        links = []
        for (a1, a2) in ftup.interchain_contacts(struct):
            if (a1.parent.id[0] != ' ' or a2.parent.id[0] != ' '):
                # hetatm's will be ignored for now
                continue

            chain1 = a1.parent.parent.id
            chain2 = a2.parent.parent.id

            # the source and target values below need to be reduced by the
            # length of the nodes array because when the jsons are added to
            # the graph, the link source and target are incremented so as to
            # correspond to the new indices of the nodes
            # so a link to a node at position 10, if there are 50 nodes,
            # will have to have a source value of -40
            if (chain1 in proteins and chain2 in rnas):
                links.append({
                    "source": nucleotide_index(chain2, a2) - counter,
                    "target": protein_index(chain1) - counter,
                    "link_type": "protein_chain",
                    "value": 3
                })
            elif (chain2 in proteins and chain1 in rnas):
                links.append({
                    "source": nucleotide_index(chain1, a1) - counter,
                    "target": protein_index(chain2) - counter,
                    "link_type": "protein_chain",
                    "value": 3
                })
            elif (chain2 in rnas and chain1 in rnas):
                links.append({
                    "source": nucleotide_index(chain1, a1) - counter,
                    "target": nucleotide_index(chain2, a2) - counter,
                    "link_type": "chain_chain",
                    "value": 3
                })

        # the inter-chain links are shipped both as a node-less json and
        # under "extra_links"
        jsons.append({"nodes": [], "links": links})
        return {"jsons": jsons, "extra_links": links}
Example #4
0
    def test_interchain_contacts(self):
        """Smoke test: interchain_contacts runs on the 1MFQ test structure."""
        pdb_path = 'test/forgi/threedee/data/1MFQ.pdb'
        pdb_parser = bpdb.PDBParser()
        struct = pdb_parser.get_structure("temp", pdb_path)

        # only checks that the call does not raise; the result is not inspected
        ftup.interchain_contacts(struct)
Example #5
0
    def test_interchain_contacts(self):
        """Check that interchain_contacts can be called on the parsed 1MFQ structure."""
        structure_file = 'test/forgi/threedee/data/1MFQ.pdb'
        parser = bpdb.PDBParser()
        struct = parser.get_structure("temp", structure_file)

        # the return value is deliberately discarded: this is a does-not-raise test
        ftup.interchain_contacts(struct)
Example #6
0
 def test_interchain_contacts(self):
     """Smoke test: interchain_contacts runs on 1MFQ, with parser warnings silenced."""
     pdb_path = 'test/forgi/threedee/data/1MFQ.pdb'
     # warnings are only suppressed while the structure is being parsed
     with warnings.catch_warnings():
         warnings.simplefilter("ignore")
         pdb_parser = bpdb.PDBParser()
         struct = pdb_parser.get_structure("temp", pdb_path)
     ftup.interchain_contacts(struct)
Example #7
0
def plot_pdb(filename, ax=None):
    """
    Plot the secondary structure of the RNA chains in a PDB file and
    indicate interchain RNA-RNA and RNA-protein interactions.

    Every chain for which ftup.contains_rna is true is drawn with plot_rna.
    Interchain RNA-RNA interactions are drawn as lines ('k-', alpha 0.5)
    between the two nucleotides. Proteins that contact at least one
    nucleotide are shown as transparent grey circles placed at the centroid
    of the interacting nucleotides, with a line to each of them. The circle
    radius is 2 * sqrt(len(chain.get_list())), i.e. it grows with the
    length of the protein chain. Proteins without any recorded RNA contact
    are not drawn.

    :param filename: The path of the PDB file to plot.
    :param ax: Optional. A matplotlib axis object, handed to plot_rna.
    :return: An Axes object (ax): the one returned by the last plot_rna
             call, or the `ax` argument unchanged if no chain contains RNA.
    """

    structure = Bio.PDB.PDBParser().get_structure('blah', filename)
    model = list(structure)[0]
    chain_coords = {}
    cgs = {}

    import collections as col

    # store a list of RNA nucleotides that each protein interacts with
    protein_interactions = col.defaultdict(set)
    protein_circles = []

    for chain in model:
        # iterate over RNAs
        if ftup.contains_rna(chain):
            # convert to cg and store so that we can convert pdb nucleotide ids to
            # secondary structure indexes later
            cg, = ftmc.CoarseGrainRNA.from_pdb(filename, load_chains=chain.id)
            cgs[chain.id] = cg

            # plot the structure and store the coordinates
            (ax, coords) = plot_rna(cg, offset=True, ax=ax)
            chain_coords[chain.id] = coords

    for (a1, a2) in ftup.interchain_contacts(structure):
        # iterate over all the interactions in order to find out which
        # nucleotides this protein interacts with
        chain1 = a1.parent.parent
        chain2 = a2.parent.parent

        if ftup.is_protein(chain1) and ftup.contains_rna(chain2):
            # collect all the RNA nucleotides that a protein interacts with
            try:
                sid = cgs[chain2.id].seq.to_integer(fgr.RESID(chain2.id, a2.parent.id))-1
            except ValueError:
                # same treatment as in the RNA-RNA branch below: a residue
                # that cannot be mapped to a sequence position is ignored
                pass
            else:
                protein_interactions[chain1.id].add((chain2.id, sid))

        if ftup.contains_rna(chain1) and ftup.contains_rna(chain2):
            try:
                sid1 = cgs[chain1.id].seq.to_integer(fgr.RESID(chain1.id, a1.parent.id))-1
                sid2 = cgs[chain2.id].seq.to_integer(fgr.RESID(chain2.id, a2.parent.id))-1
            except ValueError:
                continue

            coord1 = chain_coords[chain1.id][sid1]
            coord2 = chain_coords[chain2.id][sid2]

            ax.plot([coord1[0], coord2[0]], [coord1[1], coord2[1]],
                    'k-', alpha=0.5)

    for chain in model:
        # draw each protein and the links that it has to other nucleotides
        if not ftup.is_protein(chain):
            continue

        # the protein will be positioned at the centroid of the nucleotides
        # that it interacts with
        interacting_coords = [np.array(chain_coords[chain_id][nuc_num])
                              for (chain_id, nuc_num) in protein_interactions[chain.id]]

        if not interacting_coords:
            # without contacts there is no centroid to place the protein
            # at; np.sum([]) / 0 would yield a scalar nan and the indexing
            # below would fail
            continue

        centroid = np.sum(interacting_coords, axis=0) / \
            len(interacting_coords)

        # the size of the circle representing it grows with the square
        # root of the number of entries in the chain
        radius = 2 * math.sqrt(len(chain.get_list()))
        protein_circles.append([centroid[0], centroid[1], radius])

        # draw all of the interactions as lines
        for coord in interacting_coords:
            ax.plot([coord[0], centroid[0]], [
                    coord[1], centroid[1]], 'k-', alpha=0.5)

    protein_circles = np.array(protein_circles)
    if len(protein_circles) > 0:
        circles(protein_circles[:, 0], protein_circles[:, 1],
                protein_circles[:, 2], 'grey', alpha=0.5)

    # plt.axis('off')

    return ax